Example #1
    def _build_feature_extractor(self, mode, files):
        print('Building feature extractor...')
        corpus = list()

        for path in files:
            with open(path, 'r') as f:
                for line in f:
                    # line = json.loads(line.strip().decode('utf-8'))
                    # question = line['question']
                    question = line.replace('\t', '').replace(
                        ' ', '').strip('\n').decode('utf-8')
                    question = QueryUtils.static_remove_cn_punct(str(question))
                    tokens = self.cut(question)
                    corpus.append(tokens)

        if mode == 'ngram':
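            # binary character-level 1-2 gram counts with a small Chinese stop-word list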
            bigram_vectorizer = CountVectorizer(
                ngram_range=(1, 2),
                min_df=0.0,
                max_df=1.0,
                analyzer='char',
                stop_words=[',', '?', '我', '我要'],
                binary=True)
            self.feature_extractor = bigram_vectorizer.fit(corpus)
        if mode == 'tfidf':
            print_cn('use {0}'.format(mode))
            tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                               ngram_range=(1, 2),
                                               max_df=1.0,
                                               min_df=1,
                                               sublinear_tf=True)
            self.feature_extractor = tfidf_vectorizer.fit(corpus)
Example #2
def get_train_data(data_path, normal_path, shuffle_path):
    add_dict()
    normal_f = open(normal_path, 'w')
    shuffle_f = open(shuffle_path, 'w')
    # drop_f=open(drop_path)

    with open(data_path, 'r') as f:
        for line in f:
            print_cn(line)
            line = json.loads(line)
            intentions = ','.join(line['intention'])
            questions = line['question']

            for question in questions:
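                # write the original (intention, question) pair, then add shuffled-token variants as simple augmentation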
                normal_f.write(intentions + '#' + question + '\n')
                tokens = cut(question)
                if len(tokens) <= 1:
                    continue
                elif len(tokens) == 2:
                    random.shuffle(tokens)
                    shuffle_f.write(intentions + '#' + ''.join(tokens) + '\n')
                else:
                    for _ in range(2):
                        random.shuffle(tokens)
                        shuffle_f.write(intentions + '#' + ''.join(tokens) +
                                        '\n')
    normal_f.close()
    shuffle_f.close()
Example #3
    def _build_feature_extraction(self, data_path):
        print('Building feature extractor...')
        corpus = list()
        with open(data_path, 'r') as f:
            reader = csv.reader(f, delimiter='#')
            for line in reader:
                b = line[1].decode('utf-8')
                # b = QueryUtils.static_remove_stop_words(b)
                tokens = QueryUtils.static_jieba_cut(b)
                corpus.append(tokens)

        if self.mode == 'ngram':
            print_cn('Use {0}'.format(self.mode))
            bigram_vectorizer = CountVectorizer(
                ngram_range=(1, 2),
                min_df=0.0,
                max_df=1.0,
                analyzer='char',
                stop_words=[',', '?', '我', '我要'],
                binary=True)
            self.feature_extractor = bigram_vectorizer.fit(corpus)
        if self.mode == 'tfidf':
            print_cn('Use {0}'.format(self.mode))
            tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                               ngram_range=(1, 2),
                                               max_df=1.0,
                                               min_df=1,
                                               sublinear_tf=True)
            self.feature_extractor = tfidf_vectorizer.fit(corpus)
        if self.mode == 'fasttext':
            pass
Example #4
def classify():
    try:
        args = request.args
        q = args['q']
        q = urllib.unquote(q).decode('utf8')
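        # map each character to its vocabulary id; 3 is assumed to be the unknown-character id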
        ids = [char2index.get(c, 3) for c in q]

        print_cn(q)
        print(ids)
        inputs_length = [len(ids)] * BATCH_SIZE
        ids_inputs = [ids for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = [[fasttext_wv(word) for word in predicting_inputs] for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = np.asarray(predicting_embed_inputs)

        answer_logits = tf_sess.run(tf_model.predicting_logits,
                                    feed_dict={
                                        tf_model.encoder_inputs.name:
                                        ids_inputs,
                                        tf_model.encoder_inputs_length.name:
                                        inputs_length
                                    })

        prediction = recover(answer_logits.tolist()[0], index2char, False)
        # print(answer_logits.tolist()[0])
        print("predict->", prediction)
        print("-----------------------")
        return prediction
    except Exception, e:
        return None
Example #5
def predict(dict_path, model_path):
    sess, model = load_tf_session(dict_path, model_path)
    char2index, index2char = init_dict(dict_path)
    while True:
        line = _get_user_input()
        line = line.strip().decode('utf-8')
        ids = [char2index.get(c, 3) for c in line]

        print_cn(line)
        print(ids)
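        # the model is fed a full batch, so the single query is repeated BATCH_SIZE times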
        inputs_length = [len(ids)] * BATCH_SIZE
        ids_inputs = [ids for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = [[fasttext_wv(word) for word in predicting_inputs] for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = np.asarray(predicting_embed_inputs)

        answer_logits = sess.run(model.predicting_logits,
                                 feed_dict={
                                     model.encoder_inputs.name:
                                     ids_inputs,
                                     model.encoder_inputs_length.name:
                                     inputs_length
                                 })

        prediction = recover(answer_logits.tolist()[0], index2char, False)
        # print(answer_logits.tolist()[0])
        print("predict->", prediction)
        print("-----------------------")
Example #6
def predict(data_path,
            encoder_vocab_path,
            decoder_vocab_path,
            model_path,
            embedding='word2vec'):
    if embedding == 'word2vec':
        encoder_vocab, embeddings = load_word2vec(encoder_vocab_path)
    else:
        encoder_vocab, embeddings = load_fasttext(encoder_vocab_path)
    decoder_vocab = load_decoder_vocab(decoder_vocab_path)
    global ENC_VOCAB_SIZE
    ENC_VOCAB_SIZE = len(encoder_vocab)
    global DEC_VOCAB_SIZE
    DEC_VOCAB_SIZE = len(decoder_vocab)
    global EMBEDDING_SIZE
    EMBEDDING_SIZE = len(embeddings[0])

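    # register both vocabularies with jieba so these words are kept as single tokens during segmentation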
    for word in encoder_vocab:
        jieba.add_word(word)
    for word in decoder_vocab:
        jieba.add_word(word)

    model = BeliefRnn(decoder_vocab, False)
    model.build_graph()

    saver = tf.train.Saver()
    # loaded_graph = tf.Graph()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(model.embedding_init,
                 feed_dict={model.embedding_placeholder: embeddings})
        _check_restore_parameters(sess, saver, model_path)
        while True:
            line = _get_user_input()
            predicting_inputs = list(jieba.cut(line.strip().decode('utf-8')))
            ids = []
            for word in predicting_inputs:
                if word in encoder_vocab:
                    ids.append(encoder_vocab.index(word))
                else:
                    ids.append(encoder_vocab.index('#UNK#'))
            print_cn(predicting_inputs)
            predicting_inputs_length = [len(predicting_inputs)] * BATCH_SIZE
            ids_inputs = [ids for _ in range(BATCH_SIZE)]
            # predicting_embed_inputs = [[fasttext_wv(word) for word in predicting_inputs] for _ in range(BATCH_SIZE)]
            # predicting_embed_inputs = np.asarray(predicting_embed_inputs)

            answer_logits = sess.run(model.predicting_logits,
                                     feed_dict={
                                         model.encoder_inputs.name:
                                         ids_inputs,
                                         model.encoder_inputs_length.name:
                                         predicting_inputs_length
                                     })

            prediction = recover_label(answer_logits.tolist()[0],
                                       decoder_vocab)
            # print(answer_logits.tolist()[0])
            print("predict->", prediction)
            print("-----------------------")
Example #7
 def make(self, q, slot):
     b, q = self.quant_fix(q)
     if b:
         cn_util.print_cn(str(q))
         r = self.jieba_cut(''.join(q))
         # purify
         return self.all_possible(r, slot)
     else:
         return None
Example #8
 def _request_solr(self, q):
     ## cut q into tokens
     tokens = ['question:' + s for s in QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
     q = ' OR '.join(tokens)
     url = self.qa_url.format(q)
     # print('qa_debug:', url)
     cn_util.print_cn(url)
     r = requests.get(url)
     return r
Example #9
    def check_zero_tokens(self, tokens):
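        # True only when every token is found in the word2vec vocabulary; otherwise print the tokens and return False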
        count = 0
        target_len = len(tokens)
        for word in tokens:
            word = word.encode('utf-8')
            if w2v_model.__contains__(word.strip()):
                count += 1
        if count != target_len:
            print_cn(tokens)

        return count == target_len
Example #10
 def _request_solr(self, q):
     tokenized, exact_q = self.purify_q(q)
     if not self.last_g:
         url = self.i_url % (tokenized, exact_q)
         self.last_g = q
     else:
         last_tkz, last_exact_q = self.purify_q(self.last_g)
         url = self.simple_context_i_url % (tokenized, exact_q, last_tkz,
                                            last_exact_q)
         self.last_g = q
     cn_util.print_cn('debug:interactive_url:' + url)
     r = requests.get(url)
     return r
Example #11
def online(model_path):
    clf = Multilabel_Clf.load(model_path=model_path)

    print('loaded model file...')
    try:
        while True:
            question = raw_input('input something...\n')
            tokens = jieba.cut(question, cut_all=True)
            labels, probs = clf.predict(list(tokens))
            print_cn(labels, probs)
            print('-------------------------')
    except KeyboardInterrupt:
        print('interaction interrupted')
Example #12
 def _request_solr(self, q, key, base_url):
     ## cut q into tokens
     key = '%s:' % key
     tokens = [
         s for s in QueryUtils.static_jieba_cut(
             q, smart=False, remove_single=True)
     ]
     if len(tokens) == 0:
         return None
     q = key + "(" + '%20'.join(tokens) + ")"
     url = base_url % q
     cn_util.print_cn(url)
     r = requests.get(url)
     return r
Example #13
 def _request_solr(self, q, key, base_url):
     try:
         ## cut q into tokens
         key = 'fq=%s:' % key
         tokens = [
             s for s in QueryUtils.static_jieba_cut(
                 q, smart=False, remove_single=True)
         ]
         q = key + '%20'.join(tokens)
         url = base_url.format(q)
         cn_util.print_cn(url)
         r = requests.get(url)
         return r
     except:
         traceback.print_exc()
         return None
Example #14
    def get_w2v_emb(self, tokens):
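        # average the 300-d word2vec vectors of the tokens present in the model; returns False if none matched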
        embedding = np.zeros((1, 300), dtype=np.float32)
        count = 0
        # print_cn(tokens)
        for word in tokens:
            word = word.encode('utf-8')
            if w2v_model.__contains__(word.strip()):
                vector = w2v_model.__getitem__(word.strip())
                result = [v for v in vector]

                embedding = np.add(embedding, np.asarray(result))
                # print embedding
                count += 1
        if count == 0:
            print('get...', count)
            print_cn(tokens)
            return False
        else:
            embedding = np.divide(embedding, count)
            return np.squeeze(embedding)
Example #15
 def select_max_match_with_sim(self, q, r):
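     # among the questions retrieved by Solr, pick the one most similar to q (similarity via self.bt) and return its answer when the similarity exceeds 0.3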
     if not self.bt:
         return None
     matched_questions = SolrUtils.get_dynamic_response(r=r, key='question',
                                                        random_hit=False,
                                                        random_field=True,
                                                        keep_array=False,
                                                        facet=True)
     q_tokens = ' '.join(QueryUtils.static_jieba_cut(q))
     matched_questions_tokens = [' '.join(QueryUtils.static_jieba_cut(mqt)) for mqt in matched_questions]
     max_sim = self.bt.getMaxSim(q_tokens, matched_questions_tokens)
     best_sentence = ''.join(max_sim['sentence'].split(' '))
     sim = max_sim['sim']
     cn_util.print_cn(best_sentence, str(sim), '[' + ','.join(matched_questions) + ']')
     if sim > 0.3:
         index = matched_questions.index(best_sentence)
         answer = SolrUtils.get_dynamic_response(r, key='answer', force_hit=index,
                                                 random_field=True,
                                                 random_hit=False)
         return answer
     return None
Example #16
    def _fasttext_vector(self, tokens):
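        # request a sentence vector from the fasttext service, weighting tokens by idf values fetched from a separate service when self.weighted is set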
        if not self.weighted:
            try:
                weights = np.ones(shape=len(tokens))
                url = self.fasttext_url_weighted.format(
                    ','.join(tokens),
                    ",".join([str(weight) for weight in weights]))
            except:
                traceback.print_exc()
        else:
            try:
                idf_url = "http://10.89.100.14:3032/s/{0}".format(
                    "%7C".join(tokens))
                idf_r = requests.get(url=idf_url)
                weights = []
                returned_json = idf_r.json()
                max_weight = 1
                for key, value in returned_json.iteritems():
                    if value > max_weight:
                        max_weight = value
                for token in tokens:
                    if token not in returned_json:
                        weights.append(str(max_weight))
                    else:
                        weights.append(str(returned_json[token]))

                url = self.fasttext_url_weighted.format(
                    ','.join(tokens), ','.join(weights))
            except:
                traceback.print_exc()
                url = self.fasttext_url.format(','.join(tokens))
        try:
            r = requests.get(url=url)
            vector = r.json()['vector']
            return vector
        except:
            print_cn(url)
            traceback.print_exc()
            return None
Example #17
 def travel_with_clf(self, node, tokens, gbdt_recursion=True):
     key = None  # word/tag
     next_node = None
     key_found = False
     num_found = False
     value_types = node.value_types
     query = ''.join(tokens)
     if "RANGE" in value_types and gbdt_recursion:
         try:
             slot, proba = self.gbdt.predict(parent_slot=node.slot,
                                             input_=query)
             next_node = self.graph.get_global_node(slot=slot)
             num_found = next_node is not None and proba > 0.95
             if num_found:
                 cn_util.print_cn('found type by gbdt_range:%s,%s' %
                                  (cn_util.cn(slot), proba))
                 self.last_slot = node.slot
             else:
                 cn_util.print_cn('NOT found type by gbdt_range:%s,%s' %
                                  (cn_util.cn(slot), proba))
         except Exception, e:
             print(e.message)
             num_found = False
Example #18
        sim = max_sim['sim']
        cn_util.print_cn(best_sentence, str(sim), '[' + ','.join(matched_questions) + ']')
        if sim > 0.3:
            index = matched_questions.index(best_sentence)
            answer = SolrUtils.get_dynamic_response(r, key='answer', force_hit=index,
                                                    random_field=True,
                                                    random_hit=False)
            return answer
        return None

    def _num_answer(self, r):
        return int(r.json()["response"]["numFound"])

    def _get_response(self, r, i=0):
        try:
            a = r.json()["response"]["docs"][i]['answer']
            rr = np.random.choice(a, 1)[0]
            x = random.randint(0, min(0, len(a) - 1))
            return rr.encode('utf8')
        except:
            return None

    def purify_q(self, q):
        q = self.qu.remove_cn_punct(q)
        pos_q = self.qu.corenlp_cut(q, remove_tags=["CD", "VA", "AD", "VC"])
        return ''.join(pos_q), q

if __name__ == '__main__':
    qa = SimpleQAKernel()
    cn_util.print_cn(qa.kernel(u'得基怎么去')[1])
Example #19
            try:
                i = i + 1
                components = line.split('##')
                user = components[0]
                if user != current_user:
                    current_user = user
                    instance.main_kernel.clear_state()
                    instance.last_response = None
                question = components[2]
                question = question.split(":")[1]
                answer = instance.kernel(question.decode('utf-8'))
                print('---%d--%s###%s' % (i, question, answer))
            except:
                instance.main_kernel.clear_state()


if __name__ == '__main__':
    kernel = EntryKernel()
    input_file = '../data/sc/test/test.txt'

    # test(input_file, kernel)
    #
    while True:
        input_ = raw_input()
        input_ = input_.decode('utf-8')
        response = kernel.kernel(input_)
        print_cn(response)
    #
    # response = kernel.kernel(u'我不买实惠的衣服')
    # print(response)
Example #20
    children = line.split('#')[1].split(':')[2].strip()
    children = children.split(',')
    for i, item in enumerate(children):
        children[i] = Node(children[i])
    return Node(parent, children)


def build_tree(data_path):
    tree = Tree()
    with open(data_path, 'r') as inp:
        for line in inp:
            node = get_node(line)
            tree.add(node)

    return tree

def sort_intention(input_):
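    # order the comma-separated intention labels by their pre-order position in the belief graph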
    tree = build_tree('../../../data/sc/belief_graph.txt')
    tree.pre_order(tree.root)
    pre_order_list = tree.pre_order_list
    input_ = input_.split(',')
    sorted_intention = sorted(input_, key=lambda x: pre_order_list.index(x))
    return sorted_intention


if __name__ == '__main__':
    test=['女,购物,衣服','吃饭,低,有','辣,吃饭,低,有,女,购物,衣服']
    for t in test:
        print_cn(sort_intention(t))

Example #21
                        q = line[1].encode('utf-8')
                        response = self.make(q, '存款')
                    elif u'取款' in line[0]:
                        q = line[1].encode('utf-8')
                        response = self.make(q, '取款')
                    elif u'转账' in line[0]:
                        q = line[1].encode('utf-8')
                        response = self.make(q, '转账')

                    if response:
                        write = []
                        for r in response:
                            w = [line[0], r.strip()]
                            write.append(w)

                    for w in write:
                        ww = '\t'.join(w)
                        mm = ww.strip()
                        out.write(mm + '\n')


if __name__ == '__main__':
    qu = QueryUtils()
    jieba.load_userdict("../data/char_table/ext1.dic")
    # qu.process_data('../data/business/intention_pair_q', '../data/business/business_train_v7')
    # # print(QueryUtils.static_remove_cn_punct(u'我在电视上见过你,听说你很聪明啊?'))
    # cn_util.print_cn(qu.quant_bucket_fix('一点钱'))
    # cn_util.print_cn(qu.quant_bucket_fix('我要取1千零1百'))
    # cn_util.print_cn(QueryUtils.static_jieba_cut('紫桂焖大排', smart=True, remove_single=True))
    cn_util.print_cn(QueryUtils.static_remove_stop_words('我来高兴哈'))
    # cn_util.print_cn(','.join(jieba.cut_for_search('南京精菜馆'.decode('utf-8'))))
Example #22
def main():
    model_path = '../model/sc/belief_clf.pkl'
    train_data_path = '../data/sc/train/sale_train0831.txt'
    test_data_path = '../data/sc/train/sale_train0831.txt'
    parser = argparse.ArgumentParser()
    parser.add_argument('-m',
                        choices={'train', 'test'},
                        default='train',
                        help='mode; if not specified, defaults to train')

    args = parser.parse_args()

    if args.m == 'train':
        train(train_data_path, model_path)
    elif args.m == 'test':
        test(test_data_path, model_path)
    else:
        print('Unknown mode, exiting.')


if __name__ == '__main__':
    # main()

    model_path = '../model/sc/belief_clf.pkl'
    clf = Multilabel_Clf.load(model_path=model_path)
    inputs = [u"买热水器"]
    for p in inputs:
        labels, probs = clf.predict(input_=p.decode('utf-8'))
        cn_util.print_cn(','.join(labels))
Example #23
class GKernel:
    def __init__(self, graph_path, clf_path):
        # self.tokenizer = CoreNLP()
        self.graph = None
        self.gbdt = None
        self.state_cleared = True
        self._load_graph(graph_path)
        self._load_clf(clf_path)

        self.qu = QueryUtils()

    last_slot = None

    base_url = "http://localhost:11403/solr/business/select?defType=edismax&indent=on&wt=json&rows=1"
    trick_url = "http://localhost:11403/solr/trick/select?defType=edismax&indent=on&wt=json&rows=10"

    # tokenizer_url = "http://localhost:5000/pos?q="

    def kernel(self, query):
        return self.r_walk(query=query)

    def clear_state(self):
        print('state cleared')
        self.state_cleared = True
        self.last_slot = None

    def _load_clf(self, path):
        print('loading gbdt classifier...')
        with open(path, 'rb') as f:
            self.gbdt = pickle.load(f)

    def _load_graph(self, path):
        print('loading logic graph...')
        with open(path, "rb") as input_file:
            self.graph = pickle.load(input_file)

    def num_answer(self, r):
        return int(r.json()["response"]["numFound"])

    def travel_with_clf(self, node, tokens, gbdt_recursion=True):
        key = None  # word/tag
        next_node = None
        key_found = False
        num_found = False
        value_types = node.value_types
        query = ''.join(tokens)
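        # RANGE slots: first ask the gbdt classifier for the next slot, accepting it only above 0.95 probability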
        if "RANGE" in value_types and gbdt_recursion:
            try:
                slot, proba = self.gbdt.predict(parent_slot=node.slot,
                                                input_=query)
                next_node = self.graph.get_global_node(slot=slot)
                num_found = next_node is not None and proba > 0.95
                if num_found:
                    cn_util.print_cn('found type by gbdt_range:%s,%s' %
                                     (cn_util.cn(slot), proba))
                    self.last_slot = node.slot
                else:
                    cn_util.print_cn('NOT found type by gbdt_range:%s,%s' %
                                     (cn_util.cn(slot), proba))
            except Exception, e:
                print(e.message)
                num_found = False

        # last try of RANGE
        if not num_found and "RANGE" in value_types:
            for t in tokens:
                try:
                    t = cn2arab.cn2arab(t)[1].replace(' ', '').replace(
                        '\t', '').encode('utf-8')
                    if t.isdigit():
                        try:
                            next_node = node.go(q=float(t), value_type="RANGE")
                            num_found = next_node is not None
                            if num_found:
                                cn_util.print_cn('found type by RANGE: %s' %
                                                 next_node.slot)
                                self.last_slot = node.slot
                                key = t
                                break
                        except Exception, e:
                            print(e.message)
                            num_found = False
                except Exception, e:
                    print(e.message)
                    num_found = False
Example #24
                                break
                        except Exception, e:
                            print(e.message)
                            num_found = False
                except Exception, e:
                    print(e.message)
                    num_found = False

        if not key_found and "KEY" in value_types and gbdt_recursion:
            try:
                slot, proba = self.gbdt.predict(parent_slot=node.slot,
                                                input_=query)
                next_node = self.graph.get_global_node(slot=slot)
                key_found = next_node is not None and proba > 0.95
                if key_found:
                    cn_util.print_cn('found type by gbdt_key:%s,%s' %
                                     (cn_util.cn(slot), proba))
                    self.last_slot = node.slot
                else:
                    cn_util.print_cn('NOT found type by gbdt:%s,%s' %
                                     (cn_util.cn(slot), proba))
            except Exception, e:
                print(e.message)
                key_found = False

        # last try of KEY
        if not key_found and not num_found and "KEY" in value_types:
            for t in tokens:
                try:
                    value_types = node.value_types
                    if "KEY" in value_types and not t.isdigit():
                        try:
Example #25
                    response = None
                    if u'存款' in line[0]:
                        q = line[1].encode('utf-8')
                        response = self.make(q, '存款')
                    elif u'取款' in line[0]:
                        q = line[1].encode('utf-8')
                        response = self.make(q, '取款')
                    elif u'转账' in line[0]:
                        q = line[1].encode('utf-8')
                        response = self.make(q, '转账')

                    if response:
                        write = []
                        for r in response:
                            w = [line[0], r.strip()]
                            write.append(w)

                    for w in write:
                        ww = '\t'.join(w)
                        mm = ww.strip()
                        out.write(mm + '\n')


if __name__ == '__main__':
    qu = QueryUtils()
    qu.process_data('../data/business/intention_pair_q',
                    '../data/business/business_train_v7')
    # print(QueryUtils.static_remove_cn_punct(u'我在电视上见过你,听说你很聪明啊?'))
    cn_util.print_cn(qu.quant_bucket_fix('一点钱'))
    # cn_util.print_cn(qu.quant_bucket_fix('我要取1千零1百'))
Example #26
    def r_walk_with_pointer_with_clf(self, query, given_slot=None):
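        # walk the belief graph: try an exact Solr match on the question first, then the gbdt/graph traversal, and fall back to the trick handler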
        r = None
        response = None
        if given_slot == '#NULL#':
            given_slot = None
            self.last_slot = None
            self.clear_state()
        if self.state_cleared:
            if given_slot:
                self.should_clear_state(self.last_slot)
                url = self.base_url + "&q=exact_question:" + \
                    query + "%20AND%20exact_intention:" + given_slot
            else:
                url = self.base_url + "&q=exact_question:" + query
            cn_util.print_cn('exact_try_url %s' % url)
            r = requests.get(url)

            if self.num_answer(r) > 0:
                self.state_cleared = False
                return_slot = self.last_slot = self.get_intention(r)
                self.should_clear_state(self.last_slot)
                return_response = self.get_response(r)
                cn_util.print_cn('clear exact_%s, %s' %
                                 (return_slot, self.get_response(r)))
                return return_slot, return_response
            else:
                tokenized = self.qu.corenlp_cut(query, self.qu.remove_tags)
                if len(tokenized) == 0:
                    # do trick
                    self.clear_state()
                    return None, self.trick(query)
                if given_slot:
                    cn_util.print_cn('given:%s' % given_slot)
                    given = self.graph.get_global_node(given_slot)
                else:
                    given = self.graph
                fixed_query_tokens = self.qu.quant_bucket_fix(query)
                node = self.travel_with_clf(given, fixed_query_tokens)
                self.state_cleared = False
                if node != given:
                    if self.last_slot == given.slot:
                        url = self.base_url + "&q=exact_intention:" + node.slot
                    else:
                        url = self.base_url + "&q=exact_last_intention:" + \
                            self.last_slot + "%20AND%20exact_intention:" + node.slot
                    cn_util.print_cn("gbdt_result_url %s" % url)
                    r = requests.get(url)
                    if self.num_answer(r) > 0:
                        self.last_slot = node.slot
                        self.should_clear_state(node.slot)
                        cn_util.print_cn('clear deepest_ %s, %s' %
                                         (node.slot, self.get_response(r)))
                        return node.slot, self.get_response(r)
                else:
                    return None, self.trick(query)

        else:
            if given_slot:
                self.last_slot = given_slot
            else:
                if not self.last_slot:
                    parent_slot = self.graph.get_global_node(
                        self.last_slot).parent_node.slot
                    cn_util.print_cn('retrace...', self.last_slot, parent_slot)
                    self.clear_state()
                    slot, response = self.r_walk_with_pointer_with_clf(
                        query, parent_slot)
                    return slot, response
            url = self.base_url + "&q=exact_question:" + query + \
                "%20AND%20exact_last_intention:" + self.last_slot
            cn_util.print_cn('non_clear_url_first_try', url)
            r = requests.get(url)

            if self.num_answer(r) > 0:
                self.state_cleared = False
                self.last_slot = slot_ = self.get_intention(r)
                self.should_clear_state(self.last_slot)
                cn_util.print_cn('non clear exact_', slot_,
                                 self.get_response(r))
                return slot_, self.get_response(r)
            else:

                node = self.graph.all_nodes[self.last_slot]
                next_node = None
                tks = self.qu.corenlp_cut(query)
                if len(tks) == 0:
                    self.clear_state()
                    return self.r_walk_with_pointer_with_clf(query)
                fixed_query_tokens = self.qu.quant_bucket_fix(query)
                next_node = self.travel_with_clf(node, fixed_query_tokens)
                if next_node == node:
                    # query solr
                    parent_slot = self.graph.get_global_node(
                        self.last_slot).parent_node.slot
                    cn_util.print_cn('retrace...', self.last_slot, parent_slot)
                    self.clear_state()
                    return self.r_walk_with_pointer_with_clf(
                        query, parent_slot)
                else:
                    url = self.base_url + "&q=last_intention:" + \
                        self.last_slot + "%20AND%20intention:" + next_node.slot
                    cn_util.print_cn("non_clear_go_deeper_url:", url)
                    r = requests.get(url)

                if self.num_answer(r) > 0:
                    response = self.get_response(r)
                    slot_ = self.get_intention(r)
                    self.last_slot = slot_
                    self.state_cleared = False
                    # but
                    cn_util.print_cn(
                        str(
                            self.graph.get_global_node(
                                slot_).classified_out_neighbors))
                    self.should_clear_state(slot_)
                    cn_util.print_cn('non clear deepest _', slot_,
                                     self.get_response(r))
                    return slot_, self.get_response(r)
                else:
                    # do trick
                    self.clear_state()
                    url = self.concat_solr_request(query=query,
                                                   base_url=self.trick_url)
                    r = requests.get(url)
                    response = self.get_response(r)
                    cn_util.print_cn("None-CLEAR-Trick", self.get_response(r))
                    return None, response
Example #27
            else:
                return False, np.random.choice(self.null_anwer, 1)[0]
        except:
            return False, np.random.choice(self.null_anwer, 1)[0]

    def _request_solr(self, q, key, base_url):
        try:
            ## cut q into tokens
            key = 'fq=%s:' % key
            tokens = [
                s for s in QueryUtils.static_jieba_cut(
                    q, smart=False, remove_single=True)
            ]
            q = key + '%20'.join(tokens)
            url = base_url.format(q)
            cn_util.print_cn(url)
            r = requests.get(url)
            return r
        except:
            traceback.print_exc()
            return None

    def _num_answer(self, r):
        return int(r.json()["response"]["numFound"])


if __name__ == '__main__':
    qa = QAKernel()
    # result = qa.kernel(u'三星手机在哪', u"Omega,一期三楼")
    cn_util.print_cn(qa.kernel(u'星巴克好吃吗', u"Omega,一期三楼")[1])