Ejemplo n.º 1
0
 def __init__(self, corpus_path):
     """Load the question corpus and fit the TF-IDF model on it.

     Every line of the file at ``corpus_path`` is stripped of surrounding
     whitespace, passed through ``strip_all_punctuations`` and kept as one
     corpus entry. A ``CountVectorizer`` that tokenises with ``jieba.cut``
     and a ``TfidfTransformer`` are then fitted on those entries.

     :param corpus_path: path of the text file holding one entry per line
     """
     cleaned_lines = []
     with open(corpus_path) as corpus_file:
         for raw_line in corpus_file:
             cleaned_lines.append(strip_all_punctuations(raw_line.strip()))
     self.corpus = cleaned_lines

     self.tokenizer = jieba.cut
     self.vectorizer = CountVectorizer(tokenizer=self.tokenizer)
     self.transformer = TfidfTransformer()

     # Term counts first, then TF-IDF weighting on top of them.
     term_counts = self.vectorizer.fit_transform(self.corpus)
     self.tfidf = self.transformer.fit_transform(term_counts)

     # Vocabulary terms, in the column order of the weight matrix.
     self.word = self.vectorizer.get_feature_names()
     # Dense copy of the corpus TF-IDF matrix (one row per corpus entry).
     self.weight = self.tfidf.toarray()
Ejemplo n.º 2
0
    def cosine(self, q, match_corpus=None, verbose=False):
        """
        Pick the candidate question most similar to ``q``, plus one extra rule.

        The query and the candidates are turned into TF-IDF vectors with the
        already fitted ``self.vectorizer`` / ``self.transformer`` and compared
        by dot product (this is the cosine similarity as long as the
        transformer L2-normalises its rows, which is the ``TfidfTransformer``
        default). The highest scoring candidate is accepted only when its
        score is above 0.4. For queries of fewer than 4 tokens, every query
        token must additionally occur among the candidate's tokens.

        :param q: the query question
        :param match_corpus: optional list of candidate questions (for example
            search hits); when empty or None the whole ``self.corpus`` is used
        :param verbose: print intermediate values, for debugging
        :return: ``(best_question, index_of_best_question)`` on a match,
            otherwise ``(None, None)``
        """
        q_tfidf_weight = self.transformer.transform(
            self.vectorizer.transform([strip_all_punctuations(q.strip())])).toarray()

        if match_corpus:
            # Score against the supplied candidates only.
            candidates = match_corpus
            match_corpus_strip = [strip_all_punctuations(s.strip()) for s in match_corpus]
            candidate_weight = self.transformer.transform(
                self.vectorizer.transform(match_corpus_strip)).toarray()
        else:
            # Fall back to the full corpus and its precomputed weights.
            candidates = self.corpus
            candidate_weight = self.weight

        # One similarity score per candidate (column vector).
        sim_vec = np.dot(candidate_weight, np.transpose(q_tfidf_weight))
        sim_arg = np.argmax(sim_vec)
        best_q = candidates[sim_arg]
        best_score = np.max(sim_vec)

        if verbose:
            # Single-argument prints produce the same output on Python 2 and 3.
            print("best_score: %s" % (best_score,))
            self.verbose_q(q)
            print(best_q)
            self.verbose_q(best_q)

        # For queries with fewer than 4 tokens, every token must appear in
        # the matched question.
        # The join/split round trip turns the tokenizer output into a list.
        # NOTE(review): q is tokenised here without strip_all_punctuations,
        # while self.corpus entries are stored with it applied, so punctuation
        # tokens in a short q may fail the subset test - confirm this is
        # intended.
        q_tokens = "|".join(self.tokenizer(q.strip())).split("|")
        best_q_tokens = "|".join(self.tokenizer(best_q.strip())).split("|")
        if len(q_tokens) < 4 and not set(q_tokens).issubset(set(best_q_tokens)):
            best_score = 0

        if best_score > 0.4:
            return best_q, sim_arg
        return None, None
Ejemplo n.º 3
0
 def verbose_q(self, q):
     """Print a debugging trace of how ``q`` is tokenised and weighted.

     Printed, in order: the tokens of ``q``, its TF-IDF row vector, the full
     vocabulary in ``self.word``, and finally one ``term weight`` line for
     every vocabulary term whose weight in ``q`` is greater than zero.

     :param q: the question to inspect
     :return: None
     """
     q_tfidf_weight = self.transformer.transform(
         self.vectorizer.transform([strip_all_punctuations(q.strip())])).toarray()
     # The join/split round trip turns the tokenizer output into a list.
     q_tokens = "|".join(self.tokenizer(q.strip())).split("|")

     # Single-argument prints are valid on both Python 2 and Python 3.
     print(q_tokens)
     print(q_tfidf_weight)
     print(self.word)

     # Row 0 is the only row: exactly one document (q) was transformed.
     for weight, term in zip(q_tfidf_weight[0], self.word):
         if weight > 0.0:
             print("%s %s" % (term, weight))
Ejemplo n.º 4
0
 def get_search_results(self, q):
     """Query ``greeting_service`` and split its hits into two lists.

     :param q: the query passed to ``greeting_service``
     :return: a pair of lists: the ``'ask'`` value of every hit passed
         through ``strip_all_punctuations``, and the ``'answer'`` value of
         every hit. When the service reports an error the message is
         printed with ``pretty_print`` and ``([], [])`` is returned.
     """
     answers, err_msg = greeting_service(q)
     if not err_msg:
         questions = [strip_all_punctuations(hit.get('ask')) for hit in answers]
         replies = [hit.get('answer') for hit in answers]
         return questions, replies

     # The service also reports an error when it has no data to return.
     pretty_print(err_msg)
     return [], []
Ejemplo n.º 5
0
 def check_customerService_intention(self, query):
     """Match ``query`` against the customer-service intention patterns.

     The query is passed through ``strip_all_punctuations`` and converted
     with ``unicode`` before being handed to ``match_patterns`` together
     with ``Intention.customerService_set``.

     :param query: the text to check
     :return: whatever ``match_patterns`` returns for this query and set
     """
     normalized_query = unicode(strip_all_punctuations(query))
     pattern_set = Intention.customerService_set
     return match_patterns(normalized_query, pattern_set)
Ejemplo n.º 6
0
 def check_greeting_intention(self, query):
     """Match ``query`` against the greeting intention patterns.

     The query is passed through ``strip_all_punctuations`` and converted
     with ``unicode`` before being handed to ``match_patterns`` together
     with ``Intention.greeting_set``.

     :param query: the text to check
     :return: whatever ``match_patterns`` returns for this query and set
     """
     normalized_query = unicode(strip_all_punctuations(query))
     pattern_set = Intention.greeting_set
     return match_patterns(normalized_query, pattern_set)