def cal_doc_distance(self, doc_text1, doc_text2):
    """
    Calculate the topic-distribution distance between two documents.

    Args:
        doc_text1(str): the input document text 1.
        doc_text2(str): the input document text 2.

    Returns:
        jsd(float): Jensen-Shannon divergence between the two documents' topic distributions.
        hd(float): Hellinger distance between the two documents' topic distributions.
    """
    doc1_tokens = self.__tokenizer.tokenize(doc_text1)
    doc2_tokens = self.__tokenizer.tokenize(doc_text2)

    # Infer the topic distribution of each document.
    doc1, doc2 = LDADoc(), LDADoc()
    self.__engine.infer(doc1_tokens, doc1)
    self.__engine.infer(doc2_tokens, doc2)

    # JSD requires the dense document topic distributions.
    dense_dict1 = doc1.dense_topic_dist()
    dense_dict2 = doc2.dense_topic_dist()

    # Calculate the distance between the two distributions.
    # The smaller the distance, the higher the semantic similarity.
    sm = SemanticMatching()
    jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
    hd = sm.hellinger_distance(dense_dict1, dense_dict2)

    return jsd, hd
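# A minimal usage sketch for cal_doc_distance, kept as a comment. It assumes the
# module is loaded through PaddleHub's hub.Module; the module name "lda_news" is
# an assumption and stands in for whichever LDA module exposes this interface.
#
#     import paddlehub as hub
#
#     lda = hub.Module(name="lda_news")  # module name is an assumption
#     jsd, hd = lda.cal_doc_distance(doc_text1="document one text",
#                                    doc_text2="document two text")
#     # Both values are distances between the inferred topic distributions,
#     # so smaller values indicate higher semantic similarity.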
def cal_doc_keywords_similarity(self, document, top_k=10):
    """
    Find the top-k keywords of a document.

    Args:
        document(str): the input document text.
        top_k(int): the number of keywords to return.

    Returns:
        results(list): the top_k keywords and their similarity to the document.
    """
    d_tokens = self.__tokenizer.tokenize(document)

    # Infer the document's (sparse) topic distribution.
    doc = LDADoc()
    self.__engine.infer(d_tokens, doc)
    doc_topic_dist = doc.sparse_topic_dist()

    # Score each unique token by its likelihood under the document's topics.
    items = []
    words = set()
    sm = SemanticMatching()
    for word in d_tokens:
        if word in words:
            continue
        words.add(word)
        wd = WordAndDis()
        wd.word = word
        wd.distance = sm.likelihood_based_similarity(
            terms=[word],
            doc_topic_dist=doc_topic_dist,
            model=self.__engine.get_model())
        items.append(wd)

    # Sort by similarity in descending order and keep the top_k entries.
    items.sort(key=lambda item: item.distance, reverse=True)

    results = []
    for item in items[:top_k]:
        results.append({"word": item.word, "similarity": item.distance})
    return results
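# A minimal usage sketch for cal_doc_keywords_similarity, assuming the same
# hypothetical "lda_news" module handle as in the sketch above.
#
#     results = lda.cal_doc_keywords_similarity("document text here", top_k=5)
#     for item in results:
#         print(item["word"], item["similarity"])
#     # Entries arrive sorted by similarity in descending order, so the first
#     # word is the one most representative of the document's topics.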
def infer_doc_topic_distribution(self, document):
    """
    Infer the topic distribution of a document.

    Args:
        document(str): the input document text.

    Returns:
        results(list): the topic distribution of the document.
    """
    tokens = self.__tokenizer.tokenize(document)
    if not tokens:
        return []

    # Infer the document's (sparse) topic distribution.
    doc = LDADoc()
    self.__engine.infer(tokens, doc)
    topics = doc.sparse_topic_dist()

    results = []
    for topic in topics:
        results.append({"topic id": topic.tid, "distribution": topic.prob})
    return results
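# A minimal usage sketch for infer_doc_topic_distribution, under the same
# assumptions as the sketches above.
#
#     dist = lda.infer_doc_topic_distribution("document text here")
#     # Each entry has the form {"topic id": tid, "distribution": prob}; only
#     # topics with non-zero probability appear, since the result is built from
#     # the sparse topic distribution.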
def cal_query_doc_similarity(self, query, document):
    """
    Calculate the similarity between a query and a document.

    Args:
        query(str): the input query text.
        document(str): the input document text.

    Returns:
        lda_sim(float): likelihood-based similarity between the query and the
            document under the LDA model.
    """
    q_tokens = self.__tokenizer.tokenize(query)
    d_tokens = self.__tokenizer.tokenize(document)

    # Infer the document's (sparse) topic distribution.
    doc = LDADoc()
    self.__engine.infer(d_tokens, doc)
    doc_topic_dist = doc.sparse_topic_dist()

    # Score the query tokens against the document's topic distribution.
    sm = SemanticMatching()
    lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
                                             self.__engine.get_model())
    return lda_sim
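# A minimal usage sketch for cal_query_doc_similarity, under the same
# assumptions as the sketches above.
#
#     sim = lda.cal_query_doc_similarity(query="query text",
#                                        document="document text here")
#     # Higher values mean the query terms are more likely under the
#     # document's inferred topic distribution.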