Beispiel #1
0
    def cal_doc_distance(self, doc_text1, doc_text2):
        """
        Measure how far apart two documents are in LDA topic space.

        Both texts are tokenized, a topic distribution is inferred for each
        of them, and the two dense distributions are compared with two
        different metrics.

        Args:
            doc_text1(str): text of the first document.
            doc_text2(str): text of the second document.

        Returns:
            jsd(float): Jensen-Shannon Divergence distance of two documents.
            hd(float): Hellinger Distance of two documents.
        """
        first_tokens = self.__tokenizer.tokenize(doc_text1)
        second_tokens = self.__tokenizer.tokenize(doc_text2)

        # Infer the topics of each document into its own LDADoc container.
        first_doc = LDADoc()
        second_doc = LDADoc()
        for tokens, lda_doc in ((first_tokens, first_doc),
                                (second_tokens, second_doc)):
            self.__engine.infer(tokens, lda_doc)

        # Both metrics below are computed on the dense topic distributions.
        first_dist, second_dist = (
            lda_doc.dense_topic_dist() for lda_doc in (first_doc, second_doc))

        # A smaller distance means the documents are semantically closer.
        matcher = SemanticMatching()
        return (matcher.jensen_shannon_divergence(first_dist, second_dist),
                matcher.hellinger_distance(first_dist, second_dist))
Beispiel #2
0
    def cal_doc_keywords_similarity(self, document, top_k=10):
        """
        Rank the distinct words of a document by their similarity to it.

        The document is tokenized and its topic distribution is inferred.
        Every distinct token is then scored against that distribution with
        the likelihood based similarity, and the best scoring words are
        returned.

        Args:
            document(str): the input document text.
            top_k(int): maximum number of keywords to return. Values <= 0
                        yield an empty list.

        Returns:
            results(list): at most top_k dicts of the form
                           {"word": ..., "similarity": ...}, ordered from
                           the highest similarity to the lowest. Empty when
                           the document produces no tokens.
        """
        d_tokens = self.__tokenizer.tokenize(document)
        # Nothing to rank: skip inference altogether, the same way
        # infer_doc_topic_distribution handles a document without tokens.
        if not d_tokens:
            return []

        # Do topic inference on the document to obtain its topic distribution.
        doc = LDADoc()
        self.__engine.infer(d_tokens, doc)
        doc_topic_dist = doc.sparse_topic_dist()

        # Loop invariants: one matcher and one model lookup serve every word.
        sm = SemanticMatching()
        model = self.__engine.get_model()

        # dict.fromkeys drops repeated words while keeping first-seen order.
        scored = [
            (word,
             sm.likelihood_based_similarity(
                 terms=[word], doc_topic_dist=doc_topic_dist, model=model))
            for word in dict.fromkeys(d_tokens)
        ]
        # Highest similarity first; the sort is stable, so words with equal
        # scores keep their order of first appearance in the document.
        scored.sort(key=lambda pair: pair[1], reverse=True)

        # max() keeps a negative top_k from slicing relative to the end.
        return [{"word": word, "similarity": similarity}
                for word, similarity in scored[:max(top_k, 0)]]
Beispiel #3
0
    def infer_doc_topic_distribution(self, document):
        """
        This interface infers the topic distribution of document.

        Args:
            document(str): the input document text.

        Returns:
            results(list): one {"topic id": ..., "distribution": ...} dict
                           per entry of the document's sparse topic
                           distribution; an empty list when the document
                           yields no tokens.
        """
        tokens = self.__tokenizer.tokenize(document)
        # Any empty token sequence (not only a literal []) means there is
        # nothing to infer, so bail out before touching the engine.
        if not tokens:
            return []

        doc = LDADoc()
        self.__engine.infer(tokens, doc)
        return [{"topic id": topic.tid, "distribution": topic.prob}
                for topic in doc.sparse_topic_dist()]
Beispiel #4
0
    def cal_query_doc_similarity(self, query, document):
        """
        Score how well a query matches a document using the LDA model.

        Args:
            query(str): the input query text.
            document(str): the input document text.

        Returns:
            lda_sim(float): likelihood based similarity between query and document
                            based on LDA.
        """
        query_terms = self.__tokenizer.tokenize(query)
        doc_terms = self.__tokenizer.tokenize(document)

        # Only the document goes through topic inference; the query is
        # used as a plain list of tokens.
        lda_doc = LDADoc()
        self.__engine.infer(doc_terms, lda_doc)
        topic_dist = lda_doc.sparse_topic_dist()

        matcher = SemanticMatching()
        model = self.__engine.get_model()
        return matcher.likelihood_based_similarity(query_terms, topic_dist,
                                                   model)