Example #1
0
    def cal_doc_keywords_similarity(self, document, top_k=10):
        """
        Find the top-k keywords of a document.

        The document is tokenized, its topic distribution is inferred with the
        LDA engine, and every distinct token is scored against that
        distribution with ``SemanticMatching.likelihood_based_similarity``.

        Args:
            document(str): the input document text.
            top_k(int): maximum number of keywords to return; values <= 0
                     yield an empty list.

        Returns:
            results(list): at most top_k dicts of the form
                     {"word": token, "similarity": score}, ordered by
                     descending similarity to the document. Empty when the
                     document yields no tokens.
        """
        d_tokens = self.__tokenizer.tokenize(document)
        # Nothing to rank, so skip topic inference entirely.
        if not d_tokens:
            return []

        # Do topic inference on the document to obtain its topic distribution.
        doc = LDADoc()
        self.__engine.infer(d_tokens, doc)
        doc_topic_dist = doc.sparse_topic_dist()

        # Neither the matcher nor the model depends on the word being scored,
        # so obtain them once instead of once per token.
        sm = SemanticMatching()
        model = self.__engine.get_model()

        scored = []
        seen = set()
        for word in d_tokens:
            # Score each distinct token once, in first-occurrence order.
            if word in seen:
                continue
            seen.add(word)
            similarity = sm.likelihood_based_similarity(
                terms=[word], doc_topic_dist=doc_topic_dist, model=model)
            scored.append((word, similarity))

        # list.sort is stable (also with reverse=True), so equally similar
        # words keep the order in which they first appear in the document.
        scored.sort(key=lambda pair: pair[1], reverse=True)

        # Clamp top_k so a negative value gives an empty result instead of
        # slicing from the end of the list.
        return [{
            "word": word,
            "similarity": similarity
        } for word, similarity in scored[:max(top_k, 0)]]
Example #2
0
    def infer_doc_topic_distribution(self, document):
        """
        Infer the topic distribution of a document.

        Args:
            document(str): the input document text.

        Returns:
            results(list): one dict per entry of the document's sparse topic
                     distribution, of the form
                     {"topic id": tid, "distribution": prob}. Empty when the
                     document yields no tokens.
        """
        tokens = self.__tokenizer.tokenize(document)
        # Truthiness (rather than `== []`) also covers None and empty
        # non-list sequences, so inference never runs on an empty document.
        if not tokens:
            return []

        doc = LDADoc()
        self.__engine.infer(tokens, doc)
        return [{
            "topic id": topic.tid,
            "distribution": topic.prob
        } for topic in doc.sparse_topic_dist()]
Example #3
0
    def cal_query_doc_similarity(self, query, document):
        """
        Score how similar a query is to a document using the LDA model.

        Both texts are tokenized; the document's topic distribution is
        inferred and the query tokens are matched against it.

        Args:
            query(str): the input query text.
            document(str): the input document text.

        Returns:
            lda_sim(float): likelihood based similarity between query and document based on LDA.
        """
        query_terms = self.__tokenizer.tokenize(query)
        doc_terms = self.__tokenizer.tokenize(document)

        # Infer the topic distribution of the document side only.
        lda_doc = LDADoc()
        self.__engine.infer(doc_terms, lda_doc)
        topic_dist = lda_doc.sparse_topic_dist()

        matcher = SemanticMatching()
        model = self.__engine.get_model()
        return matcher.likelihood_based_similarity(query_terms, topic_dist, model)