def __init__(self, keyword):
    """Remember the query and run it right away through a fresh Searcher.

    Args:
        keyword: Query handed unchanged to ``Searcher.search``.
    """
    self.keywords = keyword
    self.cut_words = ""
    self.kView = 100  # sliding window size

    # Create the Indexer for "article", wrap it in a Searcher, and keep
    # whatever the search returns for the given keyword.
    article_index = Indexer("article")
    self.index = article_index
    self.searcher = Searcher(article_index)
    self.doc_list = self.searcher.search(keyword)
Exemple #2
0
 def __init__(self, index_path="./index"):
     """Load the models and run the Indexer when the index-dir check fails.

     Args:
         index_path: Path passed on to ``Indexer``; defaults to "./index".
     """
     self._models = ["tfidf", "bm25", "fasttext", "elmo", "bert"]
     self.index_path = index_path
     self.models = self.load_models()
     self._models_to_index = None

     # Nothing left to do when check_index_dir() returns a truthy value.
     if self.check_index_dir():
         return

     builder = Indexer(index_path=self.index_path, models=self.models)
     # Fall back to the full model-name list when no subset has been set.
     # NOTE(review): this binds the same list object as self._models, so the
     # in-place += below also grows self._models_to_index -- confirm intended.
     if not self._models_to_index:
         self._models_to_index = self._models
     self._models += builder.index(models=self._models_to_index)
Exemple #3
0
def main():
    """Populate the module-level ``indexer`` and ``spell_checker`` globals.

    Documents are read from "ID.txt". An alternative bootstrap path, kept
    here for reference only, is ``read_file_into_docs("doc_dump.txt")``
    followed by ``create_id_file_from_docs("ID.txt", docs)``.
    """
    global indexer, spell_checker

    indexer = Indexer(read_id_file_into_docs("ID.txt"))
    indexer.create()

    # SpellChecker positional arguments, in order:
    #   indexer, Jaccard threshold, k of the k-grams, limit of corrected words
    jaccard_threshold = 0.7
    kgram_k = 2
    corrected_words_limit = 3
    spell_checker = SpellChecker(
        indexer, jaccard_threshold, kgram_k, corrected_words_limit
    )
Exemple #4
0
        candidate_doc_id = {}
        for term in tf:
            if term in self.index.inverted:
                term_weight = tf[term] * 1.0 / (1 + self.index.df[term])
                for doc_id in self.index.inverted[term]:
                    if doc_id in candidate_doc_id:
                        candidate_doc_id[doc_id] += term_weight
                    else:
                        candidate_doc_id[doc_id] = term_weight

        #rank by length
        for doc_id in candidate_doc_id:
            candidate_doc_id[doc_id] /= len(self.index.id_doc[doc_id].text)

        sorted_doc = sorted(candidate_doc_id.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

        res = []
        for (doc_id, weight) in sorted_doc[0:10]:
            res.append(self.index.id_doc[doc_id])
        return res


# Demo entry point: build an Indexer from "docs.txt", run one sample query
# through a Searcher, and print the id, name and text of every returned doc.
# NOTE(review): the print statement below is Python 2 syntax.
if __name__ == '__main__':
    index = Indexer("docs.txt")
    searcher = Searcher(index)
    doclist = searcher.search("中央调查")
    for doc in doclist:
        print doc.id, doc.name, doc.text