def __init__(self, keyword): self.kView = 100 # 滑动窗口大小 self.cut_words = "" self.keywords = keyword self.index = Indexer("article") self.searcher = Searcher(self.index) self.doc_list = self.searcher.search(self.keywords)
def __init__(self, index_path="./index"):
    """Load ranking models and build the on-disk index if it is missing.

    :param index_path: directory holding (or to hold) the index files.
    """
    # Names of the ranking models this engine knows about.
    self._models = ["tfidf", "bm25", "fasttext", "elmo", "bert"]
    self.index_path = index_path
    self.models = self.load_models()  # project helper; loads model objects
    self._models_to_index = None
    if not self.check_index_dir():
        # Index directory missing or empty: build the index from scratch.
        indexer = Indexer(index_path=self.index_path, models=self.models)
        if not self._models_to_index:
            # NOTE(review): _models_to_index was set to None just above, so
            # this branch always fires here — dead check unless callers
            # mutate the attribute between those lines; kept as-is.
            self._models_to_index = self._models
        # NOTE(review): appending indexer.index(...)'s return value onto the
        # list of model *names* looks suspicious — confirm indexer.index()
        # really returns name strings and not index data.
        self._models += indexer.index(models=self._models_to_index)
def main():
    """Initialise the module-level ``indexer`` and ``spell_checker`` globals."""
    global indexer, spell_checker

    # Load the corpus from the precomputed ID file.
    docs = read_id_file_into_docs("ID.txt")
    # Alternative bootstrap path, kept for reference:
    # docs = read_file_into_docs("doc_dump.txt")
    # create_id_file_from_docs("ID.txt", docs)

    indexer = Indexer(docs)
    indexer.create()

    # SpellChecker(indexer, jaccard_threshold, kgram_k, max_corrected_words)
    spell_checker = SpellChecker(indexer, 0.7, 2, 3)
# NOTE(review): this chunk begins mid-method — the enclosing `def` (a
# search/ranking method that receives a term-frequency dict `tf`) lies
# before this view, so the fragment below is reproduced as-is.
candidate_doc_id = {}
# Accumulate a score per candidate document: each query term contributes
# tf(term) / (1 + df(term)) to every document on its postings list, so
# rarer terms weigh more.
for term in tf:
    if term in self.index.inverted:
        term_weight = tf[term] * 1.0 / (1 + self.index.df[term])
        for doc_id in self.index.inverted[term]:
            if doc_id in candidate_doc_id:
                candidate_doc_id[doc_id] += term_weight
            else:
                candidate_doc_id[doc_id] = term_weight
# rank by length — normalise each score by the document's text length so
# long documents are not favoured merely for containing more term hits
for doc_id in candidate_doc_id:
    candidate_doc_id[doc_id] /= len(self.index.id_doc[doc_id].text)
sorted_doc = sorted(candidate_doc_id.items(), key=operator.itemgetter(1), reverse=True)
res = []
# Return the top-10 documents by normalised score.
for (doc_id, weight) in sorted_doc[0:10]:
    res.append(self.index.id_doc[doc_id])
return res


if __name__ == '__main__':
    # Smoke test: index docs.txt and run a sample (Chinese) query.
    # NOTE(review): `print a, b, c` is Python 2 statement syntax — this file
    # will not parse under Python 3 (and print(a, b, c) would print a tuple
    # under Python 2, so it is deliberately left unchanged here).
    index = Indexer("docs.txt")
    searcher = Searcher(index)
    doclist = searcher.search("中央调查")
    for doc in doclist:
        print doc.id, doc.name, doc.text