def main():
    """Index tweets with id <= args.max_id and success = 1, then save the index to ./index.pkl."""
    max_id = str(args.max_id)
    logger.info("MAX:{}".format(max_id))
    query = "select item_id, text from " + sqlconfig.tweet_table_name + \
            " where id <= " + max_id + " and success = 1"

    logger.info("SQL running...")
    start = time.time()
    rows = read_table(query)
    elapsed_time = time.time() - start
    logger.info("sql_time:{0}[sec]".format(elapsed_time))

    logger.info("Indexing...")
    start = time.time()
    p = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
    for i, row in enumerate(rows):
        indexer.add(row[0], noun_list(row[1]))
        p.update(i + 1)
    p.finish()
    elapsed_time = time.time() - start
    logger.info("indexing_time:{0}[sec]".format(elapsed_time))

    indexer.save("./index.pkl")
def __init__(self, index_path="./index"):
    self._models = ["tfidf", "bm25", "fasttext", "elmo", "bert"]
    self.index_path = index_path
    self.models = self.load_models()
    self._models_to_index = None

    # If the index directory is missing or incomplete, build the indexes now.
    if not self.check_index_dir():
        indexer = Indexer(index_path=self.index_path, models=self.models)
        if not self._models_to_index:
            self._models_to_index = self._models
        self._models += indexer.index(models=self._models_to_index)
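# check_index_dir() is not shown above; one plausible reading, assuming each
# model's index is persisted as "<index_path>/<model>.pkl", is a simple
# existence check. The file layout and function signature here are
# assumptions for illustration, not the project's actual implementation.
import os

def check_index_dir(index_path, model_names):
    """Return True only if a saved index file exists for every model."""
    return all(
        os.path.isfile(os.path.join(index_path, name + ".pkl"))
        for name in model_names
    )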
def main():
    global indexer, spell_checker

    docs = read_id_file_into_docs("ID.txt")
    # docs = read_file_into_docs("doc_dump.txt")
    # create_id_file_from_docs("ID.txt", docs)

    indexer = Indexer(docs)
    indexer.create()

    # 1. Parameter: Indexer
    # 2. Parameter: Jaccard threshold
    # 3. Parameter: k-gram k
    # 4. Parameter: limit of corrected words
    spell_checker = SpellChecker(indexer, 0.7, 2, 3)
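# A minimal sketch of the k-gram / Jaccard lookup that the SpellChecker
# parameters above suggest (threshold 0.7, k = 2, at most 3 corrections).
# The function names and the vocabulary argument are assumptions made for
# illustration, not the project's actual API.
def kgrams(word, k=2):
    """Return the set of character k-grams of a word."""
    return {word[i:i + k] for i in range(len(word) - k + 1)}

def jaccard(a, b):
    """Jaccard similarity of two k-gram sets."""
    return len(a & b) / float(len(a | b)) if (a | b) else 0.0

def correct(word, vocabulary, threshold=0.7, k=2, limit=3):
    """Return up to `limit` vocabulary words whose k-gram Jaccard
    similarity to `word` is at least `threshold`."""
    grams = kgrams(word, k)
    scored = [(jaccard(grams, kgrams(v, k)), v) for v in vocabulary]
    scored = [(score, v) for score, v in scored if score >= threshold]
    scored.sort(reverse=True)
    return [v for _, v in scored[:limit]]

# Example: correct("indexe", {"indexer", "index", "text"}) -> ["indexer", "index"]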
def __init__(self, keyword):
    self.kView = 100  # sliding window size
    self.cut_words = ""
    self.keywords = keyword
    self.index = Indexer("article")
    self.searcher = Searcher(self.index)
    self.doc_list = self.searcher.search(self.keywords)
def index(self):
    """Index the crawled pages in two steps:
    1. Index the pages for all page information.
    2. Run PageRank over the pages and store the scores to speed up queries.
    """
    idb = self.config.indexdb
    odb = self.config.oridb
    sort = int(self.config.sort)

    indexer = Indexer(idb)
    indexer.index(odb)
    del indexer

    pr = PageRanker(idb, sort)
    pr.build_links(odb)
    pr.pagerank()
    del pr
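# A minimal sketch of the power iteration that PageRanker.pagerank() presumably
# performs. The graph representation (dict of outgoing links), the damping
# factor and the iteration count are assumptions for illustration, not the
# project's implementation; dangling-node mass is simply dropped for brevity.
def pagerank(links, damping=0.85, iterations=50):
    """links: {page: [outgoing pages]}. Returns {page: score}."""
    pages = set(links) | {p for outs in links.values() for p in outs}
    n = len(pages)
    rank = {p: 1.0 / n for p in pages}
    for _ in range(iterations):
        new_rank = {p: (1.0 - damping) / n for p in pages}
        for page, outs in links.items():
            if not outs:
                continue
            share = damping * rank[page] / len(outs)
            for out in outs:
                new_rank[out] += share
        rank = new_rank
    return rank

# Example: pagerank({"a": ["b"], "b": ["a", "c"], "c": ["a"]})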
def save_index(file_path):
    query = get_query(args.only_reply)
    logger.info('query: {}'.format(query))

    logger.info("SQL running...")
    start = time.time()
    rows = read_table(query)
    elapsed_time = time.time() - start
    logger.info("sql_time:{0}[sec]".format(elapsed_time))

    logger.info("Indexing...")
    start = time.time()
    p = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
    for i, row in enumerate(rows):
        indexer.add(row[0], noun_list(row[1]))
        p.update(i + 1)
    p.finish()
    elapsed_time = time.time() - start
    logger.info("indexing_time:{0}[sec]".format(elapsed_time))

    logger.info("Saving...")
    indexer.save(file_path)
    logger.info('Done')
        candidate_doc_id = {}
        for term in tf:
            if term in self.index.inverted:
                term_weight = tf[term] * 1.0 / (1 + self.index.df[term])
                for doc_id in self.index.inverted[term]:
                    if doc_id in candidate_doc_id:
                        candidate_doc_id[doc_id] += term_weight
                    else:
                        candidate_doc_id[doc_id] = term_weight

        # normalize each candidate's score by its document length
        for doc_id in candidate_doc_id:
            candidate_doc_id[doc_id] /= len(self.index.id_doc[doc_id].text)

        sorted_doc = sorted(candidate_doc_id.items(), key=operator.itemgetter(1), reverse=True)
        res = []
        for (doc_id, weight) in sorted_doc[0:10]:
            res.append(self.index.id_doc[doc_id])
        return res


if __name__ == '__main__':
    index = Indexer("docs.txt")
    searcher = Searcher(index)
    doclist = searcher.search("中央调查")
    for doc in doclist:
        print doc.id, doc.name, doc.text
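# The scoring above relies on three structures on the Indexer: `inverted`
# (term -> doc_ids), `df` (term -> document frequency) and `id_doc`
# (doc_id -> document). For instance, with tf[term] = 2 and df[term] = 3,
# term_weight = 2 * 1.0 / (1 + 3) = 0.5. A rough sketch of how those
# structures could be built, assuming each doc exposes `id` and a
# pre-tokenized `terms` list (an assumption; the real Indexer presumably
# tokenizes "docs.txt" itself):
def build_index(docs):
    inverted, df, id_doc = {}, {}, {}
    for doc in docs:
        id_doc[doc.id] = doc
        for term in set(doc.terms):
            inverted.setdefault(term, []).append(doc.id)
            df[term] = df.get(term, 0) + 1
    return inverted, df, id_doc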
import make_dic as md
from subnetwork import SubNetwork
from index import Indexer
from six.moves import cPickle
from sqltostc import all_tweets
import sqlconfig

net = SubNetwork()

print "load tweet pairs....."
with open('tweet_dic.pkl', 'r') as f:
    source_dic = cPickle.load(f)
net.set_source(source_dic)
print "Tweet Pairs loaded: len(pairs) -> " + str(len(source_dic))

print "load index....."
indexer = Indexer()
indexer.load("./index.pkl")
print "Index loaded"

print "word count / tweet dic....."
with open('./word_count.pkl', 'r') as f:
    wc_dic = cPickle.load(f)
print "dic loaded"


def retrieve_replies(input):
    text = input
    noun_list = md.noun_list(text)
    net.gen_sub_network(noun_list)
    queries = net.page_rank()
def test_save_and_load(self):
    indexer = Indexer()
    indexer.add("1", ["今日", "天気", "晴れ", "今日"])
    indexer.add("2", ["今日", "天気", "雨"])
    indexer.save("./tests/index.pkl")

    indexer.load("./tests/index.pkl")
    tweet_ids = indexer.search("今日")
    eq_(tweet_ids[0][0], "1")
    eq_(tweet_ids[0][1], 2)
    eq_(tweet_ids[1][0], "2")
    eq_(tweet_ids[1][1], 1)

    tweet_ids = indexer.search("雨")
    eq_(len(tweet_ids), 1)
    eq_(tweet_ids[0][0], "2")
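# A minimal sketch of an indexer that would satisfy the contract the test
# asserts: search() returns (tweet_id, term_count) pairs sorted by count in
# descending order, and save()/load() round-trip the index through pickle.
# This mirrors the Indexer interface the test exercises but is an
# illustrative reconstruction, not the project's actual implementation.
import pickle
from collections import defaultdict

class MinimalIndexer(object):
    def __init__(self):
        # term -> {tweet_id: occurrence count}
        self.index = defaultdict(dict)

    def add(self, tweet_id, words):
        for word in words:
            counts = self.index[word]
            counts[tweet_id] = counts.get(tweet_id, 0) + 1

    def search(self, word):
        return sorted(self.index.get(word, {}).items(),
                      key=lambda pair: pair[1], reverse=True)

    def save(self, path):
        with open(path, "wb") as f:
            pickle.dump(dict(self.index), f)

    def load(self, path):
        with open(path, "rb") as f:
            self.index = defaultdict(dict, pickle.load(f))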