Example #1
def main():
    max_id = str(args.max_id)
    logger.info("MAX:{}".format(max_id))

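    # Build the SQL query: fetch item_id and text for all rows up to max_id with success = 1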
    query = "select item_id, text from " + sqlconfig.tweet_table_name + \
        " where id <= " + max_id + " and success = 1"

    logger.info("SQL running...")
    start = time.time()
    rows = read_table(query)
    elapsed_time = time.time() - start
    logger.info("sql_time:{0}[sec]".format(elapsed_time))

    logger.info("Indexing...")
    start = time.time()
    p = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
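    # Add each row to the index: key = item_id, value = list of nouns extracted from the tweet text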
    for i, row in enumerate(rows):
        indexer.add(row[0], noun_list(row[1]))
        p.update(i + 1)
    p.finish()
    elapsed_time = time.time() - start
    logger.info("indexing_time:{0}[sec]".format(elapsed_time))

    indexer.save("./index.pkl")
Example #2
def __init__(self, index_path="./index"):
    self._models = ["tfidf", "bm25", "fasttext", "elmo", "bert"]
    self.index_path = index_path
    self.models = self.load_models()
    self._models_to_index = None
    # Build the index when the index directory check fails
    if not self.check_index_dir():
        indexer = Indexer(index_path=self.index_path, models=self.models)
        # Default to indexing every known model
        if not self._models_to_index:
            self._models_to_index = self._models
        self._models += indexer.index(models=self._models_to_index)
Example #3
def main():
    global indexer, spell_checker

    docs = read_id_file_into_docs("ID.txt")
    # docs = read_file_into_docs("doc_dump.txt")
    # create_id_file_from_docs("ID.txt", docs)
    indexer = Indexer(docs)
    indexer.create()

    # 1st parameter: Indexer
    # 2nd parameter: Jaccard similarity threshold
    # 3rd parameter: k for the k-grams
    # 4th parameter: limit on the number of corrected words
    spell_checker = SpellChecker(indexer, 0.7, 2, 3)
Example #4
def __init__(self, keyword):
    self.kView = 100  # sliding window size
    self.cut_words = ""
    self.keywords = keyword
    self.index = Indexer("article")
    self.searcher = Searcher(self.index)
    self.doc_list = self.searcher.search(self.keywords)
Example #5
	def index(self):
		""" Index the got pages, include two steps:
		1, index the pages for all page information.
		2, pageranke the page and also store the scores to
		   accelerate the query process."""
		idb  = self.config.indexdb
		odb  = self.config.oridb
		sort = int(self.config.sort)
		
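		# Step 1: build the index database (idb) from the original page database (odb)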
		indexer = Indexer(idb)
		indexer.index(odb)
		del indexer
		
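		# Step 2: build the link graph and compute PageRank scores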
		pr = PageRanker(idb, sort)
		pr.build_links(odb)
		pr.pagerank()
		del pr
Example #6
def save_index(file_path):
    query = get_query(args.only_reply)
    logger.info('query: {}'.format(query))

    logger.info("SQL running...")
    start = time.time()
    rows = read_table(query)
    elapsed_time = time.time() - start
    logger.info("sql_time:{0}[sec]".format(elapsed_time))

    logger.info("Indexing...")
    start = time.time()
    p = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
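    # Add each row to the index: the first column is the document id, the second the text to extract nouns from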
    for i, row in enumerate(rows):
        indexer.add(row[0], noun_list(row[1]))
        p.update(i + 1)
    p.finish()
    elapsed_time = time.time() - start
    logger.info("indexing_time:{0}[sec]".format(elapsed_time))

    logger.info("Saving...")
    indexer.save(file_path)
    logger.info('Done')
Example #7
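        # Score candidate documents: for each query term, add tf(term) / (1 + df(term))
        # to every document listed under that term in the inverted index.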
        candidate_doc_id = {}
        for term in tf:
            if term in self.index.inverted:
                term_weight = tf[term] * 1.0 / (1 + self.index.df[term])
                for doc_id in self.index.inverted[term]:
                    if doc_id in candidate_doc_id:
                        candidate_doc_id[doc_id] += term_weight
                    else:
                        candidate_doc_id[doc_id] = term_weight

        # Normalize the accumulated scores by document length
        for doc_id in candidate_doc_id:
            candidate_doc_id[doc_id] /= len(self.index.id_doc[doc_id].text)

        sorted_doc = sorted(candidate_doc_id.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

        res = []
        for (doc_id, weight) in sorted_doc[0:10]:
            res.append(self.index.id_doc[doc_id])
        return res


if __name__ == '__main__':
    index = Indexer("docs.txt")
    searcher = Searcher(index)
    doclist = searcher.search("中央调查")
    for doc in doclist:
        print(doc.id, doc.name, doc.text)
Example #8
import make_dic as md
from subnetwork import SubNetwork
from index import Indexer
from six.moves import cPickle
from sqltostc import all_tweets
import sqlconfig


net = SubNetwork()
print "load tweet pairs....."
with open('tweet_dic.pkl', 'r') as f:
    source_dic = cPickle.load(f)
net.set_source(source_dic)
print "Tweet Pairs loaded: len(pairs) -> " + str(len(source_dic))
print "load index....."
indexer = Indexer()
indexer.load("./index.pkl")
print "Index loaded"
print "word count / tweet dic....."
with open('./word_count.pkl', 'r') as f:
    wc_dic = cPickle.load(f)
print "dic loaded"


def retrieve_replies(input):
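  # Extract nouns from the input text, build a sub-network around them,
  # and rank its nodes with PageRank to produce retrieval queries.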
  text = input
  noun_list = md.noun_list(text)

  net.gen_sub_network(noun_list)
  queries = net.page_rank()
  
Example #9
    def test_save_and_load(self):
        indexer = Indexer()
        indexer.add("1", ["今日", "天気", "晴れ", "今日"])
        indexer.add("2", ["今日", "天気", "雨"])

        indexer.save("./tests/index.pkl")
        indexer.load("./tests/index.pkl")

        tweet_ids = indexer.search("今日")
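        # search() results are (id, term count) pairs, ordered by descending count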
        eq_(tweet_ids[0][0], "1")
        eq_(tweet_ids[0][1], 2)
        eq_(tweet_ids[1][0], "2")
        eq_(tweet_ids[1][1], 1)

        tweet_ids = indexer.search("雨")
        eq_(len(tweet_ids), 1)
        eq_(tweet_ids[0][0], "2")