def main():
    """Index tweet texts with id <= --max-id and persist the index to ./index.pkl."""
    max_id = str(args.max_id)
    logger.info("MAX:{}".format(max_id))
    # NOTE(review): query assembled by string concatenation; max_id comes from
    # CLI args, but a parameterized query would still be safer — confirm
    # whether read_table supports bind parameters.
    query = ("select item_id, text from "
             + sqlconfig.tweet_table_name
             + " where id <= " + max_id + " and success = 1")
    logger.info("SQL running...")
    sql_start = time.time()
    rows = read_table(query)
    logger.info("sql_time:{0}[sec]".format(time.time() - sql_start))
    logger.info("Indexing...")
    index_start = time.time()
    progress = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
    # Each row is (item_id, text); index the nouns of the text under its id.
    for position, row in enumerate(rows, start=1):
        indexer.add(row[0], noun_list(row[1]))
        progress.update(position)
    progress.finish()
    logger.info("indexing_time:{0}[sec]".format(time.time() - index_start))
    indexer.save("./index.pkl")
def retrieve_replies(input):
    """Retrieve and score candidate replies for the given input text.

    Builds a noun sub-network from the text, ranks the nouns with PageRank,
    then accumulates tf-idf-style scores over the replies found in the index
    for each ranked noun. Returns the normalized results as tuples
    (via ``tuples_from_dict``).

    NOTE(review): the parameter shadows the builtin ``input``; the name is
    kept to preserve the interface for keyword callers.
    """
    text = input
    # Renamed from `noun_list`: the original local shadowed the module-level
    # noun_list() function used by sibling blocks in this file.
    nouns = md.noun_list(text)
    net.gen_sub_network(nouns)
    queries = net.page_rank()
    # Fix: the original assigned `results = {}` and then immediately
    # overwrote it with a defaultdict — the dead assignment is removed.
    results = defaultdict(int)
    for word, score in queries.items():
        tuple_list = indexer.search(word)
        df = len(tuple_list)  # document frequency of this noun
        # Hoisted out of the inner loop: score and df are fixed per word.
        weighted = score * idf(df)
        for tup in tuple_list:
            results = indexer.update_replies(results, tup, weighted)
    return tuples_from_dict(normalize(results, wc_dic))
def save_index(file_path):
    """Build an inverted index from the configured tweet query and write it to *file_path*."""
    query = get_query(args.only_reply)
    logger.info('query: {}'.format(query))
    logger.info("SQL running...")
    sql_start = time.time()
    records = read_table(query)
    logger.info("sql_time:{0}[sec]".format(time.time() - sql_start))
    logger.info("Indexing...")
    index_start = time.time()
    bar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(records)).start()
    idx = Indexer()
    # Each record is (item_id, text); index the nouns of the text under its id.
    position = 0
    for record in records:
        idx.add(record[0], noun_list(record[1]))
        position += 1
        bar.update(position)
    bar.finish()
    logger.info("indexing_time:{0}[sec]".format(time.time() - index_start))
    logger.info("Saving...")
    idx.save(file_path)
    logger.info('Done')
import os
from os import path
import argparse
import logging
import re
import time
from collections import defaultdict

import MeCab
from progressbar import ProgressBar, Percentage, Bar
from six.moves import cPickle

import make_dic as md
import sqlconfig
import sqltostc


def _str2bool(value):
    """Parse a boolean CLI value ('true'/'false', 'yes'/'no', '1'/'0').

    Fix: the original used ``type=bool``, which is broken in argparse —
    ``bool('False')`` is True, so any non-empty value enabled the flag.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ('yes', 'true', 't', '1'):
        return True
    if lowered in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got {!r}'.format(value))


parser = argparse.ArgumentParser()
parser.add_argument('--file-path', type=str, default='./word_count.pkl')
# Fix: was `type=bool` (any non-empty string parsed as True).
parser.add_argument('--overwrite', type=_str2bool, default=False)
args = parser.parse_args()

if __name__ == '__main__':
    # Map reply-tweet id -> number of nouns in the reply text.
    dic = defaultdict(int)
    tweets = sqltostc.all_tweet_pairs()
    # The enumerate index in the original loop was unused; plain iteration suffices.
    for tweet in tweets:
        dic[tweet['R_ID']] = len(md.noun_list(tweet['R_TEXT']))
    with open(args.file_path, 'wb') as f:
        cPickle.dump(dic, f, protocol=cPickle.HIGHEST_PROTOCOL)
def main():
    """Log the text and extracted nouns of each tweet named by args.ids."""
    for tweet_id in args.ids:
        tweet = Tweet(tweet_id)
        # NOTE(review): .encode('utf-8') suggests Python 2 era logging; on
        # Python 3 this logs a bytes repr — kept as-is to preserve behavior.
        logger.info('ITEM_ID:{0}, TEXT:{1}'.format(tweet.item_id, tweet.text.encode('utf-8')))
        for noun in noun_list(tweet.text):
            logger.info('    {}'.format(noun))