Example #1
0
def main(file_path='./index.pkl'):
    """Build an inverted index from tweets stored in SQL and pickle it.

    Args:
        file_path: Destination path for the saved index. Defaults to
            './index.pkl', which was previously hard-coded, so existing
            callers are unaffected.
    """
    max_id = str(args.max_id)
    logger.info("MAX:{}".format(max_id))

    # NOTE(review): query is built by string concatenation. args.max_id
    # presumably comes from argparse as an int, which limits injection risk,
    # but a parameterized query would be safer -- confirm read_table's API.
    query = "select item_id, text from " + sqlconfig.tweet_table_name + \
        " where id <= " + max_id + " and success = 1"

    logger.info("SQL running...")
    start = time.time()
    rows = read_table(query)
    elapsed_time = time.time() - start
    logger.info("sql_time:{0}[sec]".format(elapsed_time))

    logger.info("Indexing...")
    start = time.time()
    p = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
    for i, row in enumerate(rows):
        # row[0] = item_id, row[1] = text (per the SELECT column order above).
        indexer.add(row[0], noun_list(row[1]))
        p.update(i + 1)
    p.finish()
    elapsed_time = time.time() - start
    logger.info("indexing_time:{0}[sec]".format(elapsed_time))

    indexer.save(file_path)
Example #2
0
File: main.py Project: re-born/stc
def retrieve_replies(input):
  """Rank candidate replies for the given input text.

  Extracts nouns from *input*, builds the word sub-network, runs PageRank
  over it, then accumulates reply scores from the inverted index weighted
  by pagerank_score * idf(document_frequency).

  Args:
      input: Input text. (Parameter name shadows the builtin but is kept
          unchanged for backward compatibility with existing callers.)

  Returns:
      The normalized (reply, score) tuples produced by tuples_from_dict.
  """
  nouns = md.noun_list(input)

  net.gen_sub_network(nouns)
  queries = net.page_rank()

  # BUG FIX: the original assigned `results = {}` and immediately replaced it
  # with a defaultdict -- the first assignment was a dead store.
  results = defaultdict(int)
  for word, score in queries.items():
    tuple_list = indexer.search(word)
    df = len(tuple_list)
    if not tuple_list:
      # Preserve original behavior: idf() is never called when there are
      # no matching tuples.
      continue
    # score * idf(df) is invariant across the inner loop -- hoisted.
    s = score * idf(df)
    for tup in tuple_list:
      results = indexer.update_replies(results, tup, s)
  return tuples_from_dict(normalize(results, wc_dic))
Example #3
0
def save_index(file_path):
    """Build an inverted index from the configured SQL query and pickle it.

    Args:
        file_path: Destination path handed straight to Indexer.save.
    """
    query = get_query(args.only_reply)
    logger.info('query: {}'.format(query))

    logger.info("SQL running...")
    sql_start = time.time()
    rows = read_table(query)
    logger.info("sql_time:{0}[sec]".format(time.time() - sql_start))

    logger.info("Indexing...")
    index_start = time.time()
    progress = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(rows)).start()
    indexer = Indexer()
    for count, row in enumerate(rows, start=1):
        # row[0] is the item id, row[1] the text to tokenize.
        indexer.add(row[0], noun_list(row[1]))
        progress.update(count)
    progress.finish()
    logger.info("indexing_time:{0}[sec]".format(time.time() - index_start))

    logger.info("Saving...")
    indexer.save(file_path)
    logger.info('Done')
Example #4
0
import os
from os import path
import argparse
import logging
import time
from progressbar import ProgressBar, Percentage, Bar
from collections import defaultdict
from six.moves import cPickle
import re
import MeCab
import make_dic as md

import sqlconfig
import sqltostc

parser = argparse.ArgumentParser()
parser.add_argument('--file-path', type=str, default='./word_count.pkl')
# BUG FIX: type=bool is a classic argparse pitfall -- bool('False') is True,
# so ANY supplied value enabled the flag. action='store_true' keeps the same
# default (False) and makes --overwrite behave as a proper on/off switch.
parser.add_argument('--overwrite', action='store_true', default=False)
args = parser.parse_args()

if __name__ == '__main__':
    # Map each reply tweet id to the number of nouns in its text, then
    # pickle the mapping to --file-path.
    dic = defaultdict(int)
    tweets = sqltostc.all_tweet_pairs()
    for tweet in tweets:  # the enumerate index was unused; iterate directly
        dic[tweet['R_ID']] = len(md.noun_list(tweet['R_TEXT']))
    with open(args.file_path, 'wb') as f:
        cPickle.dump(dic, f, protocol=cPickle.HIGHEST_PROTOCOL)

Example #5
0
def main():
    """Log the text and extracted nouns for each tweet id given on the CLI."""
    for tweet_id in args.ids:
        tweet = Tweet(tweet_id)
        # NOTE(review): .encode('utf-8') yields bytes -- under Python 3 this
        # logs a b'...' repr; kept as-is to preserve existing behavior.
        logger.info('ITEM_ID:{0}, TEXT:{1}'.format(tweet.item_id, tweet.text.encode('utf-8')))
        for noun in noun_list(tweet.text):
            logger.info('  {}'.format(noun))