Example #1
def get_nouns(clusters):
    cur = stats.get_main_cursor(DB_DIR)
    cl_nouns = get_cluster_nouns(clusters)

    nouns = stats.get_nouns(cur, cl_nouns)

    return nouns
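
Neither `get_cluster_nouns` nor `stats.get_nouns` is shown on this page. A minimal sketch of what `get_cluster_nouns` might look like, assuming each cluster is a dict carrying its noun ids under a "members" key (both the shape and the key name are assumptions, not the project's actual structure):

def get_cluster_nouns(clusters):
    # Hypothetical sketch: collect every noun id referenced by the clusters.
    # Assumes clusters is an iterable of dicts with a "members" list.
    noun_ids = set()
    for cluster in clusters:
        noun_ids.update(cluster.get("members", []))
    return list(noun_ids)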
Example #2
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")

    used_nouns = get_used_nouns(cur)

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    total_md5 = util.digest("__total__")
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0
    logging.info("nouns len %s" % len(nouns))

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(sim_dict, nouns, noun_trend)

    with open("./clusters_raw.json", "w") as out_file:
        json.dump(cl, out_file, indent=2)

    logging.info("Done")
Example #3
def main():
    logging.info("Start")

    parser = util.get_dates_range_parser()
    parser.add_argument("-i", "--in-file")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    cur = stats.get_main_cursor(DB_DIR)

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute(
        "create table if not exists noun_sim_new as select * from noun_similarity limit 0"
    )
    cur.execute("delete from noun_sim_new")

    sims = []
    with open(args.in_file, 'r') as in_file:
        for line in in_file:
            # strip the trailing newline so the last field is stored clean
            sims.append(line.rstrip("\n").split(";"))
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []

    save_sims(cur, sims)

    cur.execute("begin transaction")

    cur.execute("delete from noun_similarity")
    cur.execute(
        "insert or ignore into noun_similarity select * from noun_sim_new")

    cur.execute("commit")

    logging.info("Done")
Example #4
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")
    words_db = DB_DIR + "/tweets_lemma.db"
    bigram_db = DB_DIR + "/tweets_bigram.db"

    used_nouns = get_used_nouns(cur)

    total_md5 = util.digest("__total__")

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0
    logging.info("nouns len %s" % len(nouns))
    post_cnt = stats.get_noun_cnt(cur_word_cnt)
    post_cnt[total_md5] = 0

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt)

    with open("./clusters_raw.json", "w") as out_file:
        json.dump(cl, out_file, indent=2)

    logging.info("Done")
Example #5
def get_sources(bigram_stats):
    cur = stats.get_main_cursor(DB_DIR)

    source_ids = set()
    for item in bigram_stats:
        source_ids.add(item["source1"])
        source_ids.add(item["source2"])

    sources = stats.get_sources(cur, source_ids)

    return sources
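
The function expects `bigram_stats` to be an iterable of dicts with `source1` and `source2` keys holding source ids. An illustrative call; the md5 values are placeholders only:

bigram_stats = [
    {"source1": "d41d8cd98f00b204e9800998ecf8427e",
     "source2": "0cc175b9c0f1b6a831c399e269772661"},
]
sources = get_sources(bigram_stats)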
Example #6
    def __init__(self,
                 db_dir,
                 headers,
                 days_back=7,
                 seconds_till_user_retry=3600):
        db_basename = "tweets"
        self.db_dir = db_dir
        self.db_basename = db_basename
        self.dates_db = {}
        self.days_back = days_back
        self.recent_users = {}
        self.seconds_till_user_retry = seconds_till_user_retry
        self.log = logging.getLogger('fetcher-' + db_basename)

        cur = stats.get_main_cursor(self.db_dir)
        self.main_db = cur
        stats.create_given_tables(cur, ["users"])

        self.client = TwitterClient(headers)
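
A minimal usage sketch of this constructor; the class name `Fetcher` and the header contents are assumptions, only the parameter list comes from the code above:

# Hypothetical usage: the constructor opens the main tweets db, ensures the
# "users" table exists, and prepares a TwitterClient with the given headers.
headers = {"Authorization": "Bearer <token>"}
fetcher = Fetcher(DB_DIR, headers, days_back=7, seconds_till_user_retry=3600)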
Example #7
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")

    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    with open(tweet_index, 'r') as index_file:
        for l in index_file.read().split("\n"):
            if l == '':
                break
            tw_id, created_at = l.split("\t")
            ids.append((tw_id, created_at))

    logging.info("Got tweet %s ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # Optionally give sqlite a larger page cache (default 2000 KiB,
    # this would be 102400 KiB):
    # cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())

                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns,
                                                    create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
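
`cut_to_tenminute` is used above to bucket word counts by time. A plausible sketch, assuming `create_time` is a Unix timestamp in seconds (the project's real helper might instead return a formatted string):

def cut_to_tenminute(timestamp):
    # Assumed sketch: round a Unix timestamp down to the start of its
    # ten-minute bucket (600 seconds).
    return int(timestamp) - int(timestamp) % 600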