Example #1
    def get_word_time_cnt(self, word_md5, time1, time2):
        logging.info("Get word time cnt: %s, %s, %s" %
                     (word_md5, time1, time2))
        utc_now = datetime.utcnow()
        res = []
        # Default left bound: three days back, truncated to hour granularity (YYYYMMDDHH).
        default_left_time_bound = (utc_now -
                                   timedelta(3)).strftime("%Y%m%d%H%M%S")[:10]
        # Build the hour-range condition; time1/time2 are also cut to YYYYMMDDHH.
        time = ""
        if time1 is not None:
            time += " and hour >= " + str(time1)[:10]
        else:
            time += " and hour >= " + default_left_time_bound
        if time2 is not None:
            time += " and hour < " + str(time2)[:10]

        where = "word_md5 = %s" % word_md5
        if word_md5 == util.digest('0'):
            where = "1"

        mcur = stats.get_mysql_cursor(settings)
        try:
            for day in [3, 2, 1, 0]:
                date = (utc_now - timedelta(day)).strftime("%Y%m%d")
                #stats.create_mysql_tables(mcur, {"word_hour_cnt_"+date: "word_hour_cnt"})
                mcur.execute("""
                    SELECT word_md5, hour, cnt
                    FROM word_hour_cnt_%(date)s
                    WHERE %(where)s 
                    %(time)s
                """ % {
                    "where": where,
                    "time": time,
                    "date": date
                })
                while True:
                    r = mcur.fetchone()
                    if r is None:
                        break
                    word, hour, cnt = r
                    utctime = str(hour) + "0000"
                    utc_unixtime = datetime.strptime(
                        utctime, '%Y%m%d%H%M%S').strftime('%s')
                    res.append((str(word), utc_to_local(utctime), int(cnt),
                                utc_unixtime))
            logging.info("word time cnt: %s" % len(res))
        except Exception as e:
            logging.error(e)

        return res
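The utc_to_local helper used above is defined elsewhere in the project. A minimal sketch of such a conversion, assuming the same "%Y%m%d%H%M%S" string format (a sketch, not the project's actual implementation):

import calendar
import time
from datetime import datetime

def utc_to_local_sketch(utc_str):
    # Parse the UTC timestamp, convert through the Unix epoch, and
    # re-format it in the local timezone with the same layout.
    utc_dt = datetime.strptime(utc_str, "%Y%m%d%H%M%S")
    epoch = calendar.timegm(utc_dt.timetuple())
    return time.strftime("%Y%m%d%H%M%S", time.localtime(epoch))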
Example #2
def build_post_cnt(db_dir):
    utc_now = datetime.utcnow()
    word_cnt = stats.get_word_cnt(db_dir)
    # Materialize as a list: the tuples are consumed twice below.
    word_cnt_tuples = [(int(x), int(word_cnt[x])) for x in word_cnt.keys()]

    f_tmp = db_dir + "/word_cnt.db.tmp"
    f = db_dir + "/word_cnt.db"

    util.delete_if_exists(f_tmp)

    cur = stats.get_cursor(f_tmp)
    stats.create_given_tables(cur,
                              ["chains_nouns", "post_cnt", "post_reply_cnt"])

    save_word_cnt(cur, word_cnt_tuples)
    words = get_trending_words(db_dir, word_cnt_tuples)

    mcur = stats.get_mysql_cursor(settings)
    count_currents2(cur, mcur, utc_now, words)

    os.rename(f_tmp, f)
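Writing to word_cnt.db.tmp and then calling os.rename makes the rebuild atomic: readers of word_cnt.db never observe a half-built database. A minimal sketch of the same pattern with hypothetical paths:

import os
import sqlite3

tmp_path = "/tmp/word_cnt.db.tmp"   # hypothetical paths for illustration
final_path = "/tmp/word_cnt.db"

conn = sqlite3.connect(tmp_path)
conn.execute("CREATE TABLE IF NOT EXISTS word_cnt (word INTEGER, cnt INTEGER)")
conn.commit()
conn.close()

# rename is atomic on POSIX filesystems, so the swap is all-or-nothing.
os.rename(tmp_path, final_path)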
Example #3
def get_word_stats(clusters):
    cl_nouns = get_cluster_nouns(clusters)
    cl_nouns_joined = ",".join(map(str, cl_nouns))

    utc_now = datetime.utcnow()
    date_1day = (utc_now - timedelta(1)).strftime("%Y%m%d%H%M%S")
    # Keep the first 11 chars (YYYYMMDDHH plus the leading minute digit): a ten-minute bucket key.
    date_1day_tenminute = date_1day[:11]

    word_cnt = {}
    for day in [1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        word_time_cnt_table = "word_time_cnt_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {word_time_cnt_table: "word_time_cnt"})
        mcur.execute("""
                select word_md5, sum(cnt) 
                from %s
                where tenminute > %s
                and word_md5 in (%s)
                group by word_md5
        """ % (word_time_cnt_table, date_1day_tenminute, cl_nouns_joined))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            word_md5, cnt = map(int, res)
            if word_md5 not in word_cnt:
                word_cnt[word_md5] = 0
            word_cnt[word_md5] += cnt

            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    return word_cnt
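The word_time_cnt tables are keyed by a tenminute column: the first 11 characters of a "%Y%m%d%H%M%S" timestamp, which is also how date_1day_tenminute is derived above. A small self-contained illustration of that bucketing:

from datetime import datetime

def tenminute_key(dt):
    # Keep YYYYMMDDHH plus the first minute digit: one key per ten-minute bucket.
    return dt.strftime("%Y%m%d%H%M%S")[:11]

print(tenminute_key(datetime(2015, 2, 21, 14, 37, 5)))  # -> 20150221143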
Example #4
def get_bigram_stats(clusters, word_stats):
    cl_nouns = get_cluster_nouns(clusters)
    cl_nouns_joined = ",".join(map(str, cl_nouns))

    utc_now = datetime.utcnow()
    date_1day = (utc_now - timedelta(1)).strftime("%Y%m%d%H%M%S")
    date_1day_tenminute = date_1day[:11]

    bigram_cnt = {}
    for day in [1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        bigram_table = "bigram_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {bigram_table: "bigram_day"})
        mcur.execute("""
                select source1, word1, source2, word2, sum(cnt) 
                from %s
                where tenminute > %s
                and word1 in (%s) and word2 in (%s)
                group by source1, word1, source2, word2
        """ % (bigram_table, date_1day_tenminute, cl_nouns_joined,
               cl_nouns_joined))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            s1, w1, s2, w2, cnt = map(int, res)
            bigram = (s1, w1, s2, w2)
            if bigram not in bigram_cnt:
                bigram_cnt[bigram] = 0
            bigram_cnt[bigram] += cnt

            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    bigram_list = []
    skipped_cnt = 0
    for bigram in bigram_cnt:
        s1, w1, s2, w2 = bigram
        cnt = bigram_cnt[bigram]
        cnt_ratio = float(cnt) / word_stats[w1]
        if cnt_ratio < settings["wordstats_noise_treshold"]:
            skipped_cnt += 1
            continue
        item = {
            "source1": s1,
            "word1": w1,
            "source2": s2,
            "word2": w2,
            "count": cnt,
            "word1_count": word_stats[w1]
        }
        bigram_list.append(item)

    logging.info("Skipped noisy bigrams: %d; noise treshold: %f" %
                 (skipped_cnt, settings["wordstats_noise_treshold"]))

    return bigram_list
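The ratio check drops bigrams that are rare relative to their first word. With made-up numbers and a hypothetical threshold of 0.01:

cnt, word1_count, threshold = 30, 10000, 0.01
cnt_ratio = float(cnt) / word1_count   # 0.003
skip = cnt_ratio < threshold           # True: this bigram would be counted as noise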
Example #5
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--db-dir", default=DB_DIR)
    args = parser.parse_args()

    cur_display = stats.get_cursor(args.db_dir + "/tweets_display.db")
    cur_main = stats.get_cursor(args.db_dir + "/tweets.db")
    #cur_main = stats.get_cursor(args.db_dir + "/tweets_20150221.db")
    #nouns = stats.get_nouns(cur_main)

    #logging.info(type(nouns.keys()[0]))

    utc_now = datetime.utcnow()
    date_3day = (utc_now - timedelta(3)).strftime("%Y%m%d%H%M%S")
    date_3day_tenminute = date_3day[:11]
    logging.info("Time left bound: %s" % date_3day_tenminute)
    hour_word_cnt = {}
    word_cnt = {}
    for day in [3, 2, 1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        word_time_cnt_table = "word_time_cnt_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {word_time_cnt_table: "word_time_cnt"})
        mcur.execute("""
                select word_md5, substr(tenminute, 1, 10) as hour, sum(cnt) 
                from %s
                where tenminute > %s
                group by word_md5, hour
        """ % (word_time_cnt_table, date_3day_tenminute))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            word_md5, hour, cnt = map(int, res)
            if hour not in hour_word_cnt:
                hour_word_cnt[hour] = {}
            hour_word_cnt[hour][word_md5] = cnt
            if word_md5 not in word_cnt:
                word_cnt[word_md5] = 0
            word_cnt[word_md5] += cnt

            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    word_series = []
    hours = sorted(hour_word_cnt.keys())
    for word in word_cnt.keys():
        series = []
        series_max = 0
        for hour in hours:
            if word in hour_word_cnt[hour]:
                series.append(hour_word_cnt[hour][word])
                if hour_word_cnt[hour][word] > series_max:
                    series_max = hour_word_cnt[hour][word]
            else:
                series.append(0)
        # normalize by maxfreq in series
        if series_max > 0:
            series = [(float(x) / series_max) * 100 for x in series]
        approx = least_squares(series)
        a, b, app_ser = approx
        word_series.append({
            "word_md5": word,
            "word_cnt": word_cnt[word],
            "line_c": a,
            "slope": b,
            "delta": app_ser[-1] - app_ser[0]
        })

    word_series = sorted(word_series, key=lambda x: x["slope"],
                         reverse=True)[:2000]

    for cur in [cur_main, cur_display]:
        stats.create_given_tables(cur, {"noun_trend_new": "noun_trend"})
        cur.execute("begin transaction")
        for s in word_series:
            cur.execute("insert into noun_trend_new values (%s, %s)" %
                        (s["word_md5"], s["slope"]))

        cur.execute("drop table noun_trend")
        cur.execute("alter table noun_trend_new rename to noun_trend")
        cur.execute("commit")

    logging.info("Done")
Example #6
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")

    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    # The index file holds one "tweet_id<TAB>created_at" line per document.
    for l in open(tweet_index, 'r').read().split("\n"):
        if l == '':
            break
        tw_id, created_at = l.split("\t")
        ids.append((tw_id, created_at))

    logging.info("Got tweet %s ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400*1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())

                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns,
                                                    create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
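Calling elem.clear() after each <document> is what keeps ElementTree.iterparse memory-bounded on large facts files; without it the parsed tree keeps growing. A self-contained sketch of the same streaming pattern over a hypothetical XML file:

from xml.etree import ElementTree

def stream_documents(path):
    # Yield each <document> element's attributes, then release the element
    # so memory stays flat no matter how large the file is.
    for event, elem in ElementTree.iterparse(path, events=('end',)):
        if elem.tag == 'document':
            yield dict(elem.attrib)
            elem.clear()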