Code example #1
def get_related_tweets(cur, words):
    word_md5s = set()
    for w in words:
        if w.word_md5 is not None:
            word_md5s.add(str(w.word_md5))
        if w.lemma_md5 is not None:
            word_md5s.add(str(w.lemma_md5))
    word_md5s = list(word_md5s)

    stats.create_given_tables(cur, ["tweets", "tweets_nouns"])

    tweets = {}

    cur.execute("""
        select n.id, n.noun_md5, t.id, t.tw_text, t.created_at, t.username
        from tweets_nouns n
        inner join tweets t
        on n.id = t.id
        where noun_md5 in (%s)
        limit 10000
    """ % ",".join(word_md5s))

    r = cur.fetchall()

    for l in r:
        tw_id, noun_md5, _, tw_text, created_at, username = l  # n.id == t.id because of the join
        if tw_id not in tweets:
            tweets[tw_id] = Tweet(tw_text, tw_id, created_at, username, tw_id)

        tweets[tw_id].words.append(noun_md5)

    get_tweets_nouns(cur, tweets)

    return tweets
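A note on the query in example #1: the md5 list is spliced into the SQL string with "%", which breaks when word_md5s is empty and bypasses parameter binding. A minimal sketch of the same query with bound parameters, assuming a sqlite3-style cursor that accepts "?" placeholders (not part of the original code):

    # Sketch only: bind the md5 values instead of formatting them into the SQL.
    if word_md5s:
        placeholders = ",".join("?" * len(word_md5s))
        cur.execute("""
            select n.id, n.noun_md5, t.id, t.tw_text, t.created_at, t.username
            from tweets_nouns n
            inner join tweets t on n.id = t.id
            where n.noun_md5 in (%s)
            limit 10000
        """ % placeholders, word_md5s)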
Code example #2
def get_trending_words(db_dir, word_cnt_tuples):
    cur = stats.get_cursor(db_dir + "/tweets_display.db")

    stats.create_given_tables(cur, ["noun_trend"])
    cur.execute("""
        select noun_md5, trend 
        from noun_trend
        order by trend desc
        limit 2000
    """)
    word_trends = map(lambda x: (int(x[0]), float(x[1])), cur.fetchall())

    word_ranks = make_tf_idf_ranks(word_cnt_tuples)

    for w in word_trends:
        word, trend = w
        if word not in word_ranks:
            logging.warning("No such word_md5 in word_ranks: %s" % word)
            continue
        word_ranks[word].trend.value = trend

    Rank.weight_ranks(map(lambda x: x.trend, word_ranks.values()))
    Rank.weight_ranks(map(lambda x: x.cnt, word_ranks.values()))

    words = []
    for word_rank in sorted(word_ranks.values(),
                            key=lambda x: x.cnt.rank + x.trend.rank)[:2000]:
        words.append(str(word_rank.word))

    return words
Code example #3
File: post-profiles.py  Project: lonlylocly/molva
def main():
    logging.info("Start")

    parser = util.get_dates_range_parser()
    parser.add_argument("-i", "--in-file")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    cur = stats.get_main_cursor(DB_DIR)

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute(
        "create table if not exists noun_sim_new as select * from noun_similarity limit 0"
    )
    cur.execute("delete from noun_sim_new")

    sims = []
    with open(args.in_file, 'r') as in_file:
        for line in in_file:
            # drop the trailing newline before splitting the ";"-separated fields
            sims.append(line.rstrip("\n").split(";"))
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []

    save_sims(cur, sims)

    cur.execute("begin transaction")

    cur.execute("delete from noun_similarity")
    cur.execute(
        "insert or ignore into noun_similarity select * from noun_sim_new")

    cur.execute("commit")

    logging.info("Done")
Code example #4
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--dir")
    parser.add_argument("--num")
    parser.add_argument("--clusters")
    parser.add_argument("--clusters-out")

    args = parser.parse_args()

    f_out = codecs.open(args.clusters_out, 'w', encoding="utf8")

    today = date.today().strftime('%Y%m%d')
    ystd = (date.today() - timedelta(1)).strftime('%Y%m%d')

    cl = json.load(codecs.open(args.clusters, 'r', encoding="utf8"))

    today_time = (datetime.utcnow()).strftime("%Y%m%d%H%M%S")
    update_time = (datetime.now()).strftime("%Y-%m-%d %H:%M:%S")

    cur1 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, today))
    cur2 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, ystd))

    rel_tweets = []
    for x in cl:
        put_trend(x)
    filtered_cl = [x for x in cl if x["trend"] > 0.0]
    logging.info("Filtered out %d of %d clusters (kept trend > 0.0)" %
                 (len(cl) - len(filtered_cl), len(cl)))

    top_cl = sorted(filtered_cl, key=lambda x: x["trend"], reverse=True)[:15]
    tw_with_embed_cnt = 0
    for cluster in top_cl:
        r = get_relevant_tweets(cur1, cur2, cluster)
        rel_tweets.append(r)
        cluster["topic_density"] = r["density"]

    logging.info("Have %d topics with tweets embeds out of %d" %
                 (tw_with_embed_cnt, len(top_cl)))

    cur_rel = stats.get_cursor("%s/tweets_relevant.db" % args.dir)
    stats.create_given_tables(cur_rel, ["relevant"])
    save_relevant(cur_rel, today_time, rel_tweets)

    final_cl = {
        "clusters": top_cl,
        "update_time": update_time,
        "cluster_id": today_time
    }
    json.dump(final_cl, f_out)
    f_out.close()

    return
Code example #5
    def add_new_tweets_for_tomita(self, date):
        self.log.info("Index day %s" % date)
        cur = self.get_db_for_filename(self.dates_dbs[date])
        stats.create_given_tables(cur, ["tomita_progress", "tweets"])

        cur.execute("""
            INSERT OR IGNORE INTO tomita_progress (id)
            SELECT t.id from tweets t
            LEFT OUTER JOIN tomita_progress p
            ON t.id = p.id
            WHERE p.id is Null
        """)
Code example #6
File: Fetcher.py  Project: lonlylocly/molva
    def get_db_for_date(self, date):
        date = date[:8]  # assume date format %Y%m%d_%H%M%S

        if date in self.dates_db:
            return self.dates_db[date]
        else:
            self.log.info("Setup db connection for date " + date)
            cur = stats.get_cursor(self.db_dir + "/tweets_" + date + ".db")
            self.dates_db[date] = cur
            stats.create_given_tables(cur, ["tweets"])

            return cur
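The stats.create_given_tables helper used throughout these examples belongs to the molva project and its source is not shown on this page. Judging only from the two call forms seen here (a list of table names, or a dict mapping a new table name to the schema of an existing one), a rough sketch might look like the following; the TABLE_SCHEMAS dict and the DDL strings are assumptions for illustration, not the project's actual code:

    # Hypothetical sketch of create_given_tables; the real implementation may differ.
    TABLE_SCHEMAS = {
        "tweets": "(id integer primary key, tw_text text, created_at text, username text)",
        # ... one entry per table referenced in the examples ...
    }

    def create_given_tables(cur, tables):
        # Accept either ["tweets", ...] or {"new_table": "schema_of_existing_table", ...}
        items = tables.items() if isinstance(tables, dict) else [(t, t) for t in tables]
        for name, schema_name in items:
            cur.execute("create table if not exists %s %s"
                        % (name, TABLE_SCHEMAS[schema_name]))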
Code example #7
    def add_new_tweets_for_statuses(self, date):
        cur = self.get_db_for_filename(self.dates_dbs[date])
        stats.create_given_tables(cur, ["statuses_progress"])

        cur.execute("""
            INSERT OR IGNORE INTO statuses_progress (id)
            SELECT t.in_reply_to_id from tweets t
            LEFT OUTER JOIN statuses_progress p
            ON t.in_reply_to_id = p.id
            WHERE
            t.in_reply_to_id is not Null
            and p.id is Null
        """)
Code example #8
File: Fetcher.py  Project: lonlylocly/molva
    def __init__(self,
                 db_dir,
                 headers,
                 days_back=7,
                 seconds_till_user_retry=3600):
        db_basename = "tweets"
        self.db_dir = db_dir
        self.db_basename = db_basename
        self.dates_db = {}
        self.days_back = days_back
        self.recent_users = {}
        self.seconds_till_user_retry = seconds_till_user_retry
        self.log = logging.getLogger('fetcher-' + db_basename)

        cur = stats.get_main_cursor(self.db_dir)
        self.main_db = cur
        stats.create_given_tables(cur, ["users"])

        self.client = TwitterClient(headers)
Code example #9
def build_post_cnt(db_dir):
    utc_now = datetime.utcnow()
    word_cnt = stats.get_word_cnt(db_dir)
    word_cnt_tuples = map(lambda x: (int(x), int(word_cnt[x])),
                          word_cnt.keys())

    f_tmp = db_dir + "/word_cnt.db.tmp"
    f = db_dir + "/word_cnt.db"

    util.delete_if_exists(f_tmp)

    cur = stats.get_cursor(f_tmp)
    stats.create_given_tables(cur,
                              ["chains_nouns", "post_cnt", "post_reply_cnt"])

    save_word_cnt(cur, word_cnt_tuples)
    words = get_trending_words(db_dir, word_cnt_tuples)

    mcur = stats.get_mysql_cursor(settings)
    count_currents2(cur, mcur, utc_now, words)

    os.rename(f_tmp, f)
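Example #9 builds the new word_cnt database in a temporary file and only then moves it over the live one, so readers never see a half-written database. A minimal standalone sketch of that swap (hypothetical helper name; os.rename atomically replaces the destination on POSIX when both paths are on the same filesystem):

    import os

    # Sketch only: replace the live DB with the freshly built one in a single step.
    def swap_in_new_db(tmp_path, final_path):
        os.rename(tmp_path, final_path)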
Code example #10
    def post(self):
        req_data = None
        try:
            req_data = json.loads(self.request.body)

            if req_data is not None:
                cur = stats.get_cursor(settings["db_dir"] +
                                       "/quality_marks.db")
                stats.create_given_tables(cur, ["quality_marks"])
                username = ""
                if "username" in req_data and req_data["username"] is not None:
                    username = req_data["username"]
                update_time = ""
                if "update_time" in req_data and req_data[
                        "update_time"] is not None:
                    update_time = req_data["update_time"]
                    update_time = int(re.sub(r'[-\s:]', '', update_time))
                exp_name = ""
                if "experiment_name" in req_data and req_data[
                        "experiment_name"] is not None:
                    exp_name = req_data["experiment_name"]
                exp_descr = ""
                if "experiment_descr" in req_data and req_data[
                        "experiment_descr"] is not None:
                    exp_descr = req_data["experiment_descr"]

                cur.execute(
                    """
                    insert into quality_marks 
                    (update_time, username, exp_name, exp_descr,  marks) 
                    values (?, ?, ?, ?, ?)
                """, (update_time, username, exp_name, exp_descr,
                      json.dumps(req_data["marks"])))

        except Exception as e:
            logging.error(e)
            raise

        self.write("")
Code example #11
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--db-dir", default=DB_DIR)
    args = parser.parse_args()

    cur_display = stats.get_cursor(args.db_dir + "/tweets_display.db")
    cur_main = stats.get_cursor(args.db_dir + "/tweets.db")
    #cur_main = stats.get_cursor(args.db_dir + "/tweets_20150221.db")
    #nouns = stats.get_nouns(cur_main)

    #logging.info(type(nouns.keys()[0]))

    utc_now = datetime.utcnow()
    date_3day = (utc_now - timedelta(3)).strftime("%Y%m%d%H%M%S")
    date_3day_tenminute = date_3day[:11]
    logging.info("Time left bound: %s" % date_3day_tenminute)
    hour_word_cnt = {}
    word_cnt = {}
    for day in [3, 2, 1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        word_time_cnt_table = "word_time_cnt_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {word_time_cnt_table: "word_time_cnt"})
        mcur.execute("""
                select word_md5, substr(tenminute, 1, 10) as hour, sum(cnt) 
                from %s
                where tenminute > %s
                group by word_md5, hour
        """ % (word_time_cnt_table, date_3day_tenminute))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            word_md5, hour, cnt = map(int, res)
            if hour not in hour_word_cnt:
                hour_word_cnt[hour] = {}
            hour_word_cnt[hour][word_md5] = cnt
            if word_md5 not in word_cnt:
                word_cnt[word_md5] = 0
            word_cnt[word_md5] += cnt

            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    word_series = []
    hours = sorted(hour_word_cnt.keys())
    for word in word_cnt.keys():
        series = []
        series_max = 0
        for hour in hours:
            if word in hour_word_cnt[hour]:
                series.append(hour_word_cnt[hour][word])
                if hour_word_cnt[hour][word] > series_max:
                    series_max = hour_word_cnt[hour][word]
            else:
                series.append(0)
        # normalize by maxfreq in series
        if series_max > 0:
            series = [(float(x) / series_max) * 100 for x in series]
        approx = least_squares(series)
        a, b, app_ser = approx
        word_series.append({
            "word_md5": word,
            "word_cnt": word_cnt[word],
            "line_c": a,
            "slope": b,
            "delta": app_ser[-1] - app_ser[0]
        })

    word_series = sorted(word_series, key=lambda x: x["slope"],
                         reverse=True)[:2000]

    for cur in [cur_main, cur_display]:
        stats.create_given_tables(cur, {"noun_trend_new": "noun_trend"})
        cur.execute("begin transaction")
        for s in word_series:
            cur.execute("insert into noun_trend_new values (%s, %s)" %
                        (s["word_md5"], s["slope"]))

        cur.execute("drop table noun_trend")
        cur.execute("alter table noun_trend_new rename to noun_trend")
        cur.execute("commit")

    logging.info("Done")
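In example #11 the per-row insert interpolates values into the SQL string. With a sqlite3-style cursor the same loop can be written with bound parameters, for instance (a sketch, not the original code):

    # Sketch only: one bulk insert with bound parameters instead of per-row string formatting.
    cur.executemany("insert into noun_trend_new values (?, ?)",
                    [(s["word_md5"], s["slope"]) for s in word_series])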
Code example #12
File: parsefacts.py  Project: lonlylocly/molva
def create_tables(cur):
    stats.create_given_tables(cur, ["nouns", "tweets_nouns"])
Code example #13
File: parsefacts.py  Project: lonlylocly/molva
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")

    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    with open(tweet_index, 'r') as f:
        for l in f.read().split("\n"):
            if l == '':
                break
            tw_id, created_at = l.split("\t")
            ids.append((tw_id, created_at))

    logging.info("Got tweet %s ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400*1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())

                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns,
                                                    create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return