def get_related_tweets(cur, words):
    """Fetch up to 10000 tweets that mention any of the given words.

    Collects word_md5/lemma_md5 hashes from `words`, joins tweets_nouns
    against tweets, and returns a dict {tweet_id: Tweet} where each Tweet's
    .words list holds the matched noun_md5 values.
    """
    hashes = set()
    for word in words:
        if word.word_md5 is not None:
            hashes.add(str(word.word_md5))
        if word.lemma_md5 is not None:
            hashes.add(str(word.lemma_md5))
    hashes = list(hashes)

    stats.create_given_tables(cur, ["tweets", "tweets_nouns"])

    # NOTE(review): md5 values are digests (numeric), so the string-built
    # IN (...) clause is not an injection vector here.
    cur.execute("""
        select n.id, n.noun_md5, t.id, t.tw_text, t.created_at, t.username
        from tweets_nouns n
        inner join tweets t on n.id = t.id
        where noun_md5 in (%s)
        limit 10000
    """ % ",".join(hashes))

    tweets = {}
    for row in cur.fetchall():
        # n.id and t.id are equal by the join condition; keep the second.
        _, noun_md5, tw_id, tw_text, created_at, username = row
        if tw_id not in tweets:
            tweets[tw_id] = Tweet(tw_text, tw_id, created_at, username, tw_id)
        tweets[tw_id].words.append(noun_md5)

    get_tweets_nouns(cur, tweets)

    return tweets
def get_trending_words(db_dir, word_cnt_tuples):
    """Return up to 2000 word md5 strings, best ranked by count + trend.

    Loads per-word trend scores from tweets_display.db, merges them into
    tf-idf ranks built from `word_cnt_tuples`, weights both rank axes, and
    picks the words with the smallest combined rank.
    """
    cur = stats.get_cursor(db_dir + "/tweets_display.db")
    stats.create_given_tables(cur, ["noun_trend"])
    cur.execute("""
        select noun_md5, trend
        from noun_trend
        order by trend desc
        limit 2000
    """)
    word_trends = map(lambda x: (int(x[0]), float(x[1])), cur.fetchall())

    word_ranks = make_tf_idf_ranks(word_cnt_tuples)

    for word, trend in word_trends:
        if word not in word_ranks:
            logging.warn("No such word_md5 at word_ranks %s" % word)
            continue
        word_ranks[word].trend.value = trend

    Rank.weight_ranks(map(lambda r: r.trend, word_ranks.values()))
    Rank.weight_ranks(map(lambda r: r.cnt, word_ranks.values()))

    best = sorted(word_ranks.values(),
                  key=lambda r: r.cnt.rank + r.trend.rank)[:2000]
    return [str(r.word) for r in best]
def main():
    """Rebuild the noun_similarity table from a ';'-separated input file.

    Rows are staged into noun_sim_new in chunks, then swapped into
    noun_similarity inside a single transaction.
    """
    logging.info("Start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i", "--in-file")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    stats.create_given_tables(cur, ["noun_similarity"])
    # Staging table with the same schema as noun_similarity (limit 0 = no rows).
    cur.execute(
        "create table if not exists noun_sim_new as select * from noun_similarity limit 0"
    )
    cur.execute("delete from noun_sim_new")

    # FIX: the input file was opened and never closed; use a context manager.
    # Flush to the DB every ~20k parsed rows to bound memory.
    with open(args.in_file, 'r') as in_file:
        sims = []
        for line in in_file:
            sims.append(line.split(";"))
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []
        save_sims(cur, sims)

    # Replace the live table's contents atomically.
    cur.execute("begin transaction")
    cur.execute("delete from noun_similarity")
    cur.execute(
        "insert or ignore into noun_similarity select * from noun_sim_new")
    cur.execute("commit")

    logging.info("Done")
def main():
    """Build the top-clusters JSON feed.

    Scores clusters by trend, keeps the 15 strongest positive-trend ones,
    attaches relevant tweets, saves them to tweets_relevant.db, and writes
    the final JSON document to --clusters-out.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir")
    parser.add_argument("--num")
    parser.add_argument("--clusters")
    parser.add_argument("--clusters-out")
    args = parser.parse_args()

    f_out = codecs.open(args.clusters_out, 'w', encoding="utf8")

    today = date.today().strftime('%Y%m%d')
    ystd = (date.today() - timedelta(1)).strftime('%Y%m%d')

    cl = json.load(codecs.open(args.clusters, 'r', encoding="utf8"))

    today_time = (datetime.utcnow()).strftime("%Y%m%d%H%M%S")
    update_time = (datetime.now()).strftime("%Y-%m-%d %H:%M:%S")

    cur1 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, today))
    cur2 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, ystd))

    rel_tweets = []
    for x in cl:
        put_trend(x)

    filtered_cl = [x for x in cl if x["trend"] > 0.0]
    logging.info("Filtered out %d of %d (trend > 0.0)" %
                 (len(cl) - len(filtered_cl), len(cl)))

    # FIX: rank the *filtered* clusters — previously filtered_cl was
    # computed and logged but sorted() still used the unfiltered list,
    # so the trend > 0.0 filter had no effect.
    top_cl = sorted(filtered_cl, key=lambda x: x["trend"], reverse=True)[:15]

    tw_with_embed_cnt = 0
    for cluster in top_cl:
        r = get_relevant_tweets(cur1, cur2, cluster)
        rel_tweets.append(r)
        cluster["topic_density"] = r["density"]

    # NOTE(review): tw_with_embed_cnt is never incremented, so this always
    # logs 0 — confirm whether a counter update was lost.
    logging.info("Have %d topics with tweets embeds out of %d" %
                 (tw_with_embed_cnt, len(top_cl)))

    cur_rel = stats.get_cursor("%s/tweets_relevant.db" % args.dir)
    stats.create_given_tables(cur_rel, ["relevant"])
    save_relevant(cur_rel, today_time, rel_tweets)

    final_cl = {
        "clusters": top_cl,
        "update_time": update_time,
        "cluster_id": today_time
    }
    # json.dump returns None; don't bind its result.
    json.dump(final_cl, f_out)
    f_out.close()

    return
def add_new_tweets_for_tomita(self, date):
    """Queue the day's tweets that tomita has not yet processed.

    Inserts into tomita_progress every tweet id missing from it,
    using an anti-join on the day's database.
    """
    self.log.info("Index day %s" % date)
    day_cur = self.get_db_for_filename(self.dates_dbs[date])
    stats.create_given_tables(day_cur, ["tomita_progress", "tweets"])
    day_cur.execute("""
        INSERT OR IGNORE INTO tomita_progress (id)
        SELECT t.id from tweets t
        LEFT OUTER JOIN tomita_progress p
        ON t.id = p.id
        WHERE p.id is Null
    """)
def get_db_for_date(self, date):
    """Return a cursor for the given day's tweets db, caching per day.

    Only the %Y%m%d prefix of `date` is used; a trailing _%H%M%S part
    (if any) is ignored.
    """
    day = date[:8]  # assume date format %Y%m%d_%H%M%S

    # Cache hit: reuse the existing connection.
    if day in self.dates_db:
        return self.dates_db[day]

    self.log.info("Setup db connection for date " + day)
    cur = stats.get_cursor(self.db_dir + "/tweets_" + day + ".db")
    self.dates_db[day] = cur
    stats.create_given_tables(cur, ["tweets"])
    return cur
def add_new_tweets_for_statuses(self, date):
    """Queue reply-target status ids that are not yet in statuses_progress.

    Anti-joins tweets.in_reply_to_id against statuses_progress for the
    day's database and inserts the missing, non-null ids.
    """
    day_cur = self.get_db_for_filename(self.dates_dbs[date])
    stats.create_given_tables(day_cur, ["statuses_progress"])
    day_cur.execute("""
        INSERT OR IGNORE INTO statuses_progress (id)
        SELECT t.in_reply_to_id from tweets t
        LEFT OUTER JOIN statuses_progress p
        ON t.in_reply_to_id = p.id
        WHERE t.in_reply_to_id is not Null and p.id is Null
    """)
def __init__(self, db_dir, headers, days_back=7, seconds_till_user_retry=3600):
    """Initialize the fetcher: db bookkeeping, main cursor and API client.

    `headers` are passed straight through to TwitterClient;
    `days_back` / `seconds_till_user_retry` tune fetch behaviour.
    """
    self.db_dir = db_dir
    self.db_basename = "tweets"
    self.dates_db = {}          # per-day cursor cache
    self.days_back = days_back
    self.recent_users = {}      # recently fetched users, for retry throttling
    self.seconds_till_user_retry = seconds_till_user_retry

    self.log = logging.getLogger('fetcher-' + self.db_basename)

    self.main_db = stats.get_main_cursor(self.db_dir)
    stats.create_given_tables(self.main_db, ["users"])

    self.client = TwitterClient(headers)
def build_post_cnt(db_dir):
    """Rebuild word_cnt.db from the current word counts.

    Writes into a temp db file and renames it over the real one at the
    end, so readers never see a half-built database.
    """
    utc_now = datetime.utcnow()

    word_cnt = stats.get_word_cnt(db_dir)
    word_cnt_tuples = map(lambda x: (int(x), int(word_cnt[x])), word_cnt.keys())

    tmp_path = db_dir + "/word_cnt.db.tmp"
    final_path = db_dir + "/word_cnt.db"
    util.delete_if_exists(tmp_path)

    cur = stats.get_cursor(tmp_path)
    stats.create_given_tables(cur,
                              ["chains_nouns", "post_cnt", "post_reply_cnt"])

    save_word_cnt(cur, word_cnt_tuples)
    words = get_trending_words(db_dir, word_cnt_tuples)

    mcur = stats.get_mysql_cursor(settings)
    count_currents2(cur, mcur, utc_now, words)

    # Atomic swap: tmp db becomes the live db.
    os.rename(tmp_path, final_path)
def post(self):
    """Accept a JSON quality-marks payload and persist it.

    Expected JSON body keys: "marks" (required), plus optional "username",
    "update_time" (a '%Y-%m-%d %H:%M:%S'-like string, normalized to a
    numeric timestamp), "experiment_name" and "experiment_descr".
    Errors are logged and re-raised; the response body is always empty.
    """
    req_data = None
    try:
        req_data = json.loads(self.request.body)
        if req_data is not None:
            cur = stats.get_cursor(settings["db_dir"] + "/quality_marks.db")
            stats.create_given_tables(cur, ["quality_marks"])

            username = ""
            if "username" in req_data and req_data["username"] is not None:
                username = req_data["username"]

            update_time = ""
            if "update_time" in req_data and req_data[
                    "update_time"] is not None:
                update_time = req_data["update_time"]
                # Strip '-', ':' and whitespace so e.g.
                # "2015-01-01 12:00:00" -> 20150101120000.
                # FIX: raw string for the regex — '\s' inside a plain
                # literal is a deprecated escape sequence (SyntaxWarning,
                # a SyntaxError in future Python versions).
                update_time = int(re.sub(r'[-\s:]', '', update_time))

            exp_name = ""
            if "experiment_name" in req_data and req_data[
                    "experiment_name"] is not None:
                exp_name = req_data["experiment_name"]

            exp_descr = ""
            if "experiment_descr" in req_data and req_data[
                    "experiment_descr"] is not None:
                exp_descr = req_data["experiment_descr"]

            cur.execute(
                """
                insert into quality_marks (update_time, username, exp_name, exp_descr, marks)
                values (?, ?, ?, ?, ?)
            """, (update_time, username, exp_name, exp_descr,
                  json.dumps(req_data["marks"])))
    except Exception as e:
        logging.error(e)
        raise (e)

    self.write("")
def main():
    """Compute per-word trend slopes over the last 3 days.

    Aggregates hourly word counts from the MySQL word_time_cnt_* tables,
    normalizes each word's hourly series, fits a line with least_squares,
    keeps the 2000 steepest slopes, and rewrites noun_trend in both the
    main and display sqlite databases.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--db-dir", default=DB_DIR)
    args = parser.parse_args()

    cur_display = stats.get_cursor(args.db_dir + "/tweets_display.db")
    cur_main = stats.get_cursor(args.db_dir + "/tweets.db")

    utc_now = datetime.utcnow()
    date_3day = (utc_now - timedelta(3)).strftime("%Y%m%d%H%M%S")
    date_3day_tenminute = date_3day[:11]
    logging.info("Time left bound: %s" % date_3day_tenminute)

    hour_word_cnt = {}   # hour -> {word_md5 -> cnt}
    word_cnt = {}        # word_md5 -> total cnt over the window
    for day in [3, 2, 1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        word_time_cnt_table = "word_time_cnt_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {word_time_cnt_table: "word_time_cnt"})
        mcur.execute("""
            select word_md5, substr(tenminute, 1, 10) as hour, sum(cnt)
            from %s
            where tenminute > %s
            group by word_md5, hour
        """ % (word_time_cnt_table, date_3day_tenminute))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            word_md5, hour, cnt = map(int, res)
            hour_word_cnt.setdefault(hour, {})[word_md5] = cnt
            word_cnt[word_md5] = word_cnt.get(word_md5, 0) + cnt
            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    word_series = []
    hours = sorted(hour_word_cnt.keys())
    for word in word_cnt.keys():
        series = []
        series_max = 0
        for hour in hours:
            cnt = hour_word_cnt[hour].get(word, 0)
            series.append(cnt)
            if cnt > series_max:
                series_max = cnt
        # normalize by maxfreq in series
        if series_max > 0:
            series = [(float(x) / series_max) * 100 for x in series]
        a, b, app_ser = least_squares(series)
        word_series.append({
            "word_md5": word,
            "word_cnt": word_cnt[word],
            "line_c": a,
            "slope": b,
            "delta": app_ser[-1] - app_ser[0]
        })

    word_series = sorted(word_series,
                         key=lambda x: x["slope"], reverse=True)[:2000]

    # Rewrite noun_trend via a staging table in both databases.
    for cur in [cur_main, cur_display]:
        stats.create_given_tables(cur, {"noun_trend_new": "noun_trend"})
        cur.execute("begin transaction")
        for s in word_series:
            cur.execute("insert into noun_trend_new values (%s, %s)" %
                        (s["word_md5"], s["slope"]))
        cur.execute("drop table noun_trend")
        cur.execute("alter table noun_trend_new rename to noun_trend")
        cur.execute("commit")

    logging.info("Done")
def create_tables(cur):
    """Ensure the nouns and tweets_nouns tables exist on this cursor."""
    stats.create_given_tables(cur, ["nouns", "tweets_nouns"])
def parse_facts_file(tweet_index, facts, date):
    """Parse a tomita facts XML file and persist noun statistics.

    Streams <document> elements with iterparse, maps each back to its tweet
    via the tweet_index file, and saves noun sources, per-tenminute word
    counts, lemma/word bigrams and word mates in batches to sqlite + MySQL.
    """
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")
    mcur = stats.get_mysql_cursor(settings)

    # Per-day MySQL tables.
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    # tweet_index line format: "<tweet id>\t<created_at>"; document order in
    # the facts file matches line order here.
    ids = []
    for l in open(tweet_index, 'r').read().split("\n"):
        if l is None or l == '':
            break
        tw_id, created_at = l.split("\t")
        ids.append((tw_id, created_at))

    logging.info("Got tweet %s ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())
                    noun_sources.append((post_id,
                                         util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns, create_time,
                                                    bag_size=BAG_SIZE)

            # Flush batches so memory stays bounded on large facts files.
            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            # Free the parsed element; required with iterparse to cap memory.
            elem.clear()

    # Final flush of whatever is left in the batches.
    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return