Example #1
def get_clusters(args, sim_dict, nouns, noun_trend, post_cnt):
    trash_words_md5 = [util.digest(w) for w in settings["trash_words"]]
    total_md5 = util.digest("__total__")
    best_ratio = 10
    cl = []
    for k in [800, 900, 1000, 1100]:
        for i in range(int(args.i)):
            logging.info("get %s clusters, iteration %s" % (k, i))
            resp = KMeanCluster.get_clusters(sim_dict, int(k), nouns, trash_words=trash_words_md5,
                pre_clusters=[total_md5])
            ratio = resp["intra_dist"] / resp["extra_dist"]
            if ratio < best_ratio:
                best_ratio = ratio
                cl = resp["clusters"]

    logging.info("Best ratio: %s" % best_ratio)
    logging.info("Best clusters size: %s" % len(cl))
    for c in cl:
        for m in c["members"]:
            try:
                m["post_cnt"] = post_cnt[m["id"]]
            except Exception as e:
                logging.info("Mess with noun_md5 %s (%s)" % (m["id"], type(m["id"])))
                logging.error(e)
            trend = noun_trend.get(m["id"], 0)
            m["trend"] = "%.3f" % trend

    return util.filter_trash_words_cluster(cl)
Example #2
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")
    words_db = DB_DIR + "/tweets_lemma.db"
    bigram_db = DB_DIR + "/tweets_bigram.db"

    used_nouns = get_used_nouns(cur)

    total_md5 = util.digest("__total__")

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    nouns[total_md5] = "__total__"
    noun_trend["__total__"] = 0.0  
    logging.info("nouns len %s" % len(nouns))
    post_cnt = stats.get_noun_cnt(cur_word_cnt)
    post_cnt[total_md5] = 0
    
    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt)

    with open("./clusters_raw.json", "w") as out:
        json.dump(cl, out, indent=2)

    logging.info("Done")
Example #3
    def get(self):
        try:
            word = self.get_argument("word", default=None)
            time1 = self.get_argument("time1", default=None)
            time2 = self.get_argument("time2", default=None)
            logging.info("Request: %s, %s, %s" % (word, time1, time2))

            if word is None:
                return

            time1, time2 = self.parse_times(time1, time2)

            word_md5 = util.digest(word.strip())
            logging.info("Get time series for '%s' (%s)" % (word, word_md5))

            res = self.get_word_time_cnt(word_md5, time1, time2)

            res = sorted(res, key=lambda x: x[1])
            res = [{
                "hour": x[1],
                "count": x[2],
                "utc_unixtime": x[3]
            } for x in res]
            #mov_av = [0]
            #for i in range(1, len(res) -1):
            #    ma = float(res[i-1]["count"] + res[i]["count"] + res[i+1]["count"]) / 3
            #    mov_av.append(ma)
            #mov_av.append(0)

            self.write(json.dumps({"word": word_md5, "dataSeries": res}))
        except Exception as e:
            logging.error(e)
            raise
Example #4
def _save_nouns(cur, nouns, table="nouns"):
    cur.execute("begin transaction")
    for n in nouns:
        cur.execute(
            "insert or ignore into %s (noun_md5, noun) values (?, ?)" %
            (table), (util.digest(n), n))

    cur.execute("commit")
Example #5
def get_words_from_query(query):
    tokens = re.split(r'\s+', query)

    words = []

    for t in tokens:
        w = Word(t)
        w.word_md5 = util.digest(t)
        words.append(w)

    return words
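
# Hypothetical usage sketch (not part of the original codebase): assuming Word
# simply wraps the raw token, a free-text query is split on whitespace and each
# token is paired with its util.digest key for later lookups.
for w in get_words_from_query(u"погода москва"):
    print(w.word_md5)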
Example #6
def filter_silly_spam(tw):
    # Collapse tweets with identical text: the md5 of the text maps to the last
    # tweet id seen, so exact-duplicate spam survives only once.
    tw_text = {}
    for tw_id in tw:
        tw_text[util.digest(tw[tw_id].text)] = tw_id

    tw2 = {}

    for tw_md5 in tw_text:
        tw_id = tw_text[tw_md5]
        tw2[tw_id] = tw[tw_id]

    return tw2
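
# Hypothetical usage sketch (not part of the original codebase): the stub class
# stands in for the real tweet objects, which are only assumed to expose a
# .text attribute. Two exact-duplicate texts collapse into a single entry.
class _TweetStub(object):
    def __init__(self, text):
        self.text = text

tw = {
    1: _TweetStub(u"купи слона"),
    2: _TweetStub(u"купи слона"),   # exact duplicate of tweet 1
    3: _TweetStub(u"другой текст"),
}
print(sorted(filter_silly_spam(tw)))  # two ids survive, e.g. [2, 3]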
Example #7
def _add_total_to_profiles(profiles_dict, trash_words):
    trash_words_md5 = set(util.digest(w) for w in trash_words)
    total_md5 = util.digest('__total__')
    total = NounProfile(total_md5, post_cnt=0)
    for p in profiles_dict:
        if p not in trash_words_md5:
            continue
        profile = profiles_dict[p]
        for reply in profile.replys:
            if reply not in total.replys:
                total.replys[reply] = 0
            total.replys[reply] += profile.replys[reply]
        total.post_cnt += profile.post_cnt
    profiles_dict[total_md5] = total
Example #8
    def get_word_time_cnt(self, word_md5, time1, time2):
        logging.info("Get word time cnt: %s, %s, %s" %
                     (word_md5, time1, time2))
        utc_now = datetime.utcnow()
        res = []
        default_left_time_bound = (utc_now -
                                   timedelta(3)).strftime("%Y%m%d%H%M%S")[:10]
        time = ""
        if time1 is not None:
            time += " and hour >= " + str(time1)[:10]
        else:
            time += " and hour >= " + default_left_time_bound
        if time2 is not None:
            time += " and hour < " + str(time2)[:10]

        where = "word_md5 = %s" % word_md5
        if word_md5 == util.digest('0'):
            where = "1"

        mcur = stats.get_mysql_cursor(settings)
        try:
            for day in [3, 2, 1, 0]:
                date = (utc_now - timedelta(day)).strftime("%Y%m%d")
                #stats.create_mysql_tables(mcur, {"word_hour_cnt_"+date: "word_hour_cnt"})
                mcur.execute("""
                    SELECT word_md5, hour, cnt
                    FROM word_hour_cnt_%(date)s
                    WHERE %(where)s 
                    %(time)s
                """ % {
                    "where": where,
                    "time": time,
                    "date": date
                })
                while True:
                    r = mcur.fetchone()
                    if r is None:
                        break
                    word, hour, cnt = r
                    utctime = str(hour) + "0000"
                    # hour is stored in UTC, so compute the epoch offset
                    # explicitly instead of relying on the locale-dependent
                    # strftime('%s')
                    utc_unixtime = int((datetime.strptime(utctime, '%Y%m%d%H%M%S') -
                                        datetime(1970, 1, 1)).total_seconds())
                    res.append((str(word), utc_to_local(utctime), int(cnt),
                                utc_unixtime))
            logging.info("word time cnt: %s" % len(res))
        except Exception as e:
            logging.error(e)

        return res
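
# Worked example of the statement the loop above interpolates (values are
# illustrative only): for word_md5=12345, time1=20240101000000, time2=None and
# date=20240101, the query sent to MySQL is roughly:
#
#   SELECT word_md5, hour, cnt
#   FROM word_hour_cnt_20240101
#   WHERE word_md5 = 12345
#   and hour >= 2024010100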
Example #9
def dedup_tweets(tweets, all_words=True):
    dedup_tw = {}
    for tw_id in tweets:
        wordset = tweets[tw_id].all_words if all_words else tweets[tw_id].words
        words_str = [str(x) for x in sorted(wordset)]
        text_md5 = util.digest(",".join(words_str))
        if text_md5 not in dedup_tw:
            dedup_tw[text_md5] = tw_id
        elif tweets[dedup_tw[text_md5]].created_at < tweets[tw_id].created_at:
            dedup_tw[text_md5] = tw_id
    grouped_tw = {}
    for tw_id in dedup_tw.values():
        grouped_tw[tw_id] = tweets[tw_id]

    return grouped_tw
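
# Hypothetical usage sketch (not part of the original codebase): the stub class
# stands in for the real tweet objects, which are assumed to expose .words,
# .all_words and a comparable .created_at. Tweets 10 and 11 share the same word
# set, so only the newer one (11) survives.
class _TweetStub(object):
    def __init__(self, words, created_at):
        self.words = set(words)
        self.all_words = set(words)
        self.created_at = created_at

tweets = {
    10: _TweetStub([1, 2, 3], "20240101120000"),
    11: _TweetStub([3, 2, 1], "20240101130000"),
    12: _TweetStub([4, 5], "20240101110000"),
}
print(sorted(dedup_tweets(tweets)))  # -> [11, 12]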
Example #10
from molva.Indexer import Indexer
import molva.util as util

logging.config.fileConfig("logging.conf")

POST_MIN_FREQ = 10

settings = {}
try:
    with open('global-settings.json', 'r') as f:
        settings = json.load(f)
except Exception as e:
    logging.warning(e)

DB_DIR = settings["db_dir"] if "db_dir" in settings else os.environ["MOLVA_DIR"]

total_md5 = util.digest("__total__")
trash_words = [util.digest(x) for x in settings["trash_words"]]


def get_sims(cur):
    cur.execute("select post1_md5, post2_md5, sim from noun_similarity")

    sim_dict = {}
    while True:
        r = cur.fetchone()
        if r is None:
            break
        p1, p2, sim = r
        if p1 not in sim_dict:
            sim_dict[p1] = {}
        if p2 not in sim_dict:
            sim_dict[p2] = {}
        sim_dict[p1][p2] = sim
        sim_dict[p2][p1] = sim

    return sim_dict
Example #11
logging.config.fileConfig("logging.conf")

settings = {}
try:
    with open('global-settings.json', 'r') as f:
        settings = json.load(f)
except Exception as e:
    logging.warning(e)

POST_MIN_FREQ = settings["post_min_freq"] if "post_min_freq" in settings else 10

DB_DIR = settings["db_dir"] if "db_dir" in settings else os.environ["MOLVA_DIR"]

BLOCKED_NOUNS_LIST = u"\n".join(list(u"абвгдеёжзиклмнопрстуфхцчшщыьъэюя"))

BLOCKED_NOUNS = ",".join(
    str(util.digest(x)) for x in BLOCKED_NOUNS_LIST.split("\n"))

NOUNS_LIMIT = 2000


def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-c", "--clear", action="store_true")
    parser.add_argument("-p", "--profiles-table", default="post_reply_cnt")
    parser.add_argument("-o", "--out-file")
    args = parser.parse_args()

    cur = stats.get_cursor(DB_DIR + "/word_cnt.db")

    profiles_dict = stats.setup_noun_profiles(
Example #12
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")

    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    with open(tweet_index, 'r') as f:
        for line in f:
            line = line.rstrip("\n")
            if line == '':
                break
            tw_id, created_at = line.split("\t")
            ids.append((tw_id, created_at))

    logging.info("Got %s tweet ids" % len(ids))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400*1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())

                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns,
                                                    create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
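
# Hypothetical invocation sketch (not part of the original module): judging from
# the parsing code above, tweet_index is a file of "tweet_id<TAB>created_at"
# lines and facts is an XML stream of <document di="..."> elements; the file
# names and date below are made up for illustration.
parse_facts_file("tweet_index_20240101.txt", "facts_20240101.xml", "20240101")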