Example #1
from glob import glob

# `redis` (a connected redis-py client), `tofromfile` (the project's file
# serializer), and `name_tidy` are assumed to be defined elsewhere in the module.
def merge(path):
    for pos, i in enumerate(glob(path + "/*")):
        print(pos, i)
        # Each cached file maps word -> {topic: freq}.
        for word, topic_freq in tofromfile.fromfile(i).items():
            if len(word.strip()) <= 3:
                continue
            word = name_tidy(word)
            for topic, freq in topic_freq.items():
                topic = int(topic)
                # Accumulate scaled per-topic counts in a Redis hash keyed by word.
                redis.hincrby(word, topic, int(freq * 100))
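A minimal driver sketch for the function above, assuming a local Redis instance and the cache path from Example #2 below; the redis-py constructor and hincrby are real APIs, but this wiring is hypothetical:

import redis as redis_lib

# Hypothetical setup; decode_responses=True makes keys()/hgetall() return str.
redis = redis_lib.Redis(host="localhost", port=6379, decode_responses=True)
merge("/home/work/wanfang/tag")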
Example #2
from glob import glob
from json import dumps  # assumption: the original may use a different JSON library

# `tofromfile` and `name_tidy` are assumed to be defined elsewhere in the module.
def merge():
    CACHE_PATH = "/home/work/wanfang/tag"
    for pos, i in enumerate(glob(CACHE_PATH + "/*")):
        # Each cached file maps word -> {topic: freq}.
        for word, topic_freq in tofromfile.fromfile(i).items():
            if len(word.strip()) <= 3:
                continue
            word = name_tidy(word)
            s = [word]
            for topic, freq in topic_freq.items():
                topic = int(topic)
                s.append((topic, freq))
            # Emit one JSON line per word: [word, [topic, freq], ...].
            print(dumps(s))
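For reference, a hedged illustration of the line format this variant prints (word and numbers invented): json.dumps serializes the (topic, freq) tuples as nested arrays:

from json import dumps

# Purely illustrative data in the shape built by merge() above.
s = ["machine_learning", (12, 0.41), (87, 0.09)]
print(dumps(s))  # ["machine_learning", [12, 0.41], [87, 0.09]]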
Example #3
from json import loads  # assumption: the original may use a different JSON library

# `name_tidy`, `NAME2ID` (tidied tag name -> id), and `NAME2ID_SET`
# (the key set of NAME2ID) are assumed module-level globals.
def wangfang_parser(fn):
    with open(fn) as infile:
        for line in infile:
            # Each line is a JSON array; judging from the indexing below,
            # s[0] and s[1] are text fields and s[2] is a tag list.
            s = loads(line.strip())
            if not s[2]:
                continue
            txt = "\n".join(filter(bool, s[:2])).strip()
            if not txt:
                continue
            tag_list = set(name_tidy(i) for i in s[2])
            exist_tag = tag_list & NAME2ID_SET
            if exist_tag:
                exist_id_list = [NAME2ID[i] for i in exist_tag]
                yield exist_id_list, txt
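A hedged usage sketch: the generator yields (tag_id_list, text) pairs, so a caller can iterate it directly; the file name here is hypothetical:

# Hypothetical input file with one JSON record per line.
for tag_ids, txt in wangfang_parser("wanfang_dump.js"):
    print(tag_ids, txt[:40])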
Example #4
from collections import defaultdict
from json import dumps  # assumption: the original may use a different JSON library

# `redis` (a connected redis-py client, assumed created with decode_responses=True
# so keys/values come back as str), `name_tidy`, and `NAME2ID` are assumed to be
# defined elsewhere in the module.
def merge():
    topic_count = defaultdict(int)

    # First pass: total frequency per topic across all word hashes in Redis.
    keys = redis.keys("*")
    for pos, key in enumerate(keys):
        l = redis.hgetall(key)
        print("1", pos, key)
        for k, v in l.items():
            topic_count[int(k)] += int(v)

    with open("word_tf.txt", "w") as word_freq:
        for pos, word in enumerate(keys):
            tf = []
            l = redis.hgetall(word)
            for topic, freq in l.items():
                topic = int(topic)
                count = topic_count[topic]
                # Skip rare topics and normalize the rest to a common scale.
                if count < 10000:
                    continue
                freq = int(freq) * 500000 // count
                if freq > 0:
                    tf.append((topic, freq))

            fcount = sum(i[1] for i in tf)

            tf = dict(tf)
            # If the word itself is a known tag, boost its own topic to the full
            # pre-boost mass; fcount is bumped so it still equals sum(tf.values()).
            topic_id = NAME2ID.get(name_tidy(word), 0)
            if topic_id:
                t = tf.get(topic_id, 0)
                diff = fcount - t
                tf[topic_id] = fcount
                fcount += diff

            if not fcount:
                continue

            # Convert absolute frequencies to per-ten-thousand ranks.
            t = []
            for topic, f in tf.items():
                rank = f * 10000 // fcount
                if rank:
                    t.append((topic, rank))
            if t:
                word_freq.write(dumps([word, t]) + "\n")
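A worked illustration of the final rank normalization (numbers invented): each surviving topic gets an integer share of 10000, so the ranks for a word approximate a per-ten-thousand distribution:

# Hypothetical scaled frequencies for one word.
tf = {12: 300, 87: 100}
fcount = sum(tf.values())  # 400
ranks = {topic: f * 10000 // fcount for topic, f in tf.items()}
print(ranks)  # {12: 7500, 87: 2500}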