Exemple #1
0
def main():
    t = {}
    for k, v in NAME2ID.iteritems():
        t[name_tidy(k)] = v
    print "#coding: utf-8"
    print "NAME2ID = ",
    pprint(t)
Exemple #2
0
def main():
    t = {}
    for k,v in NAME2ID.iteritems():
        t[name_tidy(k)] = v
    print "#coding: utf-8"
    print "NAME2ID = ",
    pprint(t)
def tag_to_id(tag):
    """Resolve ``tag`` to an id, returning 0 when nothing matches.

    The tag is stringified and looked up in id2topic; the result is then
    translated through ID2MY and accepted only if it is a member of
    myidset. If that path yields nothing truthy, the tidied tag is looked up
    directly in NAME2ID.
    """
    tag = str(tag)

    resolved = 0
    topic_key = id2topic.get(tag, 0)
    if topic_key in ID2MY:
        candidate = ID2MY[topic_key]
        if candidate in myidset:
            resolved = candidate

    if resolved:
        return resolved

    # Fallback: match on the normalised tag name.
    return NAME2ID.get(name_tidy(tag), 0)
def tag_to_id(tag):
    """Map ``tag`` to an id, or 0 if it cannot be resolved.

    First attempt: str(tag) -> id2topic -> ID2MY, keeping the result only
    when it belongs to myidset. Second attempt, used when the first gives a
    falsy result: NAME2ID lookup on name_tidy(tag).
    """
    tag = str(tag)

    resolved = 0
    topic_key = id2topic.get(tag, 0)
    if topic_key in ID2MY:
        candidate = ID2MY[topic_key]
        if candidate in myidset:
            resolved = candidate

    if resolved:
        return resolved

    # Nothing usable via the topic tables; try the tidied name instead.
    return NAME2ID.get(name_tidy(tag), 0)
Exemple #5
0
def merge():
    """Aggregate redis hash counts per topic and write ranked rows to "word_tf.txt".

    Pass 1 walks every key returned by ``redis.keys("*")`` and sums the hash
    values per integer field into ``topic_count``.

    Pass 2 re-reads each hash, scales every value by
    ``500000 / topic_count[topic]`` (topics whose total is below 10000 are
    skipped), optionally boosts the entry whose id equals the NAME2ID id of
    the tidied key, converts the entries to shares out of 10000 and writes
    ``dumps([word, [(topic, rank), ...]])`` plus a newline for every key that
    keeps at least one non-zero rank.
    """
    topic_count = defaultdict(int)

    # NOTE(review): this string is never read; ``f`` is rebound by the loop
    # over ``tf`` further down.
    f = "word2count.txt"

    keys = redis.keys("*")
    # Pass 1: total of every hash value, grouped by integer field.
    for pos, key in enumerate(keys):
        l = redis.hgetall(key)
        # Progress output.
        print "1",pos, key
        for k,v in l.iteritems():
            topic_count[int(k)]+=int(v)

    #word_topic_freq = defaultdict(list)

    with open("word_tf.txt", "w") as word_freq:
        # Pass 2: one output row per key, built from the same hashes.
        for pos, word in enumerate(keys):
            tf = []
            l = redis.hgetall(word)
            for topic, freq in l.iteritems():
                topic = int(topic)
                count = topic_count[topic]
                # Ignore topics whose overall total is below 10000.
                if count < 10000:
                    continue
                # NOTE(review): under Python 2 (this block uses the print
                # statement) "/" on two ints floors unless true division is
                # enabled elsewhere in the file - confirm that is intended.
                freq = int(freq)*500000/count
                # Entries that scale down to zero (or below) are dropped.
                if freq > 0:
                    tf.append((topic, freq))

            # Sum of the scaled values kept for this key.
            fcount = sum(i[1] for i in tf)

            tf = dict(tf)
            id = NAME2ID.get(name_tidy(word), 0)
            if id:
                # The key itself names an id in NAME2ID: raise that id's
                # entry to the whole pre-boost sum and grow the total by the
                # amount that was added, so fcount stays the sum of tf.
                t = tf.get(id,0)
                diff = fcount - t
                tf[id] = fcount
                fcount += diff

            # Nothing survived the filters (also avoids dividing by zero).
            if not fcount:
                continue

            # Convert each entry to its share of the total, out of 10000;
            # shares that come out as zero are left out.
            t = []
            for topic, f in tf.iteritems():
                rank = int(f*10000/fcount)
                if rank:
                    t.append((topic, rank))
            if t:
                word_freq.write(
                    dumps([word, t])+"\n"
                )
Exemple #6
0
def merge():
    """Aggregate redis hash counts per topic and write ranked rows to "word_tf.txt".

    Pass 1 walks every key returned by ``redis.keys("*")`` and sums the hash
    values per integer field into ``topic_count``.

    Pass 2 re-reads each hash, scales every value by
    ``500000 / topic_count[topic]`` (topics whose total is below 10000 are
    skipped), optionally boosts the entry whose id equals the NAME2ID id of
    the tidied key, converts the entries to shares out of 10000 and writes
    ``dumps([word, [(topic, rank), ...]])`` plus a newline for every key that
    keeps at least one non-zero rank.
    """
    topic_count = defaultdict(int)

    # NOTE(review): this string is never read; ``f`` is rebound by the loop
    # over ``tf`` further down.
    f = "word2count.txt"

    keys = redis.keys("*")
    # Pass 1: total of every hash value, grouped by integer field.
    for pos, key in enumerate(keys):
        l = redis.hgetall(key)
        # Progress output.
        print "1", pos, key
        for k, v in l.iteritems():
            topic_count[int(k)] += int(v)

    #word_topic_freq = defaultdict(list)

    with open("word_tf.txt", "w") as word_freq:
        # Pass 2: one output row per key, built from the same hashes.
        for pos, word in enumerate(keys):
            tf = []
            l = redis.hgetall(word)
            for topic, freq in l.iteritems():
                topic = int(topic)
                count = topic_count[topic]
                # Ignore topics whose overall total is below 10000.
                if count < 10000:
                    continue
                # NOTE(review): under Python 2 (this block uses the print
                # statement) "/" on two ints floors unless true division is
                # enabled elsewhere in the file - confirm that is intended.
                freq = int(freq) * 500000 / count
                # Entries that scale down to zero (or below) are dropped.
                if freq > 0:
                    tf.append((topic, freq))

            # Sum of the scaled values kept for this key.
            fcount = sum(i[1] for i in tf)

            tf = dict(tf)
            id = NAME2ID.get(name_tidy(word), 0)
            if id:
                # The key itself names an id in NAME2ID: raise that id's
                # entry to the whole pre-boost sum and grow the total by the
                # amount that was added, so fcount stays the sum of tf.
                t = tf.get(id, 0)
                diff = fcount - t
                tf[id] = fcount
                fcount += diff

            # Nothing survived the filters (also avoids dividing by zero).
            if not fcount:
                continue

            # Convert each entry to its share of the total, out of 10000;
            # shares that come out as zero are left out.
            t = []
            for topic, f in tf.iteritems():
                rank = int(f * 10000 / fcount)
                if rank:
                    t.append((topic, rank))
            if t:
                word_freq.write(dumps([word, t]) + "\n")
#coding:utf-8

import _env
from json import loads
from zhihu_topic_data_with_follow import ZHIHU_TOPIC
from name2id import NAME2ID
from zdata.tag.name_tidy import name_tidy
from zhihu_topic_url2id import ID2MY
from itertools import chain
from zhihu_question_load import zhihu_to_dump

# Lookup keyed by element [1] of each ZHIHU_TOPIC entry, giving element [0].
id2topic = dict([(i[1], i[0]) for i in ZHIHU_TOPIC])

# All values stored in NAME2ID, collected into a set.
myidset = set(NAME2ID.itervalues())
# Inverse of NAME2ID: each NAME2ID value mapped back to its key.
myiddict = dict([(k, v) for v, k in NAME2ID.iteritems()])


def tag_id_list_by_str_list(tags):
    """Return the ids tag_to_id resolves for ``tags``, in input order.

    Tags for which tag_to_id returns a falsy value are left out of the
    result; duplicates are kept.
    """
    candidates = (tag_to_id(tag) for tag in tags)
    return [tag_id for tag_id in candidates if tag_id]


def tag_to_id(tag):
Exemple #8
0
#coding:utf-8
import _env
from name2id import NAME2ID
from zkit.txt_cleanup import sp_txt
from collections import defaultdict
from zkit.pprint import pprint

# Fragment index: every NAME2ID name is filed under each piece that sp_txt
# yields for it.
sp2id = defaultdict(list)

for k in NAME2ID:
    for i in sp_txt(k):
        sp2id[i].append(k)

# For each name j that strictly contains another name k as a substring,
# record k's id under j's id. Candidates for j come from the names sharing
# at least one fragment with k.
word_parent = defaultdict(set)

for k, v in NAME2ID.iteritems():
    for i in sp_txt(k):
        for j in sp2id[i]:
            if j == k or k not in j:
                continue
            # v is NAME2ID[k] for the name currently being iterated.
            word_parent[NAME2ID[j]].add(v)

# Reverse view of NAME2ID (value -> key).
id2name = dict((tag_id, name) for name, tag_id in NAME2ID.iteritems())

# Disabled debug dump of the relation:
#for id, pid_list in word_parent.iteritems():
#    print id2name[id]
#    for i in pid_list:
#        print id2name[i],
#    print "\n"

# Replace the defaultdict of sets with a plain dict of tuples.
word_parent = dict(
    (tag_id, tuple(related_ids))
    for tag_id, related_ids in word_parent.iteritems()
)
Exemple #9
0
#coding:utf-8
import _env
from name2id import NAME2ID
from zkit.txt_cleanup import sp_txt
from collections import defaultdict
from zkit.pprint import pprint

# Fragment index: each name in NAME2ID is listed under every piece that
# sp_txt produces for it.
sp2id = defaultdict(list)

for k in NAME2ID:
    for i in sp_txt(k):
        sp2id[i].append(k)

# Whenever a name k occurs inside a different name j, store k's id in the
# set kept under j's id. Only names sharing a fragment with k are examined.
word_parent = defaultdict(set)

for k, v in NAME2ID.iteritems():
    for i in sp_txt(k):
        for j in sp2id[i]:
            if j == k or k not in j:
                continue
            # v equals NAME2ID[k] for the current iteration.
            word_parent[NAME2ID[j]].add(v)

# NAME2ID turned around: value -> key.
id2name = dict((tag_id, name) for name, tag_id in NAME2ID.iteritems())

# Disabled debug dump of the relation:
#for id, pid_list in word_parent.iteritems():
#    print id2name[id]
#    for i in pid_list:
#        print id2name[i],
#    print "\n"

# Final form: an ordinary dict whose values are tuples.
word_parent = dict(
    (tag_id, tuple(related_ids))
    for tag_id, related_ids in word_parent.iteritems()
)
#coding:utf-8

import _env
from json import loads
from zhihu_topic_data_with_follow import ZHIHU_TOPIC
from name2id import NAME2ID
from zdata.tag.name_tidy import name_tidy
from zhihu_topic_url2id import ID2MY
from itertools import chain
from zhihu_question_load import zhihu_to_dump

# Lookup keyed by element [1] of each ZHIHU_TOPIC entry, giving element [0].
id2topic = dict([(i[1], i[0]) for i in ZHIHU_TOPIC])

# All values stored in NAME2ID, collected into a set.
myidset = set(NAME2ID.itervalues())
# Inverse of NAME2ID: each NAME2ID value mapped back to its key.
myiddict = dict([(k, v) for v, k in NAME2ID.iteritems()])

def tag_id_list_by_str_list(tags):
    """Translate ``tags`` to ids via tag_to_id, preserving input order.

    Any tag whose tag_to_id result is falsy is skipped; repeated ids are
    not de-duplicated.
    """
    candidates = (tag_to_id(tag) for tag in tags)
    return [tag_id for tag_id in candidates if tag_id]

def tag_to_id(tag):
    tag = str(tag)
    id = id2topic.get(tag, 0)
Exemple #11
0
def main():
    """Call tag_alias_new once for every (name, id) pair in NAME2ID."""
    for alias, tag_id in NAME2ID.iteritems():
        # Disabled alternative: autocomplete_tag.append(alias, id)
        tag_alias_new(alias=alias, id=tag_id)