def merge(path):
    """Accumulate word/topic frequencies from every file under ``path`` into redis.

    Each file matched by ``path + "/*"`` is loaded with ``tofromfile.fromfile``
    and iterated as ``word -> {topic: freq}``.  For every word that passes the
    length filter, the redis hash stored under the tidied word has its
    ``topic`` field incremented by ``int(freq * 100)``.

    Args:
        path: Directory whose direct children are the input files.
    """
    for pos, filename in enumerate(glob(path + "/*")):
        # Progress output: running index and the file being processed.
        print("%s %s" % (pos, filename))
        for word, topic_freq in tofromfile.fromfile(filename).iteritems():
            # Words whose stripped length is 3 or less are ignored.
            if len(word.strip()) <= 3:
                continue
            word = name_tidy(word)
            for topic, freq in topic_freq.iteritems():
                # HINCRBY takes an integer increment, so the frequency is
                # scaled by 100 and truncated before being stored.
                redis.hincrby(word, int(topic), int(freq * 100))
def merge(path):
    """Fold the word/topic frequency files found in ``path`` into redis hashes.

    Every entry matched by ``path + "/*"`` is read through
    ``tofromfile.fromfile`` and treated as a mapping of
    ``word -> {topic: freq}``.  One redis hash per tidied word collects the
    totals: field ``topic`` is incremented by ``int(freq * 100)``.

    Args:
        path: Directory containing the files to merge.
    """
    for pos, filename in enumerate(glob(path + "/*")):
        # Progress output so long runs can be followed on the console.
        print("%s %s" % (pos, filename))
        word_topics = tofromfile.fromfile(filename)
        for raw_word, topic_freq in word_topics.iteritems():
            # The length filter is applied to the stripped, un-tidied word.
            if len(raw_word.strip()) <= 3:
                continue
            key = name_tidy(raw_word)
            for topic, freq in topic_freq.iteritems():
                # Scale by 100 and truncate: the increment must be an integer.
                redis.hincrby(key, int(topic), int(freq * 100))
def merge(path="/home/work/wanfang/tag"):
    """Print one serialised ``[word, (topic, freq), ...]`` row per word.

    Every file matched by ``path + "/*"`` is loaded with
    ``tofromfile.fromfile`` and iterated as ``word -> {topic: freq}``.  Words
    whose stripped length is 3 or less are skipped; for the rest, the tidied
    word followed by its ``(int(topic), freq)`` pairs is passed through
    ``dumps`` and printed.

    Args:
        path: Directory holding the input files.  Defaults to the location
            that used to be hard-coded, so ``merge()`` behaves as before.
    """
    for filename in glob(path + "/*"):
        for word, topic_freq in tofromfile.fromfile(filename).iteritems():
            if len(word.strip()) <= 3:
                continue
            row = [name_tidy(word)]
            for topic, freq in topic_freq.iteritems():
                row.append((int(topic), freq))
            print(dumps(row))
def wangfang_parser(fn):
    """Yield ``(id_list, text)`` for each record in ``fn`` that has a known tag.

    Each line of the file is decoded with ``loads`` into an indexable record:
    the first two items supply the text (falsy items dropped, the rest joined
    with a newline and stripped) and item 2 supplies the tag names.  A record
    is skipped when it has no tags, no text, or no tidied tag present in
    ``NAME2ID_SET``.

    Args:
        fn: Path of the file to read, one serialised record per line.

    Yields:
        Tuples ``(ids, text)`` where ``ids`` lists ``NAME2ID[tag]`` for every
        recognised tag of the record.
    """
    with open(fn) as source:
        for raw in source:
            record = loads(raw.strip())
            tags = record[2]
            if not tags:
                continue
            text = "\n".join(filter(bool, record[:2])).strip()
            if not text:
                continue
            # Keep only the tags that have an id assigned.
            known = set(name_tidy(tag) for tag in tags) & NAME2ID_SET
            if not known:
                continue
            yield [NAME2ID[name] for name in known], text
def wangfang_parser(fn):
    """Stream ``(tag_ids, text)`` pairs out of the record file ``fn``.

    Every line is parsed with ``loads``.  The text is built from the first
    two fields of the record (empty ones left out, newline-joined, then
    stripped); the tags come from the third field and are normalised with
    ``name_tidy``.  Only records with non-empty tags, non-empty text and at
    least one tag found in ``NAME2ID_SET`` produce output.

    Args:
        fn: Path of the input file, one serialised record per line.

    Yields:
        ``(tag_ids, text)`` where ``tag_ids`` is the list of ``NAME2ID``
        values for the record's recognised tags.
    """
    with open(fn) as handle:
        for row in handle:
            fields = loads(row.strip())
            raw_tags = fields[2]
            if not raw_tags:
                continue
            body = "\n".join(filter(bool, fields[:2])).strip()
            if not body:
                continue
            tidy_tags = set(name_tidy(tag) for tag in raw_tags)
            matched = tidy_tags & NAME2ID_SET
            if matched:
                tag_ids = [NAME2ID[tag] for tag in matched]
                yield tag_ids, body
def merge(path="/home/work/wanfang/tag"):
    """Dump every sufficiently long word with its topic frequencies to stdout.

    Files matched by ``path + "/*"`` are read with ``tofromfile.fromfile``
    and iterated as ``word -> {topic: freq}``.  For each word whose stripped
    length exceeds 3, a list made of the tidied word followed by
    ``(int(topic), freq)`` tuples is serialised with ``dumps`` and printed on
    its own line.

    Args:
        path: Directory to scan.  The default is the previously hard-coded
            location, so existing ``merge()`` calls are unaffected.
    """
    for filename in glob(path + "/*"):
        word_topics = tofromfile.fromfile(filename)
        for word, topic_freq in word_topics.iteritems():
            if len(word.strip()) <= 3:
                continue
            row = [name_tidy(word)]
            row.extend(
                (int(topic), freq) for topic, freq in topic_freq.iteritems()
            )
            print(dumps(row))
def merge():
    """Normalise the word/topic counts held in redis and write ``word_tf.txt``.

    Two passes are made over every key returned by ``redis.keys("*")``:

    1. Sum the stored counts per topic id across all word hashes.
    2. For each word, rescale every topic count by ``500000 / topic_total``
       (topics whose total is below 10000 are ignored, as are weights that
       are not positive), then express each weight as a share of the word's
       total in units of 1/10000.  Words with at least one non-zero share
       are written as one ``dumps([word, [(topic, rank), ...]])`` line.

    When the tidied word itself maps to an id in ``NAME2ID``, that id's
    weight is replaced by the sum of all the word's weights before the shares
    are computed.
    """
    keys = redis.keys("*")

    # Pass 1: total count per topic over all words.
    topic_count = defaultdict(int)
    for pos, key in enumerate(keys):
        stored = redis.hgetall(key)
        print("1 %s %s" % (pos, key))
        for topic, freq in stored.iteritems():
            topic_count[int(topic)] += int(freq)

    # Pass 2: per-word normalisation and output.
    with open("word_tf.txt", "w") as word_freq:
        for word in keys:
            weights = []
            for topic, freq in redis.hgetall(word).iteritems():
                topic = int(topic)
                count = topic_count[topic]
                if count < 10000:
                    continue
                # NOTE(review): `/` on two ints floors under Python 2 unless
                # the module enables true division - confirm before porting.
                freq = int(freq) * 500000 / count
                if freq > 0:
                    weights.append((topic, freq))

            # Sum the pairs before collapsing them into a dict, exactly as
            # the totals were computed originally.
            fcount = sum(weight for _, weight in weights)
            tf = dict(weights)

            own_id = NAME2ID.get(name_tidy(word), 0)
            if own_id:
                # Give the word's own id a weight equal to the current total
                # and grow the total by the same amount that entry gained,
                # so fcount stays the sum of the values in tf.
                previous = tf.get(own_id, 0)
                tf[own_id] = fcount
                fcount += fcount - previous

            if not fcount:
                continue

            ranks = []
            for topic, weight in tf.iteritems():
                rank = int(weight * 10000 / fcount)
                if rank:
                    ranks.append((topic, rank))
            if ranks:
                word_freq.write(dumps([word, ranks]) + "\n")
def _topic_totals(keys):
    """Return ``{topic_id: total}`` summed over the redis hash of every key."""
    totals = defaultdict(int)
    for pos, key in enumerate(keys):
        stored = redis.hgetall(key)
        print("1 %s %s" % (pos, key))
        for topic, freq in stored.iteritems():
            totals[int(topic)] += int(freq)
    return totals


def _word_topic_ranks(word, topic_count):
    """Return ``[(topic, rank), ...]`` for ``word``; empty if nothing survives."""
    weights = []
    for topic, freq in redis.hgetall(word).iteritems():
        topic = int(topic)
        count = topic_count[topic]
        # Topics with a total below 10000 are left out.
        if count < 10000:
            continue
        # NOTE(review): `/` on two ints floors under Python 2 unless the
        # module enables true division - confirm before porting.
        freq = int(freq) * 500000 / count
        if freq > 0:
            weights.append((topic, freq))

    # The total is taken from the pair list before it is turned into a dict.
    fcount = sum(weight for _, weight in weights)
    tf = dict(weights)

    own_id = NAME2ID.get(name_tidy(word), 0)
    if own_id:
        # The word's own id gets the current total as its weight; the total
        # grows by what that entry gained, so it remains the sum of tf.
        previous = tf.get(own_id, 0)
        tf[own_id] = fcount
        fcount += fcount - previous

    if not fcount:
        return []

    ranks = []
    for topic, weight in tf.iteritems():
        # Share of the word's total, in units of 1/10000.
        rank = int(weight * 10000 / fcount)
        if rank:
            ranks.append((topic, rank))
    return ranks


def merge():
    """Write normalised per-word topic ranks from redis to ``word_tf.txt``.

    All keys returned by ``redis.keys("*")`` are read twice: first to total
    the stored counts per topic id, then to rescale each word's counts by
    ``500000 / topic_total`` and convert them to shares of the word's total
    (in units of 1/10000).  Each word that keeps at least one non-zero share
    is written as a ``dumps([word, [(topic, rank), ...]])`` line.
    """
    keys = redis.keys("*")
    topic_count = _topic_totals(keys)
    with open("word_tf.txt", "w") as word_freq:
        for word in keys:
            ranks = _word_topic_ranks(word, topic_count)
            if ranks:
                word_freq.write(dumps([word, ranks]) + "\n")