from glob import glob

from zkit import tofromfile

def merge(path):
    # Fold every cached word -> {topic: freq} mapping under `path` into
    # redis, accumulating one hash per word keyed by topic id.
    # `redis` (a configured client) and `name_tidy` (a tag normalizer)
    # are provided elsewhere in the package.
    for pos, i in enumerate(glob(path + "/*")):
        print pos, i
        for word, topic_freq in tofromfile.fromfile(i).iteritems():
            if len(word.strip()) <= 3:
                continue
            word = name_tidy(word)
            for topic, freq in topic_freq.iteritems():
                topic = int(topic)
                # frequencies are floats; scale x100 so hincrby gets an int
                redis.hincrby(word, topic, int(freq * 100))
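# Usage sketch (an assumption, not from the source): after merge() runs,
# each word is a redis hash mapping topic id -> int(freq * 100). With a
# redis-py style client, the accumulated weights read back like this:
def read_topics(word):
    # hgetall returns {topic_id: scaled_freq}; undo the x100 scaling
    return dict(
        (int(topic), int(scaled) / 100.0)
        for topic, scaled in redis.hgetall(word).iteritems()
    )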
def merge():
    CACHE_PATH = "/home/work/wanfang/tag"
    # Same walk as above, but emit one serialized line per word instead of
    # writing to redis; `dumps` is imported elsewhere (json.dumps, presumably).
    for pos, i in enumerate(glob(CACHE_PATH + "/*")):
        for word, topic_freq in tofromfile.fromfile(i).iteritems():
            if len(word.strip()) <= 3:
                continue
            word = name_tidy(word)
            s = [word]
            for topic, freq in topic_freq.iteritems():
                topic = int(topic)
                s.append((topic, freq))
            print dumps(s)
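# For reference, assuming `dumps` is json.dumps: each emitted line is the
# word followed by its (topic, freq) pairs, e.g.
#
#   ["economics", [12, 0.5], [34, 0.25]]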
def fromfile(self, path):
    # Load the serialized dict from `path` (via the module-level fromfile)
    # and return self so the call can be chained at construction time.
    self._dict = fromfile(path)
    return self
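# Usage sketch (the class name is hypothetical): returning self lets the
# load chain straight off construction, e.g.
#
#   idf = TFIDF().fromfile(join(ZDATA_PATH, 'data/idf'))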
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import _env
from config import ZDATA_PATH
from zkit.tofromfile import fromfile
from idf import tf_idf as _tf_idf
from os.path import join
from mmseg import seg_txt

# Global inverse-document-frequency table, loaded once at import time.
IDF = fromfile(join(ZDATA_PATH, 'data/idf'))

def tf_idf(word_list):
    return _tf_idf(word_list, IDF)

def tf_idf_seg_txt(txt):
    # Normalize full-width Chinese punctuation to spaces, then segment
    # with mmseg before weighting.
    txt = txt.replace('。', ' ').replace(',', ' ')
    word_list = list(seg_txt(txt))
    return tf_idf(word_list)

from kyotocabinet import DB
from collections import defaultdict
from array import array
from zkit.zitertools import chunkiter
from operator import itemgetter
from zdata.tag.name2id import NAME2ID
from zkit.txt_cleanup import sp_txt

ID2NAME = defaultdict(list)
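# Usage sketch (the output shape is an assumption: _tf_idf presumably
# yields (word, weight) pairs scored against the global IDF table);
# itemgetter comes from the import block above.
def top_keywords(txt, limit=10):
    # rank the segmented words by tf-idf weight, highest first
    pairs = list(tf_idf_seg_txt(txt))
    pairs.sort(key=itemgetter(1), reverse=True)
    return pairs[:limit]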
def extend_by_file(self, filename):
    # Merge a serialized (document count, document-frequency dict) pair
    # from another run into this counter's running totals.
    _count, _df = fromfile(filename)
    self._count += _count
    for k, v in _df.iteritems():
        self._df[k] += v
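# A minimal sketch (class name and shape assumed) of the counter this
# method belongs to: `_count` tracks documents seen and `_df` how many
# documents contain each word, so per-file partial counts can be merged.
from collections import defaultdict

class DFCounter(object):  # hypothetical name
    def __init__(self):
        self._count = 0
        self._df = defaultdict(int)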