def feed_import_new(zsite_id, rid, title, txt, url, rank):
    """Create and save a FeedImport record for one fetched feed entry.

    The title and text are normalized (HTML-unescaped / formatted, then
    converted via utf8_ftoj) before a duplicate check on the text body.

    Returns the saved FeedImport instance, or None when the entry's text
    duplicates an already-imported one.
    """
    title = utf8_ftoj(unescape(title))
    txt = utf8_ftoj(format_txt(txt))
    if import_feed_duplicator.txt_is_duplicate(txt):
        return
    feed_user = user_by_feed_id_zsite_id(zsite_id, rid)
    # po_meta_user_id links the post to the feed's owner record; 0 = no owner
    po_meta_user_id = feed_user.id if feed_user else 0
    new_feed = FeedImport(
        title=title,
        txt=txt,
        zsite_id=zsite_id,
        rid=rid,
        url=url,
        tag_id_list='',
        state=FEED_IMPORT_STATE_WITHOUT_TAG,
        rank=rank,
        po_meta_user_id=po_meta_user_id,
    )
    new_feed.save()
    # renamed from `id` to avoid shadowing the builtin
    feed_id = new_feed.id
    import_feed_duplicator.set_record(txt, feed_id)
    if feed_user:
        user_id = feed_user.user_id
        if user_id:
            feed_import_user_new(user_id, feed_id)
    return new_feed
def train(filename, parser):
    """Build and cache a word -> {tag_id: weight} table from one corpus file.

    Skips all work when the cache file for `basename(filename)` already
    exists.  For every (tag_id_list, txt) pair yielded by `parser`, the tag
    set is expanded with parent tags from PTAG, and each segmented word adds
    1 + ln(term_frequency) to the weight of every tag of that document.
    The finished table is written to the cache via tofromfile.tofile.
    """
    cache_path = join(CACHE_PATH, basename(filename))
    if exists(cache_path):
        return  # already trained for this file
    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue
        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue
        # expand with ancestors so parent tags get trained as well;
        # snapshot as tuple because the set is mutated during the loop
        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))
        word2count = defaultdict(int)
        # iterate the segmenter directly — no need to materialize a list
        for word in seg_txt(utf8_ftoj(str(txt))):
            word2count[word] += 1
        for word, count in word2count.iteritems():
            tag2weight = word2tag_count.setdefault(word, {})
            # log-dampened frequency; hoisted out of the per-tag loop
            weight = 1 + log(float(count))
            for tag_id in tag_id_set:
                tag2weight[tag_id] = tag2weight.get(tag_id, 0) + weight
    tofromfile.tofile(cache_path, word2tag_count)
def train(filename, parser):
    """Accumulate per-word tag weights from *filename* and cache them.

    A no-op when the cache for this file already exists.  Each document's
    tags are widened with their PTAG parents; every word then contributes
    1 + ln(frequency) to each of those tags.
    """
    fname = basename(filename)
    cache_path = join(CACHE_PATH, fname)
    if exists(cache_path):
        return
    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue
        tags = set(tag_id_list)
        if not tags:
            continue
        # widen with parent tags (snapshot first: the set grows in-place)
        for source_tag in tuple(tags):
            tags.update(PTAG.get(source_tag, ()))
        freq = defaultdict(int)
        segments = list(seg_txt(utf8_ftoj(str(txt))))
        for token in segments:
            freq[token] += 1
        for token, n_occurrences in freq.iteritems():
            if token not in word2tag_count:
                word2tag_count[token] = {}
            bucket = word2tag_count[token]
            for tag in tags:
                bucket[tag] = bucket.get(tag, 0) + (1 + log(float(n_occurrences)))
    tofromfile.tofile(cache_path, word2tag_count)
def feed_import_new(zsite_id, rid, title, txt, url, rank):
    """Persist one imported feed entry, skipping text-level duplicates.

    Returns the new FeedImport, or None if the normalized text was
    already imported.
    """
    title = utf8_ftoj(unescape(title))
    txt = utf8_ftoj(format_txt(txt))
    if import_feed_duplicator.txt_is_duplicate(txt):
        return
    feed_user = user_by_feed_id_zsite_id(zsite_id, rid)
    po_meta_user_id = feed_user.id if feed_user else 0
    new_feed = FeedImport(
        title=title, txt=txt, zsite_id=zsite_id, rid=rid, url=url,
        tag_id_list='', state=FEED_IMPORT_STATE_WITHOUT_TAG, rank=rank,
        po_meta_user_id=po_meta_user_id
    )
    new_feed.save()
    saved_id = new_feed.id
    import_feed_duplicator.set_record(txt, saved_id)
    if feed_user:
        owner_user_id = feed_user.user_id
        if owner_user_id:
            feed_import_user_new(owner_user_id, saved_id)
    return new_feed
def name_tidy(name):
    """Normalize a name for matching.

    Lowercases, turns interpuncts into spaces, strips 《》 brackets,
    unifies the fullwidth paren with '(' and drops any parenthesized tail.
    """
    cleaned = utf8_ftoj(str(name)).lower()
    cleaned = cleaned.replace("·", " ")
    cleaned = cleaned.replace("《", "").replace("》", "")
    cleaned = cleaned.replace("(", "(")
    return cleaned.split("(", 1)[0]
def name_tidy(name):
    """Return a cleaned, lowercased name with bracket noise removed
    and any parenthesized suffix cut off."""
    result = utf8_ftoj(str(name)).lower()
    # interpunct -> space, strip 《》, fold fullwidth paren into '('
    for old, new in (("·", " "), ("《", ""), ("》", ""), ("(", "(")):
        result = result.replace(old, new)
    return result.split("(", 1)[0]
def txt2word(txt):
    """Segment lowercased, normalized text into words."""
    normalized = utf8_ftoj(str(txt.lower()))
    return seg_txt(normalized)
def txt2word(txt):
    """Lowercase and normalize *txt*, then return its word segmentation."""
    lowered = txt.lower()
    return seg_txt(utf8_ftoj(str(lowered)))