def load_account_doc():
    """Dump account documents from the database via ``write_doctitle``.

    Selects the ``doc`` column from the ``ACCOUNT_LAB`` table, limited to
    ``nload + 1`` (2001) rows, and passes the resulting list together with
    ``DOC_NAME`` and ``TITLE_NAME`` to ``write_doctitle``.

    NOTE(review): an identical ``load_account_doc`` is defined again later
    in this file and replaces this one at import time -- confirm which
    copy should be kept.
    """
    nload = 2000
    row_limit = nload + 1
    rows = statist.get_DB(
        'select doc from %s limit %d' % (ACCOUNT_LAB, row_limit))

    # Each row is indexed positionally; only the first column is kept.
    docs = []
    for row in rows:
        docs.append(row[0])

    write_doctitle(docs, DOC_NAME, TITLE_NAME)
def t_guess(fdoc, ftest): docs = codecs.open(fdoc, encoding='utf-8').readlines() idf = get_idf(docs) max_idf = max(idf.values()) if ftest: data = codecs.open(ftest, encoding='utf-8').read() words = list(gen_nounword(data)) print len(data), len(words) test = u' '.join(words) else: (screen_name, test) = statist.get_DB( 'select screen_name, doc from %s where uid=%s limit 1' % (ACCOUNT_LAB, 1711064324))[0] print 'screen_name:', screen_name print 'test doc:', test trans = {} res = statist.get_DB('select * from %s where p>%s' % (WORD_DICT, P_THRESHOLD)) for dw, tw, p in res: trans.setdefault(dw, {}) trans[dw][tw] = p title = guess(trans, test, idf, max_idf, 200) for i in title: print i[0], i[1]
def load_account_doc():
    """Read account documents from the database and write them out.

    Queries ``ACCOUNT_LAB`` for the ``doc`` column with a limit of
    ``nload + 1`` (2001) rows, then calls
    ``write_doctitle(docs, DOC_NAME, TITLE_NAME)`` with the collected
    values.

    NOTE(review): this redefines an identical ``load_account_doc`` that
    appears earlier in the file; this later definition is the one that
    takes effect -- confirm the duplicate is intentional.
    """
    nload = 2000
    fetched = statist.get_DB(
        'select doc from %s limit %d' % (ACCOUNT_LAB, nload + 1))

    # Keep only the first column of every returned row.
    docs = list(record[0] for record in fetched)

    write_doctitle(docs, DOC_NAME, TITLE_NAME)