# Build an LSI topic model from the segmented education records.
#
# NOTE(review): `codecs`, `pg`, `segment` and `topic_model` are expected to be
# imported earlier in the file; those imports are not visible in this excerpt.

# Opened in 'wb' mode, so data/doc_vec.txt is truncated on every run.  None of
# the live statements below write to or close this handle (only the disabled
# debug snippets referenced it); it is left open in case later code uses it.
output_file = codecs.open("data/doc_vec.txt", 'wb', encoding='utf-8')
pg.pg_init()

# Disabled debug snippets (previously kept as bare string literals):
#
#   for ele in segment.seg(pg.get_edu()):
#       print len(ele)
#
#   # test
#   for mlist in segment.seg(pg.get_edu()):
#       for ele in mlist:
#           for x in ele:
#               output_file.write(x+' ')
#           output_file.write('\t||\t')
#       output_file.write('\n')

people = pg.get_edu(0, 0)

# Merge the two attributes of each education record into a single document by
# concatenating the first two elements of every segmented entry.  A list
# comprehension is used instead of map() so that build_lsi receives a real
# list on both Python 2 and Python 3 (map() is a one-shot iterator on 3).
raw = [record[0] + record[1] for record in segment.seg(people)]
topic_model.build_lsi(raw)

# Disabled debug snippet (previously a bare string literal):
#
#   topic_model._build_corpus(raw)
#   tfidf = topic_model._build_tfidf()
#   corpus_tfidf = tfidf[topic_model.corpus]
#   for ele in corpus_tfidf:
#       print ele
#
# NOTE(review): one more disabled snippet, starting with
# `for ele in topic_model.dictionary.token2id:`, begins here but is cut off in
# this view of the file.
from __future__ import absolute_import import codecs import segment import pg output_file = codecs.open("data/seg_data1.txt", 'wb', encoding='utf-8') pg.pg_init() # print segment.seg(pg.get_edu()) bias = 1000 counter = 0 raw = '1' while len(raw) != 0: raw = pg.get_edu(bias, counter) for mlist in segment.seg(raw): for ele in mlist: for x in ele: output_file.write(x+' ') output_file.write('\t') output_file.write('\n') counter += 1 '''raw = pg.get_edu(bias, counter) for mlist in segment.seg(raw): for ele in mlist: for x in ele: output_file.write(x+' ') output_file.write('\t||\t') output_file.write('\n')'''