Exemple #1
0
# Build an LSI topic model from the education records returned by `pg`.
#
# NOTE(review): `output_file` is only written to by the disabled debugging
# snippets below.  It is still opened (which creates/truncates the file, as
# before) and intentionally left open, because the rest of this script is
# not visible from here and may still use the handle -- confirm and close it
# at the real end of the script.
output_file = codecs.open("data/doc_vec.txt", 'wb', encoding='utf-8')
pg.pg_init()

# Disabled debugging snippet: print the length of every segmented record.
# for ele in segment.seg(pg.get_edu()):
#     print len(ele)

# Disabled test: dump every segmented record to `output_file`, separating
# the parts of a record with '\t||\t' and records with newlines.
# for mlist in segment.seg(pg.get_edu()):
#     for ele in mlist:
#         for x in ele:
#             output_file.write(x+' ')
#         output_file.write('\t||\t')
#     output_file.write('\n')

# Merge the two attributes of each education record: the first two elements
# of every segmented item are concatenated into a single document.
# A list comprehension is used (rather than `map`) so that `raw` is always a
# concrete list, whichever Python version runs the script.
people = pg.get_edu(0, 0)
raw = [record[0] + record[1] for record in segment.seg(people)]

topic_model.build_lsi(raw)


# Disabled experiment: build the corpus and print its TF-IDF representation.
# topic_model._build_corpus(raw)
#
# tfidf = topic_model._build_tfidf()
# corpus_tfidf = tfidf[topic_model.corpus]
#
# for ele in corpus_tfidf:
#     print ele

'''for ele in topic_model.dictionary.token2id:
Exemple #2
0
from __future__ import absolute_import

import codecs
import segment
import pg

# Dump the segmented education records to data/seg_data1.txt, fetching them
# from `pg` batch by batch until an empty batch is returned.
#
# Output layout: the items of one part are each followed by a space, every
# part is followed by a tab, and every record ends with a newline.
#
# The `with` block guarantees the file is flushed and closed even if fetching
# or segmenting raises part-way through the export.
with codecs.open("data/seg_data1.txt", 'wb', encoding='utf-8') as output_file:
    pg.pg_init()

    # print segment.seg(pg.get_edu())
    # presumably `bias` is the batch size and `counter` the batch index --
    # TODO confirm against pg.get_edu
    bias = 1000
    counter = 0
    while True:
        raw = pg.get_edu(bias, counter)
        for record in segment.seg(raw):
            for part in record:
                for token in part:
                    output_file.write(token + ' ')
                output_file.write('\t')
            output_file.write('\n')
        counter += 1
        # An empty batch marks the end of the data.  The check sits at the
        # end of the iteration so the sequence of calls (fetch, segment,
        # write, increment) stays exactly what it was with the former
        # `raw = '1'` sentinel; `len()` is kept because the type returned by
        # pg.get_edu is not visible here.
        if len(raw) == 0:
            break

# Disabled variant: export a single batch, separating parts with '\t||\t'.
# raw = pg.get_edu(bias, counter)
# for mlist in segment.seg(raw):
#     for ele in mlist:
#         for x in ele:
#             output_file.write(x+' ')
#         output_file.write('\t||\t')
#     output_file.write('\n')