Example #1

import codecs
from collections import Counter
from multiprocessing import Pool

import jieba
from sklearn.feature_extraction import DictVectorizer

# read_train_data and TrainData are assumed to be project-local helpers (not shown here).


def save_tokenization_result(data,
                             target,
                             file_path='./data/tags_token_results'):
    """Write space-joined tokens, and one label per line to a sibling '_tag' file."""
    with codecs.open(file_path, 'w', 'utf-8') as f:
        for x in data:
            f.write(' '.join(x) + '\n')

    # Labels go to a sibling file with the '_tag' suffix, one per line.
    with open(file_path + '_tag', 'w') as f:
        for x in target:
            f.write(x + '\n')


if __name__ == '__main__':
    # seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    # print("Default Mode: " + "/ ".join(seg_list))  # precise (default) mode

    data, target = read_train_data()
    # Tokenize every message in a worker pool; each message becomes a list of tokens.
    data = Pool().map(jieba.lcut, data)
    # data = jieba.lcut(data)
    save_tokenization_result(data, target)

    with codecs.open('./data/tags_token_results', 'r', 'utf-8') as f:
        data = [line.strip().split() for line in f.read().split('\n')]
        if not data[-1]:
            data.pop()  # drop the trailing empty element left by the final newline
        t = [Counter(d) for d in data]  # one Counter per SMS message: its term frequencies (TF)
        v = DictVectorizer()
        t = v.fit_transform(t)  # sparse matrix; each word is assigned a column index
        TrainData.save(t)
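
For clarity, here is a minimal, self-contained sketch of the term-frequency vectorization step above (Counter plus DictVectorizer). The toy messages are made up for illustration and are not taken from the real SMS data.

from collections import Counter

from sklearn.feature_extraction import DictVectorizer

# Two toy "messages", already tokenized (stand-ins for the real SMS data).
messages = [['spam', 'offer', 'offer'], ['hello', 'offer']]

# Per-message term frequencies, one Counter (a dict of word -> count) per message.
tf = [Counter(tokens) for tokens in messages]

# DictVectorizer assigns each word a column index and builds a sparse matrix
# with one row per message.
vectorizer = DictVectorizer()
matrix = vectorizer.fit_transform(tf)

print(vectorizer.get_feature_names_out())  # e.g. ['hello' 'offer' 'spam']
print(matrix.toarray())                    # e.g. [[0. 2. 1.] [1. 1. 0.]]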