Esempio n. 1
0
def get_num(train_file, model_road=u"../april_model"):
    """Build TF-IDF features and integer labels from a tab-separated corpus.

    Each non-blank line of ``train_file`` is split on tabs into at most three
    fields. The first field is ignored, the second is the category name and
    the third is the raw text, which is segmented with ``jieba.cut`` and
    re-joined with single spaces before vectorization.

    Side effects: prints the vocabulary size and the space-joined category
    names, creates ``model_road`` if it does not exist, and pickles the fitted
    vectorizer (with the category list attached as ``cate_set``) to
    ``<model_road>/svmTFIDFModel.pkl``.

    Args:
        train_file: Path of the UTF-8 encoded training file.
        model_road: Directory in which the fitted vectorizer is stored.

    Returns:
        A ``(all_train_data, cate)`` tuple: the matrix returned by
        ``TfidfVectorizer.fit_transform`` and a list holding, for every
        sample, the index of its category in ``vectorizer.cate_set``.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields after stripping surrounding whitespace.
    """
    labels = []
    documents = []
    with codecs.open(train_file, "r", "utf-8") as fr:
        for line in fr:
            record = line.strip()
            if not record:
                # A blank line (e.g. a trailing newline at end of file)
                # carries no sample; skip it instead of failing on fields[1].
                continue
            fields = record.split("\t", 2)
            labels.append(fields[1])
            documents.append(" ".join(jieba.cut(fields[2])))

    # Sorted so that the label -> id mapping is reproducible between runs;
    # plain set iteration order is arbitrary.
    cate_set = sorted(set(labels))
    # Dict lookup replaces list.index(), which rescanned the category list
    # for every single sample.
    cate_to_id = {name: idx for idx, name in enumerate(cate_set)}
    cate = [cate_to_id[name] for name in labels]

    # sublinear_tf=True applies logarithmic term-frequency scaling and
    # max_df=0.9 drops terms occurring in more than 90% of the documents.
    # NOTE(review): stop_words='english' only filters English stop words;
    # confirm that is intended for jieba-segmented text.
    vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.9, stop_words='english'
    )
    all_train_data = vectorizer.fit_transform(documents)
    print(len(vectorizer.vocabulary_))
    # Stored on the vectorizer so the id -> category mapping is pickled
    # together with it.
    vectorizer.cate_set = cate_set
    print(" ".join(cate_set))

    if not os.path.exists(model_road):  # create the model directory on demand
        os.makedirs(model_road)
    tf_idf_file = os.path.join(model_road, "svmTFIDFModel.pkl")
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(tf_idf_file, "wb") as fw:
        cPickle.dump(vectorizer, fw)
    return all_train_data, cate