Example #1
def main():
    corpus, labels = get_data()  # load the dataset

    print("总的数据量:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    # print('One sample:', corpus[10])
    # print('Label of the sample:', labels[10])
    # label_name_map = ["spam", "normal"]
    # print('Actual type:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])  # labels[0:4999] are 1.0, labels[5000:10001] are 0.0
    # print('Actual type:', label_name_map[1], label_name_map[0])

    # split the data
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)
    # normalize and preprocess the data
    from normalization import normalize_corpus

    # normalize the corpus
    norm_train_corpus = normalize_corpus(train_corpus)
    # print(norm_train_corpus[:3])
    norm_test_corpus = normalize_corpus(test_corpus)
    # print(norm_test_corpus)
    # The original snippet hard-coded two ad-hoc documents here instead; that
    # breaks the evaluation below because their count no longer matches
    # test_labels:
    # norm_test_corpus = [
    #     '中信(国际)电子科技有限公司推出新产品:升职步步高、做生意发大财、连找情人都用的上,详情进入网址httpwwwusa5588comccc电话:02033770208服务热线:013650852999',
    #     '向专利局递交申请需要将文件转为PDF格式。我已经将说明书、说明书附图、权利要求书、摘要转化为PDF格式。由于WORED文档转化为PDF文档时公式和变量容易变形,而这种错误在申请递交给专利局之后将无法弥补,所以,请你逐字对照检查,确保PDF文件中没有变形错误,尤其是变量的上标、下标、运算符。'
    # ]
    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    """
    bow_train_features:
    (0, 173)	1  第0个列表元素,**词典中索引为173的元素**, 词频
    (0, 54)	1
    (0, 4)	1

    """
    # bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # train classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()  # naive Bayes
    # n_iter was renamed max_iter in scikit-learn 0.21
    svm = SGDClassifier(loss='hinge', max_iter=100)  # linear SVM
    lr = LogisticRegression()  # logistic regression
    print("SVM model based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    print(svm_tfidf_predictions)
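
# train_predict_evaluate_model is used throughout these examples but never
# shown. A minimal sketch of what it plausibly does (an assumption: fit,
# predict, then report standard metrics via sklearn):
from sklearn import metrics

def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    # fit the classifier and predict on the held-out features
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    # report standard classification metrics
    print('Accuracy:', metrics.accuracy_score(test_labels, predictions))
    print('Precision:', metrics.precision_score(test_labels, predictions,
                                                average='weighted'))
    print('Recall:', metrics.recall_score(test_labels, predictions,
                                          average='weighted'))
    print('F1:', metrics.f1_score(test_labels, predictions,
                                  average='weighted'))
    return predictions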
Example #2
# Next compute tfidf using idf matrix from the train corpus
nd_tfidf = nd_tf*idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]

# Check the new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)
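
# Sanity check (added, not in the original): each non-zero row of the
# normalized tf-idf matrix should now have unit L2 length.
print(norm(norm_nd_tfidf, axis=1))  # expect 1.0 per document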



# sklearn's TfidfVectorizer provides a transformer to extract tfidf scores directly
# from raw data - avoiding the need for CountVectorizer based bow scores
from feature_extractors import tfidf_extractor
    
tfidf_vectorizer, tfidf_features = tfidf_extractor(CORPUS)
display_features(np.round(tfidf_features.todense(), 2), feature_names)

nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)    
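
# The feature_extractors module is not shown anywhere in these examples.
# A minimal sketch of what tfidf_extractor plausibly looks like (assumption:
# a thin TfidfVectorizer wrapper that fits on the corpus and returns both the
# fitted vectorizer and the feature matrix):
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor_sketch(corpus, ngram_range=(1, 1)):
    # fit on the training corpus and transform it in a single step
    vectorizer = TfidfVectorizer(min_df=1, norm='l2', smooth_idf=True,
                                 use_idf=True, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features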



# We can also do more sophisticated word-vector models using Google's word2vec algorithm
# using the gensim python package
import gensim
import nltk

TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) 
                    for sentence in CORPUS]
tokenized_new_doc = [nltk.word_tokenize(sentence) 
                    for sentence in new_doc]
Example #3
def main():
    corpus, labels = get_data()
    print("total data size:", len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print("sample:", corpus[10])
    print("label of sample:", labels[10])
    label_name_map = ['spam', 'normal']  # 0 means spam, 1 means normal
    print("actual type:", label_name_map[int(labels[10])])

    # split the dataset (unpack order follows the other examples:
    # train corpus, test corpus, train labels, test labels)
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels)
    # preprocess the corpus
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    # bag-of-words model
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf model
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize the normalized corpus
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # Word2Vec word vectors (size was renamed vector_size in gensim 4.x)
    model = gensim.models.Word2Vec(tokenized_train,
                                   vector_size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)

    # train multinomial naive Bayes, SVM, and logistic regression
    # classifiers and evaluate each of them
    mnb = MultinomialNB()  # naive Bayes
    svm = SGDClassifier()  # SVM
    lr = LogisticRegression()  # logistic regression

    print("\nNavie Bayes based on BOW")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    print("\nLogistic Regression based on BOW")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    print("\nSVM based on BOW")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    print("\nNavie Bayes based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    print("\nLogistic Regression based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    print("\nSVM based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
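
# prepare_datasets is likewise not shown. A plausible sketch (assumption: a
# thin train_test_split wrapper returning corpora first, then labels, which
# matches how the examples here unpack it):
from sklearn.model_selection import train_test_split

def prepare_datasets_sketch(corpus, labels, test_data_proportion=0.3):
    # split documents and labels into train/test partitions
    train_X, test_X, train_Y, test_Y = train_test_split(
        corpus, labels, test_size=test_data_proportion, random_state=42)
    return train_X, test_X, train_Y, test_Y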
Example #4
def main():
    label_name_map = ["垃圾邮件", "正常邮件"]

    # split the data
    train_corpus, test_corpus, train_labels, test_labels = data_preprocess()

    # normalize and strip special characters
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    # tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    # tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # build the word2vec model
    # logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s",level=logging.INFO)
    # model = gensim.models.Word2Vec(tokenized_train,
    #                                size=500,
    #                                window=100,
    #                                min_count=30,
    #                                sample=1e-3)
    # model.save("./vector.model")
    # model=gensim.models.Word2Vec.load("./vector.model")
    # print("已加载词向量模型....")

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression

    mnb = MultinomialNB()
    # n_iter was renamed max_iter in scikit-learn 0.21
    svm = SGDClassifier(loss='hinge', max_iter=100)
    lr = LogisticRegression()

    # multinomial naive Bayes on bag-of-words features
    print("Naive Bayes classifier based on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)

    # logistic regression on bag-of-words features
    print("Logistic regression based on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(classifier=lr,
                                                      train_features=bow_train_features,
                                                      train_labels=train_labels,
                                                      test_features=bow_test_features,
                                                      test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM based on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)


    # multinomial naive Bayes on tfidf features
    print("Naive Bayes model based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)
    # logistic regression on tfidf features
    print("Logistic regression model based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(classifier=lr,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)


    # SVM on tfidf features
    print("SVM model based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)

    # pull out some correctly classified and some misclassified samples
    import re
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
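
    # Aggregate view (added, not in the original): rather than eyeballing a
    # handful of samples, summarize all test predictions at once.
    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(test_labels, svm_tfidf_predictions))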
Example #5
norm_test_corpus = normalize_corpus(test_corpus)


from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
bow_test_features = bow_vectorizer.transform(norm_test_corpus)

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

# tokenize documents
tokenized_train = [nltk.word_tokenize(text) for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text) for text in norm_test_corpus]
# build word2vec model
model = gensim.models.Word2Vec(tokenized_train,
                               vector_size=500,  # size renamed in gensim 4.x
                               window=100,
                               min_count=30,
                               sample=1e-3)

# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)
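
# averaged_word_vectorizer is imported above but its module is not shown.
# A minimal sketch of the usual technique (an assumption, not the module's
# actual code): represent each document as the mean of the word2vec vectors
# of its in-vocabulary tokens.
import numpy as np

def averaged_word_vectorizer_sketch(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)  # gensim 4.x vocabulary access
    features = np.zeros((len(corpus), num_features), dtype='float64')
    for row, tokenized_doc in enumerate(corpus):
        in_vocab = [word for word in tokenized_doc if word in vocabulary]
        if in_vocab:
            # mean of the vectors of all known tokens in the document
            features[row] = np.mean([model.wv[word] for word in in_vocab],
                                    axis=0)
    return features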
Example #6
def main():
    corpus, labels = get_data()  # load the dataset

    print("Total number of samples:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    print('One sample:', corpus[10])
    print('Label of the sample:', labels[10])
    label_name_map = ["spam", "normal"]
    print('Actual type:', label_name_map[int(labels[10])],
          label_name_map[int(labels[5900])])

    # split the data
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    from normalization import normalize_corpus

    # normalize the corpus
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)


    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]
    # build the word2vec model (size was renamed vector_size in gensim 4.x)
    model = gensim.models.Word2Vec(tokenized_train,
                                   vector_size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    # n_iter was renamed max_iter in scikit-learn 0.21
    svm = SGDClassifier(loss='hinge', max_iter=100)
    lr = LogisticRegression()

    # multinomial naive Bayes on bag-of-words features
    print("Naive Bayes classifier based on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # logistic regression on bag-of-words features
    print("Logistic regression based on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM based on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # multinomial naive Bayes on tfidf features
    print("Naive Bayes model based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # logistic regression on tfidf features
    print("Logistic regression model based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    # SVM on tfidf features
    print("SVM model based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    import re

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
Example #7
#val_reviews = pre_process_corpus(val_reviews)
test_headlines = pre_process_corpus(test_headlines)

# feature extraction
from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(train_headlines)
bow_test_features = bow_vectorizer.transform(test_headlines)

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(train_headlines)
tfidf_test_features = tfidf_vectorizer.transform(test_headlines)

# tokenize documents
tokenized_train = [nltk.word_tokenize(text)
                   for text in train_headlines]
tokenized_test = [nltk.word_tokenize(text)
                  for text in test_headlines]

# build word2vec model
model = gensim.models.Word2Vec(tokenized_train,
                               vector_size=500,  # size renamed in gensim 4.x
                               window=100,
                               min_count=30,
                               sample=1e-3)
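
# The averaged and tfidf-weighted extractors imported above are never called
# in this snippet. A hedged continuation (assumption: the keyword signature
# matches the usage in the other examples, with num_features equal to the
# word2vec dimensionality):
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=model,
                                                num_features=500)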
Example #8
def main():
    corpus, labels = get_data()  # load the dataset
    print('Total corpus size:', len(corpus))
    print('Number of labels:', len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print('One sample:', corpus[0])
    print("Label of a sample:", labels[243])
    label_name_map = ['spam', 'normal']
    print('Actual:', label_name_map[int(labels[10])],
          label_name_map[int(labels[8908])])
    # split the data
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)
    print('Training set size:', len(train_corpus))
    print('Test set size:', len(test_corpus))

    from normalization import normalize_corpus
    # normalize the data
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)


    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # train classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    # the deprecated n_iter argument was renamed max_iter in scikit-learn 0.21
    # svm = SGDClassifier(loss='hinge', max_iter=100)
    svm = SGDClassifier(loss='hinge')
    lr = LogisticRegression()

    # multinomial naive Bayes on bag-of-words features
    print('Naive Bayes classifier based on bag-of-words features')
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)
    # logistic regression on bag-of-words features
    print("Logistic regression based on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)
    # SVM on bag-of-words features
    print('SVM based on bag-of-words features')
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # multinomial naive Bayes on tfidf features
    print('Multinomial naive Bayes model based on tfidf')
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # logistic regression on tfidf features
    print('Logistic regression model based on tfidf')
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # SVM on tfidf features
    print('SVM model based on tfidf')
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    # show some correctly classified and some misclassified samples
    import re
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
Example #9
# compute new doc term freqs from bow freqs
nd_tf = new_doc_features
nd_tf = np.array(nd_tf, dtype='float64')

# compute tfidf using idf matrix from train corpus
nd_tfidf = nd_tf*idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]

# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)


from feature_extractors import tfidf_extractor
    
tfidf_vectorizer, tfidf_features = tfidf_extractor(CORPUS)
display_features(np.round(tfidf_features.todense(), 2), feature_names)

nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)    


import gensim
import nltk

TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) 
                    for sentence in CORPUS]
tokenized_new_doc = [nltk.word_tokenize(sentence) 
                    for sentence in new_doc]                        

# The original snippet is cut off here; plausible completion with small,
# illustrative hyperparameters suited to a toy corpus (vector_size is the
# gensim 4.x name for size):
model = gensim.models.Word2Vec(TOKENIZED_CORPUS,
                               vector_size=10,
                               window=10,
                               min_count=2,
                               sample=1e-3)
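
# Quick check (added, not in the original): list the vocabulary the toy
# model actually learned (gensim 4.x API).
print(model.wv.index_to_key)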
Example #10
def conver2tfidf(data):
    # copy the incoming questions into a plain list before vectorizing
    new_data = []
    for q in data:
        new_data.append(q)
    tfidf_vectorizer, tfidf_X = tfidf_extractor(new_data)
    return tfidf_vectorizer, tfidf_X
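
# Usage sketch (added, not in the original; train_questions/new_questions are
# hypothetical names): fit tf-idf on one list of texts, then reuse the fitted
# vectorizer on unseen ones.
# tfidf_vectorizer, tfidf_X = conver2tfidf(train_questions)
# tfidf_new = tfidf_vectorizer.transform(new_questions)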
norm_test_corpus = normalize_corpus(test_corpus)  


from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)  
bow_test_features = bow_vectorizer.transform(norm_test_corpus) 

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)    


# tokenize documents
tokenized_train = [nltk.word_tokenize(text)
                   for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                  for text in norm_test_corpus]
# build word2vec model
model = gensim.models.Word2Vec(tokenized_train,
                               vector_size=500,  # size renamed in gensim 4.x
                               window=100,
                               min_count=30,
                               sample=1e-3)
Example #12
def main():
    # load the dataset
    corpus, labels = get_data()
    print("Total number of samples:", len(labels))

    # drop useless (empty) documents
    corpus, labels = remove_empty_docs(corpus, labels)

    print('One sample:', corpus[10])
    print("The sample's label:", labels[10])
    label_name_map = ["spam", "normal"]
    # index 0 is spam, index 1 is normal
    print('Actual type:', label_name_map[int(labels[10])],
          label_name_map[int(labels[5900])])

    # split the data
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    # normalize the corpora
    # current number of samples
    # passing True as the second argument would enable word-segmentation info
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)
    print(norm_train_corpus[11])

    print("==========数据处理完成==========")

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize with jieba
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]
    # build the word2vec model (size was renamed vector_size in gensim 4.x)
    model = gensim.models.Word2Vec(tokenized_train,
                                   vector_size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)

    # naive Bayes classifier
    mnb = MultinomialNB()
    # SVM classifier (n_iter was renamed max_iter in scikit-learn 0.21)
    svm = SGDClassifier(loss='hinge', max_iter=100)
    # logistic regression classifier
    lr = LogisticRegression()

    # two feature types, three classification models

    # multinomial naive Bayes on bag-of-words features
    print("Naive Bayes classifier based on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # logistic regression on bag-of-words features
    print("Logistic regression based on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM based on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # multinomial naive Bayes on tfidf features
    print("Naive Bayes model based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # logistic regression on tfidf features
    print("Logistic regression model based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    # SVM on tfidf features
    print("SVM model based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    import re

    num = 0
    # the document, its true label, and its predicted label
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        # spam email
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Original text:')
            print(document)

            num += 1
            if num == 4:
                break

    num = 0

    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Original text:')
            print(document)

            num += 1
            if num == 4:
                break
def main():
    corpus, labels = get_data()  # load the dataset

    print("Total number of samples:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    print('One sample:', corpus[10])
    print('Label of the sample:', labels[10])
    label_name_map = ["spam", "normal"]
    print('Actual type:', label_name_map[int(labels[10])],
          label_name_map[int(labels[5900])])  # labels[0:4999] are 1.0, labels[5000:10001] are 0.0
    print('Actual type:', label_name_map[1], label_name_map[0])

    # split the data
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                            labels,
                                                                            test_data_proportion=0.3)
    # normalize and preprocess the data
    from normalization import normalize_corpus

    # normalize the corpora
    norm_train_corpus = normalize_corpus(train_corpus)
    # print(norm_train_corpus[:3])
    norm_test_corpus = normalize_corpus(test_corpus)
    # print(norm_test_corpus)
    # norm_test_corpus1 = ['中信(国际)电子科技有限公司推出新产品:升职步步高、做生意发大财、连找情人都用的上,详情进入网址httpwwwusa5588comccc电话:02033770208服务热线:013650852999']

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    # print(bow_vectorizer)
    # print(bow_train_features)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text)
                       for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text)
                      for text in norm_test_corpus]
    # build the word2vec model (size was renamed vector_size in gensim 4.x)
    model = gensim.models.Word2Vec(tokenized_train,
                                   vector_size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)
    # train classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()  # naive Bayes
    # n_iter was renamed max_iter in scikit-learn 0.21
    svm = SGDClassifier(loss='hinge', max_iter=100)   # linear SVM
    lr = LogisticRegression()   # logistic regression

    # multinomial naive Bayes on bag-of-words features
    print("Naive Bayes classifier based on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)
    # print(mnb_bow_predictions)  # returned predictions: [0. 0. 1. ... 0. 1. 0.]
    # logistic regression on bag-of-words features
    print("Logistic regression based on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(classifier=lr,
                                                      train_features=bow_train_features,
                                                      train_labels=train_labels,
                                                      test_features=bow_test_features,
                                                      test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM based on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)


    # multinomial naive Bayes on tfidf features
    print("Naive Bayes model based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)
    # logistic regression on tfidf features
    print("Logistic regression model based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(classifier=lr,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)


    # SVM on tfidf features
    print("SVM model based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)



    import re

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
    # some misclassified emails
    print("Some misclassified emails:")
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break