def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    """Train and evaluate a MultinomialNB text classifier with feature selection.

    Loads a `load_files`-style corpus from *dataset_dir_name*, splits it
    80/20 into train/test, keeps the top *fs_num* terms ranked by
    *fs_method* (delegated to the project's `feature_selection` module),
    vectorizes with binary term presence, and reports test accuracy.

    Parameters:
        dataset_dir_name: directory layout accepted by sklearn's load_files.
        fs_method: name of the feature-selection method, passed through to
            feature_selection.feature_selection.
        fs_num: number of top-ranked terms to keep as the vocabulary.

    Returns:
        float: accuracy on the 20% held-out test split.
    """
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    # A throwaway vectorizer supplies the tokenizer so that feature
    # selection sees the same tokens the final vectorizer will produce.
    word_tokenizer = CountVectorizer(binary=True).build_tokenizer()
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]

    print('Building VSM model...')
    # Map each selected term to a column index.
    term_dict = {term: idx for idx, term in enumerate(term_set_fs)}
    # FIX: pass the selected vocabulary to the constructor. The original
    # assigned vectorizer.fixed_vocabulary / vocabulary_ after construction,
    # which fit_transform ignores (it rebuilds the vocabulary from the
    # corpus), so feature selection silently had no effect.
    vectorizer = CountVectorizer(binary=True, vocabulary=term_dict)
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec,
                              doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)

    return acc
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    """Train and score a MultinomialNB text classifier with feature selection.

    Python 2 variant (print statements) of the same routine: load the
    corpus, split 80/20, select the top fs_num terms via fs_method, build
    binary term-presence vectors, fit MultinomialNB, and return test
    accuracy as a float.
    """
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    # 80/20 train/test split with the matching class labels, fixed seed.
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    # Tokenize every training document with the vectorizer's own tokenizer.
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    # Rank terms by fs_method and keep the fs_num highest-scoring ones.
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
    # term -> column-index mapping for the selected vocabulary.
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    # NOTE(review): assigning fixed_vocabulary / vocabulary_ after
    # construction is not the documented API — modern sklearn's
    # fit_transform rebuilds the vocabulary and ignores these attributes;
    # CountVectorizer(vocabulary=term_dict) is the supported form. Confirm
    # against the sklearn version this was written for.
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    # Accuracy = fraction of test documents whose predicted label matches.
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
# Example #3 (score: 0)
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    # 对数据集进行划分,80%用来进行训练,20%进行测试,并把对应的类别进行标注
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()

    # doc_term_list_train:得到训练数据集中的每个文档进行分词的数组
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]

    # doc_class_list_train:每个文档对应的类别编号的数组
    term_set_fs = FeatureSelections.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]
    print "term_set_fs length %s " % (len(term_set_fs))

    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)
    # 朴素贝叶斯分类器
    # clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  #调用MultinomialNB分类器
    # doc_test_predicted = clf.predict(doc_test_vec)

    # SVM分类器
    svclf = SVC(kernel='linear')
    svclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = svclf.predict(doc_test_vec)

    # KNN
    # knnclf = KNeighborsClassifier()  # default with k=5
    # knnclf.fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = knnclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)

    print 'Accuracy: ', acc

    from sklearn.metrics import classification_report
    print 'precision,recall,F1-score如下:》》》》》》》》'
    print classification_report(doc_test_predicted, doc_class_list_test)

    return acc
# Example #4 (score: 0)
def feats(X_train_terms,
          X_train_texts,
          train_Y,
          X_test_texts,
          sel_feat_method=None,
          K=None):
    """Build tf-idf feature matrices for train and test texts.

    If *sel_feat_method* is given, the vocabulary is restricted to the
    top-K terms returned by the project's `sel_terms` helper; otherwise
    the full vocabulary learned from the training texts is used.

    Parameters:
        X_train_terms: tokenized training documents (input to sel_terms).
        X_train_texts: raw training texts to vectorize.
        train_Y: training labels (input to sel_terms).
        X_test_texts: raw test texts to vectorize.
        sel_feat_method: optional feature-selection method name.
        K: number of terms to keep when sel_feat_method is used.

    Returns:
        (X_train_feats, X_test_feats): sparse tf-idf matrices.
    """
    tfidf_transformer = TfidfTransformer()  # tf-idf weighting on top of raw counts

    if sel_feat_method is not None:
        # FIX: pass the selected vocabulary to the constructor. The
        # original set fixed_vocabulary / vocabulary_ after construction,
        # which fit/transform ignore, so feature selection had no effect.
        term_dict = sel_terms(X_train_terms, train_Y, sel_feat_method, K)
        count_vect = CountVectorizer(vocabulary=term_dict)
    else:
        count_vect = CountVectorizer()

    # FIX: the vectorizer must be fitted before it can transform; the
    # original called transform() on a never-fitted CountVectorizer,
    # which raises NotFittedError when no feature selection is requested.
    # With a fixed vocabulary, fit_transform is equivalent to transform.
    X_train_counts = count_vect.fit_transform(X_train_texts)
    X_train_feats = tfidf_transformer.fit_transform(X_train_counts)
    X_test_counts = count_vect.transform(X_test_texts)  # test document counts
    X_test_feats = tfidf_transformer.transform(X_test_counts)  # test tf-idf

    return X_train_feats, X_test_feats