def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    """Train and evaluate a Multinomial Naive Bayes text classifier.

    Loads a directory-per-class corpus, keeps the top ``fs_num`` terms chosen
    by the given feature-selection method, vectorizes documents with a binary
    bag-of-words model restricted to those terms, and reports accuracy on a
    held-out 20% test split.

    Args:
        dataset_dir_name: Corpus root in sklearn ``load_files`` layout
            (one sub-directory per class).
        fs_method: Feature-selection method name understood by
            ``feature_selection.feature_selection``.
        fs_num: Number of top-ranked terms to keep.

    Returns:
        float: Accuracy on the test split.
    """
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    # Fixed random_state keeps the 80/20 split reproducible across runs.
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    word_tokenizer = CountVectorizer(binary=True).build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print('Building VSM model...')
    # BUG FIX: assigning ``fixed_vocabulary`` / ``vocabulary_`` after
    # construction is ignored by ``fit_transform`` (it re-learns the
    # vocabulary from the data, discarding the feature selection).  The
    # selected terms must be passed via the ``vocabulary`` constructor arg.
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer = CountVectorizer(binary=True, vocabulary=term_dict)
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    # Multinomial Naive Bayes classifier (comment was mojibake-encoded GBK).
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)
    return acc
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    """Train and evaluate a Multinomial Naive Bayes text classifier.

    Same pipeline as the sibling definition above: load corpus, select the
    top ``fs_num`` terms, vectorize with a binary bag-of-words restricted to
    those terms, fit MultinomialNB, and report test accuracy.

    Args:
        dataset_dir_name: Corpus root in sklearn ``load_files`` layout.
        fs_method: Feature-selection method name for
            ``feature_selection.feature_selection``.
        fs_num: Number of top-ranked terms to keep.

    Returns:
        float: Accuracy on the held-out 20% test split.
    """
    # Python 3 print() calls for consistency with the rest of the file.
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    word_tokenizer = CountVectorizer(binary=True).build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print('Building VSM model...')
    # BUG FIX: setting ``fixed_vocabulary`` / ``vocabulary_`` on an existing
    # vectorizer is ignored by ``fit_transform`` — the vocabulary must be
    # given through the ``vocabulary`` constructor argument.
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer = CountVectorizer(binary=True, vocabulary=term_dict)
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    # Fit the MultinomialNB classifier (translated from Chinese comment).
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)
    return acc
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    """Train and evaluate a linear-kernel SVM text classifier.

    Splits the corpus 80/20 with labelled classes, selects the top ``fs_num``
    terms via ``FeatureSelections.feature_selection``, vectorizes with a
    binary bag-of-words restricted to those terms, fits a linear SVC, and
    prints accuracy plus a precision/recall/F1 report.

    Args:
        dataset_dir_name: Corpus root in sklearn ``load_files`` layout.
        fs_method: Feature-selection method name for
            ``FeatureSelections.feature_selection``.
        fs_num: Number of top-ranked terms to keep.

    Returns:
        float: Accuracy on the held-out 20% test split.
    """
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    # 80% train / 20% test with class labels attached; fixed seed for
    # reproducibility (translated from Chinese comment).
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    word_tokenizer = CountVectorizer(binary=True).build_tokenizer()
    # doc_terms_list_train: each training document tokenized into a term list.
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    # doc_class_list_train: the class label of each training document.
    term_set_fs = FeatureSelections.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    print("term_set_fs length %s " % (len(term_set_fs)))

    print('Building VSM model...')
    # BUG FIX: the original assigned ``vectorizer.vocabulary`` (no trailing
    # underscore) — a no-op attribute that sklearn never reads — so the
    # feature selection was silently ignored.  The selected terms must be
    # passed via the ``vocabulary`` constructor argument.
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer = CountVectorizer(binary=True, vocabulary=term_dict)
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    # Linear-kernel SVM classifier.  (MultinomialNB and KNeighborsClassifier
    # were tried here previously and can be swapped back in if needed.)
    svclf = SVC(kernel='linear')
    svclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = svclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)

    from sklearn.metrics import classification_report
    print('precision,recall,F1-score如下:》》》》》》》》')
    print(classification_report(doc_test_predicted, doc_class_list_test))
    return acc
def feats(X_train_terms, X_train_texts, train_Y, X_test_texts, sel_feat_method=None, K=None):
    """Build tf-idf feature matrices for train and test texts.

    Optionally restricts the vocabulary to the ``K`` terms chosen by
    ``sel_terms`` before counting; tf-idf weights are learned on the
    training counts and applied to both splits.

    Args:
        X_train_terms: Tokenized training documents, consumed by ``sel_terms``.
        X_train_texts: Raw training texts for the vectorizer.
        train_Y: Training labels, consumed by ``sel_terms``.
        X_test_texts: Raw test texts for the vectorizer.
        sel_feat_method: Feature-selection method name, or None to keep the
            full vocabulary learned from the training texts.
        K: Number of terms to keep when ``sel_feat_method`` is given.

    Returns:
        tuple: ``(X_train_feats, X_test_feats)`` sparse tf-idf matrices.
    """
    tfidf_transformer = TfidfTransformer()
    if sel_feat_method is not None:  # `is not None`, not `!= None`
        # BUG FIX: assigning ``fixed_vocabulary`` / ``vocabulary_`` after
        # construction relies on sklearn internals; the fixed vocabulary
        # belongs in the ``vocabulary`` constructor argument.
        term_dict = sel_terms(X_train_terms, train_Y, sel_feat_method, K)
        count_vect = CountVectorizer(vocabulary=term_dict)
    else:
        # BUG FIX: the original never fitted the vectorizer on this path,
        # so the first transform() raised NotFittedError.
        count_vect = CountVectorizer()

    # fit_transform learns the vocabulary when none was fixed and is
    # equivalent to transform() when one was supplied.
    X_train_counts = count_vect.fit_transform(X_train_texts)
    X_train_feats = tfidf_transformer.fit_transform(X_train_counts)

    X_test_counts = count_vect.transform(X_test_texts)     # test term counts
    X_test_feats = tfidf_transformer.transform(X_test_counts)  # test tf-idf
    return X_train_feats, X_test_feats