import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def ex_feature(train_set, test_set, t_train, t_test, hash=False, use_tf=False, K=2000):
    '''
    Extract features from train_set and test_set using term frequency or tf-idf.
    A chinese_stopword.txt file is required.
    :param train_set: numpy array or sparse matrix of shape [n_samples, n_features]
        Training data
    :param test_set: numpy array or sparse matrix of shape [n_samples, n_features]
        Test data
    :param t_train: numpy array of shape [n_samples, n_targets]
        Training target values
    :param t_test: numpy array of shape [n_samples, n_targets]
        Test target values
    :param hash: use HashingVectorizer
    :param use_tf: use term frequency to reduce dimensionality
    :param K: select the k best features based on chi2; only used if ``use_tf == False``
    :return: train_set and test_set after feature extraction
    '''
    with open('chinese_stopword.txt', 'r', encoding='utf-8-sig') as f:
        stop_words = list(f.read().splitlines())
    data_train_size_mb = size_mb(train_set)  # size_mb is assumed to be defined elsewhere in this module
    data_test_size_mb = size_mb(test_set)
    start_time = time.time()
    print('extracting features......')
    if hash:
        from sklearn.feature_extraction.text import HashingVectorizer
        # non_negative=True was removed from recent scikit-learn; alternate_sign=False is the replacement
        vectorizer = HashingVectorizer(alternate_sign=False)
        x_train = vectorizer.transform(train_set)  # HashingVectorizer is stateless, so no fit is needed
        x_test = vectorizer.transform(test_set)
    else:
        tfidf_transformer = TfidfTransformer()
        if use_tf:
            vectorizer = CountVectorizer(max_features=K, stop_words=stop_words,
                                         decode_error='strict')
            x_train_tf_matrix = vectorizer.fit_transform(train_set)
            x_train = tfidf_transformer.fit_transform(x_train_tf_matrix)
            x_test_tf_matrix = vectorizer.transform(test_set)  # share the same vectorizer
            # transform (not fit_transform): reuse the idf weights learned on the training set
            x_test = tfidf_transformer.transform(x_test_tf_matrix)
        else:
            from sklearn.feature_selection import SelectKBest
            from sklearn.feature_selection import chi2
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(stop_words=stop_words)
            x_train_tfidf_matrix = vectorizer.fit_transform(train_set)
            x_test_tfidf_matrix = vectorizer.transform(test_set)
            selector = SelectKBest(chi2, k=K)  # renamed so the chi2 score function is not shadowed
            x_train = selector.fit_transform(x_train_tfidf_matrix, t_train)
            x_test = selector.transform(x_test_tfidf_matrix)
    end_time = time.time()
    print('extract features took %.2f s at %0.2fMB/s' % (
        (end_time - start_time),
        (data_train_size_mb + data_test_size_mb) / (end_time - start_time)))
    return x_train, x_test
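# A self-contained sketch of the chi2 branch above (toy data, so neither the
# stop-word file nor size_mb is needed here):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

toy_train = ['good movie great plot', 'terrible movie bad acting', 'great acting good story']
toy_labels = [1, 0, 1]
tfidf_matrix = TfidfVectorizer().fit_transform(toy_train)
best_k = SelectKBest(chi2, k=3).fit_transform(tfidf_matrix, toy_labels)
print(best_k.shape)  # (3, 3): three documents, the three best-scoring terms kept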
from time import time

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest, chi2

### Splitting Data into Train and Test using a StratifiedShuffleSplit
## Note: scikit-learn >= 0.18 takes n_splits and is iterated via .split(X, Y);
## the old StratifiedShuffleSplit(Y, 10, ...) call signature no longer works.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

### Using the generated indices to create Test and Train datasets
for train_index, test_index in sss.split(X, Y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]  ## only the last of the 10 splits is kept
    y_train, y_test = Y[train_index], Y[test_index]

### Select K best features based on a chi-squared test
feature_chi = 72  ## selecting best 2/3rd of the features
selector = SelectKBest(chi2, k=feature_chi)  ## renamed so the chi2 score function is not shadowed
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)
#print(X_train)

### Defining a function to print statistics which help us benchmark classifier performance
def benchmark(clf):
    clf_descr = str(clf).split('(')[0]  ## store name of the classifier
    print(clf_descr)  ## print name
    t0 = time()  ## store current time in t0
    clf.fit(X_train, y_train)  ## run classifier to fit data
    train_time = time() - t0  ## calculate time taken to train
    print("train time: %0.3fs" % train_time)  ## print statistic
    t0 = time()  ## store current time in t0
    pred = clf.predict(X_test)  ## use trained classifier to predict classes for test data
    test_time = time() - t0  ## calculate time taken to predict
    print("test time:  %0.3fs" % test_time)
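# Usage sketch: benchmark reads X_train/X_test/y_train/y_test from the enclosing
# scope; MultinomialNB is an illustrative choice, not one named by the original.
from sklearn.naive_bayes import MultinomialNB
benchmark(MultinomialNB())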
import pickle

## import data
data_path = '/Users/zhangzhaopeng/统计学习/机器学习/Text_Classification/data_preprocessing.pkl'
with open(data_path, 'rb') as fp:
    x_train, x_test, y_train, y_test = pickle.load(fp)

## feature selection via chi-squared test
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

vectorizer = CountVectorizer(min_df=2)
x_train_tf = vectorizer.fit_transform(x_train)
x_test_tf = vectorizer.transform(x_test)
selector = SelectKBest(chi2, k=4000)  # renamed so the chi2 score function is not shadowed
x_train_chi2 = selector.fit_transform(x_train_tf, y_train)
x_test_chi2 = selector.transform(x_test_tf)

## naive bayes
naive_chi2 = naive_bayes.MultinomialNB().fit(x_train_chi2, y_train)
naive_chi2_preds = naive_chi2.predict(x_test_chi2)
count_accu = 0
for i in range(len(y_test)):
    if y_test[i] == naive_chi2_preds[i]:
        count_accu += 1
naive_accu_chi2 = count_accu / len(y_test)
#naive_accu2 = metrics.accuracy_score(naive_preds, y_test)
print("Test set accuracy: ", naive_accu_chi2)

# confusion_matrix
conf_arr_naive_chi2 = [[0, 0], [0, 0]]
for i in range(len(y_test)):
    ## tally true label (row) vs. predicted label (column); assumes binary 0/1 labels
    conf_arr_naive_chi2[y_test[i]][naive_chi2_preds[i]] += 1
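# The manual loops above can be cross-checked with sklearn.metrics; a sketch
# assuming y_test and naive_chi2_preds as computed above:
from sklearn import metrics
print("accuracy:", metrics.accuracy_score(y_test, naive_chi2_preds))
print("confusion matrix:\n", metrics.confusion_matrix(y_test, naive_chi2_preds))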
# I make further modifications on the features_list by applying MinMaxScaler and
# SelectKBest with the chi-squared scoring function to choose the 10 best features.
from pprint import pprint

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)
#print(scaled_features)

features_train, features_test, labels_train, labels_test = \
    train_test_split(scaled_features, labels, test_size=0.1, random_state=42)

# Manually tried several k values (the number of top features to select); for
# chi-squared, k=10 returned the best results across the different methods and classifiers.
selector = SelectKBest(chi2, k=10)  # renamed so the chi2 score function is not shadowed
features_train = selector.fit_transform(features_train, labels_train)
features_test = selector.transform(features_test)

# keep selected feature names
# i+1 because we still have poi as the first name in features_list, while the actual feature matrix does not
features_list_new = [features_list[i+1] for i in selector.get_support(indices=True)]
features_list = ["poi"] + features_list_new
print("chi2 selected features_list = ")
pprint(features_list)

# I will apply featureFormat to the new features_list with the 10 best members and extract
# new labels/features to use with the same variety of classifiers and compare their scores.
data = featureFormat(my_dataset, features_list)
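# A sketch of searching k automatically instead of by hand; the GaussianNB
# classifier and the candidate k values here are illustrative assumptions,
# not choices made by the original code:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

pipe = Pipeline([('kbest', SelectKBest(chi2)), ('clf', GaussianNB())])
search = GridSearchCV(pipe, {'kbest__k': [5, 10]}, cv=3)
search.fit(scaled_features, labels)  # MinMaxScaler output is non-negative, as chi2 requires
print(search.best_params_)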
# In[42]:

for i in range(0, len(x_feature_names)):
    print(i, x_feature_names[i])


# ## Feature Scaling and Selection

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

print(X.shape)
selector = SelectKBest(chi2, k=20)  # renamed so the chi2 score function is not shadowed
X_new = selector.fit_transform(X, y)
print(X_new.shape)

p_values_of_features_dict = {}
for i in range(0, len(x_feature_names)):
    p_values_of_features_dict[x_feature_names[i]] = selector.pvalues_[i]
p_values_of_features_dict

import operator
sorted_p_values_of_features_dict = sorted(p_values_of_features_dict.items(), key=operator.itemgetter(1))
sorted_p_values_of_features_dict


# In[43]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_new = scaler.fit_transform(X)  # note: this rescales the original X, overwriting the chi2-selected X_new
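# In[ ]:

# get_support maps the selector straight to the kept feature names; a sketch
# assuming x_feature_names lines up with the columns of X:
selected_names = [x_feature_names[i] for i in selector.get_support(indices=True)]
print(selected_names)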
import string

import numpy as np
from nltk.corpus import stopwords  # requires nltk.download('stopwords') on first use
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess_dataset(path, remove_punctuation=True, text_representation='tfidf', tfidf_max_features=None,
                       tfidf_min_df=7, tfidf_max_df=0.8, feature_selection=None, labels='sentiment'):
    """ Preprocess dataset and return features and labels

    Args:
        path: path string to trainset.txt
        remove_punctuation: remove punctuation flag, default True
        text_representation: representation of text, default tfidf
        tfidf_max_features: max_features for tfidf, default None
        tfidf_min_df: min_df for tfidf, default 7
        tfidf_max_df: max_df for tfidf, default 0.8
        feature_selection: type of feature_selection, default None
        labels: return type of labels, default sentiment

    Return:
        (features, y_labels): preprocessed features and labels
            ([contains binarized 'sentiment'] or [label encoding of 'topic' and label 'topic_labels'])
    """
    reviews = trainset_to_df(path)  # trainset_to_df is assumed to be defined elsewhere in this project

    # Removing punctuation
    if remove_punctuation:
        process_text = lambda review: ' '.join(review.translate(str.maketrans('', '', string.punctuation)).split())
        reviews['text'] = reviews['text'].apply(process_text)

    # Removing all stopwords
    stopword_list = stopwords.words('english')
    stopword_list += ['nt']  # += 'nt' would append the characters 'n' and 't', not the token
    reviews['text'] = reviews['text'].apply(lambda review: ' '.join([word for word in review.split() if word not in stopword_list]))

    # Drop id column
    reviews.drop('id', axis=1, inplace=True)

    # Binarize sentiment label
    reviews['sentiment'] = reviews['sentiment'].apply(lambda sentiment: 0 if sentiment == 'neg' else 1)

    # Label-encode topic label
    le = LabelEncoder()
    reviews['topic'] = le.fit_transform(reviews['topic'])
    reviews['topic_labels'] = le.inverse_transform(reviews['topic'])

    # Vectorize text with tf-idf
    if text_representation == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=tfidf_max_features, min_df=tfidf_min_df, max_df=tfidf_max_df)
        features = vectorizer.fit_transform(reviews['text']).toarray()

    # Get labels
    if labels == 'sentiment':
        y_labels = reviews['sentiment']
    else:
        y_labels = reviews[['topic', 'topic_labels']]

    # Feature selection
    if feature_selection == 'variance_threshold':
        from sklearn.feature_selection import VarianceThreshold
        var_thres = VarianceThreshold(threshold=np.var(features))
        features = var_thres.fit_transform(features)
    elif feature_selection == 'chi_square_test':
        from sklearn.feature_selection import SelectKBest, chi2
        # k=features.shape[1] keeps every feature; lower k to actually prune
        selector = SelectKBest(chi2, k=features.shape[1])
        if labels == 'topic':
            features = selector.fit_transform(features, y_labels.iloc[:, 0])
        else:
            features = selector.fit_transform(features, y_labels)

    return (features, y_labels)
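# Usage sketch: the path is the trainset.txt named in the docstring, and
# trainset_to_df must be importable, as in the original project:
features, y_labels = preprocess_dataset('trainset.txt', feature_selection='chi_square_test', labels='sentiment')
print(features.shape, len(y_labels))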