print "" print "Selected Feature list - before Feature_Selection", features1 ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features1, sort_keys=True) labels, features = targetFeatureSplit(data) ### We do not know yet if feature scaling and feature filering using kbest will benefit our model yet. ### But lets try it anyway # Scale features scaler = MinMaxScaler() features = scaler.fit_transform(features) # K-best features - choosing 6 features for a trial k_best = SelectKBest(k=6) k_best.fit(features, labels) result_list = zip(k_best.get_support(), features1[1:], k_best.scores_) result_list = sorted(result_list, key=lambda x: x[2], reverse=True) #print "K-best features - i.e. top 6 features selected:", result_list """ OUTPUT: K-best features - i.e. top 6 features selected: [(True, 'exercised_stock_options', 25.097541528735491), (True, 'total_stock_value', 24.467654047526391), (True, 'bonus', 21.060001707536578), (True, 'wealth', 19.457343207083316), (True, 'salary', 18.575703268041778), (True, 'fraction_to_poi', 16.641707070468989), (False, 'long_term_incentive', 10.072454529369448),
def select_feat(data, labels, n_components):
    """Reduce ``data`` to its ``n_components`` best chi2-scored columns.

    A ``SelectKBest`` selector with the ``chi2`` score function is fitted on
    ``data``/``labels`` and the column-reduced data is returned.
    """
    selector = SelectKBest(chi2, k=n_components)
    reduced = selector.fit_transform(data, labels)
    return reduced
def test_bagging_with_pipeline():
    """Smoke test: BaggingClassifier fits when its base estimator is a pipeline."""
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    bagger = BaggingClassifier(base_pipeline, max_features=2)
    # The test passes as long as fitting does not raise.
    bagger.fit(iris.data, iris.target)
elif FeatSelection_SVM == True: X= LinearSVC(C=1, penalty="l1", dual=False,class_weight='auto').fit_transform(X, y) # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y) featureNames=featureNames[LogRegFeats.get_support()] print ("SVC Transformed X:",X.shape) ''' print("Plot #Feats vs Classification performance:") PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100)) ''' KFilt=None # KFilt=200 if KFilt is not None: k = SelectKBest(k=KFilt).fit(X,y) X=k.transform(X) featureNames=featureNames[k.get_support()] print("X reduced to K best features: ",X.shape) print("Performance as a function of percent of features used:") PlotPerfPercentFeatures(X,y,est=LinearSVC()) #varFilt = VarianceThreshold(threshold=0.05) #X = varFilt.fit_transform(X) #print(X.shape,"X post low variance feature filtering") 'EG - graph best features; feature selection using RF, ensemble classifiers..' 'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb' RFE_FeatsToKeep = 15
max_df=0.75, stop_words='english', tokenizer=tokenize) #count_vectorizer = count_vectorizer.fit(twenty_train.data) # tf-idf transformer from sklearn.feature_extraction.text import TfidfTransformer tfidf_transformer = TfidfTransformer() ########## ########## print("SGDClassifier") #SGDClassifier from sklearn.linear_model import SGDClassifier pipeline = Pipeline([ ('vect', count_vectorizer), ('chi2', SelectKBest(chi2, k=1500)), ('tfidf', tfidf_transformer), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)), ]) #Fit print("Fit") text_clf_sgd = pipeline.fit(twenty_train.data, twenty_train.target) #Evaluation of the performance on the test set print("Evaluation")
m = len(temparray)

# Count labels equal to 1 (tn) and every other label (fn).
labels_seen = list(y)
tn = sum(1 for label in labels_seen if label == 1)
fn = len(labels_seen) - tn
print(4, tn, fn)

maxFeature = 16
timestampMaxFeature = 6
numMaxFeature = 12

# Select the k best features by chi-square score. Only the scores are used
# below, so fit() is enough; the transformed matrix was being discarded.
model2 = SelectKBest(chi2, k=maxFeature)
model2.fit(temparray, y)
featureScore = model2.scores_.tolist()

# Create the axes first and draw on it explicitly. Calling plt.bar() before
# fig.add_subplot() can leave the bars on one axes and the grid/title on a
# second, overlapping axes in recent matplotlib releases.
fig = plt.figure(figsize=(9.6, 5.4))
ax = fig.add_subplot(1, 1, 1)
# A bar chart is used instead of a line chart because many scores are 0.
# Positions follow the number of scores rather than a hard-coded 31.
ax.bar(list(range(len(featureScore))), featureScore)
ax.xaxis.grid(True, which='major')  # x-axis grid follows the major ticks
ax.set_title("Cate " + str(cate) + " Feature Score Distribution Of Chi-square Test")
ax.set_xlabel("Feature")
ax.set_ylabel("Score")
# plt.show()
def get_kBest_mutual(X, y):
    """Fit a SelectKBest scorer based on ``mutual_info_regression``.

    ``k="all"`` is passed, so the fitted selector is returned with every
    feature retained.
    """
    selector = SelectKBest(score_func=mutual_info_regression, k="all")
    return selector.fit(X, y)
# Draw a pairwise scatterplot of the data.
sns.pairplot(cardial_3)

# 1st method: univariate feature selection
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

array = cardial_3.values
X = array[:, 0:120]
Y = array[:, 120]

# feature extraction; k tells how many top features we need
test = SelectKBest(score_func=chi2, k=10)
fit = test.fit(X, Y)

# summarize scores
set_printoptions(precision=3)
scores = fit.scores_
scores_df = pd.DataFrame(scores)
top_index = scores_df.sort_values(0, ascending=False).head(10).index
# Row i of scores_df is column i of X (the first 120 columns), so the
# positions can index the column labels directly. The previous
# columns[[top_index]] wrapped the index in a list, i.e. a 2-D indexer.
colname_uni = cardial_3.columns[top_index]
# features = fit.transform(X)
# 0 RAZRIV
# 1 S_AD_KBRIG
# 2 ROE
# 3 ZSN_A
target = data['PSS_Stress']
# axis must be passed by keyword: the positional form drop(label, 1) is
# rejected by pandas 2.x.
data = data.drop('PSS_Stress', axis=1)

# Missing Data Filtering
# Number of records that contain at least one NaN value.
print(data.isnull().any(axis=1).sum())
data = data.fillna(data.median())  # replace NaN with the column median
# data = data.fillna(data.mean())  # replace NaN with the column mean
# data = data.dropna()  # drop records that contain NaN

# Feature selection
selector = SelectKBest(f_classif, k=5)  # tried for k = 1..10
selector.fit(data, target)
cols = selector.get_support(indices=True)
cols_names = list(data.columns[cols])
for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)
data = data[cols_names]

# Compare results: rescale the selected columns to the [0, 1] range.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
values_standardized = scaler.fit_transform(data.values)
data = pd.DataFrame(values_standardized, columns=data.columns)
# All prints use the single-argument call form, which produces the same
# output under Python 2 and Python 3 (print("") replaces the bare `print`).
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")

print("Extracting features from the test dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)
print("")

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    # Fit the selector on the training data only, then reuse it on the test data.
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print("done in %fs" % (time() - t0))
    print("")


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


# mapping from integer feature name to original token string
feature_names = np.asarray(vectorizer.get_feature_names())
from nltk.corpus import words

df = pd.read_csv('fake_real_tweets.csv')
print('loaded data')
y = df.label
df = df.drop('label', axis=1)
# Index.get_values() was removed in pandas 1.0; .values gives the same array.
indices = df.index.values
x_train, x_main_test, y_train, y_main_test, train_indices, test_indices = train_test_split(
    df['text'], y, indices, test_size=0.33, shuffle=True)
stop_words = ['http', 'https', 'twitter', 'com', 'www']

# TF-IDF features, reduced to the 1000 highest mutual-information terms.
print('learning tfidf')
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
clf = SelectKBest(score_func=mutual_info_classif, k=1000)
tfidf_fit = clf.fit(tfidf_train, y_train)
tfidf_x_train_ft = tfidf_fit.transform(tfidf_train)
print('done tfidf')

# Raw count features, reduced the same way with a fresh selector.
print('learning count vectorizer')
count_vectorizer = CountVectorizer(stop_words=stop_words)
count_train = count_vectorizer.fit_transform(x_train)
clf = SelectKBest(score_func=mutual_info_classif, k=1000)
count_fit = clf.fit(count_train, y_train)
count_x_train_ft = count_fit.transform(count_train)
print('done count vectorizer')

print('learning mn count')
mn_count_clf = MultinomialNB()
mn_count_clf.fit(count_x_train_ft, y_train)
for rs in random_seeds: # choose a random sample of zeros (Legit Class) credit_data_df_legit_random = credit_data_df_legit.sample(numberOfZeros, random_state=rs) # merge the above with the ones (Fraud Class) and do the rest of the pipeline with it result = credit_data_df_legit_random.append(credit_data_df_fraud) # create dataframe X, which includes variables time, amount, V1, V2, V3, V4 etc X = result[features] # create array y, which includes the classification only y = result['Class'] #Select the best features Using the SelectKBest Method from sklearn select_kbest = SelectKBest(f_classif, k=24) #Fit the method onto the data and then return a transformed array X_new = select_kbest.fit_transform(X, y) # use sklearn to split the X and y, into X_train, X_test, y_train y_test with 80/20 split X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=rs, stratify=y) # ------------------------------------------------------------------------------------------------------------------------------------------------------------------# # TRAINING ON THE TRAINING SET # ------------------------------------------------------------------------------------------------------------------------------------------------------------------# # use sklearns Logistic Regression to fit a model to train data clf = LogisticRegression(random_state=rs,
newdf_test.drop('service', axis=1, inplace=True)
print(newdf_test['label'].value_counts())

# Split features from the label column. axis is passed by keyword because
# the positional form drop(label, 1) is rejected by pandas 2.x.
X_DOS = newdf.drop('label', axis=1)
Y_DOS = newdf.label
X_DOS_test = newdf_test.drop('label', axis=1)
Y_DOS_test = newdf_test.label
colNames = list(X_DOS)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Silence numpy divide/invalid warnings for the statements that follow.
np.seterr(divide='ignore', invalid='ignore')
# k was iterated from 1 to 120; the max. accuracy comes at k=111.
fclass = SelectKBest(f_classif, k=111)
fclass.fit(X_DOS, Y_DOS)
true = fclass.get_support()
# Translate the boolean support mask into column positions, then names.
fclasscolindex_DOS = [i for i, x in enumerate(true) if x]
fclasscolname_DOS = [colNames[i] for i in fclasscolindex_DOS]
print('Features selected :', fclasscolname_DOS)

features = newdf[fclasscolname_DOS].astype(float)
features1 = newdf_test[fclasscolname_DOS].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)
t0 = time()
names_hashingfile, order=-1) """ Random forest consists of a number of decision trees. Every node in the decision trees is a condition on a single feature, designed to split the dataset into two so that similar response values end up in the same set. Random forest’s impurity based ranking is typically aggressive in the sense that there is a sharp drop-off of scores after the first few top ones. Tree based methods can model non-linear relations well and don’t require much tuning. For a forest, the impurity decrease from each feature can be averaged and the features are ranked according to this measure. """ rf = RandomForestRegressor() rf.fit(X, Y) ranks["RF"] = rank_to_dict(rf.feature_importances_, names_hashingfile) """ Selecting the k-best features """ kbest = SelectKBest(chi2, total_features) #using all the features for analysis kbest.fit(X, Y) #print(np.abs(kbest.scores_)) ranks["KBest"] = rank_to_dict(np.nan_to_num(np.abs(kbest.scores_)), names_hashingfile) """ Another tree based classifier """ treec = ExtraTreesClassifier() treec.fit(X, Y) ranks["ExtraTrees"] = rank_to_dict(treec.feature_importances_, names_hashingfile) """ With linear correlation (Lin. corr.), each feature is evaluated independently, and we measure the linear relationship between each feature and the response variable. """
print("%d samples, %d features" % (n_subjects, n_features)) ### Prediction with SVR ####################################################### print("ANOVA + SVR") ### Define the prediction function to be used. # Here we use a Support Vector Classification, with a linear kernel from sklearn.svm import SVR svr = SVR(kernel='linear') ### Dimension reduction from sklearn.feature_selection import SelectKBest, f_regression # Here we use a classical univariate feature selection based on F-test, # namely Anova. feature_selection = SelectKBest(f_regression, k=2000) # We have our predictor (SVR), our feature selection (SelectKBest), and now, # we can plug them together in a *pipeline* that performs the two operations # successively: from sklearn.pipeline import Pipeline anova_svr = Pipeline([('anova', feature_selection), ('svr', svr)]) ### Fit and predict anova_svr.fit(gm_maps_masked, age) age_pred = anova_svr.predict(gm_maps_masked) ### Visualization ### Look at the SVR's discriminating weights coef = svr.coef_
data_dict[key]['from_poi_to_this_person'], data_dict[key]['to_messages']) data_dict[key]['ratio_to_poi_email'] = np.true_divide( data_dict[key]['from_this_person_to_poi'], data_dict[key]['from_messages']) my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) #Feature Selection trimmed_features_list = ['poi'] k = 5 select = SelectKBest(k=k) select = select.fit(features, labels) features = select.transform(features) top_scores = np.sort(select.scores_)[-k:] for i in range(len(features_list[1:])): if select.scores_[i] in top_scores: trimmed_features_list.append(features_list[1:][i]) features_list = trimmed_features_list ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html
# Fit the MLP on the training data, timing the fit, and report per-class
# metrics on the test set.
t0 = time()
mlp.fit(X_train_s, y_train)
print("done in %0.3fs" % (time() - t0))
y_pred = mlp.predict(X_test_s)
print(classification_report(y_test, y_pred, target_names=target_names))

# %%
# Univariate feature filtering (Anova) with Logistic-L2
# -----------------------------------------------------

# Scaling, ANOVA feature selection and the classifier are chained so that
# the grid search refits all three steps inside each CV fold.
anova_l2lr = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('anova', SelectKBest(f_classif)),
    ('l2lr', lm.LogisticRegression(max_iter=1000, class_weight='balanced',
                                   fit_intercept=False))
])
# The last 'anova__k' candidate equals the number of columns of X_train,
# i.e. no filtering.
# NOTE(review): the fixed candidates assume X_train has at least 1500
# columns - confirm against the dataset.
param_grid = {'anova__k': [50, 100, 500, 1000, 1500, X_train.shape[1]],
              'l2lr__C': 10. ** np.arange(-3, 3)}
anova_l2lr_cv = GridSearchCV(anova_l2lr, cv=5, param_grid=param_grid,
                             n_jobs=5)
t0 = time()
anova_l2lr_cv.fit(X=X_train, y=y_train)
print("done in %0.3fs" % (time() - t0))
print("Best params found by grid search:")
print(anova_l2lr_cv.best_params_)
#
# One downside of this is that we are using knowledge from the dataset to select features, and thus introducing some overfitting. We could get around the overfitting in the "real world" by using a subset of the data for feature selection, and using a different subset for training the algorithm. We'll make things a bit simpler for now and skip that step.

# In[7]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Convert the upvotes variable to binary so it works with a chi-squared test:
# 1 for headlines strictly above the mean, 0 otherwise. A single comparison
# guarantees a two-valued target; the previous two-step masking left values
# exactly equal to the mean unchanged.
col_mean = submissions["upvotes"].mean()
col = (submissions["upvotes"] > col_mean).astype(int)

# Find the 1000 most informative columns
selector = SelectKBest(chi2, k=1000)
selector.fit(full_matrix, col)
top_words = selector.get_support().nonzero()

# Pick only the most informative columns in the data.
chi_matrix = full_matrix[:, top_words[0]]

# ###7: Adding meta features

# If we ignore the "meta" features of the headlines we're missing out on a lot of good information. These features are things like length, amount of punctuation, average word length, and other sentence specific features.
#
# Adding these in can greatly increase prediction accuracy.
#
# To add them in, we'll loop over our headlines, and apply a function to each one. Some functions will count the length of the headline in characters, and others will do more advanced things, like counting the number of digits.

# In[8]:
def get_kBest_f_regr(X, y):
    """Fit a SelectKBest scorer based on ``f_regression`` keeping all features.

    ``k="all"`` retains every column, exactly as ``k=X.shape[1]`` did, but
    does not require ``X`` to expose a ``.shape`` attribute, so plain nested
    lists are accepted too.

    :param X: feature matrix (any array-like accepted by scikit-learn)
    :param y: target values
    :return: the fitted ``SelectKBest`` instance
    """
    return SelectKBest(score_func=f_regression, k="all").fit(X, y)
# In[20]: from sklearn.pipeline import Pipeline from sklearn.neighbors import KNeighborsRegressor from sklearn.impute import SimpleImputer from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import GridSearchCV from sklearn import metrics imputer = SimpleImputer(strategy='median') constant = VarianceThreshold( threshold=0.0) #Feature selector that removes all low-variance features. min_max_scaler = MinMaxScaler() selector = SelectKBest(f_regression) knn = KNeighborsRegressor() selectkbest = Pipeline([('impute', imputer), ('constant', constant), ('scaler', min_max_scaler), ('select', selector), ('knn_regression', knn)]) selectkbest = selectkbest.fit(X_train, y_train) y_test_pred = selectkbest.predict(X_test) print("\n The MSE is:\n", metrics.mean_squared_error(y_test_pred, y_test)) # Hyper-parameter tunning of the k number of features # In[21]:
import pandas
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

df = pandas.read_csv('../Data/customer_behavior.csv')
x = df[['bachelor', 'gender', 'age', 'salary']]
y = df['purchased'].values

# Variance filter with the default threshold; print the surviving values
# and the boolean mask of kept columns.
sel = VarianceThreshold()
x_val = sel.fit_transform(x)
print(x_val)
print(sel.get_support())

# Keep the two columns with the highest chi2 score. fit_transform() fits the
# selector itself, so the separate fit() call that preceded it was redundant.
clf = SelectKBest(chi2, k=2)
x_new = clf.fit_transform(x, y)
print(clf.scores_)
print(x_new)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

glass = pd.read_csv("glass.csv")

##### correlation heatmap of all columns
plt.figure(figsize=(10, 5))
corr = glass.corr()
sns.heatmap(corr, annot=True, linewidths=.2)

# feature selection: the first nine columns are the features, the tenth
# column is the target.
x = glass.iloc[:, 0:9]
y = glass.iloc[:, 9]
# k='all' keeps every feature; the selector is used here only for its scores.
bestfeature = SelectKBest(score_func=chi2, k='all')
fit = bestfeature.fit(x, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
# Put each feature name next to its chi2 score.
featurescore = pd.concat([dfcolumns, dfscores], axis=1)
# NOTE(review): the first column holds the feature names but is labelled
# 'RI' - presumably copied from a column name; confirm the intended header.
featurescore.columns = ['RI', 'Score']
print(featurescore.nlargest(9, "Score"))

### feature importance
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(x, y)
# use the built-in feature_importances_ attribute of tree based classifiers
print(model.feature_importances_
      )
# plot graph of feature importances for better visualization
cor_feature = X.iloc[:, np.argsort(np.abs(cor_list) )[-num_feats:]].columns.tolist() # feature selection? 0 for not select, 1 for select cor_support = [True if i in cor_feature else False for i in feature_name] return cor_support, cor_feature cor_support, cor_feature = cor_selector(X, y, num_feats) print(str(len(cor_feature)), 'selected features') ## 2) Chi-squared - between features and target, keep features with higher chi-square from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 from sklearn.preprocessing import MinMaxScaler X_norm = MinMaxScaler().fit_transform(X) chi_selector = SelectKBest(chi2, k=num_feats) chi_selector.fit(X_norm, y) chi_support = chi_selector.get_support() chi_feature = X.loc[:, chi_support].columns.tolist() print(str(len(chi_feature)), 'selected features') ## 3) Recursive elimination - recursively reducing/eliminating features # frature importance value calculated each time, and lowest gets dropped from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5) rfe_selector.fit(X_norm, y)
# In[12]: ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list_n, sort_keys=True) labels, features = targetFeatureSplit(data) # In[18]: def skipOne(elem): return elem[1] from sklearn.feature_selection import SelectKBest, f_classif selector = SelectKBest(f_classif, k=5) selector.fit(features, labels) scores = zip(features_list_n[1:], selector.scores_) sorted_scores = sorted(scores, key=skipOne, reverse=True) pprint.pprint('SelectKBest scores: ') pprint.pprint(sorted_scores) all_features = features_list + [(i[0]) for i in sorted_scores[0:20]] pprint.pprint(all_features) kBest_features = features_list + [(i[0]) for i in sorted_scores[0:10]] pprint.pprint('KBest') pprint.pprint(kBest_features) # In[14]: from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = train_test_split(
def select_top_k_features(data, labels, n_components=1700):
    """Return ``data`` restricted to its ``n_components`` top chi2 features.

    The selection is done by a ``SelectKBest`` instance scored with ``chi2``
    and fitted on ``data`` and ``labels``.
    """
    chi2_selector = SelectKBest(chi2, k=n_components)
    data = chi2_selector.fit_transform(data, labels)
    return data
labels = np.array(labels)
cv = StratifiedShuffleSplit(n_splits=1000, random_state=42)
# NOTE(review): every iteration overwrites the previous train/test arrays, so
# only the last of the 1000 splits is used below. The loop body is assumed to
# be just these two assignments - confirm against the original indentation.
for train_idx, test_idx in cv.split(features, labels):
    features_train, features_test = features[train_idx], features[test_idx]
    labels_train, labels_test = labels[train_idx], labels[test_idx]

### Import modules
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Make pipeline: scale -> select k best features -> logistic regression.
pipe = make_pipeline(MinMaxScaler(), SelectKBest(),
                     LogisticRegression(random_state=42))
# Call form of print: valid on Python 3, same output on Python 2.
print("Pipe steps: \n{}".format(pipe.steps))

# parameter grid for SelectKBest and the classifier (keys use the step names
# that make_pipeline derives from the lower-cased class names):
param_grid = {
    'selectkbest__k': range(5, 16),
    'logisticregression__C': (1, 10, 100, 1000),
    'logisticregression__solver': ('newton-cg', 'lbfgs', 'sag'),
}

# gridsearch and cross-validation:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=15)

# fitting:
grid.fit(features_train, labels_train)

# evaluation metrics:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, classification_report
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs" % (duration))  #, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()

select_chi2 = 100000
print("chi-2 ? (True/False)")
# input() returns text on Python 3, and any non-empty string (including
# "False") is truthy. Interpret the answer explicitly; str() keeps this
# working where input() already yields a bool.
Bchi = str(input("chi-2 ? (True/False)")).strip().lower() in (
    "true", "t", "yes", "y", "1")
if Bchi:  #opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" % select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=select_chi2)
    # Fit the selector on the training data only, then reuse it on the test data.
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print()

###############################################################################
# Benchmark classifiers
# build n-models for each classifier
# a test sample is given to all n-models.. each model classifies it and each model's accuracy is calculated separately and averaged to get the accuracy of the whole model
# ## ! when a test sample is given it does NOT get the list of probabilities from all classifiers ; it does get to know whether it belongs to a class or not; all bulk tests are given to a model and it is tested
pickle.dump(clf, open(filename, 'wb')) train_path = os.path.abspath(join(cwd, args.train)) serialization_dir = os.path.abspath(join(cwd, args.serialization_dir)) print("Load data") X_train, y_train = load_dataset(train_path) target_names = list(set([i[0] for i in y_train])) print("%d documents" % len(X_train)) print("%d categories" % len(target_names)) print("Training model") t0 = time() transformer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8) ch2 = SelectKBest(chi2, k=20000) X_train = transformer.fit_transform(X_train) X_train = ch2.fit_transform(X_train, y_train) y_transformer = LabelEncoder() y_train = [item for sublist in y_train for item in sublist] y_train = y_transformer.fit_transform(y_train) model = LinearSVC(C=1) estimator = model.fit(X_train, y_train) t1 = time() - t0 print("Train time: %0.3fs" % t1) t0 = time() save_model(serialization_dir + "/x_transformer.pkl", transformer) save_model(serialization_dir + "/y_transformer.pkl", y_transformer)
def _plot_loo_confusion_matrix(data, labels, title):
    """Draw a leave-one-out logistic-regression confusion matrix on the current axes."""
    plt.title(title)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    # NOTE(review): LeaveOneOut(n) is the signature of the legacy
    # `cross_validation` module this file uses; kept unchanged.
    loo = cross_validation.LeaveOneOut(len(labels))
    lr = linear_model.LogisticRegression(C=1e5)
    predicted_labels = cross_val_predict(lr, data, labels, cv=loo)
    cm = confusion_matrix(labels, predicted_labels)
    # Row-normalise so the colour shows the fraction of each true class.
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm_norm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['uninteresting', 'interesting'], rotation=45)
    plt.yticks(tick_marks, ['uninteresting', 'interesting'])
    # Write the raw counts (not the normalised values) into the cells.
    for i, row in enumerate(cm):
        for j, count in enumerate(row):
            plt.annotate(count, xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center',
                         bbox=dict(fc='w', boxstyle='round,pad=1'))


def main(footprint_path, training_path, od):
    """Run the PCA/k-means and feature-ranking analyses and save the results.

    Files written into ``od``: ``pca_param_optimization.png``,
    ``pca_scatter.png``, ``feature_scores.csv``, ``feature_strat.png`` and
    ``feature_selection_classification_results.png``.

    :param footprint_path: input footprints csv (first column is the index)
    :param training_path: input training data csv (first column is the index;
        must contain an ``induced`` column)
    :param od: output directory
    :return: None
    """
    plt.rcParams['figure.figsize'] = 10.5, 4
    features = pd.read_csv(footprint_path, index_col=0)
    training_label_df = pd.read_csv(training_path, index_col=0)
    # Keep only the footprints that have a training label.
    training_data = features.loc[training_label_df.index]
    training_labels = training_label_df['induced']

    # ---- UNSUPERVISED ROUTE ----
    pca = PCA(n_components=None)
    pca.fit(training_data)
    plt.title("PCA Parameter Optimization")
    plt.xlabel("Number Components Used")
    plt.ylabel("Cumulative Explained Variance Ratio")
    # Tick positions are 0-based plot indices labelled 1..n. The label range
    # must have as many entries as there are positions (it was one short).
    plt.xticks(np.arange(pca.n_components_),
               np.arange(1, pca.n_components_ + 1))
    # Visibility is passed positionally: the `b=` keyword was renamed and
    # later removed in matplotlib.
    plt.grid(True, which='major', color='0.65', linestyle='--')
    plt.axhline(0.8, color='y', linestyle='--', linewidth=2)
    plt.axhline(0.9, color='g', linestyle='--', linewidth=2)
    plt.plot(np.cumsum(pca.explained_variance_ratio_), 'bo-')
    plt.savefig(os.path.join(od, 'pca_param_optimization.png'))
    plt.clf()

    fig = plt.figure()
    plt.suptitle("Comparison of # PCA Components Kept", fontsize=18)
    label_colors = ['r' if x else 'b' for x in training_labels]

    # Right panel: projection onto three components.
    ax = plt.subplot(122, projection='3d')
    ax.set_title('PCA Reduction to 3 Components')
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    ax.set_zlabel("PCA Component 3")
    pca = PCA(n_components=3)
    reduced_data = pca.fit_transform(training_data)
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2],
               c=label_colors, s=120, alpha=0.5)

    # Left panel: projection onto two components with k-means regions.
    ax = plt.subplot(121)
    ax.set_title('PCA Reduction to 2 Components')
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(training_data)
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1],
               c=label_colors, s=120, alpha=0.5)

    kmeans = KMeans(init='k-means++', n_clusters=2, n_init=1)
    kmeans.fit(reduced_data)
    # Predict the cluster of every point on a grid of step h to colour the
    # background by cluster membership.
    h = 0.01
    x_min, x_max = reduced_data[:, 0].min() - 0.1, reduced_data[:, 0].max() + 0.1
    y_min, y_max = reduced_data[:, 1].min() - 0.1, reduced_data[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=plt.cm.Pastel1, aspect='auto', origin='lower')
    centroids = kmeans.cluster_centers_
    ax.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
               linewidths=3, color='black', zorder=10)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    fig.set_tight_layout(dict(pad=5.0))
    plt.savefig(os.path.join(od, 'pca_scatter.png'))
    plt.clf()

    # ---- SUPERVISED ROUTE ----
    # Score every feature twice: chi2 (SelectKBest) and extra-trees importance.
    kb = SelectKBest(chi2, k='all')
    kb.fit(training_data, training_labels)
    kb_df = pd.DataFrame(kb.scores_, columns=['kb_score'],
                         index=training_data.columns)
    clf = ExtraTreesClassifier()
    clf = clf.fit(training_data, training_labels)
    clf_df = pd.DataFrame(clf.feature_importances_, columns=['clf_score'],
                          index=training_data.columns)
    fdf = pd.concat(
        [clf_df.rank().astype(int), clf_df, kb_df.rank().astype(int), kb_df],
        axis=1, join='inner')
    fdf.columns = ['clf_rank', 'clf_score', 'kb_rank', 'kb_score']
    fdf = fdf.sort_values('clf_rank', ascending=False)
    # Higher rank value means a better score, so the sum favours features
    # that both methods rate highly.
    fdf['combine_rank'] = (fdf['clf_rank'] + fdf['kb_rank']).astype(int)
    fdf = fdf.sort_values('combine_rank', ascending=False)
    # One more than the number of '&' characters in the feature name.
    fdf['path_length'] = [1 + x.count('&') for x in fdf.index]
    fdf.to_csv(os.path.join(od, 'feature_scores.csv'), float_format='%.3f')

    fig = plt.figure()
    fig.suptitle('Feature Stratification', fontsize=18)
    ax = plt.subplot(121)
    ax.set_title("Feature Score Scatter")
    ax.set_xlabel("K-Best Score")
    ax.set_ylabel("Random Forest Score")
    ax.scatter(fdf['kb_score'], fdf['clf_score'], s=120, alpha=0.6)
    ax = plt.subplot(122)
    ax.set_title("Feature Rank Scatter")
    ax.set_xlabel("K-Best Rank")
    ax.set_ylabel("Random Forest Rank")
    ax.scatter(fdf['kb_rank'], fdf['clf_rank'], s=120, alpha=0.6,
               c=(fdf['kb_rank'] + fdf['clf_rank']))
    fig.set_tight_layout(dict(pad=5.0))
    fig.savefig(os.path.join(od, 'feature_strat.png'))
    plt.clf()

    # train LR: compare all features against the four best combined ranks.
    selected_features = fdf['combine_rank'].sort_values(
        ascending=False).head(4).index
    features_cut = training_data[selected_features]
    plt.suptitle("Before and After Feature Selection", fontsize=16)
    plt.subplot(1, 2, 1)
    _plot_loo_confusion_matrix(
        training_data, training_labels,
        'All Features Logistic Regression Confusion Matrix')
    plt.subplot(1, 2, 2)
    _plot_loo_confusion_matrix(
        features_cut, training_labels,
        'Top Features Logistic Regression Confusion Matrix')
    plt.tight_layout(pad=5.0)
    plt.savefig(
        os.path.join(od, 'feature_selection_classification_results.png'))
    plt.clf()
if DO_SVD: print("dimension reduction svd with d=%d" % Reduction_D) svd = TruncatedSVD(n_components=Reduction_D, algorithm="randomized", n_iterations=5, random_state=None, tol=0) data = svd.fit_transform(data) if DO_NMF: print("dimension reduction nmf with d=%d" % Reduction_D) nmf = NMF(n_components=Reduction_D) data = nmf.fit_transform(data) print("Extracting best features by a chi-squared test") ch2NumFeatures = 1000 ch2 = SelectKBest(chi2, k=ch2NumFeatures) # print vectorizer.get_stop_words() data = ch2.fit_transform(data, target) # print data KNN = 5 nn = NearestNeighbors(n_neighbors=KNN + 1, algorithm='ball_tree').fit(data.todense()) # query and data are the same so every node is counted as its most similar here distances, indices = nn.kneighbors(data.todense()) nodeIndex = -1 nodeHomophilies = [] for neighbors in indices: nodeHomophily = 0 nodeIndex += 1