def string_selection():
    """Score a Naive Bayes model on the 100 best chi-squared string tokens.

    Loads the training data through ``db_tool``, holds out 20% of it, builds
    a bag-of-words matrix from the ``'string-data'`` column and keeps the 100
    columns ranked highest by chi-squared. Prints the chi-squared scores, the
    selected column indices, the selected tokens and finally the hold-out
    accuracy of a ``MultinomialNB`` classifier. Returns nothing.
    """
    bag_of_words = CountVectorizer(decode_error='ignore')
    chi2_selector = SelectKBest(chi2, k=100)

    # The permission list returned alongside the data is not used here.
    train_data, _ = db_tool.get_new_train_data()
    docs_train, docs_test, y_train, y_test = cross_validation.train_test_split(
        train_data['string-data'],
        train_data['target'],
        test_size=0.2,
        random_state=1)

    # Feature extraction: fit on the training split only.
    counts_train = bag_of_words.fit_transform(docs_train)
    vocabulary = bag_of_words.get_feature_names()
    selected_train = chi2_selector.fit_transform(counts_train, y_train)
    kept_tokens = [vocabulary[col]
                   for col in chi2_selector.get_support(indices=True)]
    print(chi2_selector.scores_)
    print(chi2_selector.get_support(indices=True))
    print(kept_tokens)

    # The test split goes through the already-fitted vectorizer and selector.
    selected_test = chi2_selector.transform(bag_of_words.transform(docs_test))

    # Build the model and validate it on the hold-out split.
    classifier = MultinomialNB().fit(selected_train, y_train)
    print(metrics.accuracy_score(y_test, classifier.predict(selected_test)))
def test_mutual_info_classif():
    """Check SelectKBest/SelectPercentile with ``mutual_info_classif``.

    For each mode the dedicated selector must produce the same reduced matrix
    as ``GenericUnivariateSelect`` configured equivalently, and its support
    mask must mark exactly the first two of the five generated columns.
    """
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    expected_support = np.zeros(5)
    expected_support[:2] = 1

    # (dedicated selector, equivalent GenericUnivariateSelect mode, param)
    cases = (
        (SelectKBest(mutual_info_classif, k=2), "k_best", 2),
        (SelectPercentile(mutual_info_classif, percentile=40), "percentile", 40),
    )
    for dedicated, mode, param in cases:
        reduced = dedicated.fit(X, y).transform(X)
        generic = GenericUnivariateSelect(mutual_info_classif, mode=mode, param=param)
        assert_array_equal(reduced, generic.fit(X, y).transform(X))
        assert_array_equal(dedicated.get_support(), expected_support)
def test_mutual_info_regression():
    """Check SelectKBest/SelectPercentile with ``mutual_info_regression``.

    Both modes must agree with the equivalent ``GenericUnivariateSelect`` and
    must select exactly the first two of the ten generated columns. The
    k-best selector is additionally checked with ``assert_best_scores_kept``.
    """
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    def generic_transform(mode, param):
        # Same selection expressed through the generic selector.
        generic = GenericUnivariateSelect(mutual_info_regression, mode=mode, param=param)
        return generic.fit(X, y).transform(X)

    expected_support = np.zeros(10)
    expected_support[:2] = 1

    # KBest mode.
    kbest = SelectKBest(mutual_info_regression, k=2)
    reduced = kbest.fit(X, y).transform(X)
    assert_best_scores_kept(kbest)
    assert_array_equal(reduced, generic_transform('k_best', 2))
    assert_array_equal(kbest.get_support(), expected_support)

    # Percentile mode.
    by_percentile = SelectPercentile(mutual_info_regression, percentile=20)
    reduced = by_percentile.fit(X, y).transform(X)
    assert_array_equal(reduced, generic_transform('percentile', 20))
    assert_array_equal(by_percentile.get_support(), expected_support)
def featureSelection(X, y, selection_method, estimator=None, num_features=None,
                     feature_names=None, features_file=None):
    """Return the boolean support mask chosen by ``selection_method``.

    Parameters
    ----------
    X, y
        Training matrix and targets handed to the selector's ``fit``.
    selection_method : str
        ``"kbest"`` ranks features with ``f_regression`` and keeps
        ``num_features`` of them; ``"from_model"`` wraps ``estimator`` in
        ``SelectFromModel`` with a fixed threshold of 0.8.
    estimator
        Model used by ``"from_model"``; ignored otherwise.
    num_features : int or None
        Number of features for ``"kbest"``. ``None`` keeps every feature
        (previously ``None`` was forwarded as ``k`` and made the fit fail).
    feature_names, features_file
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The selector's ``get_support()`` mask, or ``None`` when
    ``selection_method`` is not one of the two recognised values.
    """
    if selection_method == "kbest":
        k = "all" if num_features is None else num_features
        selector = SelectKBest(f_regression, k=k)
    elif selection_method == "from_model":
        # threshold must be passed by keyword: it is keyword-only in current
        # scikit-learn releases, and the keyword form works on old ones too.
        selector = SelectFromModel(estimator, threshold=0.8)
    else:
        return None
    return selector.fit(X, y).get_support()
def pred_SOC(train, val, test, all_vars, loop):
    """Fit the SOC model ensemble and hand its predictions to ``write_preds``.

    Each of ``train``, ``val`` and ``test`` gains one prediction column per
    model (``SOC_for_prds``, ``SOC_gbr_prds``, ``SOC_rdg_prds``,
    ``SOC_svr_prds``); the frames are modified in place.

    Parameters
    ----------
    train, val, test : DataFrames holding the ``all_vars`` columns; ``train``
        must also hold the ``'SOC'`` target.
    all_vars : list of predictor column names.
    loop : value appended to ``'SOC_prds'`` to name the combined output.
    """
    data = (val, test, train)
    target = train['SOC']

    # Variable selection: lasso-selected variables united with the top-k
    # univariate F-test variables (two different k values).
    SOC_lassoed_vars = lass_varselect(train, all_vars, 'SOC', .000000001)
    wide_mask = SelectKBest(score_func=f_regression, k=4500).fit(
        train[all_vars], target).get_support()
    narrow_mask = SelectKBest(score_func=f_regression, k=200).fit(
        train[all_vars], target).get_support()

    chosen = [var for var, by_lasso, by_f
              in zip(all_vars, SOC_lassoed_vars, wide_mask) if by_lasso | by_f]
    chosen2 = [var for var, by_lasso, by_f
               in zip(all_vars, SOC_lassoed_vars, narrow_mask) if by_lasso | by_f]

    # Random forest. Columns are selected with frame[cols]; the original
    # .ix indexer no longer exists in pandas.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train[chosen], target)
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset[chosen])

    # Gradient boosting on the narrower variable set.
    gbr = GradientBoostingRegressor(n_estimators=900, learning_rate=.0785,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4, subsample=.4)
    gbr.fit(train[chosen2], target)
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset[chosen2])

    # Ridge on every variable.
    # NOTE(review): `normalize` is not accepted by recent scikit-learn
    # RidgeCV; left untouched because replacing it changes the fitted model.
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], target)
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])

    # SVR
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train[chosen], target)
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset[chosen])

    # Combination. 'SOC_svr_prds' is listed twice, as in the original;
    # presumably to weight SVR double inside write_preds - confirm.
    models = ['SOC_rdg_prds', 'SOC_svr_prds', 'SOC_gbr_prds', 'SOC_for_prds',
              'SOC_svr_prds']
    name = 'SOC_prds' + str(loop)
    write_preds(models, name, train, val, test, 'SOC')
def _print_kbest_features(k, features, labels, show_scores=False):
    """Refit SelectKBest(f_classif) with ``k`` and print the chosen feature names."""
    selector = SelectKBest(f_classif, k=k)
    selector.fit(features, labels)
    if show_scores:
        print("features score: ")
        print(selector.scores_)
    # features_list[0] is skipped when mapping indices to names; presumably
    # it is the label column - confirm against where features_list is built.
    candidate_names = features_list[1:]
    print([candidate_names[i] for i in selector.get_support(indices=True)])


def use(method):
    """Grid-search one classifier pipeline and print its evaluation.

    ``method`` selects the pipeline: ``'naive bayes'``, ``'svm'`` or
    ``'decision tree'``. Features are min-max scaled (scaler fitted on the
    training split only), the pipeline is tuned with ``GridSearchCV``, and the
    accuracy plus ``calculate_precision_recall`` output are reported. Reads
    the module-level ``features_train``, ``features_test``, ``labels_train``,
    ``labels_test`` and ``features_list``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported names (the original
        code failed later with a NameError on ``pred``).
    """
    if method == 'naive bayes':
        estimators = [("skb", SelectKBest(score_func=f_classif)),
                      ('pca', PCA()),
                      ('bayes', GaussianNB())]
        parameters = {"skb__k": [8, 9, 10, 11, 12],
                      "pca__n_components": [2, 6, 4, 8]}
    elif method == 'svm':
        estimators = [('reduce_dim', PCA()), ('svc', SVC())]
        parameters = {'svc__C': [1, 10]}
    elif method == 'decision tree':
        estimators = [("skb", SelectKBest(score_func=f_classif)),
                      ('pca', PCA()),
                      ('tree', tree.DecisionTreeClassifier())]
        parameters = {"tree__min_samples_split": [2, 10],
                      "skb__k": [8, 9, 10, 11, 12],
                      "pca__n_components": [2, 4, 6, 8]}
    else:
        raise ValueError("unsupported method: %r" % (method,))

    # Shared by every method: scale, tune, predict.
    scaler = MinMaxScaler()
    features_train_scaled = scaler.fit_transform(features_train)
    features_test_scaled = scaler.transform(features_test)
    clf = grid_search.GridSearchCV(Pipeline(estimators), parameters)
    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    if method == 'svm':
        print(clf.best_estimator_)
    else:
        print(clf.best_params_)
        features_k = clf.best_params_['skb__k']
        if method == 'naive bayes':
            _print_kbest_features(features_k, features_train_scaled,
                                  labels_train, show_scores=True)
        else:
            # The decision-tree branch ranks the unscaled features, as the
            # original did.
            _print_kbest_features(features_k, features_train, labels_train)

    accuracy = accuracy_score(labels_test, pred)
    print("accuracy score:")
    print(accuracy)
    calculate_precision_recall(pred, labels_test)
def kbest_test():
    ''' Select K Best testing

    Loads the project dataset, min-max scales the features, keeps the six
    best ones by chi-squared and prints the selected features followed by
    every feature with its score, highest score first.
    '''
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2

    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # a dataset file you trust. Binary mode is required for pickles, and the
    # with-block closes the handle, which the original never did.
    with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
        enron_data = pickle.load(dataset_file)

    feature_list = ["poi",
                    "bonus",
                    "deferral_payments",
                    "deferred_income",
                    "director_fees",
                    "exercised_stock_options",
                    "expenses",
                    "from_messages",
                    #"from_this_person_to_poi",
                    #"from_poi_to_this_person",
                    "loan_advances",
                    "long_term_incentive",
                    "other",
                    "restricted_stock",
                    "restricted_stock_deferred",
                    "salary",
                    "shared_receipt_with_poi",
                    "to_messages",
                    "total_payments",
                    "total_stock_value"
                    ]

    data = featureFormat(enron_data, feature_list)
    labels, features = targetFeatureSplit(data)

    # rescale features to be in [0..1] range
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    sk = SelectKBest(chi2, k=6)  # f_classif
    sk.fit(features_scaled, labels)

    # Assumes targetFeatureSplit strips the first column ("poi") off as the
    # label, so column i of `features` is feature_list[i + 1]. Zipping the
    # full feature_list against the mask/scores shifted every name by one.
    feature_names = feature_list[1:]
    feature_list_new = [name for name, keep in zip(feature_names, sk.get_support())
                        if keep]

    print('--- Selected Features ---\r\n')
    print("%s \r\n%s" % (sk.get_support(True), feature_list_new))

    feature_list_scores = sorted(zip(feature_names, sk.scores_),
                                 key=lambda pair: pair[1], reverse=True)

    print('--- All Features ---')
    for name, score in feature_list_scores:
        print("{0} :  {1:.4f}".format(name, score))
def build_report(x_data, y_labels, classifier, cross_val_iterator, tfidf: TfidfVectorizer, features):
    """Cross-validate ``classifier`` on chi-squared-selected features.

    For every ``(train, test)`` index pair from ``cross_val_iterator`` the
    ``features`` best columns are chosen on the training fold only, the
    classifier is refitted and its metrics on the test fold are accumulated.

    Parameters
    ----------
    x_data, y_labels : indexable feature matrix and label array.
    classifier : estimator exposing ``fit`` and ``predict``.
    cross_val_iterator : iterable of ``(train, test)`` indices that also
        exposes ``n_folds``.
    tfidf : not used by the active code path.
    features : ``k`` passed to ``SelectKBest``.

    Returns
    -------
    ``(summed 3x3 confusion matrix, mean F1, mean precision, mean recall,
    Counter of labels, mean accuracy)``.
    """
    cm = numpy.zeros((3, 3))
    f1 = precision = recall = accuracy = 0.0
    support = Counter(y_labels)

    for train, test in cross_val_iterator:
        x_train, x_test = x_data[train], x_data[test]
        y_train, y_test = y_labels[train], y_labels[test]

        # Select on the training fold only so the test fold does not leak in.
        mask = SelectKBest(chi2, k=features).fit(x_train, y_train).get_support()
        x_train = x_train[:, mask]
        x_test = x_test[:, mask]

        y_pred = classifier.fit(x_train, y_train).predict(x_test)

        # The confusion matrix is 3x3, i.e. three classes. F1 and precision
        # therefore need an explicit multiclass average; 'weighted' matches
        # what recall already used.
        cm += metrics.confusion_matrix(y_test, y_pred)
        f1 += metrics.f1_score(y_test, y_pred, average='weighted')
        precision += metrics.precision_score(y_test, y_pred, average='weighted')
        recall += metrics.recall_score(y_test, y_pred, average='weighted')
        accuracy += metrics.accuracy_score(y_test, y_pred)

    n_folds = cross_val_iterator.n_folds
    return (cm, f1 / n_folds, precision / n_folds, recall / n_folds,
            support, accuracy / n_folds)
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    """Reduce train and test predictors with the chosen selection technique.

    Parameters
    ----------
    predictors, responses : training matrix and targets used to fit.
    test_predictors : matrix reduced with the selector fitted on the
        training data.
    selectFeatTech : int
        ``0`` keeps the 40 best features by chi-squared; ``1`` uses
        ``RandomizedLogisticRegression``.

    Returns
    -------
    ``(reduced training predictors, reduced test predictors, indices of the
    selected features)``.

    Raises
    ------
    ValueError
        If ``selectFeatTech`` is neither 0 nor 1 (the original fell through
        to an UnboundLocalError at the return statement).
    """
    if selectFeatTech == 0:
        # k is fixed at 40: the original computed 40% of the column count
        # and then immediately overwrote it with this constant.
        model = SelectKBest(chi2, k=40).fit(predictors, responses)
    elif selectFeatTech == 1:
        model = RandomizedLogisticRegression().fit(predictors, responses)
    else:
        raise ValueError("selectFeatTech must be 0 or 1, got %r" % (selectFeatTech,))

    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def selectFeatureSet_anova(data_x, data_y, nFeatures):
    """Select the ``nFeatures`` best columns of ``data_x`` by univariate F-test.

    Features are ranked with ``f_regression`` against ``data_y``. The boolean
    support mask and the matching column names are printed, and the column
    names are returned so a model can be refitted on that subset.

    Parameters
    ----------
    data_x : DataFrame of candidate features (``.columns`` is used).
    data_y : target values.
    nFeatures : number of features to keep.
    """
    # 1. Rank the features.
    anova_filter = SelectKBest(f_regression, k=nFeatures)
    anova_filter.fit(data_x, data_y)
    support = anova_filter.get_support()

    # 2. Select the top nFeatures features. The names come from data_x, the
    # frame the filter was fitted on; the original printed them from an
    # unrelated `test_x` name that is not defined in this function.
    selectedCols = data_x.columns[support]
    print('selected features in boolean: \n%s' % (support,))
    print('selected features in name: \n%s' % (selectedCols,))

    # 3. The caller reruns its model on these columns.
    return selectedCols
def Chi2(df, n):
    """Feature selection using Chi2 on the whole dataframe.

    Chi2 measures the dependence between stochastic variables; this method
    weeds out the features that are most likely to be independent of class.

    Parameters
    ----------
    df : DataFrame with a ``'Class'`` column; every other column is a
        candidate feature.
    n : number of features to keep.

    Returns
    -------
    DataFrame with ``'Class'`` first, followed by the ``n`` selected feature
    columns in their original order.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2

    feature_frame = df.drop('Class', axis=1)

    # Set selection to chi2 with n to keep
    ch2 = SelectKBest(chi2, k=n)
    ch2.fit(feature_frame.values, df.loc[:, 'Class'].values)

    # Map the kept indices through the feature-only frame's own columns, so
    # the result no longer depends on 'Class' being the first column of df.
    kept = feature_frame.columns[ch2.get_support(indices=True)]
    good = ['Class'] + list(kept)
    print("Features selected using Chi2 feature selection: %s" % str(good))
    return df.loc[:, good]
def corr_matrix_of_important_words(term_doc_mat, word_list, scores, n_features_to_keep):
    """Correlate the most informative words with each other and the score.

    Keeps the ``n_features_to_keep`` columns of ``term_doc_mat`` that
    ``SelectKBest`` (default scoring function) ranks highest against
    ``scores``, labels them with the matching entries of ``word_list``, adds a
    ``'Score'`` column and returns the correlation matrix of that frame.
    """
    selector = SelectKBest(k=n_features_to_keep)
    selector.fit(term_doc_mat, scores)
    keep_idx = selector.get_support(indices=True)

    dense_counts = term_doc_mat[:, keep_idx].todense()
    frame = pd.DataFrame(dense_counts, columns=[word_list[i] for i in keep_idx])

    # NOTE(review): the Score column is read from the module-level `reviews`
    # frame rather than from the `scores` argument - confirm they are the
    # same values and share the frame's row index.
    frame['Score'] = reviews.Score
    return frame.corr()
def pred_pH(train, val, test, all_vars, loop):
    """Fit the pH model ensemble and hand its predictions to ``write_preds``.

    Each of ``train``, ``val`` and ``test`` gains the columns
    ``pH_for_prds``, ``pH_las_prds`` and ``pH_rdg_prds``; the frames are
    modified in place.

    Parameters
    ----------
    train, val, test : DataFrames holding the ``all_vars`` columns; ``train``
        must also hold the ``'pH'`` target.
    all_vars : list of predictor column names.
    loop : value appended to ``'pH_prds'`` to name the combined output.
    """
    data = (val, test, train)
    target = train['pH']

    # Variable selection: lasso-selected variables united with the 1200 best
    # univariate F-test variables.
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_mask = SelectKBest(score_func=f_regression, k=1200).fit(
        train[all_vars], target).get_support()
    chosen = [var for var, by_lasso, by_f
              in zip(all_vars, pH_lassoed_vars, univ_mask) if by_lasso | by_f]

    # Random forest. Columns are selected with frame[cols]; the original
    # .ix indexer no longer exists in pandas.
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train[chosen], target)
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset[chosen])

    # Lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], target)
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])

    # Ridge
    # NOTE(review): `normalize` is not accepted by recent scikit-learn
    # RidgeCV; left untouched because replacing it changes the fitted model.
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], target)
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # Combination. 'pH_for_prds' is listed twice, as in the original;
    # presumably to weight the forest double inside write_preds - confirm.
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def feature_select(word, instance_dic, feature_dic, thre_hold=0.01, num_feature=100):
    """Run two rounds of feature selection for the instances of ``word``.

    Round one drops features whose variance is not above ``thre_hold``;
    round two keeps the ``num_feature`` best survivors by chi-squared, and is
    skipped when no more than ``num_feature`` features survive round one.

    Parameters
    ----------
    word : key into ``instance_dic`` and ``feature_dic``.
    instance_dic : maps a word to its instances (each exposing ``senseid``).
    feature_dic : maps a word to its candidate feature words.
    thre_hold : variance threshold for round one.
    num_feature : number of features kept by round two.

    Returns
    -------
    ``(lowvr_index, chi2_index)``: the indices kept by round one (relative to
    the candidate feature words) and the indices kept by round two (relative
    to the round-one survivors). When round two is skipped the second list
    covers every survivor.
    """
    feature_words = feature_dic[word]
    feature_xs = []
    labels = []
    for instance in instance_dic[word]:
        feature_x_dic = feature_vector(instance, feature_words)
        # A distinct loop name: the original reused `word` here, clobbering
        # the parameter so the message below reported a feature word.
        feature_xs.append([feature_x_dic[feature_word]
                           for feature_word in feature_words])
        labels.append(' '.join(instance.senseid))

    # 1st round feature selection by removing low variance features
    variance_filter = VarianceThreshold(threshold=thre_hold).fit(feature_xs)
    lowvr_index = variance_filter.get_support(indices=True).tolist()

    # 2nd round feature selection using sklearn's SelectKBest()
    if num_feature < len(lowvr_index):
        survivors = variance_filter.transform(feature_xs)
        sel_chi2 = SelectKBest(chi2, k=num_feature).fit(survivors, labels)
        chi2_index = sel_chi2.get_support(indices=True).tolist()
        return lowvr_index, chi2_index

    print(str(word) + ": chi2 selection not executed due to low # of features")
    return lowvr_index, list(range(len(lowvr_index)))
def train_and_test(self, train_file, test_file):
    """Vectorize the training file and keep the top 20% of terms by chi-squared.

    Each usable row of ``train_file`` (rows with more than one field) supplies
    the label in field 0 and the text in field 1. The text is tokenized with
    ``zh_tokenize`` into a count matrix, ``get_bn_ratios`` is called on it,
    and the chi-squared selection is applied. ``test_file`` is not read by
    the active code.
    """
    rows = [row for row in read_text_src(train_file) if len(row) > 1]
    y_train = [row[0] for row in rows]
    texts = [row[1] for row in rows]

    vectorizer = CountVectorizer(tokenizer=zh_tokenize)  # ngram_range=(1,2)
    X_train = vectorizer.fit_transform(texts)
    print(type(X_train))

    vocabulary = vectorizer.get_feature_names()
    get_bn_ratios(X_train, y_train, len(vocabulary))

    # Keep one fifth of the vocabulary.
    n_terms = X_train.shape[1]
    ch2 = SelectKBest(chi2, k=int(n_terms * 0.2))
    X_train = ch2.fit_transform(X_train, y_train)
    feature_names = [vocabulary[col] for col in ch2.get_support(indices=True)]
def svm():
    """Train a one-vs-rest RBF SVM on the svmlight data and save predictions.

    Loads ``12trainset`` and ``12testdata``, prints the ANOVA F-scores, the
    shape of the 9-feature reduction and its support mask, fits the
    classifier, writes one predicted label per line to ``result.txt`` and
    prints the prediction shape and the time elapsed since the module-level
    ``start_time``.
    """
    # Load data. The original also called .todense() on both matrices and
    # discarded the result, allocating a full dense copy for nothing.
    x_train, y_train = load_svmlight_file("12trainset")
    x_test, y_test = load_svmlight_file("12testdata")

    # Univariate selection, reported for inspection.
    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    x_newtest = sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # Classifier.
    # NOTE(review): as in the original, the model is trained and evaluated on
    # the full feature matrices; x_new / x_newtest are never used. Confirm
    # whether the reduced matrices were meant to be used here.
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # Write result
    with open("result.txt", "w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st) + '\n')
    print(np.array(y_pred).shape)
    print(time.time() - start_time)
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    """Train ``clf`` on the ``k`` best TF-IDF features and predict the test texts.

    Parameters
    ----------
    clf : classifier exposing ``fit`` and ``predict``.
    chapter_contents_train, chapter_contents_test : iterables of raw text.
    y_train : labels for the training texts.
    k : number of features kept by the chi-squared selection.

    Returns
    -------
    ``(predictions for the test texts, names of the selected features, the
    fitted classifier)``.
    """
    # Convert the training text to TF-IDF features.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    tfidf_train = vectorizer.fit_transform(chapter_contents_train)

    # Use only the best k features according to chi-squared selection.
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(tfidf_train, y_train)

    # Determine the actual features used after best-k selection.
    feature_names = np.asarray(vectorizer.get_feature_names())
    selected_features = [name for name, kept
                         in zip(feature_names, ch2.get_support()) if kept]

    clf.fit(X_train, y_train)

    # The test text goes through the same fitted vectorizer and selector.
    X_test = ch2.transform(vectorizer.transform(chapter_contents_test))
    preds = clf.predict(X_test)
    return preds, selected_features, clf
def choseFeature(TrainX, TrainY, TestX):
    """Reduce train and test matrices to the 100 best chi-squared features.

    The selector is fitted on ``TrainX``/``TrainY`` only and then applied to
    both matrices. Returns ``(reduced TrainX, reduced TestX)``.
    """
    selector = SelectKBest(chi2, k=100).fit(TrainX, TrainY)
    reduced_train = selector.transform(TrainX)
    reduced_test = selector.transform(TestX)
    return (reduced_train, reduced_test)
def featureSelectionSelectKBest(data, Featurenumber):
    """Keep the ``Featurenumber`` best chi-squared features of ``data``.

    Column 1 of ``data`` is used as the label and columns 2 onward as the
    features. ``data`` is modified in place: the feature columns are replaced
    by their absolute values, and the selected features are then written into
    columns ``2 .. Featurenumber + 1``. The selected indices are printed and
    appended to ``History.txt``.

    Returns
    -------
    ``data[:, :Featurenumber + 2]`` - the leading columns plus the selected
    features.
    """
    size = Featurenumber
    label = data[:, 1]
    datanew = data[:, 2:]

    # In-place absolute value over the whole feature block. The original
    # assigned map(abs, row) row by row, which fails on Python 3 where map
    # returns an iterator.
    datanew[...] = abs(datanew)

    # One fit is enough: the original fitted the same data twice.
    selector = SelectKBest(chi2, k=size)
    X_new = selector.fit_transform(datanew, label)
    selected = selector.get_support(True)
    print(selected)

    data[:, 2:size + 2] = X_new

    history = ('Feature Selection: SelectKBest' + '\n' +
               'Selected Feature: ' + str(selected) + '\n')
    # The with-block closes the file even if the write fails.
    with open('History.txt', 'a') as fd:
        fd.write(history)
    return data[:, :size + 2]
def test_select_kbest_classif():
    """Check SelectKBest(f_classif) on a simple classification problem.

    The k-best selector must produce the same reduced matrix as the
    equivalent ``GenericUnivariateSelect`` and must mark exactly the first
    five of the twenty generated columns.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    kbest = SelectKBest(f_classif, k=5)
    reduced = kbest.fit(X, y).transform(X)

    generic = GenericUnivariateSelect(f_classif, mode="k_best", param=5)
    assert_array_equal(reduced, generic.fit(X, y).transform(X))

    expected_support = np.zeros(20)
    expected_support[:5] = 1
    assert_array_equal(kbest.get_support(), expected_support)
def getTfidfData(dataTrain, dataTest, dataHold):
    """Turn three text datasets into TF-IDF matrices with a shared vocabulary.

    The count vectorizer (vocabulary capped at twice the number of training
    documents) and the TF-IDF transformer are fitted on ``dataTrain`` only and
    then applied to ``dataTest`` and ``dataHold``. A chi-squared selector with
    ``k='all'`` is run over the result, and the first ten feature names are
    printed.

    Returns
    -------
    ``(train matrix, test matrix, hold-out matrix)``.
    """
    print(dataTrain.target_names)
    n_train_docs = len(dataTrain.target)
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=n_train_docs * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)

    def to_tfidf(bunch):
        # Apply the already-fitted vectorizer and transformer.
        return tfidf_transformer.transform(count_vect.transform(bunch.data))

    X_tfidf = tfidf_transformer.fit_transform(count_vect.fit_transform(dataTrain.data))
    print(X_tfidf.shape)
    Y_tfidf = to_tfidf(dataTest)
    print(Y_tfidf.shape)
    H_tfidf = to_tfidf(dataHold)

    print('feature selection using chi square test' + ' ' + str(n_train_docs))
    feature_names = count_vect.get_feature_names()

    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)

    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[col]
                         for col in ch2.get_support(indices=True)]
    if feature_names:
        feature_names = numpy.asarray(feature_names)

    print('important features')
    print(feature_names[:10])
    return X_tfidf, Y_tfidf, H_tfidf
def discriminatory_features():
    """Return descriptions of the features that best separate the two classes.

    Runs chi-squared ``SelectKBest`` for k = 1 .. 10 over the module-level
    song points of both classes and collects each feature index in the order
    it first appears. Every index is then turned into a description: a
    ``wsj_mapping`` entry with its ``wsj_to_description`` text, a vocabulary
    word, or the raw feature name. The result is sent back through
    ``jsonify``.
    """
    print('Finding most discriminatory features...')
    NUM_FEATURES = 10
    all_points = class1_song_points + class2_song_points
    true_labels = [0] * len(class1_song_points) + [1] * len(class2_song_points)

    # Growing k one step at a time yields the features in rank order.
    feature_indices = []
    for k in range(1, NUM_FEATURES + 1):
        selector = SelectKBest(chi2, k)
        selector.fit(all_points, true_labels)
        for index in selector.get_support(indices=True):
            if index not in feature_indices:
                feature_indices.append(index)

    def describe(feature):
        lowered = feature.lower()
        if lowered in wsj_mapping.keys():
            key = wsj_mapping[lowered]
            return key + ': ' + wsj_to_description[key]
        if feature in word_vocab:
            return 'The word: ' + feature
        return feature

    feature_descriptions = [describe(feature_names[index])
                            for index in feature_indices]
    return jsonify(features=feature_descriptions)
def test_boundary_case_ch2():
    """Boundary case: every chi2-based selector must keep only the first feature.

    First pins the chi-squared scores and p-values for the tiny dataset, then
    checks that SelectFdr, SelectKBest, SelectPercentile, SelectFpr and
    SelectFwe all produce the support mask ``[True, False]``.
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Same order as the individual checks they replace.
    selectors = [
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ]
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
def univariate_features_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Reduce ``x`` to its 10 best columns by chi-squared against ``y``.

    Prints the entries of ``self.features`` picked out by the selector's
    support mask and returns the reduced matrix.
    """
    fitted = SelectKBest(chi2, k=10).fit(x, y)
    print(self.features[fitted.get_support()])
    return fitted.transform(x)
def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
    """Build the frame and plot the ANOVA F-score of every feature.

    Stores the data splits and collaborators on the instance, scores the
    training features with ``SelectKBest(f_classif, k=3)``, draws a bar plot
    of score per feature and attaches the figure to an inner frame.

    Parameters
    ----------
    master : parent Tk widget.
    x_train, y_train, x_test, y_test : data splits kept on the instance.
    evaluator, console : collaborators kept on the instance.
    df : DataFrame whose column names, minus ``"NSP"``, label the features.
    """
    Tk.Frame.__init__(self, master)
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.evaluator = evaluator
    self.df = df
    self.console = console

    frame_train = Tk.Frame(self)
    frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)

    # A single figure: the original opened an extra 12x20 figure first that
    # was never drawn on, attached or closed.
    plt.figure(figsize=(12, 8))
    plt.subplot(111)

    selection = SelectKBest(f_classif, k=3)
    selection.fit(self.x_train, self.y_train)
    feature_scores = selection.scores_

    # "NSP" is dropped from the labels; presumably it is the target column
    # - confirm against how x_train was built.
    feature_names = df.columns.values
    feature_names = feature_names[feature_names != "NSP"]

    # Store as a DataFrame. list() materialises the pairs, because zip is a
    # one-shot iterator on Python 3.
    rec = list(zip(feature_scores, feature_names))
    data = pd.DataFrame(rec, columns=["Score", "Feature"])

    sns.barplot(x="Feature", y="Score", data=data)
    plt.xticks(rotation=-90)
    plt.title("Cardiotocography Feature Scores Ranking")

    self.attach_figure(plt.gcf(), frame_train)
def main(): inp = open('C:/Users/Abhi/workspace/MalwareClassification/ASMTRAINFULLDATA.csv','r') trainData = inp.readlines() trainData = trainData[2:] td=[] print len(trainData) for line in trainData: td.append(line.split(',')) out = [] #print len(td[2]) for i in range(len(td)): out.append(int(td[i][1])) td[i] = td[i][2:-1] for j in range(len(td[0])): td[i][j] = int(td[i][j]) '''for i in range(len(td)): nConstant = sum(td[i]) for j in range(len(td[0])): td[i][j] =td[i][j]/nConstant ''' #print td[0] #print len(td[0]) clf = SelectKBest(k=100) b = clf.fit_transform(td,out) #print b[0] j =clf.get_support(indices =True) #print len(b), len(b[0]) #print j '''k=0
def do_training():
    """Vectorize the corpus, select features, train LinearSVC and persist all three.

    Reads the module-level ``opts``, ``data_train_data``, ``data_test_data``
    and ``y_train``; publishes ``X_train``, ``X_test``, ``feature_names`` and
    ``ch2`` as module globals for ``benchmark``. The fitted vectorizer,
    feature selector and classifier are dumped to ``.pkl`` files with joblib.
    """
    global X_train, X_test, feature_names, ch2

    n_best = 20000

    print("Extracting features from the training data using a sparse vectorizer")
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    X_test = vectorizer.transform(data_test_data)
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # Mapping from integer feature name to original token string; a hashing
    # vectorizer has no such mapping.
    feature_names = None if opts.use_hashing else vectorizer.get_feature_names()

    # Chi-squared selection always runs (the option check is disabled).
    print("Extracting %d best features by a chi-squared test" % n_best)
    t0 = time()
    ch2 = SelectKBest(chi2, k=n_best)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[col]
                         for col in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())

    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(clf))

    for artifact, path in ((vectorizer, 'vectorizer.pkl'),
                           (ch2, 'feature_selector.pkl'),
                           (clf, 'linearsvc_classifier.pkl')):
        joblib.dump(artifact, path, compress=9)
def _calculate(measurements):
    """Build the feature list and the fitted models for every measurement key.

    ``measurements[key][0]`` is the list of feature dicts and
    ``measurements[key][1]`` the matching targets. For each key the dicts are
    vectorized, the 10 best features by chi-squared are kept, the data is
    split into learning and testing sets and passed to
    ``ModelService._createModels``.

    Returns
    -------
    dict mapping each key to ``{"models": ..., "features": [...]}``.
    """
    classifiers = dict()

    for key in measurements:
        samples, targets = measurements[key][0], measurements[key][1]

        # Register the entry before any work, as an empty model.
        entry = {"models": dict(), "features": []}
        classifiers[key] = entry

        vec = DictVectorizer()
        vectorized = vec.fit_transform(samples)

        # Restrict the vectorizer to the selected features only.
        selector = SelectKBest(chi2, k=10).fit(vectorized, targets)
        vec.restrict(selector.get_support())
        entry["features"] = vec.get_feature_names()

        # Re-vectorize with the restricted vocabulary, then split the data
        # into learning and testing sets.
        reduced = vec.transform(samples).toarray()
        X_train, X_test, y_train, y_test = train_test_split(reduced, targets)

        entry["models"] = ModelService._createModels(X_train, X_test, y_train, y_test)

    return classifiers
def get_k_best(x, y, k=300):
    '''
    Return the names of the k best features.

    Columns of ``x`` are ranked with the ANOVA F-test (``f_classif``)
    against ``y``.

    Parameters
    ----------
    x : DataFrame of candidate features (``.columns`` is used).
    y : class labels.
    k : number of features to keep. The original ignored this argument and
        always selected 300; the default keeps that behaviour for callers
        that do not pass ``k``.
    '''
    sk = SelectKBest(f_classif, k=k)
    sk.fit(x, y)
    return x.columns[sk.get_support()]
def predict_mulitple_subgraphs(X_original, y):
    """Train a random forest on the 80 best features and run it over ``apks/``.

    Features are ranked with ``f_classif``; the support mask is printed, and
    the fitted forest and selector are passed to ``subgraph_property``
    together with the files returned by ``get_filepaths('apks/')``.
    """
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    # Fit the selector once; the original fitted it and then immediately
    # refitted it on the same data through fit_transform.
    skb = SelectKBest(f_classif, k=80)
    X = skb.fit_transform(X_original, y)
    print(skb.get_support(indices=False))

    rforest.fit(X, y)

    f_test_new_released = 'apks/'
    files = get_filepaths(f_test_new_released)
    subgraph_property(files, rforest, skb)
# Probe-attack experiment: select the 55 best features with the ANOVA F-test
# on the training frame and train a linear SVM on them.
print(newdf_test['label'].value_counts())

# axis is passed by keyword: the positional form drop('label', 1) is not
# accepted by pandas 2.
X_Probe = newdf.drop('label', axis=1)
Y_Probe = newdf.label
X_Probe_test = newdf_test.drop('label', axis=1)
Y_Probe_test = newdf_test.label

colNames = list(X_Probe)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Silence divide/invalid floating-point warnings during the F-test.
np.seterr(divide='ignore', invalid='ignore');
fclass = SelectKBest(f_classif, k=55)  # iterate the k from 1 to 120. The max. accuracy comes at k=55 .
fclass.fit(X_Probe, Y_Probe)
true = fclass.get_support()
fclasscolindex_Probe = [i for i, x in enumerate(true) if x]
fclasscolname_Probe = [colNames[i] for i in fclasscolindex_Probe]
print('Features selected :', fclasscolname_Probe)

# The same selected columns are taken from both frames.
features = newdf[fclasscolname_Probe].astype(float)
features1 = newdf_test[fclasscolname_Probe].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0)
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))
def class33(X_train, X_test, y_train, y_test, i, X_1k, y_1k):
    ''' This function performs experiment 3.3

    For both the 1K and the full training set it writes, to ``a1_3.3.csv``,
    one row per k in (5, 10, 20, 30, 40, 50) holding k followed by the
    p-values of the k best features. It then trains classifier ``i`` on the
    best 5 features (as chosen on the full training set), once with the 1K
    data and once with the full training data, and appends both test
    accuracies followed by the written commentary.

    Parameters:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier (from task 3.1)
       X_1k: numPy array, just 1K rows of X_train (from task 3.2)
       y_1k: numPy array, just 1K rows of y_train (from task 3.2)
    '''
    clf_index = {1: SVC(kernel="linear", max_iter=1000),
                 2: SVC(kernel="rbf", gamma=2, max_iter=1000),
                 3: RandomForestClassifier(max_depth=5, n_estimators=10),
                 4: MLPClassifier(alpha=0.05),
                 5: AdaBoostClassifier()}
    clf = clf_index[i]

    b1_feat = []    # top-5 feature indices on the 1K training set
    best_feat = []  # top-5 feature indices on the full training set

    # The context manager guarantees the report is flushed and closed even
    # if a classifier raises.
    with open("a1_3.3.csv", "w+") as csv:
        datasets = [(X_1k, y_1k), (X_train, y_train)]
        for count, (X_data, y_data) in enumerate(datasets):
            for k in [5, 10, 20, 30, 40, 50]:
                # k is keyword-only in current scikit-learn releases.
                selector = SelectKBest(f_classif, k=k)
                selector.fit(X_data, y_data)
                mask = selector.get_support()

                if k == 5:
                    print("len", len(mask.tolist()))
                    print("indexes", mask.tolist())
                    chosen = np.flatnonzero(mask).tolist()
                    if count == 0:
                        b1_feat.extend(chosen)
                    else:
                        best_feat.extend(chosen)

                csv.write(str(k))
                for p in selector.pvalues_[mask]:
                    csv.write("," + str(p))
                csv.write("\n")

        print("best 5 features", best_feat, b1_feat)

        # Project every matrix onto the 5 best columns; fancy indexing adapts
        # to the actual number of rows instead of assuming 1000/8000/32000.
        top5 = best_feat[:5]
        X_1k_best = np.asarray(X_1k)[:, top5]
        X_test_best = np.asarray(X_test)[:, top5]
        X_train_best = np.asarray(X_train)[:, top5]

        # 1K training case.
        clf.fit(X_1k_best, y_1k)
        result = clf.predict(X_test_best)
        print("result len", len(result))
        csv.write(str(accuracy(confusion_matrix(y_test, result))) + ",")

        # Full training case: fit on the training data, not on the test data.
        clf.fit(X_train_best, y_train)
        result = clf.predict(X_test_best)
        csv.write(str(accuracy(confusion_matrix(y_test, result))) + "\n")

        csv.write("liwc_sexual, receptiviti_cautious, receptiviti_type_a are the common best features in both low and high "
                  "amounts of data. We can see that the cautious feature may be a good indicator since people in "
                  "different political groups may be more wary of some topics, hence are more cautious. Or it can "
                  "possibly be an indicator that conspiracy theorists correlate to certain parties.\n")
        csv.write("P values are generally higher given more data. This may be because there is less bias a set of data "
                  "can have towards particular features.\n")
        csv.write("liwc_sexual, receptiviti_cautious,receptiviti_type_a, number of commas, number of common nouns are the top 5 "
                  "features for the 32K training case. This seems to suggest that different parties tends to have "
                  "different speech habits since the features are so diverse. This makes sense since different parties "
                  "would attract a specific type of demographic, as such they may be more prone to use a similar tone and "
                  "sentence structure.")
# Ref: http://stackoverflow.com/questions/25792012/feature-selection-using-scikit-learn from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import f_classif # 'try' for second exploration on feature selection, with new features # Here SelectKBest will pick 4 features from the 5 features that were picked # from the previous analysis. features_list_try_2 = [ 'poi', 'exercised_stock_options', 'expenses', 'fraction_from_poi', 'fraction_to_poi', 'restricted_stock' ] data_try_2 = featureFormat(data_dict, features_list_try_2, sort_keys=True) labels_try_2, features_try_2 = targetFeatureSplit(data_try_2) selector = SelectKBest(f_classif, k=4) features_try_2_selected = selector.fit_transform(features_try_2, labels_try_2) # Ref: http://stackoverflow.com/questions/21471513/sklearn-selectkbest-which-variables-were-chosen features_selected_indices = selector.get_support( indices=True) + 1 # Since I will retrive them from # 'features_list_test_2', which # contains 'poi' as first entry print "Features selected by 'SelectKBest':\n", features_list_try_2[ features_selected_indices[0]] print features_list_try_2[features_selected_indices[1]] print features_list_try_2[features_selected_indices[2]] print features_list_try_2[features_selected_indices[3]] print # ******************************************************************* # Now I will explore Principal Component Analysis with a set of features that take # into account all of the features in the first set, plus the new created features. # The principal components will not be any of the original features, but a linear # combination of them. # I still want to see if I can gain any further insight with the results.
remove = [] for col in X.columns: if X[col].std() == 0: remove.append(col) X.drop(remove, axis=1, inplace=True) test.drop(remove, axis=1, inplace=True) from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import f_classif selectK = SelectKBest(f_classif, k=220) selectK.fit(X, y) X_sel = selectK.transform(X) features = X.columns[selectK.get_support()] print(features) sel_test = selectK.transform(test) X, y, X_submission = np.array(X_sel), np.array( y.astype(int)).ravel(), np.array(sel_test) if shuffle: idx = np.random.permutation(y.size) X = X[idx] y = y[idx] clfs = [ RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini', class_weight='balanced'),
print("训练数据集样本数目:%d, 测试数据集样本数目:%d" % (x_train.shape[0], x_test.shape[0]))

# Rescale every feature to [0, 1]; chi2 below requires non-negative inputs.
ss = MinMaxScaler()
x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)
print("原始数据各个特征属性的调整最小值:", ss.min_)
print("原始数据各个特征属性的缩放数据值:", ss.scale_)

# Keep the 3 features with the highest chi-squared score.
ch2 = SelectKBest(chi2, k=3)
x_train = ch2.fit_transform(x_train, y_train)
x_test = ch2.transform(x_test)
select_name_index = ch2.get_support(indices=True)
print("对类别判断影响最大的三个特征属性分布是:", ch2.get_support(indices=False))

# Project the 3 selected features onto 2 principal components.
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

model = DecisionTreeClassifier(criterion='entropy')
model.fit(x_train, y_train)
y_test_hat = model.predict(x_test)

# sklearn.externals.six was removed from scikit-learn; the standard library
# provides the same StringIO name.
from io import StringIO

# export_graphviz returns None when out_file is given, so its result is not
# bound to the file handle.
with open("iris.dot", 'w') as f:
    tree.export_graphviz(model, out_file=f)
trainX = X[0:1240, :] trainY = Y[0:1240] else: testX = X[subNo * trialNum:(subNo + 1) * trialNum, :] testY = Y[subNo * trialNum:(subNo + 1) * trialNum] trainX = np.vstack( (X[0:subNo * trialNum, :], X[(subNo + 1) * trialNum:subNum * trialNum, :])) trainY = np.concatenate( (Y[0:subNo * trialNum], Y[(subNo + 1) * trialNum:subNum * trialNum])) # three feature selection method... # method 1 sel_criteria1 = SelectKBest(chi2, k=num_k).fit(trainX, trainY) sel_indx1_mask = sel_criteria1.get_support() sel_indx1 = np.where(sel_indx1_mask == True) sel_indx1 = sel_indx1[0] trainX1 = trainX[:, sel_indx1] testX1 = testX[:, sel_indx1] # svm clf1 = svm.SVC(kernel='linear') clf1.fit(trainX1, trainY) predict_testY1 = clf1.predict(testX1) f1_scores[no_k, 0, subNo] = metrics.f1_score(testY, predict_testY1) acc_scores[no_k, 0, subNo] = metrics.accuracy_score(testY, predict_testY1) print('current sub performance:', acc_scores[no_k, 0, subNo], ' kbest:', num_k, ' selection_method:', 1) # method 2
def _plot_learning_curve_panels(axes, train_sizes, train_scores, test_scores,
                                fit_times, ylim=None):
    """Draw the three learning-curve panels (score, scalability, performance)."""
    # ======================== CITATION BELOW ==============================================#
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    axes[0].set_title('Control Curve')
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")
    # ======================== CITATION ABOVE ==============================================#


def ANN():
    """Select 45 digit pixels, plot MLP learning curves and grid-search the MLP.

    Returns:
        The fitted ``GridSearchCV`` object (the original returned nothing, so
        existing callers that ignore the result are unaffected).
    """
    digits = load_digits()
    # The class labels live in ``digits.target``; ``digits.data`` holds only
    # pixel intensities, so its last column is a feature, not a label.
    data_features = digits.data
    label = digits.target
    ylim = None

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = \
        train_test_split(data_features, label, test_size=0.3, random_state=0,
                         stratify=label)

    # NOTE(review): f_regression is a regression score; f_classif may be the
    # better fit for class labels -- confirm before changing.
    kb = SelectKBest(score_func=f_regression, k=45)
    kb.fit(digits_trainingX, digits_trainingY)
    # transform() keeps exactly the columns flagged by get_support().
    digits_trainingX = kb.transform(digits_trainingX)
    digits_testingX = kb.transform(digits_testingX)

    clf = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(45,), random_state=1,
                        solver='lbfgs')

    train_sizes = np.linspace(.1, 1.0, 5)
    cv = None
    n_jobs = None
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(clf, digits_trainingX, digits_trainingY, cv=cv,
                       n_jobs=n_jobs, train_sizes=train_sizes,
                       return_times=True)
    _, axes = plt.subplots(1, 3, figsize=(20, 5))
    _plot_learning_curve_panels(axes, train_sizes, train_scores, test_scores,
                                fit_times, ylim=ylim)

    optimizers = ['lbfgs', 'sgd', 'adam']
    max_iters = [100, 200, 500]
    batch_size = [5, 10, 100]
    seed = 52

    param_grid = dict(solver=optimizers, max_iter=max_iters,
                      batch_size=batch_size)
    # KFold rejects a random_state unless shuffle=True.
    grid = GridSearchCV(estimator=clf, param_grid=param_grid,
                        cv=KFold(shuffle=True, random_state=seed),
                        verbose=10, scoring='accuracy')
    grid_results = grid.fit(digits_trainingX, digits_trainingY)
    return grid_results
# First line of the input is the tab-separated header; every following row is
# <compound name> <feature values...> <target>.
headline = fin.readline()
for line in fin:
    row = line.strip().split('\t')
    # Empty feature cells are treated as 0.0.
    X.append([float(x) if x != '' else 0.0 for x in row[1:-1]])
    Y.append(float(row[-1]))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

ch2 = SelectKBest(mutual_info_regression, k=10)
ch2.fit(X_train, Y_train)
selected_features = ch2.get_support(indices=True)

row = headline.split('\t')
fout.write(data)
# Write the names of the *selected* columns: iterate the selected indices
# themselves rather than 0..k-1.
for feature in selected_features:
    fout.write('\t' + row[feature + 1])  # +1 because the compound name is the first column
fout.write('\n')
fout.close()
def _train_flow_classifier(dataset_path):
    """Train the entropy decision tree on the 5 best features of the dataset.

    The CSV is split 80/20 (random_state=0), the training part is standardised,
    the 5 best columns are chosen with SelectKBest's default score function,
    and the tree is fitted on those columns only.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.tree import DecisionTreeClassifier

    dataset = pd.read_csv(dataset_path)
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, 78].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)

    KBestSelector = SelectKBest(k=5).fit(X_train, y_train)
    X_train_FS = KBestSelector.transform(X_train)

    # Fit once, directly on the reduced feature set (a fit on the full
    # feature set would be overwritten by this one anyway).
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train_FS, y_train)
    return classifier


def main():
    """Train the flow classifier, then sniff packets and score TCP traffic.

    For every captured TCP/IPv4 packet the running statistics (backward
    inter-arrival mean/std/max, packet-length variance, idle-time std) are
    recomputed and handed to ``iot_hass`` together with the classifier.
    Runs until an exception ends the capture loop.
    """
    print("Validating Connected IoT Devices!")
    DM.dm_engine()
    DM.block_all_ips()

    classifier = _train_flow_classifier(
        '/home/pi/Software/IoT-HASS/CICIDS2017_Sample.csv')
    # NOTE(review): training features are standardised but the live
    # statistics passed to iot_hass() below are not -- confirm iot_hass
    # accounts for this.

    NumBwdPkts = 0
    NumIdleFlow = 0
    prev_fin_flag = 0
    flow_idle_start_time = datetime.datetime.now()
    flow_idle_end_time = datetime.datetime.now()
    LastTimeBwdPktSeen = datetime.datetime.now()
    one_microsecond = datetime.timedelta(microseconds=1)

    # NOTE(review): these lists grow for as long as the capture runs.
    AllTimesBetBwdPkts = []
    AllflowIdleTimes = []
    AllPacketLengths = []

    # The context manager closes the raw socket when the loop is left.
    with socket.socket(socket.AF_PACKET, socket.SOCK_RAW,
                       socket.ntohs(3)) as conn:
        while True:
            raw_data, addr = conn.recvfrom(65535)
            dest_mac, src_mac, eth_proto, data = unpack_ethernet_frame(raw_data)

            # get packet length or size
            packet_length = len(raw_data)
            AllPacketLengths.append(packet_length)

            # Only IPv4 ...
            if eth_proto != 8:
                continue
            (version, header_length, ttl, proto, src, target,
             data) = ipv4_packet_header(data)

            # ... carrying TCP is analysed.
            if proto != 6:
                continue
            (src_port, dest_port, sequence, acknowledgement, flag_urg,
             flag_ack, flag_psh, flag_rst, flag_syn, flag_fin,
             data) = unpack_tcp_segment(data)

            # NOTE(review): flags are compared against the strings '0'/'1'
            # while prev_fin_flag starts as the int 0 -- confirm the type
            # unpack_tcp_segment returns.
            if flag_fin == '1' and prev_fin_flag == '0':
                flow_idle_start_time = datetime.datetime.now()
                NumIdleFlow = NumIdleFlow + 1
            elif flag_fin == '0' and prev_fin_flag == '1':
                flow_idle_end_time = datetime.datetime.now()
            else:
                flow_idle_start_time = datetime.datetime.now()
                flow_idle_end_time = datetime.datetime.now()
            prev_fin_flag = flag_fin

            flowIdleTime = (flow_idle_end_time - flow_idle_start_time).microseconds
            AllflowIdleTimes.append(flowIdleTime)

            # Inter-arrival time: measure against the timestamp of the
            # PREVIOUS packet, then remember this packet's timestamp. The
            # first two packets contribute 0, as before.
            now = datetime.datetime.now()
            if NumBwdPkts > 1:
                # Whole elapsed time in microseconds, not just the
                # sub-second component.
                TimeBetBwdPkts = (now - LastTimeBwdPktSeen) // one_microsecond
            else:
                TimeBetBwdPkts = 0
            LastTimeBwdPktSeen = now
            NumBwdPkts = NumBwdPkts + 1
            AllTimesBetBwdPkts.append(TimeBetBwdPkts)

            # get statistics values for backwards packets
            if sum(AllTimesBetBwdPkts) == 0:
                mean_biat = 0
                max_biat = 0
                std_biat = 0
            else:
                mean_biat = stats.mean(AllTimesBetBwdPkts)
                max_biat = max(AllTimesBetBwdPkts)
                std_biat = stats.stdev(AllTimesBetBwdPkts)

            # stdev / variance need at least two samples.
            if sum(AllflowIdleTimes) > 0 and len(AllflowIdleTimes) > 1:
                std_idle = stats.stdev(AllflowIdleTimes)
            else:
                std_idle = 0

            if sum(AllPacketLengths) > 0 and len(AllPacketLengths) > 1:
                pkt_len_varience = stats.variance(AllPacketLengths)
            else:
                pkt_len_varience = 0

            # Invoking iot_hass() function
            iot_hass(mean_biat, std_biat, max_biat, pkt_len_varience,
                     std_idle, src, target, classifier, dest_mac, src_mac,
                     raw_data)
def accuracy(new_features, features_list):
    """Report SelectKBest scores and the test accuracy of four classifiers.

    Parameters:
        new_features: when equal to False the base feature list is used;
            otherwise the engineered features are appended via
            ``nova_feature`` and checked with ``testa_nova_feature``.
        features_list: feature names passed to ``monta_feature``.

    Every print below takes a single argument so the output is the same
    under Python 2 and Python 3.
    """
    # Feature list
    features_list = monta_feature(features_list)

    # Equality (not truthiness) is kept so only False/0 select this branch.
    if new_features == False:
        print(features_list)
    else:
        features_list = nova_feature(data_dict_woo, features_list)
        print("")
        print(features_list)
        print("Testando novos features adicionados:\n")
        testa_nova_feature(data_dict_woo, "DIETRICH JANET R",
                           features_list[-2:])

    # Extract the features and labels from the dataset
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)
    print("")

    # Min/Max scaling: fit on the training split only and apply it to the
    # data the models actually see (scaling the unsplit array had no effect).
    from sklearn import preprocessing
    scaler = preprocessing.MinMaxScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    skbest = SelectKBest(k=10)  # try best value to fit
    skbest.fit(features_train, labels_train)
    indices = skbest.get_support(True)

    bar = "=" * 10
    print("%s %s %s" % (bar, "skbest.scores_", bar))
    print(skbest.scores_)
    print("%s %s %s" % (bar, "=" * (len("skbest.scores_") - 2), bar))
    print("")
    print("%s %s %s" % (bar, "features - score", bar))
    for index in indices:
        # +1 skips the label, which is the first entry of features_list.
        print('features: %s score: %f' % (features_list[index + 1],
                                          skbest.scores_[index]))
    print("%s %s %s" % (bar, "=" * (len('features: %s score: %f') - 2), bar))
    print("")

    classifiers = [
        ("GaussianNB", GaussianNB()),
        ("KNeighborsClassifier",
         KNeighborsClassifier(algorithm='auto', leaf_size=20, n_neighbors=3,
                              weights='uniform')),
        ("SVC", SVC(kernel='linear', max_iter=10000, random_state=42)),
        ("AdaBoostClassifier",
         AdaBoostClassifier(
             DecisionTreeClassifier(max_depth=1, min_samples_leaf=2,
                                    class_weight='balanced'),
             n_estimators=50, learning_rate=.8)),
    ]
    for clf_name, clf in classifiers:
        clf.fit(features_train, labels_train)
        prediction = clf.predict(features_test)
        # accuracy_score expects (y_true, y_pred).
        print("Accuracy {} = {:.5f}".format(
            clf_name, accuracy_score(labels_test, prediction)))
def chi_square(X, y, numOfFeatures='all'):
    """Pick the ``numOfFeatures`` best columns of ``X`` by chi-squared score.

    The scores are computed on a min-max scaled copy of ``X``; the returned
    matrix is built from the original ``X`` passed in.

    Returns:
        (x_new, cols): the reduced matrix and the list of selected column
        indices.
    """
    scaled = MinMaxScaler().fit_transform(X)

    selector = SelectKBest(score_func=chi2, k=numOfFeatures)
    selector.fit(scaled, y)

    x_new = selector.transform(X)
    cols = selector.get_support(indices=True).tolist()
    return x_new, cols
# Materialise the train / test rows of this split from their index lists.
for ii in i_train:
    features_train.append(features[ii])
    labels_train.append(labels[ii])
for jj in i_test:
    features_test.append(features[jj])
    labels_test.append(labels[jj])

# fit selector to training set
selector = SelectKBest(k=k)
selector.fit(features_train, labels_train)

# Record each selected feature with ITS OWN score: scores_ has one entry per
# input feature, so it must be indexed by the selected position rather than
# zipped against the list of selected positions.
for i in selector.get_support(indices=True):
    j = selector.scores_[i]
    best_features.append(features_list[i])
    best_scores.append(j)

from collections import defaultdict

# Accumulate the scores per rank slot (0..k-1) across all recorded rounds.
d = defaultdict(int)
for idx, key in enumerate(best_scores):
    d[idx % k] += key
# Columns whose tree importance exceeds 0.5 %.
tree_best_cols = []
for col_name, importance in zip(X.columns[1:], clf.feature_importances_):
    if importance <= 0.005:
        continue
    print((col_name, importance))
    tree_best_cols.append(col_name)

X_tree_best = df[tree_best_cols]
X_tree_best.hist(bins=50, figsize=(10, 10))
plt.show()
X_tree_best.boxplot(figsize=(10, 8))
plt.show()

### Select k best feature importance
sel = SelectKBest(f_classif, k=40)
sel.fit(X, y)
kbest_col10 = X.columns[sel.get_support()]
X_kbest10 = df[kbest_col10]
X_kbest10.hist(bins=50, figsize=(10, 10))
plt.show()
X_kbest10.boxplot(figsize=(10, 8))
plt.show()

### PCA dim reduction
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
pca.explained_variance_ratio_.sum()

#################################################
### Train / test set split & performance measures
#################################################
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# SelectKBest(score_func=<function f_classif>, k=10)
# Select features according to the k highest scores.

breastData = load_breast_cancer()
X = breastData.data
y = breastData.target
print('original data features number = ', str(X.shape[1]), ' feature')

# Keep the five features with the highest chi-squared score.
FeatureSelectionMethod = SelectKBest(score_func=chi2, k=5)
FeatureSelectionMethod.fit(X, y)
new_X = FeatureSelectionMethod.transform(X)

print('new data features number = ', str(new_X.shape[1]), ' feature')
print('selected features are')
print(FeatureSelectionMethod.get_support())
print(newdf_test['label'].value_counts())

# Separate predictors and labels for the DoS training / test frames.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
X_DOS = newdf.drop('label', axis=1)
Y_DOS = newdf.label
X_DOS_test = newdf_test.drop('label', axis=1)
Y_DOS_test = newdf_test.label
colNames = list(X_DOS)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# iterate the k from 1 to 120. The max. accuracy comes at k=119 .
chi2f = SelectKBest(chi2, k=119)
chi2f.fit(X_DOS, Y_DOS)
true = chi2f.get_support()
chicolindex_DOS = [i for i, x in enumerate(true) if x]
chicolname_DOS = [colNames[i] for i in chicolindex_DOS]
print('Features selected :', chicolname_DOS)

# Keep only the selected columns, as floats, for both splits.
features = newdf[chicolname_DOS].astype(float)
features1 = newdf_test[chicolname_DOS].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))
#%% split training and testing dataset
df = df.drop(columns='amount')
msk = np.random.rand(len(df)) < 0.8  # random ~80/20 row mask

x_train = df[msk].astype('float64')
y_train = x_train['log_amount'].astype('float64')
x_train = x_train.drop(columns='log_amount').astype('float64')

x_test = df[~msk].astype('float64')
y_test = x_test['log_amount'].astype('float64')
x_test = x_test.drop(columns='log_amount').astype('float64')

#%% univariate ranking: keep the 10 features with the best F-regression score
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

train_matrix = np.asarray(x_train.values, dtype="float64")
train_target = np.asarray(y_train, dtype="float64")

clf = SelectKBest(f_regression, k=10)
X_new1 = clf.fit_transform(X=train_matrix, y=train_target)
mask1 = clf.get_support()
new_features = x_train.columns[mask1]
print(new_features)

#%% model-based ranking: random-forest importances, sorted ascending
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

clf = RandomForestRegressor(n_estimators=100,
                            max_features='sqrt',
                            n_jobs=-1,
                            random_state=0)
# fit() returns the estimator itself; the name X_new is kept for later code.
X_new = clf.fit(X=train_matrix, y=train_target)
importances = clf.feature_importances_
indices = np.argsort(importances)
plY = x_train.columns[indices]
plX = importances[indices]
print("\nCM per Test-set Random Forest: \n", cmRF_test, "\n")

# 3. apply a feature selection to highlight the most significant pixels
feature_importances_ = clfRF.feature_importances_
print(
    "Importanza assegnata alle features dall'algoritmo (indica quanto gli sono servite durante il suo allenamento): \n",
    feature_importances_, "\n")

# 4. project the dataset onto the 30 most significant features
select = SelectKBest(f_classif, k=30).fit(X, y)

# Boolean array: True where the feature was kept, False otherwise.
mask = select.get_support()
# NumPy copy of the mask, used below to index the column names and the
# importances.
np_mask = np.array(mask)
# All dataset columns except the first one, which is "label".
np_columns = np.array(data.columns[1:])

# Names of the most significant features ...
most_significative_features = np_columns[np_mask]
# ... and the forest importances of those same features.
most_significative_features_importances = feature_importances_[np_mask]
if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] print("done in %fs" % (time() - t0)) print() if feature_names: feature_names = np.asarray(feature_names) def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" return s if len(s) <= 80 else s[:77] + "..." # ############################################################################# # Benchmark classifiers def benchmark(clf):
feat = features.split('","') feat = feat[1:-1] desiredno = len(feat) features_test = test[feat] labels_train = train[label] features_train = train[feat] labels_test = test[label] # feature selection select = SelectKBest(f_regression, k="all").fit(features_train, labels_train) ranking = select cols = select.get_support(indices=True) rank = select.scores_ mask = select.get_support() new_features = features_train.columns[mask] features_train = features_train[new_features] features_test = features_test[new_features] # ridge regression t0 = time() from sklearn.linear_model import Ridge from sklearn.model_selection import GridSearchCV grid_param = {'alpha': [10, 4, 1.0, 0.5, 0.3, 0.08, 0.02]} rdg_reg = Ridge() gd_sr = GridSearchCV( estimator=rdg_reg,
def select_k_best(k, data, labels):
    """Reduce ``data`` to its ``k`` best features.

    Uses SelectKBest with its default score function.

    Returns:
        (reduced_data, labels, support_mask): the reduced matrix, the labels
        unchanged, and the boolean mask of the kept columns.
    """
    selector = SelectKBest(k=k)
    reduced = selector.fit_transform(data, labels)
    support_mask = selector.get_support()
    return reduced, labels, support_mask
def select_kbest_freg(X_train, y_train, k):
    """Return the names of the ``k`` columns with the best F-regression score.

    ``X_train`` is indexed with ``.loc`` and ``.columns``, so it must be a
    pandas DataFrame.
    """
    selector = SelectKBest(f_regression, k=k)
    selector.fit(X_train, y_train)
    keep_mask = selector.get_support()
    selected_frame = X_train.loc[:, keep_mask]
    return selected_frame.columns.tolist()
def k_significant_feat(
        feat, y_class, k=5, score_func='f_classif', scale=None,
        feat_names=None, plot=True, k_to_plot=None, close_after_plotting=False,
        saveto=None, figsize=None, title=None, xlabel=None
        ):
    """
    Finds the k most significant features in the feature matrix, based on
    how well they separate the data in groups defined in y_class. It uses
    univariate statistical tests (the type of test is specified in the
    variable score_func).

    param:
        feat: array-like, shape=(n_samples, n_features)
            The feature matrix. Columns with zero standard deviation are
            dropped before scoring.
        y_class: array-like, shape=(n_samples)
            Vector with the class of each sample
        k: integer or 'all'
            Number of features to select
        score_func: str or function, optional
            If string 'f_classif', 'chi2', 'mutual_info_classif' then the
            function f_classif, chi2 or mutual_info_classif
            from sklearn.feature_selection will be used.
            Otherwise, the user needs to input a function that takes two
            arrays X and y, and returns a pair of arrays (scores, pvalues)
            or a single array with scores.
            Default is 'f_classif'.
        scale: None, str or function, optional
            If a string, it is passed to scalingClass(scaling=scale) and the
            features are scaled with its fit_transform. Otherwise the user
            can input a function that scales features.
            Default is None (no scaling).
        feat_names: list shape=(n_features)
            The names of the features, when feat is an array and not a
            dataframe (will be used for plotting)
        plot: boolean
            If True, the boxplots of the chosen features will be plotted
        k_to_plot: integer, optional
            Number of top features to plot; defaults to k.
        close_after_plotting, saveto, figsize, xlabel:
            Forwarded to plot_feature_boxplots.
        title:
            Currently unused.

    return:
        top_features: list
            Names of the k best features, best first
        scores or (scores, pvalues):
            Scores of those features; paired with their p-values when the
            score function provides p-values
        support: array of booleans
            True for the selected features, False for the rest

    raises:
        ValueError: if k is a string other than 'all', or score_func is an
            unrecognised string.
    """
    from sklearn.feature_selection import \
        SelectKBest, chi2, f_classif, mutual_info_classif

    if isinstance(feat, np.ndarray):
        feat = pd.DataFrame(feat, columns=feat_names)
    feat = feat.loc[:, feat.std() != 0]

    if isinstance(k, str):
        if k != 'all':
            raise ValueError('Data type for k not recognized.')
        k = feat.shape[1]

    # Resolved only after k is numeric, so k='all' never leaks into a slice.
    if plot and k_to_plot is None:
        k_to_plot = k

    # Find most significant features
    if isinstance(score_func, str):
        known_score_funcs = {
            'f_classif': f_classif,
            'chi2': chi2,
            'mutual_info_classif': mutual_info_classif,
        }
        if score_func not in known_score_funcs:
            raise ValueError(
                'score_func not recognized: {!r}'.format(score_func))
        score_func = known_score_funcs[score_func]

    if scale is None:
        feat_scaled = feat
    elif isinstance(scale, str):
        scaler = scalingClass(scaling=scale)
        feat_scaled = scaler.fit_transform(feat)
    else:
        feat_scaled = scale(feat)

    skb = SelectKBest(score_func=score_func, k=k)
    skb.fit(feat_scaled, y_class)

    support = skb.get_support()

    # Rank features by score, best first, ignoring NaN scores (np.sort and
    # np.argsort both place NaNs last, so the mask lines up with the ids).
    sorted_scores = np.sort(skb.scores_)
    ids_sorted_scores = np.argsort(skb.scores_)
    top_ft_ids = np.flip(ids_sorted_scores[~np.isnan(sorted_scores)])[:k]

    scores = skb.scores_[top_ft_ids]
    # SelectKBest always defines pvalues_, but sets it to None when the score
    # function returns scores only (e.g. mutual_info_classif).
    all_pvalues = getattr(skb, 'pvalues_', None)
    pvalues = None if all_pvalues is None else all_pvalues[top_ft_ids]

    # Plot a boxplot for each feature, showing its distribution in each class
    if plot:
        plot_feature_boxplots(
            feat.iloc[:, top_ft_ids[:k_to_plot]], y_class, scores,
            pvalues=pvalues, figsize=figsize, saveto=saveto, xlabel=xlabel,
            close_after_plotting=close_after_plotting)

    top_features = feat.columns[top_ft_ids].to_list()
    if pvalues is not None:
        return top_features, (scores, pvalues), support
    return top_features, scores, support
print("Accuracy for Knn decision Tree is ", accuracy) pp.pprint( classification_report(y_true=labels_test, y_pred=pred, target_names=target_names)) #### using Select K Best algo to selection the best 7 features after ty. no_of_selected_feat = 10 from sklearn.feature_selection import SelectKBest, f_classif kbest = SelectKBest(f_classif, k=no_of_selected_feat) kbest.fit_transform(features, labels) features_selected = [ features_list[i + 1] for i in kbest.get_support(indices=True) ] features_score = {} for a, b in zip(features_selected, kbest.scores_): features_score[a] = b #### publishing the top 10 features from best score to fewer score print('\n') print("SelectKBest chose " + str(no_of_selected_feat) + " features") pp.pprint(sorted(features_score.items(), key=itemgetter(1), reverse=True)) print("\n") ### Adding poi feature at the begining fo the features list. if 'poi' in features_selected:
cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-100:]].columns.tolist() # feature selection? 0 for not select, 1 for select cor_support = [True if i in cor_feature else False for i in feature_name] return cor_support, cor_feature cor_support, cor_feature = cor_selector(X, y) print(str(len(cor_feature)), 'selected features') from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 from sklearn.preprocessing import MinMaxScaler X_norm = MinMaxScaler().fit_transform(X) chi_selector = SelectKBest(chi2, k=100) chi_selector.fit(X_norm, y) chi_support = chi_selector.get_support() chi_feature = X.loc[:, chi_support].columns.tolist() print(str(len(chi_feature)), 'selected features') from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5) rfe_selector.fit(X_norm, y) rfe_support = rfe_selector.get_support() rfe_feature = X.loc[:, rfe_support].columns.tolist() print(str(len(rfe_feature)), 'selected features') from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import LogisticRegression
# Build the remaining feature groups from the linear features:
# element-wise exp, element-wise cos, and a constant column of ones.
transformer_exp = FunctionTransformer(np.exp)
all_x_exp = transformer_exp.transform(all_x_linear)

transformer_cos = FunctionTransformer(np.cos)
all_x_cos = transformer_cos.transform(all_x_linear)

all_x_ones = np.ones((len(all_y), 1))

# Stack all feature groups side by side (column-wise).
all_x = np.concatenate(
    (all_x_linear, all_x_quadr, all_x_exp, all_x_cos, all_x_ones), axis=1)

## feature selection ##
# NOTE(review): no score_func is given, so SelectKBest falls back to its
# default (f_classif, a classification score). The section below is headed
# "ridge regression", so f_regression may be what was intended -- confirm.
sel = SelectKBest(k=18)
all_x_sel = sel.fit_transform(all_x, all_y)
sel_result = sel.get_support(indices=False)  # boolean mask, one per column

# Derive the column indices from the matrix itself instead of a hard-coded
# 21, so the mask and the index vector always have matching lengths.
all_index = np.arange(all_x.shape[1])
sel_in_index = all_index[sel_result]    # columns that were kept
sel_out_index = all_index[~sel_result]  # columns that were dropped
print(sel_in_index)
print(sel_out_index)

## ridge regression ##
# Candidate values once considered: alpha_set = [1e-2, 1e-1, 1e0, 1e1, 1e2]
rmse_avg_all = []
alpha = 10

# K folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rmse_total = 0
models = []
rmses = []
# Unless disabled on the command line, set early_stopping_rounds to 200 in
# the fit parameters of all three models.
if not a.disable_early_stop:
    for fit_params in (xgb_fit_params, cb_fit_params, lgb_fit_params):
        fit_params['early_stopping_rounds'] = 200

# Optional feature selection driven by XGBoost feature importances.
if a.select_k_best is not None:
    print(f"Selecting {a.select_k_best} best features")

    def score_features(X, y, estimator=None):
        """Fit a fresh clone of *estimator* and return its importances."""
        fitted = clone(estimator).fit(X, y)
        return fitted.feature_importances_

    xgb_regressor = xgb.XGBRegressor(**xgb_params)

    def xgb_importance(X, y):
        # Adapter giving SelectKBest the (X, y) -> scores callable it expects.
        return score_features(X, y, estimator=xgb_regressor)

    # Fit only on the first len(Y) rows of X_all, i.e. the rows that have a
    # target value in Y.
    selector = SelectKBest(score_func=xgb_importance, k=a.select_k_best)
    fs = selector.fit(X_all[:Y.shape[0]], Y)
    kept_columns = fs.get_support(indices=True)
    X_all = X_all.iloc[:, kept_columns]
    # NOTE(review): the original indentation is lost; this print is assumed
    # to belong to the selection branch -- confirm.
    print(X_all.shape)

print('start training...')

folds = a.folds
bootstrap_runs = a.bootstrap_runs

fold_scores = []
fold_predictions = []
oof_fold_predictions = []

# Number of entries of leak_Y that are non-zero.
leak_positions = np.where(leak_Y != 0)[0]
n_leak = leak_positions.shape[0]

to_train_idx = np.arange(len(Y))
print(to_train_idx.shape, n_leak)
def _show_scores(title, matrix, precision, recall, f1):
    """Print a titled confusion matrix followed by P / R / F1."""
    print(title)
    print(matrix)
    print("P = " + str(precision) + "\nR = " + str(recall) + "\nF1 = " +
          str(f1) + "\n-------------")


# Evaluate the already-fitted classifier on the full feature set.
y_score = clf.predict(x_test)
M = confusion_matrix(y_test, y_score)
P, R, F1 = computeP_R_F1(M)
_show_scores("RandomForestClassifier : ", M, P, R, F1)

# Model-based selection, reusing the fitted classifier (prefit=True).
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(x_train)
print(X_new.shape)

# Univariate chi-squared selection of the two best features.
selector = SelectKBest(chi2, k=2)
X_new = selector.fit_transform(x_train, y_train)
idxs_selected = selector.get_support(indices=True)
print(idxs_selected)

# NOTE(review): neither selection result is used below; the first three
# columns are taken unconditionally -- confirm this is intended.
x_train_new = x_train[:, 0:3]
x_test_new = x_test[:, 0:3]

# Refit and re-evaluate on the reduced feature set.
clf.fit(x_train_new, y_train)
y_score = clf.predict(x_test_new)
M = confusion_matrix(y_test, y_score)
P, R, F1 = computeP_R_F1(M)
_show_scores("RandomForestClassifier FS : ", M, P, R, F1)

from sklearn.ensemble import ExtraTreesClassifier
# Training accuracy of the SVM for each candidate number of selected features.
for i in k:
    select = SelectKBest(f_classif, k=i)
    x_train_new = select.fit_transform(x_train, y_train)
    svm.fit(x_train_new, y_train)
    train_accuracy.append(svm.score(x_train_new, y_train))

plt.plot(k, train_accuracy, color='red', label='Train')
plt.xlabel('k values')
plt.ylabel('Train accuracy')
plt.legend()
plt.show()

# Keep the five best features, chosen on the training split only.
select_top = SelectKBest(f_classif, k=5)
x_train_new = select_top.fit_transform(x_train, y_train)
# Apply the train-fitted selector to the test split. Refitting on
# (x_test, y_test) would leak the test labels into feature selection and
# could pick different columns from the ones the model is trained on.
x_test_new = select_top.transform(x_test)

top_mask = select_top.get_support()
print('Top train features', x_train.columns.values[top_mask])
print('Top test features', x_test.columns.values[top_mask])

# Hyper-parameter grid for the SVM.
c = [1.0, 0.25, 0.5, 0.75]
kernels = ['linear', 'rbf']
gammas = ['auto', 0.01, 0.001, 1]  # 'auto' corresponds to 1 / n_features

svm = SVC()
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=dict(kernel=kernels, C=c, gamma=gammas),
    cv=5,
)
grid_svm.fit(x_train_new, y_train)
print('The best hyperparamters: ', grid_svm.best_estimator_)

# Final model with explicitly chosen hyper-parameters.
svc_model = SVC(C=1, gamma='auto', kernel='linear')
svc_model.fit(x_train_new, y_train)
# With hashing enabled no feature names are recorded; otherwise they are
# taken from the vectorizer.
feature_names = None if opts.use_hashing else vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    # Fit the selector on the training split, then reuse it for the test split.
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # Narrow the names down to the columns that survived selection.
        kept = ch2.get_support(indices=True)
        feature_names = [feature_names[idx] for idx in kept]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Shorten *s* to at most 80 characters, marking a cut with "..."."""
    if len(s) > 80:
        return s[:77] + "..."
    return s


# #############################################################################
# Benchmark classifiers
# Split off the target column; everything else is a candidate feature.
target = data['PSS_Stress']
# `columns=` instead of a positional axis argument: pandas 2.0 no longer
# accepts `axis` positionally in DataFrame.drop.
data = data.drop(columns='PSS_Stress')

# Missing Data Filtering
print(data.isnull().any(axis=1).sum())  # number of records with at least one NaN
data = data.fillna(data.median())  # replace NaN with the column median
# data = data.fillna(data.mean())  # replace NaN with the column mean
# data = data.dropna()  # discard records that contain NaN

# Feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(data, target)
cols = selector.get_support(indices=True)
cols_names = list(data.columns[cols])
# Print one line per selected column, with a star bar that shrinks by one
# star per line.
# NOTE(review): `cols` looks to be in column order rather than score order,
# so the bar length may not reflect ranking -- confirm.
for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)
data = data[cols_names]

# Compare results between MinMaxScaler and RobustScaler:
scaler = preprocessing.RobustScaler()
values_standardized = scaler.fit_transform(data.values)
data = pd.DataFrame(values_standardized, columns=data.columns)

clf_model = SVC()