def _calculate(measurements):
    """Build a set of fitted models for every key in *measurements*.

    For each key, ``measurements[key][0]`` is vectorized with a
    ``DictVectorizer`` and ``measurements[key][1]`` is used as the target.
    The 10 best features by chi-squared score are kept, the reduced data is
    split into train/test parts and handed to ``ModelService._createModels``.

    Returns a dict mapping each key to
    ``{"models": <result of _createModels>, "features": <kept feature names>}``.
    """
    classifiers = {}

    for key in measurements:
        samples = measurements[key][0]
        targets = measurements[key][1]

        classifiers[key] = {"models": {}, "features": []}

        # Vectorize everything first so the selector can score each feature.
        vectorizer = DictVectorizer()
        full_matrix = vectorizer.fit_transform(samples)

        # Narrow the vectorizer down to the features the selector kept.
        selector = SelectKBest(chi2, k=10)
        selector.fit(full_matrix, targets)
        vectorizer.restrict(selector.get_support())
        classifiers[key]["features"] = vectorizer.get_feature_names()

        # Re-transform with the restricted vectorizer to get the reduced data.
        reduced = vectorizer.transform(samples).toarray()

        X_train, X_test, y_train, y_test = train_test_split(reduced, targets)
        classifiers[key]["models"] = ModelService._createModels(
            X_train, X_test, y_train, y_test)

    return classifiers
def test_feature_union_weights():
    """Check that FeatureUnion scales transformer outputs by their weights."""
    iris = load_iris()
    X, y = iris.data, iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    # Path 1: fit() followed by transform().
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    union.fit(X, y)
    X_transformed = union.transform(X)

    # Path 2: a single fit_transform() call.
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    X_fit_transformed = union.fit_transform(X, y)

    # Path 3: a transformer that has no fit_transform of its own.
    union = FeatureUnion([("mock", TransfT()), ("pca", pca),
                          ("select", select)],
                         transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = union.fit_transform(X, y)

    # Both weighted paths must match the transformers applied on their own:
    # all but the last column come from pca (times 10), the last from select.
    for actual in (X_transformed, X_fit_transformed):
        assert_array_almost_equal(actual[:, :-1], 10 * pca.fit_transform(X))
        assert_array_equal(actual[:, -1], select.fit_transform(X, y).ravel())

    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def _SelectKBest(self, X, y):
    """Reduce X to the ``self.k_features`` best features by ANOVA F-test.

    Besides transforming X, this writes the mask of selected features, mapped
    back into the volume loaded from ``self.mask_non_brain``, to
    ``/tmp/best.nii.gz`` and stores the fitted selector on
    ``self.feature_reduction_method``.

    Returns the reduced X.
    """
    print('Selecting K Best from whole image')
    from sklearn.feature_selection import SelectKBest, f_classif

    # Univariate feature selection based on the F-test (ANOVA); the number of
    # features kept is self.k_features.
    feature_selection = SelectKBest(f_classif, k=self.k_features)
    feature_selection.fit(X, y)

    # The fitted selector already holds the F scores, so the F-test does not
    # need to be run a second time.
    scores = feature_selection.scores_
    mask_k_best = np.zeros(scores.shape, dtype=bool)
    # Stable sort so ties are resolved the same way on every run.
    mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]] = 1

    import nibabel
    mask_brain_img = nibabel.load(self.mask_non_brain).get_data()
    mask_brain = mask_brain_img.flatten().astype(bool)
    # Scatter the per-feature mask into the positions flagged by the mask
    # volume. Assumes the number of True voxels equals the number of features
    # in X -- TODO confirm against the caller.
    roi = np.zeros(mask_brain.flatten().shape)
    roi[mask_brain] = mask_k_best
    roi = roi.reshape(mask_brain_img.shape)
    # NOTE(review): an identity affine is written, not the source image's.
    img = nibabel.Nifti1Image(roi, np.eye(4))
    img.to_filename('/tmp/best.nii.gz')

    print('SelectKBest data reduction from: %s' % str(X.shape))
    X = feature_selection.transform(X)
    print('SelectKBest data reduction to: %s' % str(X.shape))
    self.feature_reduction_method = feature_selection
    return X
def preprocess(article_file, lable_file, k):
    """Load pickled texts and labels and turn them into a reduced TF-IDF matrix.

    :param article_file: path to a pickle holding the raw texts.
    :param lable_file: path to a pickle holding one label per text.
    :param k: number of features kept by ``SelectKBest``.
    :return: tuple ``(features_train_transformed, lables, vectorizer,
        selector, le, features)`` -- the dense selected TF-IDF matrix, the
        integer-encoded labels, the fitted vectorizer, selector and label
        encoder, and the raw texts as a numpy array.
    """
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    # Pickles are opened in binary mode and closed deterministically.
    with open(article_file, "rb") as article_handle:
        features = np.array(pickle.load(article_handle))
    with open(lable_file, "rb") as lable_handle:
        lables = pickle.load(lable_handle)

    # Map arbitrary (hashable, comparable) labels to integer codes.
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    # Text vectorization: strings -> TF-IDF weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # Keep the k best features (SelectKBest's default score function).
    selector = SelectKBest(k=k)
    selector.fit(features_train_transformed, lables)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
def test_feature_union():
    """Sanity-check FeatureUnion on dense and sparse centered iris data."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    n_samples = X.shape[0]
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    X_transformed = union.transform(X)
    assert_equal(X_transformed.shape, (n_samples, 3))

    # The union output must be the two transformers' outputs side by side.
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must give the same result as dense input.
    union = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = union.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Nested parameters can be set through the union.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (n_samples, 4))

    # A transformer without its own fit_transform is supported as well.
    union = FeatureUnion([("mock", TransfT()), ("svd", svd),
                          ("select", select)])
    X_transformed = union.fit_transform(X, y)
    assert_equal(X_transformed.shape, (n_samples, 8))
def classification_level_RandForest_pipeline(classifications_DF):
    """Grid-search a SelectKBest + RandomForestRegressor pipeline.

    Columns 3..88 of *classifications_DF* are the features and column 2
    (read back as ``'session_length'``) is the regression target. The best
    estimator and its test-set mean squared error are printed, and the
    predictions are written to ``randomForestReg_pred_true.csv``.
    """
    X = classifications_DF.iloc[:, 3:89]
    # Target column, kept as a one-column DataFrame of floats.
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)

    # Scale the features before the univariate selection.
    X_scaled = preprocessing.scale(X)
    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = \
        train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0)

    # The selector and regressor are only templates here: GridSearchCV clones
    # and fits them itself, so no separate pre-fit is needed.
    selectKbest = SelectKBest(k=1, score_func=f_regression)
    randomForestReg = RandomForestRegressor(n_estimators=1, criterion='mse')
    pipeline = Pipeline([('selectKbest', selectKbest),
                         ('randomForestReg', randomForestReg)])

    # Grid over the number of kept features and the forest parameters.
    tuned_params = dict(
        selectKbest__k=[5, 10, 20, 30, 40, 50, 80],
        randomForestReg__n_estimators=[1, 2, 4, 8, 16, 32, 64],
        randomForestReg__min_samples_split=[2, 3, 5, 10, 20])
    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3, verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true = y_actual_test['session_length'].values
    y_pred = grid_search.best_estimator_.predict(X_scaled_test)
    # Single-argument print() produces the same output on Python 2 and 3.
    print("Mean squared error:" + str(mean_squared_error(y_true, y_pred)))
    # NOTE(review): the second positional argument of DataFrame is the index,
    # so predictions end up as the CSV index and true values as column 0.
    # Kept as-is because consumers of the file may rely on that layout.
    pd.DataFrame(y_true, y_pred).to_csv("randomForestReg_pred_true.csv")
def test_feature_stacker():
    """Sanity-check FeatureUnion stacking of PCA and univariate selection."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    n_samples = X.shape[0]
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)

    stacker = FeatureUnion([("pca", pca), ("select", select)])
    stacker.fit(X, y)
    X_transformed = stacker.transform(X)
    assert_equal(X_transformed.shape, (n_samples, 3))

    # Stacked output == the individual transformer outputs side by side.
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must give the same result as dense input.
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = stacker.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Nested parameters can be set through the union.
    stacker.set_params(select__k=2)
    assert_equal(stacker.fit_transform(X, y).shape, (n_samples, 4))
def features_importance(features_train, labels_train, feature_list):
    """Return the univariate score and p-value of every feature.

    The result is a DataFrame with columns ``'Scores'`` and ``'Pvalues'``,
    indexed by ``feature_list[1:]`` (the first entry of *feature_list* is
    skipped -- presumably the label name; verify against the caller).
    """
    # Only scores_/pvalues_ are read, which are computed for every feature
    # whatever k is. k="all" avoids the default k=10 rejecting inputs that
    # have fewer than 10 features.
    scorer = SelectKBest(k="all")
    scorer.fit(features_train, labels_train)
    return pd.DataFrame(
        {'Scores': scorer.scores_, 'Pvalues': scorer.pvalues_},
        index=feature_list[1:])
def PerformFeatureSelection(adult_train, features, Output):
    """Plot -log10(p-value) of the ANOVA F-test for each feature column.

    *features* is the list of column names to score against the *Output*
    column of *adult_train*. A bar chart is shown; nothing is returned.
    """
    selector = SelectKBest(f_classif, k=5)
    selector.fit(adult_train[features], adult_train[Output])

    # Smaller p-value -> taller bar.
    scores = -numpy.log10(selector.pvalues_)

    positions = range(len(features))
    plt.bar(positions, scores)
    plt.xticks(positions, features, rotation='vertical')
    plt.show()
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    """Fit *clf* on chi-squared-selected TF-IDF features and predict the test texts.

    Returns ``(preds, selected_features, clf)``: the predictions for
    *chapter_contents_test*, the vocabulary terms that survived the best-k
    selection, and the fitted classifier.
    """
    # Text -> TF-IDF features.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    tfidf_train = vectorizer.fit_transform(chapter_contents_train)

    # Keep only the k best features by chi-squared score.
    ch2 = SelectKBest(chi2, k=k)
    reduced_train = ch2.fit_transform(tfidf_train, y_train)

    # Recover which vocabulary terms the selector kept.
    vocabulary = np.asarray(vectorizer.get_feature_names())
    selected_features = [term
                         for term, kept in zip(vocabulary, ch2.get_support())
                         if kept]

    clf.fit(reduced_train, y_train)

    # The test texts go through the vectorizer and selector fitted above.
    reduced_test = ch2.transform(vectorizer.transform(chapter_contents_test))
    preds = clf.predict(reduced_test)
    return preds, selected_features, clf
def pred_pH(train, val, test, all_vars, loop):
    """Fit random-forest, lasso and ridge pH models and write their predictions.

    Prediction columns ``pH_for_prds``, ``pH_las_prds`` and ``pH_rdg_prds``
    are added in place to *train*, *val* and *test*. The combination is then
    delegated to ``write_preds`` under the name ``'pH_prds' + str(loop)``.
    """
    data = (val, test, train)

    # Variable selection: union of the lasso-selected variables and the
    # top-1200 univariate (f_regression) variables.
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if pH_lassoed_vars[i] | pvals[i]]

    # Random forest on the chosen variables. `.loc` replaces the `.ix`
    # indexer, which no longer exists in current pandas.
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])

    # Lasso on all variables.
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])

    # Ridge on all variables.
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # Combination; the forest predictions are listed twice on purpose in the
    # original (presumably to weight them double -- confirm in write_preds).
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def svm():
    """Train a one-vs-rest SVC on the svmlight files and write predictions.

    Reads ``12trainset`` and ``12testdata``, prints univariate feature
    selection diagnostics, fits the classifier, writes one prediction per
    line to ``result.txt`` and prints the time elapsed since the module-level
    ``start_time``.
    """
    # Load data. The original called .todense() and threw the result away,
    # which only allocated a dense copy; those calls are dropped.
    x_train, y_train = load_svmlight_file("12trainset")
    x_test, y_test = load_svmlight_file("12testdata")

    # Keyword arguments: newer scikit-learn releases no longer accept `k`
    # and `n_jobs` positionally.
    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    # Kept so a feature-count mismatch between the two files still surfaces.
    sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # Classifier.
    # NOTE(review): it is trained on the full x_train, not on the selected
    # features x_new -- the selection above is diagnostic only. Confirm this
    # is intended.
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # Write result, one prediction per line.
    with open("result.txt", "w") as fw:
        fw.writelines(str(st) + '\n' for st in y_pred.tolist())
    print(np.array(y_pred).shape)
    print(time.time() - start_time)
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    """Repeated random hold-out evaluation over several feature blocks.

    *sparseArrayRowNorm* is a sequence of matrices sharing the same rows. In
    each of the *nrep* repetitions a random ``1/nfold`` share of the rows is
    held out; every block is reduced with ``SelectKBest(f_classif)`` fitted on
    the training rows only; the reduced blocks are stacked horizontally and
    scored with ``sigle_fit``. The mean score is printed and returned.

    *selection* is accepted for interface compatibility and is not used.
    """
    nsample = sparseArrayRowNorm[0].shape[0]
    testsize = int(nsample / nfold)
    # 1 marks a training row, 2 a test row; shuffled on every repetition.
    cvIdx = [1] * (nsample - testsize) + [2] * testsize
    random.seed(100)
    aucRes = []

    for _ in range(nrep):
        random.shuffle(cvIdx)
        # Row indices are the same for every block, so compute them once per
        # repetition instead of once per block.
        fold = np.array(cvIdx)
        train_rows = np.where(fold == 1)[0]
        test_rows = np.where(fold == 2)[0]
        Y_train = y_all[train_rows]
        Y_test = y_all[test_rows]

        X_train_all = []
        X_test_all = []
        for block in sparseArrayRowNorm:
            # Never ask for more features than the block has.
            varSelector = SelectKBest(
                f_classif, k=min(int(nsample * 0.7), block.shape[1]))
            X_train_all.append(
                varSelector.fit_transform(block[train_rows, :], Y_train))
            X_test_all.append(varSelector.transform(block[test_rows, :]))

        X_train = hstack(X_train_all, format='csr')
        X_test = hstack(X_test_all, format='csr')
        del X_train_all
        del X_test_all

        aucRes.append(
            sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))

    mean_score = np.array(aucRes).mean()
    # Single-argument print() produces the same output on Python 2 and 3.
    print(mean_score)
    return mean_score
def KFold_Kbest_summary(features, labels, clf, N_folds, test_size, n_select):
    """Average accuracy, recall and precision of *clf* over stratified splits.

    The *n_select* best features (ANOVA F-test) are selected once on the
    whole data set, then *clf* is refit and scored on *N_folds* stratified
    shuffle splits.

    Returns ``(mean_accuracy, mean_recall, mean_precision)``.
    """
    # NOTE(review): the selector is fitted on ALL samples before splitting,
    # so test samples influence the feature choice (optimistic scores).
    # Kept because per-fold selection would change reported numbers.
    skb = SelectKBest(score_func=f_classif, k=n_select)
    features = skb.fit_transform(features, labels)

    kf = StratifiedShuffleSplit(labels, n_iter=N_folds, test_size=test_size,
                                random_state=42)
    fold_metrics = []
    for train_indices, test_indices in kf:
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]

        clf.fit(features_train, labels_train)
        # Predict once and reuse the result for all three metrics.
        predictions = clf.predict(features_test)
        fold_metrics.append([
            accuracy_score(labels_test, predictions),
            recall_score(labels_test, predictions),
            precision_score(labels_test, predictions),
        ])

    scores = np.array(fold_metrics)
    return np.mean(scores[:, 0]), np.mean(scores[:, 1]), np.mean(scores[:, 2])
def getTfidfData(dataTrain, dataTest, dataHold):
    """Turn three text data sets into TF-IDF matrices with a shared vocabulary.

    The count vectorizer, TF-IDF transformer and chi-squared selector are
    fitted on *dataTrain* only and then applied to *dataTest* and *dataHold*.

    Returns ``(X_tfidf, Y_tfidf, H_tfidf)`` for train, test and hold-out.
    """
    # Single-argument print() produces the same output on Python 2 and 3.
    print(dataTrain.target_names)

    # Vocabulary size is capped at twice the number of training samples.
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)

    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print(X_tfidf.shape)

    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print(Y_tfidf.shape)

    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)

    print('feature selection using chi square test %s'
          % len(dataTrain.target))
    feature_names = count_vect.get_feature_names()
    # k='all' keeps every feature; the selector is still fitted so the
    # chi-squared scores are computed and the support can be read back.
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)

    if feature_names:
        # Keep selected feature names.
        feature_names = [feature_names[i]
                         for i in ch2.get_support(indices=True)]
    if feature_names:
        feature_names = numpy.asarray(feature_names)

    print('important features')
    print(feature_names[:10])
    return X_tfidf, Y_tfidf, H_tfidf
def test_boundary_case_ch2():
    """Every univariate selector must pick only the first feature here."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Each selector is configured so that exactly one feature passes.
    expected_support = np.array([True, False])
    selectors = [
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ]
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
def select_k_best(data_dict, features_list, k):
    """Return a dict of the k best-scoring features and their scores.

    Features are min/max scaled and scored with ``SelectKBest``. The first
    entry of *features_list* is treated as the label. With ``k == "all"``
    every feature is returned.
    """
    labels, features = targetFeatureSplit(featureFormat(data_dict, features_list))

    # Scale every feature before scoring.
    scaled = preprocessing.MinMaxScaler().fit_transform(features)

    k_best = SelectKBest(k=k)
    k_best.fit(scaled, labels)

    # Pair each feature name with its score, best first. Sorting ascending
    # and then reversing keeps the original tie order.
    ranked = sorted(zip(features_list[1:], k_best.scores_),
                    key=lambda pair: pair[1])
    ranked.reverse()

    if k == "all":
        return dict(ranked)
    return dict(ranked[:k])
def test_select_kbest_classif():
    """SelectKBest must recover the informative features of a simple problem.

    The data is generated unshuffled, so the 5 relevant features (3
    informative + 2 redundant) are the first 5 columns.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    univariate_filter.fit(X, y)
    X_r = univariate_filter.transform(X)

    # The generic selector in k_best mode must agree with SelectKBest.
    generic_filter = GenericUnivariateSelect(f_classif, mode="k_best", param=5)
    generic_filter.fit(X, y)
    X_r2 = generic_filter.transform(X)
    assert_array_equal(X_r, X_r2)

    expected_support = np.zeros(20)
    expected_support[:5] = 1
    assert_array_equal(univariate_filter.get_support(), expected_support)
def test_select_kbest_all():
    """With k="all", SelectKBest must return the input unchanged."""
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    keep_everything = SelectKBest(f_classif, k="all")
    keep_everything.fit(X, y)
    X_r = keep_everything.transform(X)

    assert_array_equal(X, X_r)
def helpfulModelingPipelineRFC():
    """Grid-search a SelectKBest + RandomForestClassifier pipeline.

    Loads ``X.p`` and ``y_actual.p``, holds out 15% of the rows, searches
    over the pipeline parameters (scored by precision) using every column of
    X except the last two, prints the best estimator and pickles it to
    ``rfc_best_estimator.p``.
    """
    # Single-argument print() produces the same output on Python 2 and 3.
    print("Loading pickles...")
    X = pd.read_pickle('X.p')
    y_actual = pd.read_pickle('y_actual.p')

    X_train, X_test, y_actual_train, y_actual_test = train_test_split(
        X, y_actual, test_size=0.15, random_state=0)
    print(y_actual_train.head())

    # The last two columns of X are not used as model inputs.
    train_inputs = X_train.iloc[:, 0:len(X_train.columns) - 2]

    # The selector and classifier are only templates: GridSearchCV clones and
    # fits them itself, so the selector is not pre-fitted here.
    selection = SelectKBest(f_classif, k=15)
    rfc = RandomForestClassifier(criterion='entropy')
    pipeline = Pipeline([('feature_selection', selection), ('rfc', rfc)])

    param_grid = dict(feature_selection__k=[11, 13, 14, 15, 16],
                      rfc__n_estimators=[950, 1000, 1050],
                      rfc__max_depth=[13, 14, 15, 16],
                      rfc__min_samples_split=[4, 5, 6, 7],
                      rfc__min_samples_leaf=[1, 2, 3])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                               scoring='precision', cv=20, verbose=10,
                               n_jobs=15)
    grid_search.fit(train_inputs, y_actual_train['is_helpful'].values)
    print(grid_search.best_estimator_)

    # Close the file deterministically so the pickle is fully flushed.
    with open("rfc_best_estimator.p", "wb") as handle:
        pickle.dump(grid_search.best_estimator_, handle)
def to_weka_arff(ngram, number_of_features):
    """Export chi-squared-selected TF-IDF tweet features to ``data/sport.arff``.

    :param ngram: upper bound of the n-gram range used by the vectorizer.
    :param number_of_features: number of features kept by ``SelectKBest``.
    """
    count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2',
                                 sublinear_tf=True)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    features = count_vect.fit_transform(tweet_list)
    features = SelectKBest(chi2, k=number_of_features).fit_transform(
        features, label_list)
    # Single-argument print() produces the same output on Python 2 and 3.
    print(features.shape)

    # ARFF header: one REAL attribute per kept feature plus the class.
    arff_data = ["@RELATION sport"]
    arff_data.extend("@ATTRIBUTE feature" + str(i) + " REAL"
                     for i in range(features.shape[1]))
    arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_data.append("@DATA")

    # One CSV row per tweet: feature values followed by its label.
    for i, feature in enumerate(features.toarray()):
        csv_feature = ",".join(str(x) for x in feature)
        arff_data.append(csv_feature + "," + label_list[i])

    # `arff_file` rather than `file`, which shadows a builtin on Python 2.
    with open('data/sport.arff', 'w') as arff_file:
        arff_file.writelines("%s\n" % item for item in arff_data)
def tfidf_classify(user):
    """Return the source items whose test text is classified as positive.

    Data comes from ``extract_data(user.id)``. Texts are hashed into 1000
    features, reduced with a chi-squared ``SelectKBest`` and classified with
    a 4-nearest-neighbours model. An empty list is returned when there is no
    training data.
    """
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []

    # HashingVectorizer is stateless, so transform() needs no prior fit.
    vector = HashingVectorizer(n_features=1000, non_negative=True,
                               stop_words='english')
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Keep the best features by chi-squared score (SelectKBest default k).
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)

    # Keep the source entry of every test item with a truthy prediction.
    # Presumably src is aligned with test_set -- verify in extract_data.
    final = [src[i] for i, flagged in enumerate(result) if flagged]
    # Single-argument print() produces the same output on Python 2 and 3.
    print(len(final))
    return final
def gridSearchCV_test():
    """Grid-search MultinomialNB on chi-squared-selected permission features.

    Prints the best parameters, the score of every grid point, and the
    accuracy and classification report on a 20% hold-out split.
    """
    selector = SelectKBest(chi2, k=20)

    train_data = db_tool.get_new_train_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_data['permission-data'], train_data['target'],
        test_size=0.2, random_state=1)

    # The selector is fitted on the training part only.
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    param_grid = [
        {'alpha': [1, 0.4, 10], 'fit_prior': [True, False]},
        {'alpha': [0, 9, 0.4], 'fit_prior': [True]},
    ]
    search = grid_search.GridSearchCV(MultinomialNB(), param_grid)
    search.fit(X_train, y_train)

    print(search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in search.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))

    y_pred = search.predict(X_test)
    print (metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
def univariate_features_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Keep the 10 best columns of *x* by chi-squared score against *y*.

    The entries of ``self.features`` at the kept positions are printed, and
    the reduced array is returned.
    """
    selector = SelectKBest(chi2, k=10).fit(x, y)

    # Boolean mask over the input columns; True marks a kept feature.
    kept_mask = selector.get_support()
    print(self.features[kept_mask])

    return selector.transform(x)
def string_selection():
    """Train MultinomialNB on the 100 best string-count features.

    Prints the chi-squared scores, the kept feature indices and names, and
    the accuracy on a 20% hold-out split.
    """
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_data['string-data'], train_data['target'],
        test_size=0.2, random_state=1)

    # Feature extraction and selection, fitted on the training part only.
    train_counts = vectorizer.fit_transform(x_train)
    vocabulary = vectorizer.get_feature_names()
    train_selected = ch2.fit_transform(train_counts, y_train)

    kept_indices = ch2.get_support(indices=True)
    feature_names = [vocabulary[i] for i in kept_indices]
    print(ch2.scores_)
    print(kept_indices)
    print(feature_names)

    test_selected = ch2.transform(vectorizer.transform(x_test))

    # Build and validate the model.
    model = MultinomialNB().fit(train_selected, y_train)
    predicted = model.predict(test_selected)
    print (metrics.accuracy_score(y_test, predicted))
def get_k_best(x, y, k=300):
    """Return the names of the k best columns of *x* by ANOVA F-test.

    :param x: feature table exposing ``.columns`` (e.g. a DataFrame).
    :param y: target values.
    :param k: number of columns to keep (default 300).
    :return: ``x.columns`` restricted to the selected features.
    """
    # Honor the caller's k; it was previously ignored in favor of a
    # hard-coded 300.
    sk = SelectKBest(f_classif, k=k)
    # Only the support mask is needed, so fit() is enough.
    sk.fit(x, y)
    return x.columns[sk.get_support()]
def main(): inp = open('C:/Users/Abhi/workspace/MalwareClassification/ASMTRAINFULLDATA.csv','r') trainData = inp.readlines() trainData = trainData[2:] td=[] print len(trainData) for line in trainData: td.append(line.split(',')) out = [] #print len(td[2]) for i in range(len(td)): out.append(int(td[i][1])) td[i] = td[i][2:-1] for j in range(len(td[0])): td[i][j] = int(td[i][j]) '''for i in range(len(td)): nConstant = sum(td[i]) for j in range(len(td[0])): td[i][j] =td[i][j]/nConstant ''' #print td[0] #print len(td[0]) clf = SelectKBest(k=100) b = clf.fit_transform(td,out) #print b[0] j =clf.get_support(indices =True) #print len(b), len(b[0]) #print j '''k=0
def select_k_best_features(dataset, features_list, k):
    """
    For E+F dataset, select k best features based on SelectKBest from
    sklearn.feature_selection

    Input:
    dataset: data in dictionary format
    features_list: the full list of features to selection from
    k: the number of features to keep, or "all" to keep every feature

    Return:
    the list of length of k+1 with the first element as 'poi' and
    other k best features
    """
    labels_train, __, features_train, __ = \
        test_training_stratified_split(dataset, features_list)

    k_best = SelectKBest(k=k)
    k_best.fit(features_train, labels_train)

    # Rank features by score, best first (features_list[0] is the label).
    impt_unsorted = zip(features_list[1:], k_best.scores_)
    impt_sorted = sorted(impt_unsorted, key=lambda x: x[1], reverse=True)
    k_best_features = [elem[0] for elem in impt_sorted]
    # SelectKBest accepts k="all"; slicing with it would raise TypeError.
    if k != "all":
        k_best_features = k_best_features[:k]

    # Single-argument print() produces the same output on Python 2 and 3.
    print("%s best features:" % (k,))
    print(k_best_features)
    return ['poi'] + k_best_features
def find_features(dataset, features, target):
    """Plot -log10(p-value) of the ANOVA F-test for each feature column.

    *features* lists the columns to score against the column named by
    ``target[0]``. A bar chart is shown; nothing is returned.
    """
    selector = SelectKBest(f_classif, k=5)
    selector.fit(dataset[features], dataset[target[0]])

    # Smaller p-value -> taller bar.
    scores = -np.log10(selector.pvalues_)

    positions = range(len(features))
    plt.bar(positions, scores)
    plt.xticks(positions, features, rotation="vertical")
    plt.show()
def helpfulModelingPipelineGBC():
    """Grid-search a SelectKBest + GradientBoostingClassifier pipeline.

    Loads ``X.p`` and ``y_actual.p``, holds out 15% of the rows, searches
    over the pipeline parameters (scored by recall) using every column of X
    except the last two, prints the best estimator and the columns it
    selected, and pickles it to ``gbc_best_estimator.p``.
    """
    # Single-argument print() produces the same output on Python 2 and 3.
    print("Loading pickle...")
    X = pd.read_pickle('X.p')
    y_actual = pd.read_pickle('y_actual.p')

    print("X head without the body and the comment_id:")
    print(X.iloc[:, 0:len(X.columns) - 2].head())
    print("y_actual:")
    print(y_actual['is_helpful'].values)

    X_train, X_test, y_actual_train, y_actual_test = train_test_split(
        X, y_actual['is_helpful'].values, test_size=0.15, random_state=0)

    # The selector and classifier are only templates: GridSearchCV clones and
    # fits them itself, so the selector is not pre-fitted here.
    selection = SelectKBest(f_classif, k=15)
    gbc = GradientBoostingClassifier(n_estimators=200)
    print(np.unique(X_train.iloc[:, 5:6]))

    # Pipeline of feature selection and gradient boosting classifier.
    pipeline = Pipeline([('feature_selection', selection), ('gbc', gbc)])
    param_grid = dict(feature_selection__k=[9, 10, 11, 12, 14],
                      gbc__n_estimators=[450, 500, 550],
                      gbc__max_depth=[33, 35, 40],
                      gbc__min_samples_split=[1, 2, 3],
                      gbc__min_samples_leaf=[2, 3, 4])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                               scoring='recall', cv=15, verbose=10, n_jobs=15)
    grid_search.fit(X_train.iloc[:, 0:len(X_train.columns) - 2],
                    y_actual_train)

    best_estimator = grid_search.best_estimator_
    print(best_estimator)
    # Report the columns chosen by the BEST estimator. The template
    # `pipeline` is never fitted by the search (it is cloned), so reading
    # its selector would not describe the model that is saved below.
    selected_mask = best_estimator.named_steps['feature_selection'].get_support()
    print("Just the selected columns:"
          + str(X.iloc[:, 0:len(X.columns) - 2].columns[selected_mask]))

    # Close the file deterministically so the pickle is fully flushed.
    with open("gbc_best_estimator.p", "wb") as handle:
        pickle.dump(best_estimator, handle)
# Finish the plot started above this excerpt: label both axes and show it.
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

# In[]
# Pre processing the data
# Features are every column except "Outcome"; "Outcome" is the target.
X = dataframe.drop(["Outcome"], axis=1)
y = dataframe["Outcome"]

# In[]
# 1. Best Feature Selection
# Univariate Selection
from sklearn.feature_selection import SelectKBest, chi2

test = SelectKBest(score_func=chi2, k=4)  # Chose best 4 features
fit = test.fit(X, y)
print(fit.scores_)
# From here on X holds only the 4 selected columns.
# NOTE(review): the selector is fitted on the full data set, before the
# train/test split below, so test rows influence the feature choice.
X = fit.transform(X)
print(X[0:5, :])  # 77% accuracy

# In[]
# Model Training
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Model Evaluations with Kfolds
# Report how long the step timed from t0 took and the training matrix shape.
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

# The test documents go through the same vectorizer (transform only).
print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

# Optional chi-squared feature selection: opts.select_chi2 is both the
# on/off switch and the number of features to keep. The selector is fitted
# on the training data and then applied to the test data.
if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" % opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print "done in %fs" % (time() - t0)
    print


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    # Strings longer than 80 characters are cut to 77 plus "...".
    return s if len(s) <= 80 else s[:77] + "..."


# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
def initialize(filename,
               labels_train,
               typetoread,
               toexclude=None,
               n_estimators=None,
               estimators_to_test=None,
               class_weight=None):
    """
    Takes in features and labels pertaining to a tag and fits and returns a
    TfidfVectorizer, SelectKBest selector, and RandomForestClassifier

    :param filename: The base file location where information about the
        dataset can be found.
    :param labels_train: The labels to use when classifying.
    :param typetoread: The features list to use ("Use" or "Description")
    :param toexclude: A list of indices of the features list to exclude from
        classification. Useful to exclude values known to be positive or
        negative without classifier use. If not given, assumes all features
        are valid.
    :param n_estimators: The number of trees to use in the Random Forest
        Classifier as per the sklearn documentation. If not given,
        GridSearchCV will select between 50, 150, and 250.
    :param estimators_to_test: A list of different numbers of estimators to
        test using GridSearchCV as per the sklearn documentation. Only used
        when n_estimators is not given. If not given, GridSearchCV will
        select between 50, 150, and 250.
    :param class_weight: The weightings to use for the various classes as per
        the sklearn documentation. If not given, all classes have equal weight
    :return forest: A fitted RandomForestClassifier, or a fitted GridSearchCV
        wrapping one when n_estimators is not given.
    :return vectorizer: A fitted TfidfVectorizer.
    :return selector: A fitted SelectKBest keeping the 18000 best features.
    """
    features_path = os.path.abspath("../DataFiles/" + filename + "features" +
                                    typetoread)
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    # The handle is closed as soon as the data is loaded.
    with open(features_path, "rb") as features_file:
        features_train = pickle.load(features_file)
    labels_train = pd.Series(labels_train)
    if toexclude:
        features_train = pd.Series(
            np.delete(np.array(features_train), toexclude, axis=0))

    print("Creating Vectorizer")
    vectorizer = TfidfVectorizer(stop_words="english",
                                 max_df=.5,
                                 ngram_range=(1, 3))
    print("Fitting Vectorizer")
    features_train_transformed = vectorizer.fit_transform(features_train)
    # Drop references to large intermediates as soon as they are not needed.
    features_train = None

    print("Creating Selector")
    selector = SelectKBest(k=18000)
    print("Fitting Selector")
    selector.fit(features_train_transformed, labels_train)
    print("Transforming data")
    features_train_transformed_selected = selector.transform(
        features_train_transformed)
    features_train_transformed = None
    features_train_transformed_selected = \
        features_train_transformed_selected.toarray()

    print("Creating Forest")
    if n_estimators:
        forest = RandomForestClassifier(n_estimators=n_estimators,
                                        min_samples_leaf=2,
                                        class_weight=class_weight)
    else:
        # No fixed tree count: let GridSearchCV pick one.
        base_forest = RandomForestClassifier(min_samples_leaf=2,
                                             class_weight=class_weight)
        if estimators_to_test:
            parameters = {"n_estimators": estimators_to_test}
        else:
            parameters = {"n_estimators": [50, 150, 250]}
        forest = GridSearchCV(base_forest, parameters)

    print("Fitting Forest")
    forest.fit(features_train_transformed_selected, labels_train)
    return forest, vectorizer, selector
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data: columns 1..7 are the candidate features, the last column
# is the target.
df = pd.read_csv('Wholesale.csv', index_col=False)
x = df.iloc[:, 1:8]
y = df.iloc[:, -1].values

# Feature selection: score every feature with chi2 and list the top five.
best_features = SelectKBest(score_func=chi2, k=5)
fit = best_features.fit(x, y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(x.columns)
features_scores = pd.concat([df_columns, df_scores], axis=1)
features_scores.columns = ['features', 'score']  # name the two columns
print(features_scores.nlargest(5, 'score'))

# Hold out 10% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=0)

# Standardise using statistics learned on the training split only.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
new_dataset[k] = df[k] else: imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') df = pd.DataFrame(imp_mean.fit_transform(df)) df.columns = [k] new_dataset[k] = df[k] #Encoding Categorical Data categorical_variables = list( new_dataset.select_dtypes( exclude=['int64', 'float', 'bool']).columns.values) new_dataset = pd.get_dummies(new_dataset, prefix_sep="__", columns=categorical_variables) #Now all that is left is to LabelEncode the independent variable which is only needed for classification and we can remove this step for regression '''labelencoder_y = LabelEncoder() y = labelencoder_y.fit_transform(y)''' #Next is Feature Selection #Can be implemented in two ways #1)Proposed by Tathagat from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import f_classif test = SelectKBest(score_func=f_classif, k=4) fity = test.fit(X, Y) global features features = fity.transform(X) nm = (X.columns[fity.get_support()]) dr = pd.DataFrame(features, columns=nm)
elif os.uname()[1] in ['mia.local', 'mia']: db_path = '/Users/fraimondo/data/pet_suv_db/' group = 'Paris' df = pd.read_csv( op.join(db_path, 'Liege', 'group_results_SUV', 'Liege' + '_db_GM_AAL.csv')) gen_df = pd.read_csv( op.join(db_path, group, 'group_results_SUV', group + '_db_GM_AAL_nocereb.csv')) df_train = df.query('QC_PASS == True and ML_VALIDATION == False') df_test = gen_df.query('QC_PASS == True and ML_gener == True') classifiers = OrderedDict() classifiers['SVC_rec'] = Pipeline([('scaler', RobustScaler()), ('select', SelectKBest(f_classif, 10)), ('clf', SVC(kernel="linear", C=1, probability=True, class_weight={ 0: 1, 1: 2.4 }))]) classifiers['SVC_prec'] = Pipeline([('scaler', RobustScaler()), ('select', SelectKBest(f_classif, 10)), ('clf', SVC(kernel="linear", C=1, probability=True, class_weight={
"C": [0.5, 1., 2., 3., 4., 5.], "class_weight": ['auto', 'balanced'], "k": [500, 750, 1000, 1500, 2000, 2500] } scores = [] max_score = 0 for C in parameters["C"]: for class_weight in parameters["class_weight"]: for k in parameters["k"]: print "Starting test for parameters " + str(C) + ", " + str( k) + ", " + class_weight my_Pipeline = Pipeline([ ('tfidf', tfidf_transformer), ('select', SelectKBest(chi2, k=k)), ('clf', LogisticRegression(C=C, class_weight=class_weight)), ]) my_OvR = OneVsRestClassifier(my_Pipeline, n_jobs=-1) fitted_Pipeline = my_OvR.fit(count_vect.fit_transform(questions), topics) probabilities_array = fitted_Pipeline.predict_proba( count_vect.transform(test_questions)) with open("labeler_samples.myans", "w") as f: probabilities_array = fitted_Pipeline.predict_proba( count_vect.transform(test_questions)) for probabilities in probabilities_array: top = top_10_elements_helper(probabilities) string_to_write = [] for i in reversed(top):
"""
If the features are categorical, calculate a chi-square (χ2) statistic
between each feature and the target vector. However, if the features are
quantitative, compute the ANOVA F-value between each feature and the target
vector.

The F-value scores examine whether, when we group the numerical feature by
the target vector, the means for each group are significantly different.
"""
# Load libraries
from sklearn import datasets
from sklearn.feature_selection import SelectKBest, f_classif

# Load iris data
iris = datasets.load_iris()

# Create features and target
x, y = iris.data, iris.target

# Create a SelectKBest object that keeps the two features with the best
# ANOVA F-values
fvalue_selector = SelectKBest(f_classif, k=2)

# Apply the SelectKBest object to the features and target
x_kbest = fvalue_selector.fit_transform(x, y)

# Show results
print('Original number of features:', x.shape[1])
print('Reduced number of features:', x_kbest.shape[1])
SEED = 1234 X = wine_data.drop('Class', axis=1) y = wine_data['Class'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED) print('\nEvaluating models...') results = [] names = [] for name, model in SPOT_CHECK_MODELS: features = [] features.append(('select_best', SelectKBest(k=3))) feature_union = FeatureUnion(features) estimators = [] estimators.append(('feature_union', feature_union)) estimators.append(('model', model)) pipeline_model = Pipeline(estimators) cv_score = calculate_cv_score(pipeline_model, X_train, y_train, SEED, n_splits=10) cv_score_mean = cv_score.mean() cv_score_std = cv_score.std() metrics = {
def KNN():
    """Run the k-NN / AdaBoost experiment on the scikit-learn digits data.

    Steps: split the data 70/30, keep the 45 best features according to
    f_regression, fit a distance-weighted 5-NN classifier, grid-search its
    ``algorithm`` and ``weights`` settings, plot the test error rate for
    k = 1..39 and finally print the training accuracy of an AdaBoost model.

    Returns nothing; the results are printed and drawn on a matplotlib figure.
    """
    digits = load_digits()
    # NOTE(review): the label is taken from the LAST COLUMN of digits.data,
    # not from digits.target. That looks like a leftover from a dataset whose
    # label lived in the final column - confirm which target is intended.
    data_features = digits.data[:, 0:-1]
    label = digits.data[:, -1]

    (digits_trainingX, digits_testingX,
     digits_trainingY, digits_testingY) = train_test_split(
         data_features, label, test_size=0.3, random_state=0, stratify=label)

    # ======================== CITATION BELOW ==============================================#
    #https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    kb = SelectKBest(score_func=f_regression, k=45)
    kb.fit(digits_trainingX, digits_trainingY)
    mask = kb.get_support()
    # ======================== CITATION ABOVE ==============================================#

    # Boolean-mask indexing keeps exactly the selected columns; this replaces
    # the DataFrame round-trip (and a loop variable that shadowed ``bool``).
    digits_trainingX = digits_trainingX[:, mask]
    digits_testingX = digits_testingX[:, mask]

    classifier = KNeighborsClassifier(n_neighbors=5, weights='distance')
    classifier.fit(digits_trainingX, digits_trainingY)
    prediction = classifier.predict(digits_testingX)
    # The score is computed but not used anywhere, as in the original.
    accuracy_score(digits_testingY, prediction)

    seed = 52
    param_grid = dict(algorithm=['ball_tree', 'kd_tree'],
                      weights=['uniform', 'distance'])
    # random_state only has an effect when shuffle=True; recent scikit-learn
    # raises ValueError for KFold(random_state=...) without shuffling.
    grid = GridSearchCV(estimator=classifier,
                        param_grid=param_grid,
                        cv=KFold(shuffle=True, random_state=seed),
                        verbose=10,
                        scoring='accuracy')
    grid.fit(digits_trainingX, digits_trainingY)

    # ======================== CITATION BELOW ==============================================#
    # https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/
    error = []
    for i in range(1, 40):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(digits_trainingX, digits_trainingY)
        pred_i = knn.predict(digits_testingX)
        error.append(np.mean(pred_i != digits_testingY.T))

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 40),
             error,
             color='red',
             linestyle='dashed',
             marker='o',
             markerfacecolor='blue',
             markersize=10)
    plt.title('Error Rate K Value')
    plt.xlabel('K Value')
    plt.ylabel('Mean Error')
    # ======================== CITATION ABOVE ==============================================#

    AdaBoost = AdaBoostClassifier(n_estimators=400,
                                  learning_rate=1,
                                  algorithm='SAMME')
    AdaBoost.fit(digits_trainingX, digits_trainingY)
    # Accuracy on the training split (not a held-out estimate).
    boosted_accuracy = AdaBoost.score(digits_trainingX, digits_trainingY)
    print('Accuracy post-boosting: ', boosted_accuracy * 100, '%')
X_test_poly = polyFeatures(X_test_ISO, ConColsNum, CatColsNum)

# Linear models fitted on the polynomial feature set.
LR = LinearRegression().fit(X_train_poly, y_train)
RG = Ridge().fit(X_train_poly, y_train)
LA = Lasso().fit(X_train_poly, y_train)
EN = ElasticNet().fit(X_train_poly, y_train)
linear_scores = [
    model.score(X_test_poly, y_test) for model in (LR, RG, LA, EN)
]
print('LRScore:{}\nRidgeScore:{}\nLassoScore:{}\nElasticNetScore:{}'.format(
    *linear_scores))

# Non-linear models fitted on the non-polynomial features.
GB = GradientBoostingRegressor().fit(X_train_ISO, y_train)
gb_score = GB.score(X_test_ISO, y_test)
print("Gradient Boosting score: {}".format(gb_score))

svr = SVR().fit(X_train_ISO, y_train)
svr_score = svr.score(X_test_ISO, y_test)
print("SVM grid search score: {}".format(svr_score))

# Univariate selection: keep the 20 features with the best f_regression score.
select = SelectKBest(k=20, score_func=f_regression)
select.fit(X_train_ISO, y_train)
X_train_sub = select.transform(X_train_ISO)
X_test_sub = select.transform(X_test_ISO)
LR_selected = LinearRegression().fit(X_train_sub, y_train)
LR_selected.score(X_test_sub, y_test)

# Keep only the columns whose absolute correlation with the target
# exceeds 0.03.
important_features = [
    column for column in list(X_train)
    if abs(y_train.corr(X_train[column])) > 0.03
]
X_train = X_train[important_features]
X_test = X_test[important_features]
def _return_bucket(value):
    """Map a return value to its ordinal class: 0 (<=2) up to 6 (>20)."""
    value = float(value)
    for bucket, upper_bound in enumerate((2, 5, 8, 12, 15, 20)):
        if value <= upper_bound:
            return bucket
    return 6


def _build_dataset(records, target_index, feature_indices):
    """Return (X, Y): float feature vectors and bucketed targets per record."""
    X, Y = [], []
    for record in records:
        target_variable = _return_bucket(record[target_index])
        X.append([float(record[i]) for i in feature_indices])
        Y.append(target_variable)
    return X, Y


def predict_return(filename):
    """Train per-horizon classifiers on one stock file and predict returns.

    The CSV ``.\\StocksDisp\\<filename>`` is read; complete rows (no empty
    cell, not the header) form the data set, split 80:20 in file order into
    training and test data. Rows dated 21/22/25/26/27-09-2017 form the
    validation set. For each return column 8..15 the module-level ``logistic``
    and ``svm`` models are fitted on PCA-transformed features, the SVM test
    accuracy is printed and the logistic model predicts the validation rows.

    :param filename: name of the CSV file inside ``.\\StocksDisp\\``.
    :return: dict mapping 'R1'..'R8' to a list with one entry per validation
        row: ``(predicted_class, actual_class)`` for R1..R7 and only the
        predicted class for R8.

    Side effects: appends to the module-level ``confusion_matrix_predicted``
    and ``confusion_matrix_actual`` lists, refits ``logistic`` and ``svm``,
    prints to stdout and waits for Enter once per return column.
    """
    returns_dict = {}
    # Columns used as model inputs: 1-4, 6, 7 and 40-100.
    Feature_indices = [1, 2, 3, 4, 6, 7] + list(range(40, 101))
    validation_dates = {
        '21-09-2017', '22-09-2017', '25-09-2017', '26-09-2017', '27-09-2017'
    }
    validate_list = []
    stock_list = []
    target_list = []

    # Escaped backslashes: same path as before, without the invalid '\S'
    # escape sequence the original literal relied on.
    with open('.\\StocksDisp\\' + filename) as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if '' not in row and row[0] != 'Date':
                stock_list.append(row)
            # NOTE(review): the validation rows are collected independently
            # of the "no empty cell" filter, because the R8 target of these
            # dates is never read below - confirm against the original layout.
            if row[0] in validation_dates:
                target_list.append(row[8:16])
                validate_list.append([
                    float(row[i]) for i in Feature_indices if i < len(row)
                ])

    # Training Data and Test Data are being split in the ratio 80:20
    TrainingDataIndex = int(0.8 * len(stock_list))
    TrainingData = stock_list[:TrainingDataIndex]
    TestData = stock_list[TrainingDataIndex:]

    for index in range(8, 16):  # return columns R1..R8
        X_train, Y_train = _build_dataset(TrainingData, index,
                                          Feature_indices)
        X_test, Y_test = _build_dataset(TestData, index, Feature_indices)

        # Diagnostic only: the fitted selector is printed, never applied.
        test = SelectKBest(score_func=chi2, k=50)
        fit = test.fit(X_train, Y_train)
        print(fit)
        input()  # interactive pause kept from the original workflow

        # Fit PCA on the training data once and apply the SAME projection to
        # the test and validation data; the original predicted on
        # untransformed features with models trained in PCA space.
        pca = PCA()
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

        logistic.fit(X_train, Y_train)
        svm.fit(X_train, Y_train)
        svm_prediction = svm.predict(X_test)
        print(accuracy_score(Y_test, svm_prediction))

        answer_list = logistic.predict(pca.transform(validate_list))
        temp = []
        for i in range(len(target_list)):
            if index != 15:
                target_variable = _return_bucket(target_list[i][index - 8])
                temp.append((answer_list[i], target_variable))
                confusion_matrix_predicted.append(answer_list[i])
                confusion_matrix_actual.append(target_variable)
            else:
                # No actual value is paired with the R8 prediction.
                temp.append(answer_list[i])
        returns_dict['R' + str(index - 7)] = temp
    return returns_dict
""" svc_selectkbest.py script ############################################ ###### Written by: Mikolaj Buchwald ###### ############################################ Example of SVC (Support Vector Classifier) and SelectKBest (feature selection) of the Haxby's database. Based strongly on Alexandre Abraham's code: https://github.com/AlexandreAbraham/frontiers2013 Novum is sampling step (split original dataset train and test subsets) and classification performed on test data. Leave-40%-samples-out cross validation has been performed to prove the accuracy of the model (classifier - SVC) Subject 001, data preprocessed with fsl: * brain extraction * motion correction """ # ### Load Haxby dataset ###################################################### import numpy as np import nibabel from sklearn.datasets.base import Bunch from os.path import expanduser
from sklearn.feature_selection import SelectKBest, chi2

# Load the data set (an URL such as 'https://myfilecsv.com/test.csv' could be
# used instead of a local file).
filename = 'pima-indians-diabetes.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Split the array into input (first 8 columns) and output (9th column).
X, Y = array[:, 0:8], array[:, 8]

# Feature selection: keep the 4 features with the highest chi2 score.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# Print the scores for the features.
set_printoptions(precision=3)
print(fit.scores_)

# Print the first five rows of the 4 selected feature columns.
features = fit.transform(X)
print(features[0:5, :])
# Map every topic name to its integer class id (0..7).
classes = {
    topic: class_id
    for class_id, topic in enumerate([
        'acq', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship',
        'trade'
    ])
}
Y = np.array([classes[topic] for topic in data["Y"]])
X = data["X"]

# Dense TF-IDF matrix of the documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X.values.astype('U')).toarray()

# Keep the 10% of the terms with the highest mutual information.
selector = SelectKBest(mutual_info_classif, k=int(0.1 * len(X[1])))
x_train = selector.fit_transform(X, Y)
y_train = Y

# Uncomment to use test dataset:
# data = pd.read_table("data/WebKB/webkb-test-stemmed.txt")
# data.columns=["Y","X"]
# classes = {'project':0, 'facu lty':1, 'course':2, 'student':3}
# Y = np.array([classes[i0] for i0 in data["Y"]])
# X = data["X"]
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(X.values.astype('U')).toarray()
# x_test = SelectKBest(chi2, k=0.1*len(X[1]).fit_transform(X, y))
# y_test = Y

# Minkowski distance with p=2
knn_model = KNeighborsClassifier(n_neighbors=3, weights='distance', n_jobs=1)
def test_gridsearch_pipeline():
    """Check the pretty-printed repr of a GridSearchCV wrapping a Pipeline.

    The grid swaps the 'reduce_dim' step between PCA/NMF and
    SelectKBest(chi2); the repr produced by _EstimatorPrettyPrinter is
    compared verbatim against ``expected``.
    """
    # render a pipeline inside a gridsearch
    pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
    pipeline = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC())
    ])
    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        }
    ]
    gspipline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
    # NOTE(review): the comparison at the end is an exact string match, so
    # the whitespace inside this literal is significant - confirm that it
    # matches the line breaks and indentation the pretty-printer emits.
    expected = """ GridSearchCV(cv=3, error_score='raise-deprecating', estimator=Pipeline(memory=None, steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classify', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False))]), iid='warn', n_jobs=1, param_grid=[{'classify__C': [1, 10, 100, 1000], 'reduce_dim': [PCA(copy=True, iterated_power=7, n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False), NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=None, shuffle=False, solver='cd', tol=0.0001, verbose=0)], 'reduce_dim__n_components': [2, 4, 8]}, {'classify__C': [1, 10, 100, 1000], 'reduce_dim': [SelectKBest(k=10, score_func=<function chi2 at some_address>)], 'reduce_dim__k': [2, 4, 8]}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)"""
    expected = expected[1:]  # remove first \n
    repr_ = pp.pformat(gspipline)
    # Remove address of '<function chi2 at 0x.....>' for reproducibility
    repr_ = re.sub('function chi2 at 0x.*>',
                   'function chi2 at some_address>', repr_)
    assert repr_ == expected
## Feature extraction with the χ2 (chi-squared) test
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd

# Ten feature columns 'A'..'J' followed by the label column.
feature = list('ABCDEFGHIJ')
df1 = pd.read_csv('dataset3_1.csv',
                  index_col=False,
                  names=feature + ['label'])

# Keep the two features with the highest chi2 score.
df1_new = SelectKBest(chi2, k=2).fit_transform(df1[feature], df1['label'])
print(df1_new)
def select_best_k_features(self, data, y, k):
    """Return the column indices of the ``k`` features with the best chi2 score.

    ``data`` and ``y`` are passed unchanged to ``SelectKBest(chi2, k=k).fit``.
    """
    fitted_selector = SelectKBest(chi2, k=k).fit(data, y)
    return fitted_selector.get_support(indices=True)
duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_test.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [ feature_names[i] for i in ch2.get_support(indices=True) ] print("done in %fs" % (time() - t0)) print() if feature_names: feature_names = np.asarray(feature_names) def trim(s):
def _import_lda():
    """Return the LDA estimator class from whichever module this sklearn has."""
    try:
        from sklearn.discriminant_analysis import (
            LinearDiscriminantAnalysis as LDA)
    except ImportError:  # old scikit-learn releases
        from sklearn.lda import LDA
    return LDA


def get_gridsearch_classifier(clf_name):
    """Return an unfitted estimator and its grid-search parameter grid.

    :param clf_name: one of 'sklLogregL1', 'sklLinSvm', 'rbfSvm',
        'ttestRbfSvm', 'ttestLinSvm', 'rfeLinSvm', 'enetLogRegSpams',
        'PcaLda', 'PcaLinSvm', 'PcaRbfSvm', 'ttestLDA'.
    :return: tuple ``(clf, param_grid, is_sparse)`` where ``clf`` is the
        estimator (or Pipeline), ``param_grid`` maps parameter names to the
        candidate values, and ``is_sparse`` is True for the methods whose
        number of non-zero coefficients is only known after fitting (L1 /
        elastic-net models); for the feature-selection pipelines it is False
        because the number of features is chosen beforehand.
    :raises ValueError: if ``clf_name`` is not one of the names above
        (previously this surfaced as an UnboundLocalError on return).

    Imports are done per branch so that only the packages needed for the
    requested method have to be installed.
    """
    is_sparse = False  # set to True in the branches of sparse methods

    if clf_name == 'sklLogregL1':
        # L1 logistic regression
        np.random.seed(0)  # needed to ensure replicability of the fit
        from sklearn.linear_model import LogisticRegression
        # liblinear was the historical default solver and supports the L1
        # penalty; naming it keeps this working where the default changed.
        clf = LogisticRegression(penalty='l1',
                                 solver='liblinear',
                                 random_state=0)
        param_grid = {'C': 2.**np.arange(-8, 15, 2)}
        is_sparse = True
    elif clf_name == 'sklLinSvm':
        # Linear SVM (hinge loss)
        from sklearn.svm import LinearSVC
        clf = LinearSVC(loss='hinge')
        param_grid = {'C': 2.**np.arange(-22, -6, 2)}
    elif clf_name == 'rbfSvm':
        # RBF kernel SVM
        from tak.ml import PrecomputedRBFSVM
        clf = PrecomputedRBFSVM()
        param_grid = {
            'C': 10.**np.arange(-1, 10, 1),
            'gamma': 10.**np.arange(-12, 1, 1),
        }
    elif clf_name == 'ttestRbfSvm':
        # ttest + RBF kernel SVM using Pipeline (3 parameters)
        from tak.ml import ttest_for_fs, PrecomputedRBFSVM
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([('ttest', ttest_fs), ('svm', PrecomputedRBFSVM())])
        # pipeline parameters are addressed as <step name>__<parameter>
        param_grid = {
            # k must be int, or scikit will complain
            'ttest__k': (2**np.arange(4, 11, 1)).astype(int),
            'svm__C': 10.**np.arange(-1, 10, 1),
            'svm__gamma': 10.**np.arange(-12, 1, 1),
        }
    elif clf_name == 'ttestLinSvm':
        # ttest + liblinear Pipeline (2 parameters)
        from tak.ml import ttest_for_fs
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('liblin', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            # k must be int, or scikit will complain
            'ttest__k': (2**np.arange(4, 12, 1)).astype(int),
            'liblin__C': 2.**np.arange(-18, 1, 2),
        }
    elif clf_name == 'rfeLinSvm':
        # RFE + linear SVM with hinge loss (2 parameters), added 11/07/2015
        from tak.ml import RFESVM
        clf = RFESVM(step=0.1)
        param_grid = {
            # must be int, or scikit will complain
            'n_features_to_select': (2**np.arange(4, 12, 1)).astype(int),
            'C': 2.**np.arange(-18, 3, 2),
        }
    elif clf_name == 'enetLogRegSpams':
        # Elastic-net logistic regression via the SPAMS wrapper (2 parameters)
        from tak.ml import SpamFistaFlatWrapper
        param_grid = {
            'lambda1': 2.**np.arange(-12, 1, 1),  # L1 penalty (lambda1 in SPAMS)
            'lambda2': 2.**np.arange(-10, 8, 2),  # L2 penalty (lambda2 in SPAMS)
        }
        clf = SpamFistaFlatWrapper(loss='logistic',
                                   regul='elastic-net',
                                   max_it=400,
                                   tol=1e-3)
        is_sparse = True
    # === PCA variants: no interpretability, but see if accuracy improves ===
    elif clf_name == 'PcaLda':
        # PCA + LDA (1 parameter)
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        LDA = _import_lda()
        clf = Pipeline([
            ('PCA', PCA()),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'PCA__n_components': np.array([10, 50, 500]).astype(int)}
    elif clf_name == 'PcaLinSvm':
        # PCA + linear SVM
        from sklearn.svm import LinearSVC
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            'PCA__n_components': (2.**np.arange(1.5, 9, 0.5)).astype(int),
            'SVM__C': 2.**np.arange(-18, 3, 2),
        }
    elif clf_name == 'PcaRbfSvm':
        # PCA + RBF kernel SVM
        from tak.ml import PrecomputedRBFSVM
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', PrecomputedRBFSVM()),
        ])
        param_grid = {
            # n_components must be int, or scikit will complain
            'PCA__n_components': (2.**np.arange(2, 10, 1)).astype(int),
            'SVM__C': 10.**np.arange(-1, 10, 2),
            'SVM__gamma': 10.**np.arange(-12, -5, 2),
        }
    elif clf_name == 'ttestLDA':
        # ttest + LDA (for interpretability)
        from tak.ml import ttest_for_fs
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest
        LDA = _import_lda()
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'ttest__k': (2**np.arange(4, 10.5, 0.5)).astype(int)}
    else:
        raise ValueError('unknown clf_name: {!r}'.format(clf_name))

    return clf, param_grid, is_sparse
def model_training_testing(train_data_file, test_data_file, model_name,
                           lexicon_path):
    """Train an SGD linear SVM on tweet features and print its F1 scores.

    :param train_data_file: CSV file read with ``pd.read_csv``; the code uses
        its "label", "tweet_tokens" and "pos_tags" columns.
    :param test_data_file: CSV file with the same columns, used for testing.
    :param model_name: "Ngram" uses only the selected n-gram features;
        "Ngram+Lex", "Ngram+Lex+Enc" and "Custom" additionally use the
        lexicon features; "Custom" also appends the encoding and POS
        features. Any other value still reaches the scaling step, where
        ``training_X`` is undefined.
    :param lexicon_path: directory that contains the "Sentiment140-Lexicon"
        and "Hashtag-Sentiment-Lexicon" folders.

    Nothing is returned; per-class and macro F1 scores are printed.
    """
    # read data
    df_train = pd.read_csv(train_data_file)
    df_test = pd.read_csv(test_data_file)
    # change label: "objective" tweets are treated as "neutral"
    df_train["label"] = df_train["label"].replace("objective", "neutral")
    df_test["label"] = df_test["label"].replace("objective", "neutral")
    # data preprocessing
    # get number of caps before converting everything to lower case
    train_caps = np.array(df_train["tweet_tokens"].apply(get_caps)).reshape(
        df_train.shape[0], -1)
    test_caps = np.array(df_test["tweet_tokens"].apply(get_caps)).reshape(
        df_test.shape[0], -1)
    df_train["tweet_tokens"] = df_train["tweet_tokens"].apply(
        preprocessing_upper_lower).values
    df_test["tweet_tokens"] = df_test["tweet_tokens"].apply(
        preprocessing_upper_lower).values
    df_train["tweet_tokens"] = df_train["tweet_tokens"].apply(
        preprocessing_url).values
    df_test["tweet_tokens"] = df_test["tweet_tokens"].apply(
        preprocessing_url).values
    df_train["pos_tags"] = df_train["pos_tags"].fillna(value=" ")
    df_test["pos_tags"] = df_test["pos_tags"].fillna(value=" ")
    df_train["pos_tags"] = df_train["pos_tags"].apply(strip)
    df_test["pos_tags"] = df_test["pos_tags"].apply(strip)
    # prepare ngram word data
    sub_training_X, training_Y, sub_test_X, test_Y = ngram_feature(
        df_train, df_test)
    # prepare ngram char data
    char_training_X, char_test_X = char_feature(df_train, df_test)
    # concatenate them together (word n-grams followed by char n-grams)
    sub_training_X = np.concatenate((sub_training_X, char_training_X), axis=1)
    sub_test_X = np.concatenate((sub_test_X, char_test_X), axis=1)
    print("ngram done")
    # keep the 1000 best n-gram features; the selector is fitted on the
    # training data only and then applied to the test data
    feature_select = SelectKBest(f_classif, k=1000)
    sub_training_X = feature_select.fit_transform(sub_training_X, training_Y)
    sub_test_X = feature_select.transform(sub_test_X)
    # lexicon model
    if (model_name == "Ngram+Lex") or (model_name == "Ngram+Lex+Enc") or (
            model_name == "Custom"):
        # get lexicon features
        lex_emo_uni = lexicon_path + "/Sentiment140-Lexicon/Emoticon-unigrams.txt"
        lex_emo_bi = lexicon_path + "/Sentiment140-Lexicon/Emoticon-bigrams.txt"
        lex_hs_uni = lexicon_path + "/Hashtag-Sentiment-Lexicon/HS-unigrams.txt"
        lex_hs_bi = lexicon_path + "/Hashtag-Sentiment-Lexicon/HS-bigrams.txt"
        uni_train, uni_train_total = unigram_lex(df_train, lex_emo_uni)
        uni_test, uni_test_total = unigram_lex(df_test, lex_emo_uni)
        bi_train, bi_train_total = bigram_lex(df_train, lex_emo_bi)
        bi_test, bi_test_total = bigram_lex(df_test, lex_emo_bi)
        hs_uni_train, hs_uni_train_total = unigram_lex(df_train, lex_hs_uni)
        hs_uni_test, hs_uni_test_total = unigram_lex(df_test, lex_hs_uni)
        hs_bi_train, hs_bi_train_total = bigram_lex(df_train, lex_hs_bi)
        hs_bi_test, hs_bi_test_total = bigram_lex(df_test, lex_hs_bi)
        # combine the lexicon features; the n-gram features are prepended
        # later, after scaling
        # training_X = np.concatenate((training_X, uni_train), axis=1)
        training_X = uni_train
        test_X = uni_test
        training_X = np.concatenate(
            (training_X, bi_train, hs_uni_train, hs_bi_train, uni_train_total,
             bi_train_total, hs_uni_train_total, hs_bi_train_total),
            axis=1)
        test_X = np.concatenate(
            (test_X, bi_test, hs_uni_test, hs_bi_test, uni_test_total,
             bi_test_total, hs_uni_test_total, hs_bi_test_total),
            axis=1)
        print("Lex done")
    if model_name == "Ngram+Lex+Enc":
        # encoding features: hashtags, caps and exclamation features
        # NOTE(review): train_encoding / test_encoding are built here but the
        # concatenations below are commented out, so this model currently
        # trains on the same features as "Ngram+Lex" - confirm this is
        # intended.
        train_encoding = np.array(
            df_train["tweet_tokens"].apply(hashtags)).reshape(
                df_train.shape[0], -1)
        train_encoding = np.concatenate((train_encoding, train_caps), axis=1)
        train_encoding = np.concatenate(
            (train_encoding,
             np.array(df_train["tweet_tokens"].apply(exclaim)).reshape(
                 df_train.shape[0], -1)),
            axis=1)
        test_encoding = np.array(
            df_test["tweet_tokens"].apply(hashtags)).reshape(
                df_test.shape[0], -1)
        test_encoding = np.concatenate((test_encoding, test_caps), axis=1)
        test_encoding = np.concatenate(
            (test_encoding,
             np.array(df_test["tweet_tokens"].apply(exclaim)).reshape(
                 df_test.shape[0], -1)),
            axis=1)
        # train_pos, test_pos = pos_occurence(df_train, df_test)
        # training_X = np.concatenate((training_X, train_encoding), axis=1)
        # test_X = np.concatenate((test_X, test_encoding), axis=1)
        # training_X = np.concatenate((training_X, train_pos), axis=1)
        # test_X = np.concatenate((test_X, test_pos), axis=1)
        print("Enc done")
    if model_name == "Custom":
        # previous encoding
        train_encoding = np.array(
            df_train["tweet_tokens"].apply(hashtags)).reshape(
                df_train.shape[0], -1)
        train_encoding = np.concatenate((train_encoding, train_caps), axis=1)
        # additional features
        train_encoding = np.concatenate(
            (train_encoding,
             np.array(df_train["tweet_tokens"].apply(exclaim)).reshape(
                 df_train.shape[0], -1)),
            axis=1)
        # previous encoding
        test_encoding = np.array(
            df_test["tweet_tokens"].apply(hashtags)).reshape(
                df_test.shape[0], -1)
        test_encoding = np.concatenate((test_encoding, test_caps), axis=1)
        # additional features
        test_encoding = np.concatenate(
            (test_encoding,
             np.array(df_test["tweet_tokens"].apply(exclaim)).reshape(
                 df_test.shape[0], -1)),
            axis=1)
        train_pos, test_pos = pos_occurence(df_train, df_test)
        # lexicon + encoding + POS features
        training_X = np.concatenate((training_X, train_encoding), axis=1)
        test_X = np.concatenate((test_X, test_encoding), axis=1)
        training_X = np.concatenate((training_X, train_pos), axis=1)
        test_X = np.concatenate((test_X, test_pos), axis=1)
        print("Custom done")
    if model_name != "Ngram":
        # standardise the extra features with StandardScaler; the selected
        # n-gram features are prepended afterwards and stay unscaled
        scaler = preprocessing.StandardScaler().fit(training_X)
        training_X = scaler.transform(training_X)
        test_X = scaler.transform(test_X)
        training_X = np.concatenate((sub_training_X, training_X), axis=1)
        test_X = np.concatenate((sub_test_X, test_X), axis=1)
    else:
        training_X = sub_training_X
        test_X = sub_test_X
    # SGD with hinge loss, i.e. a linear SVM
    sgd = SGDClassifier(loss="hinge",
                        penalty="elasticnet",
                        l1_ratio=0.05,
                        random_state=43,
                        max_iter=6000)
    print("training Linear SVM")
    sgd.fit(training_X, training_Y)
    print("testing Linear SVM")
    predictions = sgd.predict(test_X)
    f1 = f1_score(test_Y, predictions, average='macro')
    class_score = f1_score(test_Y, predictions, average=None)
    # NOTE(review): with average=None the scores come back in sorted label
    # order; verify that indices 0/1/2 really are negative/positive/neutral
    # for the labels produced by ngram_feature.
    print("f1 score for negative is {}".format(class_score[0]))
    print("f1 score for positive is {}".format(class_score[1]))
    print("f1 score for neutral is {}".format(class_score[2]))
    print("macro f1 score is {}".format(f1))
import numpy as np
import operator
import re

# All sheets are read from the same workbook.
workbook = 'annual_maya.xlsx'
admissions_age = pd.read_excel(workbook, 'admissions_age_gr_prim_sec')
jobs = pd.read_excel(workbook, 'jobs')
admissions_gender = pd.read_excel(workbook, 'adm')
annual = pd.read_excel(workbook, 'admissioned_region_prim_sec_t77')

# Join the four sheets pairwise on their shared 'year' column.
dataset = admissions_age.merge(jobs, on='year')
dataset2 = dataset.merge(admissions_gender, on='year')
all_data = dataset2.merge(annual, on='year')

# Columns offered to the univariate selector.
predictors = [
    'yr',
    'E12000001', 'E12000002', 'E12000003',
    'E12000004', 'E12000005', 'E12000006',
    'E12000007', 'E12000008', 'E12000009',
    '2_all_ages', '2_under_16', '2_16_24', '2_25_34', '2_35_44',
    '2_45_54', '2_55_64', '2_65_74', '2_75_over', '2_unknown_age',
    'n_jobs',
    'annual_income', 'annual_income_m', 'annual_income_f',
    'full_time', 'part_time',
    'all_m_f', 'male', 'female', 'no_gender',
]

# Score every predictor against the "obese" column; fit() hands back the
# selector itself, so construction and fitting are chained.
selector = SelectKBest(f_classif, k=7).fit(all_data[predictors],
                                           all_data["obese"])

# Turn p-values into -log10(p) so that stronger predictors get taller bars.
weight = -np.log10(selector.pvalues_)

# One bar per predictor, labelled vertically with the column name.
bar_positions = range(len(predictors))
plt.bar(bar_positions, weight)
plt.xticks(bar_positions, predictors, rotation="vertical")
plt.show()
def get_best_features(x_train, y_train, x_test, k=400000):
    """Reduce both splits to the ``k`` best features by chi-squared score.

    The selector is fitted on the training split only and then applied to
    both splits, so the test split never influences which columns are kept.

    Args:
        x_train: Training feature matrix (chi2 expects non-negative values).
        y_train: Training labels aligned with the rows of ``x_train``.
        x_test: Test feature matrix with the same columns as ``x_train``.
        k: Number of features to keep, or ``"all"``. Defaults to 400000,
            the value that used to be hard-coded. If ``x_train`` exposes a
            2-D ``shape`` with fewer than ``k`` columns, every feature is
            kept instead of letting ``SelectKBest`` reject or warn about
            an out-of-range ``k``.

    Returns:
        A ``(x_train, x_test)`` tuple holding only the selected columns.
    """
    # Clamp an oversized k: small matrices would otherwise fail (or warn,
    # depending on the scikit-learn version) with the 400000 default.
    shape = getattr(x_train, "shape", None)
    if k != "all" and shape is not None and len(shape) == 2 and k > shape[1]:
        k = "all"

    select_best = SelectKBest(chi2, k=k)
    x_train = select_best.fit_transform(x_train, y_train)
    x_test = select_best.transform(x_test)
    return x_train, x_test
X_con.head() # In[19]: core = pd.concat([X_con, (np.sqrt(df.iloc[:, -1])) / 100], axis=1) core.corr(method='pearson') # In[20]: from sklearn.feature_selection import f_oneway from sklearn.feature_selection import SelectKBest # In[21]: model = SelectKBest(score_func=f_oneway, k=4) m = model.fit(X_cat, np.sqrt(df.iloc[:, -1]) / 100) # In[22]: dfscores = pd.DataFrame(m.scores_, columns=['Scores']) dfcolumns = pd.DataFrame(X_cat.columns) dfpvalues = pd.DataFrame(m.pvalues_) frc = pd.concat([dfscores, dfcolumns, dfpvalues], axis=1) frc.columns = ['Scores', 'Features', 'Pvalues'] # In[23]: frc # In[61]:
from time import time

# Imported once up front instead of on each of the 1000 loop iterations.
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score

# 1000 stratified shuffle splits of the labelled data (legacy API that takes
# the labels in the constructor and is iterated directly).
cv = StratifiedShuffleSplit(labels, 1000, random_state=42)
for train_idx, test_idx in cv:
    # Materialise the train/test subsets for this split.
    features_train = [features[ii] for ii in train_idx]
    labels_train = [labels[ii] for ii in train_idx]
    features_test = [features[jj] for jj in test_idx]
    labels_test = [labels[jj] for jj in test_idx]

    t0 = time()

    # Fit the selector on the training fold only, then apply it to both
    # folds so the test fold does not influence feature selection.
    selector = SelectKBest(f_classif, k=10)
    selector.fit(features_train, labels_train)
    features_train = selector.transform(features_train)
    features_test = selector.transform(features_test)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)

    print("Accuracy: ", accuracy_score(labels_test, pred))
    print("Precision: ", precision_score(labels_test, pred))
    print("Recall: ", recall_score(labels_test, pred))

    imp = clf.feature_importances_
    # NOTE(review): imp has one entry per feature kept by the selector, while
    # features_list2 is zipped positionally -- confirm that features_list2
    # names exactly the selected columns, in the same order.
    fea_imp = dict(zip(features_list2, imp))
    # print(...) with a single argument and dict.items() behave the same on
    # Python 2 and 3, unlike the former `print {...}` / iteritems() pair.
    print({k: v for k, v in fea_imp.items() if v > 0.1})
    print("Decision tree algorithm time:", round(time() - t0, 3), "s")
def mkchi2(k):
    """Build a ``SelectKBest`` selector that ranks features by ``chi2``.

    Args:
        k: Number of top-scoring features the selector should keep.

    Returns:
        An unfitted ``SelectKBest`` instance.
    """
    selector = SelectKBest(score_func=chi2, k=k)
    return selector
newdf_test.drop('service', axis=1, inplace=True)
print(newdf_test['label'].value_counts())

# Split the train and test frames into predictors and target.
# `axis` is passed by keyword: the positional form `drop('label', 1)` is
# deprecated since pandas 1.3 and rejected by pandas 2.x.
X_type = newdf.drop('label', axis=1)
Y_type = newdf.label
X_type_test = newdf_test.drop('label', axis=1)
Y_type_test = newdf_test.label
colNames = list(X_type)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Per the original author's note: k was tried from 1 to 120 and accuracy
# peaked at k=67.
chi2f = SelectKBest(chi2, k=67)
chi2f.fit(X_type, Y_type)

# Boolean mask over the columns -> positions of kept columns -> their names.
true = chi2f.get_support()
chicolindex_type = [i for i, x in enumerate(true) if x]
chicolname_type = [colNames[i] for i in chicolindex_type]
print('Features selected :', chicolname_type)

# Restrict both frames to the selected columns, cast to float.
features = newdf[chicolname_type].astype(float)
features1 = newdf_test[chicolname_type].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)
def test_bagging_with_pipeline():
    """Bagging over a pipeline gives the pipeline's last step an int seed."""
    # Base estimator: univariate selection followed by a decision tree.
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    bagger = BaggingClassifier(base_pipeline, max_features=2)
    bagger.fit(iris.data, iris.target)

    # Inspect the final step of the first fitted ensemble member.
    first_member = bagger[0]
    _, last_step = first_member.steps[-1]
    assert isinstance(last_step.random_state, int)
]  # You will need to use more features
# features_list = ['poi','salary', 'total_payments', 'bonus', 'total_stock_value', 'expenses',
#                  'exercised_stock_options', 'other', 'long_term_incentive',
#                  'restricted_stock', 'ratio_from_poi',
#                  'ratio_to_poi', 'shared_receipt_with_poi']  # You will need to use more features

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
# print(features)

# Min-max feature scaling
scaler = MinMaxScaler()
fea = scaler.fit_transform(features)

# Select features with the chi-squared test. Pick out the scores greater
# than 2, i.e. drop the variables expenses, other and ratio_to_poi.
# X_new holds the fitted selector, not a transformed feature matrix; only
# its per-feature scores are printed here.
# NOTE(review): k=2 is unused by this snippet (no transform is applied);
# the ">2" threshold above is applied by reading the printed scores.
X_new = SelectKBest(chi2, k=2).fit(fea, labels)
print(X_new.scores_)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf0 = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
# }}} if len(sys.argv) < 4: print("%s <A> <B> <C> <labels>\n" % sys.argv[0]) sys.exit(0) directory_A = sys.argv[1] directory_B = sys.argv[2] directory_C = sys.argv[3] A, B, C, names = read_SRM_ABC(directory_A, directory_B, directory_C) X = numpy.vstack((A, C)) Xt = numpy.hstack(([0] * len(A), [1] * len(C))) selector = SelectKBest(f_classif, k=500) selector.fit(X, Xt) X = selector.transform(X) B = selector.transform(B) pm = grid_search(X, Xt) clf = svm.SVC(kernel=pm['kernel'], C=pm['C'], gamma=pm['gamma']) clf.fit(X, Xt) Z = clf.predict(B) # Calculate accuracy if len(sys.argv) == 5 and os.path.exists(sys.argv[4]): with open(sys.argv[4], 'r') as f: lines = f.read().splitlines() d = {}