def test_boundary_case_ch2():
    """Chi2 boundary case: every selector variant should keep exactly feature 0."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    # Raw chi2 statistics and p-values for the two candidate features.
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Each heuristic, fit on the same data, must select only the first feature.
    expected_support = np.array([True, False])
    selectors = [
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ]
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
def test_select_fwe_classif():
    # The fwe heuristic should recover the informative features of an easy
    # classification problem, and SelectFwe must agree with
    # GenericUnivariateSelect in "fwe" mode.
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)

    generic = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01)
    X_r2 = generic.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    # Only the first five columns carry signal; tolerate at most one
    # disagreement with that ground truth.
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    support = univariate_filter.get_support()
    assert np.sum(np.abs(support - gtruth)) < 2
def test_select_heuristics_classif():
    """fdr/fpr/fwe heuristics should all match SelectFwe on an easy problem."""
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)

    # The five informative columns come first (shuffle=False above).
    gtruth = np.zeros(20)
    gtruth[:5] = 1

    for mode in ["fdr", "fpr", "fwe"]:
        generic = GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
        assert_array_equal(X_r, generic.fit(X, y).transform(X))
        assert_array_almost_equal(univariate_filter.get_support(), gtruth)
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="fwe", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    # Fix: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the supported spelling for a boolean dtype.
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    # Fix: replace the deprecated nose-style `assert_less` helper with a
    # plain assert — same condition, no removed test dependency.
    assert np.sum(support[5:] == 1) < 2
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Reuse the training-row mask instead of recomputing it twice.
    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[is_training, 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=0.042)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

        # TPOT's _select_fwe must reproduce sklearn's column subset exactly.
        assert np.array_equal(tpot_obj._select_fwe(training_testing_data, 0.042),
                              training_testing_data[mask_cols])
def test_select_fwe_regression():
    # The fwe heuristic should keep the five informative features of an easy
    # regression problem, and SelectFwe must agree with
    # GenericUnivariateSelect in 'fwe' mode.
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    generic = GenericUnivariateSelect(f_regression, mode='fwe', param=0.01)
    X_r2 = generic.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    # All informative features kept; at most one false positive elsewhere.
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 2
def _select_fwe(self, input_df, alpha):
    """Uses Scikit-learn's SelectFwe feature selection to filter the subset of features
    according to p-values corresponding to Family-wise error rate

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    alpha: float in the range [0.001, 0.05]
        The highest uncorrected p-value for features to keep

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the 'best' features
    """
    training_rows = input_df['group'] == 'training'
    training_features = input_df.loc[training_rows].drop(['class', 'group', 'guess'], axis=1)
    training_class_vals = input_df.loc[training_rows, 'class'].values

    # Clamp alpha into the supported window: forcing 0.001 <= alpha <= 0.05.
    alpha = max(0.001, min(0.05, alpha))

    # Nothing to select from — return the input untouched.
    if len(training_features.columns.values) == 0:
        return input_df.copy()

    with warnings.catch_warnings():
        # Ignore warnings about constant features
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=alpha)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)

    mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
    return input_df[mask_cols].copy()
dataset = sys.argv[1] preprocessor_list = [ Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)), RFE(estimator=ExtraTreesClassifier(n_estimators=100)) ] # Read the data set into memory input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42) with warnings.catch_warnings(): warnings.simplefilter('ignore')
def train_predict_and_test(model, target_name, train_features, train_labels,
                           test_features, test_labels, feature_selection=None):
    """Train `model` on the given phenotype target, predict on the test split,
    print metrics, and return (fitted_model, predictions).

    Parameters
    ----------
    model : Models enum member selecting the learner (RF, MLP, SVM, ...).
    target_name : Phenotypes enum member; determines classification vs regression.
    train_features, test_features : pandas DataFrames of input features.
    train_labels, test_labels : label Series/arrays matching the feature rows.
    feature_selection : one of None, "fwe", "kbest", "tree", "corr".
    """
    # Classification only for these two phenotypes; everything else is regression.
    classification = (target_name == Phenotypes.DIAGNOSED_ASTHMA
                      or target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL)
    # Standardize data (z-score) only for models that are scale-sensitive.
    standardized = False
    if model == Models.MLP or model == Models.SVM:
        print("Standardizing data..")
        standardized = True
        # Test split is scaled with the *training* statistics (no leakage).
        features_mean = train_features.mean()
        features_std = train_features.std()
        train_features = (train_features - features_mean) / features_std
        test_features = (test_features - features_mean) / features_std
        if not classification:
            # Regression targets are standardized too; undone before reporting.
            labels_mean = train_labels.mean()
            labels_std = train_labels.std()
            train_labels = (train_labels - labels_mean) / labels_std
            test_labels = (test_labels - labels_mean) / labels_std
    # Load optimized params (may be None if no tuning results exist).
    params = load_optimized_params(model, target_name)
    # Features selection
    feature_selector = VarianceThreshold(threshold=0).fit(
        train_features)  # Removing features with 0 variance
    train_col, test_col = train_features.columns, test_features.columns
    train_features = pd.DataFrame(feature_selector.transform(train_features),
                                  columns=train_col)
    test_features = pd.DataFrame(feature_selector.transform(test_features),
                                 columns=test_col)
    # NOTE(review): after VarianceThreshold the column labels above no longer
    # match the surviving columns if any were dropped — verify intent.
    if feature_selection == "fwe":
        print("Selecting features according to Familly Wise Error")
        # alpha = 5e-2
        alpha = 0.3
        if params is not None:
            try:
                alpha = params['transformer_alpha']
            except KeyError:
                print(
                    "Cannot find parameter alpha for FWE feature selector. Using default value"
                )
        # NOTE(review): f_regression is used here even when `classification`
        # is True (unlike the kbest branch, which switches score functions).
        features_selector = SelectFwe(f_regression,
                                      alpha=alpha).fit(train_features, train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "kbest":
        k = 150
        if params is not None:
            try:
                k = params['k']
            except KeyError:
                print("Cannot find parameter k for k-best feature selector. Using default value: k=", k)
        print("Selecting k-best features:", k)
        # Score function chosen to match the task type.
        score_func = f_regression
        if classification:
            score_func = f_classif
        features_selector = SelectKBest(score_func=score_func, k=k)
        features_selector = features_selector.fit(train_features, train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "tree":
        print("Selecting features from RF feature importance")
        clf = RandomForestRegressor(n_estimators=100).fit(
            train_features, train_labels)
        if classification:
            clf = RandomForestClassifier(n_estimators=100).fit(
                train_features, train_labels)
        features_selector = SelectFromModel(clf, prefit=True)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "corr":
        # Drop one column of every highly correlated pair (|r| > threshold).
        threshold = 0.9  # Recommended default value
        col_corr = set()
        corr_matrix = train_features.corr()
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > threshold:
                    colname = corr_matrix.columns[i]
                    col_corr.add(colname)
        train_features = train_features.drop(col_corr, axis=1)
        test_features = test_features.drop(col_corr, axis=1)
    # Oversampling for imbalanced classification (skipped for models that
    # handle imbalance differently or were tuned without it).
    if classification and model != Models.SVM and model != Models.CART and model != Models.ELASTIC:
        print("Oversampling features..")
        if target_name == Phenotypes.DIAGNOSED_ASTHMA:
            sampling_strat = 0.5
        else:
            # Per-class target counts relative to the majority class.
            sampling_strat = {
                0: np.max(np.bincount(train_labels)) // 4,
                1: np.max(np.bincount(train_labels)),
                2: np.max(np.bincount(train_labels)),
                3: np.max(np.bincount(train_labels)) // 2
            }
        oversampler = imblearn.over_sampling.RandomOverSampler(
            sampling_strategy=sampling_strat, random_state=42)
        # oversampler = imblearn.over_sampling.SMOTE(sampling_strategy=1.0,
        #                                            k_neighbors=5,
        #                                            random_state=42)
        train_features, train_labels = oversampler.fit_resample(
            train_features, train_labels)
    # Dispatch to the per-model training helper. Note `model` is rebound from
    # the enum to the fitted estimator here.
    if model == Models.RF:
        if target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL:
            # Create validation set for threshold optimization
            val_features, test_features, val_labels, test_labels = train_test_split(
                test_features, test_labels, test_size=0.5, random_state=42)
            model, predictions = _predict_rf(target_name, train_features,
                                             train_labels, val_features,
                                             val_labels)
        else:
            model, predictions = _predict_rf(target_name, train_features,
                                             train_labels, test_features,
                                             test_labels, params=params)
    elif model == Models.ELASTIC:
        model, predictions = predict_elastic_net(target_name, train_features,
                                                 train_labels, test_features,
                                                 test_labels)
    elif model == Models.XGB:
        model, predictions = _predict_xgb(target_name, train_features,
                                          train_labels, test_features,
                                          test_labels, params=params)
    elif model == Models.MLP:
        model, predictions = _predict_mlp(target_name, train_features,
                                          train_labels, test_features,
                                          test_labels, params=params)
    elif model == Models.SVM:
        model, predictions = _predict_svm(target_name, train_features,
                                          train_labels, test_features,
                                          test_labels)
    elif model == Models.CART:
        model, predictions = _predict_cart(target_name, train_features,
                                           train_labels, test_features,
                                           test_labels)
    elif model == Models.NAIVE:
        if not (classification):
            predictions = predict_naive(train_features, train_labels,
                                        test_features, test_labels)
        else:
            raise SystemExit("Cannot use naive model on classification task")
    else:
        raise SystemExit("Unkwown model:", model)
    # Destandardize results so metrics are reported in original label units.
    if standardized and not (classification):
        print("destandardize data..")
        predictions = (predictions * labels_std) + labels_mean
        test_labels = (test_labels * labels_std) + labels_mean
    # Print results
    if classification:
        print_classification_metrics(ground_truth=test_labels,
                                     predictions=predictions,
                                     num_classes=test_labels.nunique())
    else:
        print_regression_metrics(ground_truth=test_labels,
                                 predictions=predictions)
    return model, predictions
class Voting:
    """Soft-voting ensemble (KNN, SVC, random forest, naive Bayes) with an
    optional univariate feature-selection step applied before training."""

    # NOTE(review): not referenced anywhere in this class body — presumably
    # consumed elsewhere (e.g. by the KNN wrapper); verify before removing.
    __n_neighbors = 7
    # Catalogue of feature-selection strategies keyed by
    # "<heuristic>_<score-func>_<param>" names.
    # NOTE(review): the "fnr_*" entries are backed by SelectFdr (false
    # discovery rate), not a false-negative-rate selector — confirm naming.
    __feature_methods = {
        "k_chi_3": SelectKBest(chi2, k=3),
        "k_chi_4": SelectKBest(chi2, k=4),
        "k_chi_5": SelectKBest(chi2, k=5),
        "k_fclassif_3": SelectKBest(f_classif, k=3),
        "k_fclassif_4": SelectKBest(f_classif, k=4),
        "k_fclassif_5": SelectKBest(f_classif, k=5),
        "k_mutual_3": SelectKBest(mutual_info_classif, k=3),
        "k_mutual_4": SelectKBest(mutual_info_classif, k=4),
        "k_mutual_5": SelectKBest(mutual_info_classif, k=5),
        "fpr_chi_01": SelectFpr(chi2, alpha=0.1),
        "fpr_chi_005": SelectFpr(chi2, alpha=0.05),
        "fpr_chi_001": SelectFpr(chi2, alpha=0.01),
        "fpr_fclassif_01": SelectFpr(f_classif, alpha=0.1),
        "fpr_fclassif_005": SelectFpr(f_classif, alpha=0.05),
        "fpr_fclassif_001": SelectFpr(f_classif, alpha=0.01),
        "fnr_chi_01": SelectFdr(chi2, alpha=0.1),
        "fnr_chi_005": SelectFdr(chi2, alpha=0.05),
        "fnr_chi_001": SelectFdr(chi2, alpha=0.01),
        "fnr_fclassif_01": SelectFdr(f_classif, alpha=0.1),
        "fnr_fclassif_005": SelectFdr(f_classif, alpha=0.05),
        "fnr_fclassif_001": SelectFdr(f_classif, alpha=0.01),
        "fwe_chi_01": SelectFwe(chi2, alpha=0.1),
        "fwe_chi_005": SelectFwe(chi2, alpha=0.05),
        "fwe_chi_001": SelectFwe(chi2, alpha=0.01),
        "fwe_fclassif_01": SelectFwe(f_classif, alpha=0.1),
        "fwe_fclassif_005": SelectFwe(f_classif, alpha=0.05),
        "fwe_fclassif_001": SelectFwe(f_classif, alpha=0.01),
    }

    def get_feature_method_names(self):
        """Return the names of all available feature-selection methods."""
        return self.__feature_methods.keys()

    def learn(self, feature_method_name: str = None) -> List[int]:
        """Fit the voting ensemble on the training data and predict the test set.

        If `feature_method_name` is given, the corresponding selector is fit
        on the training data and applied to both train and test arrays first.
        Returns the predicted labels as a plain list.
        """
        labels, train_array, test_array = LearnUtils.get_learn_data()
        if feature_method_name is not None:
            feature_filter = self.__feature_methods[feature_method_name]
            feature_filter.fit(train_array, labels)
            train_array = feature_filter.transform(train_array)
            test_array = feature_filter.transform(test_array)
        clf = self.__create_classifier()
        clf.fit(train_array, labels)
        return clf.predict(test_array).tolist()

    def cross_validation(self) -> List[float]:
        """Return 6-fold cross-validation scores for the ensemble."""
        labels, train_array = LearnUtils.get_cross_val_data()
        clf = self.__create_classifier()
        return cross_val_score(clf, train_array, labels, cv=6)

    def __create_classifier(self):
        """Assemble the soft-voting classifier from the four base learners."""
        knn = KNN().get_classifier()
        svc = SVCMethod().get_classifier()
        random_forest = RandomForest().get_classifier()
        bayes = Bayes().get_classifier()
        return VotingClassifier(estimators=[("knn", knn), ("svc", svc),
                                            ("rf", random_forest),
                                            ("bayes", bayes)],
                                voting="soft")
def GetAllPerf (filePaths=None):
    """For each trainingSetFeatures.csv found, run feature selection, grid-search
    the best models for accuracy and f1, cross-validate them, and write a summary
    row per file to OutputData.tsv.

    filePaths: optional list of feature-file paths; discovered under ./test_seq
    when omitted.
    """
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))
    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)
    # One result row per input file; columns are filled in as we go.
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy','Accuracy_SD',
                                    'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
                                    'LargestClassPercent','Classes',
                                    # 'TopRFE-Features','Best (f1) Model parameters',
                                    '# Classes',
                                    'Array-Acc-Scores' ,'Array-f1-Scores' ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])
    #redDict holds results for each file/class, for saving to output-file

    i=-1
    for filePath in filePaths:
        i +=1
        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1
        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"
        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath) # X, y = features, labels
        print(X.shape,"= (samples, features)")
        # Class balance: share of the most common class, as a percentage.
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        # NOTE(review): chained indexing on a DataFrame (resDict.Col[name]=...)
        # triggers pandas SettingWithCopy warnings on modern versions.
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)
        KFilt=None
        KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.
        # Pre-filter to the K best features, then drop features failing the
        # family-wise-error test; featureNames is kept in sync with X.
        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]
        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]
        print("X reduced to K best features: ",X.shape)
        # Optional (disabled) alternative feature-selection strategies.
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False
        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5, sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1
        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''
        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'
        # Optional (disabled) recursive feature elimination.
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False
        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)
            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)
            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)
        'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False
        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'
        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()
        "Make custom F1 scorer. May not have fixed problem!"
        # NOTE(review): `sklearn.metrics.score` is not a real module path —
        # make_scorer lives in sklearn.metrics; this import likely raises
        # ImportError at runtime. Confirm against the sklearn version in use.
        from sklearn.metrics.score import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                                greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none
        # Baseline: a most-frequent-class dummy classifier for reference scores.
        # print("Dummy classifiers output:")
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))
        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()
        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        ## resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2
        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        # Grid-search two winners: one optimizing f1 (custom scorer), one accuracy.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)
        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)
        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')
        "Modified to get 2 estimators"
        # Cross-validate both winners with stratified shuffle splits.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y,
                                     cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
                                     n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y,
                                    cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
                                    n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
        # Record the cross-validation results for this file.
        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))
        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst
        print()
        # print(fileName," Done")
    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
feature_cols=np.array(feature_cols) # In[ ]: X=df[feature_cols].values y=df.classname.values # In[ ]: le = LabelEncoder() y = le.fit_transform(y) # In[ ]: print("Orig X -> ",X.shape) Fwe = SelectFwe(alpha=0.001).fit(X,y) X=Fwe.transform(X) print("F-test -> ",X.shape) feature_cols=feature_cols[Fwe.get_support()] # In[ ]: rf = RandomForestClassifierWithCoef(max_depth= 9, min_samples_split= 3, min_samples_leaf= 3, n_estimators= 650, n_jobs= -1, max_features= "auto") # In[ ]: scores = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.3)) print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2)) # scores_f1 = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=10,test_size=0.22),scoring='f1') # print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8185185185185185
# FWE feature filter followed by an extra-trees classifier.
feature_filter = SelectFwe(score_func=f_classif, alpha=0.026000000000000002)
classifier = ExtraTreesClassifier(bootstrap=False,
                                  criterion="gini",
                                  max_features=0.35000000000000003,
                                  min_samples_leaf=3,
                                  min_samples_split=16,
                                  n_estimators=100)
exported_pipeline = make_pipeline(feature_filter, classifier)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def train_classi(model_name, inputs, X_pos, y_pos, X, y, X_neg, y_neg):
    """Train a classifier (sklearn/TPOT pipeline or Keras NN) and report
    training accuracy on the full, positive, and negative example sets.

    Parameters
    ----------
    model_name : str key selecting a tpot pipeline variant ("tpot_select",
        "SVM", "estimator_SVM", "log_reg", "extra_trees", else random forest).
    inputs : dict with at least 'model_type', 'dir_nm', 'out_comp_nm'.
    X, y : full training examples and labels.
    X_pos, y_pos / X_neg, y_neg : positive / negative subsets of the data.

    Returns
    -------
    (model, scaler): the fitted model and the StandardScaler used for the
    "NN" path (None otherwise).
    """
    scaler = None
    model_type = inputs['model_type']
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']
    if (model_type == "tpot"):
        logging_info("Training model... %s", str(model_type))
        from sklearn.pipeline import make_pipeline
        if (model_name == "tpot_select"):
            clf = tpot_classi(inputs)
        elif (model_name == "SVM"):
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.preprocessing import StandardScaler
            #from sklearn.svm import LinearSVC
            from sklearn.svm import SVC
            # Pipeline from tpot
            #clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
            # Cross validate with C vals - default is 1
            # LinearSVC does not have a predict_proba function
            clf = make_pipeline(
                StandardScaler(),
                SVC(kernel='linear', probability=True, random_state=0,
                    tol=1e-5))
        elif (model_name == "estimator_SVM"):
            from sklearn.ensemble import GradientBoostingClassifier
            from sklearn.feature_selection import SelectFwe, f_classif
            from sklearn.linear_model import LogisticRegression
            from sklearn.pipeline import make_pipeline, make_union
            #from sklearn.svm import LinearSVC
            from tpot.builtins import StackingEstimator
            from xgboost import XGBClassifier
            # Score on the training set was:0.968003998605
            #clf = make_pipeline(StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=9, max_features=0.05, min_samples_leaf=2, min_samples_split=17, n_estimators=100, subsample=1.0)),SelectFwe(score_func=f_classif, alpha=0.02),StackingEstimator(estimator=LogisticRegression(C=1.0, dual=True, penalty="l2")),StackingEstimator(estimator=XGBClassifier(learning_rate=0.001, max_depth=7, min_child_weight=16, n_estimators=100, nthread=1, subsample=0.65)),LinearSVC(C=1.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.001))
            # Same stacked pipeline as above but ending in a probabilistic SVC
            # (LinearSVC has no predict_proba).
            clf = make_pipeline(
                StackingEstimator(
                    estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                         max_depth=9,
                                                         max_features=0.05,
                                                         min_samples_leaf=2,
                                                         min_samples_split=17,
                                                         n_estimators=100,
                                                         subsample=1.0)),
                SelectFwe(score_func=f_classif, alpha=0.02),
                StackingEstimator(estimator=LogisticRegression(
                    C=1.0, dual=True, penalty="l2")),
                StackingEstimator(estimator=XGBClassifier(learning_rate=0.001,
                                                          max_depth=7,
                                                          min_child_weight=16,
                                                          n_estimators=100,
                                                          nthread=1,
                                                          subsample=0.65)),
                SVC(kernel='linear', probability=True, C=1.0, tol=0.001))
        elif (model_name == "log_reg"):
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.ensemble import ExtraTreesClassifier
            from sklearn.linear_model import LogisticRegression
            from tpot.builtins import StackingEstimator, ZeroCount
            # Pipeline from tpot
            # Score on humap was:0.986160063433
            clf = make_pipeline(
                ZeroCount(),
                StackingEstimator(
                    estimator=ExtraTreesClassifier(bootstrap=False,
                                                   criterion="entropy",
                                                   max_features=0.6,
                                                   min_samples_leaf=4,
                                                   min_samples_split=6,
                                                   n_estimators=100)),
                LogisticRegression(C=15.0, dual=False, penalty="l2"))
        elif (model_name == "extra_trees"):
            from sklearn.ensemble import ExtraTreesClassifier
            from tpot.builtins import StackingEstimator
            from sklearn.pipeline import make_pipeline, make_union
            from sklearn.preprocessing import Normalizer
            from sklearn.preprocessing import FunctionTransformer
            from copy import copy
            # Score on the training set was:0.948305771055
            clf = make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    make_pipeline(
                        StackingEstimator(estimator=ExtraTreesClassifier(
                            bootstrap=False,
                            criterion="gini",
                            max_features=0.25,
                            min_samples_leaf=8,
                            min_samples_split=11,
                            n_estimators=100)),
                        Normalizer(norm="l1"))),
                StackingEstimator(
                    estimator=ExtraTreesClassifier(bootstrap=False,
                                                   criterion="entropy",
                                                   max_features=0.75,
                                                   min_samples_leaf=15,
                                                   min_samples_split=18,
                                                   n_estimators=100)),
                ExtraTreesClassifier(bootstrap=True,
                                     criterion="entropy",
                                     max_features=0.85,
                                     min_samples_leaf=5,
                                     min_samples_split=4,
                                     n_estimators=100))
        else:  # Random forest
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.feature_selection import VarianceThreshold
            from sklearn.preprocessing import PolynomialFeatures
            # Pipeline from tpot
            # Score on humap was:0.986160063433
            clf = make_pipeline(
                VarianceThreshold(threshold=0.05),
                PolynomialFeatures(degree=2, include_bias=False,
                                   interaction_only=False),
                RandomForestClassifier(bootstrap=False,
                                       criterion="entropy",
                                       max_features=0.35,
                                       min_samples_leaf=1,
                                       min_samples_split=11,
                                       n_estimators=100))
        clf.fit(X, y)
        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        #Training accuracy
        acc_overall_train = clf.score(X, y)
        acc_pos_train = clf.score(X_pos, y_pos)
        acc_neg_train = clf.score(X_neg, y_neg)
        res_pos = clf.predict(X_pos)
        res = clf.predict(X_neg)
        n_pos = len(X_pos)
        n_neg = len(X_neg)
        acc, acc_neg, Recall, Precision, F1_score = calc_metrics(
            res, res_pos, n_neg, n_pos)
        analyze_sizewise_accuracies(
            X_pos, res_pos, X_neg, res,
            out_comp_nm + '_size_wise_accuracies_train.png')
        # Average precision from the predicted positive-class probabilities.
        train_fit_probs = clf.predict_proba(X)[:, 1]
        train_aps = sklearn_metrics_average_precision_score(y, train_fit_probs)
        with open(out_comp_nm + '_metrics.out', "a") as fid:
            print("Training set average precision score = %.3f" % train_aps,
                  file=fid)
        model = clf
        # Persist decision-function scores when the model exposes one.
        if hasattr(model, 'decision_function'):
            score = model.decision_function(X_neg)
            np_savetxt(out_comp_nm + '_train_neg_score.out', score)
            score = model.decision_function(X_pos)
            np_savetxt(out_comp_nm + '_train_pos_score.out', score)
    elif (model_type == "NN"):
        # Standardizing the feature matrix
        from sklearn import preprocessing
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)
        # Scaling X_pos and X_neg as well now for testing with them later
        X_pos = scaler.transform(X_pos)
        X_neg = scaler.transform(X_neg)
        import tensorflow as tf
        from tensorflow import keras
        #tf.enable_eager_execution() # Fix ensuing errors
        logging_info("Training model... %s", str(model_type))
        # multi-layer perceptron
        #for most problems, one could probably get decent performance (even without a second optimization step) by setting the hidden layer configuration using just two rules: (i) number of hidden layers equals one; and (ii) the number of neurons in that layer is the mean of the neurons in the input and output layers.
        print()
        dims = X.shape
        n_feats = dims[1]
        n_classes = 2
        logging_info("No. of nodes in input layer = %s", str(n_feats))
        logging_info("No. of nodes in output layer (since softmax) = %s",
                     str(n_classes))
        # One hidden layer, sized as the mean of input and output widths.
        hidden_nodes = int((n_feats + n_classes) / 2)
        logging_info("No. of nodes in the one hidden layer = %s",
                     str(hidden_nodes))
        model = keras.Sequential([
            keras.layers.Dense(n_feats, activation=tf.nn.relu),
            keras.layers.Dense(hidden_nodes, activation=tf.nn.relu),
            keras.layers.Dense(n_classes, activation=tf.nn.softmax)
        ])
        #model = keras.Sequential([keras.layers.Dense(n_feats, activation = tf.nn.relu), keras.layers.Dense(n_classes, activation = tf.nn.softmax)])
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        N_epochs = 1000
        model.fit(X, y, epochs=N_epochs, verbose=0)
        with open(out_comp_nm + '_metrics.out', "a") as fid:
            print("No. of epochs = ", N_epochs, file=fid)
        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        loss_overall, acc_overall_train = model.evaluate(X, y, verbose=0)
        loss_pos, acc_pos_train = model.evaluate(X_pos, y_pos, verbose=0)
        loss_neg, acc_neg_train = model.evaluate(X_neg, y_neg, verbose=0)
    else:
        print("Model type not found")
    logging_info("Finished Evaluating training accuracy.")
    # NOTE(review): Precision/Recall/F1_score (and model/acc_* on the fallback
    # path) are only bound in the "tpot" branch; the "NN" and unknown-type
    # paths will raise NameError here — confirm intended usage.
    with open(out_comp_nm + '_metrics.out', "a") as fid:
        print("Accuracy overall train = %.3f" % acc_overall_train, file=fid)
        print("Accuracy positive train = %.3f" % acc_pos_train, file=fid)
        print("Accuracy negative train = %.3f" % acc_neg_train, file=fid)
        print("Train Precision = %.3f" % Precision, file=fid)
        print("Train Recall = %.3f" % Recall, file=fid)
        print("Train F1 score = %.3f" % F1_score, file=fid)
    return model, scaler
from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline import itertools dataset = sys.argv[1] preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)), RFE(estimator=ExtraTreesClassifier(n_estimators=100))] # Read the data set into memory input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42) with warnings.catch_warnings(): warnings.simplefilter('ignore') for (preprocessor, C, loss, fit_intercept) in itertools.product( preprocessor_list, [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1., 10., 50., 100.], ['hinge', 'squared_hinge'], [True, False]): features = input_data.drop('class', axis=1).values.astype(float)
from sklearn.preprocessing import Normalizer from sklearn.svm import LinearSVR from tpot.builtins import StackingEstimator from xgboost import XGBRegressor # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:-15.203873985130153 exported_pipeline = make_pipeline( SelectFwe(score_func=f_regression, alpha=0.026000000000000002), StackingEstimator(estimator=LinearSVR(C=10.0, dual=False, epsilon=0.001, loss="squared_epsilon_insensitive", tol=0.01)), StackingEstimator( estimator=GradientBoostingRegressor(alpha=0.75, learning_rate=0.001, loss="quantile", max_depth=2, max_features=0.35000000000000003, min_samples_leaf=15, min_samples_split=17, n_estimators=100, subsample=0.7500000000000001)),
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
# Fixed seed keeps the train/test partition reproducible.
split = train_test_split(features, tpot_data['target'], random_state=42)
training_features, testing_features, training_target, testing_target = split

# Average CV score on the training set was: 0.3656802383316783
# One union branch does FWE filtering -> one-hot encoding -> percentile
# filtering; the other passes the raw features through unchanged (copy).
feature_branch = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.008),
    OneHotEncoder(minimum_fraction=0.1),
    SelectPercentile(score_func=f_classif, percentile=13),
)
exported_pipeline = make_pipeline(
    make_union(feature_branch, FunctionTransformer(copy)),
    LinearSVC(C=0.1, dual=False, loss="squared_hinge", penalty="l2", tol=0.01),
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
] for eachDataset in datasetNames: print eachDataset X_sparse, y_full = load_svmlight_file("OriginalDatasets/" + eachDataset) X_full = X_sparse.toarray() methodsNames = ["full", "pca", "chi2", "fs", "et", "fpr", "fdr", "fwe"] methodsFS = [ SelectPercentile(chi2, percentile=100), PCA(n_components=0.9), SelectPercentile(chi2, percentile=75), SelectPercentile(f_classif, percentile=75), ExtraTreesClassifier(random_state=0), SelectFpr(), SelectFdr(), SelectFwe() ] for x in range(0, len(methodsFS)): fsm = methodsFS[x] fsm.fit(X_full, y_full) X_redu = fsm.transform(X_full) #Some algorithms fail and select 0 features, lets fix that if len(X_redu[0]) < 3: tmpMethod = SelectKBest(chi2, k=3) tmpMethod.fit(X_full, y_full) X_redu = tmpMethod.transform(X_full) fileOut = open( "ReducedDatasets/" + eachDataset + "_" + methodsNames[x], 'wb') dump_svmlight_file(X_redu, y_full, fileOut, zero_based=False)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values

# Fixed seed keeps the train/test partition reproducible.
split = train_test_split(features, tpot_data['target'].values, random_state=42)
training_features, testing_features, training_target, testing_target = split

# Score on the training set was: 0.9822176245306669
steps = [
    ZeroCount(),                                   # append zero/non-zero counts per row
    SelectFwe(score_func=f_classif, alpha=0.015),  # drop features failing FWE-corrected F-test
    KNeighborsClassifier(n_neighbors=5, p=2, weights="uniform"),
]
exported_pipeline = make_pipeline(*steps)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.neural_network import MLPClassifier from sklearn.pipeline import make_pipeline, make_union from tpot.builtins import StackingEstimator # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'], random_state=None) # Average CV score on the training set was: 0.8255806224382443 exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=91), StackingEstimator( estimator=MLPClassifier(alpha=0.0001, learning_rate_init=0.001)), StackingEstimator( estimator=GradientBoostingClassifier(learning_rate=0.01, max_depth=7, max_features=0.6000000000000001, min_samples_leaf=15, min_samples_split=4, n_estimators=100, subsample=0.05)), SelectFwe(score_func=f_classif, alpha=0.035), MultinomialNB(alpha=0.1, fit_prior=False)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import MaxAbsScaler, RobustScaler from tpot.builtins import StackingEstimator, ZeroCount from xgboost import XGBRegressor from sklearn.preprocessing import FunctionTransformer from copy import copy # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Average CV score on the training set was:-148.31589276097782 exported_pipeline = make_pipeline( make_union( make_pipeline(MaxAbsScaler(), RobustScaler(), ZeroCount(), SelectFwe(score_func=f_regression, alpha=0.038)), FunctionTransformer(copy)), XGBRegressor(learning_rate=0.1, max_depth=9, min_child_weight=15, n_estimators=100, nthread=1, subsample=1.0)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
filename = "../training_data/ordered_tweets_no_duplicates.txt" tweets_and_labels = parse_labeled_data(filename) # print tweets_and_labels # random.shuffle(tweets_and_labels) Y, X = get_x_y(tweets_and_labels) # X, Y = make_moons(noise=0.3, random_state=0) # print X, Y # print nX[0], nY[0] # splitting training and test set x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42) # C = regularization parameter (keeps from overfitting): C is the degree of penalty (L1 or L2) (powers of 10) # penalty sparse = l2 lowers angle so that no unigram can be super weighted, l1 removes features to shift the curve # TODO: separate into train test eval fs = SelectFwe(alpha=700.0) print "Before", x_train.shape clf = svm.LinearSVC(C=100, penalty="l2", dual=False) clf.fit(x_train, y_train) print "NO FEATURE SELECTION" print "Training Accuracy" print clf.decision_function(x_train) print (classification_report(y_train, clf.predict(x_train), target_names=target_names)) print "Testing Accuracy" print (classification_report(y_test, clf.predict(x_test), target_names=target_names)) x_train = fs.fit_transform(x_train, y_train)
'TODO: Allow user to select desired function - CV model, or feature reduction' 'TODO: Use os.path.join - for file names/locations/dirs..' #Set by user input: fileName = r'/trainingSetFeatures.csv' filePath = str(argv[1]) X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels print(X.shape,"= (samples, features)") y_inv = Counter(lb_encoder.inverse_transform(y)) print("Classes:", y_inv) # 'Normalize/Scale features if needed. Our data is standardized by default' # X = StandardScaler(copy=False).fit_transform(X) Fwe = SelectFwe(alpha=0.01).fit(X,y) X=Fwe.transform(X) featureNames=featureNames[Fwe.get_support()] print("F-test filter ->",X.shape) FeatSelection_SVM=True FeatSelection_RandLogReg=False if FeatSelection_RandLogReg == True: LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5, sample_fraction=0.8, n_resampling=60, selection_threshold=0.2,n_jobs=-1) X = LogRegFeats.fit_transform(X,y) featureNames=featureNames[LogRegFeats.get_support()] print("RandomizedLogisticRegression Feature Selection ->:",X.shape) elif FeatSelection_SVM == True:
import seaborn as sns from sklearn.preprocessing import LabelEncoder from PipeTasks import Get_yPred,balance_weights # Import some data to play with ######################################### os.chdir(r'/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile') ##os.chdir(r'/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SP_Cleaved+NP+Neg_Big') df = pd.read_csv('trainingSetFeatures.csv') ## df.drop('proteinname',axis=1, inplace=True) feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']] X=df[feature_cols].values y=df.classname.values Fwe = SelectFwe(alpha=0.01).fit(X,y) X=Fwe.transform(X) le = LabelEncoder() y = le.fit_transform(y) # Binarize the output # y = label_binarize(y, classes=[0, 1, 2]) # y = label_binarize(y) ##n_classes = y.shape[1] n_classes=len(set(y)) target_names=list(le.classes_) print ("n_classes",n_classes,"target_names",target_names) # shuffle and split training and test sets ##X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, ## random_state=0)
model_fs = SelectKBest(f_classif, k=15).fit(x_train, y_train) # grid search for the parameter #-- method 2-2 SelectFdr: f_classif if method == 'select_fdr_f_classif': from sklearn.feature_selection import SelectFdr from sklearn.feature_selection import f_classif model_fs = SelectFdr(f_classif, alpha=1e-7).fit( x_train, y_train) # grid search for the parameter #-- method 2-3 SelectFwe: f_classif if method == 'select_fwe_f_classif': from sklearn.feature_selection import SelectFwe from sklearn.feature_selection import f_classif model_fs = SelectFwe(f_classif, alpha=0.0001).fit(x_train, y_train) #-- method 3 RFECV: SVC if method == 'rfecv_svc': from sklearn.feature_selection import RFECV from sklearn.svm import SVC svc = SVC(kernel="linear") model_fs_pre = RFECV(estimator=svc, step=1, cv=5) model_fs = model_fs_pre.fit(x_train, y_train) #-- method 4-1 select from model: LinearSVC (L1-based) if method == 'select_from_model_linear_svc': from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel model_fs_pre = LinearSVC(C=0.01, penalty="l1", dual=False) # grid search for the parameter
def GetAllPerf (filePaths=None): if filePaths is None: filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv')) #Sanity check: # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile'] # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv'] print("FilePaths: \n",filePaths) fileNames=fileNameFromPaths (filePaths) print("FileNames:",fileNames) resDict = pd.DataFrame(index=fileNames, columns=['Accuracy','Accuracy_SD', 'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1', 'LargestClassPercent','Classes', # 'TopRFE-Features','Best (f1) Model parameters', '# Classes', 'Array-Acc-Scores' ,'Array-f1-Scores' ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted']) #redDict holds results for each file/class, for saving to output-file i=-1 for filePath in filePaths: i +=1 'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/' filePath = os.path.normpath(filePath) print(filePath) fileName=str(fileNames[i]) #Str added now 14.1 print("fileName: %s" %(fileName)) "resDict['Name']= fileName" # filePath = str(argv[1]) # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels print(X.shape,"= (samples, features)") y_inv = Counter(lb_encoder.inverse_transform(y)) MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1) print("Classes:", lb_encoder.classes_) print("MajorityClassPercent:", MajorityPercent) resDict.LargestClassPercent[fileName] = MajorityPercent resDict.Classes[fileName] = str(lb_encoder.classes_) resDict["# Classes"][fileName]=len(lb_encoder.classes_) KFilt=None KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself. 
if KFilt is not None: k = SelectKBest(k=KFilt).fit(X,y) X=k.transform(X) featureNames=featureNames[k.get_support()] Fwe = SelectFwe(alpha=0.01).fit(X,y) X=Fwe.transform(X) featureNames=featureNames[Fwe.get_support()] print("X reduced to K best features: ",X.shape) FeatSelection_SVM=False #Feature Names need updating!! FeatSelection_RandLogReg=False if FeatSelection_RandLogReg == True: LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5, sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y) X_L1 = LogRegFeats.transform(X) featureNames=featureNames[LogRegFeats.get_support()] print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape) elif FeatSelection_SVM == True: svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y) X_L1 = svc_L1.transform(X, y) featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))] print ("L1 SVM Transformed X:",X_L1.shape) # X=X_L1 ''' print("Performance as a function of percent of features used:") PlotPerfPercentFeatures(X,y,est=LinearSVC()) ''' 'EG - graph best features; feature selection using RF, ensemble classifiers..' 'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb' RFE_FeatsToKeep = 16 FeatSelection_RFE=False FeatSelection_RFECV=False if (FeatSelection_RFE or FeatSelection_RFECV) == True: 'RFE + - best feats' 'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html ' svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False) # svc = LogisticRegression(class_weight='auto')#,C=1) if FeatSelection_RFECV==True: rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision') # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3)) #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..." 
else: rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03) rfecv.fit(X, y) if FeatSelection_RFECV==True: print("RFE-CV selected %d features : " % (rfecv.n_features_)) print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) ) rfe_featnames = featureNames[rfecv.get_support()] featureNames = featureNames[rfecv.get_support()] print("RFE selected feature names:",rfe_featnames) X_RFE = rfecv.fit_transform(X, y) print("X_RFE",X_RFE.shape) resDict['TopRFE-Features'][fileName]=str(rfe_featnames) 'Set GetRFEPerf To true or by user, if perf. of reduced set wanted' GetRFEPerf=False # print("lb_encoder.classes_",lb_encoder.classes_) 'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb ' 'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/' 'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators' "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html" print() "Make custom F1 scorer. May not have fixed problem!" from sklearn.metrics.score import make_scorer f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none # print("Dummy classifiers output:") dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0) y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent) dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred )) dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted')) dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred)) #Get from ALL classes f1.. 
dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean() # print("Dummy, most frequent acc:",dummy_freq_acc) # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0) # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y))) # 'print("Dummy, Stratified Random:",dummy_strat2)' print() resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc ## resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted # resDict.dummy_Stratfreq[fileName]=dummy_strat2 "We can get seperately the best model for Acc, and the best for f1!" "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1') "Temporary workaround until next SKlearn update of F1 metric:" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer) bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy') print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1) print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc) #Temp # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1) if GetRFEPerf==True: bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1') "Modified to get 2 estimators" scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2)) scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1') print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), 
scores_f1.std() * 2)) resDict['Accuracy'][fileName]=round(scores_acc.mean(),4) resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4) resDict['f1'][fileName]=round(scores_f1.mean(),4) resDict['f1_SD'][fileName]=round(scores_f1.std(),4) resDict['Array-f1-Scores'][fileName]=(scores_f1) resDict['Array-Acc-Scores'][fileName]=(scores_acc) resDict['bestML-f1'][fileName]=(str(bestEst_f1)) resDict['bestML-Acc'][fileName]=(str(bestEst_acc)) #ORIG # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15) # resDict['Accuracy'][fileName]=round(Acc,4) # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4) # resDict['f1 score'][fileName]=round(f1,4) # resDict['f1_SD'][fileName]=round(f1_SD,4) # resDict['Best (f1) Model parameters'][fileName]= bestEst print() # print(fileName," Done") print("Saving results to file") resDict.to_csv("OutputData.tsv", sep=',')
def run():
    # Classify tweets as being about "Self" / "Another Person" / a
    # "General Statement", write the first two groups to dated CSV files,
    # and optionally plot the results.
    # Python 2 code (print statements). Relies on module-level globals:
    # `filename`, `testdata`, and the parse/feature/graph helpers — none
    # of which are visible here; presumably set up by the importing script.
    target_names = ["Self", "Another Person", "General Statement"]
    tweets_and_labels = parse_labeled_data(filename)
    # splitting training and test set
    # NOTE(review): unpack order (y_train, x_test, x_train) looks unusual —
    # confirm against get_x_y's actual return signature.
    y_train, x_test, x_train = get_x_y(tweets_and_labels, testdata)
    # Chi-Squared Analysis: keep the top 80% of features by chi2 score.
    sel = SelectPercentile(chi2, percentile=80)
    sel.fit(x_train, y_train)
    x_train = sel.transform(x_train)
    x_test = sel.transform(x_test)
    # Univariate Feature Selection (family-wise error rate criterion).
    fs = SelectFwe(alpha=150.0)
    x_train = fs.fit_transform(x_train, y_train)
    x_test = fs.transform(x_test)
    # Classifier Fitting: linear SVM trained on the reduced feature set.
    clf = svm.LinearSVC(C=10, penalty='l2', loss='l1', dual=True,
                        fit_intercept=False, class_weight='auto')
    clf.fit(x_train, y_train)
    returned = clf.predict(x_test)
    print returned
    # Print relevant usernames & tweets to .csv file, one file per class,
    # stamped with today's date.
    t = time.strftime("%d_%m_%Y")
    output1 = 'classifications/' + t + '_self.csv'
    output2 = 'classifications/' + t + '_another_person.csv'
    with open(output1, 'w+') as o1:
        wr = csv.writer(o1, quoting=csv.QUOTE_ALL)
        for i, val in enumerate(returned):
            if val == 0:  # class 0 == "Self"
                row = [testdata[i][1], testdata[i][0]]
                wr.writerow(row)
    with open(output2, 'w+') as o2:
        wr = csv.writer(o2, quoting=csv.QUOTE_ALL)
        for i, val in enumerate(returned):
            if val == 1:  # class 1 == "Another Person"
                row = [testdata[i][1], testdata[i][0]]
                wr.writerow(row)
    ########################################################################
    '''Graphing of Data'''
    '''Note, since there is no annotation for test data'''
    '''This is a visual representation of output data, not model accuracy'''
    ########################################################################
    graph = True
    if (graph):
        # Graph setup
        X, Y, Z, new_y = graph_setup(clf, x_test, returned)
        # graph Scatter Plot of training data
        graph_scatter(x_train, y_train)
        # Graph 3D Plot of test data
        graph_3d(X, Y, Z, new_y)
        # Graph 2-D Plot of test data
        graph_2d(X, Y, new_y)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values

# Fixed seed keeps the train/test partition reproducible.
split = train_test_split(features, tpot_data['target'].values, random_state=42)
training_features, testing_features, training_target, testing_target = split

# Score on the training set was: 0.9147335715485821
steps = [
    SelectFwe(score_func=f_classif, alpha=0.023),  # FWE-corrected F-test filter
    MinMaxScaler(),                                # scale surviving features to [0, 1]
    LogisticRegression(C=1.0, dual=False, penalty="l1"),
]
exported_pipeline = make_pipeline(*steps)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer, RobustScaler
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values

# Fixed seed keeps the train/test partition reproducible.
split = train_test_split(features, tpot_data['target'].values, random_state=42)
training_features, testing_features, training_target, testing_target = split

# Score on the training set was: -1.969947855356932
# Two rounds of (scale -> stacked RidgeCV -> stacked shallow tree),
# separated by an FWE feature filter, ending in a final RidgeCV.
first_tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=8,
                                   min_samples_split=5)
second_tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=2,
                                    min_samples_split=11)
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=85),
    Normalizer(norm="l2"),
    RobustScaler(),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=first_tree),
    SelectFwe(score_func=f_regression, alpha=0.014),
    RobustScaler(),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=second_tree),
    RidgeCV(),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
# Stratified 75/25 split of the row index (legacy sklearn API).
training_indices, testing_indices = train_test_split(
    tpot_data.index, stratify=tpot_data['class'].values,
    train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Candidate features exclude the label and bookkeeping columns.
training_features = result1.loc[training_indices].drop(
    ['class', 'group', 'guess'], axis=1)
training_class_vals = result1.loc[training_indices, 'class'].values

if len(training_features.columns.values) > 0:
    # Keep only the columns surviving the FWE-corrected F-test,
    # plus the label column itself.
    selector = SelectFwe(f_classif, alpha=0.05)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result1 = result1[mask_cols]
else:
    # No feature columns at all — pass the frame through unchanged.
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
# (k is capped by the number of training rows).
knnc2 = KNeighborsClassifier(n_neighbors=min(8, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values,
          result1.loc[training_indices, 'class'].values)

result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(
    result2.drop('class', axis=1).values)
from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator from sklearn.preprocessing import FunctionTransformer from copy import copy # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Average CV score on the training set was:0.8551331638356954 exported_pipeline = make_pipeline( make_union( StackingEstimator( estimator=DecisionTreeClassifier(criterion="gini", max_depth=5, min_samples_leaf=4, min_samples_split=14)), FunctionTransformer(copy)), SelectFwe(score_func=f_classif, alpha=0.045), KNeighborsClassifier(n_neighbors=60, p=1, weights="distance")) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
def main(args): if args.train_dir is None: # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/' #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/' # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3' # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big' args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles' print("Using default train_dir: %s" % args.train_dir) pandas.set_option('display.max_columns', 10) pandas.set_option('display.max_rows', 4) # mpl.rc('title', labelsize=6) mpl.rc('ytick', labelsize=7) mpl.rc('xtick', labelsize=4) os.chdir(args.train_dir) dataName = 'Neuropeptides' df = pandas.read_csv('trainingSetFeatures.csv') feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']] feature_cols=numpy.array(feature_cols) X = df[feature_cols].values y = df.classname.values le = LabelEncoder() y = le.fit_transform(y) "Initial feature selection trimming" print(X.shape) Fwe = SelectFwe(alpha=0.01).fit(X,y) X=Fwe.transform(X) print("F-test -> ",X.shape) feature_cols=feature_cols[Fwe.get_support()] ''' FeatSelection_SVM = True if FeatSelection_SVM == True: svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y) X = svc_L1.transform(X, y) print ("L1 SVM Transformed X:",X_L1.shape) feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))] ''' k = SelectKBest(k=255).fit(X,y) X=k.transform(X) feature_cols=feature_cols[k.get_support()] param_dist = {"max_depth": [6,9, None], "max_features": ['auto',0.4], "min_samples_leaf": [1,2,3], "bootstrap": [True, False], 'min_samples_split':[2,3], "criterion": [ "gini"], "n_estimators":[100], "n_jobs":[-1]} rf = RandomForestClassifierWithCoef(max_depth= 7, min_samples_split= 1, min_samples_leaf= 2, n_estimators= 50, n_jobs= 2, max_features= 
"auto") "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class." scores = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2)) print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2)) "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'." scores_f1 = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1') print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2)) # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04) rfeSelect = RFECV(estimator=rf,step=20, cv=2,scoring='f1') #average_precision , recall X_RFE = rfeSelect.fit_transform(X,y) print(X_RFE.shape) RFE_FeatureNames = feature_cols[rfeSelect.get_support()] print(RFE_FeatureNames) RFE_ScoreRatio = 100*(cross_validation.cross_val_score(rf,X_RFE,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1').mean())/scores_f1.mean() print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio)) # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName) print("Alt plot:") altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values

# Fixed seed keeps the train/test partition reproducible.
split = train_test_split(features, tpot_data['target'].values, random_state=42)
training_features, testing_features, training_target, testing_target = split

# Average CV score on the training set was: 0.8158182768942263
# BernoulliNB predictions are stacked onto the features, then filtered
# by an FWE-corrected F-test before the final decision tree.
stacked_nb = StackingEstimator(estimator=BernoulliNB(alpha=10.0, fit_prior=True))
final_tree = DecisionTreeClassifier(criterion="entropy", max_depth=10,
                                    min_samples_leaf=10, min_samples_split=17)
exported_pipeline = make_pipeline(
    stacked_nb,
    SelectFwe(score_func=f_classif, alpha=0.043000000000000003),
    final_tree,
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)