def write_csv(features_dict, out_file=None): """If out_file is not given, return features as a pandas.dataFrame object """ # Since different sequences can have different set of features(!) get the union of features: feature_names = set().union(*[tuple(val.keys()) for val in features_dict.values()]) if out_file: f = open(out_file, "w") else: csv_str = "" header = sorted(feature_names) # Print feature names if out_file: f.write("accession\t" + "\t".join(header) + "\n") else: csv_str += "accession\t" + "\t".join(header) + "\n" values_line_fmt = "%s\t" * len(header) + "%s\n" for acc, features in features_dict.items(): # If feature doesn't exists, put a 0 values_line = acc + "\t" + "\t".join([str(features.get(f_name, 0)) for f_name in header]) + "\n" if out_file: f.write(values_line) else: csv_str += values_line if not out_file: return load_data(csv_str)
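# A minimal usage sketch for write_csv. The dict layout is inferred from the function above
# (accession -> {feature_name: value}); the accessions and feature names here are made up:
example_features = {
    "P12345": {"length": 120, "pI": 6.8},
    "Q67890": {"length": 95, "hydrophobicity": 0.31},
}
write_csv(example_features, out_file="example_features.tsv")   # write a tab-delimited file
# df = write_csv(example_features)                             # or get a DataFrame back via load_data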
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Get the best features using the chosen method (K-best, RFE, RFECV, 'L1' (RandomizedLogisticRegression),
    'Tree' (ExtraTreesClassifier), mrmr), then print the top K features' names (from featureNames).
    If reduceMatrix is True, also return X reduced to the K best features.
    Available method names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression', 'K-best', 'ExtraTreesClassifier'.
    Note that effectively any scikit-learn selection method could be used, if correctly imported.

    Currently this gets the K best features (filtered by FDR, then ranked by t-test; more advanced
    options can be implemented), and saves the data/matrix with the kept features to a new output
    file, "REDUCED_Feat.csv".
    '''
    # est = method()
    # NOTE: the 'method' and 'alpha' parameters are not used yet; SelectKBest is applied regardless.
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels
    # Change the labels (encoded as ints) back to strings:
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:', X.shape)  # Note: X itself is not transformed here; only the mask is used below.
    print("K_featnames: %s" % (K_featnames))

    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df
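# A minimal usage sketch for GetKFeatures, assuming a feature CSV produced by the feature-extraction
# step (the file name is the repository's usual 'trainingSetFeatures.csv'; adjust kbest as needed):
# reduced = GetKFeatures('trainingSetFeatures.csv', kbest=30, reduceMatrix=True)
# print(reduced.shape)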
                      scoring='roc_auc')  # Scoring metric - e.g. 'f1', 'accuracy'
    gs = gs.fit(X_train, y_train)
    print("Finished Gridsearch")
    # print(gs.best_score_)
    report(gs.grid_scores_)


if __name__ == '__main__':
    Kcv = 4  # Number of stratified folds for cross validation. More = slower, more accurate.
    fileName = r'\trainingSetFeatures.csv'
    # filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap'
    filePath = str(input('Input directory containing TrainingData csv: '))
    ## features, labels, lb_encoder, featureNames = load_data(filename, 'file')
    features, labels, lb_encoder, featureNames = load_data(filePath + fileName, 'file')
    X, y = features, labels
    print('len(set(y)):', len(set(y)))
    print(X.shape, "X = (samples, features)")

    scale = StandardScaler(copy=False)
    X = scale.fit_transform(X)

    FD = SelectFdr(alpha=0.0005)
    FD_K = SelectPercentile(percentile=70)
    X = FD.fit_transform(X, y)
    print(X.shape, "X post FDR alpha filter")
    X_FD = FD_K.fit_transform(X, y)
    print(X_FD.shape, "X post FDR + percentile filter")

    print("\n BASE X models: \n")
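# The scale -> FDR -> percentile chain above can also be wrapped in a scikit-learn Pipeline, which
# keeps the fitted steps together for reuse on held-out data. A sketch only, not part of the
# original script:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFdr, SelectPercentile

filter_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("fdr", SelectFdr(alpha=0.0005)),
    ("percentile", SelectPercentile(percentile=70)),
])
# X_filtered = filter_pipe.fit_transform(X, y)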
def GetAllPerf(filePaths=None): Dirr = r"K:\Lab-DataSets\test_seq" ##Dirr = r'./test_seq' if filePaths is None: filePaths = list(find_files(directory=Dirr, pattern="trainingSetFeatures.csv")) # Sanity check: # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile'] # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv'] print("FilePaths: \n", filePaths) fileNames = fileNameFromPaths(filePaths) print("FileNames:", fileNames) resDict = pd.DataFrame( index=fileNames, columns=[ "Accuracy", "Accuracy_SD", "f1", "f1_SD", "dummy_freq:Accuracy", "dummy_freq:f1", "LargestClassPercent", "Classes", # 'TopRFE-Features','Best (f1) Model parameters', "# Classes", "Array-Acc-Scores", "Array-f1-Scores", "bestML-Acc", "bestML-f1", ], ) # redDict holds results for each file/class, for saving to output-file i = -1 for filePath in filePaths: i += 1 "http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/" filePath = os.path.normpath(filePath) print(filePath) fileName = str(fileNames[i]) print() print("fileName: %s \n" % (fileName)) "resDict['Name']= fileName" X, y, lb_encoder, featureNames = load_data(filePath, "file") # X, y = features, labels print("Dummy classifiers output:") dummy_frequent = DummyClassifier(strategy="most_frequent", random_state=0) y_dummyPred = Get_yPred(X, y, clf_class=dummy_frequent) dummy_freq_acc = "{:.3}".format(metrics.accuracy_score(y, y_dummyPred)) dummy_freq_f1_mean = (metrics.f1_score(y, y_dummyPred, average=None)).mean() dummy_freq_f1 = "{:.3}".format(metrics.f1_score(y, y_dummyPred, average="weighted")) print("f1 Av:", dummy_freq_f1_mean) ## print ("f1 micro:",metrics.f1_score(y, y_dummyPred,average='micro')) print("dummy_freq_f1", dummy_freq_f1) print("Dummy, most frequent acc:", dummy_freq_acc) ## print("Diffence: dummy_freq_f1_def",dummy_freq_f1_def) print("\n classification_report: \n", metrics.classification_report(y, y_dummyPred)) print() resDict["dummy_freq:Accuracy"][fileName] = dummy_freq_acc resDict["dummy_freq:f1"][fileName] = dummy_freq_f1_mean resDict.to_csv("dum.csv")
                             n_iter=10, test_size=0.2),
        scoring=scoreParam, n_jobs=-1, pre_dispatch='1.8*n_jobs')
    print("Tuned Model's %s Score: %0.3f (+/- %0.3f)"
          % (scoreParam, scores.mean(), scores.std() * 2))


if __name__ == '__main__':
    # TODO: Allow user to select desired function - CV model, or feature reduction.
    # TODO: Use os.path.join for file names/locations/dirs.

    # Set by user input:
    fileName = r'/trainingSetFeatures.csv'
    filePath = str(argv[1])
    X, y, lb_encoder, featureNames = load_data(filePath + fileName, 'file')  # X, y = features, labels
    print(X.shape, "= (samples, features)")
    y_inv = Counter(lb_encoder.inverse_transform(y))
    print("Classes:", y_inv)

    # Normalize/scale features if needed. Our data is standardized by default:
    # X = StandardScaler(copy=False).fit_transform(X)

    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    featureNames = featureNames[Fwe.get_support()]
    print("F-test filter ->", X.shape)

    FeatSelection_SVM = True
def GetAllPerf(filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))
    # Sanity check:
    # filePaths = ['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths = ['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
    print("FilePaths: \n", filePaths)
    fileNames = fileNameFromPaths(filePaths)
    print("FileNames:", fileNames)

    # resDict holds the results for each file/class, for saving to the output file.
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy', 'Accuracy_SD',
                                    'f1', 'f1_SD',
                                    'dummy_freq:Accuracy', 'dummy_freq:f1',
                                    'LargestClassPercent', 'Classes',
                                    # 'TopRFE-Features', 'Best (f1) Model parameters',
                                    '# Classes',
                                    'Array-Acc-Scores', 'Array-f1-Scores',
                                    'bestML-Acc', 'bestML-f1', 'dummy_freq_f1_weighted'])

    for i, filePath in enumerate(filePaths):
        # Backslash gotcha: http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName = str(fileNames[i])
        print("fileName: %s" % (fileName))
        # resDict['Name'] = fileName
        # filePath = str(argv[1])
        # X, y, lb_encoder, featureNames = load_data(filePath + fileName, 'file')

        X, y, lb_encoder, featureNames = load_data(filePath, 'file')  # X, y = features, labels
        print(X.shape, "= (samples, features)")

        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100 * y_inv.most_common()[0][1] / sum(y_inv.values()), 1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName] = len(lb_encoder.classes_)

        # Temporary pre-filter for the outputs - saves computation time.
        # Barely filters compared to the model itself. Set KFilt = None to skip.
        KFilt = 350
        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X, y)
            X = k.transform(X)
            featureNames = featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X, y)
        X = Fwe.transform(X)
        featureNames = featureNames[Fwe.get_support()]
        print("X reduced to K best features: ", X.shape)

        FeatSelection_SVM = False  # Feature names need updating!!
        FeatSelection_RandLogReg = False

        if FeatSelection_RandLogReg:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5, sample_fraction=0.95,
                                                       n_resampling=40, selection_threshold=0.2,
                                                       n_jobs=-1).fit(X, y)
            X_L1 = LogRegFeats.transform(X)
            featureNames = featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:", X_L1.shape)
        elif FeatSelection_SVM:
            # Note: despite the L1-style naming, the penalty here is 'l2'.
            svc_L1 = LinearSVC(C=30, penalty="l2", dual=False, class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X)  # transform takes X only
            featureNames = featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print("L1 SVM Transformed X:", X_L1.shape)
            # X = X_L1

        # print("Performance as a function of percent of features used:")
        # PlotPerfPercentFeatures(X, y, est=LinearSVC())

        # E.g. graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb
        RFE_FeatsToKeep = 16
        FeatSelection_RFE = False
        FeatSelection_RFECV = False

        if FeatSelection_RFE or FeatSelection_RFECV:
            # RFE +- best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')  # , penalty='l1', dual=False)
            # svc = LogisticRegression(class_weight='auto')  # , C=1)
            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep, scoring='average_precision')
                # , cv=StratifiedShuffleSplit(y, n_iter=3, test_size=0.3))  # , scoring='f1', verbose=0)
                # Other scorers: 'roc_auc', 'recall', 'f1', 'accuracy'...
            else:
                rfecv = RFE(estimator=svc, n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_), rfecv.score(X, y))
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:", rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE", X_RFE.shape)
            # Note: the 'TopRFE-Features' column is commented out of resDict above.
            resDict['TopRFE-Features'][fileName] = str(rfe_featnames)

        # Set GetRFEPerf to True (or let the user choose) if performance of the reduced set is wanted.
        GetRFEPerf = False

        # print("lb_encoder.classes_", lb_encoder.classes_)
        # Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrices + dummies: http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # Make a custom F1 scorer. May not have fixed the problem!
        # Possible averages: weighted, micro, macro, None.
        from sklearn.metrics import make_scorer
        f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro")

        # print("Dummy classifiers output:")
        dummy_frequent = DummyClassifier(strategy='most_frequent', random_state=0)
        y_dummyPred = Get_yPred(X, y, clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y, y_dummyPred))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred, average='weighted'))
        # Micro-averaged f1 over all classes (a make_scorer object expects an estimator, not raw predictions):
        dummy_freq_f1_weighted = '{:.3}'.format(metrics.f1_score(y, y_dummyPred, average='micro'))
        dummy_freq_f1_mean = (metrics.f1_score(y, y_dummyPred, average=None)).mean()
        # print("Dummy, most frequent acc:", dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified', random_state=0)
        # dummy_strat2 = '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X, y, clf_class=dummy_frequent)))  # , sample_weight=balance_weights(y)))
        # print("Dummy, Stratified Random:", dummy_strat2)
        print()

        resDict['dummy_freq:Accuracy'][fileName] = dummy_freq_acc
        ## resDict['dummy_freq:f1'][fileName] = dummy_freq_f1
        resDict['dummy_freq:f1'][fileName] = dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName] = dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName] = dummy_strat2

        # We can get separately the best model for accuracy, and the best for f1!
        # WARNING: in the binary case the default F1 scores only the positive class (scikit-learn 0.15 and lower).
and lower" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1') "Temporary workaround until next SKlearn update of F1 metric:" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer) bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy') print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1) print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc) #Temp # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1) if GetRFEPerf==True: bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1') "Modified to get 2 estimators" scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2)) scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1') print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2)) resDict['Accuracy'][fileName]=round(scores_acc.mean(),4) resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4) resDict['f1'][fileName]=round(scores_f1.mean(),4) resDict['f1_SD'][fileName]=round(scores_f1.std(),4) resDict['Array-f1-Scores'][fileName]=(scores_f1) resDict['Array-Acc-Scores'][fileName]=(scores_acc) resDict['bestML-f1'][fileName]=(str(bestEst_f1)) resDict['bestML-Acc'][fileName]=(str(bestEst_acc)) #ORIG # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15) # resDict['Accuracy'][fileName]=round(Acc,4) # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4) # resDict['f1 score'][fileName]=round(f1,4) # resDict['f1_SD'][fileName]=round(f1_SD,4) # resDict['Best (f1) Model parameters'][fileName]= bestEst print() # print(fileName," Done") print("Saving results to file") resDict.to_csv("OutputData.tsv", sep=',')
and lower" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1') "Temporary workaround until next SKlearn update of F1 metric:" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer) bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy') print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1) print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc) #Temp # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1) if GetRFEPerf==True: bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1') "Modified to get 2 estimators" scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2)) scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1') print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2)) resDict['Accuracy'][fileName]=round(scores_acc.mean(),4) resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4) resDict['f1'][fileName]=round(scores_f1.mean(),4) resDict['f1_SD'][fileName]=round(scores_f1.std(),4) resDict['Array-f1-Scores'][fileName]=(scores_f1) resDict['Array-Acc-Scores'][fileName]=(scores_acc) resDict['bestML-f1'][fileName]=(str(bestEst_f1)) resDict['bestML-Acc'][fileName]=(str(bestEst_acc)) #ORIG # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15) # resDict['Accuracy'][fileName]=round(Acc,4) # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4) # resDict['f1 score'][fileName]=round(f1,4) # resDict['f1_SD'][fileName]=round(f1_SD,4) # resDict['Best (f1) Model parameters'][fileName]= bestEst print() # print(fileName," Done") print("Saving results to file") resDict.to_csv("OutputData.tsv", sep=',')