def feature_selection(train,test,y):
    """Keep only the columns of ``train``/``test`` chosen by an ``RLR`` selector.

    The selector is fitted on ``train`` and ``y`` only and then applied to
    both matrices, so ``test`` never influences which columns are kept.

    Args:
        train: training feature matrix passed to ``RLR.fit``.
        test: test feature matrix, transformed with the fitted selector.
        y: training labels.

    Returns:
        A ``(train, test)`` tuple of the transformed matrices.
    """
    # NOTE(review): RLR is presumably an alias of sklearn's
    # RandomizedLogisticRegression - confirm against the file's imports.
    print("特征选择")  # progress message, Chinese for "feature selection"
    selector = RLR(
        C=10,
        scaling=0.5,
        sample_fraction=0.6,
        n_resampling=200,
        selection_threshold=0.4,
        n_jobs=3,
    )
    selector.fit(train, y)
    reduced_train = selector.transform(train)
    reduced_test = selector.transform(test)
    return reduced_train, reduced_test
def predictWithAdaBoost(config, X, Y, testFeatures):
    """Fit an AdaBoost classifier on ``X``/``Y`` and predict ``testFeatures``.

    When the ``useRandomLog`` entry of the ``model/adaboost`` configuration is
    truthy, features are first filtered with a default
    ``RandomizedLogisticRegression``. The filter is only applied when it
    leaves a non-empty training matrix; otherwise the original features are
    used for both training and prediction.

    Args:
        config: object exposing ``getConfig(path)``; the returned value must
            support ``.get(key, default)``.
        X: training feature matrix.
        Y: training labels.
        testFeatures: feature matrix to predict.

    Returns:
        The predictions of the fitted classifier for ``testFeatures``.
    """
    settings = config.getConfig('model/adaboost')

    if settings.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        reduced = selector.transform(X)
        # An empty result means the selector rejected every feature; in that
        # case fall back to the unfiltered matrices.
        if reduced.size != 0:
            X = reduced
            testFeatures = selector.transform(testFeatures)

    booster = AdaBoostClassifier(
        n_estimators=50,
        learning_rate=1.0,
        algorithm='SAMME.R',
    )
    booster.fit(X, Y)
    return booster.predict(testFeatures)
def randomlr(train_x,train_y,cv_x,test_x,regp,alpha=0.5):
    """Select features with randomized logistic regression and apply to all splits.

    The selector is fitted once on the training split and the same column
    selection is then applied to the validation and test splits.

    Args:
        train_x: training feature matrix.
        train_y: training labels.
        cv_x: validation feature matrix.
        test_x: test feature matrix.
        regp: value passed as the selector's ``C`` argument.
        alpha: value passed as the selector's ``scaling`` argument.

    Returns:
        A ``(train_x, cv_x, test_x)`` tuple of the reduced matrices.
    """
    selector = RandomizedLogisticRegression(
        C=regp,
        scaling=alpha,
        fit_intercept=True,
        sample_fraction=0.75,
        n_resampling=200,
    )
    # One fit_transform learns the selection and reduces the training data.
    # The original fitted the model and then refitted it via fit_transform,
    # running the 200 resampling rounds twice and discarding the first fit.
    train_x = selector.fit_transform(train_x, train_y)
    cv_x = selector.transform(cv_x)
    test_x = selector.transform(test_x)
    return train_x, cv_x, test_x
def predictWithQDA(config, X, Y, testFeatures):
    """Fit a QDA classifier on ``X``/``Y`` and predict ``testFeatures``.

    When the ``useRandomLog`` entry of the ``model/qda`` configuration is
    truthy, features are first filtered with a default
    ``RandomizedLogisticRegression``; the filter is skipped if it would leave
    an empty training matrix. The ``priors`` configuration entry (default
    ``None``) is forwarded to ``QDA``.

    Args:
        config: object exposing ``getConfig(path)``; the returned value must
            support ``.get(key, default)``.
        X: training feature matrix.
        Y: training labels.
        testFeatures: feature matrix to predict.

    Returns:
        The predictions of the fitted classifier for ``testFeatures``.
    """
    settings = config.getConfig('model/qda')

    if settings.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        reduced = selector.transform(X)
        # Keep the unfiltered matrices when the selector rejected everything.
        if reduced.size != 0:
            X = reduced
            testFeatures = selector.transform(testFeatures)

    class_priors = settings.get('priors', None)
    model = QDA(priors=class_priors)
    model.fit(X, Y)
    return model.predict(testFeatures)
def test_rflasso():
    """Print the shape of the training split after randomized-logistic selection.

    Splits the module-level ``index_data`` / ``index_lable`` arrays 75/25 with
    a fixed seed, fits a ``RandomizedLogisticRegression`` on the training part
    and prints the shape of the reduced training matrix. Returns nothing.
    """
    # The held-out part of the split is not used by this smoke test.
    train_X, _test_X, train_Y, _test_Y = train_test_split(
        index_data, index_lable, test_size=0.25, random_state=1)

    # Only the selector is imported here. The original also imported
    # LogisticRegression, SelectFromModel, SVC and
    # sklearn.cross_validation.StratifiedKFold without using them; the last
    # module no longer exists in later scikit-learn releases, so the unused
    # import alone could make this function fail.
    from sklearn.linear_model import RandomizedLogisticRegression

    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
def runTest(featmat_train, outcome_train_lbl, featmat_test, outcome_test_lbl, sel, paramsDict, bestmodelnum):
    """Fit the final model on the training data and score the held-out test row.

    Pipeline, as implemented below:

    1. For every feature group in the module-level ``suffix_list`` the
       group's columns are standardised, variance-thresholded and reduced
       with randomized logistic regression.
    2. The surviving columns of all groups are concatenated and reduced once
       more with a second randomized logistic regression.
    3. The classifier named by the module-level ``modelname`` ("GBC" or
       "LOGR") is fitted and used to predict the test data; only the first
       test row's prediction is reported.
    4. A one-row CSV with the prediction, the parameters and the selected
       feature names is written under the module-level ``folderpath``.

    Args:
        featmat_train: training features; must expose ``.values`` and
            ``.columns`` (presumably a pandas DataFrame - confirm with caller).
        outcome_train_lbl: training labels; must expose ``.values``.
        featmat_test: test features, same layout as ``featmat_train``.
        outcome_test_lbl: test labels; must expose ``.values``.
        sel: feature-selection algorithm name; only ``"rlog"`` is supported.
        paramsDict: keyword arguments for ``RandomizedLogisticRegression``;
            its ``"C"`` and ``"n_jobs"`` entries are also used by the "LOGR"
            model.
        bestmodelnum: identifier embedded in the output file name.

    Returns:
        ``(Y_pred, Y_pred_proba)``: the predicted label of the first test row
        and the probability in column 1 of ``predict_proba`` for that row.

    Raises:
        ValueError: if ``sel`` or ``modelname`` is not recognised. (The
            original raised a plain string, which Python rejects with an
            unrelated ``TypeError``.)
    """
    def _keyvals(mapping):
        # Render a mapping as "key:repr(value),key:repr(value),...".
        # .items() replaces the Python-2-only .iteritems().
        return ','.join("%s:%r" % (key, val) for (key, val) in mapping.items())

    print("Running Test for #{0} ({1})".format(TEST_PERSON_NUM, TEST_PERSON_DEVICE_ID))

    X_train_allfg = featmat_train.values
    Y_train = outcome_train_lbl.values
    featnames_allfg = featmat_train.columns
    X_test_allfg = featmat_test.values
    Y_test = outcome_test_lbl.values
    Y_true = Y_test[0]  # only the first test label is logged

    sel_featnames_per_fg = {}
    sel_featnames_list_ordered = []
    sel_X_train = []
    sel_X_test = []
    countNumSel = 0

    # ---- stage 1: per-feature-group selection ----
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        fgidxs = fgColIdxs[suffix_list_str]
        X_train = X_train_allfg[:, fgidxs]
        X_test = X_test_allfg[:, fgidxs]
        featnames_fg = featnames_allfg[fgidxs]

        # Nothing to select from an empty feature group.
        if X_train.shape[1] == 0:
            continue

        # Scaling: statistics come from the training data only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Variance thresholding.
        vartransform = VarianceThreshold()
        X_train = vartransform.fit_transform(X_train)
        X_test = vartransform.transform(X_test)
        featnames_fg = featnames_fg[vartransform.get_support()]

        # Feature selection.
        if sel != "rlog":
            raise ValueError("Unrecognized sel (feature selection algorithm)")

        randomized_rlog = RandomizedLogisticRegression(**paramsDict)
        X_train = randomized_rlog.fit_transform(X_train, Y_train)
        X_test = randomized_rlog.transform(X_test)
        chosen_col_idxs = randomized_rlog.get_support()
        # NOTE(review): get_support() is used as an index into featnames_fg,
        # so this length test looks like it is true whenever the group has
        # columns, not only when something was selected - confirm the intent.
        if len(chosen_col_idxs) > 0:
            featnames_fg_chosen = list(featnames_fg[chosen_col_idxs])
            sel_featnames_per_fg[suffix_list_str] = featnames_fg_chosen
            sel_featnames_list_ordered = sel_featnames_list_ordered + featnames_fg_chosen
            sel_X_train.append(X_train)
            sel_X_test.append(X_test)
            countNumSel = countNumSel + len(featnames_fg_chosen)

    # ---- stage 2: selection over the concatenation of all groups ----
    # feature selection: sel{sel{fg1}.....sel{fg45}}
    X_train_concat = np.hstack(sel_X_train)
    X_test_concat = np.hstack(sel_X_test)
    print("\nSum of number of features selected from all fgs = {0}".format(
        countNumSel))
    print("Concatenated X_train has {0} features".format(
        X_train_concat.shape[1]))
    print("Concatenated X_test has {0} features".format(
        X_test_concat.shape[1]))

    if sel != "rlog":
        raise ValueError("Unrecognized sel (feature selection algorithm)")

    randomized_rlog = RandomizedLogisticRegression(**paramsDict)
    X_train_concat = randomized_rlog.fit_transform(X_train_concat, Y_train)
    X_test_concat = randomized_rlog.transform(X_test_concat)
    chosen_col_idxs = np.array(randomized_rlog.get_support())
    sel_featnames_list_ordered = np.array(sel_featnames_list_ordered)
    chosen_cols_final = sel_featnames_list_ordered[chosen_col_idxs]
    print("Final number of features in model = {0}".format(
        X_train_concat.shape[1]))

    # ---- stage 3: final classifier ----
    if modelname == "GBC":
        clf = GradientBoostingClassifier(random_state=0)
    elif modelname == "LOGR":
        clf = LogisticRegression(random_state=0,
                                 C=paramsDict["C"],
                                 tol=1e-3,
                                 penalty="l1",
                                 n_jobs=paramsDict["n_jobs"],
                                 intercept_scaling=1,
                                 class_weight="balanced")
    else:
        raise ValueError("Unrecognized model name")

    clf.fit(X_train_concat, Y_train)
    pred = clf.predict(X_test_concat)
    pred_proba = clf.predict_proba(X_test_concat)
    Y_pred = pred[0]
    Y_pred_proba = pred_proba[0][1]

    # ---- stage 4: logging, one CSV row per call ----
    # Columns: did, sel, selParams, Y_pred, Y_pred_proba, Y_true,
    # chosen_cols_final, plus one "sel_<group>" column per feature group.
    chosen_cols_final_str = ",".join(chosen_cols_final)
    paramsDict_str = _keyvals(paramsDict)
    fgIdxs_str = _keyvals(fgIdxs)
    cnts_per_lbl_dict = getValueCounts(outcome_train_lbl, outcome_test_lbl)
    cnts_per_lbl_str = _keyvals(cnts_per_lbl_dict)

    dfout = pd.DataFrame({
        "did": [TEST_PERSON_DEVICE_ID],
        "cnts_per_lbl": [cnts_per_lbl_str],
        "sel": [sel],
        "selParams": [paramsDict_str],
        "Y_pred": [Y_pred],
        "Y_pred_proba": [Y_pred_proba],
        "Y_true": [Y_true],
        "fgIdxs": [fgIdxs_str],
        "sel_final": [chosen_cols_final_str]
    })
    dfout = dfout.set_index("did")
    cols = [
        "cnts_per_lbl", "sel", "selParams", "Y_pred", "Y_pred_proba",
        "Y_true", "fgIdxs", "sel_final"
    ]

    for s in suffix_list:
        suffix_list_str = ",".join(s)
        colname = "sel_{0}".format(suffix_list_str)
        # Groups that were skipped or selected nothing get an empty cell.
        sel_feats_fg_str = ",".join(sel_featnames_per_fg.get(suffix_list_str, []))
        dfcol = pd.DataFrame({
            "did": [TEST_PERSON_DEVICE_ID],
            colname: [sel_feats_fg_str]
        })
        dfcol = dfcol.set_index("did")
        dfout = pd.concat([dfout, dfcol], axis=1)
        cols.append(colname)

    dfout.to_csv(
        folderpath + "{0}_test_model{1}.csv".format(TEST_PERSON_DEVICE_ID,
                                                    bestmodelnum),
        columns=cols,
        header=True)
    print("{0} minutes elapsed since start of program ".format(
        (time.time() - STARTTIME) / 60.0))
    return (Y_pred, Y_pred_proba)
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import fmriUtils as fm  # project-local helper functions

# Script: select features with randomized logistic regression, then report the
# per-fold accuracy of a linear SVM under stratified cross-validation.
# NOTE(review): `np` is used below but not imported in the visible part of the
# file - presumably numpy is imported elsewhere; confirm.

n_folds = 10
f = fm.outTo()  # redirects output to a file

X, y = fm.loadData2()
X2, y2 = fm.loadData2()  # second, independent load; not used further below
y = fm.defineClass(y)

# Feature selection is fitted on the complete data set before the
# cross-validation split below.
randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
randomized_logistic.fit(X, y)
XX = randomized_logistic.transform(X)

print("============选择后剩余的特征================")  # "features left after selection"
print(XX.shape)

yy = y
cv = StratifiedKFold(yy, n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    # Fold accuracy = correct predictions / size of the fold's test part.
    n_correct = np.sum(prediction == yy[test])
    n_total = float(np.size(yy[test]))
    cv_scores.append(n_correct / n_total)

print("========分类准确率=======")  # "classification accuracy"
# Same text as the two-item print statement: both values, space separated.
print("%s %s" % (cv_scores, np.mean(cv_scores)))
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import fmriUtils as fm  # custom (project-local) functions

# Randomized-logistic feature selection followed by a cross-validated
# linear SVM; prints the reduced shape and the per-fold accuracies.
# NOTE(review): numpy (`np`) is expected to be imported outside this span.

n_folds = 10
f = fm.outTo()  # output is redirected to a file
X,y = fm.loadData2()
X2,y2 = fm.loadData2()  # loaded a second time; unused in this span
y = fm.defineClass(y)

randomized_logistic = RandomizedLogisticRegression(C=0.1,n_jobs=2)
randomized_logistic.fit(X,y)
XX = randomized_logistic.transform(X)

# Banner text is Chinese for "features remaining after selection".
print("============选择后剩余的特征================")
print(XX.shape)

yy = y
cv = StratifiedKFold(yy,n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    fold_truth = yy[test]
    # Fraction of the fold's test samples that were predicted correctly.
    fold_accuracy = np.sum(prediction == fold_truth) / float(np.size(fold_truth))
    cv_scores.append(fold_accuracy)

# Banner text is Chinese for "classification accuracy".
print("========分类准确率=======")
# Equivalent to printing both values separated by a single space.
print("%s %s" % (cv_scores, np.mean(cv_scores)))
cv=StratifiedKFold(9), n_permutations=2000, n_jobs=2) # RandomizedLogisticRegression (RLR) feature selection # Grid search for optimal RLR params selection_threshold = np.arange(0.3, 0.9, 0.05) rlr_grid_search = pd.DataFrame() for st in selection_threshold: for i in range(100): print("Working on: %s (%d of 100)" % (st, (i + 1))) rlr = RandomizedLogisticRegression( n_resampling=5000, C=lr_mean.C, selection_threshold=st, n_jobs=2) rlr.fit(X, y) X_rlr = rlr.transform(X) if X_rlr.size: cv_scores_rlr = cross_val_score( lr_mean, X_rlr, y, scoring="roc_auc", cv=StratifiedKFold(9)) rlr_tmp = { "st": st, "cv_score": cv_scores_rlr.mean(), "cv_std": cv_scores_rlr.std(), "n_features": sum(rlr.get_support()) } rlr_grid_search = rlr_grid_search.append( rlr_tmp, ignore_index=True) rlr_grid_search_mean = rlr_grid_search.groupby(by="st").mean()
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

# Demo: pick features of the iris data with randomized logistic regression,
# fit a logistic regression on them and show transform / inverse_transform.
iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)

ff_model = RandomizedLogisticRegression() # Finds best set of features
X_new = ff_model.fit_transform(X, y) # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)

print(X[0:4])
print(ff_model.transform(X[0:4])) # Transform the first 4 rows of data to get only best features

model = LogisticRegression().fit(X_new, y) # Fit logistic regression with best features
print(model.predict_proba(ff_model.transform(X[0:4]))) # predict probabilities for first 4 rows of data

print(ff_model.inverse_transform(ff_model.transform(X[0:4]))) # Test inverse transforming

# One row of ones, as wide as the number of retained features. The original
# hard-coded three columns, which only matches when exactly three features
# happen to be selected; the selector is randomized, so that is not certain.
arr = np.ones((1, X_new.shape[1]), dtype=int)
print(ff_model.inverse_transform(arr)) # Get original matrix structure with 1's only in columns of retained features.
from __future__ import division import numpy as np from sklearn.linear_model import RandomizedLogisticRegression from sklearn.linear_model import LogisticRegression X = np.load("../feats/train_formatted.npy") y = np.load("../feats/train_y.npy") X_test = np.load("../feats/test_formatted.npy") y_test = np.load("../feats/test_y.npy") clf = RandomizedLogisticRegression() clf.fit(X, y) scores = clf.scores_ print 'Index : score' sortedIdx = [ i[0] for i in sorted(enumerate(scores), key=lambda x: x[1], reverse=True) ] top = 30 for i in range(top): print str(sortedIdx[i]) + ' : ' + str(scores[sortedIdx[i]]) lr = LogisticRegression() lr.fit(clf.transform(X), y) pred = lr.predict(clf.transform(X_test)) accuracy = sum(pred == y_test) / y_test.size print 'Logistic Regression Accuracy: ' + str(accuracy)
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.datasets import make_classification
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.datasets import make_regression

# Demo: compare an L1 logistic regression on all features with the same model
# on features picked by randomized logistic regression, then print the
# RandomizedLasso scores of a synthetic regression problem.
X, y = make_classification(n_samples=100,
                           n_features=100,
                           n_informative=5,
                           n_redundant=2,
                           random_state=101)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.30, random_state=101)

# Baseline: all 100 features.
classifier = LogisticRegression(C=0.1, penalty='l1', random_state=101)
classifier.fit(X_train, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test, y_test))

# Stability selection fitted on the training split only.
selector = RandomizedLogisticRegression(n_resampling=300, random_state=101)
selector.fit(X_train, y_train)
# Count the selected features through the public get_support() accessor
# rather than the private _get_support_mask() the original reached into.
# NOTE(review): the label says "Variance" although a feature count is
# printed; the runtime text is left unchanged.
print("Variance selected: %i" % selector.get_support().sum())

X_train_s = selector.transform(X_train)
X_test_s = selector.transform(X_test)
classifier.fit(X_train_s, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test_s, y_test))

# Regression counterpart: per-feature selection scores.
XX, yy = make_regression(n_samples=100,
                         n_features=10,
                         n_informative=4,
                         random_state=101)
rlasso = RandomizedLasso()
rlasso.fit(XX, yy)
print(list(enumerate(rlasso.scores_)))
from __future__ import division import numpy as np from sklearn.linear_model import RandomizedLogisticRegression from sklearn.linear_model import LogisticRegression X = np.load("../feats/train_formatted.npy") y = np.load("../feats/train_y.npy") X_test = np.load("../feats/test_formatted.npy") y_test = np.load("../feats/test_y.npy") clf = RandomizedLogisticRegression() clf.fit(X, y) scores = clf.scores_ print 'Index : score' sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x:x[1], reverse=True)] top = 30 for i in range(top): print str(sortedIdx[i]) + ' : ' + str(scores[sortedIdx[i]]) lr = LogisticRegression() lr.fit(clf.transform(X), y) pred = lr.predict(clf.transform(X_test)) accuracy = sum(pred == y_test)/y_test.size print 'Logistic Regression Accuracy: ' + str(accuracy)
#print len(contentVectors[0])

# Feature selection on contentVectors / label. Only the randomized logistic
# regression variant is active; the alternatives are kept commented out.

# feature selection: Tree classifier importance
#clf = ExtraTreesClassifier()
#selector = clf.fit(contentVectors , label)
#contentVectors = selector.transform(contentVectors)

# feature selection: SGDClassifier importance
#contentVectors = SGDClassifier(loss="hinge", penalty="l1").fit_transform(contentVectors,label)

# feature selection: SVM importance
#selector = svm.LinearSVC(C=1, penalty="l1", dual=False).fit(contentVectors,label)
#contentVectors = selector.transform(contentVectors)

# Active variant: fit the selector, then keep only the columns it retains.
selector = RandomizedLogisticRegression()
selector.fit(contentVectors, label)
contentVectors = selector.transform(contentVectors)

# LARS feature selection
#l1-based feature selection
#contentVectors = SGDClassifier(loss="hinge", penalty="l1").fit_transform(contentVectors,label)
#contentVectors = svm.LinearSVC(C=1, penalty="l1", dual=False).fit_transform(contentVectors,label)
#clf = svm.LinearSVC(C=1)
#clf = SGDClassifier(loss="hinge", penalty="l1")
print("Feature selection finished")

# Cross validation
#rfecv = RFECV(estimator=clf, step=2, cv=StratifiedKFold(label, 2),scoring='accuracy')
#selector = rfecv.fit(contentVectors, label)
#contentVectors = selector.transform(contentVectors)
def GetAllPerf (filePaths=None):
    """Evaluate every training-set file and save one summary row per file.

    For each file the features are loaded, filtered (SelectKBest followed by
    SelectFwe), a most-frequent dummy baseline is scored, the best estimator
    is searched separately for f1 and for accuracy via
    ``ModelParam_GridSearch``, and both estimators are cross-validated with
    ``StratifiedShuffleSplit``. All results are collected in a DataFrame that
    is written to ``OutputData.tsv`` (comma separated, despite the name).

    Args:
        filePaths: iterable of feature-file paths. When ``None``, every
            ``trainingSetFeatures.csv`` found under ``./test_seq`` is used.

    Returns:
        None. Results are printed and written to ``OutputData.tsv``.
    """
    # Public import path of make_scorer; the original imported it from
    # sklearn.metrics.score inside the loop. Hoisted because it is
    # loop-invariant.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the output file.
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
        '# Classes',
        'Array-Acc-Scores' ,'Array-f1-Scores'
        ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    # Custom F1 scorer used for the grid search (micro-averaged).
    # Other averaging options: weighted, micro, macro, none.
    f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro")

    for i, filePath in enumerate(filePaths):
        # Normalise separators, see
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i])
        print("fileName: %s" %(fileName))

        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")

        y_inv = Counter(lb_encoder.inverse_transform(y))
        # 100.0 forces true division; with integer counts the original
        # truncated the percentage under Python 2.
        MajorityPercent = round(100.0*y_inv.most_common(1)[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        # ---- univariate pre-filtering ----
        # Temporary cap for the outputs - saves computation time and barely
        # filters compared to the model itself.
        KFilt=350
        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]
        print("X reduced to K best features: ",X.shape)

        # ---- optional model-based selection (both disabled) ----
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
                sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM == True:
            # NOTE(review): the message says "L1 SVM" but the penalty is "l2".
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        # EG - graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb

        # ---- optional recursive feature elimination (disabled) ----
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            # RFE + - best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            # Reuse the fit from above; the original refitted via
            # fit_transform, repeating the whole elimination.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)
            # .loc creates the column on demand; 'TopRFE-Features' is not in
            # the column list above, so the original lookup raised KeyError.
            resDict.loc[fileName, 'TopRFE-Features'] = str(rfe_featnames)

        # Set GetRFEPerf to True, or by the user, if the performance of the
        # reduced set is wanted.
        # NOTE(review): X_RFE only exists when one of the RFE flags is set.
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        # Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # ---- dummy baseline ----
        # print("Dummy classifiers output:")
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        # Same metric as f1_scorer, computed directly from the predictions.
        # A scorer object is called as scorer(estimator, X, y), so the
        # original f1_scorer(y, y_dummyPred) call could not work.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average="micro"))
        # Unweighted mean of the per-class f1 scores.
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()

        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        # ---- model search: best model for accuracy and for f1 separately ----
        # WARNING!? In the binary case the default F1 works for the 1 class,
        # in sklearn 15. and lower; the custom scorer is the workaround.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        # Label corrected: this is the accuracy estimator, the original
        # printed it as "(f1)" too.
        print("bestEst (acc):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # ---- cross-validated performance of both estimators ----
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y,
            cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        # NOTE(review): scored with the built-in 'f1' although the estimator
        # was selected with the micro-averaged f1_scorer - confirm intent.
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y,
            cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression  #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

# Walk-through of randomized-logistic feature selection on the iris data:
# select, fit a classifier on the selection, and map rows back and forth.
iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)

ff_model = RandomizedLogisticRegression()  # Finds best set of features
X_new = ff_model.fit_transform(X, y)  # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)

print(X[0:4])
print(ff_model.transform(
    X[0:4]))  # Transform the first 4 rows of data to get only best features

model = LogisticRegression().fit(
    X_new, y)  # Fit logistic regression with best features
print(model.predict_proba(ff_model.transform(
    X[0:4])))  # predict probabilities for first 4 rows of data

print(ff_model.inverse_transform(ff_model.transform(
    X[0:4])))  # Test inverse transforming

# Build the probe row from the actual number of retained features. The
# original used a fixed [[1, 1, 1]], which breaks inverse_transform whenever
# the randomized selector keeps a different number of columns.
arr = np.ones((1, X_new.shape[1]), dtype=int)
print(
    ff_model.inverse_transform(arr)
)  # Get original matrix structure with 1's only in columns of retained features.
def GetAllPerf (filePaths=None):
    """Evaluate every training-set file and save one summary row per file.

    For each file the features are loaded, filtered (SelectKBest followed by
    SelectFwe), a most-frequent dummy baseline is scored, the best estimator
    is searched separately for f1 and for accuracy via
    ``ModelParam_GridSearch``, and both estimators are cross-validated with
    ``StratifiedShuffleSplit``. All results are collected in a DataFrame that
    is written to ``OutputData.tsv`` (comma separated, despite the name).

    Args:
        filePaths: iterable of feature-file paths. When ``None``, every
            ``trainingSetFeatures.csv`` found under ``./test_seq`` is used.

    Returns:
        None. Results are printed and written to ``OutputData.tsv``.
    """
    # Public import path of make_scorer; the original imported it from
    # sklearn.metrics.score inside the loop. Hoisted because it is
    # loop-invariant.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the output file.
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
        '# Classes',
        'Array-Acc-Scores' ,'Array-f1-Scores'
        ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    # Custom F1 scorer used for the grid search (micro-averaged).
    # Other averaging options: weighted, micro, macro, none.
    f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro")

    for i, filePath in enumerate(filePaths):
        # Normalise separators, see
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i])
        print("fileName: %s" %(fileName))

        X, y, lb_encoder,featureNames = load_data(filePath) # X, y = features, labels
        print(X.shape,"= (samples, features)")

        y_inv = Counter(lb_encoder.inverse_transform(y))
        # 100.0 forces true division; with integer counts the original
        # truncated the percentage under Python 2.
        MajorityPercent = round(100.0*y_inv.most_common(1)[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        # ---- univariate pre-filtering ----
        # Temporary cap for the outputs - saves computation time and barely
        # filters compared to the model itself.
        KFilt=350
        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]
        print("X reduced to K best features: ",X.shape)

        # ---- optional model-based selection (both disabled) ----
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
                sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM == True:
            # NOTE(review): the message says "L1 SVM" but the penalty is "l2".
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        # EG - graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb

        # ---- optional recursive feature elimination (disabled) ----
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            # RFE + - best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            # Reuse the fit from above; the original refitted via
            # fit_transform, repeating the whole elimination.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)
            # .loc creates the column on demand; 'TopRFE-Features' is not in
            # the column list above, so the original lookup raised KeyError.
            resDict.loc[fileName, 'TopRFE-Features'] = str(rfe_featnames)

        # Set GetRFEPerf to True, or by the user, if the performance of the
        # reduced set is wanted.
        # NOTE(review): X_RFE only exists when one of the RFE flags is set.
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        # Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # ---- dummy baseline ----
        # print("Dummy classifiers output:")
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        # Same metric as f1_scorer, computed directly from the predictions.
        # A scorer object is called as scorer(estimator, X, y), so the
        # original f1_scorer(y, y_dummyPred) call could not work.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average="micro"))
        # Unweighted mean of the per-class f1 scores.
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()

        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        # ---- model search: best model for accuracy and for f1 separately ----
        # WARNING!? In the binary case the default F1 works for the 1 class,
        # in sklearn 15. and lower; the custom scorer is the workaround.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        # Label corrected: this is the accuracy estimator, the original
        # printed it as "(f1)" too.
        print("bestEst (acc):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # ---- cross-validated performance of both estimators ----
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y,
            cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        # NOTE(review): scored with the built-in 'f1' although the estimator
        # was selected with the micro-averaged f1_scorer - confirm intent.
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y,
            cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')