def feature_selection(train,test,y):
    """Keep only the columns of ``train``/``test`` chosen by a fitted ``RLR`` selector.

    The selector is fitted on ``train`` and ``y`` only; the same fitted
    selector is then applied to both matrices, so both results keep the same
    columns.

    Args:
        train: training feature matrix passed to ``clf.fit``/``clf.transform``.
        test: test feature matrix passed to ``clf.transform``.
        y: training labels passed to ``clf.fit``.

    Returns:
        tuple: ``(train, test)`` after ``clf.transform``.
    """
    # Call form with one argument: prints the same text on Python 2 and is
    # valid syntax on Python 3 (the statement form is a SyntaxError there).
    print("特征选择")  # "feature selection"
    # NOTE(review): RLR is presumably an alias of sklearn's
    # RandomizedLogisticRegression - confirm against the file's imports.
    clf = RLR(C=10,scaling=0.5,sample_fraction=0.6,n_resampling=200,selection_threshold=0.4,n_jobs=3)
    clf.fit(train,y)
    train = clf.transform(train)
    test = clf.transform(test)

    return train,test
Exemple #2
0
def predictWithAdaBoost(config, X, Y, testFeatures):
    """Fit an AdaBoost classifier on ``(X, Y)`` and predict ``testFeatures``.

    When the ``'model/adaboost'`` config section has a truthy
    ``'useRandomLog'`` entry, a default ``RandomizedLogisticRegression`` is
    fitted on ``(X, Y)`` first and used to filter the columns of both ``X``
    and ``testFeatures``.  The filter is ignored when the transformed training
    matrix is empty.

    Returns:
        The result of ``clf.predict(testFeatures)``.
    """
    settings = config.getConfig('model/adaboost')
    if settings.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        reduced = selector.transform(X)
        # An empty matrix means the selector kept nothing; fall back to the
        # unfiltered features in that case.
        if reduced.size != 0:
            X = reduced
            testFeatures = selector.transform(testFeatures)
    booster = AdaBoostClassifier(n_estimators=50,learning_rate=1.0, algorithm='SAMME.R')
    booster.fit(X,Y)
    return booster.predict(testFeatures)
def randomlr(train_x,train_y,cv_x,test_x,regp,alpha=0.5):
    """Filter three feature matrices with one randomized logistic regression.

    The selector is fitted on the training data only and then applied to the
    cross-validation and test matrices, so all three outputs share the same
    columns.

    Args:
        train_x: training features used to fit the selector.
        train_y: training labels used to fit the selector.
        cv_x: cross-validation features, transformed with the fitted selector.
        test_x: test features, transformed with the fitted selector.
        regp: value passed as ``C`` to ``RandomizedLogisticRegression``.
        alpha: value passed as ``scaling`` (default 0.5).

    Returns:
        tuple: ``(train_x, cv_x, test_x)`` after feature selection.
    """
    # Named ``selector`` so the local no longer shadows this function's name.
    selector = RandomizedLogisticRegression(C=regp,scaling=alpha,fit_intercept=True,sample_fraction=0.75,n_resampling=200)

    # Fit exactly once: the original called fit() and then fit_transform(),
    # which ran the 200 resampling rounds twice and discarded the first fit.
    train_x = selector.fit_transform(train_x,train_y)
    cv_x = selector.transform(cv_x)
    test_x = selector.transform(test_x)

    return train_x,cv_x,test_x
Exemple #4
0
def predictWithQDA(config, X, Y, testFeatures):
    """Fit a QDA classifier on ``(X, Y)`` and predict ``testFeatures``.

    When the ``'model/qda'`` config section has a truthy ``'useRandomLog'``
    entry, a default ``RandomizedLogisticRegression`` fitted on ``(X, Y)``
    filters the columns of ``X`` and ``testFeatures`` first, unless the
    filtered training matrix comes back empty.  The optional ``'priors'``
    config entry is forwarded to ``QDA``.

    Returns:
        The result of ``clf.predict(testFeatures)``.
    """
    options = config.getConfig('model/qda')
    if options.get('useRandomLog', False):
        feature_filter = RandomizedLogisticRegression()
        feature_filter.fit(X, Y)
        filtered = feature_filter.transform(X)
        # Only adopt the filtered matrices when something survived selection.
        if filtered.size != 0:
            X = filtered
            testFeatures = feature_filter.transform(testFeatures)

    class_priors = options.get('priors', None)
    model = QDA(priors = class_priors)
    model.fit(X, Y)
    return model.predict(testFeatures)
Exemple #5
0
def test_rflasso():
    """Fit a randomized logistic regression on a train split and print the reduced shape.

    Splits the module-level ``index_data`` / ``index_lable`` 75/25 with a
    fixed seed, fits the selector on the training part only, and prints the
    shape of the training matrix after selection.
    """
    # Only the import that is actually used is kept; the original also imported
    # LogisticRegression, SelectFromModel, SVC and StratifiedKFold without
    # using them.
    from sklearn.linear_model import RandomizedLogisticRegression

    # The test halves of the split are not used by this check.
    train_X, _, train_Y, _ = train_test_split(index_data,
                                              index_lable,
                                              test_size=0.25,
                                              random_state=1)
    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    # Single-argument call form: same output on Python 2, valid on Python 3.
    print(XX.shape)
Exemple #6
0
def runTest(featmat_train, outcome_train_lbl, featmat_test, outcome_test_lbl,
            sel, paramsDict, bestmodelnum):
    """Select features per feature group, fit the final model and log one prediction.

    For every entry of the module-level ``suffix_list`` the matching columns
    (looked up in ``fgColIdxs``) are standardised, variance-thresholded and
    filtered with a randomized logistic regression.  The surviving columns of
    all groups are concatenated, filtered once more, and the classifier chosen
    by the module-level ``modelname`` is fitted on the result.  The prediction
    for the first test row is written as a one-line CSV under ``folderpath``.

    Args:
        featmat_train: training feature table exposing ``.values`` and
            ``.columns`` (presumably a pandas DataFrame - confirm with caller).
        outcome_train_lbl: training labels exposing ``.values``.
        featmat_test: test feature table exposing ``.values``.
        outcome_test_lbl: test labels exposing ``.values``; only the first
            entry is used as ``Y_true``.
        sel: feature-selection algorithm name; only ``"rlog"`` is supported.
        paramsDict: keyword arguments for ``RandomizedLogisticRegression``;
            its ``"C"`` and ``"n_jobs"`` entries are also used by the
            ``"LOGR"`` model.
        bestmodelnum: value embedded in the output file name.

    Returns:
        tuple: ``(Y_pred, Y_pred_proba)`` - the first predicted label and the
        second column of ``predict_proba`` for the first test row.

    Raises:
        ValueError: if ``sel`` or the module-level ``modelname`` is not
            recognised.  (The original ``raise ("...")`` raised a TypeError
            because a plain string is not an exception.)
    """
    print("Running Test for #{0} ({1})".format(TEST_PERSON_NUM,
                                               TEST_PERSON_DEVICE_ID))
    # Fail before any fitting is done instead of in the middle of the loop.
    if sel != "rlog":
        raise ValueError("Unrecognized sel (feature selection algorithm)")

    X_train_allfg = featmat_train.values
    Y_train = outcome_train_lbl.values
    featnames_allfg = featmat_train.columns
    X_test_allfg = featmat_test.values
    Y_test = outcome_test_lbl.values
    Y_true = Y_test[0]
    sel_featnames_per_fg = {}
    sel_featnames_list_ordered = []
    sel_X_train = []
    sel_X_test = []
    countNumSel = 0
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        fgidxs = fgColIdxs[suffix_list_str]
        X_train = X_train_allfg[:, fgidxs]
        X_test = X_test_allfg[:, fgidxs]
        featnames_fg = featnames_allfg[fgidxs]
        # Nothing to select from in an empty feature group.
        if X_train.shape[1] == 0:
            continue
        # Scaling: fitted on the training columns, applied to both sets.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # Variance thresholding; keep the names aligned with the kept columns.
        vartransform = VarianceThreshold()
        X_train = vartransform.fit_transform(X_train)
        X_test = vartransform.transform(X_test)
        varthres_support = vartransform.get_support()
        featnames_fg = featnames_fg[varthres_support]
        # Feature selection within this group.
        randomized_rlog = RandomizedLogisticRegression(**paramsDict)
        X_train = randomized_rlog.fit_transform(X_train, Y_train)
        X_test = randomized_rlog.transform(X_test)
        chosen_col_idxs = randomized_rlog.get_support()

        # NOTE(review): this checks the length of the support result, not
        # whether any feature was chosen - confirm that is the intent.
        if len(chosen_col_idxs) > 0:
            featnames_fg_chosen = list(featnames_fg[chosen_col_idxs])
            sel_featnames_per_fg[suffix_list_str] = featnames_fg_chosen
            sel_featnames_list_ordered.extend(featnames_fg_chosen)
            sel_X_train.append(X_train)
            sel_X_test.append(X_test)
            countNumSel = countNumSel + len(featnames_fg_chosen)

    # Second-stage selection over the union: sel{sel{fg1}.....sel{fgN}}
    X_train_concat = np.hstack(sel_X_train)
    X_test_concat = np.hstack(sel_X_test)
    print("\nSum of number of features selected from all fgs = {0}".format(
        countNumSel))
    print("Concatenated X_train has {0} features".format(
        X_train_concat.shape[1]))
    print("Concatenated X_test has {0} features".format(
        X_test_concat.shape[1]))
    randomized_rlog = RandomizedLogisticRegression(**paramsDict)
    X_train_concat = randomized_rlog.fit_transform(X_train_concat, Y_train)
    X_test_concat = randomized_rlog.transform(X_test_concat)
    chosen_col_idxs = np.array(randomized_rlog.get_support())
    chosen_cols_final = np.array(sel_featnames_list_ordered)[chosen_col_idxs]
    print("Final number of features in model = {0}".format(
        X_train_concat.shape[1]))

    # Final classifier, chosen by the module-level ``modelname``.
    if modelname == "GBC":
        clf = GradientBoostingClassifier(random_state=0)
    elif modelname == "LOGR":
        clf = LogisticRegression(random_state=0,
                                 C=paramsDict["C"],
                                 tol=1e-3,
                                 penalty="l1",
                                 n_jobs=paramsDict["n_jobs"],
                                 intercept_scaling=1,
                                 class_weight="balanced")
    else:
        raise ValueError("Unrecognized model name")
    clf.fit(X_train_concat, Y_train)
    pred = clf.predict(X_test_concat)
    pred_proba = clf.predict_proba(X_test_concat)
    Y_pred = pred[0]
    Y_pred_proba = pred_proba[0][1]

    # Logging to <did>_test_model<N>.csv - one row only.  Columns: did, sel,
    # selParams, Y_pred, Y_pred_proba, Y_true, final selection, and one
    # "sel_<group>" column per feature group.
    # ``.items()`` works on Python 2 and 3; ``.iteritems()`` is Python 2 only.
    chosen_cols_final_str = ",".join(chosen_cols_final)
    paramsDict_str = ','.join("%s:%r" % (key, val)
                              for (key, val) in paramsDict.items())
    fgIdxs_str = ','.join("%s:%r" % (key, val)
                          for (key, val) in fgIdxs.items())
    cnts_per_lbl_dict = getValueCounts(outcome_train_lbl, outcome_test_lbl)
    cnts_per_lbl_str = ','.join("%s:%r" % (key, val)
                                for (key, val) in cnts_per_lbl_dict.items())
    dfout = pd.DataFrame({
        "did": [TEST_PERSON_DEVICE_ID],
        "cnts_per_lbl": [cnts_per_lbl_str],
        "sel": [sel],
        "selParams": [paramsDict_str],
        "Y_pred": [Y_pred],
        "Y_pred_proba": [Y_pred_proba],
        "Y_true": [Y_true],
        "fgIdxs": [fgIdxs_str],
        "sel_final": [chosen_cols_final_str]
    })
    dfout = dfout.set_index("did")
    cols = [
        "cnts_per_lbl", "sel", "selParams", "Y_pred", "Y_pred_proba", "Y_true",
        "fgIdxs", "sel_final"
    ]
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        # Groups that were skipped or never recorded get an empty cell.
        sel_feats_fg_str = ",".join(
            sel_featnames_per_fg.get(suffix_list_str, []))
        colname = "sel_{0}".format(suffix_list_str)
        dfcol = pd.DataFrame({
            "did": [TEST_PERSON_DEVICE_ID],
            colname: [sel_feats_fg_str]
        })
        dfcol = dfcol.set_index("did")
        dfout = pd.concat([dfout, dfcol], axis=1)
        cols.append(colname)
    dfout.to_csv(
        folderpath +
        "{0}_test_model{1}.csv".format(TEST_PERSON_DEVICE_ID, bestmodelnum),
        columns=cols,
        header=True)
    print("{0} minutes elapsed since start of program ".format(
        (time.time() - STARTTIME) / 60.0))
    return (Y_pred, Y_pred_proba)
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import fmriUtils as fm  #自定义函数

# Script: select features with a randomized logistic regression, then report
# 10-fold cross-validated accuracy of a linear SVM on the selected features.
n_folds = 10

f = fm.outTo()  # redirect output to a file
X, y = fm.loadData2()
# NOTE(review): X2/y2 are loaded but not used below - confirm whether this
# second load is needed.
X2, y2 = fm.loadData2()

y = fm.defineClass(y)

# NOTE(review): the selector is fitted on all samples before the folds are
# made, so the held-out folds influence feature selection - confirm this is
# acceptable for the reported accuracy.
randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
randomized_logistic.fit(X, y)
XX = randomized_logistic.transform(X)
# Single-argument print calls: identical output on Python 2, valid on Python 3.
print("============选择后剩余的特征================")  # "features left after selection"
print(XX.shape)

yy = y
cv = StratifiedKFold(yy, n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    # Per-fold accuracy: correct predictions / fold size.
    cv_scores.append(np.sum(prediction == yy[test]) / float(np.size(yy[test])))

print("========分类准确率=======")  # "classification accuracy"
# Formats exactly like the Python 2 statement ``print a, b``.
print("%s %s" % (cv_scores, np.mean(cv_scores)))
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import fmriUtils as fm  #自定义函数

# Script: randomized-logistic-regression feature selection followed by a
# 10-fold cross-validated linear SVM on the selected columns.
n_folds = 10

f = fm.outTo()  # redirect output to a file
X, y = fm.loadData2()
# NOTE(review): this second load is never read below - confirm it is needed.
X2, y2 = fm.loadData2()

y = fm.defineClass(y)

# NOTE(review): selection is fitted on the full data set before splitting, so
# test folds take part in choosing the features - confirm this is intended.
randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
randomized_logistic.fit(X, y)
XX = randomized_logistic.transform(X)
# print(...) with one argument behaves the same on Python 2 and parses on
# Python 3, unlike the statement form.
print("============选择后剩余的特征================")  # "features left after selection"
print(XX.shape)

yy = y
cv = StratifiedKFold(yy, n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    # Fraction of correctly classified samples in this fold.
    cv_scores.append(np.sum(prediction == yy[test]) / float(np.size(yy[test])))

print("========分类准确率=======")  # "classification accuracy"
# Same text as the Python 2 statement ``print cv_scores, np.mean(cv_scores)``.
print("%s %s" % (cv_scores, np.mean(cv_scores)))
    cv=StratifiedKFold(9),
    n_permutations=2000,
    n_jobs=2)

# RandomizedLogisticRegression (RLR) feature selection
# Grid search for optimal RLR params
selection_threshold = np.arange(0.3, 0.9, 0.05)
# One dict per successful run.  The frame is built once after the loops:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.0.
rlr_rows = []

for st in selection_threshold:
    # The selector is randomized, so each threshold is evaluated 100 times and
    # the runs are averaged below.
    for i in range(100):
        print("Working on: %s (%d of 100)" % (st, (i + 1)))
        rlr = RandomizedLogisticRegression(
            n_resampling=5000, C=lr_mean.C, selection_threshold=st, n_jobs=2)
        rlr.fit(X, y)
        X_rlr = rlr.transform(X)

        # Skip runs in which the transformed matrix is empty.
        if not X_rlr.size:
            continue

        cv_scores_rlr = cross_val_score(
            lr_mean, X_rlr, y, scoring="roc_auc", cv=StratifiedKFold(9))

        rlr_tmp = {
            "st": st,
            "cv_score": cv_scores_rlr.mean(),
            "cv_std": cv_scores_rlr.std(),
            "n_features": sum(rlr.get_support())
        }
        rlr_rows.append(rlr_tmp)

rlr_grid_search = pd.DataFrame(rlr_rows)
rlr_grid_search_mean = rlr_grid_search.groupby(by="st").mean()
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

# Demo: select iris features with a randomized logistic regression, fit a
# plain logistic regression on them, and exercise transform/inverse_transform.
iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)
ff_model = RandomizedLogisticRegression() # Finds best set of features
X_new = ff_model.fit_transform(X, y)  # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)
print(X[0:4])
print(ff_model.transform(X[0:4]))  # Transform the first 4 rows of data to get only best features
model = LogisticRegression().fit(X_new, y) # Fit logistic regression with best features
print(model.predict_proba(ff_model.transform(X[0:4]))) # predict probabilities for first 4 rows of data
print(ff_model.inverse_transform(ff_model.transform(X[0:4]))) # Test inverse transforming
# Build the row of ones with as many columns as were actually selected; the
# selector is randomized, so a hard-coded width of 3 is not guaranteed to match.
arr = np.ones((1, X_new.shape[1]), dtype=int)
print(ff_model.inverse_transform(arr)) # Get original matrix structure with 1's only in columns of retained features.
Exemple #11
0
from __future__ import division
import numpy as np
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import LogisticRegression

# Script: rank features by randomized-logistic-regression score, then fit a
# logistic regression on the selected features and report test accuracy.
X = np.load("../feats/train_formatted.npy")
y = np.load("../feats/train_y.npy")
X_test = np.load("../feats/test_formatted.npy")
y_test = np.load("../feats/test_y.npy")

clf = RandomizedLogisticRegression()
clf.fit(X, y)
scores = clf.scores_
# Single-argument print calls: same output on Python 2, valid on Python 3.
print('Index    :   score')
# Feature indices ordered by descending score.
sortedIdx = [
    i[0] for i in sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
]
top = 30
# Slicing prints at most ``top`` rows and cannot run past the end when there
# are fewer than ``top`` features.
for idx in sortedIdx[:top]:
    print(str(idx) + ' :   ' + str(scores[idx]))

lr = LogisticRegression()
lr.fit(clf.transform(X), y)
pred = lr.predict(clf.transform(X_test))
# Explicit float division, so the result does not depend on a
# ``from __future__ import division`` line elsewhere in the file.
accuracy = np.sum(pred == y_test) / float(y_test.size)
print('Logistic Regression Accuracy: ' + str(accuracy))
Exemple #12
0
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.datasets import make_classification
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.datasets import make_regression
# Demo: compare an L1 logistic regression on all features with the same model
# on features kept by a randomized logistic regression, then print the
# per-feature scores of a randomized lasso on a synthetic regression problem.
X, y = make_classification(n_samples=100,
                           n_features=100,
                           n_informative=5,
                           n_redundant=2,
                           random_state=101)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.30, random_state=101)
classifier = LogisticRegression(C=0.1, penalty='l1', random_state=101)
classifier.fit(X_train, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test, y_test))
selector = RandomizedLogisticRegression(n_resampling=300, random_state=101)
selector.fit(X_train, y_train)
# Use the public get_support() accessor rather than the private
# _get_support_mask(); the printed value is the number of selected features.
print("Variance selected: %i" % sum(selector.get_support() != 0))
X_train_s = selector.transform(X_train)
X_test_s = selector.transform(X_test)
classifier.fit(X_train_s, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test_s, y_test))
XX, yy = make_regression(n_samples=100,
                         n_features=10,
                         n_informative=4,
                         random_state=101)
rlasso = RandomizedLasso()
rlasso.fit(XX, yy)
print(list(enumerate(rlasso.scores_)))
Exemple #13
0
from __future__ import division
import numpy as np
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import LogisticRegression

# Script: print the best-scoring features of a randomized logistic regression
# and the test accuracy of a logistic regression fitted on the selected ones.
X = np.load("../feats/train_formatted.npy")
y = np.load("../feats/train_y.npy")
X_test = np.load("../feats/test_formatted.npy")
y_test = np.load("../feats/test_y.npy")

clf = RandomizedLogisticRegression()
clf.fit(X, y) 
scores = clf.scores_
# print(...) with a single argument prints the same text on Python 2 and is
# valid syntax on Python 3.
print('Index    :   score')
# Feature indices ordered by descending score.
sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x:x[1], reverse=True)]
top = 30
# At most ``top`` rows; no IndexError when fewer features exist.
for idx in sortedIdx[:top]:
    print(str(idx) + ' :   ' + str(scores[idx]))

lr = LogisticRegression()
lr.fit(clf.transform(X), y)
pred = lr.predict(clf.transform(X_test))
# Float division made explicit so it does not rely on
# ``from __future__ import division`` being in effect.
accuracy = np.sum(pred == y_test) / float(y_test.size)
print('Logistic Regression Accuracy: ' + str(accuracy))

Exemple #14
0
	#print len(contentVectors[0])

	# feature selection: Tree classifier importance
	#clf = ExtraTreesClassifier()
	#selector = clf.fit(contentVectors , label)
	#contentVectors = selector.transform(contentVectors)

	# feature selection: SGDClassifier importance
	#contentVectors = SGDClassifier(loss="hinge", penalty="l1").fit_transform(contentVectors,label)

	# feature selection: SVM importance
	#selector =  svm.LinearSVC(C=1, penalty="l1", dual=False).fit(contentVectors,label)
	#contentVectors = selector.transform(contentVectors)

	selector =  RandomizedLogisticRegression().fit(contentVectors,label)
	contentVectors = selector.transform(contentVectors)

	# LARS feature selection
	
	#l1-based feature selection
	#contentVectors = SGDClassifier(loss="hinge", penalty="l1").fit_transform(contentVectors,label)
	#contentVectors =  svm.LinearSVC(C=1, penalty="l1", dual=False).fit_transform(contentVectors,label)
	
	#clf = svm.LinearSVC(C=1)
	#clf = SGDClassifier(loss="hinge", penalty="l1")
	print "Feature selection finished"

	# Cross validation
	#rfecv = RFECV(estimator=clf, step=2, cv=StratifiedKFold(label, 2),scoring='accuracy')
	#selector = rfecv.fit(contentVectors, label)
	#contentVectors = selector.transform(contentVectors)
Exemple #15
0
def GetAllPerf (filePaths=None):
    """Evaluate every training-feature file and save a summary table.

    For each file the features and labels are loaded, filtered with
    ``SelectKBest`` and ``SelectFwe``, and compared against a most-frequent
    dummy baseline.  ``ModelParam_GridSearch`` picks one estimator for F1 and
    one for accuracy; both are cross-validated on stratified shuffle splits
    and the scores are stored per file.  The table is written to
    ``OutputData.tsv`` (comma-separated, despite the extension).

    Args:
        filePaths: paths of the feature files to evaluate.  When ``None``,
            files named ``trainingSetFeatures.csv`` are searched under
            ``./test_seq`` with ``find_files``.

    Returns:
        None.  Results are printed and written to ``OutputData.tsv``.
    """
    # make_scorer is exposed by the public sklearn.metrics namespace; the
    # original imported it from "sklearn.metrics.score", which is not a module.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the output file.
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    # Custom F1 scorer used for model selection.  It does not depend on the
    # file being processed, so it is built once.  Other averaging options are
    # "weighted", "macro" or None.
    f1_average = "micro"
    f1_scorer = make_scorer(metrics.f1_score,
                            greater_is_better=True, average=f1_average)

    for i, filePath in enumerate(filePaths):
        # Normalise path separators (Windows backslash gotcha):
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i])

        print("fileName: %s" %(fileName))

        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict['LargestClassPercent'][fileName] = MajorityPercent
        resDict['Classes'][fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        # Temporary pre-filter that saves computation time; set to None to
        # disable it.
        KFilt=350

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)

        # Optional model-based feature selection, disabled by default.  Its
        # output X_L1 is only printed, never fed to the models below.
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM:
            # NOTE(review): the penalty is "l2" although the names and the
            # message below say L1 - confirm which one is intended.
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)

        # Optional recursive feature elimination, disabled by default.
        # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            svc = LinearSVC(class_weight='auto')

            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            # Reuse the selector fitted above so X_RFE has exactly the columns
            # just reported, instead of refitting with fit_transform.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)

            # The column is not part of the initial table; create it on first
            # use so the assignment cannot fail with a KeyError.
            if 'TopRFE-Features' not in resDict.columns:
                resDict['TopRFE-Features'] = None
            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)

        # Set to True if the performance of the RFE-reduced set is wanted.
        # NOTE(review): X_RFE only exists when one of the RFE flags is on.
        GetRFEPerf=False

        print()

        # Baseline: always predict the most frequent class.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))

        # A scorer expects (estimator, X, y); with labels and predictions in
        # hand the metric is called directly, using the scorer's averaging.
        # NOTE(review): the result column is named "..._weighted" while the
        # scorer averages "micro" - confirm which averaging is wanted.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average=f1_average))
        # Unweighted mean of the per-class F1 values.
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted

        # The best model is searched separately for F1 and for accuracy.
        # The custom scorer is passed instead of the 'f1' string as a
        # workaround for binary-only default F1 in old scikit-learn releases.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)
        print("bestEst (acc):",bestEst_acc)

        if GetRFEPerf:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # Cross-validate the two selected estimators.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        print()

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
Exemple #16
0
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression  #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

# Demo: randomized-logistic-regression feature selection on iris, a logistic
# regression on the kept features, and a round trip through inverse_transform.
iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)
ff_model = RandomizedLogisticRegression()  # Finds best set of features
X_new = ff_model.fit_transform(X, y)  # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)
print(X[0:4])
print(ff_model.transform(
    X[0:4]))  # Transform the first 4 rows of data to get only best features
model = LogisticRegression().fit(
    X_new, y)  # Fit logistic regression with best features
print(model.predict_proba(ff_model.transform(
    X[0:4])))  # predict probabilities for first 4 rows of data
print(ff_model.inverse_transform(ff_model.transform(
    X[0:4])))  # Test inverse transforming
# One row of ones, sized to the number of columns the selector actually kept;
# a fixed width of 3 breaks whenever the randomized selection keeps another
# number of features.
arr = np.ones((1, X_new.shape[1]), dtype=int)
print(
    ff_model.inverse_transform(arr)
)  # Get original matrix structure with 1's only in columns of retained features.
Exemple #17
0
def GetAllPerf(filePaths=None):
    """Benchmark every training-set file and save a per-file summary table.

    For each feature file this routine:

    1. loads the data with ``load_data``;
    2. filters features with ``SelectKBest`` (k=350) and then ``SelectFwe``
       (alpha=0.01);
    3. scores a most-frequent-class ``DummyClassifier`` baseline;
    4. picks one estimator per metric (f1, accuracy) with
       ``ModelParam_GridSearch``;
    5. cross-validates both estimators on 13 stratified shuffle splits
       (18% held out each time).

    The collected table is written to ``OutputData.tsv`` in the current
    working directory.

    NOTE: written against the legacy scikit-learn API
    (``StratifiedShuffleSplit(y, n_iter=...)``, ``class_weight='auto'``,
    ``LinearSVC.transform``).

    NOTE(review): results are stored with chained indexing
    (``resDict[col][fileName] = value``), which relies on pandas writing
    through to the frame; this does not hold under pandas copy-on-write.

    Args:
        filePaths: Sequence of feature-file paths. When ``None``, the files
            are discovered with ``find_files`` under ``./test_seq`` using the
            pattern ``trainingSetFeatures.csv``.

    Returns:
        None. Progress is printed and the results are saved to disk.
    """
    # Function-scope import, hoisted out of the per-file loop. The public
    # location of make_scorer is sklearn.metrics; the previously used
    # ``sklearn.metrics.score`` module path does not exist.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    print("FilePaths: \n",filePaths)
    fileNames = fileNameFromPaths(filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the output file.
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    # Averaging mode shared by the custom scorer and the dummy baseline.
    # Alternatives that were considered: "weighted", "macro", None.
    f1_average = "micro"
    # Custom F1 scorer: workaround for the default 'f1' scoring only covering
    # the positive class in the binary case (scikit-learn <= 0.15).
    f1_scorer = make_scorer(metrics.f1_score,
                            greater_is_better=True, average=f1_average)

    for i, filePath in enumerate(filePaths):
        # Normalise the path (backslashes in Windows file names):
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName = str(fileNames[i])

        print("fileName: %s" %(fileName))

        X, y, lb_encoder,featureNames = load_data(filePath) # X, y = features, labels
        print(X.shape,"= (samples, features)")

        # Share of the most common class. 100.0 forces true division even when
        # the module runs without ``from __future__ import division``.
        y_inv = Counter(lb_encoder.inverse_transform(y))
        majority_count = y_inv.most_common(1)[0][1]
        MajorityPercent = round(100.0 * majority_count / sum(y_inv.values()), 1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict['LargestClassPercent'][fileName] = MajorityPercent
        resDict['Classes'][fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName] = len(lb_encoder.classes_)

        # Temporary pre-filter for the outputs - saves computation time.
        # Barely filters compared to the model itself. Set to None to skip.
        KFilt = 350

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X = k.transform(X)
            featureNames = featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X = Fwe.transform(X)
        featureNames = featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)

        # Optional model-based feature selection (both switched off).
        # NOTE(review): these branches update featureNames but leave X
        # untouched (the reduced matrix only lives in X_L1), so the two go out
        # of sync if a flag is enabled - "Feature Names need updating!!".
        FeatSelection_SVM = False
        FeatSelection_RandLogReg = False

        if FeatSelection_RandLogReg:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames = featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM:
            svc_L1 = LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            # transform() takes (X, threshold); passing y as the second
            # argument would have used the labels as the selection threshold.
            X_L1 = svc_L1.transform(X)
            featureNames = featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)

        # EG - graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb

        RFE_FeatsToKeep = 16
        FeatSelection_RFE = False
        FeatSelection_RFECV = False
        X_RFE = None  # only populated when one of the RFE variants runs

        if FeatSelection_RFE or FeatSelection_RFECV:
            # RFE - best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')

            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = rfe_featnames
            print("RFE selected feature names:",rfe_featnames)
            # The selector is already fitted above; transform only, no refit.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)

            # 'TopRFE-Features' is not among the initial columns, so assign
            # through .loc, which creates the column on first use instead of
            # raising KeyError.
            resDict.loc[fileName, 'TopRFE-Features'] = str(rfe_featnames)

        # Set GetRFEPerf to True if the performance of the reduced set is wanted.
        GetRFEPerf = False

        # References:
        # Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # Baseline: always predict the most frequent class.
        # NOTE(review): Get_yPred is defined elsewhere; presumably it returns
        # cross-validated predictions for the given classifier - confirm.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))

        # A scorer built by make_scorer expects (estimator, X, y) and cannot
        # be called on (y_true, y_pred); call the metric directly with the
        # same averaging as f1_scorer. The column keeps its historical
        # "weighted" name so existing consumers of the output still work.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average=f1_average))
        # Unweighted mean of the per-class f1 scores.
        dummy_freq_f1_mean = (metrics.f1_score(y, y_dummyPred,average=None)).mean()
        print()

        resDict['dummy_freq:Accuracy'][fileName] = dummy_freq_acc
        resDict['dummy_freq:f1'][fileName] = dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName] = dummy_freq_f1_weighted

        # The best model is selected separately for f1 and for accuracy.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)
        print("bestEst (acc):",bestEst_acc)

        if GetRFEPerf and X_RFE is not None:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # Cross-validate each selected estimator on its own metric.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        # NOTE(review): evaluated with the built-in 'f1' scoring although the
        # estimator was selected with f1_scorer - confirm this is intended
        # for multi-class inputs.
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName] = round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName] = round(scores_acc.std(),4)
        resDict['f1'][fileName] = round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName] = round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName] = (scores_f1)
        resDict['Array-Acc-Scores'][fileName] = (scores_acc)
        resDict['bestML-f1'][fileName] = (str(bestEst_f1))
        resDict['bestML-Acc'][fileName] = (str(bestEst_acc))

        print()

    print("Saving results to file")
    # NOTE(review): the file is named .tsv but is written comma-separated;
    # kept unchanged so downstream readers of this file do not break.
    resDict.to_csv("OutputData.tsv", sep=',')