Code Example #1
File: mcnulty_analysis.py Project: jrcox/mcnulty
def func():
    for imbalance in imbalances:
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                         random_state=4444)
        
        X_train, y_train = imbalance.fit_sample(X_train, y_train)

        

        for clf in classifiers:
            print('-----------------')
            print("%s  "  %imbalance)
            print('-----------------')

            clf.fit(X_train, y_train)
            print('-----------------')
            print("%s   " %clf)
            print('-----------------')
            print("")

            print("Accuracy score", accuracy_score(y_test, clf.predict(X_test)))
            print('auc', roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))
            print("")

            print(classification_report(y_test, clf.predict(X_test)))
            print("")

            
            print('-----------------')
            best_dict[imbalance] = [clf, roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])]  # use probabilities for AUC, consistent with the printed score above
Code Example #2
def test_thresholded_scorers():
    """Test scorers that take thresholds."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = SCORERS['log_loss'](clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
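The test above relies on the built-in 'roc_auc' scorer, which feeds continuous scores (decision_function when available, otherwise predict_proba[:, 1]) into roc_auc_score rather than hard class predictions. As a minimal sketch, an equivalent scorer can be built by hand with make_scorer; this assumes a scikit-learn version in which make_scorer still accepts needs_threshold=True (newer releases use response_method instead).

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# needs_threshold=True makes the scorer pass ranking scores, not class labels,
# to roc_auc_score, matching the behaviour of SCORERS['roc_auc'] above.
auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)
print(auc_scorer(clf, X_test, y_test))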
Code Example #3
def randomforest(df1,df2):
	
	
	newsT=df1.L
	L= ['L']
	for x in L:
	 	del df1[x]
	news=df1
	TRAINING=df1.as_matrix(columns=None)
	TEST=newsT.as_matrix(columns=None)
	
	newsT=df2['L']
	L= ['L']
	for x in L:
	 	del df2[x]
	X_test=df2.as_matrix(columns=None)
	y_test=newsT.as_matrix(columns=None)

	clf = RandomForestClassifier(n_estimators=200)
	clf.fit(TRAINING, TEST)  # note: despite the name, TEST holds the training labels (column 'L')
	y_pred1 = clf.predict_proba(X_test)[:, 1]
	y_pred = clf.predict(X_test)
	recall_score(y_test, y_pred)
	precision_score(y_test, y_pred)
	precision_score(y_test, y_pred,pos_label=0)
	recall_score(y_test, y_pred,pos_label=0)
	roc_auc_score(y_test, y_pred1)
	print 'roc: ',roc_auc_score(y_test, y_pred1)
	print 'precision: ',precision_score(y_test, y_pred)
	print 'recall:', recall_score(y_test, y_pred)
	print 'precision Negatives: ',precision_score(y_test, y_pred,pos_label=0)
	print 'recall Negatives: ', recall_score(y_test, y_pred,pos_label=0)
	
	return roc_auc_score(y_test, y_pred1),precision_score(y_test, y_pred),recall_score(y_test, y_pred),precision_score(y_test, y_pred,pos_label=0), recall_score(y_test, y_pred,pos_label=0)
Code Example #4
def roc_score(predictions):
    logreg = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 1]])
    svm = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 2]])
    knn = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 3]])
    tree = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 4]])

    return {'logreg': logreg, 'svm': svm, 'knn': knn, 'tree': tree}
Code Example #5
File: execute.py Project: rahlk/Bellwether
    def ensemble_measure(lst, classifiers, weights):

        def norm_lst(lst):
            import numpy as np
            s = np.sum(lst)
            arr = np.array(lst) / s
            return np.nan_to_num(arr)

        tst = pd.DataFrame([t[0] for t in lst], columns=train.columns)
        X = tst[tst.columns[:-1]]
        y = tst[tst.columns[-1]]
        y_hat = []
        y_pred = []

        for clf in classifiers:
            y_hat.append(clf.decision_function(X))

        if len(y_hat) == 1:
            y = [1 if p == True else -1 for p in y]
            auc = roc_auc_score(y, y_hat[0])

        else:
            for pred, wgt in zip(y_hat, norm_lst(weights)):
                y_pred.append([wgt * p for p in pred])

            y_pred = np.sum(np.array(y_pred).T, axis=1)
            y = [1 if p == True else -1 for p in y]
            try:
                auc = roc_auc_score(y, y_pred)
            except:
                from pdb import set_trace  # local import so the snippet runs standalone
                set_trace()
        return auc
Code Example #6
def analysis(fold_val):
    total = 0
    df_val = dict()
    for f in fold_val:
        df_val[total] = pd.read_csv(f)
        auc = roc_auc_score(df_val[total]['isDuplicate'].values, df_val[total]['probability'].values)
        print('Auc for experiment {}: {}'.format(total, auc))
        total += 1

    df_mean = df_val[0].copy()
    for i in range(1, len(fold_val)):
        df_mean['probability'] += df_val[i]['probability']
    df_mean['probability'] /= len(fold_val)
    auc = roc_auc_score(df_mean['isDuplicate'].values, df_mean['probability'].values)
    print('Auc for mean: {}'.format(auc))

    alls = []
    x0 = []
    for i in range(0, len(fold_val)):
        val = 'probability' + str(i)
        alls.append(val)
        df_mean[val] = df_val[i]['probability']
        x0.append(1.0)
    df_mean['probability_median'] = df_mean[alls].median(axis=1)
    auc = roc_auc_score(df_mean['isDuplicate'].values, df_mean['probability_median'].values)
    print('Auc for median: {}'.format(auc))

    res = minimize(get_ensemble_score, x0, args=(df_mean,), method='Nelder-Mead', options={'xtol': 1e-8, 'disp': True})
    print(res)
    return res.x
Code Example #7
File: lr_model.py Project: dier111320/LR_MODEL
def Classification(df_detail,features,featurey,feature_selection):

    df_X=df_detail[features]
    # print df_X.isnull().values.any()
    X=numpy.array(df_X)

    # from sklearn.preprocessing import StandardScaler
        # scaler = StandardScaler()
        # X = scaler.fit_transform(X)

    Y=list(df_detail[featurey])


    # print 'lenY',len(Y)


    # ############################################
    # #  classification
    # ############################################
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    # print 'X_train[1]:',X_train[1]
    # print 'y_train[1]:',y_train[1]


    #####LR######
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    expected = y_test
    predicted = lr.predict(X_test)
    answer=lr.predict_proba(X_test)
    if feature_selection==True:
        prob_auc=pd.DataFrame({'feature':features,
                               'auc':roc_auc_score(numpy.array( map(int, y_test)), answer[:,1])})
        return prob_auc
    else:
        print '=====LogisticRegression======'
        print '1/0 in train:%d/%d\t1/0 in test:%d/%d'%(y_train.count('1'),y_train.count('0'),y_test.count('1'),y_test.count('0'))
        print 'N_train:N_test= %d:%d'%(len(y_train),len(y_test))
        print(metrics.classification_report(expected, predicted))
        print(metrics.confusion_matrix(expected, predicted))
        print 'lr.score=',lr.score(X_test,y_test)
        print 'lr.auc_score=',roc_auc_score(numpy.array( map(int, y_test)), answer[:,1])

    # print '****lr.coef_****'
    # print lr.coef_


        lr_coef=pd.DataFrame(lr.coef_)
        lr_coef.to_csv('lr_coef_new.txt',sep='\t' ,index=False, header=False)
        lr_intercept=pd.DataFrame(lr.intercept_)
        lr_intercept.to_csv('lr_intercept_new.txt',sep='\t' ,index=False, header=False)

        test_pair=pd.concat([pd.DataFrame(y_test),pd.DataFrame(answer),pd.DataFrame(X_test)],axis=1)
        test_pair.to_csv('test_pair_new.txt',sep='\t' ,index=False, header=False)


        feature_imp=pd.DataFrame(lr.coef_[0])
        # print feature_imp
        feature_imp.to_csv('feature_imp.txt',sep='\t', mode='a',index=False, header=False)
Code Example #8
File: predict.py Project: invisibleroads/noccn
    def report(self):
        from sklearn.metrics import roc_auc_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix

        y_pred_probas, y_true = self.make_predictions()[:2]
        y_pred = y_pred_probas.argmax(1)
        y_pred_probas = y_pred_probas[:, 1]
        y_true = y_true.reshape(-1)

        try:
            score = roc_auc_score(y_true, y_pred_probas)
        except ValueError:
            pass
        else:
            print
            print "AUC score:", score
            print "AUC score (binary):", roc_auc_score(y_true, y_pred)
            print

        print "Classification report:"
        print classification_report(y_true, y_pred)
        print

        print "Confusion matrix:"
        print confusion_matrix(y_true, y_pred)
        print
Code Example #9
File: learnData.py Project: Mathieu-Seurin/dat-eeg
def getScores(y, yPredTrain, yTest, yPredTest):

    scores = dict()

    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)


    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    

    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    

    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)

    proba = float(len(np.where(y==1)[0]))/len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    
    return scores
Code Example #10
File: scoringfunctions.py Project: messiah1349/work
def giniGrowth(df,woeVarsInfo,badFlag):
    woeTable = woeVarsInfo.copy()
    woeTable.variable = woeTable.variable.apply(lambda x: x + '_WOE')
    IV = getIVfromWOE(woeTable)
    columns = IV.variable
    columnsForModeking = []
    giniTest = []
    giniTrain = []
    y = df[badFlag].values
    for col in columns:
        columnsForModeking.append(col)
        X = df[columnsForModeking].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=3)
        lr = LogisticRegression()
        lr.fit(X_train,y_train)
        pr_test = lr.predict_proba(X_test)[:,1]
        pr_train = lr.predict_proba(X_train)[:,1]
        rocGiniTest =  met.roc_auc_score(y_test,pr_test) * 2 - 1
        rocGiniTrain =  met.roc_auc_score(y_train,pr_train) * 2 - 1
        giniTest.append(rocGiniTest)
        giniTrain.append(rocGiniTrain)
    trainDiff = [x-y for x,y in zip(giniTrain,[0]+giniTrain[:-1])]
    testDiff = [x-y for x,y in zip(giniTest,[0]+giniTest[:-1])]
    dfOut = pd.DataFrame({'variable':columns, 'giniTrain' : giniTrain,'giniTest': giniTest,'trainDiff':trainDiff,'testDiff':testDiff,'informationValue':list(IV.InformationValue)})
    dfOut[['trainDiff','testDiff']] = dfOut[['trainDiff','testDiff']]#.apply('${:,.2f}'.format)
    dfOut = dfOut.reindex_axis(['variable','informationValue','testDiff','trainDiff','giniTest','giniTrain'],axis=1)
    return dfOut
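giniGrowth above converts each ROC AUC into a Gini coefficient via gini = 2 * auc - 1. A small illustrative check of that conversion, using the toy labels and scores from the scikit-learn documentation rather than this project's data:

from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]
auc = roc_auc_score(y_true, y_score)   # 0.75
gini = 2 * auc - 1                     # 0.5
print(auc, gini)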
Code Example #11
def compare_models(xtraindata, ytraindata):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC, LinearSVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA

    classifier_dict = {
                #'linSVC': LinearSVC(),
                #'kNC5': KNeighborsClassifier(),
                #'kNC6': KNeighborsClassifier(6),
                #'SVC': SVC(kernel="linear", C=0.025),
                #'DT': DecisionTreeClassifier(max_depth=5),
                #'RF200': RandomForestClassifier(n_estimators=200, n_jobs=-1),
                'RF400gini': RandomForestClassifier(n_estimators=400, n_jobs=-1),
                'RF400entropy': RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion='entropy'),
                #'RF800': RandomForestClassifier(n_estimators=800, n_jobs=-1),
                #'RF1000': RandomForestClassifier(n_estimators=1000, n_jobs=-1),
                'Ada': AdaBoostClassifier(),
                #'SVClin': SVC(kernel='linear'),
                #'SVCpoly': SVC(kernel='poly'),
                #'SVCsigmoid': SVC(kernel='sigmoid'),
                'Gauss': GaussianNB(),
                'LDA': LDA(),
                #'QDA': QDA(),
                'SVC': SVC(),
               }

    results = {}
    ytrain_vals = []
    ytest_vals = []
    randint = reduce(lambda x,y: x|y, [ord(x)<<(n*8) for (n,x) in enumerate(os.urandom(4))])
    xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(xtraindata,
                                                                     ytraindata,
                                                                     test_size=0.4, random_state=randint)
    scale = StandardScaler()
    xTrain = scale.fit_transform(xTrain)
    xTest = scale.transform(xTest)

    for name, model in sorted(classifier_dict.items()):
        model.fit(xTrain, yTrain)
        ytrpred = model.predict(xTrain)
        ytpred = model.predict(xTest)
        results[name] = roc_auc_score(yTest, ytpred)
        ytrain_vals.append(ytrpred)
        ytest_vals.append(ytpred)
        print name, results[name], ytest_vals[-1]
    print '\n\n\n'

    print 'shape3', xTrain.shape, xTest.shape, ytrain_vals[0].shape, ytest_vals[0].shape
    xTrain = np.hstack([xTrain]+[y.reshape(xTrain.shape[0],1) for y in ytrain_vals])
    xTest = np.hstack([xTest]+[y.reshape(xTest.shape[0],1) for y in ytest_vals])

    print '\n\n\n'
    model = RandomForestClassifier(n_estimators=400, n_jobs=-1)
    model.fit(xTrain, yTrain)
    ytpred = model.predict(xTest)
    print 'RF400', roc_auc_score(yTest, ytpred)
Code Example #12
def print_metrics(y_test, y_pred, y_baseline):

    # clf_score       = metrics.log_loss(y_test, y_pred)
    # baseline_score  = metrics.log_loss(y_test, y_baseline)
    # never_score     = metrics.log_loss(y_test, np.zeros(y_test.shape))
    # always_score    = metrics.log_loss(y_test, np.ones(y_test.shape))
    #
    # print("-----")
    # print("log-loss score of classifier: "  + str(clf_score))
    # print("log-loss score of baseline: "    + str(baseline_score))
    # print("log-loss score of never: "       + str(never_score))
    # print("log-loss score of always: "      + str(always_score))
    #
    # clf_score       = metrics.brier_score_loss(y_test, y_pred)
    # baseline_score  = metrics.brier_score_loss(y_test, y_baseline)
    # never_score     = metrics.brier_score_loss(y_test, np.zeros(y_test.shape))
    # always_score    = metrics.brier_score_loss(y_test, np.ones(y_test.shape))
    #
    # print("-----")
    # print("Brier loss of classifier: "  + str(clf_score))
    # print("Brier loss of baseline: "    + str(baseline_score))
    # print("Brier loss of never: "       + str(never_score))
    # print("Brier loss of always: "      + str(always_score))

    clf_score       = metrics.roc_auc_score(y_test, y_pred)
    baseline_score  = metrics.roc_auc_score(y_test, y_baseline)
    never_score     = metrics.roc_auc_score(y_test, np.zeros(y_test.shape))
    always_score    = metrics.roc_auc_score(y_test, np.ones(y_test.shape))

    print("-----")
    print("ROC AUC of classifier: "     + str(clf_score))
    print("ROC AUC score of baseline: " + str(baseline_score))
    print("ROC AUC score of never: "    + str(never_score))
    print("ROC AUC score of always: "   + str(always_score))
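The "never" and "always" baselines above pass constant arrays as scores. With a constant score every sample is ranked identically, so roc_auc_score returns the chance level of 0.5 for both baselines; a quick illustrative check with made-up labels:

import numpy as np
from sklearn.metrics import roc_auc_score

y = np.array([0, 1, 0, 1])
print(roc_auc_score(y, np.zeros_like(y, dtype=float)))  # 0.5
print(roc_auc_score(y, np.ones_like(y, dtype=float)))   # 0.5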
Code Example #13
File: ThreeMonth2.py Project: duxuhao/wo_plus
def predict(fea, df, t, t9):
    Un = df.columns == 'Blank'
    for f in fea:
        '''        
        try:
            df[(f+'_y')] = df[(f+'_x')] - df[(f+'_y')]
            print(1)
        except:
            pass
        '''
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f+'_x'))
        Un = Un | (df.columns == (f+'_y'))
    Un = Un & (df.columns != 'New_y')    
    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].ix[:,Un]
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.9, random_state = 1)
    clf.fit(X_train, y_train)
    re = 'Testing AUC: \t' + str(roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))  
    print re
    re =  'September AUC: \t' + str(roc_auc_score(df[t9].label,clf.predict_proba(df[t9].ix[:,Un])[:,1]))
    print re
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
Code Example #14
File: evaluator.py Project: thran/experiments2.0
    def _basic_metrics(self, data, brier_bins=20, prediction_column="prediction", observation_column="correct", brier_min=0, brier_max=1):
        report = {}

        n = 0           # log count
        sse = 0         # sum of square error
        llsum = 0       # log-likelihood sum
        brier_counts = np.zeros(brier_bins)          # count of answers in bins
        brier_correct = np.zeros(brier_bins)        # sum of correct answers in bins
        brier_prediction = np.zeros(brier_bins)     # sum of predictions in bins

        for log in data:
            n += 1
            sse += (log[prediction_column] - log[observation_column]) ** 2
            llsum += math.log(max(0.0001, log[prediction_column] if log[observation_column] else (1 - log[prediction_column])))

            # brier
            bin = min(int((log[prediction_column] - brier_min) / (brier_max - brier_min) * brier_bins), brier_bins - 1)
            brier_counts[bin] += 1
            brier_correct[bin] += log[observation_column]
            brier_prediction[bin] += log[prediction_column]

        answer_mean = sum(brier_correct) / n

        report["extra"] = {"answer_mean": answer_mean}
        report["rmse"] = math.sqrt(sse / n)
        report["log-likely-hood"] = llsum
        if observation_column == "correct":
            try:
                report["AUC"] = metrics.roc_auc_score(self._data.get_dataframe_test()[observation_column],
                                                      self._data.get_dataframe_test()[prediction_column])
            except ValueError:
                print("AUC - converting responses to 0, 1")
                report["AUC"] = metrics.roc_auc_score(self._data.get_dataframe_test()[observation_column] > 0,
                                                      self._data.get_dataframe_test()[prediction_column])

        # brier
        brier_prediction_means = brier_prediction / brier_counts
        brier_prediction_means[np.isnan(brier_prediction_means)] = \
            ((np.arange(brier_bins) + 0.5) / brier_bins)[np.isnan(brier_prediction_means)]
        brier_correct_means = brier_correct / brier_counts
        brier_correct_means[np.isnan(brier_correct_means)] = 0
        brier = {
            "reliability":  sum(brier_counts * (brier_correct_means - brier_prediction_means) ** 2) / n,
            "resolution":  sum(brier_counts * (brier_correct_means - answer_mean) ** 2) / n,
            "uncertainty": answer_mean * (1 - answer_mean),

        }
        report["brier"] = brier

        report["extra"]["brier"] = {
            "max": brier_max,
            "min": brier_min,
            "bin_count": brier_bins,
            "bin_counts": list(brier_counts),
            "bin_prediction_means": list(brier_prediction_means),
            "bin_correct_means": list(brier_correct_means),
        }
        report["evaluated"] = True

        return report
Code Example #15
def main():
    n_folds = 10
    n_genes, n_terms = dicty[gene][go_term][0].data.shape

    for t, term_idx in enumerate(range(n_terms)):
        term = dicty[gene][go_term][0].col_names[term_idx]
        print("Term: %s" % term)
        y_true = dicty[gene][go_term][0].data[:, term_idx]

        cls_size = int(y_true.sum())
        if cls_size > n_genes - 20 or cls_size < 20:
            continue

        cv = cross_validation.StratifiedKFold(y_true, n_folds=n_folds)
        y_pred_mf = np.zeros_like(y_true)
        y_pred_rf = np.zeros_like(y_true)
        for i, (train_idx, test_idx) in enumerate(cv):
            print("\tFold: %d" % (i+1))
            # Let's make predictions from fused data representation
            y_pred_mf[test_idx] = mf(train_idx, test_idx, term_idx)
            # Let's make predictions from raw data
            y_pred_rf[test_idx] = rf(train_idx, test_idx, term_idx)

        mfa = metrics.roc_auc_score(y_true, y_pred_mf)
        rfa = metrics.roc_auc_score(y_true, y_pred_rf)
        print("(%2d/%2d): %10s MF: %0.3f RF: %0.3f" % (t+1, n_terms, term, mfa, rfa))
Code Example #16
File: module3_bagging.py Project: mircean/ML
def func_cv_2(X, y, folds, model, verbose, seed):
    scores = []
    for train, test in folds:
        print('**func_cv_2 Fold', 1 + len(scores), 'of', n_folds_outer)
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        #todo: pass folds to predict?
        y_predicted = np.zeros(len(y_test))
        for i in range(len(models)):
            y_predicted_per_model = my_predict(X_train, y_train, X_test, n_iterations_inner, n_folds_inner, func_predict_1, models[i], verbose=False)
            score_per_model = roc_auc_score(y_test, y_predicted_per_model)
            print('model', i, score_per_model, models[i])

            if i == 0:
                y_predicted = y_predicted_per_model
            elif i <= 3:
                y_predicted += 1/3*y_predicted_per_model
            else:
                foo += 1

        if len(models) == 4:
            y_predicted = y_predicted/2
        else:
            foo += 1

        score = roc_auc_score(y_test, y_predicted)
        print('**func_cv_2 auc score for combined model', score)
        scores.append(score)

    scores = np.array(scores)
    print('****func_cv_2: mean, std', scores.mean(), scores.std())
    return scores.mean()
Code Example #17
 def on_epoch_end(self, batch, logs={}):
     # losses
     self.losses_train.append(self.model.evaluate(X_train, Y_train, batch_size=128,verbose =0))
     self.losses_val.append(self.model.evaluate(X_val, Y_val, batch_size=128,verbose = 0))
     
     # Roc train
     train_preds = self.model.predict_proba(X_train, verbose=0)
     train_preds = train_preds[:, 1]
     roc_train = metrics.roc_auc_score(y_train, train_preds)
     self.roc_train.append(roc_train)
     
     # Roc val
     val_preds = self.model.predict_proba(X_val, verbose=0)
     val_preds = val_preds[:, 1]
     roc_val = metrics.roc_auc_score(y_val, val_preds)
     self.roc_val.append(roc_val)
     
     # Metrics train
     y_preds = self.model.predict_classes(X_train,verbose = 0)
     self.f1_train.append(metrics.f1_score(y_train,y_preds))
     self.recal_train.append(metrics.recall_score(y_train,y_preds))
     self.preci_train.append(metrics.precision_score(y_train,y_preds))
     
     # Metrics val
     y_preds = self.model.predict_classes(X_val,verbose =0)
     self.f1_val.append(metrics.f1_score(y_val,y_preds))
     self.recal_val.append(metrics.recall_score(y_val,y_preds))
     self.preci_val.append(metrics.precision_score(y_val,y_preds))
Code Example #18
File: ThreeMonth.py Project: duxuhao/wo_plus
def predict(fea1,fea2, df, t, t9):
    n = 0
    weight = [0.73,0.27]
    tave = np.zeros(len(df[t9]))
    y = df[t].label
    X_1 = df[t]
    df9 = df[t9]
    for fea in [fea1,fea2]:
        Un = df.columns == 'Blank'
        for f in fea:
            Un = Un | (df.columns == f)
            Un = Un | (df.columns == (f+'_x'))
            Un = Un | (df.columns == (f+'_y'))
        Un = Un & (df.columns != 'quarterly_attrition_rate_y')
        clf = GradientBoostingClassifier()
        X = X_1.ix[:,Un]
        X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.9, random_state = 1)  
        min_max_scaler = preprocessing.MinMaxScaler()
        clf.fit(min_max_scaler.fit_transform(X_train), y_train)
        re = 'Testing AUC: \t' + str(roc_auc_score(y_test,clf.predict_proba(min_max_scaler.transform(X_test))[:,1]))
        print re
        t = clf.predict_proba(min_max_scaler.fit_transform(df9.ix[:,Un]))[:,1]
        re =  'September AUC: \t' + str(roc_auc_score(df9.label,t))
        print re
        tave = t * weight[n] + tave
        n += 1
        
    
    print '-' * 30
    print(weight)
    print 'Total AUC'
    re =  'September AUC: \t' + str(roc_auc_score(df9.label,tave))
    print re
    return Un, clf
Code Example #19
File: Model.py Project: Ewen2015/Kaggle
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """ Fit models w/ parameters """

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(
            dtrain[predictors].values, label=dtrain[target].values)
        xgtest = xgb.DMatrix(dtest[predictors].values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(cvresult)

    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    dtest['predprob'] = alg.predict_proba(dtest[predictors])[:, 1]

    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(
        dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" %
          metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    print('AUC Score (Test): %f' %
          metrics.roc_auc_score(dtest[target], dtest['predprob']))

    feat_imp = pd.Series(alg.booster().get_fscore()
                         ).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
Code Example #20
File: main.py Project: pkravik/kaggle
def run_svm_model(df, continuous_vars, categorical_vars, outcome_var):

    X_train, y_train, X_test, y_test = test_train_data(df, continuous_vars, categorical_vars, outcome_var, categorical='continuous', seed = 124, test_pct=0)

    scale = StandardScaler()
    X_train = scale.fit_transform(X_train)

    svc = SVC(probability=True, cache_size=500)

    params = [{'C': [10, 5, 1, 0.1, 0.001],
               'class_weight': ['auto'],
               'kernel': ['rbf'],
               'gamma': [0, 0.1, 0.01]}]

    cv_strat = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=98)

    clf = GridSearchCV(svc, params, scoring='roc_auc', cv=cv_strat, n_jobs=3, verbose=1, iid=False)

    clf.fit(X_train, y_train)

    clf.grid_scores_
    clf.best_estimator_
    clf.best_params_
    clf.best_score_

    roc_auc_score(y_test, clf.best_estimator_.predict_proba(scale.transform(X_test))[:,1])

    svc_predict = clf.best_estimator_.predict_proba(scale.transform(X_test))[:,1]
Code Example #21
File: learn.py Project: sankroh/infiniband
def evaluate_fold(clf, X_train, y_train, X_test, y_test):
    """
    This is the business section
    """
    tmp = dict()
    tmp['X_train.shape'] = X_train.shape
    tmp['X_test.shape'] = X_test.shape
    try:
        pred_test = clf.predict_proba(X_test)
        pred_train = clf.predict_proba(X_train)
        tmp['roc'] = roc_info(y_test, pred_test[:,1])   
        tmp['roc_area'] = roc_auc_score(y_test, pred_test[:,1])
        pred_test = clf.predict(X_test)
        pred_train = clf.predict(X_train)
        tmp['f1_test'] = f1_score(y_test, pred_test, pos_label=1)        
        tmp['f1_train'] = f1_score(y_train, pred_train, pos_label=1) 

    except (AttributeError, NotImplementedError):
        pred_test = clf.predict(X_test)
        pred_train = clf.predict(X_train)
        tmp['roc'] = roc_info(y_test, pred_test)
        tmp['roc_area'] = roc_auc_score(y_test, pred_test)
        tmp['f1_test'] = f1_score(y_test, pred_test, pos_label=1)        
        tmp['f1_train'] = f1_score(y_train, pred_train, pos_label=1) 

    return tmp
Code Example #22
def do_all_study(X,y):
    
    names = [ "Decision Tree","Gradient Boosting",
             "Random Forest", "AdaBoost", "Naive Bayes"]

    classifiers = [
        #SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]
    for name, clf in zip(names, classifiers):
        estimator,score = plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


    clf_GBC = GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]

    plot_validation_curve(clf_GBC, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_GBC.fit(X_train,y_train)
    y_pred_GBC = clf_GBC.predict_proba(X_test)[:,1]
    print("ROC AUC GradientBoostingClassifier: %0.4f" % roc_auc_score(y_test, y_pred_GBC))

    clf_AB = AdaBoostClassifier()
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]

    plot_validation_curve(clf_AB, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_AB.fit(X_train,y_train)
    y_pred_AB = clf_AB.predict_proba(X_test)[:,1]
    print("ROC AUC AdaBoost: %0.4f" % roc_auc_score(y_test, y_pred_AB))
Code Example #23
def exercise_3():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute, return_attribute_names=True)


    kf = cross_validation.ShuffleSplit(len(X),n_iter=10, test_size=0.1, train_size=0.9, random_state=0)
    error = []
    error_cart = []
    error_mean = []
    error_mean_cart = []

    clf = RandomForestClassifier(n_estimators=100, oob_score=True,
                                   max_features="auto",
                                   random_state=0)
    clf_cart = DecisionTreeClassifier()
    error_mean = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)

        error_mean.append( roc_auc_score(y_test, clf.predict(X_test)) )
        error_mean_cart.append( roc_auc_score(y_test, clf_cart.predict(X_test)) )

    error.append( np.array(error_mean).mean() )
    error_cart.append( np.array(error_mean_cart).mean() )

    print 'Error RandomForest: ', error
    print 'Error CART: ', error_cart
Code Example #24
File: dnn.py Project: ktaneishi/DLVS
 def on_epoch_end(self, epoch, logs={}):
     train_x, train_y = self.train_data
     train_y_score = self.model.predict_proba(train_x, verbose=0)
     test_x, test_y = self.test_data
     test_y_score = self.model.predict_proba(test_x, verbose=0)
     logs['auc'] = roc_auc_score(test_y, test_y_score) 
     print('train roc_auc %.3f, test roc_auc %.3f\n' % (roc_auc_score(train_y, train_y_score), roc_auc_score(test_y, test_y_score)))
Code Example #25
    def process():

        data = load_training_data(settings, target, pipeline, strategy=strategy, cv_fold_number=fold, check_only=False, quiet=quiet)

        if feature_mask is not None:
            s = [slice(None),] * data.X_train.ndim
            s[-1] = np.where(np.array(feature_mask) == True)[0]
            data['X_train'] = data.X_train[s]
            data['X_cv'] = data.X_cv[s]
            if not quiet: print ' feature mask', 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape

        train(classifier, data, quiet=quiet)
        if not quiet: print "Making predictions...",
        timer = time.Timer()
        mean_predictions, median_predictions, raw_predictions = make_predictions(classifier, data.X_cv, data.num_cv_segments)
        if not quiet: print timer.pretty_str()

        mean_score = roc_auc_score(data.y_cv, mean_predictions)
        median_score = roc_auc_score(data.y_cv, median_predictions)

        return jsdict({
            'mean_score': mean_score,
            'median_score': median_score,
            'mean_predictions': mean_predictions,
            'median_predictions': median_predictions,
            'y_cv': data.y_cv
        })
Code Example #26
def show_roc(fold, targets, pred):
    # print 'fold : ',fold
    # print 'Size of targets : ',len(targets)
    # print 'Size of predictions : ', len(pred)
    roc_labels = []
    for t in targets:
        if t > 0.0:
            roc_labels.append(1)
        else:
            roc_labels.append(0)
    print roc_auc_score(roc_labels, pred)
    # plots
    fpr, tpr, thresholds = roc_curve(roc_labels, pred)
    
    roc_auc = auc(fpr, tpr)
    # print fpr, ' , ', tpr, ' , ', roc_auc
    print fold,' , ',roc_auc
    
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (fold, roc_auc))
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
Code Example #27
File: TrainModel.py Project: kumaran-5555/ML
    def evalmetric(pred, truth):
        return 'auc_mine', metrics.roc_auc_score(truth.get_label(), pred)

        # NOTE: the early return above means the MCC threshold search below never runs.
        thresholds =  np.arange(99.6, 99.9, 0.025)
        bestScore =  0
        bestT = 0
        bestAuc = 0
        bestCf = np.zeros((2,2))

        thresholds = [0.10]
        for t in thresholds:
            temp = np.copy(pred)
            temp[np.where(pred > np.percentile(pred, t))] = 1
            temp[np.where(pred <= np.percentile(pred, t))] = 0
            score = metrics.matthews_corrcoef(truth.get_label(), temp)
            

            if score > bestScore:
                bestScore = score
                bestT = np.percentile(pred, t)
                bestAuc = metrics.roc_auc_score(truth.get_label(), temp, reorder=True)
                bestCf = metrics.confusion_matrix(truth.get_label(), temp)

        
        print('threshold {} mcc {} auc {} TN {} FP {} FN {} TP {}\n'.format(bestT, bestScore, bestAuc, bestCf[0][0], bestCf[0][1], bestCf[1][0], bestCf[1][1]))

        return 'mcc', -1 * bestScore
Code Example #28
def train_and_evaluate():
    nn_training_error = 0
    nn_test_error = 0
    training_error = 0
    test_error = 0

    for train, test in ss:
        # Train NN
        nn.initialize(x[train])
        #print 'NN pre-training train error: %f' % metrics.mean_absolute_error(y[train], nn.predict(x[train]).reshape(x[train].shape[0],))
        #print 'NN pre-training f1 score: %f' %metrics.f1_score(y[train], preprocessing.Binarizer(threshold=0.5).transform(nn.predict(x[train])).T)
        #print 'NN pre-training auc score: %f' %metrics.roc_auc_score(y[train], nn.predict(x[train]).T)
        
        nn.train(x[train], y[train], passes=500, alpha=0.7, lam=0.0)

        cat=1
        nn_training_auc = metrics.roc_auc_score(y[train][:,cat], nn.predict(x[train]).T[:,cat])
        nn_test_auc = metrics.roc_auc_score(y[test][:,cat], nn.predict(x[test]).T[:,cat])
        nn_training_error = metrics.f1_score(y[train][:,cat], preprocessing.Binarizer(threshold=0.5).transform(nn.predict(x[train])).T[:,cat])
        nn_test_error = metrics.f1_score(y[test][:,cat], preprocessing.Binarizer(threshold=0.5).transform(nn.predict(x[test])).T[:,cat])

        #nn_training_error += metrics.mean_absolute_error(y[train], nn.predict(x[train]).reshape(x[train].shape[0],))
        #nn_test_error += metrics.mean_absolute_error(y[test], nn.predict(x[test]).reshape(x[test].shape[0],))



        print 'NN F1: (Training) %f, (Test) %f' %(nn_training_error, nn_test_error)
        print 'NN AUC: (Training) %f, (Test) %f' %(nn_training_auc, nn_test_auc)
Code Example #29
File: Part1.py Project: katherinez22/projects
def modelSelection(x_train, y_train, x_test, y_test, model, n_folds):
    """
    Select various models and return the AUCs of training and test sets and predicted offer acceptance probabilities.
    """
    if model == "Random Forest":
        clf = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=0, min_samples_split=1)
    elif model == "Logistic Regression L1":
        clf = LogisticRegression(penalty='l1', random_state=0, class_weight='auto')
    elif model == "Logistic Regression L2":
        clf = LogisticRegression(penalty='l2', random_state=0, class_weight='auto')
    elif model == "Decision Tree":
        clf = DecisionTreeClassifier(random_state=0)
    elif model == "Naive Bayes":
        clf = GaussianNB()
    elif model == "KNN":
        clf = KNeighborsClassifier(n_neighbors=10)
    # Perform cross-validation on training dataset and calculate AUC
    cv = StratifiedKFold(y_train, n_folds=n_folds)
    auc_train = []
    auc_validation = []
    auc_test = []
    pred_prob = []
    for i, (train, validation) in enumerate(cv):
        clf = clf.fit(x_train[train], y_train[train])
        auc_train.append(metrics.roc_auc_score(y_train[train], clf.predict_proba(x_train[train])[:, 1]))
        auc_validation.append(metrics.roc_auc_score(y_train[validation], clf.predict_proba(x_train[validation])[:, 1]))
        auc_test.append(metrics.roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]))
        pred_prob.append(clf.predict_proba(x_test)[:, 1])
    return np.mean(auc_train), np.mean(auc_validation), np.mean(auc_test), np.mean(pred_prob, axis=0)
Code Example #30
File: embed_eval_benet.py Project: cfld/tile2vec
    model = MLPClassifier(n_classes=19, n_input=X.shape[1])
    model = model.cuda()

    train_loader = DataLoader(TensorDataset(X_train, y_train),
                              batch_size=32,
                              shuffle=True,
                              pin_memory=True)

    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(50):
        _ = model.train()
        for x, y in train_loader:
            x, y = x.cuda(), y.cuda()

            out = model(x)
            loss = F.binary_cross_entropy_with_logits(out, y)

            opt.zero_grad()
            loss.backward()
            opt.step()

        _ = model.eval()
        z = model(X_valid.cuda())
        p_valid = to_numpy(z)
        auc_valid = [
            metrics.roc_auc_score(y, p) for y, p in zip(y_valid.T, p_valid.T)
        ]
        print(np.mean(auc_valid))
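The loop above computes one AUC per label column by zipping the transposed label and score matrices. Assuming every column contains both classes, the same per-label numbers (and their macro average) can be obtained directly from roc_auc_score's average argument; a small sketch with synthetic arrays, not this project's data:

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, size=(100, 19))   # 19 binary labels, as in n_classes above
y_score = rng.rand(100, 19)                  # predicted scores

per_label = roc_auc_score(y_true, y_score, average=None)    # one AUC per label column
print(per_label.mean())
print(roc_auc_score(y_true, y_score, average='macro'))      # equals the mean above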
Code Example #31
File: 2_ensembles.py Project: raumannsr/hints_crowd
                                                        bounds=[(0.0, 1.0) for _ in range(len(weights))],
                                                        args=(valid_label_c, pred_validation),
                                                        maxiter=1000,
                                                        tol=1e-7)
        if VERBOSE:
            print(weights_optim.x)
        scores = np.average(pred_test, weights=weights_optim.x, axis=0)
    else:
        scores = []
        for i in range(0, len(test_id)):
            if ENSEMBLE_LEARNING == ensemble_learning_type.soft_voting:
                max_prob = asy_pred_score[i]
                if bor_pred_score[i] > max_prob:
                    max_prob = bor_pred_score[i]
                if col_pred_score[i] > max_prob:
                    max_prob = col_pred_score[i]
                scores.append(max_prob)
            else:
                if ENSEMBLE_LEARNING == ensemble_learning_type.averaging:
                    scores.append((asy_pred_score[i] + bor_pred_score[i] + col_pred_score[i]) / 3.0)

    aucs = []
    auc = roc_auc_score(test_label_c, scores)
    aucs.append(auc)

    if CONV_LAYER_FROZEN:
        filename = os.path.join(pipeline, 'out', 'aucs_frozen' + str(seed) + '.csv')
    else:
        filename = os.path.join(pipeline, 'out', 'aucs_not_frozen' + str(seed) + '.csv')

    report_auc(aucs, filename)
Code Example #32
# aggregate table to view statistics
print(tabulate(X.describe(), X))

# fill NULLS
X["Age"].fillna(X.Age.mean(), inplace=True)

# BUILD FAST SIMPLE MODEL TO GET FIRST BENCHMARK*********************************
# get numeric variables
numeric_variables = list(X.dtypes[X.dtypes != "object"].index)
print(tabulate((X[numeric_variables].head()), X))

# build model
model = RandomForestRegressor(n_estimators=100,
                              oob_score=True,
                              random_state=42)
model.fit(X[numeric_variables], y)
# model score is c-stat.
# model.oob_score

y_oob = model.oob_prediction_
print("c-stat: ", roc_auc_score(y, y_oob))
# print(y_oob)  #probability of survival (this is what is then converted into classes)

# *******************************************************************************

# # function that describes categorical variables
# def describe_categorical(X):
#

print("EOF")
Code Example #33
File: tf_serve_pack.py Project: syyunn/DeepWTO
def test_ann(word2vec_path, model_number):
    # Parameters
    # =============================================================================

    logger = feed.logger_fn("tflog",
                            "logs/test-{0}.log".format(time.asctime()))

    # MODEL = input("☛ Please input the model file you want to test, "
    #               "it should be like(1490175368): ")

    MODEL = str(model_number)

    while not (MODEL.isdigit() and len(MODEL) == 10):
        MODEL = input("✘ The format of your input is illegal, "
                      "it should be like(1490175368), please re-input: ")

    logger.info("✔︎ The format of your input is legal, "
                "now loading to next step...")

    TRAININGSET_DIR = 'models/citability/data/Train.json'
    VALIDATIONSET_DIR = 'models/citability/data/Validation.json'
    # TEST_DIR = 'data/Test.json'
    cwd = os.getcwd()
    TEST_DIR = os.path.join(cwd, 'web/test_data.json')

    cwd = os.getcwd()
    MODEL_DIR = os.path.join(cwd, 'web/runs/' + MODEL + '/checkpoints/')
    print(MODEL_DIR)
    BEST_MODEL_DIR = 'runs/' + MODEL + '/bestcheckpoints/'
    SAVE_DIR = 'results/' + MODEL

    # Data Parameters
    tf.flags.DEFINE_string("training_data_file", TRAININGSET_DIR,
                           "Data source for the training data.")
    tf.flags.DEFINE_string("validation_data_file", VALIDATIONSET_DIR,
                           "Data source for the validation data")
    tf.flags.DEFINE_string("test_data_file", TEST_DIR,
                           "Data source for the test data")
    tf.flags.DEFINE_string("checkpoint_dir", MODEL_DIR,
                           "Checkpoint directory from training run")
    tf.flags.DEFINE_string("best_checkpoint_dir", BEST_MODEL_DIR,
                           "Best checkpoint directory from training run")

    # Model Hyperparameters
    tf.flags.DEFINE_integer(
        "pad_seq_len", 35842, "Recommended padding Sequence length of data "
        "(depends on the data)")
    tf.flags.DEFINE_integer(
        "embedding_dim", 300, "Dimensionality of character embedding "
        "(default: 128)")
    tf.flags.DEFINE_integer("embedding_type", 1,
                            "The embedding type (default: 1)")
    tf.flags.DEFINE_integer(
        "fc_hidden_size", 1024, "Hidden size for fully connected layer "
        "(default: 1024)")
    tf.flags.DEFINE_float("dropout_keep_prob", 0.5,
                          "Dropout keep probability (default: 0.5)")
    tf.flags.DEFINE_float("l2_reg_lambda", 0.0,
                          "L2 regularization lambda (default: 0.0)")
    tf.flags.DEFINE_integer("num_classes", 80,
                            "Number of labels (depends on the task)")
    tf.flags.DEFINE_integer("top_num", 80,
                            "Number of top K prediction classes (default: 5)")
    tf.flags.DEFINE_float("threshold", 0.5,
                          "Threshold for prediction classes (default: 0.5)")

    # Test Parameters
    tf.flags.DEFINE_integer("batch_size", 1, "Batch Size (default: 1)")

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True,
                            "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False,
                            "Log placement of ops on devices")
    tf.flags.DEFINE_boolean("gpu_options_allow_growth", True,
                            "Allow gpu options growth")

    FLAGS = tf.flags.FLAGS
    FLAGS(sys.argv)
    dilim = '-' * 100
    logger.info('\n'.join([
        dilim, *[
            '{0:>50}|{1:<50}'.format(attr.upper(), FLAGS.__getattr__(attr))
            for attr in sorted(FLAGS.__dict__['__wrapped'])
        ], dilim
    ]))
    """Test ANN model."""

    # Load data
    logger.info("✔︎ Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(
        FLAGS.pad_seq_len))

    logger.info("✔︎ Test data processing...")
    test_data = feed.load_data_and_labels(FLAGS.test_data_file,
                                          FLAGS.num_classes,
                                          FLAGS.embedding_dim,
                                          data_aug_flag=False,
                                          word2vec_path=word2vec_path)

    logger.info("✔︎ Test data padding...")
    x_test, y_test = feed.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_labels = test_data.labels

    # Load ann model
    # BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")
    BEST_OR_LATEST = 'L'

    while not (BEST_OR_LATEST.isalpha()
               and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = \
            input("✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("✔︎ Loading best model...")
        checkpoint_file = checkpoints.get_best_checkpoint(
            FLAGS.best_checkpoint_dir, select_maximum_value=True)
    else:
        logger.info("✔︎ Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output
            # nodes
            output_node_names = "output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-ann-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches = feed.batch_iter(list(zip(x_test, y_test, y_test_labels)),
                                      FLAGS.batch_size,
                                      1,
                                      shuffle=False)

            test_counter, test_loss = 0, 0.0

            test_pre_tk = [0.0] * FLAGS.top_num
            test_rec_tk = [0.0] * FLAGS.top_num
            test_F_tk = [0.0] * FLAGS.top_num

            # Collect the predictions here
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            # Collect for calculating metrics
            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_ts = []
            predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_labels = zip(
                    *batch_test)
                print("x_batch_test", x_batch_test)
                print("y_batch_test", y_batch_test)
                feed_dict = {
                    input_x: x_batch_test,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }
                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Prepare for calculating metrics
                for i in y_batch_test:
                    true_onehot_labels.append(i)
                for j in batch_scores:
                    predicted_onehot_scores.append(j)

                # Get the predicted labels by threshold
                batch_predicted_labels_ts, batch_predicted_scores_ts = \
                    feed.get_label_threshold(scores=batch_scores,
                                             threshold=FLAGS.threshold)

                # Add results to collection
                for i in y_batch_test_labels:
                    true_labels.append(i)
                for j in batch_predicted_labels_ts:
                    predicted_labels.append(j)
                for k in batch_predicted_scores_ts:
                    predicted_scores.append(k)

                # Get onehot predictions by threshold
                batch_predicted_onehot_labels_ts = \
                    feed.get_onehot_label_threshold(scores=batch_scores,
                                                    threshold=FLAGS.threshold)
                for i in batch_predicted_onehot_labels_ts:
                    predicted_onehot_labels_ts.append(i)

                # Get onehot predictions by topK
                for top_num in range(FLAGS.top_num):
                    batch_predicted_onehot_labels_tk = feed.\
                        get_onehot_label_topk(scores=batch_scores,
                                              top_num=top_num + 1)

                    for i in batch_predicted_onehot_labels_tk:
                        predicted_onehot_labels_tk[top_num].append(i)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate Precision & Recall & F1 (threshold & topK)
            test_pre_ts = precision_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')
            test_rec_ts = recall_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')
            test_F_ts = f1_score(y_true=np.array(true_onehot_labels),
                                 y_pred=np.array(predicted_onehot_labels_ts),
                                 average='micro')

            for top_num in range(FLAGS.top_num):
                test_pre_tk[top_num] = precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                    average='micro')
                test_rec_tk[top_num] = recall_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                    average='micro')
                test_F_tk[top_num] = f1_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                    average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                     y_score=np.array(predicted_onehot_scores),
                                     average='micro')

            # Calculate the average PR
            test_prc = average_precision_score(
                y_true=np.array(true_onehot_labels),
                y_score=np.array(predicted_onehot_scores),
                average="micro")
            test_loss = float(test_loss / test_counter)

            logger.info(
                "☛ All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}".
                format(test_loss, test_auc, test_prc))

            # Predict by threshold
            logger.info(
                "☛ Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                .format(test_pre_ts, test_rec_ts, test_F_ts))

            # Predict by topK
            logger.info("☛ Predict by topK:")
            for top_num in range(FLAGS.top_num):
                logger.info(
                    "Top{0}: Precision {1:g}, Recall {2:g}, F {3:g}".format(
                        top_num + 1, test_pre_tk[top_num],
                        test_rec_tk[top_num], test_F_tk[top_num]))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            feed.create_prediction_file(output_file=SAVE_DIR +
                                        "/predictions.json",
                                        data_id=test_data.testid,
                                        all_labels=true_labels,
                                        all_predict_labels=predicted_labels,
                                        all_predict_scores=predicted_scores)

    logger.info("✔︎ Done.")
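The evaluation loop above stacks one-hot labels and per-class scores and then asks roc_auc_score for a micro-averaged AUC. Micro averaging pools every (sample, label) decision into a single binary problem, which is equivalent to flattening both matrices; a compact illustration with synthetic arrays, not this model's outputs:

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
y_onehot = rng.randint(0, 2, size=(50, 80))   # 80 labels, matching num_classes above
scores = rng.rand(50, 80)

micro = roc_auc_score(y_onehot, scores, average='micro')
flat = roc_auc_score(y_onehot.ravel(), scores.ravel())
print(micro, flat)   # identical values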
Code Example #34
File: lstm.py Project: cjc96/toxic
res = model.fit(X, Y, batch_size = 512, epochs = 2, validation_data = (X_val, Y_val), callbacks = [stop])


# In[ ]:


Y_test = model.predict(X_test)


# In[ ]:


total = 0  # accumulate per-label AUC (avoid shadowing the built-in sum)
for i in range(6):
    score = roc_auc_score(test_labels[:,i], Y_test[:,i])
    total += score


# In[ ]:


print(total / 6)  # mean column-wise AUC over the six labels


# In[ ]:




Code Example #35
File: Logistic.py Project: nick95a/Machine-Learning
C = 10
l = len(Y)
x1 = X[:, 0]
x2 = X[:, 1]
n = 0
d = 10

y = []

while n < 10000 and d > 0.00001:
    v1 = w1
    v2 = w2
    a = 1 + np.exp(-Y * (w1 * x1 + w2 * x2))
    w1 = w1 + (k / l) * (np.sum(Y * x1 * (1 - 1 / a))) - k * C * w1
    w2 = w2 + (k / l) * (np.sum(Y * x2 * (1 - 1 / a))) - k * C * w2
    d = np.sqrt((w1 - v1)**2 + (w2 - v2)**2)
    n = n + 1
    y.append(a)

y = [item for sublist in y for item in sublist]
y[:] = [x - 1 for x in y]
print(type(y))
zero = []
np.array(zero)
zero.append(np.zeros((39, ), dtype=np.int))
zero1 = [item for sublist in zero for item in sublist]
np.append(Y, zero1)
y = np.asarray(y)

print(roc_auc_score(y, Y))
Code Example #36
def evaluate_model(label_df, y_predicted, **kwargs):
    """Evaluate the performance of the model   
    Args:
        label_df (:py:class:`pandas.DataFrame`): Dataframe containing true y label
        y_predicted (:py:class:`pandas.DataFrame`): Dataframe containing predicted probability and score
    Returns: 
        confusion_df (:py:class:`pandas.DataFrame`): Dataframe reporting confusion matrix
    """
    try:
        # get predicted scores
        y_pred_prob = y_predicted.iloc[:, 0]
        y_pred = y_predicted.iloc[:, 1]
        # get true labels
        y_true = label_df.iloc[:, 0]
    # raise IndexError when the input dataframe does not have two columns as desired
    except IndexError:
        raise IndexError('Index out of bounds!')

    # check if label_df and y_predicted have only numeric columns
    for col in label_df.columns:
        if label_df[col].dtype not in [
                np.dtype('float64'),
                np.dtype('float32'),
                np.dtype('int64')
        ]:
            raise ValueError(
                'Input dataframe can only have numeric or boolean types!')
    for col in y_predicted.columns:
        if y_predicted[col].dtype not in [
                np.dtype('float64'),
                np.dtype('float32'),
                np.dtype('int64')
        ]:
            raise ValueError(
                'Input dataframe can only have numeric or boolean types!')

    # classification metrics can only take binary classes - 0 or 1 in this case
    # check if y_pred and label_df are all either 0 or 1
    if (not y_pred.isin([0, 1]).all()) or (not y_true.isin([0, 1]).all()):
        raise ValueError('Class can only be 0 or 1!')

    # check if predicted probabilities are within 0-1
    if not y_pred_prob.between(0, 1, inclusive=True).all():
        raise ValueError('Probabilities needs to be in 0-1 range!')

    # calculate auc and accuracy and f1_score if specified
    if "auc" in kwargs["metrics"]:
        auc = roc_auc_score(label_df, y_pred_prob)
        print('AUC on test: %0.3f' % auc)
    if "accuracy" in kwargs["metrics"]:
        accuracy = accuracy_score(label_df, y_pred)
        print('Accuracy on test: %0.3f' % accuracy)
    if "f1_score" in kwargs["metrics"]:
        f1 = f1_score(label_df, y_pred)
        print('F1-score on test: %0.3f' % f1)

    # generate confusion matrix and classification report
    print(classification_report(label_df, y_pred))
    confusion = confusion_matrix(label_df, y_pred)
    print(confusion)
    confusion_df = pd.DataFrame(
        confusion,
        index=['Actual Negative', 'Actual Positive'],
        columns=['Predicted Negative', 'Predicted Positive'])

    return confusion_df
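# A usage sketch with made-up data (hypothetical column names); the metrics
# names follow the checks inside evaluate_model above.
import pandas as pd

label_df = pd.DataFrame({'label': [0, 1, 1, 0, 1]})
y_predicted = pd.DataFrame({'prob': [0.2, 0.9, 0.6, 0.4, 0.7],
                            'pred': [0, 1, 1, 0, 1]})
confusion_df = evaluate_model(label_df, y_predicted,
                              metrics=['auc', 'accuracy', 'f1_score'])
print(confusion_df)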
Code Example #37
0
def evaluate(model, val_loader):
    model.eval()
    outputs = [step(model, batch) for batch in val_loader]
    Preds = [x['preds'] for x in outputs]
    Labels = [x['labels'] for x in outputs]
    Outs = [x['out'] for x in outputs]

    Preds = torch.cat(Preds, dim=0).cpu()
    Labels = torch.cat(Labels, dim=0).cpu()
    Outs = torch.cat(Outs, dim=0).cpu()
    Scores = F.softmax(Outs, dim=1)

    print(Preds, Labels, Scores)
    print(Preds.size(), len(Labels), Scores.size())
    print('General Evaluation')

    # Precision | Recall | F1 - score | AUC
    acc_all = accuracy_score(Labels, Preds)
    ap_all = precision_score(Labels, Preds, average='macro')
    ar_all = recall_score(Labels, Preds, average='macro')
    f1_all = f1_score(Labels, Preds, average='macro')
    print(acc_all, ap_all, ar_all, f1_all)

    # map integer class indices to their names for all examples
    # (the original loop hard-coded 1900 samples)
    class_names = ['AMD', 'DME', 'NM', 'PCV', 'PM']
    y_pred = [class_names[int(p)] for p in Preds]
    y_true = [class_names[int(t)] for t in Labels]
    t1 = classification_report(y_true, y_pred,
                               target_names=['AMD', 'DME', 'NM', 'PCV', 'PM'])
    t2 = classification_report(y_true, y_pred, output_dict=True,
                               target_names=['AMD', 'DME', 'NM', 'PCV', 'PM'])
    print(t1)
    print(t2)

    # draw confuse matrix
    classes = ['AMD', 'DME', 'NM', 'PCV', 'PM']
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    plt.imshow(cm, cmap=plt.cm.Greens)
    indices = range(len(cm))
    plt.xticks(indices, classes)
    plt.yticks(indices, classes)
    plt.colorbar()
    plt.xlabel('Pred')
    plt.ylabel('True')

    # imshow puts the row index on the y axis, so annotate each cell at (col, row)
    for row_idx in range(len(cm)):
        for col_idx in range(len(cm[row_idx])):
            plt.text(col_idx, row_idx, cm[row_idx][col_idx])

    fig.savefig("./img/{}/Best-cm-img{}.png".format(args.model, args.bsize),
                dpi=320, format='png')

    # cal auc
    roc_ovr = roc_auc_score(Labels, Scores, multi_class='ovr')
    print('--roc-ovr:', roc_ovr)
    roc_ovo = roc_auc_score(Labels, Scores, multi_class='ovo')
    print('--roc-ovo:', roc_ovo)
Code Example #38
0
for TRAIN_INDEX, TEST_INDEX in SKF.split(X_DATA, Y_DATA):

    X_TRAIN = X_DATA[TRAIN_INDEX]
    X_TEST = X_DATA[TEST_INDEX]
    Y_TRAIN = Y_DATA[TRAIN_INDEX]
    Y_TEST = Y_DATA[TEST_INDEX]
    X_RES = X_TRAIN
    Y_RES = Y_TRAIN

    classifier = KNeighborsClassifier(n_neighbors=2)
    classifier.fit(X_RES, Y_RES)
    Y_PRED = classifier.predict(X_TEST)
    CM = np.add(CM, confusion_matrix(Y_TEST, Y_PRED))
    Y_TEST_TOTAL = np.concatenate((Y_TEST_TOTAL, Y_TEST))
    Y_PRED_TOTAL = np.concatenate((Y_PRED_TOTAL, Y_PRED))

prec = CM[1][1] / (CM[1][1] + CM[0][1])
rec = CM[1][1] / (CM[1][1] + CM[1][0])
fmes = 2 * prec * rec / (prec + rec)
auc = roc_auc_score(Y_TEST_TOTAL, Y_PRED_TOTAL)
balan = bal(CM)
print(
    str(prec) + ' ' + str(rec) + ' ' + str(fmes) + ' ' + str(auc) + ' ' +
    str(balan))
# print('Confusion Matrix')
# print(CM)
# print('Precision: ' + str(prec))
# print('Recall: ' + str(rec))
# print('fmeasure: ' + str(fmes))
# print('Balance: ' + str(balan))
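# The AUC above is computed from hard 0/1 predictions, which gives a single
# operating point. A sketch of a probability-based variant (assuming the same
# SKF, X_DATA and Y_DATA as above):
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

y_true_all, y_score_all = [], []
for train_index, test_index in SKF.split(X_DATA, Y_DATA):
    knn = KNeighborsClassifier(n_neighbors=2)
    knn.fit(X_DATA[train_index], Y_DATA[train_index])
    # keep the predicted probability of the positive class, not the hard label
    y_score_all.append(knn.predict_proba(X_DATA[test_index])[:, 1])
    y_true_all.append(Y_DATA[test_index])

print(roc_auc_score(np.concatenate(y_true_all), np.concatenate(y_score_all)))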
Code Example #39
0
def class_prob(simulation_name,
               device,
               csv_files,
               test_idx,
               epoch_max=40,
               batch_size=1,
               logits=False,
               calibrator=None,
               mscourse=None):

    csv_file_path = csv_files["path"]
    csv_file_tags = csv_files["tags"]
    csv_file_cov = csv_files["cov"]

    N = pd.read_csv(csv_file_tags)["ID"].nunique()

    validation = True

    if "cont" in csv_file_path:
        val_options = {
            "T_val": 1095,
            "max_val_samples": 1,
            "T_closest": 1825,
            "T_val_from": 1460
        }
    else:
        val_options = {
            "T_val": 36,
            "max_val_samples": 1,
            "T_closest": 60,
            "T_val_from": 48
        }

    if mscourse is not None:
        df_cov = pd.read_csv(csv_file_cov)
        test_idx = np.array(df_cov.loc[(df_cov.ID.isin(test_idx)) &
                                       ((df_cov[mscourse] > 0).any(1)),
                                       "ID"].unique().tolist())

    data_test = data_utils.ODE_Dataset(csv_file=csv_file_path,
                                       label_file=csv_file_tags,
                                       cov_file=csv_file_cov,
                                       idx=test_idx,
                                       validation=validation,
                                       val_options=val_options)

    dl_test = DataLoader(dataset=data_test,
                         collate_fn=data_utils.custom_collate_fn,
                         shuffle=False,
                         batch_size=batch_size)

    params_dict = np.load(f"./trained_models/{simulation_name}_params.npy",
                          allow_pickle=True).item()

    nnfwobj = gru_ode.NNFOwithBayesianJumps(
        input_size=params_dict["input_size"],
        hidden_size=params_dict["hidden_size"],
        p_hidden=params_dict["p_hidden"],
        prep_hidden=params_dict["prep_hidden"],
        logvar=params_dict["logvar"],
        mixing=params_dict["mixing"],
        classification_hidden=params_dict["classification_hidden"],
        cov_size=params_dict["cov_size"],
        cov_hidden=params_dict["cov_hidden"],
        dropout_rate=params_dict["dropout_rate"],
        full_gru_ode=params_dict["full_gru_ode"])
    nnfwobj.to(device)

    nnfwobj.load_state_dict(
        torch.load(f"./trained_models/{simulation_name}_MAX.pt"))

    class_criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
    val_metric_prev = -1000
    nnfwobj.eval()
    class_preds = []
    labels_list = []
    for i, b in enumerate(tqdm.tqdm(dl_test)):

        prob_path = []
        times = b["times"]
        time_ptr = b["time_ptr"]
        X = b["X"].to(device)
        M = b["M"].to(device)
        obs_idx = b["obs_idx"]
        cov = b["cov"].to(device)
        labels = b["y"].to(device)
        batch_size = labels.size(0)

        if labels.shape[0] > 1:
            _, _, class_pred, _ = nnfwobj(times,
                                          time_ptr,
                                          X,
                                          M,
                                          obs_idx,
                                          delta_t=params_dict["delta_t"],
                                          T=params_dict["T"],
                                          cov=cov)

            if logits:
                return labels.detach().cpu().numpy(), class_pred.detach().cpu(
                ).numpy()
            else:
                return labels.detach().cpu().numpy(), torch.sigmoid(
                    class_pred).detach().cpu().numpy()

        for samp in range(0, len(times) + 1):
            times_samp = times[:samp]
            time_ptr_samp = time_ptr[:samp]
            X_samp = X[:samp]
            M_samp = M[:samp]
            obs_idx_samp = obs_idx[:samp]

            hT, loss, class_pred, _ = nnfwobj(times,
                                              time_ptr,
                                              X_samp,
                                              M_samp,
                                              obs_idx_samp,
                                              delta_t=params_dict["delta_t"],
                                              T=params_dict["T"],
                                              cov=cov)

            # `clf` is not defined in this scope; the `calibrator` argument
            # looks like the intended probability model for the path plot
            prob_path += [calibrator.predict_proba(class_pred.detach().cpu())[:, 1]]

        class_preds += [class_pred.detach().cpu().numpy().item()]
        labels_list += [labels.detach().cpu().numpy().item()]

        plt.figure()
        times /= 12
        times -= 3
        fig, ax1 = plt.subplots()

        color = 'tab:red'
        ax1.set_xlabel('Time before visit [Years]')
        ax1.set_ylabel('EDSS', color=color)
        edss_x = np.round(2 * (X.detach().cpu().numpy() * 1.6764 + 2.4818)) / 2
        ax1.scatter(times, edss_x, color=color)
        ax1.tick_params(axis='y', labelcolor=color)
        ax1.set_ylim(edss_x.min() - 1, edss_x.max() + 1)
        min_tick = np.max((0, edss_x.min() - 0.5))
        max_tick = np.min((10, edss_x.max() + 1))
        ax1.set_yticks(np.arange(min_tick, max_tick, step=0.5))

        ax2 = ax1.twinx(
        )  # instantiate a second axes that shares the same x-axis

        color = 'tab:blue'
        ax2.set_ylabel('Probability',
                       color=color)  # we already handled the x-label with ax1
        ax2.step(np.concatenate((np.array([-3]), times)),
                 prob_path,
                 where="post",
                 color=color)
        ax2.tick_params(axis='y', labelcolor=color)
        ax2.set_ylim((0, 1))
        fig.tight_layout()

        #plt.scatter(times,X.detach().cpu().numpy())
        #plt.step(np.concatenate((np.array([0]),times)), prob_path, where = "post")
        plt.title(
            f"Progression of the worsening prediction over time. Label : {labels.detach().cpu().numpy()[0][0]}"
        )
        fig.savefig(f"./figs/prob_prop_{i}.pdf")
        plt.close(fig)
        plt.close("all")

        #if i >100:
        #    break
    print(roc_auc_score(np.array(labels_list), np.array(class_preds)))

    return class_preds, labels_list
Code Example #40
0
# 6. Fit model
model.fit(x_train,
          y_train,
          batch_size=batchSize,
          epochs=E,
          verbose=1,
          validation_data=(x_val, y_val),
          callbacks=[model_checkpoint])

# 7. Evaluate model prediction
# Based on all 4 runs, the CNN model weights from epoch 3 appear to have the
# lowest validation loss and highest validation accuracy.
# So we load this model to determine the ROC_AUC score

# so, we load the weights saved after the third epoch
model.load_weights(output_dir + "\\weights.03.hdf5")

# Compute predictions
y_hat = model.predict(x_val)

# Visualise distribution of predicted y_hat
plt.hist(y_hat)
plt.axvline(0.5, color="orange")

# Measure performance with ROC_AUC score
pct_auc = roc_auc_score(y_val, y_hat) * 100.0
print("ROC AUC = %.2f percent" % pct_auc)

# Convolutional neural net      : 95.24% (model weights from epoch #3)
Code Example #41
cm = confusion_matrix(y_test, y_predEnsem)
print("time taken: ", round(time() - t, 3), "s")
print(classification_report(y_test, y_predEnsem, target_names=['brand', 'female', 'male']))
print("accuracy: ", ensemble_classifier.score(x_test,y_test))



#plotting roc curve
y_test = [ i if (i==1) else 0 for i in y_test  ]

plt.figure()
plt.subplots(figsize=(8,6))

y_predGNB = [ i if (i==1) else 0 for i in y_predGNB  ]
fpr, tpr, _ = roc_curve(y_test, y_predGNB, pos_label=1)
plt.plot(fpr, tpr, 'b', label="Naive Bayes, AUC=" + str(round(roc_auc_score(y_test, y_predGNB), 3)))

y_predRFC = [ i if (i==1) else 0 for i in y_predRFC ]
fpr, tpr, _ = roc_curve(y_test, y_predRFC)
plt.plot(fpr, tpr, 'r', label="Random Forest, AUC=" + str(round(roc_auc_score(y_test, y_predRFC), 3)))

y_predLR = [ i if (i==1) else 0 for i in y_predLR  ]
fpr, tpr, _ = roc_curve(y_test, y_predLR)
plt.plot(fpr, tpr, 'g', label="Logistic Regression, AUC=" + str(round(roc_auc_score(y_test, y_predLR), 3)))

y_predEnsem = [ i if (i==1) else 0 for i in y_predEnsem  ]
fpr, tpr, _ = roc_curve(y_test, y_predEnsem)
plt.plot(fpr, tpr, 'y', label="Ensemble, AUC=" + str(round(roc_auc_score(y_test, y_predEnsem), 3)))

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend()
Code Example #42
0
    X_train, X_test, y_train, y_test = train_test_split(hdd,
                                                        hdd_labels,
                                                        test_size=0.2)
    smote = SMOTE(kind="regular")
    X_train, y_train = smote.fit_sample(X_train, y_train)
    #clf=ensemble.RandomForestClassifier()
    clf = tree.DecisionTreeClassifier(max_depth=None,
                                      criterion='gini',
                                      min_samples_split=3,
                                      min_samples_leaf=2,
                                      max_leaf_nodes=5)
    clf = clf.fit(X_train, y_train)
    preds = clf.predict_proba(X_test)
    preds_ = clf.predict(X_test)

    roc_auc = metrics.roc_auc_score(y_true=y_test, y_score=preds[:, 1])
    print('roc_auc', roc_auc)
    print('NACC', metrics.recall_score(y_true=y_test, y_pred=preds_))
    print('accuracy', metrics.accuracy_score(y_true=y_test, y_pred=preds_))
    if ((metrics.recall_score(y_true=y_test, y_pred=preds_) > 0.8) &
        (metrics.accuracy_score(y_true=y_test, y_pred=preds_) > 0.8)):
        break
#recall.append(metrics.recall_score(y_true=test_label,y_pred=preds_))
#accuracy.append(metrics.accuracy_score(y_true=test_label,y_pred=preds_))
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds[:, 1])

fig = plt.figure()
plt.title('ROC curve')
plt.plot(fpr, tpr, 'b')

plt.legend(loc='lower right')
Code Example #43
0
    for c in df_train.columns:
        if c != 'ncodpers':
            print(c)
            y_train = df_train[c]
            x_train = df_train.drop([c, 'ncodpers'], 1)

            clf = LogisticRegression(solver='saga', max_iter=400)
            clf.fit(x_train, y_train)
            p_train = clf.predict_proba(x_train)[:, 1]

            models[c] = clf
            model_preds[c] = p_train
            for id, p in zip(ids, p_train):
                id_preds[id].append(p)

            print(roc_auc_score(y_train, p_train))

    already_active = {}
    for row in df_train.values:
        row = list(row)
        id = row.pop(0)
        active = [c[0] for c in zip(df_train.columns[1:], row) if c[1] > 0]
        already_active[id] = active

    train_preds = {}
    for id, p in id_preds.items():
        # Here be dragons
        preds = [
            i[0] for i in sorted([
                i for i in zip(df_train.columns[1:], p)
                if i[0] not in already_active[id]
Code Example #44
0
                #print outputs[-1], targets
                losses.append(criterion(outputs[-1], targets))

            loss = sum(losses) / len(batch_icd)
            loss.backward()
            optimizer.step()

        ## Validation phase
        vpredictions = np.zeros(len(valid_input_seqs))
        for i in range(len(valid_input_seqs)):
            test_seq = valid_input_seqs[i]
            vpredictions[i] = model.predict(
                Variable(
                    torch.from_numpy(convert_to_one_hot(test_seq)).float()))
        print "Validation Test AUC_ROC: ", roc_auc_score(
            valid_labels, vpredictions)

        ## Testing phase
        predictions = np.zeros(len(test_input_seqs))
        for i in range(len(test_input_seqs)):
            test_seq = test_input_seqs[i]
            predictions[i] = model.predict(
                Variable(
                    torch.from_numpy(convert_to_one_hot(test_seq)).float()))
        print "Test AUC_ROC: ", roc_auc_score(test_labels, predictions)
        # actual_predictions = (predictions>0.5)*1
        # print classification_report(test_labels, actual_predictions)

        aucrocs.append(roc_auc_score(test_labels, predictions))

    best_aucrocs.append(max(aucrocs))
Code Example #45
0
def train_model():

    x_train, x_test, y_train, y_test = preprocess_data()
    clf = RandomForestClassifier(n_estimators=500,
                                 max_depth=10,
                                 random_state=18,
                                 max_leaf_nodes=64,
                                 verbose=1,
                                 n_jobs=4)
    scores_rfc = []
    # models1 = []
    # initialize KFold; we can use stratified KFold to keep the same imbalance ratio for the target
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    for i, (train_idx, valid_idx) in enumerate(kf.split(x_train, y_train)):
        print('...... training {}th fold \n'.format(i + 1))
        tr_x = x_train[train_idx]
        tr_y = y_train[train_idx]

        val_x = x_train[valid_idx]
        val_y = y_train[valid_idx]
        model = clf
        model.fit(tr_x, tr_y)
        # score the fold on predicted probabilities rather than hard labels,
        # since the threshold search below sweeps cut-offs between 0 and 0.5
        pred_val_y = model.predict_proba(val_x)[:, 1]
        # measuring model vs validation
        score_rfc = roc_auc_score(val_y, pred_val_y)
        scores_rfc.append(score_rfc)
        print('current performance by auc:{}'.format(score_rfc))

        # auc_scores1.append(auc)
        # models1.append(model)
    best_f1 = -np.inf
    best_thred = 0
    v = [i * 0.01 for i in range(50)]
    for thred in v:
        preds = (pred_val_y > thred).astype(int)
        f1 = f1_score(val_y, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thred = thred

    y_pred_rfc = (pred_val_y > best_thred).astype(int)
    print(confusion_matrix(val_y, y_pred_rfc))
    print(f1_score(val_y, y_pred_rfc))
    print('the average mean auc is:{}'.format(np.mean(scores_rfc)))
    model_lgb = lgb.LGBMClassifier(
        n_jobs=4,
        n_estimators=10000,
        boost_from_average='false',
        learning_rate=0.01,
        num_leaves=64,
        num_threads=4,
        max_depth=-1,
        tree_learner="serial",
        feature_fraction=0.7,
        bagging_freq=5,
        bagging_fraction=0.7,
        min_data_in_leaf=100,
        silent=-1,
        verbose=-1,
        max_bin=255,
        bagging_seed=11,
    )
    auc_scores = []
    models = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    for i, (train_idx, valid_idx) in enumerate(kf.split(x_train, y_train)):
        print('...... training {}th fold \n'.format(i + 1))
        tr_x = x_train[train_idx]
        tr_y = y_train[train_idx]

        va_x = x_train[valid_idx]
        va_y = y_train[valid_idx]
        # re-initialise the lgb model at each fold; re-using the same fitted
        # object would make every entry in `models` reference the last fold
        model = lgb.LGBMClassifier(**model_lgb.get_params())
        model.fit(tr_x,
                  tr_y,
                  eval_set=[(tr_x, tr_y), (va_x, va_y)],
                  eval_metric='auc',
                  verbose=500,
                  early_stopping_rounds=300)
        # calculate current auc after training the model
        pred_va_y = model.predict_proba(va_x,
                                        num_iteration=model.best_iteration_)[:,
                                                                             1]
        auc = roc_auc_score(va_y, pred_va_y)
        print('current best auc score is:{}'.format(auc))
        auc_scores.append(auc)
        models.append(model)

    best_f1 = -np.inf
    best_thred = 0
    v = [i * 0.01 for i in range(50)]
    for thred in v:
        preds = (pred_va_y > thred).astype(int)
        f1 = f1_score(va_y, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thred = thred

    y_pred_lgb = (pred_va_y > best_thred).astype(int)
    print(confusion_matrix(va_y, y_pred_lgb))
    print(f1_score(va_y, y_pred_lgb))
    print('the average mean auc is:{}'.format(np.mean(auc_scores)))
    fpr, tpr, _ = roc_curve(va_y, pred_va_y)
    # plot model roc curve
    plt.plot(fpr, tpr, marker='.', label='LGB model')
    # axis labels
    plt.title('ROC AUC CURVE')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # show the plot
    plt.savefig('LGB ROC_auc_curve.png')
    plt.show()
    # Test data
    pred_test_1 = models[0].predict_proba(
        x_test, num_iteration=models[0].best_iteration_)[:, 1]
    pred_test_2 = models[1].predict_proba(
        x_test, num_iteration=models[1].best_iteration_)[:, 1]
    pred_test_3 = models[2].predict_proba(
        x_test, num_iteration=models[2].best_iteration_)[:, 1]
    pred_test_4 = models[3].predict_proba(
        x_test, num_iteration=models[3].best_iteration_)[:, 1]
    pred_test_5 = models[4].predict_proba(
        x_test, num_iteration=models[4].best_iteration_)[:, 1]
    pred_test = (pred_test_1 + pred_test_2 + pred_test_3 + pred_test_4 +
                 pred_test_5) / 5.0
    print(pred_test)
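    # Equivalent fold-averaging written as a loop (a sketch; assumes `models`
    # and `x_test` from above).
    fold_preds = [m.predict_proba(x_test, num_iteration=m.best_iteration_)[:, 1]
                  for m in models]
    pred_test = np.mean(fold_preds, axis=0)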
Code Example #46
0
y = data["class"]
X = data.groupby("class").transform(lambda x: x.fillna(x.mean()))
#data["value"] = data.groupby("name").transform(lambda x: x.fillna(x.mean()))
#X = data.loc[:,"Attr1":"Attr64"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=45,
                                                    stratify=y)

rf = GradientBoostingClassifier(n_estimators=1000)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)
roc_auc_score(y_test, rf.predict(X_test))

from sklearn.metrics import confusion_matrix
import itertools

plt.figure(dpi=150)
cm = confusion_matrix(y_test, rf.predict(X_test))
plt.imshow(cm, cmap=plt.cm.Blues)
plt.colorbar()
plt.xticks([0, 1])
plt.yticks([0, 1])
plt.title("Predicting Polish Bankruptcy within 5 Years")
plt.ylabel("True")
plt.xlabel("Predicted")
fmt = '.2f'
thresh = cm.max() / 2.
Code Example #47
n = 0
cv = []
for index_train, index_eval in kf.split(train,train_y):

    x_train, x_eval = train_x[index_train], train_x[index_eval]
    y_train, y_eval = train_y[index_train], train_y[index_eval]

    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_eval, label=y_eval)
    watchlist = [(d_valid, 'valid')]

    bst = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=100, verbose_eval=100)

    print('Start predicting...')
    y_pred = bst.predict(xgb.DMatrix(x_eval))
    cv.append(roc_auc_score(y_eval, y_pred))

    print('start predicting on test...')
    testpreds = bst.predict(xgb.DMatrix(test.values))
    if n > 0:
        totalpreds = totalpreds + testpreds
    else:
        totalpreds = testpreds

    # bst.save_model('xgb_model_fold_{}.model'.format(n))
    n += 1

totalpreds = totalpreds / n
print('xgb best score', np.mean(cv))

# submit result
Code Example #48
0
File: metrics.py Project: Saqibm128/eeg-tasks
def non_error_roc_auc_score(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        # roc_auc_score raises ValueError when y_true contains only one class
        return 0.0
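# Usage sketch: when a fold contains only one class, roc_auc_score would raise
# a ValueError; this wrapper returns 0.0 instead of crashing.
import numpy as np
print(non_error_roc_auc_score(np.array([0, 0, 0]), np.array([0.1, 0.4, 0.2])))  # 0.0
print(non_error_roc_auc_score(np.array([0, 1, 1]), np.array([0.1, 0.8, 0.6])))  # 1.0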
Code Example #49
0
 def test_auc(self, X, Y):
     y_pred = self.model.predict(X)
     return metrics.roc_auc_score(Y, y_pred)
Code Example #50
0
File: experiments.py Project: shliujing/RSRAE
                cae.fit(X_test, X_test)

                features = cae.get_output(X_test)
                flat_output = np.reshape(features, (np.shape(X_test)[0], -1))
                flat_input = np.reshape(X_test, (np.shape(X_test)[0], -1))

                cosine_similarity = np.sum(flat_output * flat_input, -1) / (
                    np.linalg.norm(flat_output, axis=-1) + 0.000001) / (
                        np.linalg.norm(flat_input, axis=-1) + 0.000001)

                tEnd = time.time()
                tDiff = tEnd - tStart
                with open(filename, 'a') as f_log:
                    f_log.write("Time elapsed: " + str(tDiff) + "\n")

                auc = roc_auc_score(y_test, -cosine_similarity)
                ap = average_precision_score(y_test, -cosine_similarity)

                print("auc = ", auc)
                print("ap = ", ap)
                print("time elapse = ", tDiff)
                aucs.append(auc)
                aps.append(ap)
                time_elapses.append(tDiff)

            std_auc = np.std(aucs)
            std_ap = np.std(aps)
            std_time = np.std(time_elapses)

            to_save_auc[cvalue][anomaly] = aucs
            to_save_ap[cvalue][anomaly] = aps
Code Example #51
0
 def compute_auc(self, X: np.ndarray, y: np.ndarray) -> float:
     """ Distance to hyperplane is used for AUC-style metrics. """
     return metrics.roc_auc_score(y, self.decision_function(X))
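# A standalone sketch of the same idea with a linear SVM (assumed estimator;
# any model exposing decision_function would work the same way).
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
svm = LinearSVC().fit(X_demo, y_demo)
# the signed distance to the hyperplane serves directly as a ranking score
print(roc_auc_score(y_demo, svm.decision_function(X_demo)))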
Code Example #52
0
File: utils.py Project: datascisteven/Medium-Blogs
def auc(X, y, model):
    probs = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, probs)
Code Example #53
0
        filenames_list = np.concatenate(
            (filenames_list, np.array(filenames_current).reshape(-1, 1)),
            axis=0)
        target_list = np.concatenate(
            (target_list, target_now.detach().numpy()), axis=0)

import pandas as pd
activation_dataframe = pd.DataFrame(filenames_list, columns=['Filename'])
activation_dataframe['Activations'] = activations_test
activation_dataframe['Target'] = target_list
#Saving dataframe to file
activation_dataframe.to_csv('activations_test.csv')

import sklearn
from sklearn.metrics import roc_auc_score
auroc_score = roc_auc_score(y_true=activation_dataframe['Target'],
                            y_score=activation_dataframe['Activations'])
threshold = 0.5

#Confusion matrix
activation_dataframe['Predicted'] = 0
activation_dataframe.loc[activation_dataframe['Activations'] > threshold,
                         'Predicted'] = 1

from sklearn.metrics import confusion_matrix
confusion_matrix_table = pd.DataFrame(confusion_matrix(
    y_true=activation_dataframe['Target'],
    y_pred=activation_dataframe['Predicted'],
    labels=[0, 1]),
                                      columns=[0, 1])

#Recall
Code Example #54
0
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['Male'] = df['Sex'] == 'male'  # Sex values are lowercase in this dataset
X = df[['Pclass','Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

X_train, X_test, y_train, y_test = train_test_split(X,y)

model = LogisticRegression() # select the model
model.fit(X_train, y_train) # train the model
y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)  # AUC of the curve plotted below

plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (LogisticRegression(), auc))
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.legend(loc="lower right")
plt.show()   # Display
Code Example #55
0
y_pred=model1.predict(x_test)

plt.show()
cm1=confusion_matrix(y_test,y_pred)
sac=accuracy_score(y_test,y_pred)
accper=sac*100
accper

plt.figure(figsize=(10,10))
sns.heatmap(cm1,annot=True)
model1.summary()

y_pred_proba = model1.predict_proba(x_test)[::,1]
#pyplot.plot(fpr, tpr, linestyle='--', label='No Skill')
fpr, tpr, _ =metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.legend(loc=4)
print(classification_report(y_test,y_pred))


Code Example #56
0
        ds = DataLoader(sl=sl)

    if whole_map:
        test_seq = ds.load_whole_test(bmode)
    else:
        test_seq, test_label = ds.load_test(bmode)

    model_checkpoint_dir = os.path.join(s.intermediate_folder,
                                        'model_checkpoints/opt')
    model_checkpoint_file = os.path.join(model_checkpoint_dir, uid + '.hdf5')
    model = load_model(model_checkpoint_file)

    test_predictions = model.predict(test_seq, verbose=1)
    results = {'test_predictions': test_predictions}

    logs_dir = os.path.join(s.intermediate_folder, 'logs', logs_dir)
    test_log_dir = os.path.join(logs_dir, 'test_logs/')

    if not whole_map:
        test_auc = roc_auc_score(test_label, test_predictions)
        spio.savemat(test_log_dir + uid + '.mat', results)
        print(["Test AUC: ", test_auc])
    else:
        spio.savemat(test_log_dir + uid + '_whole.mat', results)

    # print('-' * 50)
    # print('UID: {}'.format(uid))
    # print('-' * 50)

    k.clear_session()
Code Example #57
0
                                embedding_dim,
                                embedding_matrix,
                                max_length,
                                out_size=6)


keras_model_trainer = trainer.KerasModelTrainer(model_stamp='kmax_text_cnn',
                                                epoch_num=50,
                                                learning_rate=1e-3)

models, val_loss, total_auc, fold_predictions = keras_model_trainer.train_folds(
    data, y_train, fold_count=10, batch_size=256, get_model_func=get_model)
print("Overall val-loss:", val_loss, "AUC", total_auc)

train_fold_predictions = np.concatenate(fold_predictions, axis=0)
training_auc = roc_auc_score(y_train[:-1], train_fold_predictions)
print("Training AUC", training_auc)

CLASSES = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
submit_path_prefix = "results/rnn/nds/fasttext-SC2-nds-randomNoisy-capNet-" + str(
    max_nb_words) + "-RST-lp-ct-" + str(max_length)

print("Predicting testing results...")
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_predicts = model.predict(test_data, batch_size=256, verbose=1)
    test_predicts_list.append(test_predicts)
    np.save("predict_path/", test_predicts)
Code Example #58
0
def create_model(dataset):

    print("dataset : ", dataset)
    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)

    print('reading', dataset)
    df['label'] = df[df.shape[1] - 1]
    #
    df.drop([df.shape[1] - 2], axis=1, inplace=True)
    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])
    #
    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    number_of_clusters = 23
    sampler = RandomUnderSampler()
    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    n_classes = 2

    for train_index, test_index in skf.split(X, y):
        X_train = X[train_index]
        X_test = X[test_index]

        y_train = y[train_index]
        y_test = y[test_index]

        break
    print('training', dataset)
    top_roc = 0

    depth_for_rus = 0
    split_for_rus = 0

    for depth in range(3, 20, 20):
        for split in range(3, 9, 20):

            classifier = AdaBoostClassifier(DecisionTreeClassifier(
                max_depth=depth, min_samples_split=split),
                                            n_estimators=100,
                                            learning_rate=1,
                                            algorithm='SAMME')

            X_train, y_train = sampler.fit_sample(X_train, y_train)

            classifier.fit(X_train, y_train)

            predictions = classifier.predict_proba(X_test)

            score = roc_auc_score(y_test, predictions[:, 1])

            if top_roc < score:
                top_roc = score

                tpr = dict()
                fpr = dict()
                roc = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test, predictions[:, i])
                    roc[i] = roc_auc_score(y_test, predictions[:, i])

    major_class = max(sampler.fit(X_train, y_train).stats_c_,
                      key=sampler.fit(X_train, y_train).stats_c_.get)

    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []

    for index in range(len(X_train)):
        if y_train[index] == major_class:
            major_class_X_train.append(X_train[index])
            major_class_y_train.append(y_train[index])
        else:
            minor_class_X_train.append(X_train[index])
            minor_class_y_train.append(y_train[index])

    # optimize for number of clusters here
    kmeans = KMeans(max_iter=200, n_jobs=4, n_clusters=number_of_clusters)
    kmeans.fit(major_class_X_train)

    # get the centroids of each of the clusters
    cluster_centroids = kmeans.cluster_centers_

    # get the points under each cluster
    points_under_each_cluster = {
        i: np.where(kmeans.labels_ == i)[0]
        for i in range(kmeans.n_clusters)
    }

    # accumulators for the under-sampled majority class (assumed initialisation;
    # the original fragment used these arrays without defining them first)
    X_train_major = np.empty((0, X_train.shape[1]))
    y_train_major = np.empty((0,))
    for i in range(number_of_clusters):
        size = len(points_under_each_cluster[i])
        random_indexes = np.random.randint(low=0,
                                           high=size,
                                           size=int(size / 2))
        temp = points_under_each_cluster[i]
        feature_indexes = temp[random_indexes]
        X_train_major = np.concatenate(
            (X_train_major, X_train[feature_indexes]), axis=0)
        y_train_major = np.concatenate(
            (y_train_major, y_train[feature_indexes]), axis=0)

    final_train_x = np.concatenate((X_train_major, minor_class_X_train),
                                   axis=0)
    final_train_y = np.concatenate((y_train_major, minor_class_y_train),
                                   axis=0)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=150))
    # classifier = sklearn.svm.SVC(C=50 , gamma= .0008 , kernel='rbf', probability=True)
    # classifier = sklearn.svm.SVC(C=100, gamma=.006, kernel='rbf', probability=True)

    classifier.fit(final_train_x, final_train_y)

    predicted = classifier.predict_proba(X_test)

    tpr_c = dict()
    fpr_c = dict()
    roc_c = dict()
    for i in range(n_classes):
        # score the classifier trained on the cluster-sampled data
        fpr_c[i], tpr_c[i], _ = roc_curve(y_test, predicted[:, i])
        roc_c[i] = roc_auc_score(y_test, predicted[:, i])

    print('ploting', dataset)
    #    plt.clf()
    plt.plot(fpr[1],
             tpr[1],
             lw=2,
             color='red',
             label='ROC curve: random under sampling')

    plt.plot(fpr_c[1],
             tpr_c[1],
             lw=2,
             color='navy',
             label='ROC curve: clustered sampling')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Area under ROC curve')
    plt.legend(loc="lower right")
    plt.show()
Code Example #59
0
File: train.py Project: sheneman/deep_lipid
# Output the most important random forest features to a telemetry file
feature_file = open(config["importance"], "w")
for feature in sorted(zip(flabels, rf_classifier.feature_importances_),
                      key=lambda x: x[1],
                      reverse=True):
    feature_file.write("%s,%f\n" % feature)
feature_file.close()

# Create Results Table Here for ROC Curves
result_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr', 'auc'])

# SVM
Y_pred = svm_classifier.predict(X_test)
yproba = svm_classifier.predict_proba(X_test)[::, 1]
fpr, tpr, _ = roc_curve(Y_test, yproba)
auc = roc_auc_score(Y_test, yproba)
result_table = result_table.append(
    {
        'classifiers': "SVM",
        'fpr': fpr,
        'tpr': tpr,
        'auc': auc
    },
    ignore_index=True)
print('SVM Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('SVM Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred))
print('SVM Root Mean Squared Error:',
      numpy.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))
print('SVM FPR:', fpr)
print('SVM TPR:', tpr)
print('SVM AUC:', auc)
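# A possible follow-up sketch: plot one ROC curve per row of result_table
# (assumes the table is filled as above, one row per classifier).
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 6))
for _, row in result_table.iterrows():
    plt.plot(row['fpr'], row['tpr'],
             label="%s (AUC = %.3f)" % (row['classifiers'], row['auc']))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
fig.savefig('roc_curves.png')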
Code Example #60
-1
File: test_movielens.py Project: linggom/lightfm
def test_user_supplied_features_accuracy():

    model = LightFM(random_state=SEED)
    model.fit_partial(
        train,
        user_features=train_user_features,
        item_features=train_item_features,
        epochs=10,
    )

    train_predictions = model.predict(
        train.row,
        train.col,
        user_features=train_user_features,
        item_features=train_item_features,
    )
    test_predictions = model.predict(
        test.row,
        test.col,
        user_features=test_user_features,
        item_features=test_item_features,
    )

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76