def test_zero_estimator_clf():
    # Test if ZeroEstimator works for classification.
    X = iris.data
    y = np.array(iris.target)
    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init=ZeroEstimator())
    est.fit(X, y)

    assert_greater(est.score(X, y), 0.96)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='zero')
    est.fit(X, y)

    assert_greater(est.score(X, y), 0.96)

    # binary clf
    mask = y != 0
    y[mask] = 1
    y[~mask] = 0
    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='zero')
    est.fit(X, y)
    assert_greater(est.score(X, y), 0.96)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='foobar')
    assert_raises(ValueError, est.fit, X, y)
def run_gradient_boosting_classifier(data, _max_depth):
    (feature_train, feature_test, label_train, label_test) = train_test_split(data[:, 0:-1], data[:, -1].astype(int),
                                                                              test_size=0.25)
    # TODO: Vary Number of Estimators and Learning Rate (a hedged GridSearchCV sketch follows this function)
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, max_depth=_max_depth, verbose = True)
    gbc.fit(feature_train, label_train)
    training_accuracy = gbc.score(feature_train, label_train)
    #cross_validation_score = cross_val_score(gbc, feature_train, label_train, cv=10)
    testing_accuracy = gbc.score(feature_test, label_test)

    print "Gradient Boosting Results for Max Depth:", _max_depth
    print "Training Accuracy:", training_accuracy
    #print "10-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (cross_validation_score.mean(), cross_validation_score.std() * 2)
    print "Testing Accuracy:", testing_accuracy

    feature_importance = gbc.feature_importances_
    stddev = np.std([tree[0].feature_importances_ for tree in gbc.estimators_], axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(len(feature_importance)):
        print("%d. feature %d (%f)" % (f + 1, indices[f], feature_importance[indices[f]]))

    plot_feature_importance(feature_importance, indices, stddev, "gradient-boosted-classifier-feature-importance-depth-" + str(_max_depth))
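# A minimal sketch addressing the TODO above (varying n_estimators and learning_rate),
# assuming the same (data, _max_depth) inputs as run_gradient_boosting_classifier.
# The helper name and the grid values are illustrative, not part of the original script.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

def grid_search_gradient_boosting(data, _max_depth):
    feature_train, feature_test, label_train, label_test = train_test_split(
        data[:, 0:-1], data[:, -1].astype(int), test_size=0.25)
    param_grid = {"n_estimators": [50, 100, 200],
                  "learning_rate": [0.05, 0.1, 0.2]}
    search = GridSearchCV(GradientBoostingClassifier(max_depth=_max_depth),
                          param_grid, cv=5)
    search.fit(feature_train, label_train)
    print("Best parameters: " + str(search.best_params_))
    print("Testing Accuracy: " + str(search.score(feature_test, label_test)))
    return search.best_estimator_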
def test_classification_synthetic():
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    for loss in ('deviance', 'exponential'):

        gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.09, \
            "GB(loss={}) failed with error {}".format(loss, error_rate)

        gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                          max_depth=1,
                                          learning_rate=1.0, subsample=0.5,
                                          random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.08, ("Stochastic GradientBoostingClassifier(loss={}) "
                                   "failed with error {}".format(loss, error_rate))
Example #4
def TestGradBoost(dat, lab):

    '''
    This function finds the optimal parameters for the classifier

    Parameters:
    -----------
    dat: numpy array with all records
    lab: numpy array with class labels of all records

    Returns:
    --------
    par: optimal parameters for the classifier

    '''

    # Gradient Boost parameters. Will choose one based on which does best on the validation set
    # learning_rate, subsample
    lr = np.linspace(0.01, 0.2, num = 5)
    sub = np.linspace(0.1, 1.0, num = 5)
    par = [(e,f) for e in lr for f in sub]

    # want to try different ensembles to get error bar on score
    num = 10
    seed = np.random.randint(1000000, size = num)
    valScore = np.zeros((num, len(par)))
    testScore = np.zeros((num, len(par)))

    for nv in xrange(0, num):

        print 'Ensemble:', nv + 1

        # split training data into train, validation, test (60, 20, 20)
        xTrain, xTmp, yTrain, yTmp = cross_validation.train_test_split(dat, lab, 
                                                                       test_size = 0.4, 
                                                                       random_state = seed[nv])
        xVal, xTest, yVal, yTest = cross_validation.train_test_split(xTmp, yTmp, 
                                                                     test_size = 0.5, 
                                                                     random_state = seed[nv])

        # now train a gradient boosting model for each parameter combination
        for i in xrange(0,len(par)):
        
            clf = GradientBoostingClassifier(learning_rate = par[i][0], subsample = par[i][1])
            clf = clf.fit(xTrain, yTrain)
            valScore[nv,i] = clf.score(xVal, yVal)
            testScore[nv,i] = clf.score(xTest, yTest)

    # Find optimal parameters
    tmp = np.argmax(np.mean(valScore, axis = 0))
    print
    print 'Optimal parameters (learning rate, subsampling):', par[tmp]
    print ('Mean | Std Score (Validation set):', np.mean(valScore, axis = 0)[tmp], 
           '|', np.std(valScore, axis = 0)[tmp])
    print ('Mean | Std Score (Test set):', np.mean(testScore, axis = 0)[tmp],
           '|', np.std(testScore, axis = 0)[tmp])

    # Return optimal parameters
    return par[tmp]
Example #5
def plotLearningCurve(dat,lab,optim):

    '''
    This function plots the learning curve for the classifier

    Parameters:
    -----------
    dat: numpy array with all records
    lab: numpy array with class labels of all records
    optim: optimal parameters for classifier

    '''

    clf = GradientBoostingClassifier(learning_rate = optim[0], subsample = optim[1])

    # split training data into train and test (already chose optimal parameters)
    xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(dat, lab, 
                                                                     test_size = 0.3)

    # choose various sizes of training set to model on to generate learning curve
    szV = range(10, np.shape(xTrain)[0], int(np.shape(xTrain)[0]) / 10)
    szV.append(np.shape(xTrain)[0])

    LCvals = np.zeros((len(szV),3), dtype = np.float64) # store data points of learning curve
    for i in xrange(0, len(szV)):
        clf = clf.fit(xTrain[:szV[i],:], yTrain[:szV[i]])
        LCvals[i,0] = szV[i]
        LCvals[i,1] = clf.score(xTest, yTest)
        LCvals[i,2] = clf.score(xTrain[:szV[i],:], yTrain[:szV[i]])

    #print LCvals

    # generate figure
    fig = plt.figure(1, figsize = (10,10))
    prop = matplotlib.font_manager.FontProperties(size=15.5)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(LCvals[:,0] / np.float64(np.shape(xTrain)[0]), 1.0 - LCvals[:,1], 
            label = 'Test Set')
    ax.plot(LCvals[:,0] / np.float64(np.shape(xTrain)[0]), 1.0 - LCvals[:,2],
            label = 'Training Set')
    ax.set_ylabel(r"Error", fontsize = 20)
    ax.set_xlabel(r"% of Training Set Used", fontsize = 20)
    ax.axis([0.0, 1.0, -0.1, 0.5])
    plt.legend(loc = 'upper right', prop = prop)
    plt.savefig('LC_GB.pdf', bbox_inches = 'tight')
    fig.clear()

    # where is model failing?
    
    predProb = clf.predict_proba(xTest)
    tmp = np.zeros((np.shape(predProb)[0], np.shape(predProb)[1] + 2))
    tmp[:,:-2] = predProb
    tmp[:,-2] = clf.predict(xTest)
    tmp[:,-1] = yTest
    mask = tmp[:,-2] != tmp[:,-1]
    print tmp[mask]
    print mask.sum(), len(xTest)
    
    print tmp[:50,:]
Example #6
def trainAndPredict(num_trees, train_num):
    train_X = X[:train_num]
    train_y = y[:train_num]  

    test_X = X[train_num:]
    test_y = y[train_num:]

    #clf = svm.SVC()
    clf = GradientBoostingClassifier(n_estimators=num_trees, learning_rate=0.5, max_depth=2, random_state=0)
    clf.fit(train_X, train_y) 
    return (clf.score(train_X, train_y), clf.score(test_X, test_y))
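# A hedged sketch of an alternative to calling trainAndPredict once per tree count:
# with warm_start=True, raising n_estimators and calling fit() again only grows the
# existing ensemble instead of re-fitting from scratch. Assumes the same module-level
# X and y; the helper name is illustrative.
from sklearn.ensemble import GradientBoostingClassifier

def trainIncrementally(tree_counts, train_num):
    train_X, train_y = X[:train_num], y[:train_num]
    test_X, test_y = X[train_num:], y[train_num:]
    clf = GradientBoostingClassifier(learning_rate=0.5, max_depth=2,
                                     random_state=0, warm_start=True)
    scores = []
    for n in sorted(tree_counts):      # warm_start needs non-decreasing n_estimators
        clf.n_estimators = n
        clf.fit(train_X, train_y)      # adds only the missing trees
        scores.append((n, clf.score(train_X, train_y), clf.score(test_X, test_y)))
    return scores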
Example #7
def main():

    # generate synthetic binary classification data
    # (name refers to example 10.2 in ESL textbook...see refs below)
    X, y = make_hastie_10_2()

    # perform train/test split (no need to shuffle)
    split_pt = int(TRAIN_PCT * len(X))
    X_train, X_test = X[:split_pt], X[split_pt:]
    y_train, y_test = y[:split_pt], y[split_pt:]

    # single dec stump
    stump_clf = DecisionTreeClassifier(
        max_depth=1)
    stump_clf.fit(X_train, y_train)
    stump_score = round(stump_clf.score(X_test, y_test), 3)
    print 'decision stump acc = {}\t(max_depth = 1)'.format(stump_score)

    # single dec tree (max_depth=3)
    tree_clf = DecisionTreeClassifier(max_depth=3)
    tree_clf.fit(X_train, y_train)
    tree_score = round(tree_clf.score(X_test, y_test), 3)
    print 'decision tree acc = {}\t(max_depth = 3)\n'.format(tree_score)

    # gbt: a powerful ensemble technique
    gbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print 'fitting gbt for n_estimators = {}...'.format(k)

        gbt_clf = GradientBoostingClassifier(
            n_estimators=k,         # number of weak learners for this iteration
            max_depth=1,            # weak learners are dec stumps
            learning_rate=1.0)      # regularization (shrinkage) hyperparam

        gbt_clf.fit(X_train, y_train)
        gbt_scores.append(round(gbt_clf.score(X_test, y_test), 3))

    print '\ngbt accuracy =\n{}\n'.format(gbt_scores)

    # stochastic gbt (using subsampling)
    sgbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print 'fitting sgbt for n_estimators = {}...'.format(k)

        sgbt_clf = GradientBoostingClassifier(
            n_estimators=k,         # number of weak learners for this iteration
            max_depth=1,            # weak learners are dec stumps
            subsample=0.5,          # % of training set used by each base learner
            learning_rate=1.0)      # regularization (shrinkage) hyperparam

        sgbt_clf.fit(X_train, y_train)
        sgbt_scores.append(round(sgbt_clf.score(X_test, y_test), 3))

    print '\nsgbt accuracy =\n{}'.format(sgbt_scores)
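# A hedged sketch of the same sweep without re-fitting for every n_estimators value:
# one fit of the largest ensemble plus staged_predict() gives the test accuracy after
# each boosting stage. Assumes the same X_train/X_test split as main(); the helper
# name is illustrative.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

def staged_test_accuracy(X_train, y_train, X_test, y_test, max_estimators=500):
    clf = GradientBoostingClassifier(n_estimators=max_estimators, max_depth=1,
                                     learning_rate=1.0)
    clf.fit(X_train, y_train)
    # staged_predict yields predictions after 1, 2, ..., max_estimators stages
    return np.array([accuracy_score(y_test, y_pred)
                     for y_pred in clf.staged_predict(X_test)])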
Example #8
def l1_penalty_solver(train_data,test_data,n_est,m_d):

    best = 0.0
    best_Output = []
    for j in [10**(x) for x in xrange(-3,-2,1)]:
        
        X, y = train_data[:,1::], train_data[:,0]
        x1, y1 = test_data[:,1::], test_data[:,0]
        
        # Set regularization parameter
        for C in range(10,11,1):
            # turn down tolerance for short training time
            #cls = svm.SVC(kernel='poly',degree=3).fit(X,y)
            cls = GradientBoostingClassifier(n_estimators=n_est,max_depth=m_d).fit(X,y)
            #cls = DecisionTreeClassifier().fit(X,y)
            #cls = LogisticRegression(C=C, penalty='l1', tol=j).fit(X, y)
            #cls = LogisticRegression(C=C, penalty='l2', tol=j).fit(X, y) 

            val1 = cls.predict(x1)
            #val1 = cls.predict(x1)
            val2 = val1 #cls.predict(x1)
                        
            count = 0.
            for i in range(len(val1)):
                if val1[i] == y1[i]:
                    count +=1.
                else:
                    continue
            result1 = count/len(val1)

            count = 0.
            for i in range(len(val2)):
                if val2[i] == y1[i]:
                    count +=1.
                else:
                    continue
            result2 = count/len(val2)
    
            if result1>best:
                best = result1
                best_Output = val1
            if result2>best:
                best = result2
                best_Output = val2
     
        
    pr.print_results(best_Output)
    #return best
    return [cls.score(X,y),cls.score(x1,y1)]
Example #9
def test_iris():
    """Check consistency on dataset iris."""
    for subsample in (1.0, 0.5):
        clf = GradientBoostingClassifier(n_estimators=100, loss="deviance", random_state=1, subsample=subsample)
        clf.fit(iris.data, iris.target)
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with subsample %.1f " "and score = %f" % (subsample, score)
Example #10
def gbPredict(LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START, N_FOLD, EX_F, TRAIN_DATA_X, TRAIN_DATA_Y, TEST__DATA_X, isProb):
    # feature extraction
    ### clf  = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    ### extA = delFeatMin(clf.feature_importances_, EX_F)
    ### TRAIN_DATA_X = TRAIN_DATA_X[:, extA]
    # k-fold validation
    kf   = KFold(TRAIN_DATA_Y.shape[0], n_folds=N_FOLD)
    tesV = 0.0
    for train_index, test_index in kf:
        X_train, X_test = TRAIN_DATA_X[train_index], TRAIN_DATA_X[test_index]
        y_train, y_test = TRAIN_DATA_Y[train_index], TRAIN_DATA_Y[test_index]
        clf  =  GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(X_train, y_train)
        tesK =  1 - clf.score(X_test, y_test)
        tesV += tesK
    eVal = tesV / N_FOLD
    # train all data
    clf  = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    ### TEST__DATA_X = TEST__DATA_X[:, extA]  # disabled with the feature-extraction block above (extA is undefined while that block is commented out)
    if isProb:
        data = clf.predict_proba(TEST__DATA_X)
    else:
        data = clf.predict(TEST__DATA_X)

    print "Eval =", eVal, "with n_esti =", N_EST, "l_rate =", L_RATE, "m_dep =", M_DEPT, "sub_s =", SUB_S, "ex_num =", EX_F, "and loss is", LOSS

    return (data, eVal)
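# A hedged, more compact version of the manual k-fold loop in gbPredict above, using
# cross_val_score; it returns the mean error rate (1 - accuracy), mirroring eVal.
# The helper name and signature are illustrative.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

def cv_error_rate(X, y, n_folds, **gb_params):
    clf = GradientBoostingClassifier(**gb_params)
    scores = cross_val_score(clf, X, y, cv=n_folds)
    return 1.0 - scores.mean()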
def train_gbt(filename, color, name):
	'''Train on Gradient Boosted Trees Classifier'''
	# Read data
	data2 = pd.read_csv(filename, encoding="utf-8")
	X = data2.iloc[:, 1:-1]
	y = data2.iloc[:, -1]

	# Split into train and validation sets
	X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

	# Define model
	clf1 = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, random_state=42)
	
	# Fit model
	t0 = time()
	clf1.fit(X_train, y_train)
	pred_probas = clf1.predict_proba(X_val)

	predictions = clf1.predict(X_val)
	
	print "Score", clf1.score(X_val, y_val)

	importances = clf1.feature_importances_
	indices = np.argsort(importances)[::-1]
	
	# Metrics & Plotting
	metrics[1, 0] = precision_score(y_val, predictions)
	metrics[1, 1] = recall_score(y_val, predictions)
	metrics[1, 2] = f1_score(y_val, predictions)
	metrics[1, 3] = time() - t0

	fpr_rf, tpr_rf, _ = roc_curve(y_val, predictions)
	plt.plot(fpr_rf, tpr_rf, color=color, label=name)

	return importances, indices
Example #12
def classify(train, train_sample_ids, test_sample_ids, whichClassifier):
  feature_names = list(train.columns)
  feature_names.remove("click_bool")
  feature_names.remove("booking_bool")
  feature_names.remove("gross_bookings_usd")
  #feature_names.remove("date_time")
  feature_names.remove("position")

  # Create Train and Test
  trainX = train[feature_names][train_sample_ids]
  testX = train[feature_names][test_sample_ids]
  Y_columns = ["click_bool", "booking_bool", "position"]
  trainY = train[Y_columns][train_sample_ids].apply(lambda x: objective(x, whichClassifier), axis=1)
  testY = train[Y_columns][test_sample_ids].apply(lambda x: objective(x, whichClassifier), axis=1)

  print "Train: ", len(trainY)
  print "Test: ", len(testY)

  print("Training the Classifier")
  classifier = GradientBoostingClassifier(n_estimators=1024, 
                                          verbose=3,
                                          subsample=0.8,
                                          min_samples_split=10,
                                          max_depth = 6,
                                          random_state=1)
  classifier.fit(trainX, trainY)
    
  print "Score = ", classifier.score(testX, testY)

  return classifier
def GB_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting Gradient Boosting***************")
    t0 = time()
    clf = GradientBoostingClassifier(n_estimators=500,learning_rate=0.01)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("Gradient Boosting - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)

    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending Gradient Boosting***************")
    return pd.DataFrame(preds2),pd.DataFrame(preds3),pd.DataFrame(preds4)
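# A hedged sketch of letting scikit-learn pick the number of boosting stages instead of
# hard-coding n_estimators=500: with n_iter_no_change set, fit() holds out
# validation_fraction of the training data and stops once the validation score stops
# improving. Parameter values are illustrative.
from sklearn.ensemble import GradientBoostingClassifier

def gb_with_early_stopping(X_train, Y_train):
    clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01,
                                     validation_fraction=0.1, n_iter_no_change=10,
                                     tol=1e-4)
    clf.fit(X_train, Y_train)
    print("Boosting stopped after {} estimators".format(clf.n_estimators_))
    return clf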
Example #14
    def rand_forest_train(self):
        # Read the locally stored user feature data
        users = pd.read_csv('names.csv')
        # Use similarity, platform, reputation and entropy as the features for telling humans from machines
        X = users[['similarity', 'platform', 'reputation', 'entropy']]
        y = users['human_or_machine']

        # Split the original data; 25% is held out for testing
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

        # Convert the categorical features into feature vectors
        from sklearn.feature_extraction import DictVectorizer
        vec = DictVectorizer(sparse=False)
        X_train = vec.fit_transform(X_train.to_dict(orient='records'))
        X_test = vec.transform(X_test.to_dict(orient='records'))

        # Train and predict with a single decision tree
        from sklearn.tree import DecisionTreeClassifier
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        dtc_y_pred = dtc.predict(X_test)

        # Train and predict with a random forest classifier
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        rfc.fit(X_train, y_train)
        rfc_y_pred = rfc.predict(X_test)

        # Train and predict with gradient boosted decision trees
        from sklearn.ensemble import GradientBoostingClassifier
        gbc = GradientBoostingClassifier()
        gbc.fit(X_train, y_train)
        gbc_y_pred = gbc.predict(X_test)

        from sklearn.metrics import classification_report
        # Report the single decision tree's test-set accuracy, plus detailed precision, recall and F1
        print("Decision tree accuracy:", dtc.score(X_test, y_test))
        print(classification_report(y_test, dtc_y_pred))

        # Report the random forest classifier's test-set accuracy, plus detailed precision, recall and F1
        print("Random forest accuracy:", rfc.score(X_test, y_test))
        print(classification_report(y_test, rfc_y_pred))

        # Report the gradient boosting classifier's test-set accuracy, plus detailed precision, recall and F1
        print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
        print(classification_report(y_test, gbc_y_pred))


        users = pd.read_csv('values.csv')

        # Check whether each record is a machine or a human
        X = users[['similarity', 'platform', 'reputation', 'entropy']]
        X = vec.transform(X.to_dict(orient='records'))
        print(rfc.predict(X))

        self.dtc = dtc
        self.rfc = rfc
        self.gbc = gbc
def main():
    
    train_f = pd.read_csv(train_path, header=0, parse_dates=['Dates'])
    print train_f.dtypes

    X, Y = get_feature(train_f, "training_set")
    

    ### TRAINING
    clf = GradientBoostingClassifier(n_estimators=50)
    # clf = RandomForestClassifier(n_estimators=2)
    # clf = LogisticRegression(n_jobs=4)

    X, Y = shuffle_XY(X, Y)
    data_len = len(X)
    train_len = data_len * 95 / 100 
    val_len = data_len - train_len
    X_train = X[:train_len]
    X_val = X[train_len:]
    Y_train = Y[:train_len]
    Y_val = Y[train_len:]
    
    clf = clf.fit(X_train, Y_train)
    print "Training done"

    
    val_acc = clf.score(X_val, Y_val)
    print "Val acc:", val_acc

    val_pred = clf.predict_proba(X_val)
    

    # print max(Y_val), min(Y_val)
    # print Y_val, Y_val + 1
    val_log = 0.0
    cnt = 0
    for y in Y_val:
        val_log += math.log(val_pred[cnt, y]+0.0000001)
        cnt += 1
    val_log =  - val_log / len(Y_val)
    print "Val log loss:", val_log
 
    # print "Val loss:", log_loss(Y_val+1, val_pred) # Note the +1 here!
    """
    # scores = cross_val_score(clf, X, Y)
    # print "Cross val acc:", scores.mean()
    """

    ### Testing

    test_f = pd.read_csv(test_path, header=0, parse_dates=['Dates'])
    # print test_f.dtypes

    X_test, _ = get_feature(test_f, "test_set")
    Y_test = clf.predict_proba(X_test)

    ### Write results
    # write_results(Y_test)
    write_results_prob(Y_test)
def test_oob_multilcass_iris():
    # Check OOB improvement on multi-class dataset.
    clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                     random_state=1, subsample=0.5)
    clf.fit(iris.data, iris.target)
    score = clf.score(iris.data, iris.target)
    assert_greater(score, 0.9)
    assert_equal(clf.oob_improvement_.shape[0], clf.n_estimators)
Example #17
def gbdt_clf(x_train,x_test,y_train,y_test):
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(x_train,y_train)
    y_pred = clf.predict_proba(x_test)[:,1]
    print "gbdt F1 scores",clf.score(x_test,y_test)
    scores = roc_auc_score(y_test,y_pred)
    print "gbdt_clf scores: ",scores
    joblib.dump(clf,'./output/gbdt_clf.model')
def test_oob_multilcass_iris():
    """Check OOB improvement on multi-class dataset."""
    clf = GradientBoostingClassifier(n_estimators=100, loss="deviance", random_state=1, subsample=0.5)
    clf.fit(iris.data, iris.target)
    score = clf.score(iris.data, iris.target)
    assert score > 0.9, "Failed with subsample %.1f " "and score = %f" % (0.5, score)

    assert clf.oob_improvement_.shape[0] == clf.n_estimators
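# A hedged sketch of using oob_improvement_ (available when subsample < 1.0, as in the
# test above) to gauge how many boosting stages actually help: the cumulative sum of the
# out-of-bag loss improvements flattens once extra estimators stop paying off.
# The helper name is illustrative.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

def estimate_useful_n_estimators(X, y, n_estimators=100):
    clf = GradientBoostingClassifier(n_estimators=n_estimators, subsample=0.5,
                                     random_state=1)
    clf.fit(X, y)
    cumulative_oob = np.cumsum(clf.oob_improvement_)   # one entry per stage
    return int(np.argmax(cumulative_oob)) + 1          # stage index -> estimator count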
def gradientBoostingClassify():
    maximumValue = 0
    returnParameters = ['0']
    for value in xrange(50,350,50):
        clfDeviance = GradientBoostingClassifier(n_estimators = value,loss='deviance')
        clfDeviance.fit(trainData, trainLabel)

        scoreEnt = clfDeviance.score(validationData, validationLabel)

        if scoreEnt > maximumValue:
            maximumValue = scoreEnt
            returnParameters[0] = str(value)
    neighTest = GradientBoostingClassifier(n_estimators = int(returnParameters[0]),loss='deviance')

    neighTest.fit(trainData, trainLabel)
    scoreTest = neighTest.score(testData, testLabel)
    guideToGraph['Gradient Boosting'] = scoreTest
def sentiment_analysis_random_forest(train, test, word2vec_model, num_features, num_estimators):
    trainDataVecs = getAvgFeatureVecs(train["review"], word2vec_model, num_features)
    testDataVecs = getAvgFeatureVecs(test["review"], word2vec_model, num_features)
    #forest = RandomForestClassifier(n_estimators=num_estimators)
    forest = GradientBoostingClassifier(n_estimators=num_estimators)
    forest.fit(trainDataVecs, train["sentiment"])
    result = forest.score(testDataVecs, test["sentiment"])
    print result
    return result
Example #21
def performGTBClass(X_train, y_train, X_test, y_test):
    """
    Gradient Tree Boosting binary Classification
    """
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    #auc = roc_auc_score(y_test, clf.predict(X_test))
    return accuracy
def gradientBoost(X, y, train, valid):
	from sklearn.ensemble import GradientBoostingClassifier
	clf1 = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=1, random_state=0).fit(X[train], y[train])
	print("gradientboosting" + str(clf1.score(X[valid].toarray(), y[valid])))
	yhat = clf1.predict(X[valid].toarray())
	yhat_prob = clf1.predict_proba(X[valid].toarray())[:,1]
	print(classification_report(y[valid], yhat))
	print("gradient boosting roc_accuracy" + str(roc_auc_score(y[valid], yhat_prob)))
	np.savetxt("y_gb.csv", yhat_prob)
	return yhat_prob
Example #23
def gb_classify(self):
	print "Gradient Boosting"
	clf = GradientBoostingClassifier()
	clf.fit(self.descr, self.target)
	mean = clf.score(self.test_descr, self.test_target)
	pred = clf.predict(self.test_descr)

	print "Pred ", pred
	print "Mean : %3f" % mean
	print "Feature Importances ", clf.feature_importances_
def test_iris():
    # Check consistency on dataset iris.
    for subsample in (1.0, 0.5):
        for sample_weight in (None, np.ones(len(iris.target))):
            clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                             random_state=1, subsample=subsample)
            clf.fit(iris.data, iris.target, sample_weight=sample_weight)
            score = clf.score(iris.data, iris.target)
            assert score > 0.9, "Failed with subsample %.1f " \
                "and score = %f" % (subsample, score)
Example #25
def predictGBC(X, y):
	col_mean = np.nanmean(X,axis=0)
	inds = np.where(np.isnan(X))
	X[inds]=np.take(col_mean,inds[1])
	
	gbc = GBC(n_estimators = 100)

	X_train, X_test, y_train, y_test = chooseRandom(X, y)
	gbc.fit(X_train, y_train)
	return gbc.score(X_test, y_test)
def train(f, file_path):
  file_pt = open(file_path, "r")
  title = file_pt.readline()
  ret = None
  for l in file_pt.readlines():
    res = l.split(",")
    fet = f.create_features_from_res(res)
    if ret is None:
      ret = fet
    elif fet is not None:
      ret = numpy.vstack((ret, fet))
    print ret.shape

#  classifier = RandomForestClassifier(n_estimators=100, 
#                                      verbose=2,
#                                      n_jobs=1,
#                                      min_samples_split=10,
#                                      random_state=1)

  classifier = GradientBoostingClassifier(n_estimators=512, 
                                          verbose=3,
                                          max_depth=6,
                                          min_samples_split=10,
                                          subsample=0.8,
                                          random_state=1)

  valid_ret = validate(f,  data_io.get_paths()["valid_sol_path"], classifier)
  ret = numpy.vstack( (ret, valid_ret) )
  print "Final size: ", ret.shape

  trainX, testX, trainY, testY = train_test_split(ret[:, 3:], ret[:, 0], random_state=1)

  classifier.fit(trainX, trainY)

  numpy.savetxt(data_io.get_paths()["feature_path"], ret.astype(float), fmt='%f', delimiter=",")

  print classifier.score(testX, testY)
  #validate(f, data_io.get_paths()["valid_sol_path"], classifier)
  print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])

  return classifier
Example #27
def do_gradient_boost(lr = 1.0, md = 1):
    #The best values of lr and md have to be determined through grid search
    # for this dataset ~ lr =0.05, md =3 gave 0.769 on the test set
    from sklearn.ensemble import GradientBoostingClassifier

    train_X, train_Y, test_X, test_Y = analysis_glass()

    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=lr,\
                                     max_depth=md, \
                                     random_state=0).fit(train_X, train_Y)
    
    return clf.score(test_X, test_Y)
Example #28
def test_classification_synthetic():
    """Test GradientBoostingClassifier on synthetic dataset used by
    Hastie et al. in ESLII Example 12.7. """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(
        n_estimators=100, min_samples_split=2, max_depth=1, learning_rate=1.0, random_state=0
    )
    gbrt.fit(X_train, y_train)
    error_rate = 1.0 - gbrt.score(X_test, y_test)
    assert error_rate < 0.085, "GB failed with error %.4f" % error_rate

    gbrt = GradientBoostingClassifier(
        n_estimators=200, min_samples_split=2, max_depth=1, learning_rate=1.0, subsample=0.5, random_state=0
    )
    gbrt.fit(X_train, y_train)
    error_rate = 1.0 - gbrt.score(X_test, y_test)
    assert error_rate < 0.08, "Stochastic GB failed with error %.4f" % error_rate
def check_iris(presort, subsample, sample_weight):
    # Check consistency on dataset iris.
    clf = GradientBoostingClassifier(n_estimators=100,
                                     loss='deviance',
                                     random_state=1,
                                     subsample=subsample,
                                     presort=presort)
    clf.fit(iris.data, iris.target, sample_weight=sample_weight)
    score = clf.score(iris.data, iris.target)
    assert_greater(score, 0.9)

    leaves = clf.apply(iris.data)
    assert_equal(leaves.shape, (150, 100, 3))
def check_classification_synthetic(presort, loss):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.09)

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0,
                                      presort=presort)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.08)
def execute_classifiers(X_train, y_train, X_test, y_test, X_train_scaled,
                        X_test_scaled):

    ########################################### Random Forest ##########################################

    print(datetime.datetime.now())
    print('\n')
    print('Random Forests')
    print('\n')
    clf = RandomForestClassifier(verbose=1, n_estimators=2000)
    clf.fit(X_train, y_train)
    print("Accuracy on training set is : {}".format(clf.score(
        X_train, y_train)))
    print("Accuracy on test set is : {}".format(clf.score(X_test, y_test)))
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ############################################ XGBoost ###############################################

    print(datetime.datetime.now())
    print('\n')
    print('XGB Classifier')
    print('\n')

    xgb_cls = XGBClassifier(objective="multi:softprob",
                            num_class=20,
                            random_state=61,
                            colsample_bytree=0.6,
                            learning_rate=0.1,
                            n_estimators=200,
                            max_depth=8,
                            alpha=0.01,
                            gamma=0.001,
                            subsample=0.6)

    xgb_cls.fit(X_train, y_train)
    print("Accuracy on training set is : {}".format(
        xgb_cls.score(X_train, y_train)))
    print("Accuracy on test set is : {}".format(xgb_cls.score(X_test, y_test)))
    y_pred = xgb_cls.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ############################################# GB  ##################################################

    print(datetime.datetime.now())
    print('\n')
    print('GB Classifier')
    print('\n')

    gb_cls = GradientBoostingClassifier(min_samples_split=500,
                                        min_samples_leaf=50,
                                        max_depth=8,
                                        max_features='sqrt',
                                        subsample=0.8,
                                        n_estimators=200,
                                        learning_rate=0.2)

    gb_cls.fit(X_train, y_train)
    print("Accuracy on training set is : {}".format(
        gb_cls.score(X_train, y_train)))
    print("Accuracy on test set is : {}".format(gb_cls.score(X_test, y_test)))
    y_pred = gb_cls.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ############################################ Knn ##################################################

    print(datetime.datetime.now())
    print('\n')
    print('Knn Classifier')
    print('\n')
    k = 11
    knn_cls = KNeighborsClassifier(n_neighbors=k)
    knn_cls.fit(X_train_scaled, y_train)
    print("Accuracy on training set is : {}".format(
        knn_cls.score(X_train_scaled, y_train)))
    print("Accuracy on test set is : {}".format(
        knn_cls.score(X_test_scaled, y_test)))
    y_pred = knn_cls.predict(X_test_scaled)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ########################################### SVM Classifier ########################################

    print(datetime.datetime.now())
    print('\n')
    print('LinearSVC Classifier')
    print('\n')
    svm_cls = LinearSVC(C=1)
    svm_cls.fit(X_train_scaled, y_train)
    print("Accuracy on training set is : {}".format(
        svm_cls.score(X_train_scaled, y_train)))
    print("Accuracy on test set is : {}".format(
        svm_cls.score(X_test_scaled, y_test)))
    y_pred = svm_cls.predict(X_test_scaled)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(datetime.datetime.now())

    return True
from sklearn.ensemble import GradientBoostingClassifier

cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(\
     cancer.data, cancer.target, 
     stratify=cancer.target, random_state=0)

clf = GradientBoostingClassifier(random_state=0, 
                                 n_estimators=1500,
                                 max_depth=3, 
                                 learning_rate=0.01, 
                                 subsample=0.5)
clf.fit(X_train, y_train)

print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))

import errno
import os
import pickle

try:
    if not(os.path.isdir("../../save")):
        os.makedirs(os.path.join("../../save"))
except OSError as e:
    if e.errno != errno.EEXIST:
        print("Failed to create directory!!!!!")
        raise
        
with open("../../save/save_model_using_pickle.bin", "wb") as f :
    pickle.dump(clf, f)
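# A hedged companion sketch: loading the pickled model back and reusing it on the same
# held-out split as above.
with open("../../save/save_model_using_pickle.bin", "rb") as f:
    restored_clf = pickle.load(f)
print("Restored test accuracy: {:.3f}".format(restored_clf.score(X_test, y_test)))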
Y_pred_SVM = linear_svc.predict(X_test)

acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

#tree decision
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred_tree = decision_tree.predict(X_test)  
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

# GradientBoostingClassifier
gbs= GradientBoostingClassifier(random_state=1990)
gbs.fit(X_train,Y_train)
Y_pre_gbs=gbs.predict(X_test)

acc_gbs = round(gbs.score(X_train, Y_train) * 100, 2)

from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring = "accuracy")

#confusion metrics
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(rf, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions)

## precision and recall
from sklearn.metrics import precision_score, recall_score
print("Precision:", precision_score(Y_train, predictions))
print("Recall:",recall_score(Y_train, predictions))
    # # os.system('afplay /System/Library/Sounds/Sosumi.aiff')
    #
    #
    # Predict test data.
    y_true = np.array(Y_test)
    y_pred = clf.predict(X_test)
    #
    # # https://de.wikipedia.org/wiki/Kontingenztafel
    # # assess
    table = pd.crosstab(pd.Series(y_true),
                        pd.Series(y_pred),
                        rownames=['True'],
                        colnames=['Predicted'],
                        margins=True)
    print(table)
    print(clf.score(X_test, Y_test))

    # print(hits[hits>0])
    # print(importance[importance>0])
    if i % 100 == 0:
        print(i)
        # print(big_test_featureImportance[big_test_featureImportance>0])
        # np.savetxt('/home/florian/Dropbox/Masterarbeit/data/machineLearning/2017-06-23T09:42:48.759694/hits.csv', hits, fmt='%.1d', delimiter=',')
        # np.savetxt('/home/florian/Dropbox/Masterarbeit/data/machineLearning/2017-06-23T09:42:48.759694/importance.csv', importance,  delimiter=',')
# big_test_featureImportance = importance / hits
#
# np.savetxt('/home/florian/Dropbox/Masterarbeit/data/machineLearning/2017-06-23T09:42:48.759694/featureImportance_bigTest.csv',big_test_featureImportance,  delimiter=',')

# exit()
'''
Predicted  Avian  Human  Swine   All
Example #35
    plt.ylabel("Признак")
    plt.show()


plot_feature_importances_cancer(forest)

"In[72]:"
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    random_state=0)

gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.3f}".format(
    gbrt.score(X_train, y_train)))
print("Правильность на тестовом наборе: {:.3f}".format(
    gbrt.score(X_test, y_test)))

"In[73]:"
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)

print("Правильность на обучающем наборе: {:.3f}".format(
    gbrt.score(X_train, y_train)))
print("Правильность на тестовом наборе: {:.3f}".format(
    gbrt.score(X_test, y_test)))

"In[74]:"
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)
print('voting', vcr.score(X_train, Y_train))

logreg.fit(X_train, Y_train)
rf.fit(X_train, Y_train)
xg.fit(X_train, Y_train)
svc.fit(X_train, Y_train)
extree.fit(X_train, Y_train)
knn.fit(X_train, Y_train)
gb.fit(X_train, Y_train)
print('logreg', logreg.score(X_train, Y_train))
print('randforest', rf.score(X_train, Y_train))
print('extree', extree.score(X_train, Y_train))
print('svc', svc.score(X_train, Y_train))
print('xg', xg.score(X_train, Y_train))
print('knn', knn.score(X_train, Y_train))
print('gb', gb.score(X_train, Y_train))
#report(random_search.cv_results_)
#report(random_search_xgb.cv_results_)

# In[145]:

# get Correlation Coefficient for each feature using Logistic Regression
coeff_df = DataFrame(titanic_df.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df

# In[146]:
Example #37
def RandomLearning(LPATH, LFILE, LCNT, b, color, idn):
    def print2f(MSG):
        lf = open(LOGPATH + LFILE + '_out' + idn + '_random.txt', 'a')
        print >> lf, MSG
        lf.close()

    ff = open(LPATH + LFILE, 'r')
    idx = 0
    fRNA = np.zeros((LCNT, 23 * 4))
    label = np.zeros((LCNT, ))
    for line in ff:
        f = line.split('\t')
        if (int(f[1]) == 1):
            label[idx] = 1
        else:
            label[idx] = -1
        fRNA[idx] = Ronehot(f[0])
        idx += 1

    X_train, X_test, y_train, y_test = train_test_split(fRNA,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=0)
    print2f((np.shape(X_train), np.shape(y_train), np.shape(X_test),
             np.shape(y_test)))

    TRAINLEN = np.shape(X_train)[0]
    INTLEN = int(TRAINLEN * 0.01)

    aa = np.split(X_train, [TRAINLEN - INTLEN, TRAINLEN - INTLEN])
    X_train = aa[0]
    inttrain_x = aa[2]
    aa = np.split(y_train, [TRAINLEN - INTLEN, TRAINLEN - INTLEN])
    y_train = aa[0]
    inttrain_y = aa[2]

    TRAINLEN = np.shape(X_train)[0]
    INTLEN = np.shape(inttrain_x)[0]
    AUGLEN = TRAINLEN * 16

    Uset = set()
    Lset = set()
    for i in range(0, INTLEN):
        Lset.add(i)
    for i in range(INTLEN, TRAINLEN):
        Uset.add(i)

    train_x = np.zeros((AUGLEN, 23 * 4))
    train_y = np.zeros((AUGLEN, ))
    train_l = np.zeros((AUGLEN, ))
    patch = [set() for x in range(0, TRAINLEN)]

    for i in range(0, TRAINLEN):
        sample = X_train[i]
        R1 = np.zeros((4))
        R2 = np.zeros((4))
        for j in range(0, 4):
            R1[j] = 1
            for k in range(0, 4):
                R2[k] = 1
                RR = np.concatenate((R1, R2))
                for x in range(0, 8):
                    sample[x] = RR[x]
                train_x[i * 16 + j * 4 + k] = sample
                train_y[i * 16 + j * 4 + k] = y_train[i]
                train_l[i * 16 + j * 4 + k] = i
                patch[i].add(i * 16 + j * 4 + k)
                R2[k] = 0
            R1[j] = 0

    print2f((TRAINLEN, AUGLEN, INTLEN))
    print2f(
        (np.shape(X_train)[0], np.shape(train_x)[0], np.shape(inttrain_x)[0]))
    print2f((patch[0]))

    clf = GradientBoostingClassifier().fit(inttrain_x, inttrain_y)
    print2f(("init: ", clf.score(X_test, y_test)))
    clf2 = GradientBoostingClassifier().fit(X_train, y_train)
    print2f(("complete: ", clf2.score(X_test, y_test)))

    #for i in range(10,20):
    #    print (clf.predict(X_test[i]), clf.predict_proba(X_test[i])[0][1], clf.predict_log_proba(X_test[i])[0][1], math.log(clf.predict_proba(X_test[i])[0][1]), y_test[i])

    eps = np.spacing(1)
    ITER = int(TRAINLEN / b)
    patchsize = 16
    predpatch = [0.0 for x in range(0, patchsize)]
    ACC = []
    ITR = []
    LAB = []

    for IT in range(0, ITER):
        if (INTLEN + b > TRAINLEN):
            print2f(("OUT OF RANGE "))
            break
        Rm = random.sample(Uset, b)
        for elm in Rm:
            Lset.add(elm)
            Uset.remove(elm)
            inttrain_x = np.concatenate((inttrain_x, [X_train[elm]]), axis=0)
            inttrain_y = np.concatenate((inttrain_y, [y_train[elm]]), axis=0)
            INTLEN += 1
        print2f((np.shape(inttrain_x)[0], len(Lset), len(Uset)))
        clf = GradientBoostingClassifier().fit(inttrain_x, inttrain_y)
        res = clf.score(X_test, y_test)
        print2f(("iter: ", IT, res))
        ACC.append(res)
        ITR.append(IT)
        LAB.append(len(Lset))

    plt.plot(LAB, ACC, color)
    #plt.plot(LAB,ACC,'b*')
    plt.xlabel('Num of labels')
    plt.ylabel('Accuracy')
    plt.ylim(0.5, 1.0)
    plt.title(LFILE)
    plt.show()
Example #38
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

X = pd.read_csv('Features1.csv')
y = pd.read_csv('Res.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Gb = GradientBoostingClassifier(learning_rate=0.09, max_depth=2)
Gb.fit(X_train, y_train.values.ravel())
print("Accuracy on training set: {:.3f}".format(Gb.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(Gb.score(X_test, y_test)))
Example #39
class_scores['rfc'] = rfc.predict(tweets[list(emoji_codes)])


from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train,y_train)
lr.score(X_train,y_train)

class_scores['lr'] = lr.predict(imdb[list(emoji_codes)])

class_scores.sample(10)

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(verbose=1)
gbc.fit(X_train,y_train)
gbc.score(X_train,y_train)
gbc.score(X_test,y_test)


from sklearn.model_selection import GridSearchCV
param_grid = {"learning_rate":[1,.5,0.1,.01], "n_estimators":[100, 300, 1000], "max_leaf_nodes":[5,10,None]}
clf = GridSearchCV(gbc, param_grid, verbose=2)
clf.fit(X_train, y_train)

clf.best_estimator_.score(X_train, y_train)
clf.best_estimator_.score(X_test, y_test)

gbc = clf.best_estimator_.predict_proba(imdb[list(emoji_codes)])[:,0]
class_scores['gbc_norm'] = np.sign((gbc-np.mean(gbc))/np.std(gbc))

class_scores
Example #40
#!/usr/bin/env python

# ~/spy611/script/simple/scikit-learn_demo.py

# Demo:
# python ~/spy611/script/simple/scikit-learn_demo.py

# Ref:
# http://scikit-learn.org/stable/modules/ensemble.html#classification

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
clf = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=1.0,
                                 max_depth=1,
                                 random_state=0).fit(X_train, y_train)
myscore = clf.score(X_test, y_test)
print myscore
X = df[feature_cols]  #features
y = df.Decision  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.15,
                                                    random_state=2)

lr = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for i in lr:
    gb = GradientBoostingClassifier(n_estimators=100,
                                    max_depth=2,
                                    learning_rate=i)
    gb.fit(X_train, y_train)
    score = gb.score(X_test, y_test)

    #if st.checkbox('Show the learning rate with corresponding score', ):
    #st.write('learning rate ',i,': ', score)

gb = GradientBoostingClassifier(n_estimators=100,
                                max_depth=2,
                                learning_rate=0.75)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
                                          y_test, title, subaxes)

plt.show()

X_train, X_test, y_train, y_test = train_test_split(X_fruits.to_numpy(),
                                                    y_fruits.to_numpy(),
                                                    random_state=0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))

pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]

for pair, axis in zip(pair_list, subaxes):
    X = X_train[:, pair]
    y = y_train

    clf = GradientBoostingClassifier().fit(X, y)
    plot_class_regions_for_classifier_subplot(clf, X, y, None, None, title,
                                              axis, target_names_fruits)

    axis.set_xlabel(feature_names_fruits[pair[0]])
    axis.set_ylabel(feature_names_fruits[pair[1]])

plt.tight_layout()
plt.show()
clf = GradientBoostingClassifier().fit(X_train, y_train)

print('GBDT, Fruit dataset, default settings')
print('Accuracy of GBDT classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of GBDT classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))
Example #43
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split

# gradient ------> derivative
X, y = datasets.load_iris(return_X_y=True)
# cond = y != 2
# X = X[cond]
# y = y[cond]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

gbdt = GradientBoostingClassifier(n_estimators=10)
gbdt.fit(X_train, y_train)
print(gbdt.score(X_test, y_test))
print(gbdt.estimators_)
Example #44
              verbose=False)
    gnb = GaussianNB()
    LR = LogisticRegression()

    x_train_tmp = x_train.iloc[:, item]
    x_test_tmp = x_test.iloc[:, item]

    knc.fit(x_train_tmp, y_train)
    dtc.fit(x_train_tmp, y_train)
    rfc.fit(x_train_tmp, y_train)
    gbc.fit(x_train_tmp, y_train)
    abc.fit(x_train_tmp, y_train)
    svc.fit(x_train_tmp, y_train)
    gnb.fit(x_train_tmp, y_train)
    LR.fit(x_train_tmp, y_train)

    list_accuracy_knc.append(knc.score(x_test_tmp, y_test))
    list_accuracy_dtc.append(dtc.score(x_test_tmp, y_test))
    list_accuracy_rfc.append(rfc.score(x_test_tmp, y_test))
    list_accuracy_gbc.append(gbc.score(x_test_tmp, y_test))
    list_accuracy_abc.append(abc.score(x_test_tmp, y_test))
    list_accuracy_svc.append(svc.score(x_test_tmp, y_test))
    list_accuracy_gnb.append(gnb.score(x_test_tmp, y_test))
    list_accuracy_lr.append(LR.score(x_test_tmp, y_test))

print("knc,dtc,rfc,gbc,abc,svc,gnb,lr")
for i in range(100):
    print(list_accuracy_knc[i],list_accuracy_dtc[i],list_accuracy_rfc[i],list_accuracy_gbc[i],\
    list_accuracy_abc[i],list_accuracy_svc[i],list_accuracy_gnb[i],list_accuracy_lr[i])
    # y_predict_rfc=rfc.predict(x_test_tmp)
    # list_matrix.append(confusion_matrix(y_test,y_predict_rfc))
Example #45
                                                  random_state=state)

# Varying the learning rate (various models)
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=learning_rate,
                                        max_features=2,
                                        max_depth=2,
                                        random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        gb_clf.score(X_val, y_val)))

# Final Model
gb_clf2 = GradientBoostingClassifier(n_estimators=20,
                                     learning_rate=0.5,
                                     max_features=2,
                                     max_depth=2,
                                     random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))
Example #46
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    train_size=0.8,
                                                    shuffle=True)

# model = DecisionTreeClassifier(max_depth=10)
# model = RandomForestClassifier(n_estimators=100)
model = GradientBoostingClassifier()

model.fit(x_train, y_train)

acc = model.score(x_test, y_test)

print(f"acc : {acc}")

print(model.feature_importances_)
# max_features : use the default value
# n_estimators : the larger the better, but larger values also use more memory
# n_jobs : parallel processing (do not use when also running on a GPU)

import matplotlib.pyplot as plt
import numpy as np


def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
print("################Gradient Boosting Classifier############")

from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=20,
                                 learning_rate=0.1,
                                 n_estimators=1000,
                                 max_depth=3,
                                 min_samples_split=5,
                                 min_samples_leaf=1,
                                 subsample=1,
                                 max_features='sqrt')

clf.fit(feature_set, y_train)
print("\nAccuracy on Training Set :")
print(clf.score(feature_set, y_train))

print("\nAccuracy on Testing Set :")
print(clf.score(feature_set_test, y_test))

y_pred = clf.predict(feature_set_test)

print("\nPrecision Score")
print(precision_score(y_test, y_pred))
print("\nRecall Score")
print(recall_score(y_test, y_pred))
print("\nF1 Score")
print(f1_score(y_test, y_pred))

print("################AdaBoost Classifier############")
def predict(team1, team2, city, toss_winner, toss_decision):
#def predict(input):
    with open('ipl/teamCodes.json', encoding='utf-8') as data_file:
        teams = json.loads(data_file.read())
    data_file.close()
    #print(teams)

    with open('ipl/venueCodes.json', encoding='utf-8') as data_file:
        venue = json.loads(data_file.read())
    #print(venue)
    data_file.close()

    with open('ipl/tossCodes.json', encoding='utf-8') as data_file:
        toss = json.loads(data_file.read())
    #print(toss)
    data_file.close()

    with open('ipl/reverseteamCodes.json', encoding='utf-8') as data_file:
        reverseteams = json.loads(data_file.read())
    #print(reverseteams)
    data_file.close()
    print("Input : ")
    print("Team1 : ",team1)
    print("Team2 : ",team2)
    print("City : ", city)
    print("Toss Winner : ", toss_winner)
    print("Toss Decision : ", toss_decision)
    # print(homeTeam, awayTeam, City, tossW, tossD)
    input=[]
    input.append(teams[team1])
    input.append(teams[team2])
    input.append(venue[city])
    input.append(teams[toss_winner])
    input.append(toss[toss_decision])

    #print("Numerical Input :", input)


    matches_data = pd.read_csv('ipl/matches_new.csv')
    matches_data = matches_data[['season','team1', 'team2', 'city', 'toss_winner', 'toss_decision', 'winner']]

    training = matches_data.loc[matches_data.season != 2018]
    testing = matches_data.loc[matches_data.season == 2018]
    training = training[['team1', 'team2', 'city', 'toss_winner', 'toss_decision', 'winner']]
    testing = testing[['team1', 'team2', 'city', 'toss_winner', 'toss_decision', 'winner']]
    testing.to_csv('ipl/team_prediction.csv',index=False)


    trainvector = training.values
    x_train = trainvector[:, 0:5]
    y_train = trainvector[:, 5]
    testvector = testing.values
    x_test = testvector[:, 0:5]
    y_test = testvector[:, 5]


    predictions =[]
    model1 = DecisionTreeClassifier(random_state=1)
    model1.fit(x_train,y_train)
    model11 = DecisionTreeClassifier(criterion="entropy",random_state=1)
    model11.fit(x_train, y_train)

    model2 = RandomForestClassifier(n_estimators=10)
    model2.fit(x_train, y_train)

    model3 = MLPClassifier(hidden_layer_sizes=(3,), activation='logistic',
                       solver='lbfgs', alpha=0.0001,learning_rate='constant',
                      learning_rate_init=0.001, max_iter= 10000)
    model3.fit(x_train, y_train)
    model4 = SVC(gamma='auto', probability=True)
    model4.fit(x_train,y_train)
    model6 = KNeighborsClassifier()
    model6.fit(x_train,y_train)
    pred1 = model1.predict([input])
    accu1 = model1.predict(x_test)
    pred11 = model11.predict([input])
    accu11 = model11.predict(x_test)
    pred2 = model2.predict([input])
    accu2 = model2.predict(x_test)
    pred3 = model3.predict([input])
    accu3 = model3.predict(x_test)
    pred4 = model4.predict([input])
    accu4 = model4.predict(x_test)
    model5 = LogisticRegression(multi_class='auto',solver='lbfgs',max_iter=10000).fit(x_train, y_train)
    pred5 = model5.predict([input])
    accu5 = model5.predict(x_test)
    pred6 = model6.predict([input])
    accu6 = model6.predict(x_test)

    model17 = GaussianNB()
    model17.fit(x_train, y_train)
    model18 = LinearSVC(max_iter=100000)
    model18.fit(x_train, y_train)

    pred17 = model17.predict([input])
    accu17 = model17.predict(x_test)
    pred18 = model18.predict([input])
    accu18 = model18.predict(x_test)

    predictions.append(reverseteams[str(pred1[0])])
    predictions.append(reverseteams[str(pred2[0])])
    predictions.append(reverseteams[str(pred3[0])])
    predictions.append(reverseteams[str(pred4[0])])
    #predictions.append(reverseteams[str(pred5[0])])
    predictions.append(reverseteams[str(pred6[0])])


    #predictions.append(reverseteams[str(pred17[0])])
    #predictions.append(reverseteams[str(pred18[0])])

    print("<30% accuracy : ")
    print("Gaussian Naive Bayes :", reverseteams[str(pred17[0])])
    print("Gaussian Naive Bayes  Accuracy : ", round(accuracy_score(y_test, accu17)*100,2))
    print("Linear SVC :", reverseteams[str(pred18[0])])
    print("Linear SVC Accuracy : ", round(accuracy_score(y_test, accu18)*100,2))
    #print("Logistic Regression :", reverseteams[str(pred5[0])])
    #print("Logistic Regression Accuracy : ", round(accuracy_score(y_test, accu5) * 100, 2))

    print("\n>30% accuracy : ")
    print("DecisionTreeClassifier :", reverseteams[str(pred1[0])])
    print("DecisionTreeClassifier Accuracy : ", round(accuracy_score(y_test, accu1)*100,2))
    print("DecisionTreeClassifier with entropy:", reverseteams[str(pred11[0])])
    print("DecisionTreeClassifier Accuracy : ", round(accuracy_score(y_test, accu11) * 100, 2))
    print("SVC :", reverseteams[str(pred4[0])])
    print("SVC Accuracy : ", round(accuracy_score(y_test, accu4)*100,2))
    print("KNeighbors Classifier :", reverseteams[str(pred6[0])])
    print("KNeighbors Classifier Accuracy : ", round(accuracy_score(y_test, accu6)*100,2))
    print("RandomForestClassifier :", reverseteams[str(pred2[0])])
    print("RandomForestClassifier Accuracy : ", round(accuracy_score(y_test, accu2)*100,2))
    print("MLPClassifier  :", reverseteams[str(pred3[0])])
    print("MLPClassifier  Accuracy : ", round(accuracy_score(y_test, accu3)*100,2))


    #Bagging
    #Building multiple models(same type) from different subsamples of the training dataset
    seed = 7
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    #cart = DecisionTreeClassifier()
    num_trees = 100
    model8 = BaggingClassifier(base_estimator=model1, n_estimators=num_trees, random_state=seed)
    # results = model_selection.cross_val_predict(model,x,y, cv=kfold)
    # print(results.mean())
    model8.fit(x_train,y_train)
    pred8 = model8.predict([input])
    predictions.append(reverseteams[str(pred8[0])])
    print("Bagging Prediction : ",reverseteams[str(pred8[0])])
    print("Bagging Accuracy : ", round((model8.score(x_test,y_test))*100,2))
    #print(results)
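    # The kfold splitter above is only exercised by the commented-out lines; a small
    # sketch of how it could be used to cross-validate the bagging model:
    cv_results = model_selection.cross_val_score(model8, x_train, y_train, cv=kfold, scoring='accuracy')
    print("Bagging 10-fold CV Accuracy : ", round(cv_results.mean() * 100, 2))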

    #Boosting
    num_trees = 30
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    #model9 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    model9 = GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10)
    model9.fit(x_train,y_train)
    pred9 = model9.predict([input])
    predictions.append(reverseteams[str(pred9[0])])
    print("GradientBoostingClassifier(Adaboost) Prediction : ",reverseteams[str(pred9[0])])
    print("GradientBoostingClassifier(Adaboost) Accuracy : ",round((model9.score(x_test,y_test))*100,2))

    # Soft voting averages predicted class probabilities, so every estimator must
    # support predict_proba (which is why the SVC above was built with probability=True).
    model7 = VotingClassifier(estimators=[('bg', model8), ('bo', model9), ('dt', model1), ('rf', model2),
                                          ('ls', model3), ('sv', model4), ('kn', model6)], voting='soft')
    model7.fit(x_train, y_train)
    pred7 = model7.predict([input])
    print("Voting Prediction ", reverseteams[str(pred7[0])])
    print("Voting Prediction Accuracy : ",round((model7.score(x_test, y_test))*100,2))
    predictions.append(reverseteams[str(pred7[0])])

    frequent = most_frequent(predictions)
    #print(type(frequent))
    #print(type(team1))
    #print(type(team2))
    #print(frequent)
    for strwinner in frequent:
        if strwinner == team1 or strwinner == team2:
            #print("winner",strwinner)
            return strwinner
    return winningProbabolity(team1,team2,city,toss_winner)
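
# most_frequent is defined elsewhere in this project; judging by how its result is
# iterated above, it returns the team value(s) that received the most votes across
# the individual model predictions. A hypothetical minimal sketch:
from collections import Counter

def most_frequent(items):
    counts = Counter(items)
    top = max(counts.values())
    return [value for value, count in counts.items() if count == top]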
Example #49
            features += get_features(coeff)
        list_features.append(features)
    return list_features, list_labels
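
# The top of get_ecg_features (and the get_features helper) is cut off in this
# snippet. A hypothetical sketch of what they might look like, assuming a PyWavelets
# decomposition with simple summary statistics for each coefficient level:
import numpy as np
import pywt

def get_features(coeff):
    # hypothetical helper: summary statistics of one coefficient array
    return [np.mean(coeff), np.std(coeff), np.min(coeff), np.max(coeff)]

def get_ecg_features(data, labels, wavelet):
    list_features, list_labels = [], []
    for signal, label in zip(data, labels):
        list_labels.append(label)
        features = []
        for coeff in pywt.wavedec(signal, wavelet):
            features += get_features(coeff)
        list_features.append(features)
    return list_features, list_labels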


X_train_ecg, Y_train_ecg = get_ecg_features(train_data_ecg, train_labels_ecg,
                                            'db4')
X_test_ecg, Y_test_ecg = get_ecg_features(test_data_ecg, test_labels_ecg,
                                          'db4')

X, Y = get_ecg_features(data_ecg, labels_ecg, 'db4')

gb = GradientBoostingClassifier(n_estimators=10000)

gb.fit(X_train_ecg, Y_train_ecg)
train_score = gb.score(X_train_ecg, Y_train_ecg)
test_score = gb.score(X_test_ecg, Y_test_ecg)
print("Train Score for the ECG dataset is about: {}".format(train_score))
print("Test Score for the ECG dataset is about: {}".format(test_score))
predictions = gb.predict(X_test_ecg)
print("Confusion Matrix:")
print(confusion_matrix(Y_test_ecg, predictions))

pred_all = gb.predict(X)
scores = cross_val_score(gb, X, Y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
sns.heatmap(confusion_matrix(Y, pred_all),
            annot=True,
            fmt='3.0f',
            cmap="summer")
plt.title('Confusion_matrix', y=1.05, size=15)
Example #50
        one_array = np.ones((len(tokenized_texts), len(word2id)))
        result = np.log(result + one_array)
        result = result.multiply( 1 / word2freq)

    if scale:
        #result = result.tocsc()
        #result = result.std(0, ddof = 1)
        result = result.tocsc()
        result -= result.min()
        result /= (result.max() + 1e-6)

    return result.tocsr()

VECTORIZATION_MODE = 'tfidf'
train_vectors = vectorize_texts(text_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

print('Training set feature matrix shape', train_vectors.shape)

clf = GradientBoostingClassifier(random_state=value)
clf.fit(train_vectors, y)

print(train_vectors[:1])

"""1-onion  3-milk   7-fries"""

clf.predict(train_vectors[:1])

"""в выборке есть молоко, и классификатор правильно это определил"""

clf.score(train_vectors, y)
Example #51
joblib.dump(tfidf, 'statement_tfidf.model', compress=3)
vec = tfidf.transform(train_data)
numpy.savez_compressed("statement_vec.npz", vec.todense())

print('train gbt...', print_mem())
gbt = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=20,
    # max_features=9,
    verbose=1,
).fit(vec, target)
joblib.dump(gbt, 'statement_som_gbt.model')

yp = gbt.predict(vec)
print("training score : %.3f " % gbt.score(vec, target))
# correct=0
# wrong=0
# for i in range(len(yp)):
#     if target[i]==0 and yp[i]==0:
#         correct += 1
#     elif target[i] == 1 and yp[i] == 1:
#         correct +=1
#     else:
#         wrong+=1
# print('precision:', correct*1.0/(correct+wrong))
# print("Mean squared error: %.2f" % mean_squared_error(vec, target))
# print('Variance score: %.2f' % r2_score(yp, target))
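
# The commented-out block above computes accuracy by hand; the same number can be
# obtained directly from scikit-learn's metrics (a minimal sketch):
from sklearn.metrics import accuracy_score
print('training accuracy: %.3f' % accuracy_score(target, yp))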
Example #52
# Load the breast cancer dataset
cancer = load_breast_cancer()

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    random_state=0)

# Build the GBDT model
gbdt = GradientBoostingClassifier(random_state=0)

# Fit the training set
gbdt.fit(X_train, y_train)

# Report the scores
train_score = gbdt.score(X_train, y_train)
test_score = gbdt.score(X_test, y_test)
print("-" * 5, "未处理的GBDT", "-" * 5)
print("训练集精度:{:.3f}".format(train_score))
print("测试集精度:{:.3f}".format(test_score))

# The training accuracy reaches 100%, which suggests overfitting; limit the maximum
# tree depth to strengthen pre-pruning
gbdt2 = GradientBoostingClassifier(random_state=0, max_depth=1)  # depth-1 decision trees

# Fit the training set
gbdt2.fit(X_train, y_train)

# Report the scores
train_score2 = gbdt2.score(X_train, y_train)
test_score2 = gbdt2.score(X_test, y_test)
print("-" * 5, "Depth-limited GBDT", "-" * 5)
print("Training set accuracy: {:.3f}".format(train_score2))
print("Test set accuracy: {:.3f}".format(test_score2))
Example #53
def main(argv):
    global PATH_IN,PATH_SCRIPT,PATH_OUT
    PATH_IN,PATH_SCRIPT,PATH_OUT = def_context.get_path()
    PATH_OUT = get_temp_path()
    if not os.path.exists(PATH_OUT+'model_PTV/'):
        os.makedirs(PATH_OUT+'model_PTV/')
    if(len(argv) == 0):
        argv = ['all']
    if(argv[0] == 'test'):
        Y_test = pd.read_csv('results.csv').values
        y_pred = pd.read_csv('y_pred.csv')
        y_pred2 = pd.read_csv('y_pred2.csv')
        y_pred3 = pd.read_csv('y_pred3.csv')
        y_pred4 = pd.read_csv('y_pred4.csv')
        y_pred5 = pd.read_csv('y_pred5.csv')

        logreg = use_logisticreg(y_pred,y_pred2,y_pred3,y_pred4,y_pred5,Y_test)
        res = pd.concat([y_pred,y_pred2,y_pred3,y_pred4,y_pred5],axis=1).values
        res = logreg.predict_proba(res)
        for p1 in [0]:
            for p2 in [0]:
                def_context.Report('################### '+str(p1)+' ### '+str(p2)+'###################')
                def_context.Report('############XGB##############')
                mesure(y_pred.values,Y_test,p1,p2)
                mismatch(y_pred.values,Y_test,p1,p2)
                acc(y_pred.values,Y_test,p1,p2)
                def_context.Report('############CatBoost##############')
                mesure(y_pred2.values,Y_test,p1,p2)
                mismatch(y_pred2.values,Y_test,p1,p2)
                acc(y_pred2.values,Y_test,p1,p2)
                def_context.Report('############GradientBoostingClassifier##############')
                mesure(y_pred4.values,Y_test,p1,p2)
                mismatch(y_pred4.values,Y_test,p1,p2)
                acc(y_pred4.values,Y_test,p1,p2)
                def_context.Report('############RandomForestClassifier##############')
                mesure(y_pred5.values,Y_test,p1,p2)
                mismatch(y_pred5.values,Y_test,p1,p2)
                acc(y_pred5.values,Y_test,p1,p2)
                def_context.Report('############Stack##############')
                mesure(res,Y_test,p1,p2)
                mismatch(res,Y_test,p1,p2)
                acc(res,Y_test,p1,p2)

    elif(len(argv) == 1):
        X,Y = load_all(argv[0])
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
        X_train = X_train.replace([np.inf, -np.inf], np.nan)
        X_train = X_train.fillna(0)
        X_test = X_test.replace([np.inf, -np.inf], np.nan)
        X_test = X_test.fillna(0)
        Y_test  = [Y[0] for Y in Y_test.values]
        ##########################################
        np.random.seed(42)
        clf = Classifier()
        clf.fit(X_train,Y_train)
        y_pred = clf.predict_proba(X_test)
        clf2 = Classifier2()
        clf2.fit(X_train,Y_train)
        y_pred2 = clf2.predict_proba(X_test)

        dtree_model = DecisionTreeClassifier(max_depth = 10).fit(X_train,Y_train)
        y_pred3 = dtree_model.predict_proba(X_test)

        tpot = GradientBoostingClassifier(learning_rate=0.05, max_depth=10, max_features=0.75, min_samples_leaf=7, min_samples_split=16, n_estimators=500, subsample=0.9)
        tpot.fit(X_train,Y_train)
        def_context.Report(tpot.score(X_test, Y_test))
        y_pred4 = tpot.predict_proba(X_test)

        RF_model = RandomForestClassifier(max_depth = 10).fit(X_train,Y_train)
        y_pred5 = RF_model.predict_proba(X_test)

        y_p = clf.predict_proba(X_train)
        y_p2 = clf2.predict_proba(X_train)
        y_p3 = dtree_model.predict_proba(X_train)
        y_p4 = tpot.predict_proba(X_train)
        y_p5 = RF_model.predict_proba(X_train)

        logreg = use_logisticreg(y_p,y_p2,y_p3,y_p4,y_p5,Y_train)

        ##########################################
        save_model_xgb(clf)
        save_model_cat(clf2)
        save_model(dtree_model,"DT")
        save_model(RF_model,"RF")
        pickle.dump(tpot, open(PATH_OUT+"model_PTV/GradientBoostingClassifier.pickle.dat", "wb"))
        pickle.dump(RF_model, open(PATH_OUT+"model_PTV/RandomForestClassifier.pickle.dat", "wb"))
        X = pd.concat([pd.DataFrame(y_pred),pd.DataFrame(y_pred2),pd.DataFrame(y_pred3),pd.DataFrame(y_pred4),pd.DataFrame(y_pred5)],axis = 1).values
        res = logreg.predict_proba(X)
        for p1,p2 in zip([0],[0]):
            def_context.Report('############XGB##############')
            mesure(y_pred,Y_test,p1,p2)
            mismatch(y_pred,Y_test,p1,p2)
            acc(y_pred,Y_test,p1,p2)
            def_context.Report('############CatBoost##############')
            mesure(y_pred2,Y_test,p1,p2)
            mismatch(y_pred2,Y_test,p1,p2)
            acc(y_pred2,Y_test,p1,p2)
            def_context.Report('############DecisionTreeClassifier##############')
            mesure(y_pred3,Y_test,p1,p2)
            mismatch(y_pred3,Y_test,p1,p2)
            acc(y_pred3,Y_test,p1,p2)
            def_context.Report('############GradientBoostingClassifier##############')
            mesure(y_pred4,Y_test,p1,p2)
            mismatch(y_pred4,Y_test,p1,p2)
            acc(y_pred4,Y_test,p1,p2)
            def_context.Report('############RandomForestClassifier##############')
            mesure(y_pred5,Y_test,p1,p2)
            mismatch(y_pred5,Y_test,p1,p2)
            acc(y_pred5,Y_test,p1,p2)
            def_context.Report('############Stack##############')
            mesure(res,Y_test,p1,p2)
            mismatch(res,Y_test,p1,p2)
            acc(res,Y_test,p1,p2)

        #ROC_curve(y_pred,Y_test)
        #ROC_curve(y_pred2,Y_test)
        pd.DataFrame(Y_test).to_csv('results.csv',index=False)
        pd.DataFrame(y_pred).to_csv('y_pred.csv',index=False)
        pd.DataFrame(y_pred2).to_csv('y_pred2.csv',index=False)
        pd.DataFrame(y_pred3).to_csv('y_pred3.csv',index=False)
        pd.DataFrame(y_pred4).to_csv('y_pred4.csv',index=False)
        pd.DataFrame(y_pred5).to_csv('y_pred5.csv',index=False)


    return ("process achevé sans erreures")
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
score_rf = cross_val_score(random_forest, X_train, Y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy')
print('Random Forest Cross: {}\nRandom Forest:       {}'.format(round(np.mean(score_rf)*100,2), acc_random_forest))


# In[112]:


from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier()
gbk.fit(X_train, Y_train)
Y_pred = gbk.predict(X_test)
acc_gbk = round(gbk.score(X_train, Y_train) * 100, 2)
score_gbk = cross_val_score(gbk, X_train, Y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy')
print('Gradient Boosting Classifier Cross: {}\nGradient Boosting Classifier:       {}'.format(round(np.mean(score_gbk)*100,2), acc_gbk))


# In[113]:


from sklearn.svm import SVC

svc = SVC(gamma = 'scale')
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
score_svc = cross_val_score(svc, X_train, Y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy')
print('SVC Cross: {}\nSVC:       {}'.format(round(np.mean(score_svc)*100,2), acc_svc))
Example #55
X_test = vec.transform(X_test.to_dict(orient='records'))

# Single decision tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

# Random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

from sklearn.metrics import classification_report
print('The accuracy of decision tree is:', dtc.score(X_test, y_test))
print(classification_report(dtc_y_pred, y_test))

print('The accuracy of randomforestclassifier is:', rfc.score(X_test, y_test))
print(classification_report(rfc_y_pred, y_test))

print('The accuracy of gradientboostingclassifier is:',
      gbc.score(X_test, y_test))
print(classification_report(gbc_y_pred, y_test))
# Split the data into training and test parts:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# Build the model:
clf = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=1.0,
                                 max_depth=1,
                                 random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)  # Predict on the test data

# Evaluate accuracy by comparing the predictions with the actual values:
print('Accuracy score for test data:', metrics.accuracy_score(y_test, y_pred))

# The same thing in one line:
print('\nMean accuracy for test data:', clf.score(X_test, y_test))

# Compute the probability of each object belonging to each class:
y_pred_prob = clf.predict_proba(X_test)
y_test_classes = pd.get_dummies(y_test)

# Report the ROC AUC score:
print('\nROC AUC:', metrics.roc_auc_score(y_test_classes, y_pred_prob))

# Build a confusion matrix to analyse the results and see for which class the model
# makes mistakes most often:
col_names = ['pred_' + i for i in target_names]
ind = ['fact_' + i for i in target_names]

conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred),
                           columns=col_names,
                           index=ind)
Example #57
print(
    "\n ==================================================================== 03: AdaBoost with Decision Tree"
)
from sklearn.ensemble import AdaBoostClassifier
ada_model = AdaBoostClassifier(n_estimators=100)
ada_model.fit(seq_x, seq_y)
acc_ada = ada_model.score(seq_x_test, seq_y_test)
print('Ada Boost Score: ', acc_ada)

print(
    "\n ==================================================================== 04: Gradient boost"
)
from sklearn.ensemble import GradientBoostingClassifier
gra_model = GradientBoostingClassifier(n_estimators=100)
gra_model.fit(seq_x, seq_y)
acc_gra = gra_model.score(seq_x_test, seq_y_test)
print('Grad Boost: ', acc_gra)

print(
    "\n ==================================================================== 05: Voting"
)
from sklearn.ensemble import VotingClassifier

voting_model = VotingClassifier(estimators=[('m01', gra_model),
                                            ('m02', ada_model),
                                            ('m03', rf_model),
                                            ('m04', bag_model)],
                                voting='soft')
voting_model.fit(seq_x, seq_y)

acc_voting = voting_model.score(seq_x_test, seq_y_test)
model6.fit(train_x,train_y)


# In[ ]:


#Model Performance
plot_model_var_imp(model1, train_x, train_y)


# In[ ]:


#Model Performance
print('Model 2', model2.score(train_x, train_y), model2.score(valid_x, valid_y))
print('Model 3', model3.score(train_x, train_y), model3.score(valid_x, valid_y))
print('Model 4', model4.score(train_x, train_y), model4.score(valid_x, valid_y))
print('Model 5', model5.score(train_x, train_y), model5.score(valid_x, valid_y))
print('Model 6', model6.score(train_x, train_y), model6.score(valid_x, valid_y))


# In[ ]:


rfecv = RFECV(estimator=model1, step=1, cv=StratifiedKFold(n_splits=2), scoring='accuracy')
rfecv.fit(train_x, train_y)
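
# A sketch of how the fitted RFECV is typically inspected (standard scikit-learn
# RFECV attributes):
print('Optimal number of features:', rfecv.n_features_)
print('Selected feature mask:', rfecv.support_)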


# In[ ]:

Example #59
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,f1_score


train_df,test_df, age_gender_df,countries_df,session_df =  utils.load_data("../data/")

train_df = utils.training_feature(train_df)
print(train_df.shape,test_df.shape)


#one hot encoding
categorical_features = list(train_df.select_dtypes('object').columns)
categorical_features.append('signup_flow')
categorical_features.remove('id')
categorical_features.remove('country_destination')
print(categorical_features,train_df.columns)
train_df = pd.get_dummies(train_df,columns=categorical_features,drop_first=True)

X_train,X_test,y_train,y_test = train_test_split(train_df.drop(['country_destination','id'],axis=1),train_df['country_destination'],test_size=0.3,random_state=42)

gbc = GradientBoostingClassifier()

gbc.fit(X_train,y_train)

gbc.score(X_train,y_train)

y_pred = gbc.predict(X_test)
f1 = f1_score(y_test,y_pred,average='weighted')

print(f1,'f1 score')
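
# confusion_matrix is imported above but never used; a sketch of how it could
# complement the weighted F1 score:
print(confusion_matrix(y_test, y_pred))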
    Xtrain = pd.read_csv("MNIST_X_train.csv").values
    ytrain = pd.read_csv("MNIST_y_train.csv").values
    Xtest = pd.read_csv("MNIST_X_test.csv").values
    ytest = pd.read_csv("MNIST_y_test.csv").values
    ytrain, ytest = ytrain.flatten(), ytest.flatten()

    lb = LabelBinarizer(neg_label=0)
    lb.fit(ytrain)
    ytrain_ohe = lb.transform(ytrain)
    ytest_ohe = lb.transform(ytest)

    start = time.time()

    GBTC = GBT_classifier(n_estimators=100, max_depth=3, lr=0.5)
    GBTC.fit(Xtrain, ytrain_ohe)
    ypred = GBTC.predict(Xtest)

    end = time.time()
    score = accuracy(ytest, ypred)
    print("The accuracy of multiclass classification is {:.2f}%".format(
        score * 100))  #87.8%
    print("Takes {:.2f} seconds.".format(end - start))

    gbc = GradientBoostingClassifier(learning_rate=0.5,
                                     n_estimators=100,
                                     max_depth=3,
                                     max_features=2)
    gbc.fit(Xtrain, ytrain)
    score = gbc.score(Xtest, ytest)
    print(score)  # 87.2%