Example No. 1
def regression(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    else:
        raise ValueError("unknown model: {}".format(model))

    est.fit(data, y)
    return est
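A quick usage sketch for this dispatcher (the dataset below is synthetic and purely illustrative, not part of the original example):

from sklearn.datasets import make_regression

# Hypothetical call: fit a gradient-boosted regressor on a toy dataset
X, y = make_regression(n_samples=200, n_features=5, random_state=0)
est = regression(X, y, model="boost")
print(est.predict(X[:3]))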
Example No. 2
def model_data(training_data):
    dtc = DecisionTreeClassifier(random_state=9, min_samples_split=5)
    dtc.fit(training_data['data'], training_data['result'])

    nn = MLPClassifier(solver='lbfgs',
                       alpha=1e-5,
                       hidden_layer_sizes=(5, 2),
                       random_state=1)
    nn.fit(training_data['data'], training_data['result'])

    svc = SVC(C=100, kernel="linear")
    svc.fit(training_data['data'], training_data['result'])

    rfc = RFC(n_estimators=10,
              criterion='entropy',
              max_depth=10,
              min_samples_split=5,
              bootstrap=True,
              random_state=None)
    rfc.fit(training_data['data'], training_data['result'])

    knc_map = knc(n_neighbors=15, weights='distance')
    knc_map.fit(training_data['data'], training_data['result'])

    gbc_map = gbc(n_estimators=150, verbose=0)
    gbc_map.fit(training_data['data'], training_data['result'])

    return {
        'Decision Tree Classifier': dtc,
        'Neural Networks': nn,
        'Support Vector Machines': svc,
        'Random Forest Classification': rfc,
        'k Nearest Neighbours': knc_map,
        'Gradient Boosting Classifier': gbc_map
    }
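This function relies on classifier aliases imported elsewhere in the original module; a plausible set of imports (an assumption, since the original file header is not shown) would be:

# Assumed imports for the aliases used above (hypothetical, not shown in the source)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.neighbors import KNeighborsClassifier as knc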
Example No. 3
def classifier(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        cases = y.nunique()
        if cases > 2: est = lr(solver="newton-cg", multi_class="multinomial")
        else: est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    else:
        raise ValueError("unknown model: {}".format(model))

    est.fit(data, y)
    return est
Example No. 4
def gradient_boosting_classifier(x_train, y_train, x_test, y_test, num_tree):
    # Note: loss='deviance' was renamed to 'log_loss' in newer scikit-learn releases
    model = gbc(loss='deviance', learning_rate=0.2, n_estimators=num_tree, subsample=1.0, min_samples_split=2,
                min_samples_leaf=10, min_weight_fraction_leaf=0.0, max_depth=5, init=None, random_state=None,
                max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)
    model.fit(x_train, y_train)
    expected = y_test
    predicted = model.predict(x_test)
    return expected, predicted
Example No. 5
def rfe_feature_output(x_train, y_train, x_test, y_test):
    rfe_tree = pd.DataFrame(recursive_ftr_elim(etc(), x_train, y_train, 1))
    rfe_tree.columns = ["etc_ftr_nm", "etc_ftr_rank"]
    rfe_gbst = pd.DataFrame(recursive_ftr_elim(gbc(), x_train, y_train, 1))
    rfe_gbst.columns = ["gbst_ftr_nm", "gbst_ftr_rank"]
    rfe_log = pd.DataFrame(
        recursive_ftr_elim(LogisticRegression(), x_train, y_train, 1))
    rfe_log.columns = ["log_regr_ftr_nm", "log_regr_ftr_rank"]
    pd.concat([rfe_tree, rfe_log, rfe_gbst],
              axis=1).to_csv(elim_out_path + timestamp + ".csv")
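`recursive_ftr_elim` is not shown in this excerpt; a minimal sketch of what it might look like, assuming it wraps scikit-learn's RFE, takes a step size of 1, and returns (feature name, rank) pairs:

from sklearn.feature_selection import RFE

def recursive_ftr_elim(estimator, x_train, y_train, step):
    # Hypothetical helper: rank every feature by recursive elimination,
    # removing `step` features per iteration and keeping the full ranking.
    selector = RFE(estimator, n_features_to_select=1, step=step)
    selector.fit(x_train, y_train)
    return list(zip(x_train.columns, selector.ranking_))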
Example No. 6
def gradient_boosting_classifier(f_train, l_train, f_test):
    from sklearn.ensemble import GradientBoostingClassifier as gbc
    clf = gbc()
    import time
    start_time = time.time()
    clf.fit(f_train, l_train)
    print("Training Time: %s seconds" % (time.time() - start_time))
    start_time = time.time()
    pred = clf.predict_proba(f_test)
    print("Predicting Time: %s seconds" % (time.time() - start_time))
    return pred
Example No. 7
def train_model_gbc(features, labels):
	# Start with reduced param space
	#params_dict = {'n_estimators':[ 50, 60, 70, 80, 90], 'max_depth':[3], 'min_samples_leaf': [1, 2], 'learning_rate': [0.04, 0.05, 0.06], 'min_samples_split': [2, 5, 10], 'subsample': [0.8, 0.9, 1]}
	# params_dict = {'n_estimators':[70, 80, 90], 'max_depth':[3, 4], 'learning_rate': [0.03, 0.04, 0.05], 'subsample': [0.7, 0.8, 0.9], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5, 10]}
	params_dict = {'n_estimators':[60, 80, 100], 'max_depth':[4, 5, 6], 'learning_rate': [0.03, 0.05, 0.07], 'subsample': [0.5, 0.7, 0.9]}
	
	### Train estimator (initially only on final count)
	clf = GridSearchCV(gbc(random_state = 30), params_dict, n_jobs = 4, scoring = 'roc_auc', cv = 5)
	clf.fit(features, labels)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best grid scores: %.4f" %(clf.best_score_))
	return clf
Example No. 8
def gbm():
    global features_train, labels_train, features_test
    from sklearn.ensemble import GradientBoostingClassifier as gbc
    from sklearn.model_selection import GridSearchCV as gscv  # grid_search module was removed in sklearn 0.20
    param = {
        'learning_rate': [0.1, 0.01, 0.3, 0.4, 0.5, 0.2],
        "n_estimators": [10, 50, 100],
        'min_samples_split': [2, 5, 10, 15, 20, 25, 30]
    }
    svr = gbc()
    clf = gscv(svr, param)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    return pred
Example No. 9
def model_gradientboosting_classifier(X_train, X_test, y_train, y_test):
    model_name = f'model_{count}_gradientboosting_classifier'

    model = gbc()
    model.fit(X_train, y_train)
    model.independentcols = independentcols
    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    # print(classification_report(y_test, y_pred))
    score = accuracy_score(y_test, y_pred)

    print(f'{model_name} accuracy: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')
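The snippet depends on names defined in the enclosing script (`count`, `independentcols`, the metric helpers, and `joblib`); a plausible, purely illustrative context would be:

# Hypothetical surrounding context for this excerpt
import joblib
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.metrics import confusion_matrix, accuracy_score

count = 1                               # running index used in the model name
independentcols = ["feat_1", "feat_2"]  # placeholder predictor column names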
Example No. 10
def gradiant_boosting(X_train, y_train, X_valid, y_valid,feature_list=None,top_features_num=20):
    t0 = time()
    clf = gbc()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    # np.savetxt("random.csv", y_pred.astype(int), fmt='%i', delimiter=",")
    print("Classification report")
    print(classification_report(y_valid, y_pred))
    print("Confusion_matrix")
    print(confusion_matrix(y_valid, y_pred))
    print("done in %fs" % (time() - t0))
    y_score = clf.predict_proba(X_valid)[:, 1]
    selected_features = None  # avoid an UnboundLocalError when no feature list is given
    if feature_list is not None:
        selected_features = rank_features(clf, feature_list, top_features_num)
    return y_score, selected_features
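`rank_features` and the `time` import sit outside this excerpt; a sketch of a compatible helper, assuming it simply orders features by the fitted model's importances:

from time import time

def rank_features(clf, feature_list, top_n):
    # Hypothetical helper: pair feature names with importances and keep the top_n
    ranked = sorted(zip(feature_list, clf.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]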
Example No. 11
def grad_bst_prediction(x_train, y_train, x_test, y_test):
    gb = gbc()
    gbst = gb.fit(x_train, y_train)
    gbst_pred = gbst.predict(x_test)
    if on_scrn == "Yes":
        print(divider)
        print("\nGradient Boosting Classifier mislabeled %d points out of a total of %d points" % (
            (y_test != gbst_pred).sum(), x_test.shape[0]))
        print("\nGradient Boosting Classification Report:\n")
        print(classification_report(y_test, gbst_pred))
        print(divider)
        cm(y_test, gbst_pred)
        print(divider)
    return gbst_pred
Example No. 12
    def classification(self, metric, folds, alphas, graph):
        size = 1.3 * self.report_width // 10

        models = {}
        models["K nearest neighbors classifier K2"]  = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]  = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)        
        models["Decision tree classifier"]           = dtc()
        models["Logistic classifier"]                = logitc()
        models["SVM classifier with RBF kernel"]     = svc(gamma='scale')
        models["SVM classifier with linear kernel"]  = svc(kernel='linear')
        models["Gaussian naive bayes"]               = gnbc()
        models["Bernoulli naive bayes"]              = bnbc()
        models["SGD classifier"]                     = sgdc(max_iter=10000)
        models["Random forest classifier"]           = rfc(n_estimators=100)
        models["Gradient boosting classifier"]       = gbc()
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        for model_name in models:
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, error_score=np.nan)  
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')
        report = pd.DataFrame({'Classifier': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')
        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()             
        return None
Example No. 13
def train_model_gbc_calibrated_cv(features, labels, hold_out=False, train_sz=0.9):
	features_train, features_test = [], []
	labels_train, labels_test = [], []
	if (hold_out == True) :
		# First, set aside a some of the training set for calibration
		# Use stratified shuffle split so that class ratios are maintained after the split
		# Note: this is the pre-0.18 sklearn.cross_validation API; the modern class takes n_splits and is iterated via splitter.split(features, labels)
		splitter = StratifiedShuffleSplit(labels, n_iter = 1, train_size = train_sz, random_state = 30)

		# Length is 1 in this case since we have a single fold for splitting
		print (len(splitter))

		for train_idx, test_idx in splitter:
			features_train, features_test = features[train_idx], features[test_idx]
			labels_train, labels_test = labels[train_idx], labels[test_idx]
	else :
		features_train = features
		labels_train = labels

	print ("features_train shape: ", features_train.shape)
	print ("labels_train shape: ", labels_train.shape)
	if (hold_out == True) :
		print ("features_test shape: ", features_test.shape)
		print ("labels_test shape: ", labels_test.shape)
		
	print ("Parameters selected based on prior grid Search ...")
	# clf = gbc(random_state = 30, max_depth = 4, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 80, learning_rate = 0.03, subsample = 0.8, max_features = 'sqrt')
	# clf = gbc(random_state = 30, max_depth = 3, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 120, learning_rate = 0.03, subsample = 0.8)
	clf = gbc(random_state = 30, max_depth = 4, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 80, learning_rate = 0.03, subsample = 0.5)

	# Perform calibration 
	# Use 'sigmoid' because sklearn cautions against using 'isotonic' for lesser than 1000 calibration samples as it can result in overfitting
	# 05/22 - Looks like isotonic does better than sigmoid for both Brier score and roc_auc_score.
	# Using 30-40% holdout actually improves ROC AUC for holdout score from 0.88 to 0.925 with CV=5
	print ("Performing Calibration now ...")
	# sigmoid = CalibratedClassifierCV(clf, cv=5, method='sigmoid')
	sigmoid = CalibratedClassifierCV(clf, cv=5, method='isotonic')
	sigmoid.fit(features_train, labels_train)

	if (hold_out == True) :
		# Calculate Brier score loss
		y_probs = sigmoid.predict_proba(features_test)[:, 1]
		clf_score = brier_score_loss(labels_test, y_probs)
		print ("Brier score: ", clf_score)
		auc_score = estimate_roc_auc (sigmoid, features_test, labels_test)

	return sigmoid
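`estimate_roc_auc` is defined elsewhere in the original script; a minimal sketch, assuming it scores the calibrated model's positive-class probabilities on the holdout set:

from sklearn.metrics import roc_auc_score

def estimate_roc_auc(model, features_test, labels_test):
    # Hypothetical helper: ROC AUC from predicted probabilities of the positive class
    probs = model.predict_proba(features_test)[:, 1]
    auc = roc_auc_score(labels_test, probs)
    print("ROC AUC on holdout: ", auc)
    return auc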
Example No. 14
def train():
    X_train, X_valid, y_train, y_valid = load_train_data()
    # Number of trees, increase this to beat the benchmark ;)
   
#   n_estimators = 10
#    clf = RandomForestClassifier(n_estimators=n_estimators)
   
    
#    for r in np.arange(3,50,3):
#        
#        tclf = gbc(n_estimators=50, learning_rate=0.35,max_depth=5,max_features=0.7,min_samples_leaf=6,min_samples_split=r)    
#        print(" --------- Testing Stuff -------------", "min_samples_split=", r)
#        tclf.fit(X_train, y_train)
#        ty_prob = tclf.predict_proba(X_valid)
#    
#        tencoder = LabelEncoder()
#        ty_true = tencoder.fit_transform(y_valid)
#        assert (tencoder.classes_ == tclf.classes_).all()
#    
#        tscore = logloss_mc(ty_true, ty_prob)
#        print(" -- Multiclass logloss While Testing Stuff: {:.4f}.".format(tscore))
        
        
    
    clf = gbc(n_estimators=25, learning_rate=0.18,min_samples_leaf=6, max_features=0.8,subsample=0.9,verbose=2,max_depth=10)  
#    clf = gbc(n_estimators=20, learning_rate=.13, min_samples_leaf=6,subsample=.8,max_features=0.6,verbose=2,max_depth=20)    
    
    print(" -- Start training Random Forest Classifier.")
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_valid)

    
    print(" -- Finished training.")

    encoder = LabelEncoder()
    y_true = encoder.fit_transform(y_valid)
    assert (encoder.classes_ == clf.classes_).all()

    score = logloss_mc(y_true, y_prob)
    print(" -- Multiclass logloss on validation set: {:.4f}.".format(score))

    return clf, encoder
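`logloss_mc` and `load_train_data` come from the surrounding project; a minimal sketch of a compatible log-loss helper, assuming it computes the multiclass logarithmic loss from encoded labels and predicted probabilities:

import numpy as np

def logloss_mc(y_true, y_prob, epsilon=1e-15):
    # Hypothetical helper: multiclass log loss, clipping probabilities away from 0/1
    prob = np.clip(y_prob, epsilon, 1 - epsilon)
    prob = prob / prob.sum(axis=1, keepdims=True)
    return -np.mean(np.log(prob[np.arange(len(y_true)), y_true]))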
Example No. 15
def grad_bst_ftr_eval(x_train, y_train, x_test, y_test):
    gb = gbc()
    gbst = gb.fit(x_train, y_train)
    gbst_ftr_imp = list(gbst.feature_importances_)
    gbst_ftr_eval = []
    for feature, importance in zip(depvar_ftrs, gbst_ftr_imp):
        ftr_update = {"name": feature, "score": importance}
        gbst_ftr_eval.append(ftr_update)
        if importance == 0.0:
            gbst_ftr_elim.append(feature)
    gbst_ftr_eval = sorted(gbst_ftr_eval,
                           key=itemgetter("score"),
                           reverse=True)
    if eval_on_scrn == "Yes":
        print(divider)
        print("\nGradient Boosting Classifier evaluated dependent variable data features as follows:\n")
        for i in range(len(gbst_ftr_eval)):
            print("{}. {}: {:.5f}".format(
                i + 1,
                gbst_ftr_eval[i]["name"].title(),
                gbst_ftr_eval[i]["score"]))
        print(divider)
    return gbst_ftr_eval
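This function leans on several module-level names that are not part of the excerpt; plausible definitions (assumptions for illustration only) would be:

# Hypothetical module-level context for this excerpt
from operator import itemgetter

divider = "-" * 60                      # visual separator used in the printouts
eval_on_scrn = "Yes"                    # toggle for printing the feature evaluation
depvar_ftrs = ["feat_1", "feat_2"]      # placeholder feature names, in the same order as the x_train columns
gbst_ftr_elim = []                      # collects zero-importance features for elimination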
Example No. 16
        1
    ]
else:
    print "Searching over GBC and lasso"
    ess = [
        Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
                                     ("sc", StandardScaler()),
                       ("lr1", lr(n_jobs=args.n_jobs, penalty='l1', class_weight='balanced', random_state=args.random_state))]),
     #   Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
     #                                ("sc", StandardScaler()),
     #                  ("lr2", lr(n_jobs=n_jobs, class_weight='balanced', random_state=random_state))]),
     #   Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
     #                  ("rf", rf(n_jobs=n_jobs, class_weight='balanced', random_state=random_state))]),

        Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
                       ("gbc", gbc(random_state=args.random_state))]),
    ]

    es_names = [
        "lr1",
    #    "lr2",
    #    "rf",
        "gbc"
    ]

    paramss = [
        {
            #"lr1__C": expon(scale=0.1),
            #"lr1__C": uniform(0,5)#[2e-3],
            #One shot
        "lr1__C": [2e-3]
Example No. 17
pipeline = Pipeline([('deal_na', Deal_NAs()),
                     ('encode_cat', Encode_CatCols(drop=['Name', 'Ticket'])),
                     scale_num])
#X_prepared = pipeline.fit_transform(X_)
X_train_p = pipeline.fit_transform(X_train)
X_vali_p = pipeline.transform(X_vali)

from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# Each assignment below overwrites the previous one; only the last model (gbc) is actually fit
model = lr(C=1)
model = dtc(min_samples_split=10, max_features=5)
model = abc(dtc(max_depth=4), n_estimators=100)
model = gbc(n_estimators=200)
#model = rfc(n_estimators=200 ,min_samples_split = 5)
model.fit(X_train_p, Y_train)
# print(model.score(X_train_p, Y_train))
# print(model.score(X_vali_p, Y_vali))
# coef_df = pd.DataFrame({'name':X_train_p.columns.tolist(), 'coef':model.coef_[0]})
# coef_df.sort_values('coef', ascending = False)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
Y_pred = model.predict(X_vali_p)
print(classification_report(Y_vali, Y_pred))

print(submit.head())

test_p = pipeline.transform(test)
Example No. 18
def split_data(data,test_size):
    c=len(data)
    if(c<=test_size):
        test_size=int(c/2)
    t=np.arange(c)
    np.random.shuffle(t)
    data_train=data[t[:c-test_size]]
    data_test=data[t[c-test_size:]]
    return data_train,data_test
data_train,data_test=split_data(data,50)

x_train=data_train[:,0:-1]
y_train=data_train[:,-1]

pgrid={'n_estimators':range(5,21,5)}
clf=GridSearchCV(estimator=gbc(n_estimators=100,verbose=1),param_grid=pgrid,cv=3)
clf.fit(x_train,y_train)
print(clf.best_estimator_)
y_pred=clf.predict(x_train)
y_proba=clf.predict_proba(x_train)
print('accuracy:{0}'.format(metrics.accuracy_score(y_train,y_pred)))
print('AUC:{0}'.format(metrics.roc_auc_score(y_train,y_proba,multi_class='ovr')))

# get the accuracy of the test data
x_test=data_test[:,0:-1]
y_test=data_test[:,-1]
yhat=clf.predict(x_test)
r=np.mean(y_test==yhat)
print(r)

print("------------------------------------------------")
print("                                                ")
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train_new)
x_validate_scaled = scaler.transform(x_validate)

# Gradient boosting   ------------------------------------------------------------------
print(" Gradient Boosting ... ")
print("------------------------------------------------")
print("                                                ")

# Training...........................................
print("Training...........................")

classifier = gbc()
gbc_model = classifier.fit(x_train_scaled, y_train_new)

with open(
        '/home/mkolpe2s/rand/Classic_ML/Proper_method/GB/US8K/gbc_US8K_default_parameter.pkl',
        'wb') as f:
    pickle.dump(gbc_model, f)

print("Train score:", gbc_model.score(x_train_scaled, y_train_new))

#Validation..............................
print("Validation...........................")
print("Validation score:", gbc_model.score(x_validate_scaled, y_validate))

#Testing.................................
print("Testing...........................")
    def classification(self, metric, folds, printt=True, graph=False):
        size = self.graph_width

        if len(self.y.iloc[:,0].unique()) > 2:
            struct = 'multiclass'
        else:
            struct = 'binary'

        # significant model setup differences should be list as different models
        models = {}
        models["Linear discriminant analysis"]          = ldac()
        models["Nearest centroid classifier euclidian"] = ncc(metric='euclidean')
        models["Nearest centroid classifier manhattan"] = ncc(metric='manhattan')
        models["K nearest neighbors classifier K2"]     = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]     = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"]    = knnc(n_neighbors=10)        
        models["Decision tree classifier"]              = dtc()
        models["Gaussian naive bayes"]                  = gnbc()
        models["Bernoulli naive bayes"]                 = bnbc(binarize=0.5)
        models["Multinomial naive bayes"]               = mnbc()
        models["SGD classifier"]                        = sgdc(max_iter=10000)
        models["Ridge classifier"]                      = rc()

        if len(self.Xt_train) < 10000:
            models["SVM classifier RBF"]                = svc(gamma='scale')
            models["SVM classifier Linear"]             = svc(kernel='linear')
            models["SVM classifier Poly"]               = svc(kernel='poly')

        if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
            models["Gradient boosting classifier"]      = gbc()
            models["Random forest classifier"]          = rfc(n_estimators=100)

        if struct == 'multiclass':
            models["Logistic classifier multinomial"]   = logitc(multi_class='multinomial', solver='lbfgs')
            models["Logistic classifier auto"]          = logitc(multi_class='auto')
            models["Logistic One vs Rest"]              = ovrc(logitc())
            models["Logistic One vs One"]               = ovoc(logitc())

        if struct == 'binary':
            models["Logistic classifier"]               = logitc(max_iter=2000)

        self.models = models

        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        et = []
        for model_name in models:
            start = time.time()
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train, cv=kf, scoring=metric, error_score=np.nan)  
            results.append(cv_scores)
            names.append(model_name)
            et.append((time.time() - start))
            #print(model_name, time.time() - start)
        report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        report.reset_index(inplace=True, drop=True)
        self.report_performance = report

        if printt:
            print('\n')
            print(self.report_width * '*', '\n*')
            print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
            print(self.report_width * '*', '')
            print(report)
            print('\n')

        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0, bottom=0.25)
            self.graphs_model.append(fig)
            plt.show()             
        return None
Example No. 21
df_wins = df_concat[['pt_diff', 'ast_diff', 'or_diff', 'dr_diff', 'to_diff', 'stl_diff', 'blk_diff',
'pf_diff', 'fgp_diff', '3p_diff', 'ftp_diff', 'seed_diff', 'Season', 'Wteam', 'Lteam']]
df_wins['result'] = 1
df_losses = -df_concat[['pt_diff', 'ast_diff', 'or_diff', 'dr_diff', 'to_diff', 'stl_diff', 'blk_diff',
'pf_diff', 'fgp_diff', '3p_diff', 'ftp_diff', 'seed_diff', 'Season', 'Wteam', 'Lteam']]
df_losses['result'] = 0

df_for_predictions = pd.concat((df_wins, df_losses))

columns = df_for_predictions.columns.tolist()
columns = columns[:-4]

given_data = df_for_predictions[columns].values  # as_matrix() was removed in newer pandas
target_data = np.array(df_for_predictions['result'].tolist())

clf = gbc(n_estimators = 10)
clf = clf.fit(given_data, target_data)
#
df_sample_sub = pd.read_csv('sample_submission.csv')
n_test_games = len(df_sample_sub)

def get_year_t1_t2(id):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in id.split('_'))

X_test = np.zeros(shape=(n_test_games, 12))
for ii, row in df_sample_sub.iterrows():
    year, t1, t2 = get_year_t1_t2(row.id)
    # There absolutely must be a better way of doing this!
    t1_score = df_teams[(df_teams.Team_Id == t1) & (df_teams.Season == year)][['score','ast','or','dr','to','stl','blk','pf','fgp','3p','ftp']].values[0]
    t2_score = df_teams[(df_teams.Team_Id == t2) & (df_teams.Season == year)][['score','ast','or','dr','to','stl','blk','pf','fgp','3p','ftp']].values[0]
Example No. 22
train_data_file, test_data_file, test_result_dir = cmd.TrainTestFileParser(sys.argv, num)

trdata = pd.read_csv(train_data_file, header=None, sep=" ")
tedata = pd.read_csv(test_data_file, header=None, sep=" ")

# initialize classifier

params = {
    "n_estimators": 1000,
    "max_depth": 3,
    "subsample": 0.5,
    "learning_rate": 0.01,
    "min_samples_leaf": 1,
    "random_state": 3,
}

model = gbc(**params)

model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])

acc = model.score(tedata.iloc[:, 1:], tedata.iloc[:, 0])

print("Accuracy: {:.4f}".format(acc))

# generate predictions
# preds = np.array(model.predict_proba(testdata))[:,1]


# save out
# mg.writeout(preds,testid,'predictions/rfcmodel_test.csv')
Example No. 23
    # standardize data

    num_index = [0, 1, 3, 4, 5]
    num_mu = np.zeros(X_train.shape[1])
    num_std = np.ones(X_train.shape[1])

    x_all = np.concatenate((X_train, X_test), axis=0)
    mu = np.mean(x_all, axis=0)
    std = np.std(x_all, axis=0)

    num_mu[num_index] = mu[num_index]
    num_std[num_index] = std[num_index]

    x_all_normal = (x_all - num_mu) / num_std
    x_train = x_all_normal[:X_train.shape[0]]
    x_test = x_all_normal[X_train.shape[0]:]

    y_train = Y_train.reshape(-1, )

    # initialize model and test and output result

    gbc_best = gbc(n_estimators=356, learning_rate=0.165,
                   random_state=112).fit(x_train, y_train)
    y_predict = gbc_best.predict(x_test)

    with open(str(sys.argv[6]), 'w', newline='') as csvfile:
        wr = csv.writer(csvfile)
        wr.writerow(['id', 'label'])
        for row_ind, row in enumerate(y_predict):
            wr.writerow([str(row_ind + 1), row])
Example No. 24
def make_model(col_labels = None, year = 2017, model_type = None):
    """make and run model"""

    data = pd.read_csv('NCAA2001_2017.csv')
    data_2018 = pd.read_csv('NCAA2018.csv')
    data_2018['year'] = 2018
    # DataFrame.append was removed in pandas 2.0; concat is the modern equivalent
    data = pd.concat([data, data_2018])

    # data to pull from the data frame
    if col_labels is None:
        col_labels = [
                'TopEFGPer', # effective field goal percentage
                'TopFTR', # free throw rate
                'TopTOPer', # turnover percentage
                'TopDRTG', # defensive rating
                'TopSOS', # strength of schedule
                'BotEFGPer',
                'BotFTR',
                'BotTOPer',
                'BotDRTG',
                'BotSOS'
                ]

    # don't scale SeedType
    if 'SeedType' in col_labels:
        col_labels.remove('SeedType')
        if len(col_labels) != 0:
            data[col_labels] = scale(data[col_labels])
        col_labels.insert(0, 'SeedType')
        
    else:
        data[col_labels] = scale(data[col_labels])

    # change SeedTypes to integers in case need to encode later
    data = data.replace(
            ['OneSixteen', 'TwoFifteen', 'ThreeFourteen',
                'FourThirteen', 'FiveTwelve', 'SixEleven',
                'SevenTen', 'EightNine'],
            [1, 2, 3, 4, 5, 6, 7, 8])

    train = data.loc[(data['year'] != year) & \
            (data['year'] != 2018)][col_labels]
    train_results = data.loc[(data['year'] != year) & \
            (data['year'] != 2018)]['Upset'] # not a df

    test = data.loc[data['year'] == year][col_labels]
    results_columns = ['SeedType', 'TopSeed', 'BotSeed', 'Upset']
    test_results = data.loc[data['year'] == year][results_columns]

    # have to one-hot the seeding type if that's in there
    if 'SeedType' in col_labels:
        # categorical_features was removed in newer scikit-learn (use a ColumnTransformer instead)
        enc = OneHotEncoder(categorical_features = [0]) # must be first
        train = enc.fit_transform(train).toarray()
        # reuse the encoder fitted on train so test gets an identical column layout
        test = enc.transform(test).toarray()
    else:
        train = train.values   # as_matrix() was removed in newer pandas
        test = test.values

    # making the model
    if model_type == "forest":
        model = rf()
    elif model_type == "gbc":
        model = gbc()
    elif model_type == "svc":
        model = svc(probability = True)
    else:
        model = lm.LogisticRegression()
    model.fit(train, train_results.values)  # as_matrix() was removed in newer pandas

    predictions = model.predict_proba(test)
    proba = []
    for i in range(len(predictions)):
        proba.append(predictions[i][1]) # second column is upset percentage

    test_results['UpsetProba'] = proba
    test_results = test_results.sort_values('UpsetProba', ascending=False)  # DataFrame.sort() was removed; sort_values is the modern call

    print(test_results)
Example No. 25
    std = np.std(x_all, axis=0)

    num_mu[num_index] = mu[num_index]
    num_std[num_index] = std[num_index]

    x_all_normal = (x_all - num_mu) / num_std
    x_train = x_all_normal[:X_train.shape[0]]
    x_test = x_all_normal[X_train.shape[0]:]

    y_train = Y_train.reshape(-1, )

    # tune model

    top_n = 0.0
    top_lr = 0.0
    top_score = 0.0

    # tuning this grid takes roughly 2.5 hours on an Intel i5 CPU

    for n in [50, 80, 100, 150, 200, 250, 300, 350]:
        for lr in [0.001, 0.005, 0.01, 0.0375, 0.05, 0.0875, 0.1, 0.15]:
            gbc_model = gbc(n_estimators=n, learning_rate=lr, random_state=112)
            res = cva(gbc_model, x_train, y_train, cv=10, n_jobs=-1)
            final_score = np.mean(res['test_score'])
            if final_score > top_score:
                top_score = final_score
                top_n = n
                top_lr = lr

    print('top_n = ', top_n, ' top_lr = ', top_lr, ' top_score = ', top_score)
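`gbc` and `cva` are aliases imported outside the excerpt; given that `cva(...)` returns a dict with a 'test_score' key, `cva` is presumably scikit-learn's cross_validate (an assumption):

# Assumed imports for the aliases used above
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.model_selection import cross_validate as cva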
Example No. 26
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
#from sklearn.svm import SVC as svc

clf1 = gnb()
#clf2 = lr()
clf3 = rfc()
clf4 = abc()
clf5 = gbc()
#clf6 = svc()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split  # cross_validation module was removed in sklearn 0.20
from sklearn import metrics as mtr
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

	for num in num_list:

		train_data_file,test_data_file,test_result_dir = cmd.TrainTestFileParser(sys.argv,num)
		# test_result_file = open(test_result_dir+"rfc_test_result"+str(num)+".txt",'w+')


		trdata=pd.read_csv(train_data_file,header=None,sep=' ')
		tedata=pd.read_csv(test_data_file,header=None,sep=' ')


		params = {'n_estimators': 1000, 'max_depth': 3, 'subsample': 0.5,
          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}

		model= gbc(**params)



		model = model.fit(trdata.iloc[:,1:],trdata.iloc[:,0])
		accur = model.score(tedata.iloc[:,1:],tedata.iloc[:,0])
		print('Out of Bag accuracy: %f \n' %accur)

		sum += accur


	print ('Average accuracy:'+str(sum/5))
	# resultClass = model.predict(tedata.iloc[:,1:])
	# #resultLogProba = model.predict_log_proba(tedata.iloc[:,1:])
	# resultProba = model.predict_proba(tedata.iloc[:,1:])
	"""

	### Enumerate best performing classifiers for 59 feature test set
	"""
	clf_rfc1 = rfc(random_state = 30, criterion = 'entropy', max_depth = 6, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 70, max_features = 7)
	clf_rfc2 = rfc(random_state = 30, criterion = 'gini', n_estimators = 100)	# wild-card
	clf_gbc = gbc(random_state = 30, max_depth = 3, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 120, subsample = 0.9, learning_rate = 0.03)
	clf_etc1 = etc(random_state = 30, n_estimators = 350, criterion = 'entropy', min_samples_leaf = 2, min_samples_split = 5)
	clf_etc2 = etc(random_state = 30, n_estimators = 250, criterion = 'gini')		# wild-card
	# clf_adab = adab(random_state = 30, n_estimators = 300, learning_rate = 0.02)
	"""
	
	### Enumerate best performing classifiers for 22 feature test set
	clf_rfc1 = rfc(random_state = 30, criterion = 'entropy', max_depth = 7, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 50)
	clf_rfc2 = rfc(random_state = 30, criterion = 'gini', n_estimators = 120, max_depth = 8, min_samples_split = 2, min_samples_leaf = 5)	
	clf_gbc = gbc(random_state = 30, max_depth = 4, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 80, subsample = 0.5, learning_rate = 0.03)
	clf_etc1 = etc(random_state = 30, n_estimators = 375, criterion = 'entropy', min_samples_leaf = 2, min_samples_split = 5)
	clf_etc2 = etc(random_state = 30, n_estimators = 60, criterion = 'gini',  min_samples_leaf = 2, min_samples_split = 10 )		
	
	clfs = [clf_rfc1,
					clf_rfc2,
					clf_etc1,
					clf_etc2,
					# pipe_adab,
					# clf_adab,
					# pipe_svc,
					clf_gbc]
	"""
	# Replace with Calibrated Classifiers instead	(cv5xccv5 is a bad idea because CCV splits are not stratified) - maybe try 3x3 or 5x2 instead 
	clfs = [CalibratedClassifierCV(clf_rfc1, cv=calib_folds, method='isotonic'),
					CalibratedClassifierCV(clf_gbc, cv=calib_folds, method='isotonic'),
Example No. 29
#pca = PCA(n_components=40)
#pca.fit(nptrd[:,range(1,94)])
#X = pca.transform(nptrd[:,range(1,94)])
PCAExplained = sum(pipeline.named_steps['pca'].explained_variance_ratio_)

# Most of the features are highly skewed, i.e. their 75th-percentile values in td.describe() are 0 while their max is much higher.
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data.. could .. not sure

#forest = rfc(n_estimators=100,n_jobs=-1,min_samples_split=20,min_samples_leaf=10)

#forest = rfc(n_estimators=100,n_jobs=-1)

start = time.time()

forest = gbc(n_estimators=25, learning_rate=.15, min_samples_leaf=6,subsample=.9,max_features=0.6,verbose=2,max_depth=15)

forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
#forest = forest.fit(X,nptrd[:,-1])

temp = forest.predict(nptrd[:,range(1,94)])
#temp = forest.predict(X)
TrainError = sum(temp == nptrd[:,-1]) / (len(nptrd)*1.0)  # note: this is actually the training accuracy, not the error

# Need to spend some time checking for overfit - using some elbow techiques maybe


# Cross validate the model using the cross validation dataset

#XCv = pipeline.transform(npcvd[:,range(1,94)])
outputCv = forest.predict(npcvd[:,range(1,94)])
Example No. 30
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])  # categorical_features was removed in newer scikit-learn
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Fitting a Gradient Boosting classifier to the Training set (GradientBoostingClassifier, not XGBoost)
from sklearn.ensemble import GradientBoostingClassifier as gbc
classifier = gbc()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()
accuracies.std()
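Run as a plain script (rather than interactively), the last two expressions compute but display nothing; a one-line report, purely illustrative, would be:

# Report the cross-validated accuracy
print("CV accuracy: {:.4f} (+/- {:.4f})".format(accuracies.mean(), accuracies.std()))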