def regression(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)
    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        est = lr(n_jobs=-1)  # the unused y.nunique() call (a leftover from the classifier twin below) was dropped
    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("unknown model: %s" % model)  # previously fell through to a NameError on est
    est.fit(data, y)
    return est
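# Usage sketch for regression() above (added illustration; the synthetic data
# below is an assumption, not part of the original snippet):
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 4))
y_demo = X_demo[:, 0] * 2.0 + rng.normal(scale=0.1, size=100)
est = regression(X_demo, y_demo, model="boost")
print(est.predict(X_demo[:5]))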
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.ensemble import GradientBoostingClassifier as gbc

def model_data(training_data):
    dtc = DecisionTreeClassifier(random_state=9, min_samples_split=5)
    dtc.fit(training_data['data'], training_data['result'])
    nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    nn.fit(training_data['data'], training_data['result'])
    svc = SVC(C=100, kernel="linear")
    svc.fit(training_data['data'], training_data['result'])
    rfc = RFC(n_estimators=10, criterion='entropy', max_depth=10,
              min_samples_split=5, bootstrap=True, random_state=None)  # bootstrap=True, not the string 'true'
    rfc.fit(training_data['data'], training_data['result'])
    knc_map = knc(n_neighbors=15, weights='distance')
    knc_map.fit(training_data['data'], training_data['result'])
    gbc_map = gbc(n_estimators=150, verbose=0)
    gbc_map.fit(training_data['data'], training_data['result'])
    return {
        'Decision Tree Classifier': dtc,
        'Neural Networks': nn,
        'Support Vector Machines': svc,
        'Random Forest Classification': rfc,
        'k Nearest Neighbours': knc_map,
        'Gradient Boosting Classifier': gbc_map,
    }
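# Usage sketch for model_data() (added illustration; the training_data layout
# is inferred from the dict keys used above):
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
fitted = model_data({'data': X_demo, 'result': y_demo})
for name, clf in fitted.items():
    print(name, clf.score(X_demo, y_demo))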
def classifier(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)
    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        cases = y.nunique()  # y must be a pandas Series here
        if cases > 2:
            est = lr(solver="newton-cg", multi_class="multinomial")
        else:
            est = lr(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("unknown model: %s" % model)  # previously fell through to a NameError on est
    est.fit(data, y)
    return est
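# Usage sketch for classifier() (added illustration): the "logistic" branch
# calls y.nunique(), so y is wrapped in a pandas Series. Note multi_class is
# deprecated in recent scikit-learn; this exercises the snippet as written.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
clf = classifier(iris.data, pd.Series(iris.target), model="logistic")
print(clf.predict(iris.data[:5]))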
def gradient_boosting_classifier(x_train, y_train, x_test, y_test, num_tree):
    # note: loss='deviance' was renamed to 'log_loss' in scikit-learn 1.1
    model = gbc(loss='deviance', learning_rate=0.2, n_estimators=num_tree,
                subsample=1.0, min_samples_split=2, min_samples_leaf=10,
                min_weight_fraction_leaf=0.0, max_depth=5, init=None,
                random_state=None, max_features=None, verbose=0,
                max_leaf_nodes=None, warm_start=False)
    model.fit(x_train, y_train)
    expected = y_test
    predicted = model.predict(x_test)
    return expected, predicted
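# Usage sketch (added illustration with synthetic data; assumes gbc is the
# GradientBoostingClassifier alias this snippet relies on, and the older
# scikit-learn that still accepts loss='deviance'):
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
expected, predicted = gradient_boosting_classifier(X_tr, y_tr, X_te, y_te, num_tree=50)
print("accuracy:", accuracy_score(expected, predicted))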
def rfe_feature_output(x_train, y_train, x_test, y_test):
    rfe_tree = pd.DataFrame(recursive_ftr_elim(etc(), x_train, y_train, 1))
    rfe_tree.columns = ["etc_ftr_nm", "etc_ftr_rank"]
    rfe_gbst = pd.DataFrame(recursive_ftr_elim(gbc(), x_train, y_train, 1))
    rfe_gbst.columns = ["gbst_ftr_nm", "gbst_ftr_rank"]
    rfe_log = pd.DataFrame(
        recursive_ftr_elim(LogisticRegression(), x_train, y_train, 1))
    rfe_log.columns = ["log_regr_ftr_nm", "log_regr_ftr_rank"]
    pd.concat([rfe_tree, rfe_log, rfe_gbst], axis=1).to_csv(
        elim_out_path + timestamp + ".csv")
def gradient_boosting_classifier(f_train, l_train, f_test):
    from sklearn.ensemble import GradientBoostingClassifier as gbc
    import time
    clf = gbc()
    start_time = time.time()
    clf.fit(f_train, l_train)
    print("Training Time: %s seconds" % (time.time() - start_time))
    start_time = time.time()
    pred = clf.predict_proba(f_test)
    print("Predicting Time: %s seconds" % (time.time() - start_time))
    return pred
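# Usage sketch (added illustration): predict_proba returns one column per
# class, so hard labels can be recovered via argmax over the columns.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=2)
proba = gradient_boosting_classifier(X[:150], y[:150], X[150:])
labels = np.argmax(proba, axis=1)
print(labels[:10])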
def train_model_gbc(features, labels):
    # Start with reduced param space
    # params_dict = {'n_estimators': [50, 60, 70, 80, 90], 'max_depth': [3],
    #                'min_samples_leaf': [1, 2], 'learning_rate': [0.04, 0.05, 0.06],
    #                'min_samples_split': [2, 5, 10], 'subsample': [0.8, 0.9, 1]}
    # params_dict = {'n_estimators': [70, 80, 90], 'max_depth': [3, 4],
    #                'learning_rate': [0.03, 0.04, 0.05], 'subsample': [0.7, 0.8, 0.9],
    #                'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
    #                'min_samples_split': [2, 5, 10]}
    params_dict = {'n_estimators': [60, 80, 100], 'max_depth': [4, 5, 6],
                   'learning_rate': [0.03, 0.05, 0.07], 'subsample': [0.5, 0.7, 0.9]}
    # Train estimator (initially only on final count)
    clf = GridSearchCV(gbc(random_state=30), params_dict, n_jobs=4,
                       scoring='roc_auc', cv=5)
    clf.fit(features, labels)
    print("Best estimator: ", clf.best_estimator_)
    print("Best grid score: %.4f" % clf.best_score_)
    return clf
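# Usage sketch (added illustration): the returned GridSearchCV refits the best
# model, so it can predict directly; assumes gbc and GridSearchCV are imported
# at module level as the function expects.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=3)
search = train_model_gbc(X, y)
print(search.best_params_)
print(search.predict(X[:5]))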
def gbm():
    global features_train, labels_train, features_test
    from sklearn.ensemble import GradientBoostingClassifier as gbc
    # sklearn.grid_search was removed in scikit-learn 0.20; use model_selection
    from sklearn.model_selection import GridSearchCV as gscv
    param = {
        'learning_rate': [0.1, 0.01, 0.3, 0.4, 0.5, 0.2],
        "n_estimators": [10, 50, 100],
        'min_samples_split': [2, 5, 10, 15, 20, 25, 30]
    }
    svr = gbc()
    clf = gscv(svr, param)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    return pred
def model_gradientboosting_classifier(X_train, X_test, y_train, y_test):
    # `count` and `independentcols` are assumed to be module-level globals
    model_name = f'model_{count}_gradientboosting_classifier'
    model = gbc()
    model.fit(X_train, y_train)
    model.independentcols = independentcols
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # print(classification_report(y_test, y_pred))
    score = accuracy_score(y_test, y_pred)
    print(f'{model_name} accuracy: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')
def gradiant_boosting(X_train, y_train, X_valid, y_valid, feature_list=None, top_features_num=20):
    t0 = time()
    clf = gbc()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    # np.savetxt("random.csv", y_pred.astype(int), fmt='%i', delimiter=",")
    print("Classification report")
    print(classification_report(y_valid, y_pred))
    print("Confusion_matrix")
    print(confusion_matrix(y_valid, y_pred))
    print("done in %fs" % (time() - t0))
    y_score = clf.predict_proba(X_valid)[:, 1]
    if feature_list is not None:
        selected_features = rank_features(clf, feature_list, top_features_num)
        return y_score, selected_features
    return y_score  # avoid returning None when no feature_list is given
def grad_bst_prediction(x_train, y_train, x_test, y_test):
    gb = gbc()
    gbst = gb.fit(x_train, y_train)
    gbst_pred = gbst.predict(x_test)
    if on_scrn == "Yes":
        print(divider)
        print("\nGradient Boosting Classifier mislabeled %d points out of a total of %d points" % (
            (y_test != gbst_pred).sum(), x_test.shape[0]))
        print("\nGradient Boosting Classification Report:\n")
        print(classification_report(y_test, gbst_pred))
        print(divider)
        cm(y_test, gbst_pred)
        print(divider)
    return gbst_pred
def classification(self, metric, folds, alphas, graph):
    size = 1.3 * self.report_width // 10
    models = {}
    models["K nearest neighbors classifier K2"] = knnc(n_neighbors=2)
    models["K nearest neighbors classifier K5"] = knnc(n_neighbors=5)
    models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)
    models["Decision tree classifier"] = dtc()
    models["Logistic classifier"] = logitc()
    models["SVM classifier with RBF kernel"] = svc(gamma='scale')
    models["SVM classifier with linear kernel"] = svc(kernel='linear')
    models["Gaussian naive bayes"] = gnbc()
    models["Bernoulli naive bayes"] = bnbc()
    models["SGD classifier"] = sgdc(max_iter=10000)
    models["Random forest classifier"] = rfc(n_estimators=100)
    models["Gradient boosting classifier"] = gbc()
    self.models = models
    print('\n')
    print(self.report_width * '*', '\n*')
    print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    for model_name in models:
        cv_scores = cross_val_score(models[model_name], self.Xt_train,
                                    self.yt_train.values.ravel(), cv=kf,
                                    scoring=metric, error_score=np.nan)  # metric was previously unused
        results.append(cv_scores)
        names.append(model_name)
    print(self.report_width * '*', '')
    report = pd.DataFrame({'Classifier': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True, ascending=False)
    report.drop('Score', axis=1, inplace=True)
    display(report)
    print('\n')
    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Classifier Comparison')
        # ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()
    return None
def train_model_gbc_calibrated_cv(features, labels, hold_out=False, train_sz=0.9):
    features_train, features_test = [], []
    labels_train, labels_test = [], []
    if hold_out:
        # First, set aside some of the training set for calibration.
        # Use stratified shuffle split so that class ratios are maintained after the split.
        # (Old sklearn.cross_validation API: the splitter takes the labels directly.)
        splitter = StratifiedShuffleSplit(labels, n_iter=1, train_size=train_sz, random_state=30)
        # Length is 1 in this case since we have a single fold for splitting
        print(len(splitter))
        for train_idx, test_idx in splitter:
            features_train, features_test = features[train_idx], features[test_idx]
            labels_train, labels_test = labels[train_idx], labels[test_idx]
    else:
        features_train = features
        labels_train = labels
    print("features_train shape: ", features_train.shape)
    print("labels_train shape: ", labels_train.shape)
    if hold_out:
        print("features_test shape: ", features_test.shape)
        print("labels_test shape: ", labels_test.shape)
    print("Parameters selected based on prior grid search ...")
    # clf = gbc(random_state=30, max_depth=4, min_samples_leaf=2, min_samples_split=2,
    #           n_estimators=80, learning_rate=0.03, subsample=0.8, max_features='sqrt')
    # clf = gbc(random_state=30, max_depth=3, min_samples_leaf=1, min_samples_split=5,
    #           n_estimators=120, learning_rate=0.03, subsample=0.8)
    clf = gbc(random_state=30, max_depth=4, min_samples_leaf=1, min_samples_split=2,
              n_estimators=80, learning_rate=0.03, subsample=0.5)
    # Perform calibration.
    # sklearn cautions against 'isotonic' with fewer than 1000 calibration samples,
    # as it can overfit; 'sigmoid' is the safer default.
    # 05/22 - Looks like isotonic does better than sigmoid for both Brier score and roc_auc_score.
    # Using a 30-40% holdout actually improves holdout ROC AUC from 0.88 to 0.925 with CV=5.
    print("Performing calibration now ...")
    # sigmoid = CalibratedClassifierCV(clf, cv=5, method='sigmoid')
    sigmoid = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    sigmoid.fit(features_train, labels_train)
    if hold_out:
        # Calculate Brier score loss
        y_probs = sigmoid.predict_proba(features_test)[:, 1]
        clf_score = brier_score_loss(labels_test, y_probs)
        print("Brier score: ", clf_score)
        auc_score = estimate_roc_auc(sigmoid, features_test, labels_test)
    return sigmoid
def train():
    X_train, X_valid, y_train, y_valid = load_train_data()
    # Number of trees, increase this to beat the benchmark ;)
    # n_estimators = 10
    # clf = RandomForestClassifier(n_estimators=n_estimators)
    # for r in np.arange(3, 50, 3):
    #     tclf = gbc(n_estimators=50, learning_rate=0.35, max_depth=5,
    #                max_features=0.7, min_samples_leaf=6, min_samples_split=r)
    #     print(" --------- Testing Stuff -------------", "min_samples_split=", r)
    #     tclf.fit(X_train, y_train)
    #     ty_prob = tclf.predict_proba(X_valid)
    #     tencoder = LabelEncoder()
    #     ty_true = tencoder.fit_transform(y_valid)
    #     assert (tencoder.classes_ == tclf.classes_).all()
    #     tscore = logloss_mc(ty_true, ty_prob)
    #     print(" -- Multiclass logloss While Testing Stuff: {:.4f}.".format(tscore))
    clf = gbc(n_estimators=25, learning_rate=0.18, min_samples_leaf=6,
              max_features=0.8, subsample=0.9, verbose=2, max_depth=10)
    # clf = gbc(n_estimators=20, learning_rate=.13, min_samples_leaf=6,
    #           subsample=.8, max_features=0.6, verbose=2, max_depth=20)
    print(" -- Start training Gradient Boosting Classifier.")  # message previously said Random Forest
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_valid)
    print(" -- Finished training.")
    encoder = LabelEncoder()
    y_true = encoder.fit_transform(y_valid)
    assert (encoder.classes_ == clf.classes_).all()
    score = logloss_mc(y_true, y_prob)
    print(" -- Multiclass logloss on validation set: {:.4f}.".format(score))
    return clf, encoder
def grad_bst_ftr_eval(x_train, y_train, x_test, y_test):
    gb = gbc()
    gbst = gb.fit(x_train, y_train)
    gbst_ftr_imp = list(gbst.feature_importances_)
    gbst_ftr_eval = []
    for feature, importance in zip(depvar_ftrs, gbst_ftr_imp):
        ftr_update = {"name": feature, "score": importance}
        gbst_ftr_eval.append(ftr_update)
        if importance == 0.0:
            gbst_ftr_elim.append(feature)
    gbst_ftr_eval = sorted(gbst_ftr_eval, key=itemgetter("score"), reverse=True)
    if eval_on_scrn == "Yes":
        print(divider)
        print("\nGradient Boosting Classifier evaluated dependent variable data features as follows:\n")
        for i in range(len(gbst_ftr_eval)):
            print("{}. {}: {:.5f}".format(
                i + 1, gbst_ftr_eval[i]["name"].title(), gbst_ftr_eval[i]["score"]))
        print(divider)
    return gbst_ftr_eval
    1
]
else:
    print("Searching over GBC and lasso")
    # note: Imputer is the pre-0.22 scikit-learn API (replaced by SimpleImputer)
    ess = [
        Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
                  ("sc", StandardScaler()),
                  ("lr1", lr(n_jobs=args.n_jobs, penalty='l1', class_weight='balanced',
                             random_state=args.random_state))]),
        # Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
        #           ("sc", StandardScaler()),
        #           ("lr2", lr(n_jobs=n_jobs, class_weight='balanced', random_state=random_state))]),
        # Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
        #           ("rf", rf(n_jobs=n_jobs, class_weight='balanced', random_state=random_state))]),
        Pipeline([("im", Imputer(missing_values=np.NAN, strategy="most_frequent", axis=0, verbose=10)),
                  ("gbc", gbc(random_state=args.random_state))]),
    ]
    es_names = [
        "lr1",
        # "lr2",
        # "rf",
        "gbc",
    ]
    paramss = [
        {
            # "lr1__C": expon(scale=0.1),
            # "lr1__C": uniform(0, 5)  # [2e-3],
            # One shot
            "lr1__C": [2e-3]
pipeline = Pipeline([('deal_na', Deal_NAs()),
                     ('encode_cat', Encode_CatCols(drop=['Name', 'Ticket'])),
                     scale_num])
# X_prepared = pipeline.fit_transform(X_)
X_train_p = pipeline.fit_transform(X_train)
X_vali_p = pipeline.transform(X_vali)

from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc

# Each assignment below overwrites the previous one; only the last
# (gradient boosting) model is actually fitted.
model = lr(C=1)
model = dtc(min_samples_split=10, max_features=5)
model = abc(dtc(max_depth=4), n_estimators=100)
model = gbc(n_estimators=200)
# model = rfc(n_estimators=200, min_samples_split=5)
model.fit(X_train_p, Y_train)
# print(model.score(X_train_p, Y_train))
# print(model.score(X_vali_p, Y_vali))
# coef_df = pd.DataFrame({'name': X_train_p.columns.tolist(), 'coef': model.coef_[0]})
# coef_df.sort_values('coef', ascending=False)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
Y_pred = model.predict(X_vali_p)
print(classification_report(Y_vali, Y_pred))
print(submit.head())
test_p = pipeline.transform(test)
def split_data(data, test_size):
    c = len(data)
    if c <= test_size:
        test_size = int(c / 2)
    t = np.arange(c)
    np.random.shuffle(t)
    data_train = data[t[:c - test_size]]
    data_test = data[t[c - test_size:]]
    return data_train, data_test

data_train, data_test = split_data(data, 50)
x_train = data_train[:, 0:-1]
y_train = data_train[:, -1]

# the grid's n_estimators values override the estimator's n_estimators=100
pgrid = {'n_estimators': range(5, 21, 5)}
clf = GridSearchCV(estimator=gbc(n_estimators=100, verbose=1), param_grid=pgrid, cv=3)
clf.fit(x_train, y_train)
print(clf.best_estimator_)
y_pred = clf.predict(x_train)
y_proba = clf.predict_proba(x_train)
print('accuracy: {0}'.format(metrics.accuracy_score(y_train, y_pred)))
print('AUC: {0}'.format(metrics.roc_auc_score(y_train, y_proba, multi_class='ovr')))

# get the accuracy of the test data
x_test = data_test[:, 0:-1]
y_test = data_test[:, -1]
yhat = clf.predict(x_test)
r = np.mean(y_test == yhat)
print(r)
print("------------------------------------------------") print(" ") scaler = StandardScaler() x_train_scaled = scaler.fit_transform(x_train_new) x_validate_scaled = scaler.transform(x_validate) # Gradient boosting ------------------------------------------------------------------ print(" Gradient Boosting ... ") print("------------------------------------------------") print(" ") # Training........................................... print("Training...........................") classifier = gbc() gbc_model = classifier.fit(x_train_scaled, y_train_new) with open( '/home/mkolpe2s/rand/Classic_ML/Proper_method/GB/US8K/gbc_US8K_default_parameter.pkl', 'wb') as f: pickle.dump(gbc_model, f) print("Train score:", gbc_model.score(x_train_scaled, y_train_new)) #Validation.............................. print("Validation...........................") print("Validation score:", gbc_model.score(x_validate_scaled, y_validate)) #Testing................................. print("Testing...........................")
def classification(self, metric, folds, printt=True, graph=False):
    size = self.graph_width
    if len(self.y.iloc[:, 0].unique()) > 2:
        struct = 'multiclass'
    else:
        struct = 'binary'
    # significant model setup differences should be listed as different models
    models = {}
    models["Linear discriminant analysis"] = ldac()
    models["Nearest centroid classifier euclidian"] = ncc(metric='euclidean')
    models["Nearest centroid classifier manhattan"] = ncc(metric='manhattan')
    models["K nearest neighbors classifier K2"] = knnc(n_neighbors=2)
    models["K nearest neighbors classifier K5"] = knnc(n_neighbors=5)
    models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)
    models["Decision tree classifier"] = dtc()
    models["Gaussian naive bayes"] = gnbc()
    models["Bernoulli naive bayes"] = bnbc(binarize=0.5)
    models["Multinomial naive bayes"] = mnbc()
    models["SGD classifier"] = sgdc(max_iter=10000)
    models["Ridge classifier"] = rc()
    if len(self.Xt_train) < 10000:
        models["SVM classifier RBF"] = svc(gamma='scale')
        models["SVM classifier Linear"] = svc(kernel='linear')
        models["SVM classifier Poly"] = svc(kernel='poly')
    if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
        models["Gradient boosting classifier"] = gbc()
        models["Random forest classifier"] = rfc(n_estimators=100)
    if struct == 'multiclass':
        models["Logistic classifier multinomial"] = logitc(multi_class='multinomial', solver='lbfgs')
        models["Logistic classifier auto"] = logitc(multi_class='auto')
        models["Logistic One vs Rest"] = ovrc(logitc())
        models["Logistic One vs One"] = ovoc(logitc())
    if struct == 'binary':
        models["Logistic classifier"] = logitc(max_iter=2000)
    self.models = models
    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    et = []
    for model_name in models:
        start = time.time()
        cv_scores = cross_val_score(models[model_name], self.Xt_train,
                                    self.yt_train, cv=kf, scoring=metric,
                                    error_score=np.nan)
        results.append(cv_scores)
        names.append(model_name)
        et.append(time.time() - start)
        # print(model_name, time.time() - start)
    report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True, ascending=False)
    report.drop('Score', axis=1, inplace=True)
    report.reset_index(inplace=True, drop=True)
    self.report_performance = report
    if printt:
        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        print(self.report_width * '*', '')
        print(report)
        print('\n')
    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Classifier Comparison')
        # ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0, bottom=0.25)
        self.graphs_model.append(fig)
        plt.show()
    return None
df_wins = df_concat[['pt_diff', 'ast_diff', 'or_diff', 'dr_diff', 'to_diff',
                     'stl_diff', 'blk_diff', 'pf_diff', 'fgp_diff', '3p_diff',
                     'ftp_diff', 'seed_diff', 'Season', 'Wteam', 'Lteam']]
df_wins['result'] = 1
df_losses = -df_concat[['pt_diff', 'ast_diff', 'or_diff', 'dr_diff', 'to_diff',
                        'stl_diff', 'blk_diff', 'pf_diff', 'fgp_diff', '3p_diff',
                        'ftp_diff', 'seed_diff', 'Season', 'Wteam', 'Lteam']]
df_losses['result'] = 0
df_for_predictions = pd.concat((df_wins, df_losses))
columns = df_for_predictions.columns.tolist()
columns = columns[:-4]  # drop Season, Wteam, Lteam, result
given_data = df_for_predictions[columns].values  # as_matrix() was removed in pandas 1.0
target_data = np.array(df_for_predictions['result'].tolist())

clf = gbc(n_estimators=10)
clf = clf.fit(given_data, target_data)

# df_sample_sub = pd.read_csv('sample_submission.csv')
n_test_games = len(df_sample_sub)

def get_year_t1_t2(id):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in id.split('_'))

X_test = np.zeros(shape=(n_test_games, 12))
for ii, row in df_sample_sub.iterrows():
    year, t1, t2 = get_year_t1_t2(row.id)
    # There absolutely must be a better way of doing this!
    t1_score = df_teams[(df_teams.Team_Id == t1) & (df_teams.Season == year)][
        ['score', 'ast', 'or', 'dr', 'to', 'stl', 'blk', 'pf', 'fgp', '3p', 'ftp']].values[0]
    t2_score = df_teams[(df_teams.Team_Id == t2) & (df_teams.Season == year)][
        ['score', 'ast', 'or', 'dr', 'to', 'stl', 'blk', 'pf', 'fgp', '3p', 'ftp']].values[0]
train_data_file, test_data_file, test_result_dir = cmd.TrainTestFileParser(sys.argv, num)
trdata = pd.read_csv(train_data_file, header=None, sep=" ")
tedata = pd.read_csv(test_data_file, header=None, sep=" ")

# initialize classifier
params = {
    "n_estimators": 1000,
    "max_depth": 3,
    "subsample": 0.5,
    "learning_rate": 0.01,
    "min_samples_leaf": 1,
    "random_state": 3,
}
model = gbc(**params)
model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
acc = model.score(tedata.iloc[:, 1:], tedata.iloc[:, 0])
print("Accuracy: {:.4f}".format(acc))

# generate predictions
# preds = np.array(model.predict_proba(testdata))[:, 1]
# save out
# mg.writeout(preds, testid, 'predictions/rfcmodel_test.csv')
# standardize data (only the numeric columns listed in num_index)
num_index = [0, 1, 3, 4, 5]
num_mu = np.zeros(X_train.shape[1])
num_std = np.ones(X_train.shape[1])
x_all = np.concatenate((X_train, X_test), axis=0)
mu = np.mean(x_all, axis=0)
std = np.std(x_all, axis=0)
num_mu[num_index] = mu[num_index]
num_std[num_index] = std[num_index]
x_all_normal = (x_all - num_mu) / num_std
x_train = x_all_normal[:X_train.shape[0]]
x_test = x_all_normal[X_train.shape[0]:]
y_train = Y_train.reshape(-1, )

# initialize model and test and output result
gbc_best = gbc(n_estimators=356, learning_rate=0.165, random_state=112).fit(x_train, y_train)
y_predict = gbc_best.predict(x_test)
with open(str(sys.argv[6]), 'w', newline='') as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(['id', 'label'])
    for row_ind, row in enumerate(y_predict):
        wr.writerow([str(row_ind + 1), row])
def make_model(col_labels=None, year=2017, model_type=None):
    """Make and run the model."""
    data = pd.read_csv('NCAA2001_2017.csv')
    data_2018 = pd.read_csv('NCAA2018.csv')
    data_2018['year'] = 2018
    data = data.append(data_2018)

    # data to pull from the data frame
    if col_labels is None:
        col_labels = [
            'TopEFGPer',  # effective field goal percentage
            'TopFTR',     # free throw rate
            'TopTOPer',   # turnover percentage
            'TopDRTG',    # defensive rating
            'TopSOS',     # strength of schedule
            'BotEFGPer',
            'BotFTR',
            'BotTOPer',
            'BotDRTG',
            'BotSOS'
        ]

    # don't scale SeedType
    if 'SeedType' in col_labels:
        col_labels.remove('SeedType')
        if len(col_labels) != 0:
            data[col_labels] = scale(data[col_labels])
        col_labels.insert(0, 'SeedType')
    else:
        data[col_labels] = scale(data[col_labels])

    # change SeedTypes to integers in case we need to encode later
    data = data.replace(
        ['OneSixteen', 'TwoFifteen', 'ThreeFourteen', 'FourThirteen',
         'FiveTwelve', 'SixEleven', 'SevenTen', 'EightNine'],
        [1, 2, 3, 4, 5, 6, 7, 8])

    train = data.loc[(data['year'] != year) & (data['year'] != 2018)][col_labels]
    train_results = data.loc[(data['year'] != year) & (data['year'] != 2018)]['Upset']  # not a df
    test = data.loc[data['year'] == year][col_labels]
    results_columns = ['SeedType', 'TopSeed', 'BotSeed', 'Upset']
    test_results = data.loc[data['year'] == year][results_columns]

    # have to one-hot the seeding type if that's in there
    if 'SeedType' in col_labels:
        enc = OneHotEncoder(categorical_features=[0])  # must be first
        train = enc.fit_transform(train).toarray()
        test = enc.fit_transform(test).toarray()
    else:
        train = train.values  # as_matrix() was removed in pandas 1.0
        test = test.values

    # making the model
    if model_type == "forest":
        model = rf()
    elif model_type == "gbc":
        model = gbc()
    elif model_type == "svc":
        model = svc(probability=True)
    else:
        model = lm.LogisticRegression()
    model.fit(train, train_results.values)

    predictions = model.predict_proba(test)
    proba = []
    for i in range(len(predictions)):
        proba.append(predictions[i][1])  # second column is upset percentage
    test_results['UpsetProba'] = proba
    test_results = test_results.sort_values('UpsetProba', ascending=False)  # DataFrame.sort() was removed in pandas 0.20
    print(test_results)
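# Usage sketch (added illustration): runs the gradient-boosted variant for the
# 2017 bracket; requires NCAA2001_2017.csv and NCAA2018.csv to be present.
make_model(year=2017, model_type="gbc")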
std = np.std(x_all, axis=0)
num_mu[num_index] = mu[num_index]
num_std[num_index] = std[num_index]
x_all_normal = (x_all - num_mu) / num_std
x_train = x_all_normal[:X_train.shape[0]]
x_test = x_all_normal[X_train.shape[0]:]
y_train = Y_train.reshape(-1, )

# tune model
top_n = 0.0
top_lr = 0.0
top_score = 0.0
# it takes 2.5 hours to tune the model on an Intel i5 CPU
for n in [50, 80, 100, 150, 200, 250, 300, 350]:
    for lr in [0.001, 0.005, 0.01, 0.0375, 0.05, 0.0875, 0.1, 0.15]:
        gbc_model = gbc(n_estimators=n, learning_rate=lr, random_state=112)
        res = cva(gbc_model, x_train, y_train, cv=10, n_jobs=-1)
        final_score = np.mean(res['test_score'])
        if final_score > top_score:
            top_score = final_score
            top_n = n
            top_lr = lr
print('top_n = ', top_n, ' top_lr = ', top_lr, ' top_score = ', top_score)
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB as gnb
# from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# from sklearn.svm import SVC as svc

clf1 = gnb()
# clf2 = lr()
clf3 = rfc()
clf4 = abc()
clf5 = gbc()
# clf6 = svc()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# (sklearn.cross_validation was renamed to sklearn.model_selection in 0.18+)
from sklearn.model_selection import train_test_split
from sklearn import metrics as mtr
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
for num in num_list:
    train_data_file, test_data_file, test_result_dir = cmd.TrainTestFileParser(sys.argv, num)
    # test_result_file = open(test_result_dir + "rfc_test_result" + str(num) + ".txt", 'w+')
    trdata = pd.read_csv(train_data_file, header=None, sep=' ')
    tedata = pd.read_csv(test_data_file, header=None, sep=' ')
    params = {'n_estimators': 1000, 'max_depth': 3, 'subsample': 0.5,
              'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
    model = gbc(**params)
    model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
    accur = model.score(tedata.iloc[:, 1:], tedata.iloc[:, 0])
    print('Test accuracy: %f \n' % accur)  # held-out test accuracy, not an out-of-bag estimate
    sum += accur  # note: `sum` shadows the built-in; a name like `total` would be safer
print('Average accuracy: ' + str(sum / 5))
# resultClass = model.predict(tedata.iloc[:, 1:])
# resultLogProba = model.predict_log_proba(tedata.iloc[:, 1:])
# resultProba = model.predict_proba(tedata.iloc[:, 1:])
""" ### Enumerate best performing classifiers for 59 feature test set """ clf_rfc1 = rfc(random_state = 30, criterion = 'entropy', max_depth = 6, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 70, max_features = 7) clf_rfc2 = rfc(random_state = 30, criterion = 'gini', n_estimators = 100) # wild-card clf_gbc = gbc(random_state = 30, max_depth = 3, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 120, subsample = 0.9, learning_rate = 0.03) clf_etc1 = etc(random_state = 30, n_estimators = 350, criterion = 'entropy', min_samples_leaf = 2, min_samples_split = 5) clf_etc2 = etc(random_state = 30, n_estimators = 250, criterion = 'gini') # wild-card # clf_adab = adab(random_state = 30, n_estimators = 300, learning_rate = 0.02) """ ### Enumerate best performing classifiers for 22 feature test set clf_rfc1 = rfc(random_state = 30, criterion = 'entropy', max_depth = 7, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 50) clf_rfc2 = rfc(random_state = 30, criterion = 'gini', n_estimators = 120, max_depth = 8, min_samples_split = 2, min_samples_leaf = 5) clf_gbc = gbc(random_state = 30, max_depth = 4, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 80, subsample = 0.5, learning_rate = 0.03) clf_etc1 = etc(random_state = 30, n_estimators = 375, criterion = 'entropy', min_samples_leaf = 2, min_samples_split = 5) clf_etc2 = etc(random_state = 30, n_estimators = 60, criterion = 'gini', min_samples_leaf = 2, min_samples_split = 10 ) clfs = [clf_rfc1, clf_rfc2, clf_etc1, clf_etc2, # pipe_adab, # clf_adab, # pipe_svc, clf_gbc] """ # Replace with Calibrated Classifiers instead (cv5xccv5 is a bad idea because CCV splits are not stratified) - maybe try 3x3 or 5x2 instead clfs = [CalibratedClassifierCV(clf_rfc1, cv=calib_folds, method='isotonic'), CalibratedClassifierCV(clf_gbc, cv=calib_folds, method='isotonic'),
# pca = PCA(n_components=40)
# pca.fit(nptrd[:, range(1, 94)])
# X = pca.transform(nptrd[:, range(1, 94)])
PCAExplained = sum(pipeline.named_steps['pca'].explained_variance_ratio_)

# Most of the features are highly skewed, i.e. their 75% value ranges (from td.describe())
# are 0 while their max is much higher. This indicates that only a few values are
# non-zero for most features. This could mean that these features are actually
# categorical variables encoded in the test data .. could .. not sure.
# forest = rfc(n_estimators=100, n_jobs=-1, min_samples_split=20, min_samples_leaf=10)
# forest = rfc(n_estimators=100, n_jobs=-1)
start = time.time()
forest = gbc(n_estimators=25, learning_rate=.15, min_samples_leaf=6,
             subsample=.9, max_features=0.6, verbose=2, max_depth=15)
forest = forest.fit(nptrd[:, range(1, 94)], nptrd[:, -1])
# forest = forest.fit(X, nptrd[:, -1])
temp = forest.predict(nptrd[:, range(1, 94)])
# temp = forest.predict(X)
TrainError = sum(temp == nptrd[:, -1]) / (len(nptrd) * 1.0)  # actually training accuracy, despite the name

# Need to spend some time checking for overfit - using some elbow techniques maybe
# Cross validate the model using the cross validation dataset
# XCv = pipeline.transform(npcvd[:, range(1, 94)])
outputCv = forest.predict(npcvd[:, range(1, 94)])
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]  # drop one dummy column to avoid the dummy-variable trap

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting Gradient Boosting to the Training set
# (the original comment said XGBoost, but this is scikit-learn's GradientBoostingClassifier)
from sklearn.ensemble import GradientBoostingClassifier as gbc
classifier = gbc()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
accuracies.mean()
accuracies.std()