class Boosting:
    """Wrapper around GradientBoostingClassifier (GB) that predicts a
    sparse matrix in roughly sqrt(n) batches, so only one batch at a
    time is densified by .toarray()."""

    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        """Fit the underlying classifier.

        :param X: training feature matrix
        :param y: training labels
        """
        self.clf.fit(X, y)

    def predict(self, X):
        """Predict labels for sparse X, one dense batch at a time.

        :param X: sparse feature matrix
        :return: list of predicted labels
        """
        n = X.shape[0]
        m = int(n ** 0.5)  # number of batches
        pred = []
        for i in range(m):
            batch = X[i * n // m:(i + 1) * n // m].toarray()
            pred.extend(self.clf.predict(batch))
        return pred
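# Usage sketch for the Boosting wrapper above. Hedged assumptions: `GB` is
# sklearn.ensemble.GradientBoostingClassifier, and X arrives as a scipy
# sparse matrix (which the .toarray() call in predict() implies); the demo
# data below is synthetic.
import numpy as np
from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier as GB

X_demo = sparse.random(100, 20, density=0.3, format='csr', random_state=0)
y_demo = np.random.RandomState(0).randint(0, 2, size=100)

booster = Boosting()
booster.fit(X_demo, y_demo)       # sklearn GBC accepts CSR input for fit
preds = booster.predict(X_demo)   # densifies ~sqrt(100) = 10 batches lazily
print(len(preds))                 # 100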
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] data_test = param["data_test"] idx = param["idx"] label_name = param["label_name"] # prepare data # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df_test = pd.read_csv(data_test, index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=50, learning_rate=0.3, ) clf.fit(X, y) y_pred = clf.predict(X_guest) acc = accuracy_score(y_guest, y_pred) result = {"accuracy": acc} print(result) return {}, result
class GradientBoostingClassifierImpl:
    """Hyperparameter-holding wrapper around sklearn's
    GradientBoostingClassifier (SKLModel). Note that presort and
    min_impurity_split target older sklearn releases; both have since
    been removed upstream."""

    def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                 subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                 min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                 random_state=None, max_features=None, verbose=0,
                 max_leaf_nodes=None, warm_start=False, presort='auto',
                 validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
        self._hyperparams = {
            'loss': loss,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'criterion': criterion,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_depth': max_depth,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'init': init,
            'random_state': random_state,
            'max_features': max_features,
            'verbose': verbose,
            'max_leaf_nodes': max_leaf_nodes,
            'warm_start': warm_start,
            'presort': presort,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'tol': tol
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
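# Minimal sanity check for the wrapper above. Hedged: assumes SKLModel is
# sklearn.ensemble.GradientBoostingClassifier on a release old enough
# (<= 0.21) to still accept presort and min_impurity_split; on newer
# sklearn those two constructor defaults would raise TypeError.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier as SKLModel

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
impl = GradientBoostingClassifierImpl(n_estimators=20, max_depth=2)
impl.fit(X_demo, y_demo)
print(impl.predict(X_demo[:5]))
print(impl.predict_proba(X_demo[:5]).shape)  # (5, 2)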
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df = df_guest.join(df_host, rsuffix='host') y = df[label_name] X = df.drop(label_name, axis=1) clf = GradientBoostingClassifier( random_state=0, n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1) clf.fit(X, y) y_prob = clf.predict(X) try: auc_score = roc_auc_score(y, y_prob) except: print(f"no auc score available") return result = {"auc": auc_score} print(result) return {}, result
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50, learning_rate=0.25,
                                    max_depth=5, random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
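# Example invocation of the helper above. Hedged: the train/dev split is
# synthetic; any feature matrix with binary labels works the same way.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=500, random_state=0)
X_tr, X_dv, y_tr, y_dv = train_test_split(X_all, y_all, test_size=0.2,
                                          random_state=0)
model, acc = gradientBoostingClassifier(X_tr, y_tr, X_dv, y_dv)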
def trainGBT(requestsQ, responsesQ):
    """Worker loop: pull (tag, vectors, hyperparams) jobs off requestsQ,
    train a gradient-boosted tree model, and push (model, score) onto
    responsesQ until a 'KILL' message arrives."""
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break
        vectors = args[1]
        # expected in the order: learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]
        model = GradientBoostingClassifier(learning_rate=hyperparams[0],
                                           n_estimators=hyperparams[1],
                                           min_samples_split=hyperparams[2],
                                           max_depth=hyperparams[3])
        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))
        responsesQ.put((model, score), True)
    return 0
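# Sketch of a parent process driving trainGBT. Hedged: illustrative only;
# it assumes the vectors dict holds in-memory arrays and reuses the 'KILL'
# sentinel the worker loop checks for. The hyperparameter list follows the
# order documented above: learningRate, maxTrees, minSplitSize, maxDepth.
import multiprocessing as mp
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    X, y = make_classification(n_samples=300, random_state=0)
    Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
    vectors = {'Xtrain': Xtr, 'Ytrain': ytr, 'Xtest': Xte, 'Ytest': yte}

    requestsQ, responsesQ = mp.Queue(), mp.Queue()
    worker = mp.Process(target=trainGBT, args=(requestsQ, responsesQ))
    worker.start()
    requestsQ.put(('TRAIN', vectors, [0.1, 100, 2, 3]))
    model, score = responsesQ.get()   # blocks until the worker responds
    requestsQ.put(('KILL',))
    worker.join()
    print(score)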
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1) clf.fit(X, y) y_prob = clf.predict(X_guest) try: auc_score = roc_auc_score(y_guest, y_prob) except: print(f"no auc score available") return result = {"auc": auc_score} import time print(result) print(data_guest) time.sleep(3) return {}, result
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train, y_test):
    # TODO: hyperparameter testing and cross-validation (a sketch follows this function)
    print('Applying Gradient Boosting')
    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)
    # Testing the classifier on test data
    y_test_pred = classifier.predict(X_test_preprocessed)
    # Compute accuracy score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)
    print('The accuracy achieved by the Gradient Boosting Classifier model is:', acc)
    return classifier, acc
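# The TODO above asks for hyperparameter testing and cross-validation; a
# minimal sketch with cross_val_score (hedged: assumes the same
# X_train_preprocessed/y_train arrays the function receives are available
# in scope; 5-fold accuracy only, not a full hyperparameter search):
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

cv_scores = cross_val_score(GradientBoostingClassifier(n_estimators=100),
                            X_train_preprocessed, y_train,
                            cv=5, scoring='accuracy')
print('CV accuracy: %.4f +/- %.4f' % (cv_scores.mean(), cv_scores.std()))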
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = df_guest.join(df_host, rsuffix='host') y = df[label_name] X = df.drop(label_name, axis=1) clf = GradientBoostingClassifier(random_state=0, n_estimators=50, learning_rate=0.3) clf.fit(X, y) y_pred = clf.predict(X) try: auc_score = roc_auc_score(y, y_pred) except: print(f"no auc score available") acc = accuracy_score(y, y_pred) result = {"accuracy": acc} print('multi result', result) return {}, result
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=50, learning_rate=0.3, ) clf.fit(X, y) y_pred = clf.predict(X_guest) acc = accuracy_score(y_guest, y_pred) result = {"accuracy": acc} print(result) return {}, result
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] data_test = param["data_test"] idx = param["idx"] label_name = param["label_name"] # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1) clf.fit(X, y) y_prob = clf.predict(X_guest) try: auc_score = roc_auc_score(y_guest, y_prob) except: print(f"no auc score available") return result = {"auc": auc_score} import time print(result) print(data_guest) time.sleep(3) return {}, result
normal = y[:, 3]
pod = y[:, 4]
smurf = y[:, 5]
teardrop = y[:, 6]
print(x.shape)        # np.shape: (rows, columns)
print(normal.shape)
y = teardrop          # for now, predict only the teardrop class

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0)  # 40% held out; random_state fixes the split
print('data load finish.....')
print(np.sum(y_train))
print(np.sum(y_test))

# no cross-validation here
# GBDT setup: too many estimators overfit, too few underfit; plus learning
# rate, maximum tree depth, and verbose logging
clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,
                                 max_depth=5, verbose=1)
clf.fit(X_train, y_train)        # train
y_ = clf.predict(X_test)         # predict
score = f1_score(y_test, y_)     # F1 on the held-out split
print(score)
joblib.dump(clf, 'model/teardrop_clf.m')
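# Reload sketch for the model persisted above (hedged: uses the standalone
# joblib package; older scripts imported it from sklearn.externals):
import joblib

clf_loaded = joblib.load('model/teardrop_clf.m')
print(f1_score(y_test, clf_loaded.predict(X_test)))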
test['Embarked'] = lbd2.transform(test['Embarked'])

# label encode categorical variables
lbd3 = LabelEncoder()
data['Sex'] = lbd3.fit_transform(data['Sex'])
test['Sex'] = lbd3.transform(test['Sex'])

# add total family members
data['SibSp'] = data['SibSp'] + data['Parch'] + 1
test['SibSp'] = test['SibSp'] + test['Parch'] + 1

# perform standard scaling (reshape(-1, 1) instead of hard-coding the row count)
for x in ['Age', 'SibSp', 'Parch', 'Fare']:
    StndSc = StandardScaler()
    data[x] = StndSc.fit_transform(data[x].values.reshape(-1, 1))
    test[x] = StndSc.transform(test[x].values.reshape(-1, 1))

X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# train the gradient boosted classifier
gdb = GradientBoostingClassifier(n_estimators=250, max_depth=3)
gdb.fit(X, y)

# draw predictions
pred = gdb.predict(test)

# configure the submission file
df = pd.DataFrame({'PassengerId': Id, 'Survived': pred})
df.to_csv("H://Submissions_Titanic.csv", index=False)
                           n_informative=18, n_redundant=2, n_classes=2,
                           n_clusters_per_class=3, random_state=2017)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

# train directly on the raw features, without generating new ones
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
print("Original features")
print("GBDT_ACC: {:.6f}".format(acc))
print("GBDT_AUC: {:.6f}".format(auc))

# generate new features: apply() returns, for every sample, the index of
# the leaf it lands in for each tree
X_train_leaves = clf.apply(X_train)[:, :, 0]
X_test_leaves = clf.apply(X_test)[:, :, 0]

# stack X_train_leaves and X_test_leaves along axis=0 before one-hot encoding
All_leaves = np.r_[X_train_leaves, X_test_leaves]

# each column of the leaf-index matrix is categorical rather than 0/1, so
# it needs a OneHotEncoder pass, sketched below
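# Continuation sketch of the leaf-feature trick set up above: one-hot
# encode the leaf indices and feed them to a linear model (hedged: the
# LogisticRegression step is the usual GBDT+LR pattern, not code from the
# original snippet; All_leaves / X_train_leaves come from above).
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

enc = OneHotEncoder(categories='auto')
enc.fit(All_leaves)                      # fit on train+test leaves together
X_train_onehot = enc.transform(X_train_leaves)
X_test_onehot = enc.transform(X_test_leaves)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_onehot, y_train)
y_prob_lr = lr.predict_proba(X_test_onehot)[:, 1]
print("GBDT+LR_AUC: {:.6f}".format(roc_auc_score(y_test, y_prob_lr)))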
y_pred1 = SVM.predict(X_test)
print(classification_report(y_test, y_pred1))
print(accuracy_score(y_test, y_pred1))

## Random Forest Classifier
print("Random Forest Classifier results are as follows")
rfc = RandomForestClassifier(n_estimators=200, max_depth=4)
rfc.fit(X_train, y_train)
y_pred2 = rfc.predict(X_test)
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))

# Neural Network
print("Neural Network Classifier results are as follows")
mlp = MLPClassifier(max_iter=500)
mlp.fit(X_train, y_train)
y_pred3 = mlp.predict(X_test)
print(classification_report(y_test, y_pred3))
print(accuracy_score(y_test, y_pred3))

# GradientBoosting Classifier
print("GradientBoosting Classifier results are as follows")
grd = GradientBoostingClassifier()
grd.fit(X_train, y_train)
y_pred4 = grd.predict(X_test)
print(classification_report(y_test, y_pred4))
print(accuracy_score(y_test, y_pred4))
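# The fit/predict/report blocks above repeat one pattern; an equivalent
# loop (hedged sketch: same models and metrics, and the same
# X_train/y_train/X_test/y_test variables as the snippet):
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

for name, model in [
        ("Random Forest", RandomForestClassifier(n_estimators=200, max_depth=4)),
        ("Neural Network", MLPClassifier(max_iter=500)),
        ("Gradient Boosting", GradientBoostingClassifier())]:
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print("%s Classifier results are as follows" % name)
    print(classification_report(y_test, y_hat))
    print(accuracy_score(y_test, y_hat))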
# current best: GBC (stochastic gradient boosting)
############################################################
# visualize the model-comparison results after standardization
_Fig = plt.figure()
_Fig.suptitle(t="STANDARDSCALED ALGORITHM COMPARISON")
_Ax = _Fig.add_subplot(111)
plt.boxplot(x=_STANDARDSCALED_ALGORITHM_CMP_RESULT_LIST)
_Ax.set_xticklabels(labels=_STANDARDSCALED_MODELS.keys())
plt.show()

################################################################################
# gradient-boosting prediction after standardization
_STANDARDSCALED_GBC_MODEL = GBC()
_STANDARDSCALED_GBC_SCALER = preprocessing.StandardScaler().fit(X=_X_TRAIN)
_STANDARDSCALED_GBC_MODEL.fit(
    X=_STANDARDSCALED_GBC_SCALER.transform(X=_X_TRAIN), y=_Y_TRAIN)
_STANDARDSCALED_GBC_PREDICTIONS = _STANDARDSCALED_GBC_MODEL.predict(
    X=_STANDARDSCALED_GBC_SCALER.transform(X=_X_VAL))
print(
    "GBC prediction results after standardization:\n",
    # " " * 4,
    "ACCURACY_SCORE:\n",
    " " * 8, metrics.accuracy_score(y_true=_Y_VAL,
                                    y_pred=_STANDARDSCALED_GBC_PREDICTIONS), "\n",
    # " " * 4,
    "CONFUSION_MATRIX:\n",
    metrics.confusion_matrix(y_true=_Y_VAL,
                             y_pred=_STANDARDSCALED_GBC_PREDICTIONS), "\n",
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(accuracy_rf)

######## Trying Gradient Boost ######
gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train, y_train)

y_pred_gbc = gbc.predict(X_test)
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print(accuracy_gbc)

############## Model evaluation ##############
# confusion matrices were displayed as notebook cell output in the original
confusion_matrix(y_test, y_pred_lr)
confusion_matrix(y_test, y_pred_rf)
# y_hat = rfc.predict(test_x)  # get output for predicted test_x data
#
# # Format to correct output CSV file
# y_hat_cols = np.reshape(y_hat, (y_hat.shape[0], 1))
# index = np.arange(y_hat.shape[0]).reshape((y_hat.shape[0], 1))
# csv_output = np.concatenate((index, y_hat_cols), axis=1)
# np.savetxt("output_predictions.csv", csv_output, fmt="%s", delimiter=',')

#################### SVM #########################
# svc = SVC(gamma='auto')
# svc.fit(train_x, train_y)
# y_hat = svc.predict(test_x)
#
# # Format to correct output CSV file
# y_hat_cols = np.reshape(y_hat, (y_hat.shape[0], 1))
# index = np.arange(y_hat.shape[0]).reshape((y_hat.shape[0], 1))
# csv_output = np.concatenate((index, y_hat_cols), axis=1)
# np.savetxt("output_predictions.csv", csv_output, fmt="%s", delimiter=',')

#################### GBC #########################
gbc = GBC(learning_rate=0.2, n_estimators=200)
gbc.fit(train_x, train_y)
y_hat = gbc.predict(test_x)

# Format to correct output CSV file
y_hat_cols = np.reshape(y_hat, (y_hat.shape[0], 1))
index = np.arange(y_hat.shape[0]).reshape((y_hat.shape[0], 1))
csv_output = np.concatenate((index, y_hat_cols), axis=1)
np.savetxt("output_predictions.csv", csv_output, fmt="%s", delimiter=',')
print("DONEEEE !!!!!!!")
    trainData = dataTrain.values[:, 1:]
    trainLabel = dataTrain.values[:, 0]
    preData = dataPre.values[:, :]
    return trainData, trainLabel, preData

# Data
X, y, _ = opencsv()
# X = X[:int(0.2*len(X)), :]
# y = y[:len(X)]
splitRatio = 0.9
X_train = X[:int(splitRatio * len(X)), :]
y_train = y[:int(splitRatio * len(X))]
X_test = X[int(splitRatio * len(X)):, :]
y_test = y[int(splitRatio * len(X)):]
print('Data split')

# (an up-to-date version of this commented-out search is sketched below)
# param_search_n_estimators = {'n_estimators': range(20, 300, 20)}
# gsearch_gbct = GridSearchCV(GradientBoostingClassifier(), param_grid=param_search_n_estimators, scoring='accuracy', iid=False, cv=5)
gbct = GradientBoostingClassifier(n_estimators=200, subsample=.1)
gbct.fit(X_train, y_train)
print('Fit finished')
# print(gsearch_gbct.grid_scores_, gsearch_gbct.best_params_, gsearch_gbct.best_score_)
y_pre = gbct.predict(X_test)
print(accuracy_score(y_test, y_pre))  # y_true first, then y_pred
# time.clock() was removed in Python 3.8; assumes c_time was captured the same way
print('Time cost: ', time.perf_counter() - c_time)
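# A current-sklearn version of the grid search commented out above
# (hedged: drops the iid= argument, which was deprecated and then removed;
# grid_scores_ is likewise replaced by cv_results_):
from sklearn.model_selection import GridSearchCV

param_search_n_estimators = {'n_estimators': range(20, 300, 20)}
gsearch_gbct = GridSearchCV(GradientBoostingClassifier(),
                            param_grid=param_search_n_estimators,
                            scoring='accuracy', cv=5)
# gsearch_gbct.fit(X_train, y_train)
# print(gsearch_gbct.best_params_, gsearch_gbct.best_score_)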
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print('now we read data from folder: %s' % feat_folder)
    # start cross-validation
    print('now we need to generate cross_validation')
    accuracy_cv = []
    for i in range(0, 2):
        print('this is run %d of the cross-validation' % (i + 1))
        testIndex = loadCVIndex("%s/test.run%d.txt" % ("../data/feat/combine", (i + 1)))
        if param['task'] in ['regression']:
            # xgboost reads the svmlight-format dump for this CV run directly
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']), watchlist)
            pred = bst.predict(valid_data)
        elif param['task'] in ['clf_skl_lr']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data, train_label)
            pred = clf.predict(test_data)
        elif param['task'] == "reg_skl_rf":
            # regression with sklearn random forest regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(train_data, train_label)  # fixed: originally fit against test_label
            pred = rf.predict(test_data)
        elif param['task'] == "reg_skl_etr":
            # regression with sklearn extra trees regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(train_data, train_label)  # fixed: originally fit against test_label
            pred = etr.predict(test_data)
        elif param['task'] in ['reg_skl_gbm']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')  # leftover from the dataframe-based variant kept below
            gbm.fit(train_data, train_label)
            pred = gbm.predict(test_data)
        elif param['task'] in ['reg_skl_ridge']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data, train_label)
            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt" % ("../data/feat/combine", (i + 1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank, cdf)
            print(pred)
        """
        elif param['task'] in ['regression']:
        elif param['task'] in ['reg_skl_gbm']:
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names], train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names], train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        # use accuracy to evaluate the model on this fold
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print("the model accuracy: %s" % acc)
        accuracy_cv.append(acc)
    # aggregate across the CV runs
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the mean accuracy: %.6f' % accuracy_cv_mean)
    return {'loss': -accuracy_cv_mean,
            'attachments': {'std': accuracy_cv_std},
            'status': STATUS_OK}
X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# 'bdeviance' and LogOddsEstimator date from old scikit-learn releases; a
# current-sklearn equivalent is sketched below
params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6,
          'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)

clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

clf.feature_importances_  # displayed as notebook cell output in the original

print("Mean Squared Error")
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
print()

params = clf.get_params()
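# Modern-sklearn equivalent of the params dict above (hedged: current
# versions spell binomial deviance loss='log_loss' and pick the init
# estimator automatically, so LogOddsEstimator is no longer needed):
from sklearn.ensemble import GradientBoostingClassifier

params_modern = {'n_estimators': 5, 'max_depth': 6,
                 'learning_rate': 0.1, 'loss': 'log_loss'}
clf_modern = GradientBoostingClassifier(**params_modern)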
X_train, X_test, y_train, y_test = train_test_split(df_train, is_promoted,
                                                    test_size=0.3)
# y_train = y_train.reshape(-1, 1)
# y_test = y_test.reshape(-1, 1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# note: the model is fitted on the full df_train, so the split above is unused
gboost = GradientBoostingClassifier(max_depth=5, n_estimators=200)
gboost.fit(df_train, is_promoted)
result = gboost.predict(df_test)
print('lol')

# WNS_solution = open('WNS_solution.csv', 'w')
WNS_solution = pd.DataFrame()
WNS_solution['employee_id'] = emp_id
result = pd.DataFrame(result)
WNS_solution['is_promoted'] = result
WNS_solution.to_csv('WNS_solution4.csv', index=False)

print('lol')
data = pd.read_csv("D:/Sai/JavaDoc/Cousera/5/2/gbm-data.csv") y = data[data.columns[1]].values x = data[data.columns[1:]].values x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42) ls = [1, 0.5, 0.3, 0.2, 0.1] for i in ls: clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=i) clf.fit(x_train, y_train) qual_test = clf.staged_decision_function(x_test) qual_train = clf.staged_decision_function(x_train) predict = clf.predict(x_test) pred_trans = 1 / (1 + math.exp(-predict)) plt.figure() plt.plot(test_loss, 'r', linewidth=2) plt.plot(train_loss, 'g', linewidth=2) plt.legend(['test', 'train'])