def eval_fn(params):
    model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
    score = 0
    n_estimators = 0
    for tr, va in skf:
        X_tr, y_tr = X_train[tr], y_train[tr]
        X_va, y_va = X_train[va], y_train[va]
        model.set_params(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                  early_stopping_rounds=50, verbose=False)
        score += model.best_score
        n_estimators += model.best_iteration
    score /= n_folds
    n_estimators /= n_folds
    n_estimators_lst.append(n_estimators)
    result_str = "train:%.4f ntree:%5d " % (score, n_estimators)
    if X_valid is not None:
        model.n_estimators = n_estimators
        model.fit(X_train, y_train)
        pr = model.predict_proba(X_valid)[:, 1]
        sc_valid = log_loss(y_valid, pr)
        score_valid.append(sc_valid)
        result_str += "valid:%.4f" % sc_valid
    if verbose:
        print result_str
    return score
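A minimal sketch of the module-level context eval_fn assumes (fold iterator, hyperparameter defaults, and bookkeeping lists). The names mirror the snippet, but every value below is an illustrative assumption, not the original configuration, and stand-in data replaces the author's dataset.

from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

X_train, y_train = make_classification(n_samples=1000, random_state=0)  # stand-in data
n_folds = 5
n_estimators_max = 5000        # assumed cap; early stopping picks the effective count
learning_rate, seed, verbose = 0.05, 0, True
skf = list(StratifiedKFold(n_splits=n_folds, shuffle=True,
                           random_state=seed).split(X_train, y_train))  # (train, valid) index pairs
n_estimators_lst, score_valid = [], []
X_valid = y_valid = None       # set these to also score a held-out set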
def job_function(params):
    learning_rate = params[0]
    max_depth = params[1]
    ss_cs = params[2]
    gamma = params[3]
    min_child_weight = params[4]
    reg_lambda = params[5]
    reg_alpha = params[6]

    early_stopping_rounds = 25
    if learning_rate >= 0.3:
        early_stopping_rounds = 5
    if learning_rate <= 0.03:
        early_stopping_rounds = 50

    scores = []
    for i in range(iterations_per_job):
        X_train = Xy[i][0]
        X_test = Xy[i][1]
        y_train = Xy[i][2]
        y_test = Xy[i][3]
        y_train2 = le.transform(y_train)
        y_test2 = le.transform(y_test)

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000,
                            objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs,
                            gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True,
                            reg_lambda=reg_lambda, reg_alpha=reg_alpha)
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2,
                early_stopping_rounds=early_stopping_rounds, verbose=False)
        y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
        score = calculate_score(y_predicted, y_test2)
        scores.append(score)

    avg_score = np.array(scores).mean()
    print(avg_score, params)
    return avg_score
def main(training_data, test_data):
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()
    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']),
                          index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                              objective='multi:softprob', subsample=0.5,
                              colsample_bytree=0.5, seed=0)
    xgb_model.fit(pred_df.as_matrix(), target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.as_matrix())

    # Selecting the top 5 most likely for each respondent and stacking.
    # This section is VERY slow and could use being optimized
    model_probs = pd.DataFrame(preds, index=test_pred.index, columns=labels.classes_)
    stacked_probs = pd.DataFrame()  # start empty: DataFrames are concatenated row-wise below
    for i in model_probs.index:
        temp = model_probs.loc[i, :]
        temp_sort = pd.DataFrame(temp.sort_values(ascending=False)[:5].index)
        temp_sort['id'] = i
        temp_sort.columns = ['country', 'id']
        stacked_probs = pd.concat([stacked_probs, temp_sort])

    # # Selecting classes with highest probabilities, compiling into list
    # ids = []
    # cts = []
    # test_ids = pd.Series(test_data.index)
    # for i in range(len(test_ids)):
    #     idx = test_data.index[i]
    #     ids += [idx] * 5
    #     cts += labels.inverse_transform(np.argsort(model_probs[i])[::-1])[:5].tolist()
    #
    # predictions = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output
def xgboostinitial_predictor(train_path, test_path, eval_path):
    # Loading the data
    print 'Loading the data...'
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    # Training model
    print 'Model training begins...'
    # xgtrain = xgb.DMatrix(train.values, target.values, missing=np.nan)
    # xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'logloss',
    #                   'eta': 0.01, 'subsample': 0.5, 'colsample_bytree': 0.5, 'max_depth': 10,
    #                   'silent': 0}
    # xgb_model = xgb.train(xgboost_params, xgtrain, learning_rates=0.3)
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                              objective='binary:logistic', subsample=0.5, colsample_bytree=0.5,
                              seed=0)
    xgb_model.fit(train.as_matrix(), target.tolist())

    # Running the model
    print 'Making predictions....'
    # xgtest = xgb.DMatrix(test.values)
    # xgeval = xgb.DMatrix(eval_df)
    test_preds = xgb_model.predict_proba(test.as_matrix())
    eval_preds = xgb_model.predict_proba(eval_df.as_matrix())

    print 'Cleaning predictions to match expected format....'
    test_output = pd.DataFrame(test_preds, index=test.index)
    print test_output.columns
    test_output = test_output[[1]]  # keep the positive-class column as a one-column DataFrame
    test_output.columns = ['PredictedProb']
    eval_output = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = eval_output[[1]]
    eval_output.columns = ['PredictedProb']

    return test_output, eval_output
def objective(space):
    clf = XGBClassifier(n_estimators=int(space['n_estimators']),
                        objective='binary:logistic',
                        seed=37,
                        learning_rate=space['learning_rate'],
                        max_depth=space['max_depth'],
                        min_child_weight=space['min_child_weight'],
                        colsample_bytree=space['colsample_bytree'],
                        subsample=space['subsample'])
    clf.fit(xTrain, yTrain, eval_metric="logloss")
    pred = clf.predict_proba(xValid)[:, 1]
    loss = log_loss(yValid, pred)
    return {'loss': loss, 'status': STATUS_OK}
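A minimal sketch of the hyperopt search space this objective consumes; the parameter ranges below are illustrative assumptions, not the original author's values.

from hyperopt import hp

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),     # objective casts to int
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7, 8]),        # ints, as XGBClassifier expects
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
}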
def main():
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']

    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']

    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)

    prediction = clf.predict_proba(X_test)
    columns = ['Prediction' + str(i) for i in range(1, 10)]
    prediction = pd.DataFrame(prediction, columns=columns)
    results = pd.concat([Id, prediction], axis=1)

    return (clf, results)
def myThreadFunc(ThreadID):
    X_train = Xy[ThreadID][0]
    X_test = Xy[ThreadID][1]
    y_train = Xy[ThreadID][2]
    y_test = Xy[ThreadID][3]
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)

    clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000,
                        objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs,
                        gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True,
                        reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2,
            early_stopping_rounds=early_stopping_rounds, verbose=False)
    y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
    score = calculate_score(y_predicted, y_test2)
    print(score, clf.booster().best_ntree_limit)

    train_and_test_scores[ThreadID] = score
def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """
    Ensembler based on xgboost Gradient boosting.
    """
    # Loading data
    X, X_test, n_preds, n_class = get_X_X_Test(valid_folder, test_folder)
    y = y_valid

    # Defining classifier
    xgb = XGBClassifier(max_depth=4, learning_rate=0.05, n_estimators=200,
                        objective='multi:softprob', gamma=0., max_delta_step=0.,
                        subsample=0.9, colsample_bytree=0.9, seed=0)
    xgb.fit(X, y)
    y_pred = xgb.predict_proba(X_test)
    return y_pred
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print 'XGB vote %d' % i
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB, learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB, objective='multi:softprob',
            subsample=SUBSAMPLE_XGB, colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print 'RandomForest vote %d' % i
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF, criterion=CRITERION_RF, n_jobs=JOBS_RF,
            max_depth=DEPTH_RF, min_samples_leaf=MIN_LEAF_RF, bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions
#X_val_pca = pca.transform(norm_X_val)
#X_test_pca = pca.transform(norm_X_test)
###
#X_val = X_val_pca
#norm_X_val = X_val_pca
#X_train = X_train_pca
#norm_X_train = X_train_pca
#X_test = X_test_pca
#norm_X_test = X_test_pca

#+++++++++++++++++++++++++++++++++++++Classifier+++++++++++++++++++++++++++++++++++++++++++++++++++++++
#boosting
print 'start boosting'
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5,
                    seed=0).fit(X_train, y_train)
y_pred_boosting = xgb.predict_proba(X_test)
if val_num != 0:
    y_pred_boosting_val = xgb.predict_proba(X_val)

##randomforest
#print 'start random forest'
#clf_randforest = RandomForestClassifier().fit(X_train, y_train)
#y_pred_randforest = clf_randforest.predict_proba(X_test)
#if val_num != 0:
#    y_pred_randforest_val = clf_randforest.predict_proba(X_val)

##bagging
#print 'start bagging'
#clf2 = BaggingClassifier(n_estimators=100).fit(X_train, y_train)
#y_pred_bagging = clf2.predict_proba(X_test)
#y_pred_bagging_val = clf2.predict_proba(X_val)
del Training["Train"] lr=LogisticRegression(n_jobs=4) xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=80,objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0) #lr.fit(Training,y) xgb.fit(Training,y) testIds=Testing["id"] del Testing["country_destination"] del Testing["id"] del Testing["Train"] Testing = Testing.fillna(-1) ypred = xgb.predict_proba(Testing) F=(pd.DataFrame(ypred))[[0,1,2,3,4,5,6,7,8,9,10,11]].idxmax(axis=1) Fst=pd.DataFrame(F,columns=['idm']) summary=Fst.groupby(['idm']) print summary['idm'].aggregate(len) idList = [] #list of ids cts = [] #list of countries i=0 for idx in testIds: cts += le.inverse_transform(np.argsort(ypred[i])[::-1])[:5].tolist() idList += [idx] * 5 i=i+1
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID', 'TARGET'], axis=1).values
y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6815,
    colsample_bytree=0.701,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)

xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics=['auc'], early_stopping_rounds=50,
                  show_progress=False)
xgb1.set_params(n_estimators=cvresult.shape[0])

xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({"ID": y_test, "TARGET": output})
submission.to_csv("submission.csv", index=False)
print 'Finished Reconstructing Train/Test Sets'
print data_train.shape
print data_test.shape

print 'Started Computing train set labels'
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print 'Finished computing train set labels'

# fit estimator
print "start XGBClassifier"
n_samples = data_train.shape[0]
est = XGBClassifier(n_estimators=200, learning_rate=0.1, silent=False)
print "start fitting"
est.fit(data_train, label_set)

# predict class labels
probs = est.predict_proba(data_test)

print "cross validation start"
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)

mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print "Test predicted Mean:", mean
print "Test predicted STD:", std

df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]
df.index += 1
df.to_csv("output_prediction.csv", index_label="Id")
                    max_depth=5)

for pdepth in range(30, 44, 2):
    clf = XGBClassifier(objective='binary:logistic', silent=1, seed=215,
                        learning_rate=0.05, gamma=0., colsample_bytree=0.8, subsample=0.8,
                        base_score=0.5, max_delta_step=0, min_child_weight=6, max_depth=pdepth)
    clf.fit(X_train, Y_train)
    predict_X_test = clf.predict_proba(X_test)
    y_score = []
    for each in predict_X_test:
        y_score.append(each[1])
    print metrics.roc_auc_score(Y_test, y_score)
'''
row = []
column = []
element = []
name = []
source = open("testfeature_TFIDF.txt", "rb")
i = 0
for line in source:
    data = line.split(",")
    length = (len(data) - 1) / 2
    for j in range(0, length):
num_rounds = 206
z = []
dtrain = xgb.DMatrix(train[features], label=y)
clf = xgb.train(params, dtrain, num_rounds)
importance = clf.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
bst = list(df['feature'][df.fscore > 0.001])
#df.to_csv('select.csv', index=False)

X_train, X_valid, y_train, y_valid = train_test_split(train[bst], y, test_size=0.6, random_state=10)
print('start xgboost learning...')
alg = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=1210,
                    objective='multi:softprob', subsample=0.8, colsample_bytree=1,
                    min_child_weight=1)
alg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='mlogloss', early_stopping_rounds=10, verbose=True)

#plt.figure()
#df.plot()
#df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
#plt.title('XGBoost Feature Importance')
#plt.xlabel('relative importance')
#plt.gcf().savefig('feature_importance_xgb.png')

y_pred = alg.predict_proba(test[bst])
result = pd.DataFrame(y_pred, columns=['predict_0', 'predict_1', 'predict_2'])
result['id'] = test.id.values.copy()
#result.to_csv('xgb10.csv', index=False)
                      colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
                      max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
                      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
                      subsample=0.8)
model.fit(X_train, y_train)  # train the model

test_pred_xgb = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1
fpr_xgb, tpr_xgb, threshold = metrics.roc_curve(y_test, test_pred_xgb)
auc = metrics.auc(fpr_xgb, tpr_xgb)
score = metrics.accuracy_score(y_test, model.predict(X_test))  # takes true and predicted labels
print([score, auc])  # accuracy and AUC

precision_xgb, recall_xgb, thresholds = precision_recall_curve(
    y_test, test_pred_xgb)
pr_xgb = pd.DataFrame({"precision": precision_xgb, "recall": recall_xgb})
prc_xgb = pr_xgb[pr_xgb.precision >= 0.97].recall.max()
print(prc_xgb)  # maximum recall subject to precision >= 0.97

importance = model.feature_importances_
indices = np.argsort(importance)[::-1]  # np.argsort() returns ascending order; [::-1] reverses it
features = X_train.columns
for f in range(X_train.shape[1]):
    print("%2d) %3d %20s (%.4f)" %
def xgboost_algorithm(XTrain, YTrain, XTest):
    xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                        objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb.fit(XTrain, YTrain)
    y_pred_xgboost = xgb.predict_proba(XTest)
    return y_pred_xgboost
"shot_type","shot_zone_area","shot_zone_basic","shot_zone_range", "matchup","opponent","game_date","shot_distance","minutes_remaining","seconds_remaining", "loc_x","loc_y"] for col in cols: data_x=pd.concat([data_x,pd.get_dummies(data[col],prefix=col),],axis=1) train_x=data_x[-pd.isnull(data.shot_made_flag)] test_x=data_x[pd.isnull(data.shot_made_flag)] train_y=data.shot_made_flag[-pd.isnull(data.shot_made_flag)] clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550, subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(train_x, train_y) y_pred = clf.predict(train_x) print("Number of mislabeled points out of a total %d points : %d" % (train_x.shape[0],(train_y != y_pred).sum())) def logloss(act, pred): epsilon = 1e-15 pred = sp.maximum(epsilon, pred) pred = sp.minimum(1-epsilon, pred) ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred))) ll = ll * -1.0/len(act) print(ll) return ll logloss(train_y,clf.predict_proba(train_x)[:,1]) test_y=clf.predict_proba(test_x)[:,1] test_id=data[pd.isnull(data.shot_made_flag)]["shot_id"] submission=pd.DataFrame({"shot_id":test_id,"shot_made_flag":test_y}) submission.to_csv("submissson_1.csv",index=False)
def xgb_cls(self, testlen, ntrain, lengths, timesteps, day, tr, attr, attry, modellabel, readfile):
    if attr == 'raw':
        hsmadata_x = self.hsmadata_raw_x(timesteps)
    elif attr == 'ta':
        hsmadata_x = self.hsmadata_ta_x(lengths)
    else:
        print('Wrong Attr!')

    if attry == 'roc':
        hsmadata_y = self.hsmadata_roc(day)
    elif attry == 'roo':
        hsmadata_y = self.hsmadata_roo(day)
    else:
        print('Wrong Attr_y!')

    hsmadata = pd.merge(hsmadata_y, hsmadata_x)
    dates = pd.Series(hsmadata['date'].unique()).sort_values()
    dates.index = range(0, len(dates))
    ntest = len(dates) // testlen

    filename = 'testresult\\futurexgboost\\hsma_xgb_cls_testlen' + \
        str(testlen) + '_attr' + str(attr) + '_attry' + str(attry) + \
        '_tr' + str(tr) + '_timesteps' + str(timesteps) + '_day' + str(day) + \
        '_' + modellabel + '_' + self.label + '.h5'
    if readfile:
        if os.path.exists(filename):
            hsma = pd.read_hdf(filename, 'hsma')
        else:
            hsma = pd.DataFrame()
    else:
        hsma = pd.DataFrame()

    for i in range(ntrain, ntest):
        traindata = hsmadata[
            (hsmadata['date'] >= dates[(i - ntrain) * testlen])
            & (hsmadata['date'] <= dates[i * testlen - day - 1])].copy()
        testdata = hsmadata[(hsmadata['date'] >= dates[i * testlen])
                            & (hsmadata['date'] < dates[(i + 1) * testlen])].copy()
        startdate = dates[i * testlen]
        enddate = testdata.date.max()
        if hsma.shape[0] > 0:
            if startdate <= hsma.date.max():
                continue
        print(enddate)

        # Reshape the dataset into the required format
        traindatax = traindata.drop(['date', 'code', 'ratio'], 1)
        testdatax = testdata[traindatax.columns]
        traindatay_long = traindata['ratio'].copy()
        traindatay_long[traindata['ratio'] >= tr] = 1
        traindatay_long[traindata['ratio'] < tr] = 0
        traindatay_short = traindata['ratio'].copy()
        traindatay_short[traindata['ratio'] <= -tr] = 1
        traindatay_short[traindata['ratio'] > -tr] = 0
        # Feature selection could be added here

        # Build the models and predict
        # xgboost sklearn api
        if modellabel == 'xgb':
            xclas = XGBClassifier(max_depth=10, learning_rate=0.1)  # objective='multi:softmax'
            xclas.fit(traindatax, traindatay_long)
            testdata['pred_long'] = xclas.predict(testdatax)
            testdata['prob_long'] = xclas.predict_proba(testdatax)[:, 1]
            xclas = XGBClassifier(max_depth=10, learning_rate=0.1)
            xclas.fit(traindatax, traindatay_short)
            testdata['pred_short'] = xclas.predict(testdatax)
            testdata['prob_short'] = xclas.predict_proba(testdatax)[:, 1]
        else:
            pass

        if i == ntrain:
            hsma = testdata[['code', 'date', 'ratio', 'pred_long', 'prob_long',
                             'pred_short', 'prob_short']].copy()
        else:
            hsma = pd.concat([hsma, testdata[['code', 'date', 'ratio', 'pred_long', 'prob_long',
                                              'pred_short', 'prob_short']]], ignore_index=True)

    if readfile:
        hsma.to_hdf(filename, 'hsma')

    return hsma
    # (tail of the draw_samples helper used in the loop below)
    all_negative_indexes = np.where(train_y == 0)[0]
    all_positive_indexes = np.where(train_y == 1)[0]
    negative_indexes = np.random.choice(all_negative_indexes, size=negative_no, replace=False)
    positive_indexes = np.random.choice(all_positive_indexes, size=positive_no, replace=False)
    indexes = np.concatenate((negative_indexes, positive_indexes))
    np.random.shuffle(indexes)
    return train_x[indexes], train_y[indexes]

for training_sample_size in TRAINING_SAMPLE_SIZES:
    for _ in range(TRAINING_FOR_EACH_SIZE):
        sampled_x, sampled_y = draw_samples(training_sample_size)
        best_params = rand_search[MONTHS].best_params_
        best_params['random_state'] = 1
        best_params['n_jobs'] = N_JOBS
        clf = XGBClassifier(**best_params)
        clf.fit(sampled_x, sampled_y, verbose=True)
        pred_y = clf.predict_proba(test_x)
        auc_score = roc_auc_score(test_y, pred_y[:, 1])
        log_score = log_loss(test_y, pred_y)
        logging.info('{}, {}, {}'.format(training_sample_size, auc_score, log_score))
trials = Trials()
bestParams = fmin(fn=objective, space=space, algo=tpe.suggest,
                  max_evals=100, trials=trials)

clf = XGBClassifier(**bestParams)
clf.seed = 37
clf.fit(xTrain, yTrain, eval_metric='logloss')

# Checking classifier predictions on the validation data.
print "Log loss: %f" % log_loss(yValid, clf.predict_proba(xValid))

# Prediction
testDf = pd.read_csv(TEST_FILENAME)
testX = testDf.drop(ID_COL, axis=1)
testX = poly.transform(testX)
testX = scaler.transform(testX)  # transform only: the scaler was fit on the training data
testY = clf.predict_proba(testX)
testDf[LABEL_COL] = testY[:, 1]

currentDt = dt.datetime.now().isoformat()
outputFilename = '../output/submission' + currentDt + '.csv'
testDf.to_csv(outputFilename, columns=(ID_COL, LABEL_COL), index=False)
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts),
                            left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0]/tmp
    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]
    else:
        n_places_th_local = 2
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]

    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)

    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000,
                                objective='multi:softprob', subsample=ss, colsample_bytree=cs,
                                gamma=gamma, min_child_weight=min_child_weight,
                                reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=10 if one_cell == 1 else False,
                                show_stdv=False, folds=folds, feval=calculate_score)

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                            n_estimators=n_estimators, objective='multi:softprob',
                            subsample=ss, colsample_bytree=cs, gamma=gamma,
                            min_child_weight=min_child_weight,
                            reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)

        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)
                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)
                score = np.array(scores_local).mean()
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)
                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)
                    score = np.array(scores_local).mean()
                    print(' ', x_start, y_start, score)
                    scores_cv.append(score)
                score = np.array(scores_cv).mean()

    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)  ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:, ::-1][:, :n_topx])

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]
                     min_child_weight=4,
                     gamma=0,
                     subsample=0.9,
                     colsample_bytree=0.6,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     reg_alpha=0.5)

xgb2.fit(X_train[train], y_train[train],
         eval_set=[(X_train[val], y_train[val])],
         early_stopping_rounds=5, eval_metric='auc', verbose=False)

y_pred = xgb2.predict(X_train[val])
scores = accuracy_score(y_train[val], y_pred)
print("val acc: %.2f%%" % (scores * 100))
cvscores.append(scores * 100)
print(" - train acc: %.2f%% (std: %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

y_pred = xgb2.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(" - test acc: %.2f%%" % (score * 100))
y_predprob = xgb2.predict_proba(X_test)[:, 1]
print(" - test auc score: %f" % metrics.roc_auc_score(y_test, y_predprob))
print(" - confusion matrix: ")
print(metrics.confusion_matrix(y_test, y_pred))
def model1(df_train, df_test):
    print('model1')
    print('rows', df_train.shape[0])

    #remove rows with no sessions data
    hassessions = df_train['HasSessions']
    df_train = df_train.drop(hassessions[hassessions == 0].index)

    #remove rows older than 1/1/2014
    #dac2 = df_train.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    #print('removing rows', len(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index))
    #df_train = df_train.drop(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index)
    print('rows', df_train.shape[0])

    labels = df_train['country_destination'].values
    df_train = df_train.drop(['country_destination'], axis=1)
    piv_train = df_train.shape[0]

    #Creating a DataFrame with train+test data
    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    #Removing id and date_first_booking
    df_all = df_all.drop(['id', 'date_first_booking', 'sessions_count', 'HasSessions'], axis=1)
    #Filling nan
    df_all = df_all.fillna(-1)

    #####Feature engineering#######
    print('features in the csv', df_all.shape[1])

    #date_account_created
    print('dac', datetime.now())
    dac = np.vstack(df_all.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))).values)
    df_all['dac_year'] = dac[:, 0]
    df_all['dac_month'] = dac[:, 1]
    df_all['dac_day'] = dac[:, 2]

    #day of week, season
    print('dac2', datetime.now())
    dac2 = df_all.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
    df_all['dac_season'] = dac2.apply(calculate_season)
    df_all = df_all.drop(['date_account_created'], axis=1)

    #timestamp_first_active
    print('tfa', datetime.now())
    tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(
        lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]]))).values)
    df_all['tfa_year'] = tfa[:, 0]
    df_all['tfa_month'] = tfa[:, 1]
    df_all['tfa_day'] = tfa[:, 2]
    df_all = df_all.drop(['timestamp_first_active'], axis=1)

    #Age
    print('age', datetime.now())
    av = df_all.age.values
    df_all['age'] = np.where(np.logical_or(av < 14, av > 100), -1, av)

    #remove features
    print('remove features', datetime.now())
    df_all = df_all.drop(['Sessions' + str(i) for i in [0]], axis=1)
    df_all = df_all.drop(['SessionsD' + str(i) for i in range(456)], axis=1)
    print('features in the model', df_all.shape[1])

    #One-hot-encoding features
    print('one-hot', datetime.now())
    ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
                 'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                 'first_device_type', 'first_browser', 'dac_season', 'sessions_preferred_device']
    for f in ohe_feats:
        df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
        df_all = df_all.drop([f], axis=1)
        df_all = pd.concat((df_all, df_all_dummy), axis=1)

    #Splitting train and test
    vals = df_all.values
    X = vals[:piv_train]
    y = labels
    X_predict = vals[piv_train:]

    #learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.03, 6, 0.5, 2, 2, 2, 1
    learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.03, 8, 0.5, 2, 1, 2, 0
    early_stopping_rounds = 25
    if learning_rate <= 0.03:
        early_stopping_rounds = 50
    print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

    #n_estimators = 455
    n_estimators = 350
    #n_estimators = 1
    print(n_estimators)

    print('fit start', datetime.now())
    clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                         n_estimators=n_estimators, objective='multi:softprob',
                         subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma,
                         min_child_weight=min_child_weight, seed=0, silent=True,
                         reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=-1)
    clf2.fit(X, y)
    y_predicted2 = clf2.predict_proba(X_predict)
    return y_predicted2
#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# In[ ]:

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# In[ ]:

ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub0.csv', index=False)
av = train_file.age.values
train_file['age'] = np.where(np.logical_or(av < 14, av > 100), 0, av)

# One Hot Encoding
train_file_dummy = []
features = ['gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
            'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type',
            'first_browser']
for feature in features:
    train_dummy = pd.get_dummies(train_file[feature], prefix=feature)
    train_file = train_file.drop(feature, axis=1)
    train_file = pd.concat((train_dummy, train_file), axis=1)

# Train and Test data split
vals = train_file.values
train_data = vals[:piv_train]
le = LabelEncoder()
train_labels = le.fit_transform(labels)
test_data = vals[piv_train:]

# Train the Classifier.
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(train_data, train_labels)
y_pred = xgb.predict_proba(test_data)

ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)
                               gamma=0.4,
                               reg_alpha=0.05,
                               reg_lambda=2,
                               subsample=1.0,
                               colsample_bytree=1.0,
                               max_delta_step=1,
                               scale_pos_weight=1,
                               objective='multi:softprob',
                               nthread=8,
                               seed=0
                               # ,
                               # silent=False
                               )

print('training...')
xgb_model.fit(training, label)

print('predicting...')
predicted = xgb_model.predict_proba(testing)
predicted = pandas.DataFrame(predicted)
predicted.columns = xgb_model.classes_

# Name index column.
predicted.index.name = 'Id'

# Write csv.
print('Saving prediction...')
predicted.to_csv('Prediction.csv')

# feature importance
feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
matplotlib.pyplot.show()

plot_importance(xgb_model, title='Feature importance')
matplotlib.pyplot.show()

plot_tree(xgb_model, num_trees=0)
matplotlib.pyplot.show()
scores2 = []
for i in range(10):
    folds = StratifiedKFold(y_train, n_folds=5, shuffle=True)
    scores = []
    for train_index, test_index in folds:
        X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
        y_train2, y_test2 = y_train[train_index], y_train[test_index]
        X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)
        X_train2 = csr_matrix(X_train2.values)
        X_test2 = csr_matrix(X_test2.values)
        clf.fit(X_train2, y_train2)
        y_pred = clf.predict_proba(X_test2)
        score = log_loss(y_test2, y_pred)
        scores.append(round(score, 6))
    scores = np.array(scores)
    score = scores.mean()
    scores2.append(score)
    print('score, std', score, scores.std())

scores = np.array(scores2)
scores = np.delete(scores, [scores.argmax(), scores.argmin()])
print('score, std', scores.mean(), scores.std())

if is_tt_rf_1 == 1:
    X_train, X_test = feature_engineering(df_train, df_test, y_train)
for name, model in models:
    print 'training ' + name + '...'
    results = cross_val_score(model, X, y, cv=kfold, scoring='log_loss')
    print name + ': ' + str(results.mean()) + ' +/- ' + str(results.std())

#above cross validation shows that ' ' is the best estimator
model = GradientBoostingClassifier(n_estimators=100, random_state=2288, max_depth=4,
                                   learning_rate=0.1, max_features=10)
model.fit(X, y)
preds = model.predict_proba(test)

submission = pd.DataFrame()
submission["shot_id"] = test.index
submission["shot_made_flag"] = preds[:, 1]
submission.to_csv("sub.csv", index=False)
'''

#xgboost
from xgboost.sklearn import XGBClassifier

clf_xgb = XGBClassifier(max_depth=7, learning_rate=0.012, n_estimators=1000,
                        subsample=0.62, colsample_bytree=0.6, seed=1)
clf_xgb.fit(X, y)
preds = clf_xgb.predict_proba(test)

submission = pd.DataFrame()
submission["shot_id"] = test.index
submission["shot_made_flag"] = preds[:, 1]
submission.to_csv("sub.csv", index=False)
def xgbost(x, y, targetx):
    clf_xgb = XGBClassifier(n_estimators=1000, max_depth=6, learning_rate=0.0075,
                            subsample=0.7, colsample_bytree=0.7, seed=4)
    clf_xgb.fit(x, y)
    return clf_xgb.predict_proba(targetx)[:, 1]
test_users_ids = test_users['id']
test_users.drop('id', axis=1, inplace=True)
test_users = test_users.fillna(-1)
x_test = test_users.values

clf = XGBClassifier(
    max_depth=7,
    learning_rate=0.18,
    n_estimators=80,
    objective="rank:pairwise",
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    missing=None,
    silent=True,
    nthread=-1,
    seed=42
)
clf.fit(x_train, encoded_y_train)
y_pred = clf.predict_proba(x_test)

generate_submission(y_pred, test_users_ids, label_encoder, name=NAME)
clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                    objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs,
                    gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True,
                    reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2)

submit = 0
if submit == 1:
    #n_estimators = 395
    n_estimators = 349
    #n_estimators = clf.booster().best_ntree_limit
    print(n_estimators)
    print('fit start', datetime.now())
    clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                         n_estimators=n_estimators, objective='multi:softprob',
                         subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma,
                         min_child_weight=min_child_weight, seed=0, silent=True,
                         reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)
    clf2.fit(X, y)
    #clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)
    y_predicted = clf2.predict_proba(X_predict)

    ids = []  #list of ids
    cts = []  #list of countries
    for i in range(len(id_test)):
        idx = id_test[i]
        ids += [idx] * 5
        cts += le.inverse_transform(calculate_top5(y_predicted[i])).tolist()

    filename = 'results' + str(n_estimators) + '.csv'
    #Generate submission
    sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    sub.to_csv(filename, index=False)
test_predictions.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/RF_pred.csv",
                        header=RF.classes_)

pred = pred.drop(['hour_5'], axis=1)
test_pred = test_pred.drop(['hour_3'], axis=1)

#XGBoost
XGB = XGBClassifier(n_estimators=15000)
XGB.fit(pred, resp)
test_predictions_XGB = pd.DataFrame(XGB.predict_proba(test_pred))
test_predictions_XGB.index += 1
test_predictions_XGB.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/XGB_pred.csv",
                            header=XGB.classes_)

output = pd.DataFrame(pred.columns)
output.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/pred.csv")
test_output = pd.DataFrame(test_pred.columns)
test_output.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/test_pred.csv")
# # Let's try a gradient boost classifier

# In[56]:

xgb_model = XGBClassifier(max_depth=3, n_estimators=10, learning_rate=0.1)
xgb_model.fit(X, y)

# ## How did we do?
#
# * To start, let's look at how well we did just predicting the final outcome

pred = xgb_model.predict_proba(X)

# Find the most probable country
best_country = []
# Not used for now
bestId = []
for i in range(len(pred)):
    bestId.append(np.argsort(pred[i])[::-1])
    best_country.append(label_table.inverse_transform(bestId[-1]))

# ## Make a scorer for the model
#
# Following the evaluation method described by the project

# In[92]:
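A minimal sketch of the scorer the empty cell above calls for, assuming the competition-style NDCG@5 metric with relevance 1 at the true class; this is an illustrative implementation, not the notebook author's code.

import numpy as np

def ndcg_at_5(class_probs, true_class):
    # Rank the five most probable classes, highest probability first.
    top5 = np.argsort(class_probs)[::-1][:5]
    for rank, cls in enumerate(top5):
        if cls == true_class:
            # Only the true class has relevance 1, so DCG = 1 / log2(rank + 2) and IDCG = 1.
            return 1.0 / np.log2(rank + 2)
    return 0.0  # true class missing from the top 5

# Example: average the per-row scores over the training predictions computed above.
# score = np.mean([ndcg_at_5(p, t) for p, t in zip(pred, y)])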
def main():
    train_users = pd.read_csv('../input/train_users_2.csv')
    predict_users = pd.read_csv('../input/test_users.csv')
    np.random.seed(100)

    user_device_type_time_df = get_user_device_type_time()
    print(type(user_device_type_time_df))
    print(user_device_type_time_df[user_device_type_time_df.total_elapsed_time > 0].head())

    train_users_combined = merge_user_and_session_data(train_users, user_device_type_time_df)
    predict_users_combined = merge_user_and_session_data(predict_users, user_device_type_time_df)

    df_train = train_users_combined
    df_test = predict_users_combined

    labels = df_train['country_destination'].values
    df_train = df_train.drop(['country_destination'], axis=1)
    id_test = df_test['id']
    piv_train = df_train.shape[0]

    #Creating a DataFrame with train+test data
    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    #Removing id and date_first_booking
    df_all = df_all.drop(['id', 'date_first_booking'], axis=1)
    #Filling nan
    df_all = df_all.fillna(-1)

    #####Feature engineering#######
    #date_account_created
    dac = np.vstack(df_all.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))).values)
    df_all['dac_year'] = dac[:, 0]
    df_all['dac_month'] = dac[:, 1]
    df_all['dac_day'] = dac[:, 2]
    df_all = df_all.drop(['date_account_created'], axis=1)

    #timestamp_first_active
    tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(
        lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]]))).values)
    df_all['tfa_year'] = tfa[:, 0]
    df_all['tfa_month'] = tfa[:, 1]
    df_all['tfa_day'] = tfa[:, 2]
    df_all = df_all.drop(['timestamp_first_active'], axis=1)

    #Age
    av = df_all.age.values
    df_all['age'] = np.where(np.logical_or(av < 14, av > 100), -1, av)

    #One-hot-encoding features
    ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
                 'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                 'first_device_type', 'first_browser', 'total_elapsed_time']
    for f in ohe_feats:
        df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
        df_all = df_all.drop([f], axis=1)
        df_all = pd.concat((df_all, df_all_dummy), axis=1)

    #Splitting train and test
    vals = df_all.values
    X = vals[:piv_train]
    le = LabelEncoder()
    y = le.fit_transform(labels)
    X_test = vals[piv_train:]

    #Classifier
    xgb = XGBClassifier(max_depth=4, learning_rate=0.25, n_estimators=43,
                        objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)
    xgb.fit(X, y)
    y_pred = xgb.predict_proba(X_test)

    #Taking the 5 classes with highest probabilities
    ids = []  #list of ids
    cts = []  #list of countries
    for i in range(len(id_test)):
        idx = id_test[i]
        ids += [idx] * 5
        cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

    #Generate submission
    sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    sub.to_csv('sub.csv', index=False)
clf = XGBClassifier(max_depth=9, learning_rate=0.1, n_estimators=1200, silent=True,
                    objective='binary:logistic', nthread=-1, gamma=0, min_child_weight=6,
                    max_delta_step=0, subsample=0.4, colsample_bytree=0.8)
clf.fit(df, label_df)

# Calculate the classifier predictions and print metrics calculations
pred = clf.predict(df)
predprob = clf.predict_proba(df)[:, 1]

print "F1 score for training set: {:.4f}.".format(
    f1_score(label_df, pred, pos_label=1.0))
print "Recall score for training set: {:.4f}.".format(
    recall_score(label_df, pred, pos_label=1.0, average='binary'))
print "ROC score for training set: {:.4f}.".format(
    roc_auc_score(label_df, pred, average='macro'))
print "ROC score proba for training set: {:.4f}.".format(
    roc_auc_score(label_df, predprob, average='macro'))

# Save the trained model
joblib.dump(clf, 'trained_models/model_2.pkl', compress=0)

tot_end = time()
print "Total calculation time {:.4f} seconds.".format(tot_end - tot_start)