Example #1
def eval_fn(params):
    model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
    score = 0
    n_estimators = 0
    for tr, va in skf:
        X_tr, y_tr = X_train[tr], y_train[tr]
        X_va, y_va = X_train[va], y_train[va]
        model.set_params(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                  early_stopping_rounds=50, verbose=False)
        score += model.best_score
        n_estimators += model.best_iteration
    score /= n_folds
    n_estimators /= n_folds
    n_estimators_lst.append(n_estimators)
    result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
    if X_valid is not None:
        model.n_estimators = int(n_estimators)
        model.fit(X_train, y_train)
        pr = model.predict_proba(X_valid)[:, 1]
        sc_valid = log_loss(y_valid, pr)
        score_valid.append(sc_valid)
        result_str += "valid:%.4f" % sc_valid
    if verbose:
        print(result_str)
    return score
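For context, an objective of this shape is usually handed to a hyperparameter optimizer that minimizes its return value. A minimal sketch with hyperopt (the search space below, and the globals eval_fn relies on such as skf and X_train, are assumptions for illustration, not part of the original):

from hyperopt import fmin, tpe, hp

# hypothetical search space; eval_fn receives one sampled dict per trial
space = {
    'max_depth': hp.choice('max_depth', [4, 6, 8, 10]),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
}
best = fmin(fn=eval_fn, space=space, algo=tpe.suggest, max_evals=50)
print(best)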
Example #2
def job_function(params):
	learning_rate = params[0]
	max_depth = params[1]
	ss_cs = params[2]
	gamma = params[3]
	min_child_weight = params[4]
	reg_lambda = params[5]
	reg_alpha = params[6]

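	# Widen the early-stopping window as the learning rate shrinks: slower
	# learners need more rounds before improvement plateaus.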
	early_stopping_rounds = 25
	if learning_rate >= 0.3:
		early_stopping_rounds = 5
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	scores = []
	for i in range(iterations_per_job):
		X_train = Xy[i][0]
		X_test = Xy[i][1]
		y_train = Xy[i][2]
		y_test = Xy[i][3]
		
		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

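		# Note: older xgboost sklearn wrappers encoded y inside fit() but passed
		# eval_set labels through unchanged, hence the pre-encoded y_test2 below.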
		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
		y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
		score = calculate_score(y_predicted, y_test2)
		scores.append(score)

	avg_score = np.array(scores).mean()
	print(avg_score, params)
	return avg_score
Example #3
def main(training_data, test_data):
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()

    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']), index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(pred_df.values, target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.values)

    # Selecting the top 5 most likely countries for each respondent and stacking.
    # This section is VERY slow and could be optimized (see the vectorized sketch after this function).
    model_probs = pd.DataFrame(preds, index=test_pred.index, columns=labels.classes_)

    stacked_probs = pd.DataFrame()
    for i in model_probs.index:
        temp = model_probs.loc[i, :]
        temp_sort = pd.DataFrame(temp.sort_values(ascending=False)[:5].index)

        temp_sort['id'] = i
        temp_sort.columns = ['country', 'id']

        stacked_probs = pd.concat([stacked_probs, temp_sort])

    # # Selecting classes with highest probabilities, compiling into list
    # ids = []
    # cts = []
    # test_ids = pd.Series(test_data.index)
    # for i in range(len(test_ids)):
    #     idx = test_data.index[i]
    #     ids += [idx] * 5
    #     cts += labels.inverse_transform(np.argsort(model_probs[i])[::-1])[:5].tolist()
    #
    # predictions = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output
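For reference, the row-by-row stacking flagged as slow above can be done in one vectorized pass. A sketch reusing the names already defined in main (preds, test_pred, labels); this is an alternative, not code from the original:

import numpy as np

top5 = np.argsort(preds, axis=1)[:, ::-1][:, :5]   # 5 best classes per row, best first
ids = np.repeat(test_pred.index.values, 5)         # each respondent id repeated 5 times
countries = labels.classes_[top5.ravel()]          # map class indices back to labels
output = pd.DataFrame({'id': ids, 'country': countries})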
Example #4
def xgboostinitial_predictor(train_path, test_path, eval_path):
    # Loading the data
    print('Loading the data...')
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    # Training model
    print('Model training begins...')
    # xgtrain = xgb.DMatrix(train.values, target.values, missing=np.nan)
    # xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'logloss', 'eta': 0.01,
    #                   'subsample': 0.5, 'colsample_bytree': 0.5, 'max_depth': 10, 'silent': 0}
    #
    # xgb_model = xgb.train(xgboost_params, xgtrain, learning_rates=0.3)

    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='binary:logistic',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(train.values, target.tolist())

    # Running the model
    print('Making predictions....')
    # xgtest = xgb.DMatrix(test.values)
    # xgeval = xgb.DMatrix(eval_df)

    test_preds = xgb_model.predict_proba(test.values)
    eval_preds = xgb_model.predict_proba(eval_df.values)

    print('Cleaning predictions to match expected format....')
    test_output = pd.DataFrame(test_preds, index=test.index)
    print(test_output.columns)
    test_output = test_output[[1]]
    test_output.columns = ['PredictedProb']

    eval_output = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = eval_output[[1]]
    eval_output.columns = ['PredictedProb']

    return test_output, eval_output
Example #5
def objective(space):

    clf = XGBClassifier(n_estimators=int(space['n_estimators']),
                        objective='binary:logistic',
                        seed=37,
                        learning_rate=space['learning_rate'],
                        max_depth=space['max_depth'],
                        min_child_weight=space['min_child_weight'],
                        colsample_bytree=space['colsample_bytree'],
                        subsample=space['subsample'])

    clf.fit(xTrain, yTrain, eval_metric="logloss")
    pred = clf.predict_proba(xValid)[:, 1]
    loss = log_loss(yValid, pred)
    return {'loss': loss, 'status': STATUS_OK}
Example #6
def main():
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']
    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']
    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)
    prediction = clf.predict_proba(X_test)
    columns = ['Prediction'+str(i) for i in range(1, 10)]
    prediction = pd.DataFrame(prediction, columns=columns)
    results = pd.concat([Id, prediction], axis=1)
    return (clf, results)
Example #7
def myThreadFunc(ThreadID):
	X_train = Xy[ThreadID][0]
	X_test = Xy[ThreadID][1]
	y_train = Xy[ThreadID][2]
	y_test = Xy[ThreadID][3]
		
	y_train2 = le.transform(y_train)   
	y_test2 = le.transform(y_test)   

	clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
	clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
	y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
	score = calculate_score(y_predicted, y_test2)
	print(score, clf.booster().best_ntree_limit)
	
	train_and_test_scores[ThreadID] = score
Example #8
def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """
    Ensembler based on xgboost Gradient boosting.
    """
    #Loading data
    X, X_test, n_preds, n_class = get_X_X_Test(valid_folder, test_folder)
    y = y_valid
    
    #Defining classifier
    xgb = XGBClassifier(max_depth=4, learning_rate=0.05, n_estimators=200,
                        objective='multi:softprob', gamma=0., 
                        max_delta_step=0., subsample=0.9, colsample_bytree=0.9,
                        seed=0)  
    xgb.fit(X, y)   
    y_pred = xgb.predict_proba(X_test)
    return y_pred      
    
    
    
Example #9
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print('XGB vote %d' % i)
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB, learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB, objective='multi:softprob',
            subsample=SUBSAMPLE_XGB, colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print('RandomForest vote %d' % i)
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF, criterion=CRITERION_RF, n_jobs=JOBS_RF,
            max_depth=DEPTH_RF, min_samples_leaf=MIN_LEAF_RF, bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions
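A possible way to consume the summed votes (data names and vote counts here are assumptions; constants such as DEPTH_XGB must be defined elsewhere):

probs = perform_prediction(X_train, y_train, X_test, xgb_votes=3, rf_votes=2)
probs /= probs.sum(axis=1, keepdims=True)  # normalize summed votes into probabilities
top_class = probs.argmax(axis=1)           # highest-voted class per test row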
Example #10
#X_val_pca = pca.transform(norm_X_val)
#X_test_pca = pca.transform(norm_X_test)
###
#X_val = X_val_pca
#norm_X_val = X_val_pca
#X_train = X_train_pca
#norm_X_train = X_train_pca
#X_test = X_test_pca
#norm_X_test = X_test_pca

#+++++++++++++++++++++++++++++++++++++Classifier+++++++++++++++++++++++++++++++++++++++++++++++++++++++
#boosting 
print('start boosting')
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0).fit(X_train, y_train)                  
y_pred_boosting = xgb.predict_proba(X_test)  
if val_num != 0:
    y_pred_boosting_val = xgb.predict_proba(X_val)
#
#randomforest
#print 'start random forest'
#clf_randforest = RandomForestClassifier().fit(X_train, y_train)                  
#y_pred_randforest = clf_randforest.predict_proba(X_test)  
#if val_num != 0:
#    y_pred_randforest_val = clf_randforest.predict_proba(X_val)  
#
##bagging
#print 'start bagging'
#clf2 = BaggingClassifier(n_estimators=100).fit(X_train, y_train)                  
#y_pred_bagging = clf2.predict_proba(X_test)
#y_pred_bagging_val = clf2.predict_proba(X_val)
Example #11
del Training["Train"]


lr=LogisticRegression(n_jobs=4)
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=80,objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)				  
#lr.fit(Training,y)

xgb.fit(Training,y)

testIds=Testing["id"]
del Testing["country_destination"]
del Testing["id"]
del Testing["Train"]
Testing = Testing.fillna(-1)

ypred = xgb.predict_proba(Testing)

F=(pd.DataFrame(ypred))[[0,1,2,3,4,5,6,7,8,9,10,11]].idxmax(axis=1)
Fst=pd.DataFrame(F,columns=['idm'])
summary=Fst.groupby(['idm'])
print(summary['idm'].aggregate(len))



idList = []  #list of ids
cts = []  #list of countries
i=0
for idx in testIds:
	cts += le.inverse_transform(np.argsort(ypred[i])[::-1])[:5].tolist()
	idList += [idx] * 5
	i=i+1
Example #12
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID','TARGET'], axis=1).values

y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=600,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.6815,
 colsample_bytree=0.701,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

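# Use xgboost's native cv to find how many boosting rounds survive early
# stopping, then pin that count on the sklearn wrapper for the final fit.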
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
metrics=['auc'], early_stopping_rounds=50, show_progress=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":y_test, "TARGET":output})
submission.to_csv("submission.csv", index=False)
Example #13
print('Finished Reconstructing Train/Test Sets')
print(data_train.shape)
print(data_test.shape)

print('Started Computing train set labels')
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print('Finished computing train set labels')

# fit estimator
print "start XGBClassifier"
n_samples = data_train.shape[0]
est = XGBClassifier(n_estimators=200, learning_rate=0.1, silent=False)

print "start fitting"
est.fit(data_train, label_set)
# predict class labels
probs = est.predict_proba(data_test)

print "cross validation start"
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)
mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print "Test predicted Mean:", mean
print "Test predicted STD:", std
df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]
df.index += 1
df.to_csv("output_prediction.csv", index_label="Id")
Example #14
                    max_depth=5)
for pdepth in range(30, 44, 2):
    clf = XGBClassifier(objective='binary:logistic',
                        silent=1,
                        seed=215,
                        learning_rate=0.05,
                        gamma=0.,
                        colsample_bytree=0.8,
                        subsample=0.8,
                        base_score=0.5,
                        max_delta_step=0,
                        min_child_weight=6,
                        max_depth=pdepth)
    y_score = []
    clf.fit(X_train, Y_train)
    predict_X_test = clf.predict_proba(X_test)
    y_score = []
    for each in predict_X_test:
        y_score.append(each[1])
    print(metrics.roc_auc_score(Y_test, y_score))
'''
row = []
column = []
element = []
name = []
source = open("testfeature_TFIDF.txt","rb")
i = 0
for line in source:
	data = line.split(",")
	length = (len(data) - 1) / 2
	for j in range(0,length):
Example #15
num_rounds=206
z=[]
dtrain=xgb.DMatrix(train[features],label=y)
clf=xgb.train(params,dtrain,num_rounds)

importance=clf.get_fscore(fmap='xgb.fmap')
importance=sorted(importance.items(),key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

bst=list(df['feature'][df.fscore>0.001])
#df.to_csv('select.csv',index=False)
X_train,X_valid,y_train,y_valid=train_test_split(train[bst],y,test_size=0.6,random_state=10)
print ('start xgboost learning...')
alg = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=1210, objective='multi:softprob', subsample=0.8, colsample_bytree=1,min_child_weight=1)                    
alg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],eval_metric='mlogloss',early_stopping_rounds=10,verbose=True)


#plt.figure()
#df.plot()
#df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
#plt.title('XGBoost Feature Importance')
#plt.xlabel('relative importance')
#plt.gcf().savefig('feature_importance_xgb.png')
y_pred = alg.predict_proba(test[bst])
result=pd.DataFrame(y_pred,columns=['predict_0','predict_1','predict_2'])
result['id']=test.id.values.copy()
#result.to_csv('xgb10.csv',index=False)


Example #16
                      colsample_bytree=0.8,
                      gamma=0,
                      learning_rate=0.1,
                      max_delta_step=0,
                      max_depth=3,
                      min_child_weight=1,
                      missing=None,
                      n_estimators=1000,
                      reg_alpha=0,
                      reg_lambda=1,
                      scale_pos_weight=1,
                      seed=27,
                      silent=True,
                      subsample=0.8)
model.fit(X_train, y_train)  # train the model
test_pred_xgb = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1
fpr_xgb, tpr_xgb, threshold = metrics.roc_curve(y_test, test_pred_xgb)
auc = metrics.auc(fpr_xgb, tpr_xgb)
score = metrics.accuracy_score(y_test, model.predict(X_test))  # compare true and predicted labels
print([score, auc])  # accuracy and AUC
precision_xgb, recall_xgb, thresholds = precision_recall_curve(
    y_test, test_pred_xgb)
pr_xgb = pd.DataFrame({"precision": precision_xgb, "recall": recall_xgb})
prc_xgb = pr_xgb[pr_xgb.precision >= 0.97].recall.max()
print(prc_xgb)  # maximum recall at precision >= 0.97

importance = model.feature_importances_
indices = np.argsort(importance)[::-1]  # np.argsort() sorts ascending; [::-1] reverses to descending
features = X_train.columns
for f in range(X_train.shape[1]):
    print("%2d) %3d %20s (%.4f)" %
          (f + 1, indices[f], features[indices[f]], importance[indices[f]]))
Example #17
def xgboost_algorithm(XTrain,YTrain,XTest):
    xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
    xgb.fit(XTrain, YTrain)
    y_pred_xgboost = xgb.predict_proba(XTest) 
    return y_pred_xgboost
Example #18
      "shot_type","shot_zone_area","shot_zone_basic","shot_zone_range",
      "matchup","opponent","game_date","shot_distance","minutes_remaining","seconds_remaining",
      "loc_x","loc_y"]
for col in cols:
    data_x=pd.concat([data_x,pd.get_dummies(data[col],prefix=col),],axis=1)
train_x=data_x[~pd.isnull(data.shot_made_flag)]
test_x=data_x[pd.isnull(data.shot_made_flag)]
train_y=data.shot_made_flag[~pd.isnull(data.shot_made_flag)]

clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                     subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"  % (train_x.shape[0],(train_y != y_pred).sum()))

def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    print(ll)
    return ll
    
logloss(train_y,clf.predict_proba(train_x)[:,1])

test_y=clf.predict_proba(test_x)[:,1]
test_id=data[pd.isnull(data.shot_made_flag)]["shot_id"]
submission=pd.DataFrame({"shot_id":test_id,"shot_made_flag":test_y})
submission.to_csv("submission_1.csv", index=False)
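As a sanity check, the hand-rolled logloss above should agree (up to the clipping epsilon) with scikit-learn's implementation:

from sklearn.metrics import log_loss

print(log_loss(train_y, clf.predict_proba(train_x)[:, 1]))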
Example #19
    def xgb_cls(self, testlen, ntrain, lengths, timesteps, day, tr, attr,
                attry, modellabel, readfile):

        if attr == 'raw':
            hsmadata_x = self.hsmadata_raw_x(timesteps)
        elif attr == 'ta':
            hsmadata_x = self.hsmadata_ta_x(lengths)
        else:
            print('Wrong Attr!')

        if attry == 'roc':
            hsmadata_y = self.hsmadata_roc(day)
        elif attry == 'roo':
            hsmadata_y = self.hsmadata_roo(day)
        else:
            print('Wrong Attr_y!')
        hsmadata = pd.merge(hsmadata_y, hsmadata_x)

        dates = pd.Series(hsmadata['date'].unique()).sort_values()
        dates.index = range(0, len(dates))
        ntest = len(dates) // testlen

        filename = 'testresult\\futurexgboost\\hsma_xgb_cls_testlen' + \
            str(testlen) + '_attr' + str(attr) + '_attry' + str(attry) + \
            '_tr' + str(tr) + '_timesteps' + str(timesteps) + '_day' + str(day) + \
            '_' + modellabel + '_' + self.label + '.h5'

        if readfile:
            if os.path.exists(filename):
                hsma = pd.read_hdf(filename, 'hsma')
            else:
                hsma = pd.DataFrame()
        else:
            hsma = pd.DataFrame()

        for i in range(ntrain, ntest):
            traindata = hsmadata[
                (hsmadata['date'] >= dates[(i - ntrain) * testlen])
                & (hsmadata['date'] <= dates[i * testlen - day - 1])].copy()
            testdata = hsmadata[(hsmadata['date'] >= dates[i * testlen])
                                & (hsmadata['date'] < dates[
                                    (i + 1) * testlen])].copy()
            startdate = dates[i * testlen]
            enddate = testdata.date.max()
            if hsma.shape[0] > 0:
                if startdate <= hsma.date.max():
                    continue
            print(enddate)

            ### transform the dataset into the format required by the LSTM
            traindatax = traindata.drop(['date', 'code', 'ratio'], axis=1)
            testdatax = testdata[traindatax.columns]
            traindatay_long = traindata['ratio'].copy()
            traindatay_long[traindata['ratio'] >= tr] = 1
            traindatay_long[traindata['ratio'] < tr] = 0
            traindatay_short = traindata['ratio'].copy()
            traindatay_short[traindata['ratio'] <= -tr] = 1
            traindatay_short[traindata['ratio'] > -tr] = 0

            # add variable selection here

            ### build the model and predict
            ###xgboost sklearn api
            if modellabel == 'xgb':
                xclas = XGBClassifier(
                    max_depth=10,
                    learning_rate=0.1)  #objective='multi:softmax'
                xclas.fit(traindatax, traindatay_long)
                testdata['pred_long'] = xclas.predict(testdatax)
                testdata['prob_long'] = xclas.predict_proba(testdatax)[:, 1]
                xclas = XGBClassifier(max_depth=10, learning_rate=0.1)
                xclas.fit(traindatax, traindatay_short)
                testdata['pred_short'] = xclas.predict(testdatax)
                testdata['prob_short'] = xclas.predict_proba(testdatax)[:, 1]
            else:
                pass

            if i == ntrain:
                hsma = testdata[[
                    'code', 'date', 'ratio', 'pred_long', 'prob_long',
                    'pred_short', 'prob_short'
                ]].copy()
            else:
                hsma = pd.concat([
                    hsma, testdata[[
                        'code', 'date', 'ratio', 'pred_long', 'prob_long',
                        'pred_short', 'prob_short'
                    ]]
                ],
                                 ignore_index=True)

            if readfile:
                hsma.to_hdf(filename, 'hsma')

        return (hsma)
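A hypothetical invocation of xgb_cls (every argument value here is an assumption, and `strategy` stands in for an instance of the enclosing class):

hsma = strategy.xgb_cls(testlen=20, ntrain=10, lengths=[5, 10, 20],
                        timesteps=30, day=5, tr=0.02, attr='ta', attry='roc',
                        modellabel='xgb', readfile=False)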
Example #20
    all_negative_indexes = np.where(train_y == 0)[0]
    all_positive_indexes = np.where(train_y == 1)[0]
    negative_indexes = np.random.choice(all_negative_indexes,
                                        size=negative_no,
                                        replace=False)
    positive_indexes = np.random.choice(all_positive_indexes,
                                        size=positive_no,
                                        replace=False)
    indexes = np.concatenate((negative_indexes, positive_indexes))
    np.random.shuffle(indexes)
    return train_x[indexes], train_y[indexes]


for training_sample_size in TRAINING_SAMPLE_SIZES:
    for _ in range(TRAINING_FOR_EACH_SIZE):
        sampled_x, sampled_y = draw_samples(training_sample_size)

        best_params = rand_search[MONTHS].best_params_
        best_params['random_state'] = 1
        best_params['n_jobs'] = N_JOBS

        clf = XGBClassifier(**best_params)
        clf.fit(sampled_x, sampled_y, verbose=True)
        pred_y = clf.predict_proba(test_x)

        auc_score = roc_auc_score(test_y, pred_y[:, 1])
        log_score = log_loss(test_y, pred_y)

        logging.info('{}, {}, {}'.format(training_sample_size, auc_score,
                                         log_score))
Example #21
trials = Trials()
bestParams = fmin(fn=objective,
                  space=space,
                  algo=tpe.suggest,
                  max_evals=100,
                  trials=trials)


clf = XGBClassifier(**bestParams)
clf.seed = 37

clf.fit(xTrain, yTrain, eval_metric='logloss')

# Checking classifier predictions on training data.
print "Log loss: %f" % log_loss(yValid, clf.predict_proba(xValid))

# Prediction
testDf = pd.read_csv(TEST_FILENAME)

testX = testDf.drop(ID_COL, axis=1)
testX = poly.transform(testX)
testX = scaler.transform(testX)  # transform only; the scaler was fitted on training data

testY = clf.predict_proba(testX)
testDf[LABEL_COL] = testY[:, 1]

currentDt = dt.datetime.now().isoformat()
outputFilename = '../output/submission' + currentDt + '.csv'
testDf.to_csv(outputFilename, columns=(ID_COL, LABEL_COL), index=False)
Example #22
File: test1.py Project: mircean/ML
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0]/tmp

    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]

    else:
        n_places_th_local = 2

        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]


    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)
        
    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:    
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
   
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
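                # 'abc' is undefined; this looks like a deliberate NameError
                # tripwire, so this CV branch never runs as written.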
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score)

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
    
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)

                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)

                score = np.array(scores_local).mean()
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
    
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)

                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)

                    score = np.array(scores_local).mean()
                    print('  ', x_start, y_start, score)
                    scores_cv.append(score)

                score = np.array(scores_cv).mean()
    
    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx])    

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]
Example #23
                         min_child_weight=4,
                         gamma=0,
                         subsample=0.9,
                         colsample_bytree=0.6,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=1,
                         reg_alpha=0.5)

    xgb2.fit(X_train[train],
             y_train[train],
             eval_set=[(X_train[val], y_train[val])],
             early_stopping_rounds=5,
             eval_metric='auc',
             verbose=False)
    y_pred = xgb2.predict(X_train[val])
    scores = accuracy_score(y_train[val], y_pred)
    print("val acc: %.2f%%" % (scores * 100))
    cvscores.append(scores * 100)

print(" - train acc: %.2f%% (std: %.2f%%)" %
      (np.mean(cvscores), np.std(cvscores)))
y_pred = xgb2.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(" - test acc: %.2f%%" % (score * 100))
y_predprob = xgb2.predict_proba(X_test)[:, 1]
print(" - test auc score: %f" % metrics.roc_auc_score(y_test, y_predprob))

print(" - confusion matrix: ")
print(metrics.confusion_matrix(y_test, y_pred))
Example #24
def model1(df_train, df_test):
	print('model1')

	print('rows', df_train.shape[0]) 

	#remove rows with no sessions data
	hassessions = df_train['HasSessions']
	df_train = df_train.drop(hassessions[hassessions == 0].index)

	#remove rows older than 1/1/2014
	#dac2 = df_train.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	#print('removing rows', len(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index))
	#df_train = df_train.drop(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index)

	print('rows', df_train.shape[0]) 

	labels = df_train['country_destination'].values
	df_train = df_train.drop(['country_destination'], axis=1)
	piv_train = df_train.shape[0]

	#Creating a DataFrame with train+test data
	df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
	#Removing id and date_first_booking
	df_all = df_all.drop(['id', 'date_first_booking', 'sessions_count', 'HasSessions'], axis=1)

	#Filling nan
	df_all = df_all.fillna(-1)

	#####Feature engineering#######
	print('features in the csv', df_all.shape[1])

	#date_account_created
	print('dac', datetime.now())
	dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
	df_all['dac_year'] = dac[:,0]
	df_all['dac_month'] = dac[:,1]
	df_all['dac_day'] = dac[:,2]

	#day of week, season
	print('dac2', datetime.now())
	dac2 = df_all.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
	df_all['dac_season'] = dac2.apply(calculate_season)

	df_all = df_all.drop(['date_account_created'], axis=1)

	#timestamp_first_active
	print('tfa', datetime.now())
	tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
	df_all['tfa_year'] = tfa[:,0]
	df_all['tfa_month'] = tfa[:,1]
	df_all['tfa_day'] = tfa[:,2]
	df_all = df_all.drop(['timestamp_first_active'], axis=1)

	#Age
	print('age', datetime.now())
	av = df_all.age.values
	df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

	#remove features
	print('remove features', datetime.now())
	df_all = df_all.drop(['Sessions' + str(i) for i in [0]], axis=1)
	df_all = df_all.drop(['SessionsD' + str(i) for i in range(456)], axis=1)

	print('features in the model', df_all.shape[1])

	#One-hot-encoding features
	print('one-hot', datetime.now())
	ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'dac_season', 'sessions_preferred_device'] 

	for f in ohe_feats:
		df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
		df_all = df_all.drop([f], axis=1)
		df_all = pd.concat((df_all, df_all_dummy), axis=1)

	#Splitting train and test
	vals = df_all.values
	X = vals[:piv_train]
	y = labels
	X_predict = vals[piv_train:]

	#learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 6, 0.5, 2, 2, 2, 1
	learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 8, 0.5, 2, 1, 2, 0

	early_stopping_rounds = 25
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

	#n_estimators = 455
	n_estimators = 350
	#n_estimators = 1
	print(n_estimators)

	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=-1)      
	clf2.fit(X, y)
	y_predicted2 = clf2.predict_proba(X_predict)  

	return y_predicted2
Example #25
#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]


# In[ ]:


#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  


# In[ ]:

ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub0.csv',index=False)
Example #26
av = train_file.age.values
train_file['age'] = np.where(np.logical_or(av<14, av>100), 0, av)
# One Hot Encoding#
train_file_dummy=[]
features = ['gender','age','signup_method','signup_flow','language','affiliate_channel','affiliate_provider','first_affiliate_tracked','signup_app','first_device_type','first_browser']
for feature in features:
    train_dummy = pd.get_dummies(train_file[feature],prefix=feature)
    train_file = train_file.drop(feature,axis=1)
    train_file = pd.concat((train_dummy,train_file),axis=1)

# Train and Test data split
vals = train_file.values
train_data = vals[:piv_train]
le = LabelEncoder()
train_labels = le.fit_transform(labels)   
test_data = vals[piv_train:]
# Train the Classifier.
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(train_data, train_labels)
y_pred = xgb.predict_proba(test_data)
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)
Example #27
                           gamma=0.4,
                           reg_alpha=0.05,
                           reg_lambda=2,
                           subsample=1.0,
                           colsample_bytree=1.0,
                           max_delta_step=1,
                           scale_pos_weight=1,
                           objective='multi:softprob',
                           nthread=8,
                           seed=0  # ,
                           # silent = False
                           )
 print('training...')
 xgb_model.fit(training, label)
 print('predicting...')
 predicted = xgb_model.predict_proba(testing)
 predicted = pandas.DataFrame(predicted)
 predicted.columns = xgb_model.classes_
 # Name index column.
 predicted.index.name = 'Id'
 # Write csv.
 print('Saving prediction...')
 predicted.to_csv('Prediction.csv')
 # feature importance
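 # (booster() was renamed get_booster() in newer xgboost releases)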
 feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
 feat_imp.plot(kind='bar', title='Feature Importances')
 matplotlib.pyplot.show()
 plot_importance(xgb_model, title='Feature importance')
 matplotlib.pyplot.show()
 plot_tree(xgb_model, num_trees=0)
 matplotlib.pyplot.show()
Example #28
        scores2 = []
        for i in range(10):
            folds = StratifiedKFold(y_train, n_folds=5, shuffle=True)
            scores = []
            for train_index, test_index in folds:
                X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
                y_train2, y_test2 = y_train[train_index], y_train[test_index]

                X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)

                X_train2 = csr_matrix(X_train2.values)
                X_test2 = csr_matrix(X_test2.values)

                clf.fit(X_train2, y_train2)
                y_pred = clf.predict_proba(X_test2)
                score = log_loss(y_test2, y_pred)
                scores.append(round(score, 6))

            scores = np.array(scores)
            score = scores.mean()
            scores2.append(score)
            print('score, std', score, scores.std())

        scores = np.array(scores2)
        scores = np.delete(scores, [scores.argmax(), scores.argmin()])
        print('score, std', scores.mean(), scores.std())

    if is_tt_rf_1 == 1:
        X_train, X_test = feature_engineering(df_train, df_test, y_train)
    
Example #29
for name, model in models:
    print('training ' + name + '...')
    results = cross_val_score(model, X, y, cv=kfold, scoring='log_loss')
    print(name + ': ' + str(results.mean()) + ' +/- ' + str(results.std()))
 
#the cross-validation above shows that ' ' is the best estimator

model = GradientBoostingClassifier(n_estimators=100, random_state=2288, max_depth = 4, learning_rate = 0.1, max_features = 10)

model.fit(X, y)
preds = model.predict_proba(test)

submission = pd.DataFrame()
submission["shot_id"] = test.index 
submission["shot_made_flag"]= preds[:,1]

submission.to_csv("sub.csv",index=False)
'''
#xgboost
from xgboost.sklearn import XGBClassifier

clf_xgb = XGBClassifier(max_depth=7, learning_rate=0.012, n_estimators=1000, subsample=0.62, colsample_bytree=0.6, seed=1)
clf_xgb.fit(X, y)

preds = clf_xgb.predict_proba(test)
submission = pd.DataFrame()
submission["shot_id"] = test.index 
submission["shot_made_flag"]= preds[:,1]

submission.to_csv("sub.csv",index=False)
Example #30
def xgbost(x, y, targetx):
    clf_xgb = XGBClassifier(n_estimators=1000, max_depth=6, learning_rate=0.0075,
                            subsample=0.7, colsample_bytree=0.7, seed=4)
    clf_xgb.fit(x, y)
    return clf_xgb.predict_proba(targetx)[:, 1]
Example #31
    test_users_ids = test_users['id']
    test_users.drop('id', axis=1, inplace=True)
    test_users = test_users.fillna(-1)
    x_test = test_users.values

    clf = XGBClassifier(
        max_depth=7,
        learning_rate=0.18,
        n_estimators=80,
        objective="rank:pairwise",
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=1,
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        base_score=0.5,
        missing=None,
        silent=True,
        nthread=-1,
        seed=42
    )

    clf.fit(x_train, encoded_y_train)
    y_pred = clf.predict_proba(x_test)

    generate_submission(y_pred, test_users_ids, label_encoder, name=NAME)
Example #32
		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2)

submit = 0
if submit == 1:
#	n_estimators = 395
	n_estimators = 349
	#n_estimators = clf.booster().best_ntree_limit 
	print(n_estimators)

	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
	clf2.fit(X, y)
	#clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)

	y_predicted = clf2.predict_proba(X_predict)  

	ids = []  #list of ids
	cts = []  #list of countries
	for i in range(len(id_test)):
		idx = id_test[i]
		ids += [idx] * 5
		cts += le.inverse_transform(calculate_top5(y_predicted[i])).tolist()

	filename = 'results' + str(n_estimators) + '.csv'
	#Generate submission
	sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
	sub.to_csv(filename, index=False)


Example #33
test_predictions.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/RF_pred.csv",header = RF.classes_ )





pred = pred.drop(['hour_5'], axis = 1)
test_pred = test_pred.drop(['hour_3'], axis = 1) 

#XGBoost
XGB = XGBClassifier(n_estimators=15000)
XGB.fit(pred, resp)



test_predictions_XGB = pd.DataFrame(XGB.predict_proba(test_pred))
test_predictions_XGB.index += 1



test_predictions_XGB.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/XGB_pred.csv", header=XGB.classes_)


output = pd.DataFrame(pred.columns)
output.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/pred.csv")

test_output = pd.DataFrame(test_pred.columns)
test_output.to_csv("/Users/grazim/Documents/Kaggle_Local/Shelter Animal Outcomes/test_pred.csv")


Example #34

# # Let's try a gradient boosting classifier

# In[56]:

xgb_model = XGBClassifier(max_depth=3, n_estimators=10, learning_rate=0.1)
xgb_model.fit(X, y)


# ## How did we do?
#
# * To start, let's look at how well we did just predicting the final outcome


pred = xgb_model.predict_proba(X)

# Find the most probable country
best_country = []  # Not used for now
bestId = []
for i in range(len(pred)):
    bestId.append(np.argsort(pred[i])[::-1])
    best_country.append(label_table.inverse_transform(bestId[-1]))


# ## Make a scorer for the model
#
# Following that mentioned in the evaluation by the project

# In[92]:
Example #35
def main():

    train_users = pd.read_csv('../input/train_users_2.csv')
    predict_users = pd.read_csv('../input/test_users.csv')
    
    np.random.seed(100)

    user_device_type_time_df = get_user_device_type_time()
    print(type(user_device_type_time_df))
    print(user_device_type_time_df[user_device_type_time_df.total_elapsed_time > 0].head())

    train_users_combined = merge_user_and_session_data(train_users, user_device_type_time_df)
    predict_users_combined = merge_user_and_session_data(predict_users, user_device_type_time_df)
    
    df_train = train_users_combined
    df_test = predict_users_combined
    labels = df_train['country_destination'].values
    df_train = df_train.drop(['country_destination'], axis=1)
    id_test = df_test['id']
    piv_train = df_train.shape[0]

    #Creating a DataFrame with train+test data
    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    #Removing id and date_first_booking
    df_all = df_all.drop(['id', 'date_first_booking'], axis=1)
    #Filling nan
    df_all = df_all.fillna(-1)

    #####Feature engineering#######
    #date_account_created
    dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
    df_all['dac_year'] = dac[:,0]
    df_all['dac_month'] = dac[:,1]
    df_all['dac_day'] = dac[:,2]
    df_all = df_all.drop(['date_account_created'], axis=1)

    #timestamp_first_active
    tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],
                                                                                             x[4:6],
                                                                                             x[6:8],
                                                                                             x[8:10],
                                                                                             x[10:12],
                                                                                             x[12:14]]))).values)
    df_all['tfa_year'] = tfa[:,0]
    df_all['tfa_month'] = tfa[:,1]
    df_all['tfa_day'] = tfa[:,2]
    df_all = df_all.drop(['timestamp_first_active'], axis=1)

    #Age
    av = df_all.age.values
    df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

    #One-hot-encoding features
    ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider',
                 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser','total_elapsed_time']
    for f in ohe_feats:
        df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
        df_all = df_all.drop([f], axis=1)
        df_all = pd.concat((df_all, df_all_dummy), axis=1)

    #Splitting train and test
    vals = df_all.values
    X = vals[:piv_train]
    le = LabelEncoder()
    y = le.fit_transform(labels)   
    X_test = vals[piv_train:]

    #Classifier
    xgb = XGBClassifier(max_depth=4, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)                  
    xgb.fit(X, y)
    y_pred = xgb.predict_proba(X_test)  

    #Taking the 5 classes with highest probabilities
    ids = []  #list of ids
    cts = []  #list of countries
    for i in range(len(id_test)):
        idx = id_test[i]
        ids += [idx] * 5
        cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

    #Generate submission
    sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    sub.to_csv('sub.csv',index=False)
Example #37
clf = XGBClassifier(max_depth=9,
                    learning_rate=0.1,
                    n_estimators=1200,
                    silent=True,
                    objective='binary:logistic',
                    nthread=-1,
                    gamma=0,
                    min_child_weight=6,
                    max_delta_step=0,
                    subsample=0.4,
                    colsample_bytree=0.8)
clf.fit(df, label_df)

# Calculate the classifier predictions and print metrics calculations
pred = clf.predict(df)
predprob = clf.predict_proba(df)[:, 1]
print "F1 score for training set: {:.4f}.".format(
    f1_score(label_df, pred, pos_label=1.0))
print "Recall score for training set: {:.4f}.".format(
    recall_score(label_df, pred, pos_label=1.0, average='binary'))
print "ROC score for training set: {:.4f}.".format(
    roc_auc_score(label_df, pred, average='macro'))
print "ROC score proba for training set: {:.4f}.".format(
    roc_auc_score(label_df, predprob, average='macro'))

# Save the trained model
joblib.dump(clf, 'trained_models/model_2.pkl', compress=0)

tot_end = time()
print "Total calculation time {:.4f} seconds.".format(tot_end - tot_start)