Ejemplo n.º 1
0
def model_train(params):
    ins_pred=[]
    oos_pred=[]
    ins_tgt=[]
    oos_tgt=[]

    day_errors=[]
    var_importance={}
    test_scores=[]
    for i in range(39):
        ins_msk=store['train_y'].iloc[:,i].fillna(0)>0
        oos_msk=store['val_y'].iloc[:,i].fillna(0)>0
            
        if params!=[]:
            idv=['%s_visitors_%d'%(x,y) for x,y in product(['min','max','avg','med','cnt'],[1,2,3,7,14,21,35,56,91,147])]
            idv+=[x for x in store['train_x'].columns if 'wd%d'%(6-i%7) in x]
            idv+=['air_reserve_%d'%i]
            train_lgb=lgb.Dataset(store['train_x'].loc[ins_msk,idv],store['train_y'].loc[ins_msk,:].iloc[:,i])
            model=lgb.train(params, train_lgb)
            ins_pred.append(pd.Series(model.predict(store['train_x'].loc[ins_msk,idv])))
            oos_pred.append(pd.Series(model.predict(store['val_x'].loc[oos_msk,idv])))
        else:
            idv=['med_visitors_56','avg_visitors_wd%d_13'%(6-i%7)]
            model=LinearRegression()
            ins_vars=store['train_x'].loc[ins_msk,idv]
            ins_vars['med_visitors_56']=ins_vars['med_visitors_56'].replace(-99999,2.86)
            ins_vars['avg_visitors_wd%d_13'%(6-i%7)]=np.where(ins_vars['avg_visitors_wd%d_13'%(6-i%7)]==-99999,ins_vars['med_visitors_56'],ins_vars['avg_visitors_wd%d_13'%(6-i%7)])
            oos_vars=store['val_x'].loc[oos_msk,idv]
            oos_vars['med_visitors_56']=oos_vars['med_visitors_56'].replace(-99999,2.86)
            oos_vars['avg_visitors_wd%d_13'%(6-i%7)]=np.where(oos_vars['avg_visitors_wd%d_13'%(6-i%7)]==-99999,oos_vars['med_visitors_56'],oos_vars['avg_visitors_wd%d_13'%(6-i%7)])
            model.fit(ins_vars,store['train_y'].loc[ins_msk,:].iloc[:,i])
            ins_pred.append(pd.Series(model.predict(ins_vars)))
            oos_pred.append(pd.Series(model.predict(oos_vars)))
            print 'COEF: ',np.array([model.intercept_]),model.coef_

        ins_tgt.append(pd.Series(store['train_y'].loc[ins_msk,:].iloc[:,i].values))
        oos_tgt.append(pd.Series(store['val_y'].loc[oos_msk,:].iloc[:,i].values))
        ins_error=np.sqrt(mean_squared_error(ins_tgt[-1],ins_pred[-1]))
        oos_error=np.sqrt(mean_squared_error(oos_tgt[-1],oos_pred[-1]))
        print 'RMSLE: ',np.array([ins_error,oos_error])
        day_errors.append([ins_error,oos_error])
        var_importance[i]=pd.Series(model.feature_importance(),index=idv)
        test_score=pd.Series(np.expm1(model.predict(store['test_x'].loc[:,idv])),index=store['test_x'].index.tolist()).to_frame('visitors')
        test_score['visit_date']=date(2017,4,23)+timedelta(days=i)
        test_scores.append(test_score)
    ins_error=np.sqrt(mean_squared_error(pd.concat(ins_tgt),pd.concat(ins_pred)))
    oos_error=np.sqrt(mean_squared_error(pd.concat(oos_tgt),pd.concat(oos_pred)))
    day_errors.append([ins_error,oos_error])
    day_errors=pd.DataFrame(day_errors,columns=['train','test'])
    var_importance=pd.DataFrame(var_importance).unstack()
    var_importance=var_importance[var_importance>0]
    test_scores=pd.concat(test_scores)
    return day_errors,var_importance,test_scores
Ejemplo n.º 2
0
                           categorical_feature=categorical_feats)

    num_round = 10000
    regressor = lgb.train(param,
                          trn_data,
                          num_round,
                          valid_sets=[trn_data, val_data],
                          verbose_eval=100,
                          early_stopping_rounds=200)

    validation_prediction[validation_idx] = regressor.predict(
        valid_df_x, num_iteration=regressor.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = regressor.feature_importance()
    fold_importance_df["fold_n"] = fold_n + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

    predictions += regressor.predict(
        test_df[features],
        num_iteration=regressor.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(
    mean_squared_error(validation_prediction, target)**0.5))
stop = time.time()
print("Time taken by the algorithm:", round(stop - start, 2), "seconds")

#Importance of each feature (Plotting)
cols = (feature_importance_df[[