def run_training(df_train, params, valid_train=True):
    """Train a LightGBM model; optionally evaluate on the training set itself."""
    d_train = lgb.Dataset(df_train[f_to_use],
                          label=df_train['reordered'],
                          free_raw_data=False)

    # pop the learning-rate schedule parameters (with defaults) before they reach lgb.train
    train_init_learning_rate = params.pop('init_learning_rate', 0.1)
    train_decay = params.pop('decay', 0.99)
    train_min_learning_rate = params.pop('min_learning_rate', 0.01)

    # split off lgb.train() keyword arguments from the model parameters
    fitParams = ['early_stopping_rounds', 'num_boost_round', 'verbose_eval']
    fitKwargs = dict()
    for key in list(params.keys()):  # copy the keys: params is mutated below
        if key in fitParams:
            fitKwargs[key] = params.pop(key)

    tctrl = TrainingCtrl(init_learning_rate=train_init_learning_rate,
                         decay=train_decay,
                         min_learning_rate=train_min_learning_rate)
    evals_result = dict()

    # train the model, using the training set itself as the validation set
    if valid_train:
        model = lgb.train(params,
                          d_train,
                          valid_sets=d_train,
                          learning_rates=tctrl.get_learning_rate,
                          evals_result=evals_result,
                          **fitKwargs)
    else:
        model = lgb.train(params,
                          d_train,
                          learning_rates=tctrl.get_learning_rate,
                          evals_result=evals_result,
                          **fitKwargs)

    common.logging_dict(logger, evals_result, 'evals result')

    return model
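TrainingCtrl is used above but not defined in this snippet; a minimal sketch, assuming an exponential decay with a floor and the learning_rates callback protocol of older LightGBM:

class TrainingCtrl:
    """Hypothetical learning-rate scheduler: exponential decay with a floor."""

    def __init__(self, init_learning_rate=0.1, decay=0.99, min_learning_rate=0.01):
        self.learning_rate = init_learning_rate
        self.decay = decay
        self.min_learning_rate = min_learning_rate

    def get_learning_rate(self, iteration):
        # lgb.train(learning_rates=...) invokes this once per boosting round
        self.learning_rate = max(self.learning_rate * self.decay,
                                 self.min_learning_rate)
        return self.learning_rate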
Example #2
        df_train_debug = df_train.merge(df_user_cat_stats, how='left',
                                        on='user_id').drop('user_id', axis=1)
        pne.set_features(pne.f_to_use + df_user_cat_stats.keys().drop('user_id').tolist())
        print(pne.f_to_data)
    '''

    # start cross-validation over the parameter grid
    parameters = list(ParameterGrid(gridParams))
    print('Total number of combinations ' + str(len(parameters)))

    for i in range(len(parameters)):
        print('combination %d of %d' % (i + 1, len(parameters)))

        logging_params = parameters[i]
        params = copy.deepcopy(logging_params)
        common.logging_dict(logger, logging_params, 'cv parameters')

        pne.set_params(params)
        cv_result = pne.cv(df_train_debug)

        bst_cv = get_bst_cv(cv_file='../output/lightgbm_pnone_cv.csv')
        cv_result_dict = {
            'num_rounds':
            [bst_cv.num_rounds.max() + 1 if bst_cv.shape[0] > 0 else 1],
            'best_score': [np.min(cv_result['binary_logloss-mean'])],
            'best_iteration': [np.argmin(cv_result['binary_logloss-mean'])]
        }
        bst_cv = pd.concat([bst_cv, pd.DataFrame(cv_result_dict)],
                           ignore_index=True)
        bst_cv.to_csv('../output/lightgbm_pnone_cv.csv', index=False)

        common.logging_dict(logger, cv_result_dict, 'one cv result')
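get_bst_cv is referenced here and in later snippets but not shown; a plausible sketch, assuming it loads the accumulated CV log or starts an empty one on the first run:

import os

import pandas as pd


def get_bst_cv(cv_file):
    """Hypothetical helper: load the running CV log, or start an empty one."""
    if os.path.exists(cv_file):
        return pd.read_csv(cv_file)
    return pd.DataFrame()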
Example #3
def run_cross_validation(df_data,
                         params,
                         n_splits=5,
                         f1_eval=False,
                         cv_iter=None):
    """Run stratified k-fold CV with LightGBM; one result row per fold."""
    # parameter processing
    ori_params = copy.deepcopy(params)

    if cv_iter is None:
        cv_iter = n_splits

    # records
    cv_result = pd.DataFrame(columns=['best_training_iteration',
                                      'best_training_score',
                                      'best_valid_iteration',
                                      'best_valid_score',
                                      'best_eval_score',
                                      'params'])

    # eval_threshold=params.pop('threshold',0.21)
    train_init_learning_rate = params.pop('init_learning_rate', 0.1)
    train_decay = params.pop('decay', 0.99)
    train_min_learning_rate = params.pop('min_learning_rate', 0.01)

    # split off lgb.train() keyword arguments from the model parameters
    fitParams = ['early_stopping_rounds', 'num_boost_round', 'verbose_eval']
    fitKwargs = dict()
    for key in list(params.keys()):  # copy the keys: params is mutated below
        if key in fitParams:
            fitKwargs[key] = params.pop(key)

    # set up the k-fold split: stratify orders on the "None" label
    # (an order gets label 1 when none of its products is a reorder)
    df_pnone_labels = df_data.groupby('order_id').apply(
        lambda x: 0 if x.reordered.sum() > 0 else 1)
    # recent scikit-learn requires shuffle=True when random_state is set
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    skf_iterator = skf.split(df_pnone_labels.index.values,
                             df_pnone_labels.values.tolist())

    # train_group_indices, test_group_indices = skf_iterator.next()
    n_iter = 0
    for train_group_indices, test_group_indices in skf_iterator:
        if n_iter >= cv_iter:
            break
        n_iter += 1
        # fetch data
        df_cv_train = df_data[df_data.order_id.isin(
            df_pnone_labels.index.values[train_group_indices])]
        df_cv_valid = df_data[df_data.order_id.isin(
            df_pnone_labels.index.values[test_group_indices])]
        # construct d_train and d_valid; free_raw_data=False keeps the
        # underlying data around so the Datasets can be reused
        d_train = lgb.Dataset(df_cv_train[f_to_use],
                              label=df_cv_train['reordered'],
                              free_raw_data=False)
        d_valid = lgb.Dataset(df_cv_valid[f_to_use],
                              label=df_cv_valid['reordered'],
                              free_raw_data=False)

        tctrl = TrainingCtrl(init_learning_rate=train_init_learning_rate,
                             decay=train_decay,
                             min_learning_rate=train_min_learning_rate)

        evals_result = dict()

        model = lgb.train(params,
                          d_train,
                          valid_sets=[d_train, d_valid],
                          learning_rates=tctrl.get_learning_rate,
                          evals_result=evals_result,
                          **fitKwargs)
        # add pnone_pred
        pne = pNoneEstimator()
        df_cv_valid = df_cv_valid.merge(pne.get_pnone(df_cv_train, df_cv_valid),
                                        how='left', on='order_id')

        best_training_score = min(evals_result['training']['binary_logloss'])
        best_training_iteration = np.argmin(
            evals_result['training']['binary_logloss'])
        best_valid_score = min(evals_result['valid_1']['binary_logloss'])
        best_valid_iteration = np.argmin(
            evals_result['valid_1']['binary_logloss'])
        best_eval_score = cv_evaluate(model, df_cv_valid) if f1_eval else 0

        ori_params.pop('metric', None)
        cv_result = pd.concat([cv_result, pd.DataFrame({
            'best_training_score': [best_training_score],
            'best_training_iteration': [best_training_iteration],
            'best_valid_score': [best_valid_score],
            'best_valid_iteration': [best_valid_iteration],
            'best_eval_score': [best_eval_score],
            'lgb_version': [lgb.__version__],
            'params': [json.dumps(ori_params)],
        })], ignore_index=True)
        # explore
        print("Features importance...")
        gain = model.feature_importance('gain')
        ft = pd.DataFrame({
            'feature': model.feature_name(),
            'split': model.feature_importance('split'),
            'gain': 100 * gain / gain.sum()
        }).sort_values('gain', ascending=False)
        print(ft)

        logger.debug('train and valid loss')
        common.logging_dict(logger, evals_result, 'evals result')
        logger.debug(ft.to_string())
        logger.debug([
            best_eval_score, best_training_score, best_training_iteration,
            best_valid_score, best_valid_iteration
        ])

        del df_cv_train
        del df_cv_valid
        del d_train
        del d_valid
        del model

    print(cv_result.best_eval_score.mean())
    return cv_result
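A minimal usage sketch for run_cross_validation; the parameter values are illustrative, and df_train and f_to_use are assumed to exist as in the surrounding snippets:

trainParams = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 96,
    'init_learning_rate': 0.1,
    'decay': 0.99,
    'min_learning_rate': 0.01,
    'early_stopping_rounds': 100,
    'num_boost_round': 3000,
    'verbose_eval': True,
}
df_cv_result = run_cross_validation(df_train, trainParams,
                                    n_splits=5, f1_eval=True, cv_iter=3)
print(df_cv_result.best_valid_score.mean())

Example #4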
        'scale_pos_weight': 1.0,
        'is_unbalance': False,
        'feature_fraction': 0.56,
        #'bagging_fraction': 0.95,
        #'bagging_freq': 5,
        # early_stopping_rounds matters: best_iteration is only returned
        # when early stopping actually triggers
        'early_stopping_rounds': 100,
        'num_boost_round': 3000,  # number of boosted trees to fit
        'decay': 0.995,
        'min_learning_rate': 0.02,
        'verbose_eval': True
    }

    print(trainParams)
    common.logging_dict(logger, trainParams, 'test logging')
    logger.debug('lgb_version=%s' % lgb.__version__)

    # load the data
    df_train = common.load_df('../data/', 'df_imba_train')
    df_train['aisle_id'] = df_train['aisle_id'].astype('category')
    df_train['department_id'] = df_train['department_id'].astype('category')

    # load extra cat data 150, 300
    df_train = get_extra_cat_data(df_train)
    print(df_train.dtypes['user_cat_150'])
    print(df_train.dtypes['prod_cat_150'])

    # bst_model_id decides whether to load a saved model or run training
    bst_model_id = -1
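The branch on bst_model_id is not shown in this fragment; one plausible shape, with the model path pattern assumed:

if bst_model_id >= 0:
    # load a previously saved booster instead of retraining
    model = lgb.Booster(model_file='../model/lightgbm_%d.txt' % bst_model_id)
else:
    # deep-copy because run_training pops keys from its params argument
    model = run_training(df_train, copy.deepcopy(trainParams))
    # best_iteration is only set when early stopping actually fired
    best_rounds = (model.best_iteration if model.best_iteration > 0
                   else trainParams['num_boost_round'])
    logger.debug('using %d boosting rounds' % best_rounds)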
Example #5
    # should we update lr to 0.01 after 1000 rounds?

    # start cross-validation over the parameter grid
    parameters = list(ParameterGrid(gridParams))
    print('Total number of combinations ' + str(len(parameters)))

    n_fold = 5  # 5-fold CV (running CV disables the final training step)
    for i in range(len(parameters)):
        print('combination %d of %d' % (i + 1, len(parameters)))

        logging_params = parameters[i]
        params = copy.deepcopy(logging_params)

        common.logging_dict(logger, logging_params, 'cv parameters')

        df_cv_result = run_cross_validation(df_train,
                                            params,
                                            n_splits=n_fold,
                                            f1_eval=True,
                                            cv_iter=3)

        # save and logging
        bst_cv = get_bst_cv(cv_file='../output/lightgbm_cv.csv')
        df_cv_result['num_rounds'] = (bst_cv.num_rounds.max() + 1
                                      if bst_cv.shape[0] > 0 else 1)
        bst_cv = pd.concat([bst_cv, df_cv_result], ignore_index=True)
        bst_cv.to_csv('../output/lightgbm_cv.csv', index=False)

        logger.debug('one cv result \n' + df_cv_result.to_string())