Code Example #1
def predict_sub(model_lgb, testdex, test, subfilename):
    print_header('Submission')
    print_doing_in_task('predicting')
    lgpred = model_lgb.predict(test)
    lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
    # clip predictions into the valid [0, 1] probability range
    lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)
    print('saving submission file to', subfilename)
    lgsub.to_csv(subfilename, index=True, header=True)
    print('done')
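A hedged usage sketch: predict_sub would be called at the end of a run with the trained booster, the test index, and the stacked test matrix produced by the later examples. The variable names below are illustrative, not fixed by the source:

# Illustrative call only: lgb_clf, testing, and testdex would come from
# the training pipeline shown in the later examples.
predict_sub(model_lgb=lgb_clf,
            testdex=testdex,
            test=testing,
            subfilename='../sub/lgb_submission.csv')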
Code Example #2
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_TUNE_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    # boosting_list = ['gbdt', 'dart']
    boosting_list = ['gbdt']
    num_leave_list = [7, 9, 15, 31, 63, 128]
    max_depth_list = [3, 4, 7, 15, 31, 64]

    model_list = []
    for num_leave, max_depth in zip(num_leave_list, max_depth_list):
        for boosting_type in boosting_list:
            model_list.append(
                '{}_{}_{}'.format(boosting_type, num_leave, max_depth))

    LOCAL_TUNE_RESULT = pd.DataFrame(
        index=model_list,
        columns=['running_time', 'num_round', 'train', 'val'])
    if DEBUG: print(LOCAL_TUNE_RESULT)

    option = 1
    is_textadded = True
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'
    print_header('Option {}'.format(option))
    print('is_textadded {} \n predictors {} \n mat filename {}'.format(
        is_textadded, PREDICTORS, mat_filename))

    # sweep from the largest to the smallest num_leave/max_depth pair
    for num_leave, max_depth in zip(reversed(num_leave_list),
                                    reversed(max_depth_list)):
        for boosting_type in boosting_list:
            DO(option, is_textadded, mat_filename, dir_feature, num_leave,
               max_depth, boosting_type)

    print_header('FINAL SUMMARY')
    print(LOCAL_TUNE_RESULT)
    LOCAL_TUNE_RESULT.to_csv('csv/tune_params.csv', index=True)
Code Example #3
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    done_feature_df = load_csv('csv/forward_selection.csv')
    print(done_feature_df)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED

    feature_list = ['base']
    files = glob.glob(dir_feature + '*.pickle')
    REMOVED_LIST = [
        'cat_encode', 'len_feature_kernel', 'text_feature_kernel', 'time'
    ]
    for file in files:
        filename = os.path.basename(file)
        feature = re.sub(r'\.pickle$', '', filename)
        if is_added(filename, REMOVED_LIST):
            feature_list.append(feature)

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=feature_list,
        columns=['running_time', 'num_round', 'train', 'val', 'diff'])
    if DEBUG:
        print(feature_list)
        print(LOCAL_VALIDATION_RESULT)

    for feature in feature_list:
        # 'base' scores PREDICTORS_BASED alone; every other feature is
        # appended, scored, then removed again
        if feature != 'base':
            PREDICTORS = PREDICTORS + [feature]
        # 'abc' is a placeholder mat_filename (text features are disabled
        # here since is_textadded is False); 1988 is a fixed seed
        DO(option, is_textadded, 'abc', dir_feature, 1988, feature)
        if feature != 'base':
            PREDICTORS.remove(feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    LOCAL_VALIDATION_RESULT.to_csv('forward_selection.csv', index=True)
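The loop above is a single forward-selection pass: 'base' scores the base predictors alone, and each candidate is scored as base + feature before being removed again. A minimal self-contained sketch of that bookkeeping, with a placeholder evaluate() standing in for the real train/validate cycle (all names and scores here are hypothetical):

import pandas as pd

base = ['price', 'city']                   # stand-in for PREDICTORS_BASED
candidates = ['base', 'feat_a', 'feat_b']  # hypothetical feature names
result = pd.DataFrame(index=candidates, columns=['val'])

def evaluate(predictors):
    # placeholder for a real train/validate run returning a val RMSE
    return 0.23 - 0.001 * len(predictors)

predictors = list(base)
for feature in candidates:
    if feature != 'base':
        predictors = predictors + [feature]  # same copy-then-remove pattern
    result.loc[feature, 'val'] = evaluate(predictors)
    if feature != 'base':
        predictors.remove(feature)
print(result)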
Code Example #4
def prepare_training(mat_filename, dir_feature, predictors, is_textadded):
    print_header('Load features')
    df, y, len_train, traindex, testdex = load_train_test(['item_id'], TARGET,
                                                          DEBUG)
    del len_train
    gc.collect()
    df = drop_col(df, REMOVED_LIST)

    # add features
    print_doing('add tabular features')
    for feature in predictors:
        dir_feature_file = dir_feature + feature + '.pickle'
        if not os.path.exists(dir_feature_file):
            print('can not find {}. Please check'.format(dir_feature_file))
        else:
            if feature in df:
                print('{} already added'.format(feature))
            else:
                print_doing_in_task('adding {}'.format(feature))
                df = add_feature(df, dir_feature_file)
    print_memory()

    if is_textadded:
        # add text_feature
        print_doing_in_task('add text features')
        ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

        # stack
        print_doing_in_task('stack')
        X = hstack([
            csr_matrix(df.loc[traindex, :].values),
            ready_df[0:traindex.shape[0]]
        ])  # Sparse Matrix
        testing = hstack([
            csr_matrix(df.loc[testdex, :].values), ready_df[traindex.shape[0]:]
        ])
        print_memory()

        print_doing_in_task('prepare vocab')
        tfvocab = df.columns.tolist() + tfvocab
        for mat in [X, testing]:
            print("{} Rows and {} Cols".format(*mat.shape))
        print("Feature Names Length: ", len(tfvocab))

    else:
        tfvocab = df.columns.tolist()
        # no text block: the tabular features alone form the design matrices
        testing = csr_matrix(df.loc[testdex, :].values)
        X = csr_matrix(df.loc[traindex, :].values)

    return X, y, testing, tfvocab, df.columns.tolist(), testdex
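The stacking in prepare_training follows a standard sparse-features pattern: the dense tabular frame is converted to CSR and horizontally concatenated with the precomputed text-feature block, with train and test recovered by row offsets. A minimal self-contained sketch with toy shapes (sparse_random stands in for the real text matrix):

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from scipy.sparse import random as sparse_random

# toy stand-ins: 5 train + 3 test rows of tabular features, plus a
# sparse "text" block covering all 8 rows
df = pd.DataFrame(np.arange(16, dtype=float).reshape(8, 2),
                  columns=['f1', 'f2'])
ready_df = sparse_random(8, 10, density=0.2, format='csr')
n_train = 5

# same recipe as above: csr-ify the tabular part and hstack it with the
# matching row slice of the text matrix
X = hstack([csr_matrix(df.iloc[:n_train].values), ready_df[:n_train]])
testing = hstack([csr_matrix(df.iloc[n_train:].values), ready_df[n_train:]])

print(X.shape)        # (5, 12)
print(testing.shape)  # (3, 12)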
Code Example #5
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'

    # seed_list = np.random.randint(2000, size=1000)
    random.seed(1992)
    seed_array = random.sample(range(0, 10000), 100)

    seed_list = ['seed_' + str(seed) for seed in seed_array]
    LOCAL_VALIDATION_RESULT = pd.DataFrame(index=seed_list,
                                           columns=[
                                               'seed', 'running_time',
                                               'num_round', 'train', 'val',
                                               'local_test', 'diff'
                                           ])
    print(seed_list)
    print(LOCAL_VALIDATION_RESULT)
    for seed in seed_array:
        DO(option, is_textadded, mat_filename, dir_feature, seed)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    LOCAL_VALIDATION_RESULT.to_csv('seed_select.csv', index=False)
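Because random.sample draws from the generator fixed by random.seed(1992), the 100 seeds are identical on every run, so the sweep is repeatable. A quick standalone check:

import random

random.seed(1992)
first = random.sample(range(0, 10000), 100)
random.seed(1992)
second = random.sample(range(0, 10000), 100)
assert first == second  # same seed, same 100-seed sample every time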
Code Example #6
def train(X, y, num_leave, full_predictors, categorical, predictors,
          boosting_type, option, seed):
    prefix = 'debug_findseed' if DEBUG else 'findseed'
    basename = '{}_{}_{}_{}features_num_leave{}_OPTION{}'.format(
        prefix, yearmonthdate_string, boosting_type, len(predictors),
        num_leave, option)
    subfilename = '../sub/{}.csv'.format(basename)
    modelfilename = '../trained_models/{}.txt'.format(basename)

    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')

    # hold out 20% as a fixed local validation set...
    X, X_local_valid, y, y_local_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=seed)

    # ...then carve an early-stopping valid set (10% of the rest) from it
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.10,
                                                          random_state=seed)

    print('training shape: {} \n'.format(X.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 15,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'verbose': 0
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train,
                          y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid,
                          y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    if DEBUG:
        num_boost_round = 300
        early_stopping_rounds = 10
    else:
        num_boost_round = 20000
        early_stopping_rounds = 30

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()
    print_header("Model Report")

    running_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration
    print_doing_in_task('fit val')
    val_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    print_doing_in_task('fit train')
    train_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_train, lgb_clf.predict(X_train))))
    print_doing_in_task('fit local val')
    local_valid_rmse = '{0:.4f}'.format(
        np.sqrt(
            metrics.mean_squared_error(y_local_valid,
                                       lgb_clf.predict(X_local_valid))))
    # absolute gap to a fixed leaderboard reference score (0.2300)
    diff_lb = '{0:.4f}'.format(abs(float(local_valid_rmse) - 0.2300))

    print('OPTION', option)
    print('model training time:     {} mins'.format(running_time))
    print('seed number:             {}'.format(seed))
    print('num_boost_rounds_lgb:    {}'.format(num_boost_rounds_lgb))
    print('train rmse:              {}'.format(train_rmse))
    print('val rmse:                {}'.format(val_rmse))
    print('local valid rmse:        {}'.format(local_valid_rmse))
    print('diff compared to lb:     {}'.format(diff_lb))

    print('saving model to', modelfilename)
    lgb_clf.save_model(modelfilename)

    seed_name = 'seed_' + str(seed)
    # assign via .loc to avoid pandas chained-assignment warnings
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'seed'] = seed
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'running_time'] = running_time
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'num_round'] = num_boost_rounds_lgb
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'train'] = train_rmse
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'val'] = val_rmse
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'local_test'] = local_valid_rmse
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'diff'] = diff_lb
    return lgb_clf, subfilename
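The two train_test_split calls partition the data roughly 72/8/20: 20% is held out as the fixed local validation set, then 10% of the remaining 80% (8% overall) becomes the early-stopping valid set. A toy check of those proportions:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(1000).reshape(-1, 1)
y = np.arange(1000)

X_rest, X_local, y_rest, y_local = train_test_split(
    X, y, test_size=0.2, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_rest, y_rest, test_size=0.10, random_state=0)

print(len(X_local), len(X_val), len(X_tr))  # 200 80 720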
Code Example #7
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option_list = ['option' + str(option) for option in range(10)]

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=option_list,
        columns=['running_time', 'num_round', 'train', 'val'])

    if DEBUG:
        print(option_list)
        print(LOCAL_VALIDATION_RESULT)

    test_list = [8]
    for option in test_list:
        # option 0: tabular features only, no text matrix
        if option == 0:
            is_textadded = False
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel
        elif option == 1:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel max_feature = 1000
        elif option == 2:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_1000.pickle'
        # kernel max_feature = 30000
        elif option == 3:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        # kernel max_feature = infinite
        elif option == 4:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_-1.pickle'
        # kernel max_feature = 18000 + 'good' feature
        elif option == 5:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_GOOD
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel max_feature = 18000 + not-checked feature
        elif option == 6:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_NOTCHECKED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 7:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_OVERFIT
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 8:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_TRY
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        if DEBUG:
            print_header('Option {}'.format(option))
            print('is_textadded {} \n predictors {} \n mat filename {}'.format(
                is_textadded, PREDICTORS, mat_filename))

        DO(option, is_textadded, mat_filename, dir_feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
Code Example #8
def train(X, y, num_leave, max_depth, full_predictors, categorical, predictors,
          boosting_type, option):

    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.10,
                                                          random_state=SEED)

    print('training shape: {} \n'.format(X.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': max_depth,
        'num_leaves': num_leave,  # LightGBM's parameter name is 'num_leaves'
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'lambda_l1': 10,
        'max_bin': 512,
        'verbose': -1
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train,
                          y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid,
                          y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    if DEBUG:
        num_boost_round = 300
        early_stopping_rounds = 10
    else:
        num_boost_round = 20000
        early_stopping_rounds = 100

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)

    print_memory()

    print_header("Model Report")

    running_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration
    print_doing_in_task('fit val')
    val_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    print_doing_in_task('fit train')
    train_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_train, lgb_clf.predict(X_train))))

    print('boosting_type {}, num_leave {}, max_depth {}'.format(
        boosting_type, num_leave, max_depth))
    print('model training time:     {} mins'.format(running_time))
    print('num_boost_rounds_lgb:    {}'.format(num_boost_rounds_lgb))
    print('best rmse:               {}'.format(val_rmse))

    model = '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
    # assign via .loc to avoid pandas chained-assignment warnings
    LOCAL_TUNE_RESULT.loc[model, 'running_time'] = running_time
    LOCAL_TUNE_RESULT.loc[model, 'num_round'] = num_boost_rounds_lgb
    LOCAL_TUNE_RESULT.loc[model, 'train'] = train_rmse
    LOCAL_TUNE_RESULT.loc[model, 'val'] = val_rmse
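After the sweep, the tuning table written by the Code Example #2 driver (csv/tune_params.csv) can be ranked offline to pick the best num_leave/max_depth pair. A hedged reading sketch, assuming that file exists and the recorded rmse strings parse as floats:

import pandas as pd

# load the tuning summary written in Code Example #2 and rank by
# validation RMSE (lower is better)
tune = pd.read_csv('csv/tune_params.csv', index_col=0)
tune['val'] = tune['val'].astype(float)
print(tune.sort_values('val').head())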