Exemple #1
0
def read_dataset(is_merged):
    """Load the translated train/test feather files via ``read_train_test``.

    When the module-level ``DEBUG`` is truthy the debug-subset paths are
    used and the head of every loaded frame is printed.

    Args:
        is_merged: Truthy to get the single merged object, falsy to get the
            train and test frames separately.

    Returns:
        The merged frame when ``is_merged`` is truthy, otherwise the tuple
        ``(train_df, test_df)``.
    """
    debug = DEBUG
    if debug:
        template = '../input/debug{}/{}_debug{}.feather'
        filename_train = template.format(debug, 'train_translated', debug)
        filename_test = template.format(debug, 'test_translated', debug)
    else:
        template = '../input/{}.feather'
        filename_train = template.format('train_translated')
        filename_test = template.format('test_translated')

    print_doing('reading train, test and merge')

    if is_merged:
        merged = read_train_test(
            filename_train, filename_test, '.feather', is_merged=True)
        if debug:
            print(merged.head())
        print_memory()
        return merged

    train_df, test_df = read_train_test(
        filename_train, filename_test, '.feather', is_merged=False)
    if debug:
        print(train_df.head())
        print(test_df.head())
    print_memory()
    return train_df, test_df
Exemple #2
0
def gen_len_title_description_feature(df, todir, ext):
    """Run ``measure_length`` over the title/description text columns.

    The four columns (original and ``_en`` variants) and ``todir`` are passed
    to ``measure_length``; its result is only inspected in DEBUG mode and
    then released.

    NOTE(review): ``ext`` is accepted but not forwarded -- ``'.pickle'`` is
    always passed to ``measure_length``. Confirm this is intentional.
    """
    text_columns = ['title_en', 'description_en', 'title', 'description']
    lengths = measure_length(
        df, selcols=text_columns, todir=todir, ext='.pickle')
    if DEBUG:
        print(df[text_columns].head())
        print(lengths.head())
    # Drop the result right away to keep peak memory low.
    del lengths
    gc.collect()
    print_memory()
Exemple #3
0
def gen_mean_deal_probability(df, todir, ext):
    """Run a ``'mean'`` group-by for each entry of MINH_LIST_MEAN_DEAL_PROB.

    Each entry is handed to ``generate_groupby_by_type_and_columns`` together
    with ``todir`` and ``ext``. The returned object is only previewed in
    DEBUG mode and is released before the next iteration.
    """
    for group_columns in MINH_LIST_MEAN_DEAL_PROB:
        grouped = generate_groupby_by_type_and_columns(
            df, group_columns, 'mean', todir, ext)
        if DEBUG:
            print(df[group_columns].head())
            print(grouped.head())
        del grouped
        gc.collect()
        print_memory()
Exemple #4
0
def gen_var_price(df, todir, ext):
    """Run a ``'var'`` group-by for each entry of MINH_LIST_VAR_PRICE.

    Each entry is handed to ``generate_groupby_by_type_and_columns`` together
    with ``todir`` and ``ext``. The returned object is only previewed in
    DEBUG mode and is released before the next iteration.
    """
    for group_columns in MINH_LIST_VAR_PRICE:
        grouped = generate_groupby_by_type_and_columns(
            df, group_columns, 'var', todir, ext)
        if DEBUG:
            print(df[group_columns].head())
            print(grouped.head())
        del grouped
        gc.collect()
        print_memory()
Exemple #5
0
def read_dataset_origin(dataset):
    """Load the original train/test CSV inputs as one merged frame.

    Args:
        dataset: Not used by this function.

    Returns:
        Whatever ``read_train_test`` returns for ``is_merged=1``.

    NOTE(review): the paths end in ``.csv`` while ``'.feather'`` is passed as
    the extension argument -- confirm ``read_train_test`` copes with that.
    """
    train_path = '../input/train.csv'
    test_path = '../input/test.csv'

    print_doing('reading train, test and merge')
    merged = read_train_test(train_path, test_path, '.feather', is_merged=1)
    print_memory()

    print(merged.head())
    return merged
Exemple #6
0
def gen_aggregated_kernel(todir, ext):
    """Create the aggregated kernel features from the six input frames.

    The frames come from ``read_dataset_aggregated_kernel`` and are passed,
    with ``todir`` and ``ext``, to ``create_aggregated_features_kernel``.
    The result is only inspected in DEBUG mode and then released.
    """
    (train, test,
     train_active, test_active,
     train_periods, test_periods) = read_dataset_aggregated_kernel()

    features = create_aggregated_features_kernel(
        train, test, train_active, test_active, train_periods, test_periods,
        todir, ext)

    if DEBUG:
        print(features.head())
        print(features.info())
    del features
    gc.collect()
    print_memory()
Exemple #7
0
def prepare_training(mat_filename, dir_feature, predictors, is_textadded):
    """Assemble the sparse train/test matrices used for model fitting.

    Steps:
      1. Load the base frame with ``load_train_test`` and drop the columns
         listed in ``REMOVED_LIST``.
      2. For every name in ``predictors`` add ``dir_feature + name +
         '.pickle'`` through ``add_feature``; missing files and columns that
         are already present are reported and skipped.
      3. Convert the train/test rows to sparse matrices, optionally stacking
         the text matrix from ``get_text_matrix`` to the right.

    Args:
        mat_filename: Passed to ``get_text_matrix`` (used only when
            ``is_textadded`` is truthy).
        dir_feature: String prefix prepended to every feature file name.
        predictors: Iterable of feature names to add.
        is_textadded: Truthy to append the text matrix columns.

    Returns:
        Tuple ``(X, y, testing, tfvocab, tabular_columns, testdex)`` where
        ``tfvocab`` names every column of ``X`` and ``tabular_columns`` is
        ``df.columns.tolist()``.
    """
    print_header('Load features')
    df, y, len_train, traindex, testdex = load_train_test(
        ['item_id'], TARGET, DEBUG)
    del len_train
    gc.collect()
    df = drop_col(df, REMOVED_LIST)

    # Tabular features: one pickle per predictor.
    print_doing('add tabular features')
    for feature in predictors:
        feature_path = dir_feature + feature + '.pickle'
        if not os.path.exists(feature_path):
            print('can not find {}. Please check'.format(feature_path))
            continue
        if feature in df:
            print('{} already added'.format(feature))
            continue
        print_doing_in_task('adding {}'.format(feature))
        df = add_feature(df, feature_path)
    print_memory()

    if not is_textadded:
        # Tabular-only matrices.
        tfvocab = df.columns.tolist()
        testing = hstack([csr_matrix(df.loc[testdex, :].values)])
        X = hstack([csr_matrix(df.loc[traindex, :].values)])
        return X, y, testing, tfvocab, df.columns.tolist(), testdex

    print_doing_in_task('add text features')
    ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

    # The first len(traindex) rows of the text matrix are taken as train
    # rows, the remainder as test rows.
    print_doing_in_task('stack')
    n_train = traindex.shape[0]
    train_tabular = csr_matrix(df.loc[traindex, :].values)
    X = hstack([train_tabular, ready_df[0:n_train]])
    test_tabular = csr_matrix(df.loc[testdex, :].values)
    testing = hstack([test_tabular, ready_df[n_train:]])
    print_memory()

    print_doing_in_task('prepare vocab')
    tfvocab = df.columns.tolist() + tfvocab
    for stacked in (X, testing):
        print("{} Rows and {} Cols".format(*stacked.shape))
    print("Feature Names Length: ", len(tfvocab))

    return X, y, testing, tfvocab, df.columns.tolist(), testdex
Exemple #8
0
def do_dataset(dataset):
    """Write the base frame and every tabular feature pickle to one HDF5 store.

    The train or test part is selected by ``dataset`` (``'train'`` selects
    train, anything else selects test). Each ``*.pickle`` in the feature
    directory whose path does not contain ``'text_feature_kernel'`` is
    loaded, cut to the matching row range (first ``len(train)`` rows for
    train, the rest for test) and handed to ``add_dataset_to_hdf5``.
    """
    train_df, test_df = read_dataset(False, DEBUG)
    len_train = len(train_df)
    is_train = dataset == 'train'

    # Keep only the half we need; the other one is released immediately.
    if is_train:
        df = train_df
        del test_df
    else:
        df = test_df
        del train_df
    gc.collect()

    if DEBUG:
        storename = '../processed_features_debug{}/{}_debug{}.h5'.format(
            DEBUG, dataset, DEBUG)
        featuredir = '../processed_features_debug{}/'.format(DEBUG)
    else:
        storename = '../processed_features/{}.h5'.format(dataset)
        featuredir = '../processed_features/'

    temp = add_dataset_to_hdf5(storename, df)
    if DEBUG:
        print(temp.isnull().sum(axis=0))

    for filename in glob.glob(featuredir + '*.pickle'):
        if 'text_feature_kernel' in filename:
            continue

        print(filename)
        print('\n>> doing', filename)
        df = load_pickle(filename)
        if DEBUG:
            print(df.tail())

        print_doing('extract')
        if DEBUG:
            print(df.head())
            print(df.tail())

        if is_train:
            df_new = df.iloc[:len_train]
            if DEBUG:
                print('train: ', df.head())
        else:
            df_new = df.iloc[len_train:]
            if DEBUG:
                print('test: ', df.tail())
        if DEBUG:
            print(df_new.isnull().sum(axis=0))

        print('merging...')
        temp = add_dataset_to_hdf5(storename, df_new)
        if DEBUG:
            print(temp.isnull().sum(axis=0))
        print_memory()
Exemple #9
0
def read_dataset():
    """Load the train/test feather files as one merged frame.

    The debug-subset paths are used when the module-level ``DEBUG`` is
    truthy. The head of the result is always printed.

    Returns:
        Whatever ``read_train_test`` returns for ``is_merged=1``.
    """
    debug = DEBUG
    if debug:
        template = '../input/debug{}/{}_debug{}.feather'
        train_path = template.format(debug, 'train', debug)
        test_path = template.format(debug, 'test', debug)
    else:
        template = '../input/{}.feather'
        train_path = template.format('train')
        test_path = template.format('test')

    print_doing('reading train, test and merge')
    merged = read_train_test(train_path, test_path, '.feather', is_merged=1)
    print_memory()

    print(merged.head())
    return merged
Exemple #10
0
def read_dataset_deal_probability(seed):
    """Load train/test separately and combine them for local validation.

    The two frames from ``read_train_test`` (``is_merged=0``) are passed with
    ``seed`` to ``find_df_local_valid_and_make_deal_prob_nan``; the frame it
    returns is printed (head only) and returned.

    Args:
        seed: Forwarded unchanged to
            ``find_df_local_valid_and_make_deal_prob_nan``.
    """
    debug = DEBUG
    if debug:
        template = '../input/debug{}/{}_debug{}.feather'
        train_path = template.format(debug, 'train', debug)
        test_path = template.format(debug, 'test', debug)
    else:
        template = '../input/{}.feather'
        train_path = template.format('train')
        test_path = template.format('test')

    print_doing('reading train, test and merge')
    train_df, test_df = read_train_test(
        train_path, test_path, '.feather', is_merged=0)
    combined = find_df_local_valid_and_make_deal_prob_nan(
        train_df, test_df, seed)
    print_memory()

    print(combined.head())
    return combined
Exemple #11
0
def get_svdtruncated_vectorizer(todir):
    """Reduce the pickled text matrix to 400 SVD components and cache it.

    Reads ``todir + 'text_feature_kernel.pickle'`` (a ``(matrix, vocab)``
    pair). If ``todir + 'truncated_text_feature_kernel.pickle'`` already
    exists it is loaded instead of recomputed; otherwise a ``TruncatedSVD``
    is fitted, the result is stored as a float32 CSR matrix with column
    names ``lsa1 .. lsaN`` and written to that cache file.

    Shapes before/after are printed; nothing is returned.
    """
    def _unpickle(path):
        # Both pickles hold a 2-tuple, unpacked by the caller.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    print_doing('doing svdtruncated text feature')
    filename = todir + 'text_feature_kernel.pickle'
    savename = todir + 'truncated_text_feature_kernel.pickle'

    if os.path.exists(savename):
        print('done already...')
        svd_matrix, vocab = _unpickle(savename)
        tfid_matrix, tfvocab = _unpickle(filename)
    else:
        tfid_matrix, tfvocab = _unpickle(filename)
        reducer = TruncatedSVD(n_components=400)
        print_doing_in_task('truncated svd')
        svd_matrix = reducer.fit_transform(tfid_matrix)
        print_doing_in_task('convert to sparse')
        svd_matrix = sparse.csr_matrix(svd_matrix, dtype=np.float32)
        n_components = np.shape(svd_matrix)[1]
        vocab = ['lsa' + str(idx) for idx in range(1, n_components + 1)]
        with open(savename, "wb") as handle:
            pickle.dump((svd_matrix, vocab),
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    print('---- before truncate')
    print(tfid_matrix.shape)
    print('len of feature:', len(tfvocab))
    print('---- after truncate')
    print(svd_matrix.shape)
    print('len of feature:', len(vocab))

    if DEBUG:
        print(tfid_matrix)
        print('\n')
        print(svd_matrix)

    del svd_matrix, vocab, tfid_matrix, tfvocab
    gc.collect()
    print_memory()
def train(X, y, num_leave, max_depth, full_predictors, categorical, predictors,
          boosting_type, option):
    """Fit one LightGBM regressor and record its scores for local tuning.

    The data is split 90/10 (``random_state=SEED``), a model is trained with
    early stopping on the validation part, and running time, best iteration
    and train/validation RMSE are stored in ``LOCAL_TUNE_RESULT`` under the
    key ``'<boosting_type>_<num_leave>_<max_depth>'``.

    Args:
        X: Feature matrix accepted by ``train_test_split`` / ``lgb.Dataset``.
        y: Target values aligned with ``X``.
        num_leave: Maximum number of leaves per tree (LightGBM
            ``num_leaves``).
        max_depth: Maximum tree depth.
        full_predictors: Feature names passed to ``lgb.Dataset``.
        categorical: Categorical feature names passed to ``lgb.Dataset``.
        predictors: Not used by this function.
        boosting_type: LightGBM ``boosting_type`` value.
        option: Not used by this function.

    Returns:
        None. Results are written to ``LOCAL_TUNE_RESULT``.
    """
    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.10,
                                                          random_state=SEED)

    print('training shape: {} \n'.format(X.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': max_depth,
        # The key must be 'num_leaves'; the previous 'num_leave' is not a
        # LightGBM parameter or alias, so the tuned value was ignored.
        'num_leaves': num_leave,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'lambda_l1': 10,
        'max_bin': 512,
        'verbose': -1
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train,
                          y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid,
                          y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    # Short runs in debug mode, effectively "until early stopping" otherwise.
    if DEBUG:
        num_boost_round = 300
        early_stopping_rounds = 10
    else:
        num_boost_round = 20000
        early_stopping_rounds = 100

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)

    print_memory()

    print_header("Model Report")

    running_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration

    # Each prediction pass is done once and reused for both the printed
    # report and the stored result.
    print_doing_in_task('fit val')
    val_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    print_doing_in_task('fit train')
    train_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_train, lgb_clf.predict(X_train))))
    print_header("Model Report")

    print('boosting_type {}, num_leave {}, max_depth {}'.format(
        boosting_type, num_leave, max_depth))
    print('model training time:     {} mins'.format(running_time))
    print('num_boost_rounds_lgb:    {}'.format(num_boost_rounds_lgb))
    print('best rmse:               {}'.format(val_rmse))

    model = '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
    LOCAL_TUNE_RESULT['running_time'][model] = running_time
    LOCAL_TUNE_RESULT['num_round'][model] = num_boost_rounds_lgb
    LOCAL_TUNE_RESULT['train'][model] = train_rmse
    LOCAL_TUNE_RESULT['val'][model] = val_rmse
Exemple #13
0
def gen_label_encode(df, todir, ext):
    """Run ``create_label_encode`` on ``df`` and release its result.

    ``todir`` and ``ext`` are forwarded unchanged. In DEBUG mode the head of
    ``df`` and the ``info()`` of the result are printed.
    """
    encoded = create_label_encode(df, todir, ext)
    if DEBUG:
        print(df.head())
        print(encoded.info())
    del encoded
    gc.collect()
    print_memory()
def DO(mat_filename, storename,num_leaves,max_depth, option, boosting_type):
    """Cross-validate and train a LightGBM regressor, then save the model.

    Pipeline, in order:
      1. Read ``PREDICTORS + TARGET`` columns from the processed HDF5 store.
      2. Log-transform ``price`` and fill its missing values with -999.
      3. Stack the tabular array with the text matrix from
         ``get_text_matrix`` and extend the predictor names with its vocab.
      4. Run 10-fold ``lgb.cv`` with early stopping to find the number of
         boosting rounds.
      5. Retrain on the full dataset for that many rounds and save the model
         to ``modelfilename + '.txt'``.

    Args:
        mat_filename: Passed to ``get_text_matrix``.
        storename: Passed to ``read_processed_h5``.
        num_leaves: LightGBM ``num_leaves``; also part of the file names.
        max_depth: LightGBM ``max_depth``; also part of the file names.
        option: Only used in the generated file names.
        boosting_type: LightGBM ``boosting_type``; also part of the file names.

    Returns:
        None. The trained model is written to disk.
    """
    frac = FRAC
    print('------------------------------------------------')
    print('start...')
    print('fraction:', frac)
    print('prepare predictors, categorical and target...')
    predictors = PREDICTORS

    print (predictors)

    categorical = get_categorical(predictors)
    target = TARGET

    # File names encode date, feature count, boosting type, fraction,
    # tree-shape parameters and the option number.
    # subfilename is only printed below; nothing in this function writes it.
    subfilename = yearmonthdate_string + '_' + str(len(predictors)) + \
            'features_' + boosting_type + '_cv_' + str(int(100*frac)) + \
            'percent_full_%d_%d'%(num_leaves,max_depth) + '_OPTION' + str(option) + '.csv.gz'
    modelfilename = yearmonthdate_string + '_' + str(len(predictors)) + \
            'features_' + boosting_type + '_cv_' + str(int(100*frac)) + \
            'percent_full_%d_%d'%(num_leaves,max_depth) + '_OPTION' + str(option)

    print('----------------------------------------------------------')
    print('SUMMARY:')
    print('----------------------------------------------------------')
    print('predictors:',predictors)
    print('number of predictors: {} \n'.format(len(predictors)))
    print('categorical', categorical)
    # NOTE(review): this line reports len(categorical) but is labeled
    # 'number of predictors' -- looks like a copy-paste of the line above.
    print('number of predictors: {} \n'.format(len(categorical)))
    print('taget {} \n'.format(target))
    print('submission file name: {} \n'.format(subfilename))
    print('model file name: {} \n'.format(modelfilename))
    # print('fraction:', frac)
    # print('option:', option)

    print('----------------------------------------------------------')
    # predictors+target is a concatenation, so TARGET is presumably a list
    # like PREDICTORS -- confirm at its definition.
    train_df = read_processed_h5(storename, predictors+target, categorical)
    print(train_df.info())
    print(train_df.head())

    # Log-transform price (the small offset avoids log(0)); NaN prices stay
    # NaN through the log and are then replaced by the -999 sentinel.
    train_df["price"] = np.log(train_df["price"]+0.001)
    train_df["price"].fillna(-999,inplace=True)
    # train_df["price"] = train_df["price"].astype('float')
    # train_df["image_top_1"].fillna(-999,inplace=True)

    print(train_df.head()); print(train_df.info())
    # train_df = train_df.sample(frac=frac, random_state = SEED)
    print_memory('afer reading train:')
    print(train_df.head())
    print("train size: ", len(train_df))
    gc.collect()

    print_doing('cleaning train...')
    train_df_array = train_df[predictors].values
    # NOTE(review): the target is cast to int before a regression fit; if
    # TARGET holds fractional values this truncates them -- confirm intent.
    train_df_labels = train_df[target].values.astype('int').flatten()
    # The frame is no longer needed once the arrays are extracted.
    del train_df; gc.collect()
    print_memory()

    print_doing('reading text matrix')
    train_mat_text, tfvocab = get_text_matrix(mat_filename, 'train', DEBUG, train_df_array.shape[0])
    print_memory()

    # Tabular columns first, text columns to the right.
    print_doing('stack two matrix')
    train_df_array = hstack([csr_matrix(train_df_array),train_mat_text])
    print_memory()
    
    # Feature names must follow the same column order as the stacked matrix.
    new_predictors = tfvocab
    predictors = predictors + new_predictors
    del train_mat_text; gc.collect()

    

    print('----------------------------------------------------------')
    print("Training...")
    start_time = time.time()

    params = {
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.02,
        'num_leaves': num_leaves,  # we should let it be smaller than 2^(max_depth)
        'max_depth': max_depth,  # -1 means no limit
        'subsample': 0.9,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'feature_fraction': 0.9,  # Subsample ratio of columns when constructing each tree.
        # 'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        # 'subsample_for_bin': 200000,  # Number of samples for constructing bin
        # 'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        # 'reg_alpha': 10,  # L1 regularization term on weights
        # 'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0
    }


    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(train_df_array, label=train_df_labels,
                        feature_name=predictors,
                        categorical_feature=categorical)
    # Drop our references to the raw arrays now that the Dataset is built.
    del train_df_array, train_df_labels; gc.collect()                        
    print_memory()   


    print(params)
    print('>> start cv...')

    # 10-fold CV without shuffling; stops once 100 consecutive rounds bring
    # no improvement.
    cv_results  = lgb.cv(params, 
                        dtrain_lgb, 
                        categorical_feature = categorical,
                        num_boost_round=20000,                       
                        metrics='rmse',
                        seed = SEED,
                        shuffle = False,
                        nfold=10, 
                        show_stdv=True,
                        early_stopping_rounds=100, 
                        verbose_eval=50)                     

    print('[{}]: model training time'.format(time.time() - start_time))
    print_memory()


    # print (cv_results)
    print('--------------------------------------------------------------------') 
    # The length of the CV history is used as the round count for the final fit.
    num_boost_rounds_lgb = len(cv_results['rmse-mean'])
    print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))

    print ('>> start trainning... ')
    # Final model: same params, full dataset, fixed number of rounds from CV.
    model_lgb = lgb.train(
                        params, dtrain_lgb, 
                        num_boost_round=num_boost_rounds_lgb,
                        feature_name = predictors,
                        categorical_feature = categorical)
    del dtrain_lgb
    gc.collect()

    print('--------------------------------------------------------------------') 
    print('>> save model...')
    # save model to file

    # if not DEBUG:
    model_lgb.save_model(modelfilename+'.txt')
Exemple #15
0
def do_dataset(dataset):
    """Export the train or test part of every processed feature.

    The base frame and every tabular feature pickle are appended to an HDF5
    store through ``add_dataset_to_hdf5``. Pickles whose path contains
    ``'text_feature_kernel'`` are densified, cut to the requested row range
    and saved separately through ``save_file`` (skipped when the target file
    already exists). Pickles whose path contains ``'train'`` or ``'test'``
    are ignored.

    Args:
        dataset: ``'train'`` selects the first ``len(train)`` rows of every
            feature; any other value selects the remaining (test) rows.

    NOTE(review): outside DEBUG mode the dense text file name ends in ``.h5``
    while ``'.pickle'`` is passed to ``save_file`` -- confirm which format
    ``save_file`` actually writes.
    """
    train_df, test_df = read_dataset(False)
    len_train = len(train_df)
    is_train = dataset == 'train'

    # Keep only the half we need; the other one is released immediately.
    if is_train:
        df = train_df
        del test_df
    else:
        df = test_df
        del train_df
    gc.collect()

    if DEBUG:
        storename = '../processed_features_debug{}/{}_debug{}.h5'.format(
            DEBUG, dataset, DEBUG)
        featuredir = '../processed_features_debug{}/'.format(DEBUG)
    else:
        storename = '../processed_features/{}.h5'.format(dataset)
        featuredir = '../processed_features/'

    add_dataset_to_hdf5(storename, df)

    for filename in glob.glob(featuredir + '*.pickle'):
        # Skip files already split by a previous run.
        if 'train' in filename or 'test' in filename:
            continue

        print(filename)
        print('\n>> doing', filename)

        if 'text_feature_kernel' not in filename:
            # Tabular feature: slice the rows and append to the HDF5 store.
            df = load_pickle(filename)
            print_doing('extract')
            if DEBUG:
                print(df.head())
                print(df.tail())
            if is_train:
                df = df.iloc[:len_train]
                if DEBUG:
                    print('train: ', df.head())
            else:
                df = df.iloc[len_train:]
                if DEBUG:
                    print('test: ', df.tail())
            print('merging...')
            add_dataset_to_hdf5(storename, df)
            print_memory()
            continue

        # Text feature matrix: stored densely in its own file.
        is_english = '_en' in filename
        if DEBUG:
            if is_english:
                savename = '../processed_features_debug{}/{}_text_dense_en_debug{}.pickle'.format(
                    DEBUG, dataset, DEBUG)
            else:
                savename = '../processed_features_debug{}/{}_text_dense_debug{}.pickle'.format(
                    DEBUG, dataset, DEBUG)
        else:
            if is_english:
                savename = '../processed_features/{}_text_dense_en.h5'.format(
                    dataset)
            else:
                savename = '../processed_features/{}_text_dense.h5'.format(
                    dataset)

        if os.path.exists(savename):
            print('done already')
            continue

        mat = load_pickle(filename)
        mat = mat.todense()

        if DEBUG:
            print(mat.shape, np.sum(mat))
        print_doing('extract')

        if DEBUG:
            print(mat[0:5, 0:7])
            print(mat[-5:, 0:7])

        # The debug previews pass the slice directly to print(); the old
        # nested print(print(...)) showed the label followed by "None".
        if is_train:
            mat = mat[:len_train, :]
            if DEBUG:
                print(mat.shape, np.sum(mat))
                print('train: ', mat[0:5, 0:7])
        else:
            mat = mat[len_train:, :]
            if DEBUG:
                print(mat.shape, np.sum(mat))
                print('test: ', mat[-5:, 0:7])
        print(mat)
        print('merging...')
        save_file(mat, savename, '.pickle')
        print_memory()
Exemple #16
0
def train(X, y, num_leave, full_predictors, categorical, predictors,
          boosting_type, option, seed):
    """Train a LightGBM regressor for one seed and score it on a hold-out.

    20% of the data is first held out as a local validation set; the
    remaining 80% is split 90/10 into train/validation for early stopping.
    Both splits use ``seed``. Scores are printed, the model is saved, and the
    results are stored in ``LOCAL_VALIDATION_RESULT`` under ``'seed_<seed>'``.

    ``num_leave`` and ``option`` only appear in the generated file names;
    neither is part of the LightGBM parameters built here.

    Returns:
        Tuple ``(booster, subfilename)``.
    """
    if DEBUG:
        sub_template = '../sub/debug_findseed_{}_{}_{}features_num_leave{}_OPTION{}.csv'
        model_template = '../trained_models/debug_findseed_{}_{}_{}features_num_leave{}_OPTION{}.txt'
    else:
        sub_template = '../sub/findseed_{}_{}_{}features_num_leave{}_OPTION{}.csv'
        model_template = '../trained_models/findseed_{}_{}_{}features_num_leave{}_OPTION{}.txt'
    name_fields = (yearmonthdate_string, boosting_type, str(len(predictors)),
                   num_leave, option)
    subfilename = sub_template.format(*name_fields)
    modelfilename = model_template.format(*name_fields)

    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')

    # Outer split: hold-out never seen during training or early stopping.
    X_fit, X_local_valid, y_fit, y_local_valid = train_test_split(
        X, y, test_size=0.2, random_state=seed)
    # Inner split: validation part drives early stopping.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_fit, y_fit, test_size=0.10, random_state=seed)

    print('training shape: {} \n'.format(X_fit.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = dict([
        ('task', 'train'),
        ('boosting_type', boosting_type),
        ('objective', 'regression'),
        ('metric', 'rmse'),
        ('max_depth', 15),
        ('feature_fraction', 0.7),
        ('bagging_fraction', 0.8),
        ('learning_rate', 0.1),
        ('verbose', 0),
    ])
    print('params:', lgbm_params)

    dataset_kwargs = dict(feature_name=full_predictors,
                          categorical_feature=categorical)
    lgtrain = lgb.Dataset(X_train, y_train, **dataset_kwargs)
    lgvalid = lgb.Dataset(X_valid, y_valid, **dataset_kwargs)

    num_boost_round, early_stopping_rounds = (300, 10) if DEBUG else (20000, 30)

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()
    print_header("Model Report")

    def _formatted_rmse(features, target):
        # RMSE of the fitted model, rendered with four decimals.
        predictions = lgb_clf.predict(features)
        return '{0:.4f}'.format(
            np.sqrt(metrics.mean_squared_error(target, predictions)))

    runnning_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration
    print_doing_in_task('fit val')
    val_rmse = _formatted_rmse(X_valid, y_valid)
    print_doing_in_task('fit train')
    train_rmse = _formatted_rmse(X_train, y_train)
    print_doing_in_task('fit local val')
    local_valid_rmse = _formatted_rmse(X_local_valid, y_local_valid)
    # Absolute distance between the (rounded) hold-out RMSE and 0.2300.
    diff_lb = '{0:.4f}'.format(abs(float(local_valid_rmse) - 0.2300))

    print('OPTION', option)
    print('model training time:     {} mins'.format(runnning_time))
    print('seed number:             {}'.format(seed))
    print('num_boost_rounds_lgb:    {}'.format(num_boost_rounds_lgb))
    print('train rmse:              {}'.format(train_rmse))
    print('val rmse:                {}'.format(val_rmse))
    print('local valid rmse:        {}'.format(local_valid_rmse))
    print('diff comapred to lb:     {}'.format(diff_lb))

    print('saving model to', modelfilename)
    lgb_clf.save_model(modelfilename)

    seed_name = 'seed_' + str(seed)
    recorded = (
        ('seed', seed),
        ('running_time', runnning_time),
        ('num_round', num_boost_rounds_lgb),
        ('train', train_rmse),
        ('val', val_rmse),
        ('local_test', local_valid_rmse),
        ('diff', diff_lb),
    )
    for column, value in recorded:
        LOCAL_VALIDATION_RESULT[column][seed_name] = value
    return lgb_clf, subfilename
Exemple #17
0
def train(X, y, num_leave, full_predictors, categorical, predictors,
          boosting_type, option):
    """Train a LightGBM regressor on a 90/10 split and save the model.

    The split uses ``random_state=SEED``; the 10% part drives early stopping
    and is used for the reported RMSE. The model is written to a file under
    ``../trained_models/`` whose name encodes the date string, boosting type,
    number of predictors, ``num_leave`` and ``option``.

    ``num_leave`` and ``option`` only appear in the generated file names;
    neither is part of the LightGBM parameters built here.

    Returns:
        Tuple ``(booster, subfilename)``.
    """
    if DEBUG:
        sub_template = '../sub/debug_{}_{}_{}features_num_leave{}_OPTION{}.csv'
        model_template = '../trained_models/debug_{}_{}_{}features_num_leave{}_OPTION{}.txt'
    else:
        sub_template = '../sub/{}_{}_{}features_num_leave{}_OPTION{}.csv'
        model_template = '../trained_models/{}_{}_{}features_num_leave{}_OPTION{}.txt'
    name_fields = (yearmonthdate_string, boosting_type, str(len(predictors)),
                   num_leave, option)
    subfilename = sub_template.format(*name_fields)
    modelfilename = model_template.format(*name_fields)

    print('\n----------------------------------------------------------')
    print("Training...")
    print('----------------------------------------------------------')

    start_time = time.time()

    print('>> prepare dataset...')
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=SEED)

    print(X.shape)

    print("Light Gradient Boosting Regressor")
    lgbm_params = dict([
        ('task', 'train'),
        ('boosting_type', boosting_type),
        ('objective', 'regression'),
        ('metric', 'rmse'),
        ('max_depth', 15),
        ('feature_fraction', 0.7),
        ('bagging_fraction', 0.8),
        ('learning_rate', 0.1),
        ('verbose', 0),
    ])
    print('params:', lgbm_params)

    dataset_kwargs = dict(feature_name=full_predictors,
                          categorical_feature=categorical)
    lgtrain = lgb.Dataset(X_train, y_train, **dataset_kwargs)
    lgvalid = lgb.Dataset(X_valid, y_valid, **dataset_kwargs)

    # Short runs in debug mode, effectively "until early stopping" otherwise.
    num_boost_round, early_stopping_rounds = (300, 10) if DEBUG else (20000, 30)

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)

    print_memory()

    print(
        '--------------------------------------------------------------------')
    print("Model Report")
    elapsed_minutes = (time.time() - start_time) / 60
    print('model training time:     {0:.2f} mins'.format(elapsed_minutes))
    print('num_boost_rounds_lgb:    {}'.format(lgb_clf.best_iteration))
    valid_rmse = np.sqrt(
        metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
    print('best rmse:               {0:.4f}'.format(valid_rmse))

    print('saving model to', modelfilename)
    lgb_clf.save_model(modelfilename)

    return lgb_clf, subfilename
Exemple #18
0
def gen_time_feature(df, todir, ext):
    """Run ``create_time`` on ``df`` and release its result.

    ``todir`` and ``ext`` are forwarded as keyword arguments. In DEBUG mode
    the head of ``df['activation_date']`` and of the result are printed.
    """
    time_features = create_time(df, todir=todir, ext=ext)
    if DEBUG:
        print(df['activation_date'].head())
        print(time_features.head())
    del time_features
    gc.collect()
    print_memory()
Exemple #19
0
def gen_text_feature_from_kernel(df, todir, ext, language, max_features):
    """Delegate to ``create_text_feature`` and then call ``print_memory``.

    All five arguments are forwarded positionally and unchanged. Any value
    returned by ``create_text_feature`` is discarded; this function returns
    None.
    """
    create_text_feature(df, todir, ext, language, max_features)
    print_memory()
Exemple #20
0
def cv_train(X, y, num_leave, full_predictors, categorical, predictors,
             boosting_type, option):
    """Pick the number of boosting rounds by 10-fold CV, then fit and save.

    ``lgb.cv`` is run without shuffling or stratification and with early
    stopping; the length of its ``'rmse-mean'`` history is used as the round
    count for a final ``lgb.train`` on the full dataset. The model is saved
    under ``../trained_models/``.

    Args:
        X: Feature matrix for ``lgb.Dataset``.
        y: Target values aligned with ``X``.
        num_leave: LightGBM ``num_leaves``; also part of the file names.
        full_predictors: Feature names for ``lgb.Dataset`` / ``lgb.train``.
        categorical: Categorical feature names.
        predictors: Only its length is used (in the file names).
        boosting_type: LightGBM ``boosting_type``; also part of the file names.
        option: Only used in the file names.

    Returns:
        Tuple ``(booster, subfilename)``.
    """
    if DEBUG:
        sub_template = '../sub/debug_{}_{}_{}features_num_leave{}_OPTION{}.csv'
        model_template = '../trained_models/debug_{}_{}_{}features_num_leave{}_OPTION{}.txt'
    else:
        sub_template = '../sub/{}_{}_{}features_num_leave{}_OPTION{}.csv'
        model_template = '../trained_models/{}_{}_{}features_num_leave{}_OPTION{}.txt'
    name_fields = (yearmonthdate_string, boosting_type, str(len(predictors)),
                   num_leave, option)
    subfilename = sub_template.format(*name_fields)
    modelfilename = model_template.format(*name_fields)

    print('\n----------------------------------------------------------')
    print("Training...")
    print('----------------------------------------------------------')

    start_time = time.time()

    params = dict([
        ('boosting_type', boosting_type),
        ('objective', 'regression'),
        ('metric', 'rmse'),
        ('learning_rate', 0.2),
        # Should stay below 2^(max_depth) when a depth limit is set.
        ('num_leaves', num_leave),
        # -1 means no depth limit.
        ('max_depth', -1),
        # Row subsample ratio, re-drawn every iteration (subsample_freq=1).
        ('subsample', 0.8),
        ('subsample_freq', 1),
        # Column subsample ratio per tree.
        ('feature_fraction', 0.8),
        ('nthread', 4),
        ('verbose', 0),
    ])

    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(X,
                             y,
                             feature_name=full_predictors,
                             categorical_feature=categorical)
    print_memory()

    print('params', params)
    print('\n>> start cv...')

    # Short runs in debug mode, effectively "until early stopping" otherwise.
    num_boost_round, early_stopping_rounds = (300, 10) if DEBUG else (20000, 30)

    cv_results = lgb.cv(params,
                        dtrain_lgb,
                        categorical_feature=categorical,
                        num_boost_round=num_boost_round,
                        metrics='rmse',
                        seed=SEED,
                        shuffle=False,
                        nfold=10,
                        show_stdv=True,
                        early_stopping_rounds=early_stopping_rounds,
                        stratified=False,
                        verbose_eval=5)

    print('[{}]: model training time'.format(time.time() - start_time))
    print_memory()

    print(
        '--------------------------------------------------------------------')
    print("Model Report")
    rmse_history = cv_results['rmse-mean']
    num_boost_rounds_lgb = len(rmse_history)
    print('num_boost_rounds_lgb = ' + str(num_boost_rounds_lgb))
    print('best rmse = {0:.4f}'.format(rmse_history[num_boost_rounds_lgb - 1]))

    print('>> start trainning... ')
    model_lgb = lgb.train(params,
                          dtrain_lgb,
                          num_boost_round=num_boost_rounds_lgb,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    del dtrain_lgb
    gc.collect()

    print('saving model to', modelfilename)
    model_lgb.save_model(modelfilename)

    return model_lgb, subfilename