def read_dataset(is_merged):
    """Load the translated train/test feather files via ``read_train_test``.

    The module-level ``DEBUG`` flag selects the input paths: a truthy value
    points at the ``../input/debug<DEBUG>/`` copies, otherwise the files
    directly under ``../input/`` are used.

    Args:
        is_merged: When truthy, ``read_train_test`` is called with
            ``is_merged=True`` and its single result is returned; otherwise
            it is called with ``is_merged=False`` and a pair is returned.

    Returns:
        The merged frame, or the tuple ``(train_df, test_df)``.
    """
    debug = DEBUG
    if debug:
        template = '../input/debug{}/{}_debug{}.feather'
        train_path = template.format(debug, 'train_translated', debug)
        test_path = template.format(debug, 'test_translated', debug)
    else:
        template = '../input/{}.feather'
        train_path = template.format('train_translated')
        test_path = template.format('test_translated')

    print_doing('reading train, test and merge')

    if is_merged:
        merged = read_train_test(train_path, test_path, '.feather',
                                 is_merged=True)
        if debug:
            print(merged.head())
        print_memory()
        return merged

    train_df, test_df = read_train_test(train_path, test_path, '.feather',
                                        is_merged=False)
    if debug:
        print(train_df.head())
        print(test_df.head())
    print_memory()
    return train_df, test_df
def gen_len_title_description_feature(df, todir, ext):
    """Run ``measure_length`` over the title/description text columns.

    Args:
        df: Frame containing the ``title_en``, ``description_en``, ``title``
            and ``description`` columns.
        todir: Output directory forwarded to ``measure_length``.
        ext: Accepted for signature parity with the other ``gen_*`` helpers.
    """
    text_columns = ['title_en', 'description_en', 'title', 'description']
    # NOTE(review): `ext` is not forwarded; '.pickle' is always used here.
    lengths = measure_length(df, selcols=text_columns, todir=todir,
                             ext='.pickle')
    if DEBUG:
        print(df[text_columns].head())
        print(lengths.head())
    del lengths
    gc.collect()
    print_memory()
def gen_mean_deal_probability(df, todir, ext):
    """Generate a 'mean' group-by feature for every column set configured.

    Iterates over ``MINH_LIST_MEAN_DEAL_PROB`` and calls
    ``generate_groupby_by_type_and_columns`` with the ``'mean'`` aggregation
    for each entry.

    Args:
        df: Source frame.
        todir: Output directory forwarded to the generator.
        ext: File extension forwarded to the generator.
    """
    for group_columns in MINH_LIST_MEAN_DEAL_PROB:
        grouped = generate_groupby_by_type_and_columns(
            df, group_columns, 'mean', todir, ext)
        if DEBUG:
            print(df[group_columns].head())
            print(grouped.head())
        # Drop the intermediate result before the next iteration.
        del grouped
        gc.collect()
    print_memory()
def gen_var_price(df, todir, ext):
    """Generate a 'var' group-by feature for every column set configured.

    Iterates over ``MINH_LIST_VAR_PRICE`` and calls
    ``generate_groupby_by_type_and_columns`` with the ``'var'`` aggregation
    for each entry.

    Args:
        df: Source frame.
        todir: Output directory forwarded to the generator.
        ext: File extension forwarded to the generator.
    """
    for group_columns in MINH_LIST_VAR_PRICE:
        grouped = generate_groupby_by_type_and_columns(
            df, group_columns, 'var', todir, ext)
        if DEBUG:
            print(df[group_columns].head())
            print(grouped.head())
        # Drop the intermediate result before the next iteration.
        del grouped
        gc.collect()
    print_memory()
def read_dataset_origin(dataset):
    """Load ``../input/train.csv`` and ``../input/test.csv`` as one frame.

    Args:
        dataset: Not used by this function.

    Returns:
        The result of ``read_train_test`` called with ``is_merged=1``.
    """
    train_path = '../input/train.csv'
    test_path = '../input/test.csv'
    print_doing('reading train, test and merge')
    # NOTE(review): the paths end in .csv while '.feather' is passed as the
    # extension argument -- confirm how read_train_test interprets it.
    merged = read_train_test(train_path, test_path, '.feather', is_merged=1)
    print_memory()
    print(merged.head())
    return merged
def gen_aggregated_kernel(todir, ext):
    """Build the aggregated kernel features from the six input tables.

    Reads the tables with ``read_dataset_aggregated_kernel`` and passes them,
    together with ``todir`` and ``ext``, to
    ``create_aggregated_features_kernel``.

    Args:
        todir: Output directory forwarded to the feature builder.
        ext: File extension forwarded to the feature builder.
    """
    (train, test,
     train_active, test_active,
     train_periods, test_periods) = read_dataset_aggregated_kernel()

    aggregated = create_aggregated_features_kernel(
        train, test, train_active, test_active,
        train_periods, test_periods, todir, ext)

    if DEBUG:
        print(aggregated.head())
        print(aggregated.info())
    del aggregated
    gc.collect()
    print_memory()
def prepare_training(mat_filename, dir_feature, predictors, is_textadded):
    """Assemble sparse train/test matrices from pickled features.

    Loads the base frame with ``load_train_test``, drops ``REMOVED_LIST``
    columns, attaches every feature in ``predictors`` whose pickle exists
    under ``dir_feature``, and optionally stacks the text matrix returned by
    ``get_text_matrix`` to the right of the tabular columns.

    Args:
        mat_filename: Passed to ``get_text_matrix`` when text is added.
        dir_feature: Directory prefix; ``<dir_feature><feature>.pickle`` is
            looked up for each feature name.
        predictors: Iterable of feature names to attach.
        is_textadded: Whether to stack the text matrix.

    Returns:
        ``(X, y, testing, tfvocab, tabular_columns, testdex)`` where ``X`` and
        ``testing`` are the stacked matrices for the ``traindex`` and
        ``testdex`` rows, ``tfvocab`` is the full list of column names and
        ``tabular_columns`` is ``df.columns.tolist()``.
    """
    print_header('Load features')
    df, y, len_train, traindex, testdex = load_train_test(
        ['item_id'], TARGET, DEBUG)
    del len_train
    gc.collect()
    df = drop_col(df, REMOVED_LIST)

    # Attach the tabular features one pickle at a time.
    print_doing('add tabular features')
    for feature in predictors:
        feature_path = dir_feature + feature + '.pickle'
        if not os.path.exists(feature_path):
            print('can not find {}. Please check'.format(feature_path))
            continue
        if feature in df:
            print('{} already added'.format(feature))
            continue
        print_doing_in_task('adding {}'.format(feature))
        df = add_feature(df, feature_path)
    print_memory()

    if not is_textadded:
        tfvocab = df.columns.tolist()
        testing = hstack([csr_matrix(df.loc[testdex, :].values)])
        X = hstack([csr_matrix(df.loc[traindex, :].values)])
        return X, y, testing, tfvocab, df.columns.tolist(), testdex

    # Load the text features.
    print_doing_in_task('add text features')
    text_matrix, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

    # The first len(traindex) rows of the text matrix are paired with the
    # train rows; the remainder with the test rows.
    print_doing_in_task('stack')
    n_train = traindex.shape[0]
    X = hstack([
        csr_matrix(df.loc[traindex, :].values),
        text_matrix[0:n_train],
    ])
    testing = hstack([
        csr_matrix(df.loc[testdex, :].values),
        text_matrix[n_train:],
    ])
    print_memory()

    print_doing_in_task('prepare vocab')
    tfvocab = df.columns.tolist() + tfvocab
    for stacked in (X, testing):
        print("{} Rows and {} Cols".format(*stacked.shape))
    print("Feature Names Length: ", len(tfvocab))
    return X, y, testing, tfvocab, df.columns.tolist(), testdex
def do_dataset(dataset):
    """Write one split of the base data and of each feature pickle to HDF5.

    The base train or test frame is stored first via ``add_dataset_to_hdf5``.
    Every ``*.pickle`` in the feature directory whose name does not contain
    ``text_feature_kernel`` is then loaded, sliced at ``len(train_df)`` to
    keep the rows of the requested split, and added to the same store.

    Args:
        dataset: ``'train'`` selects the leading rows; any other value
            selects the trailing rows.
    """
    # NOTE(review): two positional arguments are passed here; confirm this
    # matches the read_dataset signature in use.
    train_df, test_df = read_dataset(False, DEBUG)
    len_train = len(train_df)
    wants_train = dataset == 'train'

    if wants_train:
        df = train_df
        del test_df
    else:
        df = test_df
        del train_df
    gc.collect()

    if DEBUG:
        storename = '../processed_features_debug{}/{}_debug{}.h5'.format(
            DEBUG, dataset, DEBUG)
        featuredir = '../processed_features_debug{}/'.format(DEBUG)
    else:
        storename = '../processed_features/{}.h5'.format(dataset)
        featuredir = '../processed_features/'

    stored = add_dataset_to_hdf5(storename, df)
    if DEBUG:
        print(stored.isnull().sum(axis=0))

    for feature_path in glob.glob(featuredir + '*.pickle'):
        if 'text_feature_kernel' in feature_path:
            continue
        print(feature_path)
        print('\n>> doing', feature_path)
        feature_df = load_pickle(feature_path)
        if DEBUG:
            print(feature_df.tail())

        print_doing('extract')
        if DEBUG:
            print(feature_df.head())
            print(feature_df.tail())

        if wants_train:
            split_df = feature_df.iloc[:len_train]
            if DEBUG:
                print('train: ', feature_df.head())
                print(split_df.isnull().sum(axis=0))
        else:
            split_df = feature_df.iloc[len_train:]
            if DEBUG:
                print('test: ', feature_df.tail())
                print(split_df.isnull().sum(axis=0))

        print('merging...')
        stored = add_dataset_to_hdf5(storename, split_df)
        if DEBUG:
            print(stored.isnull().sum(axis=0))
        print_memory()
def read_dataset():
    """Load the train/test feather files as one merged frame.

    A truthy module-level ``DEBUG`` selects the ``../input/debug<DEBUG>/``
    copies; otherwise the files directly under ``../input/`` are read.

    Returns:
        The result of ``read_train_test`` called with ``is_merged=1``.
    """
    debug = DEBUG
    if debug:
        template = '../input/debug{}/{}_debug{}.feather'
        train_path = template.format(debug, 'train', debug)
        test_path = template.format(debug, 'test', debug)
    else:
        template = '../input/{}.feather'
        train_path = template.format('train')
        test_path = template.format('test')

    print_doing('reading train, test and merge')
    merged = read_train_test(train_path, test_path, '.feather', is_merged=1)
    print_memory()
    print(merged.head())
    return merged
def read_dataset_deal_probability(seed):
    """Load train/test separately and combine them with a local-valid split.

    The two frames are read with ``read_train_test(..., is_merged=0)`` and
    handed to ``find_df_local_valid_and_make_deal_prob_nan`` together with
    ``seed``; that helper's result is returned unchanged.

    Args:
        seed: Forwarded to ``find_df_local_valid_and_make_deal_prob_nan``.

    Returns:
        The frame produced by ``find_df_local_valid_and_make_deal_prob_nan``.
    """
    debug = DEBUG
    if debug:
        template = '../input/debug{}/{}_debug{}.feather'
        train_path = template.format(debug, 'train', debug)
        test_path = template.format(debug, 'test', debug)
    else:
        template = '../input/{}.feather'
        train_path = template.format('train')
        test_path = template.format('test')

    print_doing('reading train, test and merge')
    train_df, test_df = read_train_test(train_path, test_path, '.feather',
                                        is_merged=0)
    combined = find_df_local_valid_and_make_deal_prob_nan(
        train_df, test_df, seed)
    print_memory()
    print(combined.head())
    return combined
def get_svdtruncated_vectorizer(todir):
    """Create (or reload) a 400-component truncated SVD of the text matrix.

    Reads ``<todir>text_feature_kernel.pickle``, which holds a
    ``(matrix, vocab)`` pair. If ``<todir>truncated_text_feature_kernel.pickle``
    already exists it is reloaded; otherwise the SVD is fitted, converted to a
    float32 CSR matrix and pickled together with generated column names
    ``lsa1 .. lsaN``. Shapes are printed either way; nothing is returned.

    Args:
        todir: Directory prefix (concatenated directly with the file names).
    """
    print_doing('doing svdtruncated text feature')
    source_path = todir + 'text_feature_kernel.pickle'
    target_path = todir + 'truncated_text_feature_kernel.pickle'

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with pickles produced by this pipeline.
    if os.path.exists(target_path):
        print('done already...')
        with open(target_path, "rb") as handle:
            svd_matrix, vocab = pickle.load(handle)
        with open(source_path, "rb") as handle:
            tfid_matrix, tfvocab = pickle.load(handle)
    else:
        with open(source_path, "rb") as handle:
            tfid_matrix, tfvocab = pickle.load(handle)
        reducer = TruncatedSVD(n_components=400)
        print_doing_in_task('truncated svd')
        svd_matrix = reducer.fit_transform(tfid_matrix)
        print_doing_in_task('convert to sparse')
        svd_matrix = sparse.csr_matrix(svd_matrix, dtype=np.float32)
        n_components = np.shape(svd_matrix)[1]
        vocab = ['lsa' + str(idx + 1) for idx in range(n_components)]
        with open(target_path, "wb") as handle:
            pickle.dump((svd_matrix, vocab), handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    print('---- before truncate')
    print(tfid_matrix.shape)
    print('len of feature:', len(tfvocab))
    print('---- after truncate')
    print(svd_matrix.shape)
    print('len of feature:', len(vocab))
    if DEBUG:
        print(tfid_matrix)
        print('\n')
        print(svd_matrix)

    del svd_matrix, vocab, tfid_matrix, tfvocab
    gc.collect()
    print_memory()
def train(X, y, num_leave, max_depth, full_predictors, categorical,
          predictors, boosting_type, option):
    """Train one LightGBM regressor and record its scores for tuning.

    Holds out 10% of ``(X, y)`` for validation (split seeded with ``SEED``),
    trains with early stopping and stores running time, best iteration and
    train/validation RMSE in ``LOCAL_TUNE_RESULT`` under the key
    ``'<boosting_type>_<num_leave>_<max_depth>'``.

    Args:
        X: Feature matrix.
        y: Target values.
        num_leave: Maximum number of leaves per tree.
        max_depth: Maximum tree depth.
        full_predictors: Feature names passed to ``lgb.Dataset``.
        categorical: Categorical feature names passed to ``lgb.Dataset``.
        predictors: Unused here; kept for interface compatibility.
        boosting_type: LightGBM ``boosting_type`` value.
        option: Unused here; kept for interface compatibility.

    Returns:
        None. Results are written to ``LOCAL_TUNE_RESULT``.
    """
    print_header("Training")
    start_time = time.time()
    print_doing_in_task('prepare dataset...')
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=SEED)
    print('training shape: {} \n'.format(X.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': max_depth,
        # LightGBM's parameter is 'num_leaves'; the previous key 'num_leave'
        # is not a recognised alias, so the tuned value was silently dropped.
        'num_leaves': num_leave,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'lambda_l1': 10,
        'max_bin': 512,
        'verbose': -1
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    # Debug runs use a much smaller boosting budget.
    if DEBUG:
        num_boost_round = 300
        early_stopping_rounds = 10
    else:
        num_boost_round = 20000
        early_stopping_rounds = 100

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()

    print_header("Model Report")
    runnning_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration

    print_doing_in_task('fit val')
    # Computed once and reused for both the stored and the printed score.
    val_rmse_value = np.sqrt(
        metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
    val_rmse = '{0:.4f}'.format(val_rmse_value)

    print_doing_in_task('fit train')
    train_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_train,
                                           lgb_clf.predict(X_train))))

    print_header("Model Report")
    print('boosting_type {}, num_leave {}, max_depth {}'.format(
        boosting_type, num_leave, max_depth))
    print('model training time: {0:.2f} mins'.format(
        (time.time() - start_time) / 60))
    print('num_boost_rounds_lgb: {}'.format(lgb_clf.best_iteration))
    print('best rmse: {0:.4f}'.format(val_rmse_value))

    model = '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
    LOCAL_TUNE_RESULT['running_time'][model] = runnning_time
    LOCAL_TUNE_RESULT['num_round'][model] = num_boost_rounds_lgb
    LOCAL_TUNE_RESULT['train'][model] = train_rmse
    LOCAL_TUNE_RESULT['val'][model] = val_rmse
def gen_label_encode(df, todir, ext):
    """Run ``create_label_encode`` on ``df`` and release the result.

    Args:
        df: Source frame.
        todir: Output directory forwarded to ``create_label_encode``.
        ext: File extension forwarded to ``create_label_encode``.
    """
    encoded = create_label_encode(df, todir, ext)
    if DEBUG:
        print(df.head())
        print(encoded.info())
    del encoded
    gc.collect()
    print_memory()
def DO(mat_filename, storename, num_leaves, max_depth, option, boosting_type):
    """Cross-validate, train and save a LightGBM regressor.

    Reads the processed features from ``storename``, log-transforms
    ``price``, stacks the text matrix from ``mat_filename`` to the right of
    the tabular predictors, runs 10-fold ``lgb.cv`` to pick the number of
    boosting rounds, retrains on all rows and saves the model to
    ``<modelfilename>.txt``.

    Args:
        mat_filename: Passed to ``get_text_matrix``.
        storename: Passed to ``read_processed_h5``.
        num_leaves: LightGBM ``num_leaves``.
        max_depth: LightGBM ``max_depth``.
        option: Only used in the output file names.
        boosting_type: LightGBM ``boosting_type``.

    Returns:
        None. The trained model is written to disk.
    """
    frac = FRAC
    print('------------------------------------------------')
    print('start...')
    print('fraction:', frac)

    print('prepare predictors, categorical and target...')
    predictors = PREDICTORS
    print(predictors)
    categorical = get_categorical(predictors)
    target = TARGET

    # NOTE: `frac` only appears in the file names; no sampling is applied.
    run_name = yearmonthdate_string + '_' + str(len(predictors)) + \
        'features_' + boosting_type + '_cv_' + str(int(100 * frac)) + \
        'percent_full_%d_%d' % (num_leaves, max_depth) + \
        '_OPTION' + str(option)
    subfilename = run_name + '.csv.gz'
    modelfilename = run_name

    print('----------------------------------------------------------')
    print('SUMMARY:')
    print('----------------------------------------------------------')
    print('predictors:', predictors)
    print('number of predictors: {} \n'.format(len(predictors)))
    print('categorical', categorical)
    # This count was previously logged as "number of predictors".
    print('number of categorical: {} \n'.format(len(categorical)))
    print('taget {} \n'.format(target))
    print('submission file name: {} \n'.format(subfilename))
    print('model file name: {} \n'.format(modelfilename))
    print('----------------------------------------------------------')

    train_df = read_processed_h5(storename, predictors + target, categorical)
    print(train_df.info())
    print(train_df.head())
    train_df["price"] = np.log(train_df["price"] + 0.001)
    # Assign back instead of a chained in-place fillna, which may operate on
    # a temporary copy of the column.
    train_df["price"] = train_df["price"].fillna(-999)
    print(train_df.head())
    print(train_df.info())

    print_memory('afer reading train:')
    print(train_df.head())
    print("train size: ", len(train_df))
    gc.collect()

    print_doing('cleaning train...')
    train_df_array = train_df[predictors].values
    # Keep the labels as floats: this is a regression objective, and an
    # integer cast would truncate any fractional target values.
    train_df_labels = train_df[target].values.astype('float').flatten()
    del train_df
    gc.collect()
    print_memory()

    print_doing('reading text matrix')
    train_mat_text, tfvocab = get_text_matrix(
        mat_filename, 'train', DEBUG, train_df_array.shape[0])
    print_memory()

    print_doing('stack two matrix')
    train_df_array = hstack([csr_matrix(train_df_array), train_mat_text])
    print_memory()

    # Feature names must cover the tabular columns followed by the text ones.
    new_predictors = tfvocab
    predictors = predictors + new_predictors
    del train_mat_text
    gc.collect()

    print('----------------------------------------------------------')
    print("Training...")
    start_time = time.time()

    params = {
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.02,
        'num_leaves': num_leaves,  # should be smaller than 2^(max_depth)
        'max_depth': max_depth,  # -1 means no limit
        'subsample': 0.9,  # subsample ratio of the training instances
        'subsample_freq': 1,  # frequency of subsampling, <=0 disables it
        'feature_fraction': 0.9,  # column subsample ratio per tree
        'nthread': 4,
        'verbose': 0
    }

    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(train_df_array, label=train_df_labels,
                             feature_name=predictors,
                             categorical_feature=categorical)
    del train_df_array, train_df_labels
    gc.collect()
    print_memory()

    print(params)
    print('>> start cv...')
    # stratified=False: stratified folds are for classification labels and
    # cannot be built from a continuous target (same setting as cv_train).
    cv_results = lgb.cv(params,
                        dtrain_lgb,
                        categorical_feature=categorical,
                        num_boost_round=20000,
                        metrics='rmse',
                        seed=SEED,
                        shuffle=False,
                        nfold=10,
                        show_stdv=True,
                        early_stopping_rounds=100,
                        stratified=False,
                        verbose_eval=50)

    print('[{}]: model training time'.format(time.time() - start_time))
    print_memory()

    print('--------------------------------------------------------------------')
    # With early stopping the history length is the chosen number of rounds.
    num_boost_rounds_lgb = len(cv_results['rmse-mean'])
    print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))

    print('>> start trainning... ')
    model_lgb = lgb.train(
        params, dtrain_lgb,
        num_boost_round=num_boost_rounds_lgb,
        feature_name=predictors,
        categorical_feature=categorical)
    del dtrain_lgb
    gc.collect()

    print('--------------------------------------------------------------------')
    print('>> save model...')
    model_lgb.save_model(modelfilename + '.txt')
def _text_dense_savename(dataset, source_file):
    """Return the output path for the dense text matrix of ``source_file``."""
    language_tag = '_en' if '_en' in source_file else ''
    if DEBUG:
        return '../processed_features_debug{}/{}_text_dense{}_debug{}.pickle'.format(
            DEBUG, dataset, language_tag, DEBUG)
    return '../processed_features/{}_text_dense{}.h5'.format(
        dataset, language_tag)


def do_dataset(dataset):
    """Split the base data and every feature pickle into one dataset part.

    The base train or test frame is stored first via ``add_dataset_to_hdf5``.
    Each ``*.pickle`` in the feature directory whose name contains neither
    ``train`` nor ``test`` is then processed:

    * ``text_feature_kernel`` files are densified, sliced at
      ``len(train_df)`` and saved with ``save_file`` (skipped when the
      output already exists);
    * all other files are sliced the same way and added to the HDF5 store.

    Args:
        dataset: ``'train'`` keeps the leading rows; any other value keeps
            the trailing rows.
    """
    train_df, test_df = read_dataset(False)
    len_train = len(train_df)
    wants_train = dataset == 'train'

    if wants_train:
        df = train_df
        del test_df
    else:
        df = test_df
        del train_df
    gc.collect()

    if DEBUG:
        storename = '../processed_features_debug{}/{}_debug{}.h5'.format(
            DEBUG, dataset, DEBUG)
        featuredir = '../processed_features_debug{}/'.format(DEBUG)
    else:
        storename = '../processed_features/{}.h5'.format(dataset)
        featuredir = '../processed_features/'

    add_dataset_to_hdf5(storename, df)

    for feature_path in glob.glob(featuredir + '*.pickle'):
        # Skip per-split outputs written by earlier runs of this function.
        if 'train' in feature_path or 'test' in feature_path:
            continue

        print(feature_path)
        print('\n>> doing', feature_path)

        if 'text_feature_kernel' in feature_path:
            savename = _text_dense_savename(dataset, feature_path)
            if os.path.exists(savename):
                print('done already')
                continue

            mat = load_pickle(feature_path)
            mat = mat.todense()
            if DEBUG:
                print(mat.shape, np.sum(mat))

            print_doing('extract')
            if DEBUG:
                print(mat[0:5, 0:7])
                print(mat[-5:, 0:7])

            if wants_train:
                mat = mat[:len_train, :]
                if DEBUG:
                    print(mat.shape, np.sum(mat))
                    # Previously print() was nested inside print(), which
                    # emitted the matrix first and then "train:  None".
                    print('train: ', mat[0:5, 0:7])
            else:
                mat = mat[len_train:, :]
                print(mat.shape, np.sum(mat))
                if DEBUG:
                    print(mat.shape, np.sum(mat))
                    print('test: ', mat[-5:, 0:7])
                print(mat)

            print('merging...')
            # NOTE(review): outside DEBUG the save name ends in .h5 while
            # '.pickle' is passed as the extension -- confirm with save_file.
            save_file(mat, savename, '.pickle')
            print_memory()
        else:
            feature_df = load_pickle(feature_path)
            print_doing('extract')
            if DEBUG:
                print(feature_df.head())
                print(feature_df.tail())

            if wants_train:
                feature_df = feature_df.iloc[:len_train]
                if DEBUG:
                    print('train: ', feature_df.head())
            else:
                feature_df = feature_df.iloc[len_train:]
                if DEBUG:
                    print('test: ', feature_df.tail())

            print('merging...')
            add_dataset_to_hdf5(storename, feature_df)
            print_memory()
def train(X, y, num_leave, full_predictors, categorical, predictors,
          boosting_type, option, seed):
    """Train a LightGBM regressor for one seed and log local validation.

    20% of the data is first set aside as a local validation set; the
    remainder is split 90/10 into train and early-stopping validation sets.
    Both splits use ``seed``. The model is saved and all scores are recorded
    in ``LOCAL_VALIDATION_RESULT`` under ``'seed_<seed>'``.

    Args:
        X: Feature matrix.
        y: Target values.
        num_leave: Used in the output file names.
        full_predictors: Feature names passed to ``lgb.Dataset``.
        categorical: Categorical feature names passed to ``lgb.Dataset``.
        predictors: Only its length is used (in the file names).
        boosting_type: LightGBM ``boosting_type`` value.
        option: Used in the file names and printed.
        seed: Random state for both splits.

    Returns:
        ``(lgb_clf, subfilename)``: the trained booster and the submission
        file path.
    """
    # NOTE(review): num_leave only appears in the file names; it is not
    # part of lgbm_params below -- confirm this is intended.
    name_parts = (yearmonthdate_string, boosting_type, str(len(predictors)),
                  num_leave, option)
    if DEBUG:
        subfilename = '../sub/debug_findseed_{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            *name_parts)
        modelfilename = '../trained_models/debug_findseed_{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            *name_parts)
    else:
        subfilename = '../sub/findseed_{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            *name_parts)
        modelfilename = '../trained_models/findseed_{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            *name_parts)

    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')
    X_rest, X_local_valid, y_rest, y_local_valid = train_test_split(
        X, y, test_size=0.2, random_state=seed)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=0.10, random_state=seed)
    print('training shape: {} \n'.format(X_rest.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 15,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'verbose': 0
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    # Debug runs use a much smaller boosting budget.
    num_boost_round, early_stopping_rounds = (300, 10) if DEBUG else (20000, 30)

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()

    def formatted_rmse(features, expected):
        # RMSE of the booster's predictions, rendered with four decimals.
        error = metrics.mean_squared_error(expected, lgb_clf.predict(features))
        return '{0:.4f}'.format(np.sqrt(error))

    print_header("Model Report")
    runnning_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration

    print_doing_in_task('fit val')
    val_rmse = formatted_rmse(X_valid, y_valid)
    print_doing_in_task('fit train')
    train_rmse = formatted_rmse(X_train, y_train)
    print_doing_in_task('fit local val')
    local_valid_rmse = formatted_rmse(X_local_valid, y_local_valid)

    # Absolute gap between the local score and the fixed reference 0.2300.
    diff_lb = '{0:.4f}'.format(abs(float(local_valid_rmse) - 0.2300))

    print('OPTION', option)
    print('model training time: {} mins'.format(runnning_time))
    print('seed number: {}'.format(seed))
    print('num_boost_rounds_lgb: {}'.format(num_boost_rounds_lgb))
    print('train rmse: {}'.format(train_rmse))
    print('val rmse: {}'.format(val_rmse))
    print('local valid rmse: {}'.format(local_valid_rmse))
    print('diff comapred to lb: {}'.format(diff_lb))

    print('saving model to', modelfilename)
    lgb_clf.save_model(modelfilename)

    seed_name = 'seed_' + str(seed)
    recorded = (
        ('seed', seed),
        ('running_time', runnning_time),
        ('num_round', num_boost_rounds_lgb),
        ('train', train_rmse),
        ('val', val_rmse),
        ('local_test', local_valid_rmse),
        ('diff', diff_lb),
    )
    for column, value in recorded:
        LOCAL_VALIDATION_RESULT[column][seed_name] = value

    return lgb_clf, subfilename
def train(X, y, num_leave, full_predictors, categorical, predictors,
          boosting_type, option):
    """Train a LightGBM regressor with a 10% hold-out and save it.

    Args:
        X: Feature matrix.
        y: Target values.
        num_leave: Used in the output file names.
        full_predictors: Feature names passed to ``lgb.Dataset``.
        categorical: Categorical feature names passed to ``lgb.Dataset``.
        predictors: Only its length is used (in the file names).
        boosting_type: LightGBM ``boosting_type`` value.
        option: Used in the file names.

    Returns:
        ``(lgb_clf, subfilename)``: the trained booster and the submission
        file path.
    """
    # NOTE(review): num_leave only appears in the file names; it is not
    # part of lgbm_params below -- confirm this is intended.
    name_parts = (yearmonthdate_string, boosting_type, str(len(predictors)),
                  num_leave, option)
    if DEBUG:
        subfilename = '../sub/debug_{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            *name_parts)
        modelfilename = '../trained_models/debug_{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            *name_parts)
    else:
        subfilename = '../sub/{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            *name_parts)
        modelfilename = '../trained_models/{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            *name_parts)

    print('\n----------------------------------------------------------')
    print("Training...")
    print('----------------------------------------------------------')

    start_time = time.time()

    print('>> prepare dataset...')
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=SEED)
    print(X.shape)

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 15,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'verbose': 0
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    # Debug runs use a much smaller boosting budget.
    num_boost_round, early_stopping_rounds = (300, 10) if DEBUG else (20000, 30)

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()

    print('--------------------------------------------------------------------')
    print("Model Report")
    elapsed_minutes = (time.time() - start_time) / 60
    print('model training time: {0:.2f} mins'.format(elapsed_minutes))
    print('num_boost_rounds_lgb: {}'.format(lgb_clf.best_iteration))
    valid_mse = metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))
    print('best rmse: {0:.4f}'.format(np.sqrt(valid_mse)))

    print('saving model to', modelfilename)
    lgb_clf.save_model(modelfilename)

    return lgb_clf, subfilename
def gen_time_feature(df, todir, ext):
    """Run ``create_time`` on ``df`` and release the result.

    Args:
        df: Source frame; its ``activation_date`` column is printed in
            debug mode.
        todir: Output directory forwarded to ``create_time``.
        ext: File extension forwarded to ``create_time``.
    """
    time_features = create_time(df, todir=todir, ext=ext)
    if DEBUG:
        print(df['activation_date'].head())
        print(time_features.head())
    del time_features
    gc.collect()
    print_memory()
def gen_text_feature_from_kernel(df, todir, ext, language, max_features):
    """Delegate to ``create_text_feature`` and report memory usage.

    Args:
        df: Source frame.
        todir: Output directory forwarded to ``create_text_feature``.
        ext: File extension forwarded to ``create_text_feature``.
        language: Forwarded to ``create_text_feature``.
        max_features: Forwarded to ``create_text_feature``.
    """
    # All arguments are passed through positionally, in the same order.
    forwarded = (df, todir, ext, language, max_features)
    create_text_feature(*forwarded)
    print_memory()
def cv_train(X, y, num_leave, full_predictors, categorical, predictors,
             boosting_type, option):
    """Pick the boosting rounds with 10-fold CV, then train and save.

    ``lgb.cv`` is run with early stopping; the length of its ``rmse-mean``
    history is used as the number of rounds for the final ``lgb.train`` on
    the full dataset.

    Args:
        X: Feature matrix.
        y: Target values.
        num_leave: LightGBM ``num_leaves``; also used in the file names.
        full_predictors: Feature names passed to LightGBM.
        categorical: Categorical feature names passed to LightGBM.
        predictors: Only its length is used (in the file names).
        boosting_type: LightGBM ``boosting_type`` value.
        option: Used in the file names.

    Returns:
        ``(model_lgb, subfilename)``: the trained booster and the submission
        file path.
    """
    name_parts = (yearmonthdate_string, boosting_type, str(len(predictors)),
                  num_leave, option)
    if DEBUG:
        subfilename = '../sub/debug_{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            *name_parts)
        modelfilename = '../trained_models/debug_{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            *name_parts)
    else:
        subfilename = '../sub/{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            *name_parts)
        modelfilename = '../trained_models/{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            *name_parts)

    print('\n----------------------------------------------------------')
    print("Training...")
    print('----------------------------------------------------------')

    start_time = time.time()

    params = {
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.2,
        'num_leaves': num_leave,  # should be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'subsample': 0.8,  # subsample ratio of the training instances
        'subsample_freq': 1,  # frequency of subsampling, <=0 disables it
        'feature_fraction': 0.8,  # column subsample ratio per tree
        'nthread': 4,
        'verbose': 0
    }

    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(X, y,
                             feature_name=full_predictors,
                             categorical_feature=categorical)
    print_memory()

    print('params', params)
    print('\n>> start cv...')

    # Debug runs use a much smaller boosting budget.
    num_boost_round, early_stopping_rounds = (300, 10) if DEBUG else (20000, 30)

    cv_results = lgb.cv(params,
                        dtrain_lgb,
                        categorical_feature=categorical,
                        num_boost_round=num_boost_round,
                        metrics='rmse',
                        seed=SEED,
                        shuffle=False,
                        nfold=10,
                        show_stdv=True,
                        early_stopping_rounds=early_stopping_rounds,
                        stratified=False,
                        verbose_eval=5)

    print('[{}]: model training time'.format(time.time() - start_time))
    print_memory()

    print('--------------------------------------------------------------------')
    print("Model Report")
    rmse_history = cv_results['rmse-mean']
    num_boost_rounds_lgb = len(rmse_history)
    print('num_boost_rounds_lgb = ' + str(num_boost_rounds_lgb))
    print('best rmse = {0:.4f}'.format(rmse_history[num_boost_rounds_lgb - 1]))

    print('>> start trainning... ')
    model_lgb = lgb.train(params,
                          dtrain_lgb,
                          num_boost_round=num_boost_rounds_lgb,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    del dtrain_lgb
    gc.collect()

    print('saving model to', modelfilename)
    model_lgb.save_model(modelfilename)

    return model_lgb, subfilename