# Shared imports for this module; repo-local helpers (print_header,
# load_train_test, drop_col, add_feature, get_text_matrix, print_doing,
# print_doing_in_task, print_memory, ...) are defined elsewhere in the repo.
import os
import gc
import time

import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.sparse import csr_matrix, hstack


def prepare_training(mat_filename, dir_feature, predictors, is_textadded):
    print_header('Load features')
    df, y, len_train, traindex, testdex = load_train_test(['item_id'], TARGET, DEBUG)
    del len_train
    gc.collect()

    df = drop_col(df, REMOVED_LIST)

    # add tabular features
    print_doing('add tabular features')
    for feature in predictors:
        dir_feature_file = dir_feature + feature + '.pickle'
        if not os.path.exists(dir_feature_file):
            print('cannot find {}. Please check'.format(dir_feature_file))
        elif feature in df:
            print('{} already added'.format(feature))
        else:
            print_doing_in_task('adding {}'.format(feature))
            df = add_feature(df, dir_feature_file)
            print_memory()

    if is_textadded:
        # add text features
        print_doing_in_task('add text features')
        ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

        # stack the tabular block and the text block into one sparse matrix
        print_doing_in_task('stack')
        X = hstack([csr_matrix(df.loc[traindex, :].values),
                    ready_df[0:traindex.shape[0]]])  # sparse train matrix
        testing = hstack([csr_matrix(df.loc[testdex, :].values),
                          ready_df[traindex.shape[0]:]])
        print_memory()

        print_doing_in_task('prepare vocab')
        tfvocab = df.columns.tolist() + tfvocab
        for shape in [X, testing]:
            print('{} Rows and {} Cols'.format(*shape.shape))
        print('Feature Names Length: ', len(tfvocab))
    else:
        tfvocab = df.columns.tolist()
        testing = csr_matrix(df.loc[testdex, :].values)
        X = csr_matrix(df.loc[traindex, :].values)  # sparse train matrix

    return X, y, testing, tfvocab, df.columns.tolist(), testdex
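# The repo-local helper add_feature() used above is defined elsewhere. A
# minimal sketch of its plausible behaviour, assuming each feature pickle
# holds a pandas DataFrame/Series aligned on the same item_id index as df;
# the name add_feature_sketch and the join-on-index logic are illustrative
# assumptions, not the repo's confirmed implementation.
def add_feature_sketch(df, feature_pickle_path):
    import pickle
    with open(feature_pickle_path, 'rb') as f:
        feature_df = pickle.load(f)  # one pickled feature table
    # left-join on the shared index so every train/test row keeps its value
    return df.join(feature_df, how='left')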
# Merge extra feature columns from the train/test HDF5 stores into df,
# then load the text matrix and drop the raw text columns.
feature_name = feature + suffix
cols = cols + [feature_name]
gp = read_from_h5(storename_train, storename_test, cols, categorical)
for feature in cols:
    df[feature] = gp[feature].values
# df = pd.concat([df, gp], axis=1)
print(df.info())
print(df.head())
del gp, cols
gc.collect()

ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)
print(ready_df.shape)
print(ready_df)
# print(tfvocab)

# the raw text columns are now represented by the sparse text matrix
df = df.drop(['description', 'param_1', 'param_2', 'param_3', 'title'], axis=1)
# df_array = df.values
print(df.head())
print(df.info())

############################################
# X_full = hstack([csr_matrix(df.values), ready_df])
# X = X_full[0:traindex.shape[0]]
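# get_text_matrix() is likewise defined elsewhere in the repo. A hedged
# sketch, assuming the matrix file is a pickled (sparse matrix, vocabulary)
# pair and that the last two arguments are a debug flag and a row cutoff;
# both assumptions are inferred only from the call sites in this file.
def get_text_matrix_sketch(mat_filename, which, debug, num_rows):
    import pickle
    from scipy.sparse import csr_matrix
    with open(mat_filename, 'rb') as f:
        mat, vocab = pickle.load(f)  # sparse text matrix + feature names
    mat = csr_matrix(mat)
    if which == 'train' and num_rows > 0:
        mat = mat[:num_rows]  # keep only the train-set rows
    return mat, list(vocab)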
def DO(mat_filename, storename, num_leaves, max_depth, option, boosting_type):
    frac = FRAC
    print('------------------------------------------------')
    print('start...')
    print('fraction:', frac)
    print('prepare predictors, categorical and target...')
    predictors = PREDICTORS
    print(predictors)
    categorical = get_categorical(predictors)
    target = TARGET

    subfilename = yearmonthdate_string + '_' + str(len(predictors)) + \
        'features_' + boosting_type + '_cv_' + str(int(100 * frac)) + \
        'percent_full_%d_%d' % (num_leaves, max_depth) + \
        '_OPTION' + str(option) + '.csv.gz'
    modelfilename = yearmonthdate_string + '_' + str(len(predictors)) + \
        'features_' + boosting_type + '_cv_' + str(int(100 * frac)) + \
        'percent_full_%d_%d' % (num_leaves, max_depth) + \
        '_OPTION' + str(option)

    print('----------------------------------------------------------')
    print('SUMMARY:')
    print('----------------------------------------------------------')
    print('predictors:', predictors)
    print('number of predictors: {} \n'.format(len(predictors)))
    print('categorical:', categorical)
    print('number of categorical: {} \n'.format(len(categorical)))
    print('target: {} \n'.format(target))
    print('submission file name: {} \n'.format(subfilename))
    print('model file name: {} \n'.format(modelfilename))
    # print('fraction:', frac)
    # print('option:', option)
    print('----------------------------------------------------------')

    train_df = read_processed_h5(storename, predictors + target, categorical)
    print(train_df.info())
    print(train_df.head())

    # log-transform price (the 0.001 offset keeps zero prices finite),
    # then flag missing prices with a sentinel value
    train_df['price'] = np.log(train_df['price'] + 0.001)
    train_df['price'].fillna(-999, inplace=True)
    # train_df['price'] = train_df['price'].astype('float')
    # train_df['image_top_1'].fillna(-999, inplace=True)
    print(train_df.head())
    print(train_df.info())
    # train_df = train_df.sample(frac=frac, random_state=SEED)
    print_memory('after reading train:')
    print(train_df.head())
    print('train size:', len(train_df))
    gc.collect()

    print_doing('cleaning train...')
    train_df_array = train_df[predictors].values
    train_df_labels = train_df[target].values.astype('int').flatten()
    del train_df
    gc.collect()
    print_memory()

    print_doing('reading text matrix')
    train_mat_text, tfvocab = get_text_matrix(mat_filename, 'train', DEBUG,
                                              train_df_array.shape[0])
    print_memory()

    print_doing('stack the two matrices')
    train_df_array = hstack([csr_matrix(train_df_array), train_mat_text])
    print_memory()
    new_predictors = tfvocab
    predictors = predictors + new_predictors
    del train_mat_text
    gc.collect()

    print('----------------------------------------------------------')
    print('Training...')
    start_time = time.time()
    params = {
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.02,
        'num_leaves': num_leaves,  # should stay smaller than 2**max_depth
        'max_depth': max_depth,    # -1 means no limit
        'subsample': 0.9,          # subsample ratio of the training instances
        'subsample_freq': 1,       # frequency of subsampling; <=0 disables it
        'feature_fraction': 0.9,   # column subsample ratio per tree
        # 'min_child_weight': 0,       # minimum sum of hessian needed in a leaf
        # 'subsample_for_bin': 200000, # number of samples for constructing bins
        # 'min_split_gain': 0,         # min_gain_to_split; with reg_alpha/reg_lambda for regularization
        # 'reg_alpha': 10,             # L1 regularization term on weights
        # 'reg_lambda': 0,             # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }

    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(train_df_array, label=train_df_labels,
                             feature_name=predictors,
                             categorical_feature=categorical)
    del train_df_array, train_df_labels
    gc.collect()
    print_memory()
    print(params)

    print('>> start cv...')
    cv_results = lgb.cv(params,
                        dtrain_lgb,
                        categorical_feature=categorical,
                        num_boost_round=20000,
                        metrics='rmse',
                        seed=SEED,
                        shuffle=False,
                        nfold=10,
                        show_stdv=True,
                        early_stopping_rounds=100,
                        verbose_eval=50)
    print('[{}]: model training time'.format(time.time() - start_time))
    print_memory()
    # print(cv_results)

    print('--------------------------------------------------------------------')
    num_boost_rounds_lgb = len(cv_results['rmse-mean'])
    print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))

    print('>> start training...')
    model_lgb = lgb.train(params,
                          dtrain_lgb,
                          num_boost_round=num_boost_rounds_lgb,
                          feature_name=predictors,
                          categorical_feature=categorical)
    del dtrain_lgb
    gc.collect()

    print('--------------------------------------------------------------------')
    print('>> save model...')
    # if not DEBUG:
    model_lgb.save_model(modelfilename + '.txt')
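# Example driver, assuming FRAC, PREDICTORS, TARGET, SEED, DEBUG and the
# helper functions above are defined at module level; both file names below
# are illustrative placeholders, not paths from the repo.
if __name__ == '__main__':
    DO(mat_filename='text_matrix.pickle',  # hypothetical path
       storename='train_processed.h5',     # hypothetical path
       num_leaves=255,
       max_depth=8,
       option=1,
       boosting_type='gbdt')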