Example #1
def prepare_training(mat_filename, dir_feature, predictors, is_textadded):
    print_header('Load features')
    df, y, len_train, traindex, testdex = load_train_test(['item_id'], TARGET,
                                                          DEBUG)
    del len_train
    gc.collect()
    df = drop_col(df, REMOVED_LIST)

    # add features
    print_doing('add tabular features')
    for feature in predictors:
        dir_feature_file = dir_feature + feature + '.pickle'
        if not os.path.exists(dir_feature_file):
            print('can not find {}. Please check'.format(dir_feature_file))
        else:
            if feature in df:
                print('{} already added'.format(feature))
            else:
                print_doing_in_task('adding {}'.format(feature))
                df = add_feature(df, dir_feature_file)
    print_memory()

    if is_textadded:
        # add text_feature
        print_doing_in_task('add text features')
        ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

        # stack
        print_doing_in_task('stack')
        X = hstack([
            csr_matrix(df.loc[traindex, :].values),
            ready_df[0:traindex.shape[0]]
        ])  # Sparse Matrix
        testing = hstack([
            csr_matrix(df.loc[testdex, :].values), ready_df[traindex.shape[0]:]
        ])
        print_memory()

        print_doing_in_task('prepare vocab')
        tfvocab = df.columns.tolist() + tfvocab
        for shape in [X, testing]:
            print("{} Rows and {} Cols".format(*shape.shape))
        print("Feature Names Length: ", len(tfvocab))

    else:
        tfvocab = df.columns.tolist()
        testing = hstack([csr_matrix(df.loc[testdex, :].values)])
        X = hstack([csr_matrix(df.loc[traindex, :].values)])  # Sparse Matrix

    return X, y, testing, tfvocab, df.columns.tolist(), testdex
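A minimal usage sketch for the function above; the file paths and the feature list below are placeholder assumptions for illustration, not values from the original project.

# Hypothetical call site -- MAT_FILENAME, DIR_FEATURE and PREDICTORS are
# placeholder values, not taken from the original script.
MAT_FILENAME = 'text_feature_kernel.pickle'
DIR_FEATURE = '../processed_features/'
PREDICTORS = ['price', 'image_top_1', 'item_seq_number']

X, y, testing, tfvocab, tabular_cols, testdex = prepare_training(
    MAT_FILENAME, DIR_FEATURE, PREDICTORS, is_textadded=True)
print('train:', X.shape, 'test:', testing.shape, 'vocab size:', len(tfvocab))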
Example #2

        # tail of a loop whose beginning is missing from this snippet:
        # build the list of engineered feature column names to load from the HDF5 stores
        feature_name = feature + suffix
        cols = cols + [feature_name]
gp = read_from_h5(storename_train, storename_test, cols, categorical)

for feature in cols:
    df[feature] = gp[feature].values

# df = pd.concat([df, gp], axis=1)

print(df.info())
print(df.head())
del gp, cols
gc.collect()

ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

print(ready_df.shape)
print(ready_df)
# print(tfvocab)

df = df.drop(['description', 'param_1', 'param_2', 'param_3', 'title'], axis=1)

# df_array = df.values

print(df.head()); print(df.info())

############################################

# X_full = hstack([csr_matrix(df.values),ready_df])
# X = X_full[0:traindex.shape[0]]
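For reference, a self-contained sketch (toy shapes, scipy only) of the stack-then-slice pattern that the commented-out lines above describe:

# Toy illustration of hstack + row slicing; the shapes here are made up.
import numpy as np
from scipy.sparse import csr_matrix, hstack

n_train, n_test = 3, 2
tabular = np.arange((n_train + n_test) * 4).reshape(n_train + n_test, 4)  # dense tabular block
text = csr_matrix(np.ones((n_train + n_test, 5)))                         # sparse text block

X_full = hstack([csr_matrix(tabular), text]).tocsr()  # combined (5, 9) matrix
X = X_full[:n_train]        # training rows
testing = X_full[n_train:]  # test rows
print(X.shape, testing.shape)  # (3, 9) (2, 9)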
Example #3
def DO(mat_filename, storename, num_leaves, max_depth, option, boosting_type):
    frac = FRAC
    print('------------------------------------------------')
    print('start...')
    print('fraction:', frac)
    print('prepare predictors, categorical and target...')
    predictors = PREDICTORS

    print(predictors)

    categorical = get_categorical(predictors)
    target = TARGET

    modelfilename = '{}_{}features_{}_cv_{}percent_full_{}_{}_OPTION{}'.format(
            yearmonthdate_string, len(predictors), boosting_type,
            int(100 * frac), num_leaves, max_depth, option)
    subfilename = modelfilename + '.csv.gz'

    print('----------------------------------------------------------')
    print('SUMMARY:')
    print('----------------------------------------------------------')
    print('predictors:',predictors)
    print('number of predictors: {} \n'.format(len(predictors)))
    print('categorical:', categorical)
    print('number of categorical: {} \n'.format(len(categorical)))
    print('target: {} \n'.format(target))
    print('submission file name: {} \n'.format(subfilename))
    print('model file name: {} \n'.format(modelfilename))
    # print('fraction:', frac)
    # print('option:', option)

    print('----------------------------------------------------------')
    train_df = read_processed_h5(storename, predictors+target, categorical)
    print(train_df.info())
    print(train_df.head())

    train_df["price"] = np.log(train_df["price"]+0.001)
    train_df["price"].fillna(-999,inplace=True)
    # train_df["price"] = train_df["price"].astype('float')
    # train_df["image_top_1"].fillna(-999,inplace=True)

    print(train_df.head()); print(train_df.info())
    # train_df = train_df.sample(frac=frac, random_state = SEED)
    print_memory('after reading train:')
    print(train_df.head())
    print("train size: ", len(train_df))
    gc.collect()

    print_doing('cleaning train...')
    train_df_array = train_df[predictors].values
    train_df_labels = train_df[target].values.astype('int').flatten()
    del train_df; gc.collect()
    print_memory()

    print_doing('reading text matrix')
    train_mat_text, tfvocab = get_text_matrix(mat_filename, 'train', DEBUG, train_df_array.shape[0])
    print_memory()

    print_doing('stack two matrices')
    train_df_array = hstack([csr_matrix(train_df_array),train_mat_text])
    print_memory()
    
    new_predictors = tfvocab
    predictors = predictors + new_predictors
    del train_mat_text; gc.collect()

    

    print('----------------------------------------------------------')
    print("Training...")
    start_time = time.time()

    params = {
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.02,
        'num_leaves': num_leaves,  # we should let it be smaller than 2^(max_depth)
        'max_depth': max_depth,  # -1 means no limit
        'subsample': 0.9,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequency of subsampling, <=0 means disabled
        'feature_fraction': 0.9,  # Subsample ratio of columns when constructing each tree.
        # 'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        # 'subsample_for_bin': 200000,  # Number of samples for constructing bin
        # 'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        # 'reg_alpha': 10,  # L1 regularization term on weights
        # 'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0
    }


    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(train_df_array, label=train_df_labels,
                        feature_name=predictors,
                        categorical_feature=categorical)
    del train_df_array, train_df_labels; gc.collect()                        
    print_memory()   


    print(params)
    print('>> start cv...')

    cv_results  = lgb.cv(params, 
                        dtrain_lgb, 
                        categorical_feature = categorical,
                        num_boost_round=20000,                       
                        metrics='rmse',
                        seed = SEED,
                        shuffle = False,
                        nfold=10, 
                        show_stdv=True,
                        early_stopping_rounds=100, 
                        verbose_eval=50)                     

    print('[{:.2f} s]: cv run time'.format(time.time() - start_time))
    print_memory()


    # print (cv_results)
    print('--------------------------------------------------------------------') 
    num_boost_rounds_lgb = len(cv_results['rmse-mean'])
    print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))

    print('>> start training...')
    model_lgb = lgb.train(
                        params, dtrain_lgb, 
                        num_boost_round=num_boost_rounds_lgb,
                        feature_name = predictors,
                        categorical_feature = categorical)
    del dtrain_lgb
    gc.collect()

    print('--------------------------------------------------------------------') 
    print('>> save model...')
    # save model to file

    # if not DEBUG:
    model_lgb.save_model(modelfilename+'.txt')
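    # (Sketch, not from the original script) the saved .txt model can later be
    # reloaded for inference with LightGBM's Booster API, for example:
    #   booster = lgb.Booster(model_file=modelfilename + '.txt')
    #   predictions = booster.predict(test_matrix)  # test_matrix: hypothetical, same feature layout as training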