def ridge_submit():
    """Fit a default Ridge model on recent rows and write a submission CSV.

    Steps, in order:

    1. Read ``../result_tmp/scaled_train_DateBlockNum.csv`` and keep the rows
       with ``30 < date_block_num <= 33``.
    2. Fit ``Ridge()`` on every column except ``item_cnt_month`` (the target)
       and ``date_block_num``.
    3. Predict on the frame returned by ``load_test_data()`` with its ``ID``
       column removed, cast the predictions to float16 and clip to [0, 20].
    4. Store them in the ``item_cnt_month`` column of the frame returned by
       ``load_submission()`` and write it to
       ``../result_tmp/submit_180902_31-33_ridge.csv``.

    Progress is logged through the logger returned by ``mylogger()``.
    No hold-out evaluation is performed. Returns nothing.
    """
    logger = mylogger()
    logger.info('RidgeRegression start')

    # ---- training data -------------------------------------------------
    logger.debug('make_train_data start')
    # An earlier variant read '../result_tmp/scaled_train.csv' instead.
    full_train = pd.read_csv('../result_tmp/scaled_train_DateBlockNum.csv')
    # Training window: the most recent 3 months (per the original note).
    # An earlier variant kept only date_block_num == 33 (most recent month).
    block_num = full_train['date_block_num']
    in_window = (30 < block_num) & (block_num <= 33)
    train = full_train.loc[in_window]
    # date_block_num is only used for the window filter, not as a feature.
    X = train.drop(columns=['item_cnt_month', 'date_block_num']).values
    y = train['item_cnt_month']
    logger.debug('make_train_data end')

    # ---- fit -----------------------------------------------------------
    logger.info('Fitting start')
    ridge = Ridge()
    ridge.fit(X, y)
    logger.debug('Fitting end')

    # ---- predict and write the submission ------------------------------
    logger.info('Scoring start')
    test_data = load_test_data()
    test_features = test_data.drop(columns=['ID']).values
    submission = load_submission()
    predicted = ridge.predict(test_features)
    # Cast first, then clip, exactly as the pipeline has always done.
    half_precision = predicted.astype(np.float16)
    submission['item_cnt_month'] = half_precision.clip(0., 20.)
    submission.to_csv(
        '../result_tmp/submit_180902_31-33_ridge.csv',
        encoding='utf-8-sig',
        index=False,
    )
    preview = submission.head()
    logger.info('submission:\n{}'.format(preview))

    logger.debug('RidgeRegression end')
    logger.debug('====================')
def RandomForest():
    """Fit a random forest on the full training table and write a submission.

    Steps, in order:

    1. Read ``./result_tmp/scaled_train.csv``; the target is the
       ``item_cnt_month`` column and every other column is a feature.
    2. Fit ``RandomForestRegressor(n_estimators=50, random_state=1)`` on all
       rows (no hold-out split is made).
    3. Predict on the frame returned by ``load_test_data()`` with its ``ID``
       column removed, cast the predictions to float16 and clip to [0, 20].
    4. Store them in the ``item_cnt_month`` column of the frame returned by
       ``load_submission()`` and write it to
       ``./result_tmp/submit_180826_1st.csv``.

    Progress is logged through the logger returned by ``mylogger()``.
    Returns nothing.
    """
    # Bind the logger locally, as ridge_submit() and forest_submit() do;
    # previously this function used `logger` without ever defining it.
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    train = pd.read_csv('./result_tmp/scaled_train.csv')
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month'], axis=1).values
    # Message typo fixed ('make_train_dat end') to match the sibling functions.
    logger.debug('make_train_data end')

    logger.info('Fitting start')
    forest = RandomForestRegressor(n_estimators=50, random_state=1)
    forest.fit(X, y)
    logger.debug('Fitting end')

    logger.info('Scoring start')
    test_data = load_test_data()
    test = test_data.drop(['ID'], axis=1).values
    submission = load_submission()
    submission['item_cnt_month'] = forest.predict(test).astype(
        np.float16).clip(0., 20.)
    submission.to_csv('./result_tmp/submit_180826_1st.csv',
                      encoding='utf-8-sig', index=False)
    logger.info('submission:\n{}'.format(submission.head()))

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
def forest_submit():
    """Fit a random forest on recent rows and write a submission CSV.

    Steps, in order:

    1. Read ``./result_tmp/scaled_train_DateBlockNum.csv`` and keep the rows
       with ``30 < date_block_num <= 33``.
    2. Fit ``RandomForestRegressor(n_estimators=50, random_state=1)`` on every
       column except ``item_cnt_month`` (the target) and ``date_block_num``.
    3. Predict on the frame returned by ``load_test_data()`` with its ``ID``
       column removed, cast the predictions to float16 and clip to [0, 20].
    4. Store them in the ``item_cnt_month`` column of the frame returned by
       ``load_submission()`` and write it to
       ``./result_tmp/submit_180827_31-33.csv``.

    Progress is logged through the logger returned by ``mylogger()``.
    No hold-out evaluation is performed. Returns nothing.
    """
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    # ---- training data -------------------------------------------------
    logger.debug('make_train_data start')
    # An earlier variant read './result_tmp/scaled_train.csv' instead.
    full_train = pd.read_csv('./result_tmp/scaled_train_DateBlockNum.csv')
    # Training window: the most recent 3 months (per the original note).
    # An earlier variant kept only date_block_num == 33 (most recent month).
    block_num = full_train['date_block_num']
    in_window = (30 < block_num) & (block_num <= 33)
    train = full_train.loc[in_window]
    # date_block_num is only used for the window filter, not as a feature.
    X = train.drop(columns=['item_cnt_month', 'date_block_num']).values
    y = train['item_cnt_month']
    logger.debug('make_train_data end')

    # ---- fit -----------------------------------------------------------
    logger.info('Fitting start')
    forest = RandomForestRegressor(n_estimators=50, random_state=1)
    forest.fit(X, y)
    logger.debug('Fitting end')
    # For EDA, forest.feature_importances_ can be inspected at this point.

    # ---- predict and write the submission ------------------------------
    logger.info('Scoring start')
    test_data = load_test_data()
    test_features = test_data.drop(columns=['ID']).values
    submission = load_submission()
    predicted = forest.predict(test_features)
    # Cast first, then clip, exactly as the pipeline has always done.
    half_precision = predicted.astype(np.float16)
    submission['item_cnt_month'] = half_precision.clip(0., 20.)
    # An earlier run wrote to './result_tmp/submit_180826_1st.csv'.
    submission.to_csv(
        './result_tmp/submit_180827_31-33.csv',
        encoding='utf-8-sig',
        index=False,
    )
    preview = submission.head()
    logger.info('submission:\n{}'.format(preview))

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
forest = RandomForestRegressor(n_estimators=50, random_state=1) forest.fit(X, y) logger.debug('Fitting end') #EDAしたいとき #fti = forest.feature_importances_ #print('Feature Importances:') #for i, feature in enumerate(train.colunms): # print('\t{0:10s}:{1:>.6f}'.format(feature, fti[i])) logger.info('Scoring start') #logger.info('Accuracy on test set: {:.3f}'.format(.score(X_test, y_test))) test_data = load_test_data() test = test_data.drop(['ID'], axis=1).values submission = load_submission() submission['item_cnt_month'] = forest.predict(test).astype(np.float16).clip(0., 20.) #submission.to_csv('./result_tmp/submit_180826_1st.csv', encoding='utf-8-sig', index=False) submission.to_csv('./result_tmp/submit_180827_31-33.csv', encoding='utf-8-sig', index=False) logger.info('submission:\n{}'.format(submission.head())) logger.debug('RandomForestRegressor end') logger.debug('====================') # CV: KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold # [http://scikit-learn.org/stable/modules/cross_validation.html] # TimeSeriesSplit()によるCV # [http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html] # 以下のイメージで分割する。TESTを予測するために、直近のTRAINを使うイメージがtscv # TRAIN: [0] TEST: [1], TRAIN: [0 1] TEST: [2], TRAIN: [0 1 2] TEST: [3] def forest_cv():