def load_test_data():
    """Load the competition test set from input/test.csv and return it."""
    logger = mylogger()
    logger.info('read_test start')
    df = pd.read_csv('../../../input/test.csv')
    logger.info(f'test:\n{df.head()}')
    logger.debug('read_test end')
    return df
def load_train_data():
    """Load the competition training set from input/train.csv and return it."""
    logger = mylogger()
    logger.info('read_train start')
    df = pd.read_csv('../../../input/train.csv')
    logger.info(f'train:\n{df.head()}')
    logger.debug('read_train end')
    return df
def load_submission():
    """Load the sample submission template and return it as a DataFrame."""
    logger = mylogger()
    logger.info('read_submission start')
    df = pd.read_csv('../../../input/sample_submission.csv')
    logger.info(f'sample_submission:\n{df.head()}')
    logger.debug('read_submission end')
    return df
def forest_cv():
    """Cross-validate a RandomForestRegressor on the scaled training data."""
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    train = pd.read_csv('../result_tmp/scaled_train.csv')
    # Alternative inputs, kept for experimentation:
    #train = pd.read_csv('../result_tmp/scaled_train_DateBlockNum.csv')
    #train = train[train['date_block_num']==33]  # most recent month only
    #train = train.loc[(30<train['date_block_num'])&(train['date_block_num']<=33)]  # most recent 3 months
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month'], axis=1).values
    #X = train.drop(['item_cnt_month', 'date_block_num'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('Cross-validation start')
    forest = RandomForestRegressor(n_estimators=50, random_state=1)
    # KFold's shuffle flag picks the strategy:
    #   shuffle=False -> plain k-fold, split in index order
    #   shuffle=True  -> shuffled split (varies with random_state)
    kfold = KFold(n_splits=3, shuffle=True, random_state=0)
    #skf = StratifiedKFold(n_splits=3)  # same arguments as KFold
    #tscv = TimeSeriesSplit(n_splits=3)
    scores = cross_val_score(forest, X, y, cv=kfold)
    # GroupKFold variant:
    #groups = list(train['date_block_num'])
    #scores = cross_val_score(forest, X, y, groups, cv=GroupKFold(n_splits=3))
    logger.info(f'Cross-validation scores_forest: {scores}')
    logger.info(f'Average Cross-validation score_forest: {scores.mean()}')
    logger.debug('Cross-validation end')

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
def forest_gscv():
    """Grid-search RandomForestRegressor hyper-parameters with 5-fold CV."""
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    train = pd.read_csv('../result_tmp/scaled_train.csv')
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('GridSearchCV start')
    param_grid = {'n_estimators': [10, 30, 50], 'random_state': [1, 2, 3]}
    logger.debug(f'Parameter grid:\n{param_grid}')
    grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, n_jobs=4)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    grid_search.fit(X_train, y_train)
    logger.info(f'Best GridSearchCV parameters_forest: {grid_search.best_params_}')
    logger.info(f'Best GridSearchCV score_forest: {grid_search.best_score_}')
    logger.info(f'Test set score_forest: {grid_search.score(X_test, y_test):.2f}')
    pd.DataFrame(grid_search.cv_results_).to_csv(
        '../result_tmp/GridSearch.csv', encoding='utf-8-sig', index=False)
    logger.debug('GridSearchCV end')

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
def ridge_gscv():
    """Grid-search the Ridge regularization strength and report CV scores."""
    logger = mylogger()
    logger.info('RidgeRegression start')

    logger.debug('make_train_data start')
    train = pd.read_csv('../result_tmp/scaled_train.csv')
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('GridSearchCV start')
    param_grid = {'alpha': [0.1, 1, 10, 50, 100]}
    logger.debug(f'Parameter grid:\n{param_grid}')
    grid_search = GridSearchCV(Ridge(), param_grid, cv=5, n_jobs=-1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    grid_search.fit(X_train, y_train)
    logger.info(f'Best GridSearchCV parameters_ridge: {grid_search.best_params_}')
    logger.info(f'Best GridSearchCV score_ridge: {grid_search.best_score_}')
    logger.info(f'Test set score_ridge: {grid_search.score(X_test, y_test):.2f}')
    pd.DataFrame(grid_search.cv_results_).to_csv(
        '../result_tmp/GridSearch_ridge.csv', encoding='utf-8-sig', index=False)
    logger.debug('GridSearchCV end')

    # Refit with the best parameters to inspect the fitted model (EDA).
    ridge = Ridge(**grid_search.best_params_)
    ridge.fit(X_train, y_train)
    logger.info(f'coefficient:{ridge.coef_}')
    logger.info(f'intercept:{ridge.intercept_}')

    logger.debug('RidgeRegression end')
    logger.debug('====================')
def ridge_submit():
    """Fit Ridge on the most recent months and write a submission CSV."""
    logger = mylogger()
    logger.info('RidgeRegression start')

    logger.debug('make_train_data start')
    #train = pd.read_csv('../result_tmp/scaled_train.csv')
    train = pd.read_csv('../result_tmp/scaled_train_DateBlockNum.csv')
    #train = train[train['date_block_num']==33]  # most recent month only
    recent = (30 < train['date_block_num']) & (train['date_block_num'] <= 33)
    train = train.loc[recent]  # most recent 3 months
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month', 'date_block_num'], axis=1).values
    #X = train.drop(['item_cnt_month'], axis=1).values
    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    logger.debug('make_train_data end')

    logger.info('Fitting start')
    ridge = Ridge()
    ridge.fit(X, y)
    logger.debug('Fitting end')

    logger.info('Scoring start')
    #logger.info('Accuracy on test set: {:.3f}'.format(.score(X_test, y_test)))
    test = load_test_data().drop(['ID'], axis=1).values
    submission = load_submission()
    # Clip predictions to [0, 20] as float16 before writing the submission.
    submission['item_cnt_month'] = ridge.predict(test).astype(np.float16).clip(0., 20.)
    submission.to_csv('../result_tmp/submit_180902_31-33_ridge.csv',
                      encoding='utf-8-sig', index=False)
    logger.info(f'submission:\n{submission.head()}')

    logger.debug('RidgeRegression end')
    logger.debug('====================')
def load_suppliment():
    """Load the supplementary lookup tables: item categories, items, shops."""
    logger = mylogger()
    logger.info('read_csv start')
    item_cat = pd.read_csv('../../../input/item_categories.csv')
    item = pd.read_csv('../../../input/items.csv')
    shop = pd.read_csv('../../../input/shops.csv')
    logger.debug('read_csv end')
    logger.info(f'item_cat:\n{item_cat.head()}')
    logger.info(f'item:\n{item.head()}')
    logger.info(f'shop:\n{shop.head()}')
    return item_cat, item, shop
def label_encode(train, test):
    """Label-encode shop_id and item_id consistently across train and test.

    Fits one LabelEncoder per column on the union of train/test values and
    writes the encoded frames to ../result_tmp/. Tree-based models do not
    need this step.

    Args:
        train: training DataFrame containing 'shop_id' and 'item_id'.
        test: test DataFrame containing the same columns.

    Returns:
        (train, test) with the two id columns label-encoded in place.
    """
    logger = mylogger()
    logger.info('label_encode starts')
    for c in ['shop_id', 'item_id']:
        lbl = preprocessing.LabelEncoder()
        # Fix: fit on the *string* form of the values. The original fitted
        # on the raw unique values (ints for these id columns) but then
        # transformed `.astype(str)`, which raises "y contains previously
        # unseen labels" whenever the column is not already str.
        lbl.fit(list(train[c].astype(str).unique()) +
                list(test[c].astype(str).unique()))
        train[c] = lbl.transform(train[c].astype(str))
        test[c] = lbl.transform(test[c].astype(str))
        logger.debug(c)
    # index=False for consistency with the other to_csv calls in this file.
    train.to_csv('../result_tmp/train_lbl.csv', index=False)
    test.to_csv('../result_tmp/test_lbl.csv', index=False)
    logger.debug('label encode ends')
    return train, test
def xgboost_gscv():
    """Grid-search XGBRegressor hyper-parameters with 5-fold CV."""
    # Installing xgboost from source:
    #   cd <workspace>
    #   git clone --recursive https://github.com/dmlc/xgboost
    #   cd xgboost; make -j4
    #   check setuptools with `pip show setuptools` (PATH must be set up too)
    #   cd python-package; sudo python setup.py install
    logger = mylogger()
    logger.info('xgboostRegressor start')

    logger.debug('make_train_data start')
    # NOTE(review): this function reads './result_tmp/...' while most others
    # use '../result_tmp/...' — confirm which working directory is intended.
    train = pd.read_csv('./result_tmp/scaled_train.csv')
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('GridSearchCV start')
    param_grid = {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 500]}
    logger.debug(f'Parameter grid:\n{param_grid}')
    grid_search = GridSearchCV(xgb.XGBRegressor(), param_grid, cv=5, n_jobs=-1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    grid_search.fit(X_train, y_train)
    logger.info(f'Best GridSearchCV parameters_xgb: {grid_search.best_params_}')
    logger.info(f'Best GridSearchCV score_xgb: {grid_search.best_score_}')
    logger.info(f'Test set score_xgb: {grid_search.score(X_test, y_test):.2f}')
    pd.DataFrame(grid_search.cv_results_).to_csv(
        './result_tmp/GridSearch_xgb.csv', encoding='utf-8-sig', index=False)
    logger.debug('GridSearchCV end')

    # Feature-importance EDA (uncomment when needed):
    #xgbr = xgb.XGBRegressor(**grid_search.best_params_)
    #xgbr.fit(X_train, y_train)
    #fti = pd.Series(xgbr.feature_importances_, index=train.columns)
    #fti = fti.sort_values()
    #logger.debug(fti)
    #fti.plot(kind='barh')
    #plt.title('feature importance from xgboost')
    #plt.show()

    logger.debug('xgboostRegressor end')
    logger.debug('====================')
def make_train_in_test():
    """Restrict train rows to shop/item pairs that appear in the test set.

    Also parses the 'date' column (dd.mm.YYYY) and adds a 'month' column.
    Returns the filtered train DataFrame and the untouched test DataFrame.
    """
    logger = mylogger()
    logger.info('train in test starts')
    train = load_train_data()
    test = load_test_data()
    logger.info(f'train.org.shape:{train.shape}')
    test_shops = test.shop_id.unique()
    test_items = test.item_id.unique()
    # Keep only shops AND items that actually occur in the test set.
    mask = train.shop_id.isin(test_shops) & train.item_id.isin(test_items)
    train = train[mask]
    train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
    train['month'] = train['date'].dt.month
    logger.info(f'train in test.shape:{train.shape}')
    logger.debug('train in test ends')
    return train, test
def make_item_cnt_month(train):
    """Aggregate daily sales into monthly counts per (shop_id, item_id).

    Groups item_cnt_day by (date_block_num, shop_id, item_id), sums each
    group, drops date_block_num, and writes the result to
    ../result_tmp/scaled_train.csv.

    Args:
        train: DataFrame with columns date_block_num, shop_id, item_id,
            item_cnt_day.

    Returns:
        DataFrame with columns shop_id, item_id, item_cnt_month.
    """
    logger = mylogger()
    logger.info('MakeItemCntMonth starts')
    # .sum() replaces the original .apply(sum): same result for NaN-free
    # numeric data but runs in optimized C instead of a Python-level loop.
    # (Note: GroupBy.sum skips NaN, while builtin sum would propagate it —
    # item_cnt_day is assumed NaN-free here.)
    item_cnt_month = train['item_cnt_day'].groupby(
        [train['date_block_num'], train['shop_id'], train['item_id']]).sum()
    item_cnt_month.name = 'item_cnt_month'
    item_cnt_month_df = pd.DataFrame(item_cnt_month).reset_index()
    item_cnt_month_df.drop(['date_block_num'], axis=1, inplace=True)
    item_cnt_month_df.to_csv('../result_tmp/scaled_train.csv',
                             encoding='utf-8-sig', index=False)
    logger.debug(item_cnt_month_df.shape)
    logger.debug('MakeItemCntMonth ends')
    return item_cnt_month_df
def forest_submit():
    """Fit a RandomForestRegressor on recent months and write a submission CSV."""
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    #train = pd.read_csv('./result_tmp/scaled_train.csv')
    train = pd.read_csv('./result_tmp/scaled_train_DateBlockNum.csv')
    #train = train[train['date_block_num']==33]  # most recent month only
    recent = (30 < train['date_block_num']) & (train['date_block_num'] <= 33)
    train = train.loc[recent]  # most recent 3 months
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month', 'date_block_num'], axis=1).values
    #X = train.drop(['item_cnt_month'], axis=1).values
    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    logger.debug('make_train_data end')

    logger.info('Fitting start')
    forest = RandomForestRegressor(n_estimators=50, random_state=1)
    forest.fit(X, y)
    logger.debug('Fitting end')

    # Feature-importance EDA (uncomment when needed):
    #fti = forest.feature_importances_
    #print('Feature Importances:')
    #for i, feature in enumerate(train.columns):
    #    print('\t{0:10s}:{1:>.6f}'.format(feature, fti[i]))

    logger.info('Scoring start')
    #logger.info('Accuracy on test set: {:.3f}'.format(.score(X_test, y_test)))
    test = load_test_data().drop(['ID'], axis=1).values
    submission = load_submission()
    # Clip predictions to [0, 20] as float16 before writing the submission.
    submission['item_cnt_month'] = forest.predict(test).astype(np.float16).clip(0., 20.)
    #submission.to_csv('./result_tmp/submit_180826_1st.csv', encoding='utf-8-sig', index=False)
    submission.to_csv('./result_tmp/submit_180827_31-33.csv',
                      encoding='utf-8-sig', index=False)
    logger.info(f'submission:\n{submission.head()}')

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')