def main(debug=False, use_pkl=False):
    """Assemble the merged feature dataframe and run the XGBoost k-fold training.

    debug   -- when True, only the first 10000 rows of each source are loaded.
    use_pkl -- when True, skip feature building and reuse the cached pickle.
    """
    num_rows = 10000 if debug else None
    if use_pkl:
        # reuse the previously assembled feature dataframe
        df = loadpkl('../output/df.pkl')
    else:
        with timer("train & test"):
            df = train_test(num_rows)
        # (timer label, feature function, merge keys) for each external source
        merge_specs = [
            ("nightley", nightley, ['datetime', 'park']),
            ("hotlink", hotlink, 'datetime'),
            ("colopl", colopl, ['park', 'year', 'month']),
            ("weather", weather, ['datetime', 'park']),
            ("nied_oyama", nied_oyama, ['datetime', 'park']),
            ("agoop", agoop, ['park', 'year', 'month']),
            ("jorudan", jorudan, ['datetime', 'park']),
        ]
        for label, feature_fn, merge_keys in merge_specs:
            with timer(label):
                df = pd.merge(df, feature_fn(num_rows), on=merge_keys, how='outer')
        with timer("save pkl"):
            save2pkl('../output/df.pkl', df)
    with timer("Run XGBoost with kfold"):
        print("df shape:", df.shape)
        feat_importance = kfold_xgboost(df, num_folds=NUM_FOLDS,
                                        stratified=True, debug=debug)
        display_importances(feat_importance,
                            '../output/xgb_importances.png',
                            '../output/feature_importance_xgb.csv')
def output(train_df, test_df, models, model_params, feature_importance_df, train_preds, test_preds, scores, now, model_name):
    """Persist fold models, params, scores, importances, the submission csv and OOF predictions."""
    # mean validation score across folds, used in the output folder / file names
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)

    # one pickle per fold model
    for fold_no, model in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, fold_no), model)

    with open('{0}/model_params.json'.format(folder_path), 'w') as fp:
        json.dump(model_params, fp, indent=4)
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as fp:
        json.dump(dict(enumerate(scores)), fp, indent=4)

    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
                     '{}/importance.csv'.format(folder_path))

    # The part below needs adjusting per competition.
    test_df.loc[:, 'target'] = test_preds
    test_df = test_df.reset_index()
    # (disabled experiment) clip near-zero targets as outliers:
    #   q = test_df['target'].quantile(.0003)  # or q = 3
    #   test_df.loc[:, 'target'] = test_df['target'].apply(
    #       lambda x: x if abs(x) > q else x - 0.0001)
    # NOTE(review): timestamp format '%Y-%m%d-%H%M-%S' differs from sibling
    # scripts ('%Y-%m-%d-%H-%M-%S') — kept as-is to preserve file names.
    test_df[['card_id', 'target']].to_csv(
        '{0}/submit_{1:%Y-%m%d-%H%M-%S}_{2}.csv'.format(folder_path, now, score),
        index=False)

    # out-of-fold predictions (note: dataframe index is written too)
    train_df.loc[:, 'OOF_PRED'] = train_preds
    train_df = train_df.reset_index()
    train_df[['card_id', 'OOF_PRED']].to_csv('{0}/oof.csv'.format(folder_path))
def main(is_eval=False):
    """Build per-item price features from sell_prices.csv and cache them as a pickle.

    NOTE(review): is_eval is accepted for interface symmetry with sibling
    scripts but is not used in this function.
    """
    df = pd.read_csv('../input/sell_prices.csv')

    # first week each item appeared in each store
    # ref https://www.kaggle.com/kyakovlev/m5-simple-fe
    release_df = df.groupby(['store_id', 'item_id'
                             ])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']
    df = df.merge(release_df, on=['store_id', 'item_id'], how='left')

    # weeks elapsed since the item's release
    df['days_from_release'] = df['wm_yr_wk'] - df['release']

    # basic per store/item price statistics (one shared grouper)
    item_prices = df.groupby(['store_id', 'item_id'])['sell_price']
    for stat in ('max', 'min', 'std', 'mean'):
        df['price_{}'.format(stat)] = item_prices.transform(stat)

    # price normalised by the item's historical maximum
    df['price_norm'] = df['sell_price'] / df['price_max']

    # distinct price levels per item, and distinct items per price point
    df['price_nunique'] = item_prices.transform('nunique')
    df['item_nunique'] = df.groupby(['store_id', 'sell_price'
                                     ])['item_id'].transform('nunique')

    # week-over-week price ratio
    df['price_momentum'] = df['sell_price'] / item_prices.transform(
        lambda x: x.shift(1))

    df = reduce_mem_usage(df)
    save2pkl('../feats/sell_prices.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
def main(num_rows=None):
    """Build aggregate, SVD, NMF and k-means features from user profiles and save as pkl.

    num_rows is accepted for interface symmetry with the sibling feature
    scripts but is not used here (profiles.csv is always read in full).
    """
    # load csv & pkl
    profiles = pd.read_csv('../input/data_set_phase2/profiles.csv')

    # change columns name
    profiles.columns = ['pid'] + ['profile_{}'.format(i) for i in range(0, 66)]

    # feature engineering over the raw profile columns only
    feats = [f for f in profiles.columns.to_list() if f not in ['pid']]
    # BUGFIX: sum and mean were swapped — profile_sum held the row mean and
    # profile_mean the row sum (so profile_sum_count counted mean values).
    profiles['profile_sum'] = profiles[feats].sum(axis=1)
    profiles['profile_mean'] = profiles[feats].mean(axis=1)
    profiles['profile_std'] = profiles[feats].std(axis=1)
    # frequency of each row-sum value over the whole table
    profiles['profile_sum_count'] = profiles['profile_sum'].map(
        profiles['profile_sum'].value_counts())

    # svd features (deterministic via fixed random_state)
    svd = TruncatedSVD(n_components=20, n_iter=20, random_state=326)
    svd_x = svd.fit_transform(profiles[feats].values)
    svd_x = pd.DataFrame(svd_x)
    svd_x.columns = ['profile_svd_{}'.format(i) for i in range(20)]
    svd_x['pid'] = profiles['pid']
    # merge
    profiles = profiles.merge(svd_x, on='pid', how='left')

    # NMF features (assumes the raw profile columns are non-negative — TODO confirm)
    nmf = NMF(n_components=20, init='random', random_state=326)
    nmf_x = nmf.fit_transform(profiles[feats].values)
    nmf_x = pd.DataFrame(nmf_x)
    nmf_x.columns = ['profile_nmf_{}'.format(i) for i in range(20)]
    nmf_x['pid'] = profiles['pid']
    # merge
    profiles = profiles.merge(nmf_x, on='pid', how='left')

    # k-means cluster label as a categorical feature
    kmeans_model = KMeans(n_clusters=10, random_state=326)
    kmeans_model.fit(profiles[feats].values)
    profiles['profile_k_means'] = kmeans_model.labels_

    # save as pkl
    save2pkl('../features/profiles.pkl', profiles)

    line_notify('{} finished.'.format(sys.argv[0]))
def output(train_df, test_df, models, model_params, feature_importance_df, train_preds, test_preds, scores, now, model_name):
    """Save trained models, params, fold scores and predictions, then submit via API.

    NOTE(review): relies on module-level globals not visible in this block:
    ``test_preds_bin``, ``is_debug``, ``competition_name`` and ``submit`` —
    confirm they are defined in this script before calling.
    """
    # mean validation score across folds, used in directory / file names
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)
    # one pickle per fold model, zero-padded index
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)
    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)
    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
                     '{}/importance.csv'.format(folder_path))
    # The part below needs adjusting per competition.
    submission_file_name = '{0}/submit_{1:%Y-%m-%d-%H-%M-%S}_{2}.csv'.format(
        folder_path, now, score)
    test_df.loc[:, 'target'] = test_preds
    # outlier-classifier probabilities (module-level global — see NOTE above)
    test_df.loc[:, 'Outlier_Likelyhood'] = test_preds_bin
    q = test_df['Outlier_Likelyhood'].quantile(.9999)  # original note: 1.0930%
    # NOTE(review): the else-branch writes the outlier *likelihood* into
    # 'target', fully overwriting the regression predictions assigned above —
    # verify this is intended and not a missing reference to the original target.
    test_df.loc[:, 'target'] = test_df['Outlier_Likelyhood'].apply(
        lambda x: 1 if x > q else x)
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv(submission_file_name, index=False)
    # out-of-fold predictions (dataframe index column is written as well)
    train_df.loc[:, 'OOF_PRED'] = train_preds
    train_df = train_df.reset_index()
    train_df[['card_id', 'OOF_PRED']].to_csv('{0}/oof.csv'.format(folder_path), )
    # submit through the API unless running in debug mode (globals)
    if not is_debug:
        submit(competition_name, submission_file_name, comment='user02 cv: %.6f' % score)
def kfold_lightgbm(train_df, test_df, num_folds, stratified=False, debug=False):
    """Multiclass (12-way) LightGBM k-fold CV; saves per-fold models and stacked predictions.

    NOTE(review): relies on module-level names not visible in this block:
    ``FEATS_EXCLUDED``, ``cat_cols`` — confirm they are defined in this script.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)
    # Create arrays and dataframes to store results (12 class probabilities)
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]
        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                categorical_feature=cat_cols,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               categorical_feature=cat_cols,
                               free_raw_data=False)
        # params (seeds vary per fold for diversity)
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.1,
            'num_class': 12,
            'colsample_bytree': 0.723387165617351,
            'max_depth': 8,
            'min_child_weight': 42.6805833563236,
            'min_data_in_leaf': 34,
            'min_split_gain': 0.010945157429729,
            'num_leaves': 48,
            'reg_alpha': 1.87287994755334,
            'reg_lambda': 4.8093341415383,
            'subsample': 0.483962708535824,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }
        clf = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)
        # save model
        clf.save_model('../output/lgbm_queries_{}.txt'.format(n_fold))
        # out-of-fold probabilities for this fold's validation rows
        oof_preds[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration)
        # average test predictions across folds
        sub_preds += clf.predict(
            test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits
        # per-fold gain importances (log1p-compressed)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain',
                                   iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    line_notify('Full F1 Score %.6f' % full_f1)
    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_queries_profiles.png',
                        '../imp/feature_importance_lgbm_queries_profiles.csv')
    if not debug:
        # save prediction for submit (class-probability columns keyed by sid)
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_queries_profiles{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df.index
        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_queries_profiles{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df.index
        # merge train (OOF) and test predictions into a single stacking feature
        df = oof_preds.append(sub_preds)
        # save as pkl
        save2pkl('../features/queries_profiles_pred.pkl', df)
        line_notify('{} finished.'.format(sys.argv[0]))
def main(num_rows=None):
    """Build query-level features (coordinates, datetime, count and target encodings).

    Fixes: the ``queries_o_is_holiday_count`` / ``queries_o_weekday_count`` /
    ``queries_o_hour_count`` features previously mapped the ``o_d_*`` key
    columns (copy-paste bug), duplicating the o_d counts; they now count the
    ``o_*`` keys. The symmetric ``queries_d_*_count`` features are added, and
    the duplicated re-computation of the ``o_d`` counts is removed.
    """
    # load csv (phase 2)
    train_queries = pd.read_csv(
        '../input/data_set_phase2/train_queries_phase2.csv', nrows=num_rows)
    test_queries = pd.read_csv('../input/data_set_phase2/test_queries.csv',
                               nrows=num_rows)
    train_clicks = pd.read_csv(
        '../input/data_set_phase2/train_clicks_phase2.csv')

    # phase 1 csv
    train_queries1 = pd.read_csv(
        '../input/data_set_phase2/train_queries_phase1.csv')
    train_clicks1 = pd.read_csv(
        '../input/data_set_phase2/train_clicks_phase1.csv')

    # merge click
    train_queries = pd.merge(train_queries,
                             train_clicks[['sid', 'click_mode']],
                             on='sid',
                             how='left')
    train_queries1 = pd.merge(train_queries1,
                              train_clicks1[['sid', 'click_mode']],
                              on='sid',
                              how='left')

    # merge phase 1 data
    train_queries = train_queries1.append(train_queries)

    # fill na (no click)
    train_queries['click_mode'].fillna(0, inplace=True)

    # set test target as nan
    test_queries['click_mode'] = np.nan

    # merge train & test
    queries_df = train_queries.append(test_queries)

    del train_queries, test_queries, train_queries1, train_clicks, train_clicks1
    gc.collect()

    # to datetime
    queries_df['req_time'] = pd.to_datetime(queries_df['req_time'])

    # coordinate features — o/d are 'x,y' strings
    queries_df['x_o'] = queries_df['o'].apply(
        lambda x: x.split(',')[0]).astype(float)
    queries_df['y_o'] = queries_df['o'].apply(
        lambda x: x.split(',')[1]).astype(float)
    queries_df['x_d'] = queries_df['d'].apply(
        lambda x: x.split(',')[0]).astype(float)
    queries_df['y_d'] = queries_df['d'].apply(
        lambda x: x.split(',')[1]).astype(float)

    # count features for origin/destination columns
    for c in ['o', 'd', 'x_o', 'y_o', 'x_d', 'y_d']:
        queries_df['queries_{}_count'.format(c)] = queries_df[c].map(
            queries_df[c].value_counts())

    # euclidean origin-destination distance
    queries_df['queries_distance'] = np.sqrt(
        (queries_df['x_o'] - queries_df['x_d'])**2 +
        (queries_df['y_o'] - queries_df['y_d'])**2)

    # origin-destination pair key and its frequency
    queries_df['o_d'] = queries_df['o'].astype(
        str) + '_' + queries_df['d'].astype(str)
    queries_df['queries_o_d_count'] = queries_df['o_d'].map(
        queries_df['o_d'].value_counts())

    # datetime features
    queries_df['queries_weekday'] = queries_df['req_time'].dt.weekday
    queries_df['queries_hour'] = queries_df['req_time'].dt.hour
    queries_df['queries_is_holiday'] = queries_df['req_time'].apply(
        lambda x: is_holiday(x)).astype(int)
    queries_df['queries_weekday_count'] = queries_df['queries_weekday'].map(
        queries_df['queries_weekday'].value_counts())
    queries_df['queries_hour_count'] = queries_df['queries_hour'].map(
        queries_df['queries_hour'].value_counts())

    # coordinate & datetime interaction keys, e.g. 'o_d_hour' = hour + '_' + o_d
    for prefix in ['o_d', 'o', 'd']:
        queries_df['{}_is_holiday'.format(prefix)] = queries_df[
            'queries_is_holiday'].astype(str) + '_' + queries_df[prefix]
        queries_df['{}_weekday'.format(prefix)] = queries_df[
            'queries_weekday'].astype(str) + '_' + queries_df[prefix]
        queries_df['{}_hour'.format(prefix)] = queries_df[
            'queries_hour'].astype(str) + '_' + queries_df[prefix]

    # frequency of every interaction key
    # BUGFIX: the o_* counts previously mapped the o_d_* keys; the d_* counts
    # were missing entirely; the o_d_* counts were computed twice.
    for prefix in ['o_d', 'o', 'd']:
        for suffix in ['is_holiday', 'weekday', 'hour']:
            key = '{}_{}'.format(prefix, suffix)
            queries_df['queries_{}_count'.format(key)] = queries_df[key].map(
                queries_df[key].value_counts())

    # rounded value features (coarse 0.1-degree grid)
    queries_df['x_o_round'] = queries_df['x_o'].round(1)
    queries_df['y_o_round'] = queries_df['y_o'].round(1)
    queries_df['x_d_round'] = queries_df['x_d'].round(1)
    queries_df['y_d_round'] = queries_df['y_d'].round(1)
    queries_df['queries_distance_round'] = queries_df[
        'queries_distance'].round(1)
    queries_df['o_round'] = queries_df['x_o_round'].astype(
        str) + '_' + queries_df['y_o_round'].astype(str)
    queries_df['d_round'] = queries_df['x_d_round'].astype(
        str) + '_' + queries_df['y_d_round'].astype(str)
    queries_df['o_d_round'] = queries_df['o_round'].astype(
        str) + '_' + queries_df['d_round'].astype(str)

    # frequency of the rounded features
    for c in ['x_o_round', 'y_o_round', 'x_d_round', 'y_d_round']:
        queries_df['queries_{}_count'.format(c)] = queries_df[c].map(
            queries_df[c].value_counts())
    queries_df['queries_distance_round_count'] = queries_df[
        'queries_distance_round'].map(
            queries_df['queries_distance_round'].value_counts())
    for c in ['o_round', 'd_round', 'o_d_round']:
        queries_df['queries_{}_count'.format(c)] = queries_df[c].map(
            queries_df[c].value_counts())

    # factorize the rounded features into integer codes
    for c in [
            'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
            'queries_distance_round'
    ]:
        queries_df[c], _ = pd.factorize(queries_df[c])

    # target encoding
    cols_encoding = [
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]
    queries_df = targetEncodingMultiClass(queries_df, 'click_mode',
                                          cols_encoding)

    # drop string features
    cols_drop = [
        'o', 'd', 'o_d', 'o_d_is_holiday', 'o_d_weekday', 'o_d_hour',
        'o_is_holiday', 'o_weekday', 'o_hour', 'd_is_holiday', 'd_weekday',
        'd_hour', 'o_round', 'd_round', 'o_d_round'
    ]
    queries_df.drop(cols_drop, axis=1, inplace=True)

    # reduce memory usage
    queries_df = reduce_mem_usage(queries_df)

    # save as pkl
    save2pkl('../features/queries.pkl', queries_df)

    # save configs
    configs = json.load(open('../configs/101_lgbm_queries.json'))
    configs['features'] = queries_df.columns.to_list()
    to_json(configs, '../configs/101_lgbm_queries.json')

    line_notify('{} finished.'.format(sys.argv[0]))
def main(is_eval=False):
    """Build calendar features (date parts, holidays, events, moon phase) and save as pkl.

    NOTE(review): is_eval is accepted but unused in this function.
    """
    # load csv
    df = pd.read_csv('../input/calendar.csv')
    # to datetime
    df['date'] = pd.to_datetime(df['date'])
    # seasonality: cosine over day-of-year, range [-1, 1]
    df['seasonality'] = np.cos(np.pi * (df['date'].dt.dayofyear / 366 * 2 - 1))
    # drop string columns
    df.drop('weekday', axis=1, inplace=True)
    # basic date parts
    df['day'] = df['date'].dt.day
    df['week'] = df['date'].dt.weekofyear
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    # re-base year to start at 0
    df['year'] = (df['year'] - df['year'].min())
    df['weekofmonth'] = df['day'].apply(lambda x: ceil(x / 7))
    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    # holiday features: collect US federal and state holiday dates 2011-2016
    df['date'] = df['date'].apply(lambda x: x.date())  # to date for membership tests
    holidays_us = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(years=y).items():
            holidays_us.append(ptr[0])
    holidays_ca = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='CA', years=y).items():
            holidays_ca.append(ptr[0])
    holidays_tx = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='TX', years=y).items():
            holidays_tx.append(ptr[0])
    holidays_wi = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='WI', years=y).items():
            holidays_wi.append(ptr[0])
    df['is_holiday_us'] = df['date'].apply(lambda x: 1
                                           if x in holidays_us else 0)
    df['is_holiday_ca'] = df['date'].apply(lambda x: 1
                                           if x in holidays_ca else 0)
    df['is_holiday_tx'] = df['date'].apply(lambda x: 1
                                           if x in holidays_tx else 0)
    df['is_holiday_wi'] = df['date'].apply(lambda x: 1
                                           if x in holidays_wi else 0)
    # preprocess event_name_1
    # back to datetime for string-date comparisons below
    df['date'] = pd.to_datetime(df['date'])
    # Moon Phase
    df['moon'] = df['date'].apply(get_moon_phase)
    # add ramadan end dates (not present in the raw calendar)
    ramadan_end_dates = [
        '2011-8-29', '2012-8-18', '2013-8-7', '2014-7-27', '2015-7-16',
        '2016-7-5'
    ]
    for d in ramadan_end_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Ramadan ends'
    # add Pesach start dates
    pesach_start_dates = [
        '2011-4-18', '2012-4-6', '2013-3-25', '2014-4-14', '2015-4-3',
        '2016-4-22'
    ]
    for d in pesach_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Pesach Start'
    # add purim start dates
    purim_start_dates = [
        '2011-3-19', '2012-3-7', '2013-2-23', '2014-3-15', '2015-3-4',
        '2016-3-23'
    ]
    for d in purim_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Purim Start'
    # add chanukah start dates
    chanukah_start_dates = [
        '2011-12-21', '2012-12-9', '2013-11-28', '2014-12-17', '2015-12-7',
        '2016-12-25'
    ]
    for d in chanukah_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Chanukah Start'
    # "within period" flags: a state machine over event_name_1 in date order;
    # each flag turns on at its start event and off at its end event
    # (the end day itself is still flagged, since the flag is appended
    # before the end event resets it)
    is_nba_final = []
    is_lent = []
    is_ramadan = []
    is_pesach = []
    is_purim = []
    is_chanukah = []
    tmp_nba = 0
    tmp_lent = 0
    tmp_ramadan = 0
    tmp_pesach = 0
    tmp_purim = 0
    tmp_chanukah = 0
    for e in df['event_name_1']:
        if e == 'NBAFinalsStart':
            tmp_nba = 1
        is_nba_final.append(tmp_nba)
        if e == 'NBAFinalsEnd':
            tmp_nba = 0
        if e == 'LentStart':
            tmp_lent = 1
        is_lent.append(tmp_lent)
        # Lent ends at Easter
        if e == 'Easter':
            tmp_lent = 0
        if e == 'Ramadan starts':
            tmp_ramadan = 1
        is_ramadan.append(tmp_ramadan)
        if e == 'Ramadan ends':
            tmp_ramadan = 0
        if e == 'Pesach Start':
            tmp_pesach = 1
        is_pesach.append(tmp_pesach)
        if e == 'Pesach End':
            tmp_pesach = 0
        if e == 'Purim Start':
            tmp_purim = 1
        is_purim.append(tmp_purim)
        if e == 'Purim End':
            tmp_purim = 0
        if e == 'Chanukah Start':
            tmp_chanukah = 1
        is_chanukah.append(tmp_chanukah)
        if e == 'Chanukah End':
            tmp_chanukah = 0
    df['is_NBA_final'] = is_nba_final
    df['is_lent'] = is_lent
    df['is_ramadan'] = is_ramadan
    df['is_pesach'] = is_pesach
    df['is_purim'] = is_purim
    df['is_chanukah'] = is_chanukah
    # add blackfriday flag
    blackfriday_dates = [
        '2011-11-25', '2012-11-23', '2013-11-29', '2014-11-28', '2015-11-27'
    ]
    df['is_blackfriday'] = 0
    for d in blackfriday_dates:
        df.loc[df['date'] == d, 'is_blackfriday'] = 1
    # factorize string columns (factorize's -1 for NaN is restored to NaN)
    cols_string = [
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
    ]
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)
    # reduce memory usage
    df = reduce_mem_usage(df)
    # save pkl
    save2pkl('../feats/calendar.pkl', df)
    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """LightGBM k-fold regression on log1p(visitors); returns per-fold feature importances.

    NOTE(review): relies on module-level names not visible in this block:
    ``FEATS_EXCLUDED``, ``submission_file_name``, ``oof_file_name``.
    """
    # Divide in training/validation and test data (test rows have NaN target)
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    # k-fold; folds are split on the holiday flag, not the target
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        # train on log1p(visitors); predictions are mapped back with expm1
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['visitors'].iloc[valid_idx])
        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)
        # params are rough / not fully tuned (original note)
        # NOTE(review): 'subsample' with 'boosting': 'goss' — GOSS uses
        # other_rate/top_rate instead of bagging; confirm intent.
        params = {
            'device': 'gpu',
            'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'goss',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 64,
            'colsample_bytree': 0.977334338875847,
            'subsample': 0.027687793278932,
            'max_depth': 20,
            'reg_alpha': 9.72886163508719,
            'reg_lambda': 9.9935502633216,
            'min_split_gain': 0.178508066955524,
            'min_child_weight': 43.4750700383884,
            'min_data_in_leaf': 18,
            'other_rate': 0.925113620582013,
            'top_rate': 0.006970683025472,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }
        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)
        # save model
        reg.save_model('../output/lgbm_' + str(n_fold) + '.txt')
        # out-of-fold predictions back on the original scale
        oof_preds[valid_idx] = np.expm1(
            reg.predict(valid_x, num_iteration=reg.best_iteration))
        # average test predictions across folds
        sub_preds += np.expm1(
            reg.predict(test_df[feats],
                        num_iteration=reg.best_iteration)) / folds.n_splits
        # per-fold gain importances (log1p-compressed)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' %
              (n_fold + 1,
               mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
    # display full MAE and notify via LINE
    # (note: message string contains a typo "LigntGBM" — kept as-is)
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('LigntGBM Full MAE score %.6f' % full_mae)
    if not debug:
        # save test predictions for submission (tab-separated, no header)
        test_df.loc[:, 'visitors'] = sub_preds
        test_df[['index',
                 'visitors']].sort_values('index').to_csv(submission_file_name,
                                                          index=False,
                                                          header=False,
                                                          sep='\t')
        # save out-of-fold predictions
        train_df.loc[:, 'OOF_PRED'] = oof_preds
        train_df[['index',
                  'OOF_PRED']].sort_values('index').to_csv(oof_file_name,
                                                           index=False)
    return feature_importance_df
def kfold_lightgbm(train_df, test_df, num_folds, stratified=False, debug=False):
    """Multiclass (12-way) LightGBM k-fold CV with submission post-processing.

    NOTE(review): relies on module-level names not visible in this block:
    ``FEATS_EXCLUDED``, ``CAT_COLS``, ``submission_file_name``, ``oof_file_name``.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)
    # Create arrays and dataframes to store results (12 class probabilities)
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]
        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                categorical_feature=CAT_COLS,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               categorical_feature=CAT_COLS,
                               free_raw_data=False)
        # params (seeds vary per fold for diversity)
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.01,
            'num_class': 12,
            'num_leaves': 52,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'reg_alpha': 4.762312990232561,
            'reg_lambda': 9.98131082276387,
            'min_split_gain': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'min_data_in_leaf': 17,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }
        clf = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'test'],
            # feval=eval_f,
            num_boost_round=10000,
            early_stopping_rounds=200,
            verbose_eval=100)
        # save model
        clf.save_model('../output/lgbm_3_{}.txt'.format(n_fold))
        # out-of-fold probabilities for this fold's validation rows
        oof_preds[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration)
        # average test predictions across folds
        sub_preds += clf.predict(
            test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits
        # per-fold gain importances (log1p-compressed)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain',
                                   iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)
    # display importances
    display_importances(feature_importance_df, '../imp/lgbm_importances_3.png',
                        '../imp/feature_importance_lgbm_3.csv')
    if not debug:
        # save prediction for submit (most probable class)
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()
        # post processing: when exactly one plan exists, recommend its
        # transport mode (unless the model predicted "no click" = 0)
        # NOTE(review): chained indexing — triggers SettingWithCopyWarning;
        # verify the assignment actually lands on test_df.
        test_df['recommend_mode'][(test_df['plan_num_plans'] == 1)
                                  & (test_df['recommend_mode'] != 0
                                     )] = test_df['plan_0_transport_mode'][
                                         (test_df['plan_num_plans'] == 1)
                                         & (test_df['recommend_mode'] != 0)]
        # save csv
        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                                  index=False)
        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode',
                  'recommend_mode']].to_csv(oof_file_name, index=False)
        # save prediction for submit (class-probability columns keyed by sid)
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_lgbm_plans{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']
        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_lgbm_plans{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']
        # merge train (OOF) and test predictions into one stacking feature
        df = oof_preds.append(sub_preds)
        # save as pkl
        save2pkl('../features/lgbm_pred_3.pkl', df)
        line_notify('{} finished.'.format(sys.argv[0]))
def kfold_xgboost(df, num_folds, stratified=False, debug=False):
    """XGBoost k-fold regression on log1p(visitors); returns per-fold feature importances.

    df         -- combined frame; rows with NaN 'visitors' are the test set.
    num_folds  -- number of CV folds.
    stratified -- use StratifiedKFold instead of KFold.
    debug      -- when True, skip writing submission / OOF csv files.

    NOTE(review): relies on module-level names not visible in this block:
    ``FEATS_EXCLUDED``, ``submission_file_name``, ``oof_file_name``.
    """
    # Divide in training/validation and test data (test rows have NaN target)
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]
    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # DMatrix built once for the final predictions.
    # BUGFIX: this was constructed with label=train_df['visitors'] — the
    # training labels, whose length does not match test_df and whose values
    # are unknown for test rows. A prediction matrix needs no label.
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold; folds are split on the holiday flag, not the target
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        # train on log1p(visitors); predictions are mapped back with expm1
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['visitors'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params (seed varies per fold for diversity)
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 8,
            'min_child_weight': 19,
            'gamma': 0.089444100759612,
            'subsample': 0.91842954303314,
            'colsample_bytree': 0.870658058238432,
            'colsample_bylevel': 0.995353255250289,
            'alpha': 19.9615600411437,
            'lambda': 2.53962270252528,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/xgb_' + str(n_fold) + '.txt')

        # out-of-fold predictions back on the original scale
        oof_preds[valid_idx] = np.expm1(reg.predict(xgb_test))
        # average test predictions across folds
        sub_preds += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        # per-fold gain importances (get_score only lists used features)
        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' %
              (n_fold + 1,
               mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # display full MAE and notify via LINE
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('XGBoost Full MAE score %.6f' % full_mae)

    if not debug:
        # save test predictions for submission (tab-separated, no header)
        test_df.loc[:, 'visitors'] = sub_preds
        test_df[['index', 'visitors']].sort_values('index').to_csv(
            submission_file_name, index=False, header=False, sep='\t')
        # save out-of-fold predictions
        train_df.loc[:, 'OOF_PRED'] = oof_preds
        train_df[['index', 'OOF_PRED']].sort_values('index').to_csv(
            oof_file_name, index=False)
    return feature_importance_df
def main():
    """CLI entry point: parse arguments, build the requested quantum system
    (one or two coupled systems in a chosen regime), run the quantum state
    diffusion solver, and save (downsampled) results to .mat and/or .pkl.
    """
    parser = get_parser()
    try:
        args = parser.parse_args()
    # NOTE(review): bare except also swallows SystemExit from --help/bad args
    # and converts it to exit code 0 — confirm this is intended.
    except:
        sys.exit(0)

    # Set up commands from parser
    params = dict()
    ntraj = params['Ntraj'] = args.ntraj
    seed = params['seed'] = args.seed
    duration = params['duration'] = args.duration
    delta_t = params['delta_t'] = args.deltat
    Nfock_a = params['Nfock_a'] = args.nfocka
    Nfock_j = params['Nfock_j'] = args.nfockj
    downsample = params['downsample'] = args.downsample
    Regime = params['regime'] = args.regime
    num_systems = params['num_systems'] = args.num_systems
    drive_second_system = params[
        'drive_second_system'] = args.drive_second_system
    if args.sdeint_method_name == "":
        logging.info(
            "sdeint_method_name not set. Using itoEuler as a default.")
        sdeint_method_name = params['sdeint_method_name'] = "itoEuler"
    else:
        sdeint_method_name = params[
            'sdeint_method_name'] = args.sdeint_method_name
    R = params['R'] = args.R
    eps = params['eps'] = args.eps
    noise_amp = params['noise_amp'] = args.noise_amp
    trans_phase = params['trans_phase'] = args.trans_phase

    # Does the user want to print verbose output?
    quiet = args.quiet
    if not quiet:
        print_params(params=params)

    # How much to downsample results
    logging.info("Downsample set to %s", downsample)

    ## Names of files and output
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = args.outdir
    # Create the output directory if it does not already exist.
    try:
        os.stat(outdir)
    except:
        os.mkdir(outdir)
    # 14 dash-separated fields encode every run parameter in the file name.
    param_str = ("%s-" * 14)[:-1] % (seed, ntraj, delta_t, Nfock_a, Nfock_j,
                                     duration, downsample, sdeint_method_name,
                                     num_systems, R, eps, noise_amp,
                                     trans_phase, drive_second_system)
    file_name = '%s/QSD_%s_%s' % (outdir, Regime, param_str)

    # Saving options
    save_mat = args.save2mat
    save_pkl = args.save2pkl
    if save_mat == False and save_pkl == False:
        logging.warning(
            "Both pickle and mat save are disabled, no data will be saved.")
        logging.warning(
            "You can modify this with args --save2pkl and --save2mat")

    # Resolve the SDE integrator by name; fail fast on unknown names.
    implicit_type = None
    if sdeint_method_name in SDEINT_METHODS:
        sdeint_method = SDEINT_METHODS[sdeint_method_name]
        ## For now let's use the full implicit method for implicit methods.
        ## The value implicit_type can be made one of:
        ## "implicit", "semi_implicit_drift", or "semi_implicit_diffusion".
        if sdeint_method_name in IMPLICIT_METHODS:
            implicit_type = "implicit"
    else:
        logging.error(
            "Unknown sdeint_method_name, %s, or not implemented yet.",
            sdeint_method_name)
        raise ValueError("Unknown sdeint_method_name, or not implemented yet.")

    # Uniform time grid for the integration.
    tspan = np.arange(0, duration, delta_t)

    obsq_data = None
    if num_systems == 1:
        # Single-system regimes: each make_system_* returns the Hamiltonian,
        # initial state, Lindblad operators, observables, and their names.
        if Regime == "absorptive_bistable":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_JC(
                Nfock_a, Nfock_j)
        elif Regime == "kerr_bistable":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable(
                Nfock_a)
        elif Regime == "kerr_bistable2":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime2(
                Nfock_a)
        elif Regime == "kerr_bistable3":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime3(
                Nfock_a)
        elif Regime == "kerr_bistable4":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime4(
                Nfock_a)
        elif Regime == "kerr_bistable5":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime5(
                Nfock_a)
        elif Regime == "kerr_bistable6":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime6(
                Nfock_a)
        elif Regime == "kerr_bistable7":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime7(
                Nfock_a)
        elif Regime[:len(
                "kerr_bistable"
        )] == "kerr_bistable":  ##inputs in this case are e.g. kerr_bistableA33.25_...
            which_kerr = Regime[len(
                "kerr_bistable")]  ## e.g. A in kerr_bistableA33.25_
            custom_drive = float(Regime[len("kerr_bistableA"):]
                                 )  ## e.g. 33.25 in kerr_bistableA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_bistable_regime_chose_drive(
                Nfock_a, which_kerr, custom_drive)
        elif Regime == "kerr_qubit":
            logging.info("Regime is set to %s", Regime)
            H, psi0, Ls, obsq_data, obs_names = make_system_kerr_qubit(Nfock_a)
        else:
            logging.error("Unknown regime, %s, or not implemented yet.",
                          Regime)
            raise ValueError("Unknown regime, or not implemented yet.")

        ### Run simulation for one system
        D = qsd_solve(
            H=H,
            psi0=psi0,
            tspan=tspan,
            Ls=Ls,
            sdeint_method=sdeint_method,
            obsq=obsq_data,
            ntraj=ntraj,
            seed=seed,
            normalize_state=True,
            downsample=downsample,
            implicit_type=implicit_type,
        )
    elif num_systems == 2:
        # Two-system regimes: separate Hamiltonians H1/H2 and Lindblad sets
        # L1s/L2s for the coupled pair.
        if Regime == "absorptive_bistable":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_JC_two_systems(
                Nfock_a, Nfock_j, drive_second_system)
        elif Regime == "kerr_bistable":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_bistable_two_systems(
                Nfock_a, drive_second_system)
        elif Regime == "kerr_qubit":
            logging.info("Regime is set to %s", Regime)
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_qubit_two_systems(
                Nfock_a, drive_second_system)
        elif Regime[:len("empty_then_kerr"
                         )] == 'empty_then_kerr':  ##e.g. empty_then_kerrA33.25
            which_kerr = Regime[len(
                "empty_then_kerr")]  ## e.g. A in empty_then_kerrA33.25_
            custom_drive = float(Regime[len("empty_then_kerrA"):]
                                 )  ## e.g. 33.25 in empty_then_kerrA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_empty_then_kerr(
                Nfock_a, which_kerr, custom_drive)
        elif Regime[:len(
                "kerr_bistable"
        )] == "kerr_bistable":  ##inputs in this case are e.g. kerr_bistableA33.25_...
            which_kerr = Regime[len(
                "kerr_bistable")]  ## e.g. A in kerr_bistableA33.25_
            custom_drive = float(Regime[len("kerr_bistableA"):]
                                 )  ## e.g. 33.25 in kerr_bistableA33.25
            logging.info("Regime is set to %s, with custom drive %s" %
                         (Regime, custom_drive))
            H1, H2, psi0, L1s, L2s, obsq_data, obs_names = make_system_kerr_bistable_regime_chose_drive_two_systems(
                Nfock_a, which_kerr, custom_drive)
        else:
            logging.error("Unknown regime, %s, or not implemented yet.",
                          Regime)
            raise ValueError("Unknown regime, or not implemented yet.")

        ### Run simulation for one system
        D = qsd_solve_two_systems(
            H1,
            H2,
            psi0,
            tspan,
            L1s,
            L2s,
            R=R,
            eps=eps,
            n=noise_amp,
            sdeint_method=sdeint_method,
            trans_phase=trans_phase,
            obsq=obsq_data,
            normalize_state=True,
            downsample=downsample,
            ops_on_whole_space=False,  ## assume the given operators only operate on their own subspace
            multiprocessing=False,  ## disable multiprocessing for now
            ntraj=ntraj,
            seed=seed,
            implicit_type=implicit_type,
        )
    else:
        logging.error("Unknown num_systems, %s, or not implemented yet.",
                      num_systems)
        raise ValueError("Unknown num_systems, or not implemented yet.")

    ### include time in results
    D.update({'tspan': tspan})

    ### downsample
    # NOTE(review): only 'tspan' is sliced here; 'psis' and 'obsq_expects'
    # are kept as-is — presumably the solver already downsampled them via
    # its downsample= argument. Confirm against qsd_solve's contract.
    D_downsampled = {
        'psis': D['psis'],
        'obsq_expects': D['obsq_expects'],
        'seeds': D['seeds'],
        'tspan': D['tspan'][::downsample]
    }

    ### Save results
    if save_mat:
        logging.info("Saving mat file...")
        save2mat(data=D_downsampled,
                 file_name=file_name,
                 obs=obs_names,
                 params=params)
    if save_pkl:
        logging.info("Saving pickle file...")
        save2pkl(data=D_downsampled,
                 file_name=file_name,
                 obs=obs_names,
                 params=params)
lambd=lambd, n=noise_amp, sdeint_method=sdeint_method, trans_phase=trans_phase, obsq=obsq_data, normalize_state=True, downsample=downsample, multiprocessing=False, ntraj=ntraj, processes=8, seed=1, implicit_type=None) ## include time in results, and unfiltered behavior of the generated ## trajectory of system 1. D.update({'tspan': tspan, 'sys1_expects': obs}) ### Save results if save_mat: logging.info("Saving mat file...") save2mat(data=D, file_name=output_file_path, obs=obs_names, params=params) if save_pkl: logging.info("Saving pickle file...") save2pkl(data=D, file_name=output_file_path, obs=obs_names, params=params)
def main():
    """CLI entry point for the absorptive-bistable Jaynes-Cummings run.

    Builds the SLH model symbolically, substitutes numerical parameters,
    converts to qutip/sparse data, integrates quantum state diffusion
    trajectories with itoEuler, and saves downsampled results.
    """
    parser = get_parser()
    try:
        args = parser.parse_args()
    # NOTE(review): bare except also swallows SystemExit from --help/bad args
    # and converts it to exit code 0 — confirm this is intended.
    except:
        sys.exit(0)

    # Set up commands from parser
    params = dict()
    ntraj = params['Ntraj'] = args.ntraj
    seed = params['seed'] = args.seed
    duration = params['duration'] = args.duration
    delta_t = params['delta_t'] = args.deltat
    Nfock_a = params['Nfock_a'] = args.nfocka
    Nfock_j = params['Nfock_j'] = args.nfockj
    downsample = params['downsample'] = args.downsample

    # Does the user want to print verbose output?
    quiet = args.quiet
    if not quiet:
        print_params(params=params)

    # How much to downsample results
    logging.info("Downsample set to %s", downsample)

    ## Names of files and output
    # This script is hard-wired to the absorptive bistability regime.
    Regime = "absorptive_bistable"
    param_str = "%s-%s-%s-%s-%s-%s" % (seed, ntraj, delta_t, Nfock_a, Nfock_j,
                                       duration)
    outdir = ""
    if args.outdir != None:
        outdir = args.outdir
    file_name = '%s/QSD_%s_%s' % (outdir, Regime, param_str)

    # Saving options
    save_mat = args.save2mat
    save_pkl = args.save2pkl
    if save_mat == False and save_pkl == False:
        logging.warning(
            "Both pickle and mat save are disabled, no data will be saved.")
        logging.warning(
            "You can modify this with args --save2pkl and --save2mat")

    # ## Make Operators
    # Cavity mode annihilation/creation and spin (qubit + collective) ops.
    a = Destroy(1)
    ad = a.dag()
    sm = LocalSigma(2, 1, 0) / sqrt(2)
    sp = sm.dag()
    sz = sp * sm - sm * sp
    j = Jminus(2)
    jp = j.dag()
    jz = Jz(2)
    jx = (jp + j) / 2.
    jy = (jp - j) / 2.

    # ## Make SLH Model
    # Symbolic physical parameters of the Jaynes-Cummings SLH model.
    k, g0, g = symbols("kappa, g0,gamma", positive=True)
    DD, TT = symbols("Delta, Theta", real=True)
    W = symbols("Omega")

    L = [sqrt(k) * a, sqrt(g) * j]
    H = -I * g0 * (a * jp - ad * j) + DD * jz + TT * ad * a
    S = identity_matrix(2)

    slh = SLH(S, L, H).coherent_input(W, 0)
    # NOTE(review): bare expression below has no effect outside a notebook —
    # it looks like a leftover from an interactive session.
    slh

    ## Numerical parameters
    a.space.dimension = Nfock_a
    j.space.dimension = Nfock_j

    if Regime == "absorptive_bistable":
        logging.info("Regime is set to %s", Regime)
        nparams = make_nparams(W=W, k=k, g=g, g0=g0, DD=DD, TT=TT)
    else:
        logging.error("Unknown regime, %s, or not implemented yet.", Regime)
        raise ValueError("Unknown regime, or not implemented yet.")

    # Substitute numbers for symbols and convert to qutip operators.
    Hq, Lqs = slh.substitute(nparams).HL_to_qutip()

    ## Observables
    obs = (a, j, jz, a * a, a.dag() * a, a * jp, jp, jx, jy)
    obsq = [o.to_qutip(full_space=slh.space) for o in obs]

    tspan = np.arange(0, duration, delta_t)
    # Vacuum initial state for both the cavity and the spin space.
    psi0 = qutip.tensor(qutip.basis(Nfock_a, 0), qutip.basis(Nfock_j, 0)).data
    # Unwrap qutip objects to their underlying sparse data for the solver.
    H = Hq.data
    Ls = [Lq.data for Lq in Lqs]
    obsq = [ob.data for ob in obsq]

    ### Run simulation
    D = qsd_solve(H=H,
                  psi0=psi0,
                  tspan=tspan,
                  Ls=Ls,
                  sdeint_method=sdeint.itoEuler,
                  obsq=obsq,
                  ntraj=ntraj,
                  seed=seed,
                  normalize_state=True)

    ### include time in results
    D.update({'tspan': tspan})

    ### downsample
    # Keep every `downsample`-th time point along the time axis.
    D_downsampled = {
        'psis': D['psis'][:, ::downsample],
        'obsq_expects': D['obsq_expects'][:, ::downsample],
        'seeds': D['seeds'],
        'tspan': D['tspan'][::downsample]
    }

    ### Save results
    if save_mat:
        logging.info("Saving mat file...")
        save2mat(data=D_downsampled,
                 file_name=file_name,
                 obs=obs,
                 params=params)
    if save_pkl:
        logging.info("Saving pickle file...")
        save2pkl(data=D_downsampled,
                 file_name=file_name,
                 obs=obs,
                 params=params)
def kfold_xgboost(train_df, test_df, num_folds, stratified=False, debug=False):
    """Multiclass (12-way) XGBoost on `click_mode` with k-fold CV.

    Keeps per-class raw margins (output_margin=True) for OOF and test rows,
    saves submission / OOF csv files plus a stacked prediction pkl for
    downstream model stacking, unless `debug`.

    Args:
        train_df: labeled rows (has 'click_mode').
        test_df: unlabeled rows to predict.
        num_folds: number of CV folds.
        stratified: use StratifiedKFold on 'click_mode' if True.
        debug: if True, skip writing all output files.
    """
    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results (12 raw margins per row)
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # dmatrix for test_df
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'device': 'gpu',  # GPU parameter
            'objective': 'multi:softmax',
            'booster': 'gbtree',
            'eval_metric': 'mlogloss',
            'num_class': 12,
            'eta': 0.05,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'alpha': 4.762312990232561,
            'lambda': 9.98131082276387,
            'gamma': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'silent': 1,
            # Per-fold seed so folds do not share identical RNG streams.
            'seed': int(2**n_fold)
        }

        # train model
        clf = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/xgb_' + str(n_fold) + '.txt')

        # output_margin=True keeps the per-class raw scores (for stacking);
        # argmax over them recovers the predicted class.
        oof_preds[valid_idx] = clf.predict(xgb_test, output_margin=True)
        sub_preds += clf.predict(test_df_dmtrx,
                                 output_margin=True) / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame.from_dict(
            clf.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y, xgb_train, xgb_test
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df, '../imp/xgb_importances.png',
                        '../imp/feature_importance_xgb.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: a session with a single offered plan can only be
        # mode 0 (no click) or its sole offered transport mode.
        # BUGFIX: use .loc instead of chained indexing
        # (test_df['recommend_mode'][mask] = ...), which triggers
        # SettingWithCopyWarning and may silently fail to write.
        single_plan = (test_df['plan_num_plans'] == 1) & \
                      (test_df['recommend_mode'] != 0)
        test_df.loc[single_plan, 'recommend_mode'] = \
            test_df.loc[single_plan, 'plan_0_transport_mode']

        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                                  index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                                 index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_xgb_plans{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_xgb_plans{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge train OOF + test predictions into one frame.
        # BUGFIX: DataFrame.append was deprecated and removed in pandas 2.0;
        # pd.concat is the supported equivalent.
        df = pd.concat([oof_preds, sub_preds], axis=0)

        # save as pkl
        save2pkl('../features/xgb_pred.pkl', df)

    line_notify('{} finished.'.format(sys.argv[0]))