def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk'] >= df['release']]

    # make lag features
    df = make_lags(df, 28)

    # label encoding
    cols_string = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # add price features
    df_grouped = df[['id', 'sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / df['shift_price_t1']
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / df['rolling_price_max_t365']
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # release date feature
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features': df.columns.tolist()}
    to_json(features_json, '../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
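The per-`id` shift/rolling price features above are easiest to see on a toy frame. A minimal sketch (hypothetical data, only pandas/numpy assumed; the rolling window is shortened to 3 instead of 365 to keep it readable):

import numpy as np
import pandas as pd

# Hypothetical two-item example: shift_price_t1 is the previous row's price
# within each id, and the rolling max is taken over a trailing window of the
# shifted series, so the current price never leaks into its own feature.
toy = pd.DataFrame({
    'id': ['A'] * 5 + ['B'] * 5,
    'sell_price': [1.0, 1.2, 1.1, 1.3, 1.3, 2.0, 2.0, 1.8, 2.2, 2.1],
})
grouped = toy.groupby('id')['sell_price']
toy['shift_price_t1'] = grouped.transform(lambda x: x.shift(1))
toy['rolling_price_max_t3'] = grouped.transform(lambda x: x.shift(1).rolling(3).max())
toy['price_change_t1'] = (toy['shift_price_t1'] - toy['sell_price']) / toy['shift_price_t1']
print(toy)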
def main(debug=False, use_pkl=False):
    num_rows = 10000 if debug else None
    if use_pkl:
        df = loadpkl('../output/df.pkl')
    else:
        with timer("train & test"):
            df = train_test(num_rows)
        with timer("nightley"):
            df = pd.merge(df, nightley(num_rows), on=['datetime', 'park'], how='outer')
        with timer("hotlink"):
            df = pd.merge(df, hotlink(num_rows), on='datetime', how='outer')
        with timer("colopl"):
            df = pd.merge(df, colopl(num_rows), on=['park', 'year', 'month'], how='outer')
        with timer("weather"):
            df = pd.merge(df, weather(num_rows), on=['datetime', 'park'], how='outer')
        with timer("nied_oyama"):
            df = pd.merge(df, nied_oyama(num_rows), on=['datetime', 'park'], how='outer')
        with timer("agoop"):
            df = pd.merge(df, agoop(num_rows), on=['park', 'year', 'month'], how='outer')
        with timer("jorudan"):
            df = pd.merge(df, jorudan(num_rows), on=['datetime', 'park'], how='outer')
        with timer("save pkl"):
            save2pkl('../output/df.pkl', df)

    with timer("Run XGBoost with kfold"):
        print("df shape:", df.shape)
        feat_importance = kfold_xgboost(df, num_folds=NUM_FOLDS, stratified=True, debug=debug)
        display_importances(feat_importance,
                            '../output/xgb_importances.png',
                            '../output/feature_importance_xgb.csv')
def from_dict(cls, path: Union[str, Path], gz=False, mode=None):
    """Load a pickled dictionary from a file and construct a vocabulary from it.

    No restriction is placed on size or frequency; the given file is simply
    loaded and used as the mapping.

    Parameters:
        path (str): Path to the file.
        gz (bool): Whether the file is gzip-compressed.
        mode (str or None): If 'i2w' or 'w2i', the loaded dict is used as the
            mapping of that type and the other mapping is built automatically.
            If None, the file is expected to contain both mappings under the
            keys 'w2i' and 'i2w'.
    """
    valid_modes = ['i2w', 'w2i']
    dic: dict = loadpkl(path, gz=gz)
    vocab = Vocab(symbols=[])
    if mode is None:
        vocab.w2i = dic['w2i']
        vocab.i2w = dic['i2w']
    elif mode in valid_modes:
        # use the loaded dict for the given direction and invert it for the other
        setattr(vocab, mode, dic)
        setattr(vocab, next(m for m in valid_modes if mode != m),
                {v: k for k, v in dic.items()})
    else:
        raise ValueError(f"Invalid mode '{mode}'")
    return vocab
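A minimal usage sketch of from_dict, assuming Vocab exposes plain w2i/i2w dict attributes as the method implies: load only an index-to-word mapping and let the method derive the inverse.

# Hypothetical call: 'rel_names.pkl' holds an {index: name} mapping, so the
# word-to-index direction is built automatically from it.
rel_vocab = Vocab.from_dict('data/rel_names.pkl', mode='i2w')
assert rel_vocab.w2i == {name: idx for idx, name in rel_vocab.i2w.items()}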
def main(debug=False):
    with timer("Load Datasets"):
        # load pkl
        df = loadpkl('../features/queries_profiles.pkl')

        # use selected features
        df = df[configs['features']]

        # set sid as index
        df.set_index('sid', inplace=True)

        # split train & test
        train_df = df[df['click_mode'].notnull()]
        test_df = df[df['click_mode'].isnull()]

        del df
        gc.collect()

        if debug:
            train_df = train_df.iloc[:1000]

    with timer("Run LightGBM with kfold"):
        kfold_lightgbm(train_df, test_df, num_folds=NUM_FOLDS, stratified=True, debug=debug)
def _try_load_cache(self, path: Path) -> bool:
    r"""Try loading a cached dataset from the data directory.

    :param path: The path to the data directory.
    :return: Whether loading was successful.
    """
    cache_dir = path / '_cached'
    if not cache_dir.exists():
        return False

    params_index_path = cache_dir / 'params_index.pkl'
    params_index: List[Dict[str, Any]] = loadpkl(params_index_path)
    params = self._get_params()
    index = next(
        (idx for idx, p in enumerate(params_index)
         if (cache_dir / f'{idx}.pkl').exists() and self._compare_params(p, params)),
        -1)
    if index != -1:
        load_path = cache_dir / f'{index}.pkl'
        self.batches = loadpkl(load_path)
        self.ntokens = {
            split: sum(batch.ntokens
                       for _, batches in dataset
                       for batch in batches)
            for split, dataset in self.batches.items()
        }
        LOGGER.info(
            f"Cached dataset loaded from {load_path}, with settings: {params}")

        # check for excluded keys and warn in case of mismatch
        load_params = params_index[index]
        for key in self.EXCLUDE_KEYS:
            if key in params or key in load_params:
                current = params.get(key, "<does not exist>")
                loaded = load_params.get(key, "<does not exist>")
                if current != loaded:
                    LOGGER.info(
                        Logging.color(
                            'red',
                            f"Ignored data param '{key}' mismatch "
                            f"(current: {current}, loaded: {loaded})"))
        return True
    return False
def main():
    # load submission files
    sub = pd.read_csv("../input/sample_submit.tsv", sep='\t', header=None)
    sub_lgbm = pd.read_csv("../output/submission_lgbm.tsv", sep='\t', header=None)
    sub_xgb = pd.read_csv("../output/submission_xgb.tsv", sep='\t', header=None)

    # rename columns
    sub.columns = ['index', 'visitors']
    sub_lgbm.columns = ['index', 'visitors']
    sub_xgb.columns = ['index', 'visitors']

    # blend predictions
    sub.loc[:, 'visitors'] = 0.5 * sub_lgbm['visitors'] + 0.5 * sub_xgb['visitors']

    del sub_lgbm, sub_xgb
    gc.collect()

    # load out-of-fold predictions
    oof_lgbm = pd.read_csv("../output/oof_lgbm.csv")
    oof_xgb = pd.read_csv("../output/oof_xgb.csv")
    oof_preds = 0.5 * oof_lgbm['OOF_PRED'] + 0.5 * oof_xgb['OOF_PRED']

    # load train_df
    train_df = loadpkl('../output/train_df.pkl')
    train_df = train_df.sort_values('index')

    # compute local CV score
    local_mae = mean_absolute_error(train_df['visitors'], oof_preds)

    # LINE notify
    line_notify('Blend Local MAE score %.6f' % local_mae)

    del oof_lgbm, oof_xgb
    gc.collect()

    # save submission file
    sub[['index', 'visitors']].sort_values('index').to_csv(submission_file_name,
                                                           index=False,
                                                           header=False,
                                                           sep='\t')
def _save_cache(self, path: Path):
    r"""Save the dataset to cache in the data directory.

    :param path: The path to the data directory.
    """
    cache_dir = path / '_cached'
    params_index_path = cache_dir / 'params_index.pkl'
    params = self._get_params()
    if not cache_dir.exists():
        cache_dir.mkdir()
        index = 0
        params_index = [params]
    else:
        params_index = loadpkl(params_index_path)
        index = next((idx for idx in range(len(params_index))
                      if not (cache_dir / f'{idx}.pkl').exists()),
                     len(params_index))
        params_index[index:(index + 1)] = [params]  # replace or append
    savepkl(params_index, params_index_path)
    load_path = cache_dir / f'{index}.pkl'
    savepkl(self.batches, load_path)
    LOGGER.info(f"Dataset cached to {load_path}, with settings: {params}")
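The cache logic above leans on `loadpkl`/`savepkl`, which are not shown in these snippets. A minimal sketch of what they are assumed to do: plain pickle round-trips, with the optional gzip support used by `Vocab.from_dict`. The real helpers (and their argument order, which differs between some of the snippets here) may not match this exactly.

import gzip
import pickle
from pathlib import Path
from typing import Any, Union


def loadpkl(path: Union[str, Path], gz: bool = False) -> Any:
    """Load a pickled object, optionally gzip-compressed."""
    opener = gzip.open if gz else open
    with opener(path, 'rb') as f:
        return pickle.load(f)


def savepkl(obj: Any, path: Union[str, Path], gz: bool = False) -> None:
    """Pickle an object to disk, optionally gzip-compressed."""
    opener = gzip.open if gz else open
    with opener(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)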
def __init__(self, args: LMArguments, *, vocab_size: int, rel_vocab_size: int,
             max_unkrel: int):
    self._rel_vocab_size = rel_vocab_size
    self._max_unkrel = max_unkrel
    self._kb_embed_dim = args.kb_embed_dim
    self._fact_embed_dim = args.kb_embed_dim * 2
    pred_input_dim = args.hidden_size + (
        self._fact_embed_dim if args.use_knowledge_embed else 0)
    super().__init__(args, vocab_size, pred_input_dim=pred_input_dim)

    self._num_layers = args.num_layers
    self._use_knowledge_embed = args.use_knowledge_embed
    self._train_relation_vec = args.train_relation_vec
    self._alias_disamb = args.alias_disamb_strategy

    self.selector = nn.Linear(self._pred_input_dim, 2)

    if args.alias_disamb_strategy is AliasDisamb.FastText:
        # Alias disambiguation using FastText.
        def _alias_path(name):
            path = Path(args.fasttext_model_path)
            return path.parent / (path.name + f'.{name}')

        self.alias_vec = torch.load(_alias_path('alias_vectors.pt')).to(self.device)
        if args.normalize_fasttext_embeds:
            self.alias_vec = self.alias_vec / torch.norm(
                self.alias_vec, dim=1).unsqueeze(0)
        self.alias_list: List[str] = reverse_map(
            loadpkl(_alias_path('alias_dict.pkl')))
        self._alias_vec_dim = self.alias_vec.size(1)
        self.hid_to_alias = nn.Linear(self._pred_input_dim, self._alias_vec_dim)

    # Entity disambiguation using entity embeddings.
    self.entity_vec, relation_vec = utils.load_kb_embed(args.path, self.device)
    if self._train_relation_vec:
        rel_vocab_size, rel_embed_dim = relation_vec.size()
        self.relation_embed = nn.Embedding(rel_vocab_size, rel_embed_dim)
    else:
        self.relation_vec = relation_vec
    # (added 1) -2: anchor, -3: topic_itself, < -4: UNKs
    self.special_rel_vecs = nn.ParameterList([
        nn.Parameter(torch.Tensor(self._kb_embed_dim))
        for _ in range(max_unkrel + 2)
    ])
    self.unk_entity_vec = nn.Parameter(torch.Tensor(self._kb_embed_dim))  # -1

    def _(a, b):
        return a if a != -1 else b

    # Entity prediction
    if args.use_rel_mlp:
        self.hid_to_fact: nn.Module = utils.MLP(
            self._pred_input_dim,
            _(args.fact_key_mlp_hidden_dim, self._fact_embed_dim * 2),
            self._fact_embed_dim,
            dropout=args.dropout)
    else:
        self.hid_to_fact = nn.Linear(self._pred_input_dim, self._fact_embed_dim)

    self.reset_parameters()
def __init__(self, args: LMArguments, *, vocab_size: int, rel_vocab_size: int,
             max_unkrel: int):
    self._word_embed_dim: int = args.embed_size
    self._hidden_dim: int = args.hidden_size
    self._rel_vocab_size = rel_vocab_size
    self._position_embed_dim: int = args.pos_embed_dim
    self._position_size: int = args.pos_embed_count
    self._kb_embed_dim: int = args.kb_embed_dim
    self._num_layers: int = args.num_layers
    self._dropout: float = args.dropout
    self._use_anchor_rels: bool = args.use_anchor
    self._alias_disamb = args.alias_disamb_strategy

    self._fact_embed_dim: int = self._kb_embed_dim * 2  # concat of TransE embeds for relation & object
    input_dim = self._fact_embed_dim + self._word_embed_dim + self._position_embed_dim
    pred_input_dim = self._hidden_dim + self._fact_embed_dim
    super().__init__(args, vocab_size, input_dim=input_dim,
                     pred_input_dim=pred_input_dim,
                     embed_dim=self._word_embed_dim)

    self._fact_sel_strategy = args.fact_sel_strategy
    self._mask_invalid_pos = args.mask_invalid_pos
    self._use_knowledge_embed = args.use_knowledge_embed

    def _(a, b):
        return a if a != -1 else b

    # All MLP hidden dim specs are copied from the original code

    # Sec 3.2.2 Fact Extraction
    self.fact_key_mlp = utils.MLP(
        self._hidden_dim +
        (self._fact_embed_dim if self._use_knowledge_embed else 0),
        _(args.fact_key_mlp_hidden_dim, self._fact_embed_dim * 2),
        self._fact_embed_dim,
        dropout=self._dropout)

    # Sec 3.2.3 Selecting Word Generation Source
    self.copy_predictor = utils.MLP(
        self._pred_input_dim,
        _(args.copy_mlp_hidden_dim, self._hidden_dim),
        1,
        dropout=self._dropout)
    self.pos_predictor = utils.MLP(
        self._pred_input_dim,
        _(args.pos_mlp_hidden_dim, self._position_embed_dim * 2),
        self._position_size,
        dropout=self._dropout)

    if args.alias_disamb_strategy is AliasDisamb.FastText:
        def _alias_path(name):
            path = Path(args.fasttext_model_path)
            return path.parent / (path.name + f'.{name}')

        self.alias_vec = torch.load(_alias_path('alias_vectors.pt')).to(self.device)
        if args.normalize_fasttext_embeds:
            self.alias_vec = self.alias_vec / torch.norm(
                self.alias_vec, dim=1).unsqueeze(0)
        self.alias_list: List[str] = reverse_map(
            loadpkl(_alias_path('alias_dict.pkl')))
        self._alias_vec_dim = self.alias_vec.size(1)
        self.hid_to_alias = nn.Linear(self._pred_input_dim, self._alias_vec_dim)

    # Embeddings
    self.position_embed = nn.Embedding(self._position_size, self._position_embed_dim)

    # KB related
    self.entity_vec, self.relation_vec = utils.load_kb_embed(args.path, self.device)
    self.naf_vec = nn.Parameter(torch.Tensor(self._fact_embed_dim))  # -1
    # (added 1) -2: anchor, -3: topic_itself, < -4: UNKs
    self.special_rel_vecs = nn.ParameterList([
        nn.Parameter(torch.Tensor(self._kb_embed_dim))
        for _ in range(max_unkrel + 2)
    ])
    self.unk_entity_vec = nn.Parameter(torch.Tensor(self._kb_embed_dim))  # -1

    self.reset_parameters()
rows = []
for i in list(set(tables[0])):
    rows.append(len([k for k in tables[0] if k == i]))
print(f"rows distribution: {rows}\n")
plt.plot(rows)
# plt.plot(range(20, 50), rows[20:50])
plt.savefig('rows.png')

unique_cells = get_cell_stats(all_tables)
total_cells_count = 0
print(f"unique_cells: {len(unique_cells)}\n")

cell_len_over_20 = len([i for i in unique_cells if len(i.split(' ')) > 20])
print(f"cell_len_over_20: {cell_len_over_20}\n")

total_cells = reduce(
    lambda x, y: x + y,
    list(map(lambda x: x[0] * x[1], list(zip(tables[0], tables[1])))))
print(f"total_cells: {total_cells}\n")


if __name__ == "__main__":
    # baseline_f = pd.read_csv('../global_data/features.csv')
    # tables_subset_3k = list(baseline_f['table_id'])
    # tables_subset = list(
    #     set(tables_subset_3k + random.sample(all_tables, 20000)))
    tables_subset = loadpkl('./data/postive_tables_set.pkl')
    read_all_tables = [read_table(js)['data'] for js in tables_subset]
    dataset_stats(read_all_tables)

    vocab = loadpkl('./data/vocab_2D_10-50_complete.pkl')
    print(f'vocab: {len(vocab)}\n')
from sklearn.model_selection import KFold, StratifiedKFold

from preprocess import train_test, nightley, hotlink, colopl, weather, nied_oyama, jorudan, agoop
from utils import FEATS_EXCLUDED, NUM_FOLDS, loadpkl, line_notify

################################################################################
# Hyperparameter optimization with optuna
# Reference: https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
################################################################################

NUM_ROWS = None
USE_PKL = True

if USE_PKL:
    DF = loadpkl('../output/df.pkl')
else:
    DF = train_test(NUM_ROWS)
    DF = pd.merge(DF, nightley(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF, hotlink(NUM_ROWS), on='datetime', how='outer')
    DF = pd.merge(DF, colopl(NUM_ROWS), on=['year', 'month'], how='outer')
    DF = pd.merge(DF, weather(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF, nied_oyama(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF, agoop(NUM_ROWS), on=['park', 'year', 'month'], how='outer')
    DF = pd.merge(DF, jorudan(NUM_ROWS), on=['datetime', 'park'], how='outer')
import numpy as np
import pandas as pd
import optuna
import gc
import lightgbm  # needed for lightgbm.Dataset below

from sklearn.model_selection import KFold, StratifiedKFold

from utils import FEATS_EXCLUDED, NUM_FOLDS, loadpkl, line_notify

################################################################################
# Hyperparameter optimization with optuna
# Reference: https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
################################################################################

# load datasets
TRAIN_DF = loadpkl('../output/train_df.pkl')
FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]


def objective(trial):
    lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                                  TRAIN_DF['target'],
                                  free_raw_data=False)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
savepkl(f'./data/yp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_p)

'''
Generating negative X, y dataset from the tables (1.3 x X_p)
'''
size = int(len(X_p) * 1.3)
with Pool(40) as p:
    # collect generated negative tables from the worker pool
    X_n = list(tqdm(p.imap(generate_neg_table, range(size)), total=size))
    p.close()
    p.join()
X_n, y_n = data_prep_pipeline(X_n, '-')
savepkl(f'./data/xn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X_n)
savepkl(f'./data/yn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_n)

'''
Generating word distribution dataframe
'''
X_n = loadpkl('./data/xn_2D_10-50.pkl')
X_p = loadpkl('./data/xp_2D_10-50.pkl')
X = np.hstack((X_n, X_p))
print(X.shape, X_n.shape, X_p.shape)
result = flatten_1_deg(flatten_1_deg(flatten_1_deg(X)))

print('Adding queries...')
query_l = [tokenize_str(i) for i in list(baseline_f['query'].unique())]
query_l = flatten_1_deg(query_l)
result += query_l
print(result[:10])

count = Counter(result)
c = [[i, count[i]] for i in count.keys()]
df = pd.DataFrame(c)
df.sort_values(by=[1], ascending=False, inplace=True)
df.to_csv('./data/word_distr_2D_complete.csv', index=False, columns=None)

'''
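`flatten_1_deg` is not defined in these snippets; from its triple application above (tables → rows → cells → tokens), it is assumed to flatten exactly one level of nesting, roughly:

def flatten_1_deg(nested):
    """Assumed helper: flatten one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3]."""
    return [item for sub in nested for item in sub]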
print('Data preparation happening...')
baseline_f = pd.read_csv('../global_data/features.csv')

def t(baseline_f):
    baseline_f['table_tkn'] = baseline_f.table_id.apply(
        lambda x: tokenize_table(read_table(x)['data']))
    baseline_f['query_tkn'] = baseline_f['query'].apply(
        lambda x: tokenize_str(x))
    return baseline_f

baseline_f = mp(baseline_f, t, 20)
baseline_f.to_csv('./data/baseline_f_tq-tkn.csv', index=False)

if args.path:
    config = Config()
    vocab = loadpkl(config['input_files']['vocab_path'])
    output_dir = f'./output/{args.path}'
    model_load = torch.load(os.path.join(output_dir, 'model.pt'))
    baseline_f = pd.read_csv(config['input_files']['baseline_f'])

    trec = TREC_data_prep(model=model_load, vocab=vocab)
    baseline_f = mp(df=baseline_f, func=trec.pipeline, num_partitions=20)
    baseline_f.drop(columns=['table_emb', 'query_emb'], inplace=True)
    # baseline_f.to_csv('./baseline_f_tq-emb_temp.csv', index=False)
    # baseline_f = pd.read_csv('./baseline_f_tq-emb_temp.csv')

    trec_path = os.path.join(output_dir, config['trec']['folder_name'])
    trec_model = TREC_model(data=baseline_f, output_dir=trec_path, config=config)
    trec_model.train()
def main(num_rows=None):
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).astype(int)

    # distance ratio
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]
    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(i)] = df[c] - df['queries_distance']

    # stats features for preds
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = ['pred_queries_profiles{}'.format(i) for i in range(0, 12)]

    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)

    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(axis=1)

    # stats features for each class
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = ['pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)
        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + ['pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]

    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]

    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan', 'plan_price_distance_ratio_min_plan',
        'plan_price_eta_ratio_max_plan', 'plan_price_eta_ratio_min_plan',
        'plan_distance_eta_ratio_max_plan', 'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan', 'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan', 'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan', 'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan', 'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan', 'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan', 'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan', 'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan', 'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan', 'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan', 'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan', 'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]

    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan',
        'plan_eta_max_plan', 'plan_eta_min_plan'
    ]

    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    cols_target_encoding = (cols_target_encoding + cols_ratio_plan +
                            cols_min_max_plan + cols_transport_mode +
                            ['profile_k_means'])

    # target encoding for each city
    print('target encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):
        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json, '../features/00{}_all_features.json'.format(i + 1))

        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
def main():
    # load predictions
    pred_lgbm = loadpkl('../features/lgbm_pred.pkl')
    pred_xgb = loadpkl('../features/xgb_pred.pkl')
    plans = loadpkl('../features/plans.pkl')

    # define column name lists
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_pred_xgb = ['pred_xgb_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    # merge plans & pred
    pred = pred_lgbm[['sid', 'click_mode']]
    pred = pd.merge(pred,
                    plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                    on='sid', how='left')

    del plans
    gc.collect()

    # scale predictions
    pred_lgbm[cols_pred_lgbm] = scalingPredictions(pred_lgbm[cols_pred_lgbm])
    pred_xgb[cols_pred_xgb] = scalingPredictions(pred_xgb[cols_pred_xgb])

    # reset index
    pred_lgbm.reset_index(inplace=True, drop=True)
    pred_xgb.reset_index(inplace=True, drop=True)

    # fill predictions for non-existent plans with zero
    for i in range(1, 12):
        tmp = np.zeros(len(pred))
        for c in cols_transport_mode:
            tmp += (pred[c] == i).astype(int)
        pred_lgbm['pred_lgbm_plans{}'.format(i)] = pred_lgbm['pred_lgbm_plans{}'.format(i)] * (tmp > 0)
        pred_xgb['pred_xgb_plans{}'.format(i)] = pred_xgb['pred_xgb_plans{}'.format(i)] * (tmp > 0)

    # get best weight for lgbm & xgboost
    oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]
    oof_pred_xgb = pred_xgb[pred_xgb['click_mode'].notnull()]
    w = getBestWeights(oof_pred_lgbm.click_mode, oof_pred_lgbm, oof_pred_xgb,
                       '../imp/weight.png')

    # calc prediction for each class
    cols_pred = []
    for i in range(0, 12):
        pred['pred_{}'.format(i)] = (w * pred_lgbm['pred_lgbm_plans{}'.format(i)] +
                                     (1.0 - w) * pred_xgb['pred_xgb_plans{}'.format(i)])
        cols_pred.append('pred_{}'.format(i))

    # get out-of-fold values
    oof_pred = pred[pred['click_mode'].notnull()]

    # get best multiples
    m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred, '../imp/multiple4.png')
    pred['pred_4'] *= m4
    oof_pred['pred_4'] *= m4

    m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred, '../imp/multiple0.png')
    pred['pred_0'] *= m0
    oof_pred['pred_0'] *= m0

    m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred, '../imp/multiple3.png')
    pred['pred_3'] *= m3
    oof_pred['pred_3'] *= m3

    m6 = getBestMultiple(oof_pred, 'pred_6', cols_pred, '../imp/multiple6.png')
    pred['pred_6'] *= m6
    oof_pred['pred_6'] *= m6

    # get recommended mode
    pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

    # if number of plans = 1 and recommend mode != 0, set recommend mode to the plan 0 mode.
    pred['recommend_mode'][(pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)] = \
        pred['plan_0_transport_mode'][(pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)]

    # split train & test
    sub_pred = pred[pred['click_mode'].isnull()]
    oof_pred = pred[pred['click_mode'].notnull()]

    # out-of-fold score
    oof_f1_score = f1_score(oof_pred['click_mode'], oof_pred['recommend_mode'],
                            average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name, index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))
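getBestWeights and getBestMultiple are not shown in these snippets. A plausible minimal sketch of the weight search, under the assumption that it simply grid-searches a blend weight that maximizes weighted F1 on the out-of-fold predictions and saves a diagnostic plot; the argument order follows the call above, but the real helper may differ:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score


def getBestWeights(y_true, oof_lgbm, oof_xgb, save_path):
    """Hypothetical sketch: grid-search the LightGBM/XGBoost blend weight
    that maximizes weighted F1 on the out-of-fold predictions."""
    cols_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_xgb = ['pred_xgb_plans{}'.format(i) for i in range(0, 12)]
    weights = np.linspace(0.0, 1.0, 101)
    scores = []
    for w in weights:
        blend = w * oof_lgbm[cols_lgbm].values + (1.0 - w) * oof_xgb[cols_xgb].values
        scores.append(f1_score(y_true, np.argmax(blend, axis=1), average='weighted'))
    plt.plot(weights, scores)
    plt.savefig(save_path)
    plt.close()
    return weights[int(np.argmax(scores))]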
from glob import glob
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

from utils import FEATS_EXCLUDED, loadpkl, line_notify, to_json

#==============================================================================
# hyper parameter optimization by optuna
# https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
#==============================================================================

# load configs
CONFIGS = json.load(open('../configs/101_lgbm_queries.json'))

# load pkl
DF = loadpkl('../features/queries.pkl')

# split train & test
TRAIN_DF = DF[DF['click_mode'].notnull()]

del DF
gc.collect()

# use selected features
TRAIN_DF = TRAIN_DF[CONFIGS['features']]

# set sid as index
TRAIN_DF.set_index('sid', inplace=True)

FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]


def objective(trial):
def _extra_init(self, loaded_batches: bool):
    self.rel_vocab = Vocab.from_dict(self._path / 'rel_names.pkl', mode='i2w')
    self.vocab: Dict[str, Vocab] = {
        "word": self.word_vocab,
        "rel": self.rel_vocab
    }
    self.max_unkrel = max(
        (-rel_typ - 3 for rel_typ in self.rel_vocab.i2w if rel_typ < -3),
        default=0)

    if self._use_fasttext:
        def _alias_path(name):
            path = Path(self._fasttext_model_path)
            return path.parent / (path.name + f'.{name}')

        # gather all entity aliases and compute fastText embeddings
        alias_dict_path = _alias_path('alias_dict.pkl')
        if alias_dict_path.exists():
            alias_dict: Dict[str, int] = loadpkl(alias_dict_path)
            loaded = True
        else:
            alias_dict = defaultdict(lambda: len(alias_dict))
            loaded = False
        if not loaded_batches:
            for dataset in self.data.values():
                for example in dataset:
                    for idx, rel in enumerate(example.relations):  # type: ignore
                        example.relations[idx] = rel._replace(  # type: ignore
                            obj_alias=[alias_dict[s] for s in rel.obj_alias])
        if not alias_dict_path.exists():
            alias_dict = dict(alias_dict)
            savepkl(alias_dict, alias_dict_path)

        alias_vectors_path = _alias_path('alias_vectors.pt')
        if not alias_vectors_path.exists() or not loaded:
            import fastText
            ft_model = fastText.load_model(self._fasttext_model_path)
            alias_vectors = []
            alias_list = utils.reverse_map(alias_dict)
            for alias in utils.progress(alias_list,
                                        desc="Building fastText vectors",
                                        ascii=True, ncols=80):
                vectors = [ft_model.get_word_vector(w) for w in alias.split()]
                vectors = np.sum(vectors, axis=0).tolist()
                alias_vectors.append(vectors)
            alias_vectors = torch.tensor(alias_vectors)
            torch.save(alias_vectors, alias_vectors_path)

    if not loaded_batches and (self._exclude_entity_disamb or self._exclude_alias_disamb):
        # no need to do this if batches are loaded
        if self._exclude_entity_disamb:
            # gather training set stats
            self.entity_count_per_type = self.gather_entity_stats(self.data['train'])
        for dataset in self.data.values():
            for idx in range(len(dataset)):
                dataset[idx] = self.remove_ambiguity(
                    dataset[idx],
                    self._exclude_entity_disamb,
                    self._exclude_alias_disamb)
    parser.add_argument(
        "--comment",
        help="additional comments for simulation to be run.")
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    output_dir, config = utils.setup_simulation(args)
    model_params = config['model_params']
    input_files = config['input_files']
    trec_config = config['trec']

    torch.manual_seed(model_params['seed'])

    Xp = loadpkl(input_files['Xp_path'])
    yp = loadpkl(input_files['yp_path'])
    logger.info(f"Xp.shape: {Xp.shape}, yp.shape: {yp.shape}")
    # Xn = loadpkl(input_files['Xn_path'])
    # yn = loadpkl(input_files['yn_path'])
    # logger.info(f"Xn.shape: {Xn.shape}, yn.shape: {yn.shape}")
    vocab = loadpkl(input_files['vocab_path'])
    logger.info(f"len(vocab): {len(vocab)}")

    train_writer = make_writer(output_dir, 'train', config)
    test_writer = make_writer(output_dir, 'test', config)

    device = torch.device(
        f"cuda:{args.cuda_no}" if torch.cuda.is_available() else 'cpu')
    model = models.create_model(config['model_props']['type'], params=(
from glob import glob
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

from utils import FEATS_EXCLUDED, loadpkl, line_notify, to_json

#==============================================================================
# hyper parameter optimization by optuna
# https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
#==============================================================================

# load configs
CONFIGS = json.load(open('../configs/102_lgbm_queries_profiles.json'))

# load pkl
DF = loadpkl('../features/queries_profiles.pkl')

# split train & test
TRAIN_DF = DF[DF['click_mode'].notnull()]

del DF
gc.collect()

# use selected features
TRAIN_DF = TRAIN_DF[CONFIGS['features']]

# set sid as index
TRAIN_DF.set_index('sid', inplace=True)

FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]


def objective(trial):
        tables[i] = np.vectorize(lambda y: w2i[y])(np.array(t)).tolist()
    return tables


if __name__ == '__main__':
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pad_data_prep",
                        help="path for the scores",
                        action='store_true')
    args = parser.parse_args()
    config = Config()

    X = loadpkl("./data/xp_2D_10-50.pkl")
    y = loadpkl("./data/yp_2D_10-50.pkl")
    vocab = loadpkl(config['input_files']['vocab_path'])
    table_prep_params = config['table_prep_params']
    print(X.shape, y.shape, len(vocab))

    if args.pad_data_prep:
        def pad_table(table):
            rows = len(table)
            for row in table:
                for cell in row:
                    for i in range(0, table_prep_params['LENGTH_PER_CELL'] - len(cell)):
                        cell.append('<PAD>')
def main():
    # load predictions
    pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')
    pred_lgbm2 = loadpkl('../features/lgbm_pred_2.pkl')
    pred_lgbm3 = loadpkl('../features/lgbm_pred_3.pkl')
    plans = read_pickles('../features/plans')
    preds = [pred_lgbm1, pred_lgbm2, pred_lgbm3]

    # define column name lists
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    # remove columns
    cols_drop = [
        c for c in plans.columns
        if c not in cols_transport_mode + ['sid', 'plan_num_plans', 'click_mode']
    ]
    plans.drop(cols_drop, axis=1, inplace=True)

    # postprocessing
    sub_preds = []
    oof_preds = []
    for i, pred_lgbm in enumerate(preds):
        # merge plans & pred
        pred = pred_lgbm[['sid', 'click_mode']]
        pred = pd.merge(pred,
                        plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                        on='sid', how='left')

        # scale predictions
        pred_lgbm[cols_pred_lgbm] = scalingPredictions(pred_lgbm[cols_pred_lgbm])

        # reset index
        pred_lgbm.reset_index(inplace=True, drop=True)

        # fill predictions for non-existent plans with zero
        for j in range(1, 12):
            tmp = np.zeros(len(pred))
            for c in cols_transport_mode:
                tmp += (pred[c] == j).astype(int)
            pred_lgbm['pred_lgbm_plans{}'.format(j)] = pred_lgbm['pred_lgbm_plans{}'.format(j)] * (tmp > 0)

        # out-of-fold predictions
        oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]

        # calc prediction for each class
        cols_pred = []
        for j in range(0, 12):
            pred['pred_{}'.format(j)] = pred_lgbm['pred_lgbm_plans{}'.format(j)]
            cols_pred.append('pred_{}'.format(j))

        # get out-of-fold values
        oof_pred = pred[pred['click_mode'].notnull()]

        # get best multiples
        m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred, '../imp/multiple0_{}.png'.format(i + 1))
        pred['pred_0'] *= m0
        oof_pred['pred_0'] *= m0

        m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred, '../imp/multiple3_{}.png'.format(i + 1))
        pred['pred_3'] *= m3
        oof_pred['pred_3'] *= m3

        m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred, '../imp/multiple4_{}.png'.format(i + 1))
        pred['pred_4'] *= m4
        oof_pred['pred_4'] *= m4

        # get recommended mode
        pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

        # if number of plans = 1 and recommend mode != 0, fill recommend mode with the plan 0 mode.
        pred['recommend_mode'][(pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)] = \
            pred['plan_0_transport_mode'][(pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)]

        # split train & test
        _sub_pred = pred[pred['click_mode'].isnull()]
        _oof_pred = pred[pred['click_mode'].notnull()]

        sub_preds.append(_sub_pred)
        oof_preds.append(_oof_pred)

        del pred, _sub_pred, _oof_pred
        gc.collect()

    # merge preds
    sub_pred = sub_preds[0].append(sub_preds[1])
    sub_pred = sub_pred.append(sub_preds[2])
    sub_pred = pd.merge(plans[plans['click_mode'].isnull()][['sid', 'click_mode']],
                        sub_pred[['sid', 'recommend_mode']],
                        on='sid', how='left')

    oof_pred = oof_preds[0].append(oof_preds[1])
    oof_pred = oof_pred.append(oof_preds[2])
    oof_pred = pd.merge(plans[plans['click_mode'].notnull()][['sid', 'click_mode']],
                        oof_pred[['sid', 'recommend_mode']],
                        on='sid', how='left')

    del sub_preds, oof_preds, plans

    # out-of-fold score
    oof_f1_score = f1_score(oof_pred['click_mode'], oof_pred['recommend_mode'],
                            average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name, index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))