def on_fe_end(self, state: State):
    """Persist every feature set and the target under ``state.feature_dir``.

    For each feature name and each of the ``"train"`` / ``"test"`` phases:

    * ``dict`` or ``csr_matrix`` payloads are written with ``savemat`` to
      ``<prefix><name>_<phase>.mat``;
    * ``pd.DataFrame`` payloads are written with ``to_feather`` to
      ``<prefix><name>_<phase>.ftr`` after float16 columns are cast
      (in place) to float32;
    * any other payload type raises ``NotImplementedError``.

    Finally ``state.target`` is saved to ``<prefix>main_target.npy``.
    """
    out_dir = state.feature_dir
    for name, per_phase in state.features.items():
        for phase in ("train", "test"):
            payload = per_phase[phase]
            if isinstance(payload, (dict, csr_matrix)):
                file_name = f"{self.prefix}{name}_{phase}.mat"
                with utils.timer("Saving " + file_name, state.logger):
                    savemat(out_dir / file_name, {name: payload})
            elif isinstance(payload, pd.DataFrame):
                file_name = f"{self.prefix}{name}_{phase}.ftr"
                with utils.timer("Saving " + file_name, state.logger):
                    # NOTE(review): presumably float16 is not accepted by the
                    # feather writer, hence the upcast - confirm.
                    for column in payload.columns:
                        if payload[column].dtype == "float16":
                            payload[column] = payload[column].astype("float32")
                    payload.to_feather(out_dir / file_name)
            else:
                raise NotImplementedError

    target_file = f"{self.prefix}main_target.npy"
    with utils.timer("Saving " + target_file, state.logger):
        np.save(out_dir / target_file, state.target)
def load_features(config: dict) -> Tuple[cudf.DataFrame, cudf.DataFrame]:
    """Load the configured feather feature files and concatenate them column-wise.

    Reads ``<feature_dir>/<name>_train.ftr`` and ``<feature_dir>/<name>_test.ftr``
    for every name in ``config["features"]``; names whose file does not exist
    are skipped. Column names that occur more than once across the train
    frames are printed, and an assertion fails if any duplicate exists.

    Returns:
        ``(x_train, x_test)`` as cudf frames.
    """
    feature_path = config["dataset"]["feature_dir"]

    def _read_existing(phase: str) -> list:
        # Feature names without a file on disk are skipped silently.
        frames = []
        for feature_name in config["features"]:
            file_path = f"{feature_path}/{feature_name}_{phase}.ftr"
            if Path(file_path).exists():
                frames.append(cudf.read_feather(file_path))
        return frames

    with timer("load train"):
        train_frames = _read_existing("train")
        all_columns = [
            name for frame in train_frames for name in frame.columns.tolist()
        ]
        occurrences = collections.Counter(all_columns)
        duplicated = [name for name, count in occurrences.items() if count > 1]
        print(f"duplicated cols: {duplicated}")
        assert len(all_columns) == len(np.unique(all_columns))
        x_train = cudf.concat(train_frames, axis=1, sort=False)

    with timer("load test"):
        x_test = cudf.concat(_read_existing("test"), axis=1, sort=False)

    return x_train, x_test
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Build group-by features on the concatenation of train and test.

    Applies ``GroupbyTransformer``, ``DiffGroupbyTransformer`` and
    ``RatioGroupbyTransformer`` (all configured with ``groupby_dict``) to the
    stacked frame, keeps only the columns that were not in ``train_df``, and
    stores the train / test slices as pandas frames on ``self.train`` and
    ``self.test``.
    """
    with timer("load data"):
        train_part = train_df.copy()
        test_part = test_df.copy()
        n_train = len(train_part)
        original_columns = train_part.columns.tolist()

    with timer("concat train and test"):
        total = cudf.concat([train_part, test_part], ignore_index=True)
        # Drop the copies early to release memory before feature generation.
        del train_part, test_part
        gc.collect()

    with timer("make feats"):
        total = GroupbyTransformer(groupby_dict).transform(total)
        total = DiffGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)
        total = RatioGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)

        generated = [c for c in total.columns if c not in original_columns]
        added = total[generated]
        # The first n_train rows are the train rows (concat preserved order).
        train_feats = added.iloc[:n_train].reset_index(drop=True)
        test_feats = added.iloc[n_train:].reset_index(drop=True)

    with timer("end"):
        self.train = train_feats.reset_index(drop=True).to_pandas()
        self.test = test_feats.reset_index(drop=True).to_pandas()
def load_features(config: dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the configured pickled feature frames and concatenate them column-wise.

    Reads ``<feature_dir>/<name>_train.pkl`` and ``<feature_dir>/<name>_test.pkl``
    via ``load_pickle`` for every name in ``config["features"]``; names whose
    file does not exist are skipped.

    Returns:
        ``(x_train, x_test)`` as pandas frames.
    """
    feature_path = config["dataset"]["feature_dir"]

    def _concat_phase(phase: str) -> pd.DataFrame:
        frames = []
        for feature_name in config["features"]:
            file_path = f"{feature_path}/{feature_name}_{phase}.pkl"
            if Path(file_path).exists():
                frames.append(load_pickle(file_path))
        return pd.concat(frames, axis=1, sort=False)

    with timer("load train"):
        x_train = _concat_phase("train")

    with timer("load test"):
        x_test = _concat_phase("test")

    return x_train, x_test
def create_features(
    self,
    train_df: cudf.DataFrame,
    test_df: cudf.DataFrame,
):
    """Build ``CategoryVectorizer`` features with LDA at several sizes.

    Train and test are stacked, ``CategoryVectorizer`` (over ``cat_var_list``
    with a ``LatentDirichletAllocation`` transformer) is run once for each of
    5, 10, 20 and 30 components, and the outputs are concatenated
    column-wise. The train / test slices are stored on ``self.train`` and
    ``self.test``.
    """
    with timer("load data"):
        train_part = train_df.copy()
        n_train = len(train_part)
        test_part = test_df.copy()

    with timer("concat train and test"):
        total = cudf.concat([train_part, test_part], ignore_index=True)

    with timer("category vectorizer"):
        per_size = [
            CategoryVectorizer(
                categorical_columns=cat_var_list,
                n_components=size,
                transformer=LatentDirichletAllocation(n_components=size),
            ).transform(total)
            for size in (5, 10, 20, 30)
        ]
        combined = cudf.concat(per_size, axis=1)

    with timer("end"):
        self.train = combined.iloc[:n_train].reset_index(drop=True)
        self.test = combined.iloc[n_train:].reset_index(drop=True)
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Build label-encoded combinations of the categorical columns.

    Train and test are stacked (keeping the row position in an ``index``
    column), ``xfeat.ConcatCombination`` is applied to ``cat_cols`` for
    ``r`` in 2, 3 and 4, every resulting column is label-encoded and cast to
    ``category``, and the new columns are split back into ``self.train`` and
    ``self.test``.
    """
    with timer("load data"):
        train_part = train_df.copy()
        test_part = test_df.copy()
        n_train = len(train_part)
        original_columns = train_part.columns.tolist()

    with timer("concat train and test"):
        stacked = cudf.concat([train_part, test_part], ignore_index=True)
        total = stacked.reset_index()
        del train_part, test_part
        gc.collect()

    with timer("combi cats"):
        combination_frames = []
        for r in (2, 3, 4):
            combiner = xfeat.ConcatCombination(drop_origin=True, r=r)
            # A fresh string view is built for every r, as in the original.
            source = total[cat_cols].astype(str).fillna("none")
            combination_frames.append(combiner.fit_transform(source))
        combined = cudf.concat(combination_frames, axis="columns")

        for column in combined.columns:
            encoder = LabelEncoder()
            encoded = encoder.fit_transform(combined[column])
            combined[column] = encoded.astype("category")

        total = cudf.concat([total, combined], axis="columns")

    with timer("end"):
        # Restore the original row order before slicing by position.
        total = total.sort_values("index")
        excluded = original_columns + ["index"]
        generated = [c for c in total.columns if c not in excluded]
        added = total[generated]
        self.train = added.iloc[:n_train].reset_index(drop=True)
        self.test = added.iloc[n_train:].reset_index(drop=True)
def on_data_end(self, state: State):
    """Replace every frame in ``state.dataframes`` with its memory-reduced version.

    Each value is passed through ``utils.reduce_mem_usage`` (verbose, logging
    to ``state.logger``) and the result is stored back under the same key.
    """
    with utils.timer("Data Compressing", state.logger):
        frames = state.dataframes
        # Snapshot the keys; only the values are rebound during the loop.
        for key in list(frames):
            frames[key] = utils.reduce_mem_usage(
                frames[key], verbose=True, logger=state.logger
            )
# Run the training loop for ``self.config.n_epochs`` epochs.
#
# Each epoch is timed with ``timer`` under the label
# "CV <cv_num> epoch <epoch>" and does the following:
#   * ``self._train_one_epoch(train_loader)`` yields ``summary_loss``;
#   * when ``with_validation`` is true, ``summary_loss`` is replaced by the
#     result of ``self._validation(valid_loader)``;
#   * when ``summary_loss.avg`` is below ``self.best_summary_loss`` the best
#     value is updated, and the model is switched to eval mode and saved to
#     "<log_path>/best-checkpoint_cv<cv_num>.bin";
#   * when ``self.config.validation_scheduler`` is set, the scheduler is
#     stepped with ``summary_loss.avg`` as its metric;
#   * ``self.epoch`` is incremented.
#
# NOTE(review): with ``with_validation=False`` the checkpoint criterion and
# the scheduler metric are the training loss - confirm this is intended.
# NOTE(review): the original indentation is lost in this file; confirm
# whether ``if self.start_time is not None`` guards both ``eval()`` and
# ``save()``, and that the scheduler step runs once per epoch.
def fit( self, train_loader: DataLoader, valid_loader: DataLoader, with_validation: bool = False, ) -> None: for _ in range(self.config.n_epochs): with timer( f"CV {self.cv_num} epoch {self.epoch}", mlflow_on=self.mlflow_on ): summary_loss = self._train_one_epoch(train_loader) if with_validation: summary_loss = self._validation(valid_loader) if summary_loss.avg < self.best_summary_loss: self.best_summary_loss = summary_loss.avg if self.start_time is not None: self.train_model.eval() self.save( f"{self.log_path}/best-checkpoint_cv{self.cv_num}.bin" ) if self.config.validation_scheduler: self.scheduler.step(metrics=summary_loss.avg) self.epoch += 1
def fit_catboost(X, y, cv=None, params: dict = None, verbose=500):
    """Train one CatBoost model per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature array, indexed positionally with the fold index arrays.
        y: Target array, indexed the same way as ``X``.
        cv: Splitter exposing ``split(X, y)``. Defaults to a shuffled
            2-fold ``StratifiedKFold``.
        params: CatBoost parameters. Defaults to a deep copy of
            ``CAT_DEFAULT_PARAMS``.
        verbose: Currently unused by this function; kept for signature
            parity with the other ``fit_*`` helpers.

    Returns:
        ``(oof_pred, models, score)``: the out-of-fold positive-class
        probabilities, the list of fitted models (one per fold), and the
        ROC AUC of ``oof_pred`` against ``y``.
    """
    if params is None:
        params = deepcopy(CAT_DEFAULT_PARAMS)

    if cv is None:
        cv = StratifiedKFold(n_splits=2, shuffle=True)

    models = []
    # Allocate a zero array shaped like the training target.
    # It must be float: with an integer target, ``zeros_like`` would otherwise
    # produce an integer array and the stored probabilities would be truncated.
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
    # exact type it used to alias.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        # Cross-validation step: the cv instance splits the dataset, and the
        # training data is divided into train / valid parts for this fold.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = CatBoost(params=params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            clf_train = Pool(x_train, y_train)
            clf_val = Pool(x_valid, y_valid)
            clf.fit(clf_train, eval_set=[clf_val])

        # Column 1 holds the positive-class probability.
        pred_i = clf.predict(x_valid, prediction_type='Probability')[:, 1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} AUC: {roc_auc_score(y_valid, pred_i):.4f}')

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly: the printed text is unchanged, but
    # the literal no longer relies on an invalid escape sequence.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score
def shrink_dateframe(x_trn: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Reduce the training frame by release week (optional) and then by date.

    ``shrink_by_release`` is applied only when ``config["shrink_by_release"]``
    is truthy; ``shrink_by_date`` is always applied afterwards.
    """
    with timer("shrink datafrme"):
        reduced = x_trn
        use_release_filter = config["shrink_by_release"]
        if use_release_filter:
            reduced = shrink_by_release(reduced)
        reduced = shrink_by_date(reduced, config)
    return reduced
def shrink_by_release(x_trn: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows where ``wm_yr_wk >= release`` and renumber the index.

    Requires the ``wm_yr_wk`` and ``release`` columns. The shape before and
    after filtering is logged at INFO level.
    """
    with timer("shrink by release"):
        logging.info(f"before train shape : {x_trn.shape}")
        released = x_trn.query("wm_yr_wk >= release")
        released = released.reset_index(drop=True)
        logging.info(f"after train shape : {released.shape}")
    return released
def run_loader(self):
    """Run the loading pipeline step by step and return ``self.data``.

    The base steps always run, each inside its own ``timer``. The team-rating
    steps run only when ``self.add_team_ratings`` is set, and the assists
    step only when ``self.add_team_assists`` is set. Finally
    ``self.data['all_player_data']`` is passed through ``filter_data`` with
    the mask ``minutes > 30``.
    """
    # TODO implement search of cache for data
    base_steps = (
        ("loading fpl summary", self.load_fpl_summary),
        ("adding maps", self.add_maps),
        ("loading scores", self.load_match_scores),
        ("adding player IDs", self.add_player_id_list),
        ("loading player data", self.load_player_data),
        ("adding team IDs", self.add_player_team_id_to_player_data),
        ("adding player positions", self.add_player_positions),
    )
    for label, step in base_steps:
        with timer(label, __file__):
            step()

    # TODO: implement player % filtering
    if self.add_team_ratings:
        rating_steps = (
            ("calculating attack and defence scores",
             self.add_att_def_scores_to_data),
            ("merging att def scores",
             self.merge_att_def_ratings_to_all_player_data),
        )
        for label, step in rating_steps:
            with timer(label, __file__):
                step()

    if self.add_team_assists:
        with timer("adding assists", __file__):
            self.add_assists_to_data()

    players = self.data['all_player_data']
    self.data['all_player_data'] = filter_data(
        players,
        "corresponding to rows where less than 30 minutes played",
        players.minutes > 30,
    )
    return self.data
def _aggregate(self, dataframe):
    """Compute one aggregated frame per entry of ``self.param_dict``.

    For every parameter dict, ``dataframe`` is grouped by ``key`` and the
    ``var`` columns are aggregated with ``agg``. The result is given the
    column names ``key + self._get_feature_names(key, var, agg)`` and
    appended to ``self.features`` (which is reset first).

    Returns:
        ``self``, to allow chaining.
    """
    with timer("aggregate"):
        self.features = []
        for spec in tqdm(self.param_dict):
            key, var, agg, _ = self._get_params(spec)
            needed_columns = list(set(key + var))
            output_names = self._get_feature_names(key, var, agg)

            grouped = dataframe[needed_columns].groupby(key)
            aggregated = grouped[var].agg(agg).reset_index()
            aggregated.columns = key + output_names

            self.features.append(aggregated)
    return self
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Build group-by features on log-transformed numeric columns.

    Train and test are stacked (keeping the row position in an ``index``
    column), every column in ``num_var_list`` is replaced by its ``log1p``,
    the three group-by transformers (configured with ``groupby_dict``) are
    applied, and the columns that were not in ``train_df`` are split back
    into ``self.train`` and ``self.test``.
    """
    with timer("load data"):
        train_part = train_df.copy()
        test_part = test_df.copy()
        n_train = len(train_part)
        original_columns = train_part.columns.tolist()

    with timer("concat train and test"):
        stacked = cudf.concat([train_part, test_part], ignore_index=True)
        total = stacked.reset_index()
        del train_part, test_part
        gc.collect()

    with timer("log transform"):
        for column in num_var_list:
            # log1p is computed on the pandas copy and moved back to cudf.
            logged = np.log1p(total[column].to_pandas())
            total[column] = cudf.Series(logged)

    with timer("GroupbyTransformer"):
        total = GroupbyTransformer(groupby_dict).transform(total)
        total = DiffGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)
        total = RatioGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)

    with timer("end"):
        # Restore the original row order before slicing by position.
        total = total.sort_values("index")
        excluded = original_columns + ["index"]
        generated = [c for c in total.columns if c not in excluded]
        added = total[generated]
        self.train = added.iloc[:n_train].reset_index(drop=True)
        self.test = added.iloc[n_train:].reset_index(drop=True)
def on_fe_end(self, state: State):
    """Sort the columns of every DataFrame feature set by column label.

    Only entries whose ``"train"`` value is a ``pd.DataFrame`` are touched;
    their ``"test"`` frame is sorted as well when that key is present. The
    updated mapping is assigned back to ``state.features``.
    """
    features = state.features
    for key, phases in features.items():
        if not isinstance(phases["train"], pd.DataFrame):
            continue
        with utils.timer(f"Sort columns of features `{key}`",
                         logger=state.logger):
            phases["train"] = phases["train"].sort_index(axis=1)
            if "test" in phases:
                phases["test"] = phases["test"].sort_index(axis=1)
    state.features = features
def shrink_by_date_index(x_trn: pd.Series, config: dict) -> pd.Index:
    """Return the index labels of the entries dated on or after the cutoff.

    ``x_trn`` must hold date values. The cutoff is built from
    ``config["params"]`` keys ``year``, ``month`` and ``day``. The shape
    before and after filtering is logged at INFO level.
    """
    with timer("shrink by date index"):
        logging.info(f"before train shape : {x_trn.shape}")
        date_params = config["params"]
        dates = pd.to_datetime(x_trn)
        cutoff = datetime.datetime(
            date_params["year"], date_params["month"], date_params["day"]
        )
        kept = dates[dates >= cutoff]
        kept_index = kept.index
        logging.info(f"after train shape : {kept.shape}")
    return kept_index
def shrink_by_date(x_trn: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Keep only rows dated on or after the cutoff and renumber the index.

    Requires a ``date`` column. The cutoff is built from ``config["params"]``
    keys ``year``, ``month`` and ``day``. Note that the ``date`` column of
    the frame passed in is converted to datetime in place.
    """
    with timer("shrink by date"):
        logging.info(f"before train shape : {x_trn.shape}")
        date_params = config["params"]
        # In-place conversion: the caller's frame is modified as well.
        x_trn["date"] = pd.to_datetime(x_trn["date"])
        cutoff = datetime.datetime(
            date_params["year"], date_params["month"], date_params["day"]
        )
        recent = x_trn[x_trn["date"] >= cutoff]
        recent = recent.reset_index(drop=True)
        logging.info(f"after train shape : {recent.shape}")
    return recent
def __init__(self,
             meta_epoch,
             valid_check_epoch,
             patience,
             valid_tasks,
             batch_size,
             first_eval=1,
             logger=logger.get_logger('base')) -> None:
    """Store the training-schedule settings and start the run timer.

    All arguments are stored on the instance under the same name. The
    default ``logger`` is created once, when the function is defined.
    """
    super().__init__()

    # Schedule settings.
    self.meta_epoch = meta_epoch
    self.valid_check_epoch = valid_check_epoch
    self.patience = patience
    self.first_eval = first_eval

    # Data settings.
    self.valid_tasks = valid_tasks
    self.batch_size = batch_size

    self.logger = logger

    # NOTE(review): 60 * 100 is presumably a time budget in seconds -
    # confirm against ``timer.initialize``.
    run_timer = timer()
    run_timer.initialize(time.time(), 60 * 100)
    self.timer = run_timer

    # Initial run state.
    self.training_mode = 0
    self.training_stage = 0
    self.saving = False
def run(
    self,
    train_df: XDataFrame,
    test_df: Optional[XDataFrame] = None,
    log: bool = False,
):
    """Create the features, then normalise and decorate their column names.

    After ``create_features`` has run, the columns of ``self.train``,
    ``self.valid`` and ``self.test`` are converted to strings and wrapped as
    ``<prefix>_<column><suffix>_``, where each part is added only when the
    corresponding attribute is non-empty.

    Returns:
        ``self``, to allow chaining.
    """
    with timer(self.name, log=log):
        self.create_features(train_df, test_df=test_df)

    prefix = self.prefix + "_" if self.prefix else ""
    # NOTE(review): the separator ends up AFTER the suffix ("colsuffix_"),
    # not between column and suffix - looks unintended, but renaming would
    # change the emitted column names; confirm before fixing.
    suffix = self.suffix + "_" if self.suffix else ""

    frame_attrs = ("train", "valid", "test")

    # First pass: make every column label a string.
    for attr in frame_attrs:
        frame = getattr(self, attr)
        frame.columns = pd.Index([str(c) for c in frame.columns])

    # Second pass: wrap the labels with prefix and suffix.
    for attr in frame_attrs:
        frame = getattr(self, attr)
        frame.columns = prefix + frame.columns + suffix

    return self
def fit_xgb(X, y, cv=None, params: dict = None, verbose=500):
    """Train one XGBoost booster per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature array, indexed positionally with the fold index arrays.
        y: Target array, indexed the same way as ``X``.
        cv: Splitter exposing ``split(X, y)``. Defaults to a shuffled
            2-fold ``StratifiedKFold``.
        params: XGBoost parameters; must contain ``early_stopping_rounds``
            and ``num_boost_round``. Defaults to a deep copy of
            ``XGB_DEFAULT_PARAMS``.
        verbose: Forwarded to ``xgb.train`` as ``verbose_eval``.

    Returns:
        ``(oof_pred, models, score)``: the out-of-fold predictions, the list
        of fitted boosters (one per fold), and the ROC AUC of ``oof_pred``
        against ``y``.
    """
    if params is None:
        params = deepcopy(XGB_DEFAULT_PARAMS)

    if cv is None:
        cv = StratifiedKFold(n_splits=2, shuffle=True)

    models = []
    # Allocate a zero array shaped like the training target.
    # It must be float: with an integer target, ``zeros_like`` would otherwise
    # produce an integer array and the stored predictions would be truncated.
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
    # exact type it used to alias.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        # Cross-validation step: the cv instance splits the dataset, and the
        # training data is divided into train / valid parts for this fold.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        with timer(prefix='fit fold={} '.format(i + 1)):
            print(x_train.shape, y_train.shape)
            print(x_valid.shape, y_valid.shape)
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dval = xgb.DMatrix(x_valid, label=y_valid)
            evals = [(dtrain, 'train'), (dval, 'eval')]
            clf = xgb.train(
                params,
                dtrain,
                evals=evals,
                early_stopping_rounds=params['early_stopping_rounds'],
                num_boost_round=params['num_boost_round'],
                verbose_eval=verbose)

        pred_i = clf.predict(dval)
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} AUC: {roc_auc_score(y_valid, pred_i):.4f}')

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly: the printed text is unchanged, but
    # the literal no longer relies on an invalid escape sequence.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score
def on_fe_end(self, state: State):
    """Concatenate all feature sets into a single ``"main"`` feature set.

    If any feature's ``"train"`` value is a ``dict`` or a ``csr_matrix``,
    everything is stacked as sparse matrices with ``hstack`` (DataFrames are
    converted through ``.values``, dict values are appended one by one).
    Otherwise the per-phase values are concatenated with ``pd.concat``.
    The result is stored in ``state.features["main"]``; when
    ``self.delete_original`` is set, every other key is removed.
    """
    features = state.features
    as_sparse = any(
        isinstance(entry["train"], (dict, csr_matrix))
        for entry in features.values()
    )

    main_feature = {}
    with utils.timer("Concatenating `main` features", state.logger):
        for phase in ("train", "test"):
            if as_sparse:
                blocks = []
                for entry in features.values():
                    value = entry[phase]
                    if isinstance(value, pd.DataFrame):
                        blocks.append(csr_matrix(value.values))
                    elif isinstance(value, dict):
                        blocks.extend(value.values())
                    elif isinstance(value, csr_matrix):
                        blocks.append(value)
                    # Any other type is skipped, as in the original.
                main_feature[phase] = hstack(blocks).tocsr()
            else:
                frames = [entry[phase] for entry in features.values()]
                main_feature[phase] = pd.concat(frames, axis=1)

    state.features["main"] = main_feature

    if self.delete_original:
        stale_keys = list(features.keys())
        stale_keys.remove("main")
        for key in stale_keys:
            del state.features[key]
        gc.collect()
# Load every configured data file into ``self.state.dataframes``.
#
# The "start" callbacks run first and the "end" callbacks run last. For each
# entry of ``self.state.config``:
#   * the pandas reader name and its kwargs come from
#     ``data.file_open_method(config)``; when ``data.required_columns(config)``
#     is not None it is passed on as ``kwargs["columns"]``;
#   * the file path is ``config["dir"] / config["name"]``;
#   * when ``self.state.data_stats`` has an entry for the path, the stored
#     stats are opened and their ``"dtypes"`` are used to fill
#     ``kwargs["dtype"]`` (restricted to the required columns in the
#     "read_csv" / "normal" case);
#   * "read_parquet", "read_pickle", "read_feather", and "read_csv" in
#     "normal" mode are dispatched to the pandas function of that name;
#     "read_csv" in "large" mode and any other method raise
#     ``NotImplementedError``;
#   * the entry's ``config["role"]`` is recorded in
#     ``self.state.dataframe_roles`` under the same path key.
#
# NOTE(review): the original indentation is lost in this file; confirm which
# ``if`` the ``else: kwargs["dtype"] = dtypes`` branch belongs to - as
# written it may pass ``dtype`` to readers other than ``read_csv``.
# NOTE(review): ``pd.__getattribute__(method)`` could be ``getattr(pd, method)``.
def run(self): self._run_callbacks(phase="start") for config in self.state.config: method, kwargs = data.file_open_method(config) columns = data.required_columns(config) if columns is not None: kwargs["columns"] = columns filepath = Path(config["dir"]) / config["name"] if self.state.data_stats[str(filepath)] is not None: stats_path = self.state.data_stats[str(filepath)] stats = data.open_stats(stats_path) dtypes = stats["dtypes"] if columns is not None: dtypes_cols = {} for col in columns: dtypes_cols[col] = dtypes[col] if method == "read_csv" and config["mode"] == "normal": kwargs["dtype"] = dtypes_cols else: kwargs["dtype"] = dtypes with utils.timer("Reading " + config["name"], self.state.logger): if method in {"read_parquet", "read_pickle", "read_feather"}: df = pd.__getattribute__(method)(filepath, **kwargs) self.state.dataframes[str(filepath)] = df elif method == "read_csv": if config["mode"] == "normal": df = pd.__getattribute__(method)(filepath, **kwargs) self.state.dataframes[str(filepath)] = df elif config["mode"] == "large": raise NotImplementedError else: pass else: raise NotImplementedError self.state.dataframe_roles[str(filepath)] = config["role"] self._run_callbacks(phase="end")
def _merge(self, dataframe, merge=True):
    """Attach every aggregated frame in ``self.features`` to ``dataframe``.

    With ``merge=True`` each aggregated frame is left-joined on its ``on``
    keys (through ``cudf.merge`` when ``is_cudf(dataframe)`` is true,
    otherwise through ``DataFrame.merge``). With ``merge=False`` only the
    generated feature columns are concatenated column-wise with
    ``pd.concat``.

    Returns:
        The extended dataframe.
    """
    with timer("merge"):
        pairs = zip(self.param_dict, self.features)
        for spec, aggregated in tqdm(pairs, total=len(self.features)):
            key, var, agg, on = self._get_params(spec)

            if not merge:
                output_names = self._get_feature_names(key, var, agg)
                dataframe = pd.concat(
                    [dataframe, aggregated[output_names]], axis=1
                )
                continue

            if is_cudf(dataframe):
                dataframe = cudf.merge(
                    dataframe, aggregated, how="left", on=on
                )
            else:
                dataframe = dataframe.merge(aggregated, how="left", on=on)
    return dataframe
def __init__(self,
             meta_epoch,
             valid_check_epoch,
             patience,
             valid_tasks,
             batch_size,
             first_eval=1,
             logger=logger.get_logger('base')) -> None:
    """Store the training-schedule settings, start the run timer and set up augmentation.

    All arguments are stored on the instance under the same name. The
    default ``logger`` is created once, when the function is defined.
    ``self.use_data_augmentation`` is read here but not assigned, so it must
    already be available (for example as a class attribute).
    """
    super().__init__()

    # Schedule settings.
    self.meta_epoch = meta_epoch
    self.valid_check_epoch = valid_check_epoch
    self.patience = patience
    self.first_eval = first_eval

    # Data settings.
    self.valid_tasks = valid_tasks
    self.batch_size = batch_size

    self.logger = logger

    # NOTE(review): 60 * 1000 is presumably a time budget in seconds -
    # confirm against ``timer.initialize``.
    run_timer = timer()
    run_timer.initialize(time.time(), 60 * 1000)
    self.timer = run_timer

    # The augmentor is built only when augmentation is enabled, and starts
    # switched off.
    if self.use_data_augmentation:
        self.data_augmentor = DataArgumentor()
    else:
        self.data_augmentor = None
    self.turn_on_data_augmentor = False
def run_tuner(self):
    """Run the optimiser and return the tuned nested parameters.

    Logs the null-model likelihood and the initial parameters, then calls
    ``multioptimiser`` (when ``self.use_multicore_gradient`` is set) or
    ``minimize`` with the kwargs from ``self.minimize_args()``. On
    ``KeyboardInterrupt`` / ``SystemExit`` the parameters are torn down and
    the exception is re-raised. On success the optimum is written back via
    ``self.tuner_params.update_using_opt_array`` before teardown.

    Returns:
        ``self.tuner_params.nested_params``.
    """
    with timer('Optimising with method {}'.format(self.method), __file__):
        null_likelihood = self._get_null_model_likelihood()
        logger.info("Null model likelihood: {:.4E}".format(null_likelihood))
        self.tuner_params.log_initial()

        minimise_kwargs = self.minimize_args()
        try:
            optimiser = (multioptimiser
                         if self.use_multicore_gradient else minimize)
            optimal = optimiser(**minimise_kwargs)
        except (KeyboardInterrupt, SystemExit):
            # Short pause before logging and cleaning up the interrupted run.
            time.sleep(2)
            logger.info('Cancelling optimisation........')
            self.teardown_params()
            raise

        logger.info(
            'Finished having run {} evaluations over {} iterations'.format(
                optimal.nfev, optimal.nit))
        self.tuner_params.update_using_opt_array(optimal.x)
        self.teardown_params()

    return self.tuner_params.nested_params
# Command-line entry point: fine-tune a pretrained language model (BERT, or
# a GPT-2 classification head) on the toxicity data and write predictions.
#
# Steps, in order:
#   1. Parse ``--config_file`` / ``--valid``, load the config, fill in
#      defaults, and check a few invariants with ``assert``.
#   2. Derive the output directory ``../output/<lm_model_name>/`` and the
#      submission file paths; seed the RNGs.
#   3. Build the tokenizer, model and optimizer class for the chosen mode.
#   4. "preprocess": read and shuffle the train CSV, tokenize train and test,
#      split off the last ``valid_size`` rows as validation, build the target
#      matrix (target, sample weight, auxiliary targets) and two alternating
#      sets of negative samples to drop.
#   5. "train": one pass per epoch over the down-sampled data, with gradient
#      accumulation and ``amp`` loss scaling; the model is saved at the end.
#   6. "evaluate": with ``--valid``, score and write the validation
#      predictions; always write the test submission.
#
# NOTE(review): the parser only declares ``--config_file`` and ``--valid``,
# but the body reads ``args.lm_model`` and ``args.old``; as written these
# attribute lookups would fail. ``'lm_model_name'`` is asserted to be in the
# config yet the name used is ``config_file.stem`` - confirm the intent.
# NOTE(review): ``TOXICITY_COLUMN``, ``IDENTITY_COLUMNS`` and
# ``AUX_TOXICITY_COLUMNS`` are assigned inside this function (old-data
# branch only), which makes them function-local on every path; reads on the
# other path would then hit an unbound local - confirm.
# NOTE(review): ``train_data`` / ``test_data`` / ``sample_submission`` are
# assigned but the CSV reads use ``TRAIN_DATA`` / ``TEST_DATA`` directly.
# NOTE(review): in the validation branch ``predict`` is called twice on the
# validation data (``valid_preds`` and ``valid_prediction``).
def main(): parser = ArgumentParser() parser.add_argument('--config_file', type=str, required=True) parser.add_argument('--valid', action='store_true') args = parser.parse_args() config_file = Path(args.config_file) config = load_config(config_file) config.setdefault('max_len', 220) config.setdefault('max_head_len', 128) config.setdefault('epochs', 2) config.setdefault('down_sample_frac', 0.5) config.setdefault('lr', 1.5e-5) config.setdefault('batch_size', 16) config.setdefault('accumulation_steps', 4) config.setdefault('lr_weight_decay_coef', 1.0) config.setdefault('warmup', 0.05) config.setdefault('old_data', False) config.setdefault('old_fine_tuned', False) config.setdefault('device', 'cuda') config.setdefault('seed', 1234) assert 'lm_model_name' in config assert not (config.old_fine_tuned and config.old_data) assert config.max_len >= config.max_head_len assert config.epochs <= 2 lm_model_name = config_file.stem if config.old_fine_tuned: PRETRAINED_PATH = Path(f'../output/{lm_model_name}_old_fine_tune/') assert PRETRAINED_PATH.exists() else: PRETRAINED_PATH = args.lm_model MODE = args.lm_model[:4] LOWER_CASE = 'uncased' in args.lm_model LARGE_MODEL = 'large' in args.lm_model DEVICE = torch.device(config.device) if config.old_data: lm_model_name += '_old_fine_tune' if args.valid: valid_size = 200000 shuffle_seed = 1029 lm_model_name += '_valid' else: valid_size = 0 shuffle_seed = config.seed OUT_DIR = Path(f'../output/{lm_model_name}/') TEST_SUBMISSION = OUT_DIR / 'submission.csv' VALID_SUBMISSION = OUT_DIR / 'valid_submission.csv' OUT_DIR.mkdir(exist_ok=True) warnings.filterwarnings('ignore') seed_torch(config.seed) if not args.old: train_data = TRAIN_DATA test_data = TEST_DATA sample_submission = SAMPLE_SUBMISSION train_size = 1804874 - valid_size else: train_data = TRAIN_OLD test_data = TEST_OLD sample_submission = SAMPLE_OLD train_size = 159571 - valid_size TOXICITY_COLUMN = OLD_TOXICITY_COLUMN IDENTITY_COLUMNS = OLD_IDENTITY_COLUMNS AUX_TOXICITY_COLUMNS = 
OLD_AUX_TOXICITY_COLUMNS if MODE == 'bert': from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam lm_tokenizer = BertTokenizer.from_pretrained(args.lm_model, cache_dir=None, do_lower_case=LOWER_CASE) model = BertForSequenceClassification.from_pretrained( PRETRAINED_PATH, cache_dir=None, num_labels=1 + len(AUX_TOXICITY_COLUMNS)) optimizer_class = BertAdam else: from pytorch_pretrained_bert import GPT2Tokenizer, OpenAIAdam, GPT2Model lm_tokenizer = GPT2Tokenizer.from_pretrained(args.lm_model, cache_dir=None) model = GPT2ClassificationHeadModel.from_pretrained( PRETRAINED_PATH, clf_dropout=config.get('dropout_rate', 0.1), n_class=1 + len(AUX_TOXICITY_COLUMNS)) optimizer_class = OpenAIAdam assert config.lr_weight_decay_coef == 1.0 with timer('preprocess'): tokenizer = MyTokenizer(lm_tokenizer, config.max_len, config.max_head_len, MODE) df_train = pd.read_csv(TRAIN_DATA).sample( frac=1, random_state=shuffle_seed).reset_index(drop=True) df_train['comment_text'] = df_train['comment_text'].astype(str) df_train = df_train.fillna(0) X_train = tokenizer.tokenize( df_train['comment_text'].fillna('DUMMY_VALUE'), num_threads=16, chunksize=5000) df_test = pd.read_csv(TEST_DATA) df_test['comment_text'] = df_test['comment_text'].astype(str) df_test = df_test.fillna(0) X_test = tokenizer.tokenize( df_test['comment_text'].fillna('DUMMY_VALUE'), num_threads=16, chunksize=5000) df_train.drop(['comment_text'], axis=1, inplace=True) df_test.drop(['comment_text'], axis=1, inplace=True) X_valid = X_train[train_size:] X_train = X_train[:train_size] y_identity_train = df_train[IDENTITY_COLUMNS].values y_annotator_counts_train = df_train['toxicity_annotator_count'].values weights = training_weights(df_train, TOXICITY_COLUMN, IDENTITY_COLUMNS) y_train = np.hstack( (df_train[TOXICITY_COLUMN].values.reshape(-1, 1), weights.reshape(-1, 1), df_train[AUX_TOXICITY_COLUMNS].values)) y_valid = y_train[train_size:] y_train = y_train[:train_size] y_identity_valid = 
y_identity_train[train_size:] y_identity_train = y_identity_train[:train_size] y_annotator_counts_valid = y_annotator_counts_train[train_size:] y_annotator_counts_train = y_annotator_counts_train[:train_size] loss_weight = 1.0 / weights.mean() if not args.old else None # drop negative samples here frac = config.down_sample_frac target_negative = (y_train > 0.0).sum(axis=1) == 1 identity_negative = (y_identity_train > 0.0).sum(axis=1) == 0 negative_mask = identity_negative & target_negative negative_indices = np.arange(len(y_train))[negative_mask] drop_indices_0 = set( negative_indices[:int(len(negative_indices) * frac)]) drop_indices_1 = set( negative_indices[int(len(negative_indices) * (1 - frac)):]) drop_indices_list = [drop_indices_0, drop_indices_1] len_train = len(y_train) - len(drop_indices_0) with timer('train'): model.zero_grad() model = model.to(DEVICE) num_layers = 24 if LARGE_MODEL else 12 optimizer_grouped_parameters = get_optimizer_params( model, config.lr, config.lr_weight_decay_coef, num_layers) num_train_optimization_steps = int(config.epochs * len_train / config.batch_size / config.accumulation_steps) optimizer = optimizer_class(optimizer_grouped_parameters, lr=config.lr, warmup=config.warmup, t_total=num_train_optimization_steps) model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) model = model.train() batch_count = len_train // config.batch_size loss_fn = CustomLoss(loss_weight) for epoch, drop_indices in zip(range(config.epochs), drop_indices_list): sample_indices = np.array( [i for i in range(len(y_train)) if i not in drop_indices]) X_sampled_train = [X_train[i] for i in sample_indices] y_sampled_train = y_train[sample_indices] y_sampled_identity_train = y_identity_train[sample_indices] y_sampled_annotator_counts_train = y_annotator_counts_train[ sample_indices] train_dataset = TextDataset(X_sampled_train, y_sampled_train, y_sampled_identity_train, y_sampled_annotator_counts_train) train_loader = 
LengthBucketingDataLoader( train_dataset, shuffle=True, drop_last=True, batch_size=config.batch_size) tk0 = tqdm(enumerate(train_loader), total=batch_count) optimizer.zero_grad() for i, (x_batch, _, a_batch, y_batch, y_identity_batch) in tk0: y_pred = model(x_batch.to(DEVICE), attention_mask=(x_batch > 0).to(DEVICE), labels=None) loss = loss_fn(y_pred, y_batch.to(DEVICE)) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (i + 1) % config.accumulation_steps == 0: optimizer.step() optimizer.zero_grad() model.save_pretrained(OUT_DIR) with timer('evaluate'): if args.valid: valid_dataset = TextDataset(X_valid, y_valid, y_identity_valid, y_annotator_counts_valid) valid_preds = predict(model, valid_dataset, device=DEVICE) df_valid = df_train.tail(valid_size) df_valid['model1'] = valid_preds evaluator = JigsawEvaluator(df_valid[TOXICITY_COLUMN].values, df_valid[IDENTITY_COLUMNS].values) final_score, _ = evaluator.get_final_metric( df_valid['model1'].values) valid_prediction = predict(model, TextDataset(X_valid), device=DEVICE) valid_submission = pd.DataFrame({ 'id': df_valid['id'], 'prediction': valid_prediction }) valid_submission.to_csv(VALID_SUBMISSION, index=False) print(f'validation score: {final_score:.5f}') test_prediction = predict(model, TextDataset(X_test), device=DEVICE) submission = pd.DataFrame({ 'id': df_test['id'], 'prediction': test_prediction }) submission.to_csv(TEST_SUBMISSION, index=False)
# Command-line entry point: train an encoder-decoder model ("akt" or
# "saint") on the answer-prediction data and save the trained weights.
#
# Steps, in order:
#   1. Parse ``--config_file``, load the config with ``Config.load`` and
#      seed the RNGs.
#   2. "load data": read ``train.csv`` and ``questions.csv`` from
#      ``./input/riiid-test-answer-prediction/``, keep only rows with
#      ``content_type_id == 0``, shift the ``part`` / ``content_id`` /
#      ``question_id`` values so the low ids are reserved (see the inline
#      comments), and left-join the question info onto the train frame.
#   3. "validation split": ``virtual_time_split`` yields the train, valid
#      and per-epoch valid row indices.
#   4. "prepare data loader": build user sequences, datasets and loaders for
#      training, final validation and per-epoch validation.
#   5. "train": build the model selected by ``config.model`` (any other
#      value raises ``ValueError``), then for each epoch run the batches
#      with optional gradient accumulation, track an exponential moving
#      average of the loss for the progress bar, and print the ROC AUC on
#      the per-epoch validation set.
#   6. "predict": print the ROC AUC on the validation set.
#   7. Save the model and optimizer state dicts under
#      ``./output/<config file stem>/`` and print the total elapsed time.
#
# NOTE(review): the loss is back-propagated unscaled on every batch, so with
# gradient accumulation the accumulated gradient is a sum rather than a
# mean - confirm this is intended.
# NOTE(review): the original indentation is lost in this file; confirm that
# ``scheduler.step()`` runs only together with ``optimizer.step()``.
def main(): parser = ArgumentParser() parser.add_argument('--config_file', type=str, required=True) args = parser.parse_args() # settings config_path = Path(args.config_file) config = Config.load(config_path) warnings.filterwarnings('ignore') set_seed(config.seed) start_time = time.time() with timer('load data'): DATA_DIR = './input/riiid-test-answer-prediction/' usecols = [ 'row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time', ] dtype = { 'row_id': 'int64', 'timestamp': 'int64', 'user_id': 'int32', 'content_id': 'int16', 'content_type_id': 'int8', 'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32' } train_df = pd.read_csv(DATA_DIR + 'train.csv', usecols=usecols, dtype=dtype) question_df = pd.read_csv(DATA_DIR + 'questions.csv', usecols=['question_id', 'part']) train_df = train_df[train_df['content_type_id'] == 0].reset_index( drop=True) question_df['part'] += 1 # 0: padding id, 1: start id train_df['content_id'] += 2 # 0: padding id, 1: start id question_df['question_id'] += 2 train_df = train_df.merge(question_df, how='left', left_on='content_id', right_on='question_id') with timer('validation split'): train_idx, valid_idx, epoch_valid_idx = virtual_time_split( train_df, valid_size=config.valid_size, epoch_valid_size=config.epoch_valid_size) valid_y = train_df.iloc[valid_idx]['answered_correctly'].values epoch_valid_y = train_df.iloc[epoch_valid_idx][ 'answered_correctly'].values print('-' * 20) print(f'train size: {len(train_idx)}') print(f'valid size: {len(valid_idx)}') with timer('prepare data loader'): train_user_seqs = get_user_sequences(train_df.iloc[train_idx]) valid_user_seqs = get_user_sequences(train_df.iloc[valid_idx]) train_dataset = TrainDataset(train_user_seqs, window_size=config.window_size, stride_size=config.stride_size) valid_dataset = ValidDataset(train_df, train_user_seqs, valid_user_seqs, valid_idx, window_size=config.window_size) train_loader = 
DataLoader(train_dataset, **config.train_loader_params) valid_loader = DataLoader(valid_dataset, **config.valid_loader_params) # valid loader for epoch validation epoch_valid_user_seqs = get_user_sequences( train_df.iloc[epoch_valid_idx]) epoch_valid_dataset = ValidDataset(train_df, train_user_seqs, epoch_valid_user_seqs, epoch_valid_idx, window_size=config.window_size) epoch_valid_loader = DataLoader(epoch_valid_dataset, **config.valid_loader_params) with timer('train'): if config.model == 'akt': content_encoder_config = BertConfig( **config.content_encoder_config) knowledge_encoder_config = BertConfig( **config.knowledge_encoder_config) decoder_config = BertConfig(**config.decoder_config) content_encoder_config.max_position_embeddings = config.window_size + 1 knowledge_encoder_config.max_position_embeddings = config.window_size decoder_config.max_position_embeddings = config.window_size + 1 model = AktEncoderDecoderModel(content_encoder_config, knowledge_encoder_config, decoder_config) elif config.model == 'saint': encoder_config = BertConfig(**config.encoder_config) decoder_config = BertConfig(**config.decoder_config) encoder_config.max_position_embeddings = config.window_size decoder_config.max_position_embeddings = config.window_size model = SaintEncoderDecoderModel(encoder_config, decoder_config) else: raise ValueError(f'Unknown model: {config.model}') model.to(config.device) model.zero_grad() optimizer = optim.Adam(model.parameters(), **config.optimizer_params) scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps) loss_ema = None for epoch in range(config.n_epochs): epoch_start_time = time.time() model.train() progress = tqdm(train_loader, desc=f'epoch {epoch + 1}', leave=False) for i, (x_batch, w_batch, y_batch) in enumerate(progress): y_pred = model(**x_batch.to(config.device).to_dict()) loss = nn.BCEWithLogitsLoss(weight=w_batch.to(config.device))( y_pred, y_batch.to(config.device)) loss.backward() if (config.gradient_accumulation_steps is None 
or (i + 1) % config.gradient_accumulation_steps == 0): optimizer.step() optimizer.zero_grad() scheduler.step() loss_ema = loss_ema * 0.9 + loss.item( ) * 0.1 if loss_ema is not None else loss.item() progress.set_postfix(loss=loss_ema) valid_preds = predict(model, epoch_valid_loader, device=config.device) valid_score = roc_auc_score(epoch_valid_y, valid_preds) elapsed_time = time.time() - epoch_start_time print( f'Epoch {epoch + 1}/{config.n_epochs} \t valid score: {valid_score:.5f} \t time: {elapsed_time / 60:.1f} min' ) with timer('predict'): valid_preds = predict(model, valid_loader, device=config.device) valid_score = roc_auc_score(valid_y, valid_preds) print(f'valid score: {valid_score:.5f}') output_dir = Path(f'./output/{config_path.stem}/') output_dir.mkdir(parents=True, exist_ok=True) torch.save(model.state_dict(), output_dir / 'model.pt') torch.save(optimizer.state_dict(), output_dir / 'optimizer.pt') elapsed_time = time.time() - start_time print(f'all processes done in {elapsed_time / 60:.1f} min.')
def create_features(
    self,
    train_df: cudf.DataFrame,
    test_df: cudf.DataFrame,
):
    """Build the basic encoded feature set from the stacked train and test rows.

    * ``Rating`` is mapped to an ordinal integer code.
    * ``Name``, ``Platform``, ``Genre``, ``Publisher`` and ``Developer`` are
      label-encoded and cast to ``category``.
    * ``User_Score``: the value ``"tbd"`` becomes NaN, then the column is
      cast to float.
    * ``Year_of_Release``: 2020.0 is replaced by 2017.0.
    * ``log_User_Count`` is added as ``log1p(User_Count)``.

    ``self.train`` receives the basic columns plus the sales target columns;
    ``self.test`` receives the basic columns only.
    """
    with timer("load data"):
        train_part = train_df.copy()
        n_train = len(train_part)
        test_part = test_df.copy()

    with timer("concat train and test"):
        total = cudf.concat([train_part, test_part], ignore_index=True)

    with timer("label encoding"):
        with timer("rating"):
            rating_codes = {
                "RP": 0,
                "EC": 1,
                "K-A": 2,
                "E": 2,
                "E10+": 3,
                "T": 4,
                "M": 5,
                "AO": 5,
            }
            recoded = total["Rating"].replace(rating_codes)
            total["Rating"] = recoded.astype(int)

        with timer("other cat cols"):
            categorical = ("Name", "Platform", "Genre", "Publisher",
                           "Developer")
            for column in categorical:
                encoder = LabelEncoder(handle_unknown="ignore")
                encoder.fit(total[column])
                encoded = encoder.transform(total[column])
                total[column] = encoded.astype("category")

    with timer("User_Score"):
        cleaned = total["User_Score"].replace(to_replace="tbd", value=np.nan)
        total["User_Score"] = cleaned.astype(float)

    with timer("Year_of_Release"):
        total["Year_of_Release"] = total["Year_of_Release"].replace(
            to_replace=2020.0, value=2017.0)

    with timer("log_User_Count"):
        total["log_User_Count"] = np.log1p(total["User_Count"].to_pandas())

    with timer("end"):
        feature_columns = [
            "Name",
            "Platform",
            "Year_of_Release",
            "Genre",
            "Publisher",
            "Critic_Score",
            "Critic_Count",
            "User_Score",
            "User_Count",
            "log_User_Count",
            "Developer",
            "Rating",
        ]
        sales_columns = [
            "NA_Sales",
            "EU_Sales",
            "JP_Sales",
            "Other_Sales",
            "Global_Sales",
        ]
        train_view = total[feature_columns + sales_columns].iloc[:n_train]
        test_view = total[feature_columns].iloc[n_train:]
        self.train = train_view.reset_index(drop=True)
        self.test = test_view.reset_index(drop=True)
def main():
    """Train the LSTM/GRU model with stratified K-fold CV and write a submission.

    Steps, in order:
      1. Parse ``--valid``; when set, the last 200000 rows of the shuffled
         training data are held out and used as the "test" set.
      2. Load ``./config/lstm_f.json`` and fill in defaults for missing keys.
      3. Preprocess comment text, build the vocabulary, load the fastText and
         GloVe embedding matrices and tokenize train/test.
      4. For every fold: drop a random half of the all-zero rows from the
         training indices, train a ``LstmGruNet``, add its test prediction to
         the running sum, and save the model weights and training records.
      5. Write the fold-averaged prediction to the submission CSV and call
         ``display_tables`` on the output directory.

    Returns:
        None. All results are written under ``../output/lstm_f/``.
    """
    import json

    parser = ArgumentParser()
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    config = load_config('./config/lstm_f.json')
    config.setdefault('max_len', 220)
    config.setdefault('max_features', 100000)
    config.setdefault('batch_size', 512)
    config.setdefault('train_epochs', 10)
    config.setdefault('tolerance', 10)
    config.setdefault('num_folds', 5)
    config.setdefault('lr', 1e-3)
    config.setdefault('loss_alpha', 0.1)
    config.setdefault('loss_beta', 1.0)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1029)

    device = torch.device(config.device)

    out_dir = Path('../output/lstm_f/')
    submission_file_name = ('valid_submission.csv'
                            if args.valid else 'submission.csv')
    submission_path = out_dir / submission_file_name
    # parents=True so a fresh checkout without ../output does not crash here.
    out_dir.mkdir(parents=True, exist_ok=True)

    warnings.filterwarnings('ignore')
    seed_torch(config.seed)

    with timer('preprocess'):
        train = pd.read_csv(TRAIN_DATA)
        if args.valid:
            # Hold out the tail of the shuffled training data as a local
            # "test" set.
            train = train.sample(frac=1,
                                 random_state=1029).reset_index(drop=True)
            test = train.tail(200000)
            train = train.head(len(train) - 200000)
        else:
            test = pd.read_csv(TEST_DATA)

        train['comment_text'] = train['comment_text'].apply(preprocess)
        test['comment_text'] = test['comment_text'].apply(preprocess)

        # Replace blank comments with NaN. The result is assigned back: an
        # in-place replace on a column selection does not reach the parent
        # frame under pandas Copy-on-Write.
        train['comment_text'] = train['comment_text'].replace('', np.nan)
        test['comment_text'] = test['comment_text'].replace('', np.nan)

        # Fill the missing values with a placeholder token.
        # NOTE(review): blank comments are not special-cased at prediction
        # time; they are scored by the model like any other input.
        X_train = train['comment_text'].fillna('_##_').values
        X_test = test['comment_text'].fillna('_##_').values

        # Build the targets.
        weights = training_weights(train, TOXICITY_COLUMN, IDENTITY_COLUMNS)
        loss_weight = 1.0 / weights.mean()
        y_train_identity = train[IDENTITY_COLUMNS].values
        y_train_annotator_counts = train['toxicity_annotator_count'].values
        # Column 0: main target, column 1: sample weight, rest: aux targets.
        y_train = np.hstack((train[TOXICITY_COLUMN].values.reshape(-1, 1),
                             weights.reshape(-1, 1),
                             train[AUX_TOXICITY_COLUMNS].values))

        vocab = build_vocab(chain(X_train), config.max_features)
        fasttext_embedding_matrix = load_embedding(EMBEDDING_FASTTEXT,
                                                   vocab['token2id'])
        glove_embedding_matrix = load_embedding(EMBEDDING_GLOVE,
                                                vocab['token2id'])

        joblib.dump(vocab, out_dir / 'vocab.pkl')
        np.save(out_dir / 'fasttext_embedding_matrix',
                fasttext_embedding_matrix)
        np.save(out_dir / 'glove_embedding_matrix', glove_embedding_matrix)

        X_train = np.array(tokenize(X_train, vocab, config.max_len))
        X_test = np.array(tokenize(X_test, vocab, config.max_len))

        # Positional indices of rows whose toxicity, aux and identity columns
        # are all exactly 0.0.
        all_related_columns = ([TOXICITY_COLUMN] + AUX_TOXICITY_COLUMNS +
                               IDENTITY_COLUMNS)
        is_all_zero = ((train[all_related_columns] == 0.0).sum(axis=1) ==
                       len(all_related_columns))
        negative_indices = np.arange(0, len(train))[is_all_zero]

    with timer('train'):
        skf = StratifiedKFold(n_splits=config.num_folds,
                              shuffle=True,
                              random_state=1)
        # y_train holds the main target and the weight in its first two
        # columns; everything after that is an auxiliary target.
        num_aux_targets = y_train.shape[-1] - 2

        custom_loss = CustomLoss(
            loss_weight,
            alpha=config.loss_alpha,
            beta=config.loss_beta,
            use_annotator_counts=True,
            weight_from_annotator_counts=lambda x: torch.log(x + 2))

        test_dataset = TextDataset(token_lists=X_test)
        test_prediction = np.zeros(len(test_dataset))
        test_prediction_count = 0

        # Folds are stratified on the binarised main target.
        folds = skf.split(X_train, y_train[:, 0] >= 0.5)
        for fold, (train_idx, valid_idx) in enumerate(folds):
            seed_torch(fold)

            # Pick a random half of the all-zero rows and remove them from
            # this fold's training indices; validation indices are untouched.
            np.random.shuffle(negative_indices)
            drop_indices = set(negative_indices[:len(negative_indices) // 2])
            train_idx = [idx for idx in train_idx if idx not in drop_indices]

            train_token_lists = [X_train[idx] for idx in train_idx]
            valid_token_lists = [X_train[idx] for idx in valid_idx]
            train_dataset = TextDataset(
                token_lists=train_token_lists,
                targets=y_train[train_idx],
                identities=y_train_identity[train_idx],
                annotator_counts=y_train_annotator_counts[train_idx])
            valid_dataset = TextDataset(
                token_lists=valid_token_lists,
                targets=y_train[valid_idx],
                identities=y_train_identity[valid_idx],
                annotator_counts=y_train_annotator_counts[valid_idx])

            model = LstmGruNet(embedding_matrices=[
                glove_embedding_matrix, fasttext_embedding_matrix
            ],
                               num_aux_targets=num_aux_targets).to(device)
            model, records = train(model,
                                   custom_loss,
                                   train_dataset,
                                   valid_dataset,
                                   device=device,
                                   batch_size=config.batch_size,
                                   num_epochs=config.train_epochs,
                                   tolerance=config.tolerance,
                                   lr=config.lr)

            test_prediction += predict(model, test_dataset, device)
            test_prediction_count += 1

            # NOTE(review): this file is written by torch.save, not JSON,
            # despite the suffix. The name is kept because other code may
            # load the checkpoint by this exact path -- confirm before
            # renaming.
            torch.save(model.state_dict(), out_dir / f'model.{fold}.json')
            with open(out_dir / f'records.{fold}.json', 'w') as f:
                json.dump(records, f, indent=4)

    # Average the accumulated test predictions over the trained folds.
    submission = pd.DataFrame({
        'id': test['id'],
        'prediction': test_prediction / test_prediction_count
    })
    submission.to_csv(submission_path, index=False)
    display_tables(out_dir)
# %%
# Attach the public log to the dataset object; the blocks constructed with
# `dataset` below receive it through that object.
dataset.public_log = public_log

feat_train, feat_test = pd.DataFrame(), pd.DataFrame()

feature_blocks = [
    *[CountEncodingBlock(column=c) for c in ['hour']],
    DateBlock(),
    PublicLogBlock(dataset),
    MetaInformationBlock(),
    UserHistoryBlock(dataset),
]

# Fit every block on the training metadata; each block's `fit` returns its
# training features, which are appended column-wise.
for block in feature_blocks:
    with timer(prefix='fit {} '.format(block)):
        out_i = block.fit(train_meta)

    # Each block must return exactly one row per input row.
    assert len(train_meta) == len(out_i), block
    feat_train = pd.concat([feat_train, out_i], axis=1)

# Apply the already-fitted blocks to the test metadata. The timer label says
# "transform" so the log distinguishes this pass from the fit pass above.
for block in feature_blocks:
    with timer(prefix='transform {} '.format(block)):
        out_i = block.transform(test_meta)

    assert len(test_meta) == len(out_i), block
    feat_test = pd.concat([feat_test, out_i], axis=1)

# %%
print(feat_train.columns)
feat_train.head(30)