def preprocess_dfs(use_features, is_local=False, logger=None, debug=True):
    """Read the competition datasets and build the train/test feature frames.

    Args:
        use_features: feature specification forwarded to ``add_features``.
        is_local: local-run switch forwarded to ``add_features``.
        logger: currently unused here; ``add_features`` is called with
            ``logger=None`` (NOTE(review): possibly should forward ``logger``
            — confirm with callers).
        debug: when True, cap the CSV reads at 200k rows for fast iteration.

    Returns:
        Tuple ``(train_df, test_df)`` of feature DataFrames with reset indexes.
    """
    # read dataframes
    with timer("read datasets"):
        # Row cap applies only to the CSV reads below; the pickle read of
        # train is always full-size.
        nrows = 200000 if debug else None
        sub = pd.read_csv(base_path + '/sample_submission.csv')
        org_train = pd.read_pickle(f'{base_path}/train.pkl.gz')
        # NOTE(review): test is aliased to train (the real test.csv read is
        # disabled) — presumably a local-experiment shortcut; confirm before
        # running against real test data.
        org_test = org_train
        org_train = memory_reducer(org_train, verbose=True)
        # Keep only installations present in the submission, ordered by time.
        org_test = org_test[org_test.installation_id.isin(sub.installation_id)]
        org_test.sort_values(['installation_id', 'timestamp'], inplace=True)
        # NOTE(review): reset_index without drop=True keeps the old index as
        # an 'index' column — confirm downstream code expects that column.
        org_test.reset_index(inplace=True)
        org_test = memory_reducer(org_test, verbose=True)
        train_labels = pd.read_csv(base_path + "/train_labels.csv", nrows=nrows)
        specs = pd.read_csv(base_path + "/specs.csv", nrows=nrows)
        # basic preprocess: parse timestamps once up front
        org_train["timestamp"] = pd.to_datetime(org_train["timestamp"])
        org_test["timestamp"] = pd.to_datetime(org_test["timestamp"])
    with timer("merging features"):
        train_df = add_features(use_features, org_train, org_test, train_labels,
                                specs, datatype="train", is_local=is_local,
                                logger=None)
        train_df = train_df.reset_index(drop=True)
        test_df = add_features(use_features, org_train, org_test, train_labels,
                               specs, datatype="test", is_local=is_local,
                               logger=None)
        test_df = test_df.reset_index(drop=True)
    print("preprocess done!!")
    return train_df, test_df
def do_adversarial_valid_kfold(self, model_conf, n_splits=2):
    """Run stratified K-fold adversarial validation (train-vs-test classifier).

    Trains one model per fold on the ``is_test`` target, collects out-of-fold
    predictions and per-fold feature importances, and stores the OOF result
    in ``self.train["pred_y"]``.

    Args:
        model_conf: model configuration dict; must contain "train_cols" and
            whatever ``generate_model``/``model.train`` expect.
        n_splits: number of stratified folds.

    Returns:
        Tuple ``(clf_list, oof, prediction, feature_importance)``.
        NOTE(review): ``prediction`` is allocated but never filled — it is
        returned as all zeros; confirm this placeholder is intentional.
    """
    sp = Splitter()
    target = "is_test"
    split_x = self.train["installation_id"]
    split_y = self.train[target]
    seed = 773
    sp.get_kfold_idx(split_x, split_y, seed, n_cv=n_splits,
                     stratified=True, pref="adv")
    oof: ndarray = np.zeros(self.train.shape[0])
    prediction = np.zeros(self.test.shape[0])
    clf_list = []
    self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
    self.logger.log(logging.DEBUG, model_conf["train_cols"])
    self.validation_scores = []
    for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
        self.logger.log(logging.DEBUG, "-" * 60)
        self.logger.log(logging.DEBUG, f"start training: {i}")
        with timer(f"fold {i}", self.logger):
            train_df, valid_df = self.train.loc[trn_idx], self.train.loc[val_idx]
            model = self.generate_model(model_conf)
            clf, fold_oof, feature_importance_df = model.train(
                train_df, valid_df, self.logger)
            # calc validation score using clf.best_iteration_
            fold_val_score = get_val_score(valid_df[target], fold_oof)
            self.validation_scores.append(fold_val_score)
            self.logger.log(logging.DEBUG,
                            f"fold_val_score: {fold_val_score:,.5f}")
            clf_list.append(clf)
            oof[val_idx] = fold_oof
            feature_importance_df["fold"] = i
            self.feature_importance.append(feature_importance_df)
    self.logger.log(
        logging.DEBUG,
        f"Total Validation Score: {sum(self.validation_scores) / len(self.validation_scores):,.5f}"
    )
    # NOTE(review): expm1 on classifier OOF output is unusual for an
    # adversarial-validation probability — confirm the model trains on a
    # log1p-transformed target before relying on pred_y.
    oof = np.expm1(oof)
    self.train["pred_y"] = oof
    self.feature_importance = pd.concat(self.feature_importance, axis=0)
    return clf_list, oof, prediction, self.feature_importance
def feature_extract(self, org_train, org_test):
    """Compute this feature block's columns from the raw train/test frames.

    Skips computation (returning None) when ``check_feature_exec`` says this
    feature should not run; otherwise times ``calc_feature`` and returns its
    result.
    """
    # Guard clause: nothing to do when this feature is disabled.
    if not self.check_feature_exec():
        return None
    with timer(f"FE: {self.name}", self.logger):
        extracted = self.calc_feature(org_train, org_test)
    return extracted
def do_valid_kfold(self, model_conf, n_splits=5):
    """Grouped K-fold cross-validation with per-fold threshold optimization.

    Trains one model per fold (folds grouped by ``installation_id``),
    collects out-of-fold predictions, fold-averaged test predictions,
    per-fold ``OptimizedRounder`` instances and their QWK scores.

    Args:
        model_conf: model configuration dict; must contain "target" and
            "train_cols".
        n_splits: number of CV folds.

    Returns:
        Tuple ``(clf_list, oof, prediction, feature_importance, optimizers,
        valid_qwks)``.
    """
    sp = Splitter()
    target = model_conf["target"]
    split_x = self.train["installation_id"]
    split_y = self.train[target]
    seed = 773
    # Group K-fold keyed by installation_id: stratification off, grouping on,
    # so all rows of one installation land in the same fold.
    sp.get_kfold_idx(split_x, split_y, seed, n_cv=n_splits,
                     stratified=False, group=True,
                     pref=self.exp_conf["exp_name"])
    oof: ndarray = np.zeros((self.train.shape[0]))
    prediction = np.zeros((self.test.shape[0]))
    clf_list = []
    self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
    self.logger.log(logging.DEBUG, model_conf["train_cols"])
    self.validation_scores = []
    optimizers = []
    valid_qwks = []
    for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
        self.logger.log(logging.DEBUG, "-" * 60)
        self.logger.log(logging.DEBUG, f"start training: {i}")
        with timer(f"fold {i}", self.logger):
            train_df, valid_df = self.train.loc[trn_idx], self.train.loc[val_idx]
            model = self.generate_model(model_conf)
            clf, fold_oof, feature_importance_df = model.train(
                train_df, valid_df, self.logger)
            fold_prediction = model.predict(self.test, self.logger)
            # Fit per-fold thresholds that round the regression output into
            # discrete classes, then score the rounded predictions with QWK.
            optR = OptimizedRounder()
            optR.fit(fold_oof, valid_df[target])
            coefficients = optR.coefficients()
            opt_preds = optR.predict(fold_oof, coefficients)
            fold_qwk = qwk(valid_df[target], opt_preds)
            optimizers.append(optR)
            valid_qwks.append(fold_qwk)
            clf_list.append(clf)
            oof[val_idx] = fold_oof
            # Average the test predictions across folds.
            prediction += fold_prediction / n_splits
            feature_importance_df["fold"] = i
            self.feature_importance.append(feature_importance_df)
    self.feature_importance = pd.concat(self.feature_importance, axis=0)
    return clf_list, oof, prediction, self.feature_importance, optimizers, valid_qwks
def do_valid_kfold(self, model_conf, n_splits=5, trn_mode='simple',
                   val_mode='simple'):
    """Grouped K-fold CV with optional extra training data and truncation.

    Trains one model per fold (folds grouped by ``installation_id``). When
    ``self.another_train`` is provided, its rows are appended to the training
    side of every fold. ``trn_mode``/``val_mode`` optionally restrict the
    fold indices via ``get_last_trancated_idx``.

    Args:
        model_conf: model configuration dict; must contain "target" and
            "train_cols".
        n_splits: number of CV folds.
        trn_mode: 'simple' (use fold rows as-is) or 'last_truncated'
            (restrict training rows via ``get_last_trancated_idx``).
        val_mode: same options, applied to the validation rows.

    Returns:
        Tuple ``(clf_list, oof, prediction, feature_importance, labels)``
        where ``labels`` holds the true ``accuracy_group`` aligned with
        ``oof``.
    """
    sp = Splitter()
    target = model_conf["target"]
    split_x = self.train["installation_id"]
    split_y = self.train[target]
    seed = 773
    sp.get_kfold_idx(split_x, split_y, seed, n_cv=n_splits,
                     stratified=False, group=True,
                     pref=self.exp_conf["exp_name"])
    oof: ndarray = np.zeros((self.train.shape[0]))
    labels = np.zeros((self.train.shape[0]))
    prediction = np.zeros((self.test.shape[0]))
    clf_list = []
    self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
    self.logger.log(logging.DEBUG, model_conf["train_cols"])
    self.validation_scores = []
    for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
        self.logger.log(logging.DEBUG, "-" * 60)
        self.logger.log(logging.DEBUG, f"start training: {i}")
        with timer(f"fold {i}", self.logger):
            _train = self.train.copy()
            # BUG FIX: truth-testing a pandas DataFrame raises ValueError
            # ("truth value ... is ambiguous"), so the original
            # `if self.another_train:` crashed whenever extra data was
            # actually supplied. Compare against None explicitly.
            if self.another_train is not None:
                _train = pd.concat([_train, self.another_train])
                trn_idx = np.concatenate([trn_idx, self.another_train_idx])
            # 'simple' (and any unrecognized mode) leaves the indices as-is.
            if trn_mode == 'last_truncated':
                trn_idx = self.get_last_trancated_idx(_train, trn_idx)
            if val_mode == 'last_truncated':
                val_idx = self.get_last_trancated_idx(_train, val_idx)
            train_df, valid_df = _train.loc[trn_idx], _train.loc[val_idx]
            model = self.generate_model(model_conf)
            clf, fold_oof, feature_importance_df = model.train(
                train_df, valid_df, self.logger)
            fold_prediction = model.predict(self.test, self.logger)
            clf_list.append(clf)
            oof[val_idx] = fold_oof
            # Keep the ground truth aligned with the OOF slots for scoring.
            labels[val_idx] = valid_df['accuracy_group'].values
            # Average the test predictions across folds.
            prediction += fold_prediction / n_splits
            feature_importance_df["fold"] = i
            self.feature_importance.append(feature_importance_df)
    self.feature_importance = pd.concat(self.feature_importance, axis=0)
    return clf_list, oof, prediction, self.feature_importance, labels