def test_softmax_classifier(self):
    clf = RGFClassifier(calc_prob='softmax')
    clf.fit(self.iris.data, self.iris.target)
    # Class probabilities must sum to 1 for every sample
    proba_sum = clf.predict_proba(self.iris.data).sum(axis=1)
    np.testing.assert_almost_equal(proba_sum,
                                   np.ones(self.iris.target.shape[0]))
    score = clf.score(self.iris.data, self.iris.target)
    print('Score: {0:.5f}'.format(score))
    self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
def test_bin_classifier(self):
    clf = RGFClassifier()
    # Reduce iris to a binary problem: class 2 vs. the rest
    bin_target = (self.iris.target == 2).astype(int)
    clf.fit(self.iris.data, bin_target)
    proba_sum = clf.predict_proba(self.iris.data).sum(axis=1)
    np.testing.assert_almost_equal(proba_sum, np.ones(bin_target.shape[0]))
    score = clf.score(self.iris.data, bin_target)
    print('Score: {0:.5f}'.format(score))
    self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
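# --- Fixture sketch (not from the original source) ---
# Both tests above reference a self.iris fixture that is not shown here.
# A minimal setUp, assuming scikit-learn's load_iris and the rgf_python
# package; the class name is an assumption for illustration.
import unittest

import numpy as np
from sklearn import datasets
from rgf.sklearn import RGFClassifier


class RGFClassifierTestCase(unittest.TestCase):
    def setUp(self):
        # Iris: 150 samples, 4 features, 3 classes
        self.iris = datasets.load_iris()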
def rgf(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame, parameters: Dict):
    n_splits = 5
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Rows [0, len(df)) hold out-of-fold predictions; rows [len(df), ...)
    # accumulate fold-averaged test predictions. 9 columns = one per class.
    oof = np.zeros((df.shape[0] + test.shape[0], 9))
    for trn_idx, val_idx in folds.split(df, target):
        train_x = df.iloc[trn_idx, :].values
        val_x = df.iloc[val_idx, :].values
        train_y = target[trn_idx].values
        val_y = target[val_idx].values
        classifier = RGFClassifier(
            n_jobs=14,
            algorithm="RGF",
            loss="Log",
        )
        classifier.fit(train_x, train_y)
        y_hat = classifier.predict_proba(val_x)
        print(log_loss(val_y, y_hat))
        print(oof.shape, y_hat.shape)
        oof[val_idx] = y_hat
        # Accumulate the fold-averaged test predictions
        pred = classifier.predict_proba(test.values)
        oof[len(target):, :] += pred / n_splits
    print(oof.shape)
    # np.save("data/04_features/oof.npz", oof)
    # oof = np.load("data/04_features/oof.npy")
    # Column names were "knn_{}" in the original, a leftover from a copied
    # script; renamed to match this model and actually applied to the frame.
    col_names = ["rgf_{}".format(i) for i in range(9)]
    oof = pd.DataFrame(oof, columns=col_names)
    oof.to_csv("data/09_oof/rgf_{}.csv".format(3))
    return oof[len(target):].values
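# --- Usage sketch (not from the original source) ---
# Illustrative call for rgf() above. The file paths and the empty parameters
# dict are assumptions; the function allocates 9 OOF columns, so it expects a
# 9-class target.
train_df = pd.read_csv("data/train.csv")  # hypothetical path
test_df = pd.read_csv("data/test.csv")    # hypothetical path
target = train_df.pop("target")
test_preds = rgf(train_df, target, test_df, parameters={})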
def objective(max_leaf, l2, min_samples_leaf, learning_rate):
    # Bayesian optimizers pass every parameter as a float, so cast the
    # integer-valued ones before handing them to RGF
    max_leaf = int(max_leaf)
    min_samples_leaf = int(min_samples_leaf)
    model = RGFClassifier(
        max_leaf=max_leaf,
        l2=l2,
        min_samples_leaf=min_samples_leaf,
        learning_rate=learning_rate,
        algorithm="RGF_Sib",
        test_interval=100,
    )
    model.fit(train_m, label_m)
    pred_proba = model.predict_proba(train_val)
    score = roc_auc_score(label_val, pred_proba[:, 1])
    return score
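# --- Usage sketch (not from the original source) ---
# A minimal way to drive objective() with Bayesian optimization, assuming the
# bayes_opt package is installed; the bounds below are illustrative, not the
# author's tuned search space.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=objective,
    pbounds={
        "max_leaf": (100, 3000),
        "l2": (0.01, 1.0),
        "min_samples_leaf": (1, 20),
        "learning_rate": (0.05, 0.5),
    },
    random_state=42,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best AUC and the parameters that produced it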
def train(params):
    # log hyperparams for this run
    for k, v in params.items():
        mlflow.log_param(k, v)

    # load dataset files
    # NOTE: to get metadata, set allow_pickle=True for np.load,
    # then index into the dataset object with key 'meta'
    dataset = np.load('preprocessed/dataset.npz')
    X_arr = dataset['X_arr']
    Y_arr = dataset['Y_arr']

    # split for train-test
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_arr, Y_arr, stratify=Y_arr, test_size=0.2)

    # instantiate model with params
    rgf_clf = RGFClassifier(**params)
    rgf_clf.fit(X_train, Y_train)

    # predict on test data
    Y_pred = rgf_clf.predict(X_test)
    Y_pred_proba = rgf_clf.predict_proba(X_test)

    # log logistic loss value
    logistic_loss = log_loss(Y_test, Y_pred_proba)
    mlflow.log_metric('log_loss', logistic_loss)

    # log precision, recall, f1
    p, r, f, _ = precision_recall_fscore_support(
        y_true=Y_test, y_pred=Y_pred, average='binary')
    mlflow.log_metric('precision', p)
    mlflow.log_metric('recall', r)
    mlflow.log_metric('f1', f)

    # which features matter the most
    print("========== FEATURE IMPORTANCES ==========")
    print(rgf_clf.feature_importances_)
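# --- Usage sketch (not from the original source) ---
# Wrapping train() in an MLflow run so the logged params and metrics land in
# a single run record; the parameter values are illustrative RGF settings,
# not the author's.
import mlflow

params = {
    "max_leaf": 1000,
    "algorithm": "RGF",
    "loss": "Log",
    "l2": 0.01,
}
with mlflow.start_run():
    train(params)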
    }
    num_boost_round = 190
    fit_model = lgbm.train(params, dtrain, num_boost_round,
                           valid_sets=dvalid,
                           feval=evalerror,
                           verbose_eval=100,
                           early_stopping_rounds=100)
else:
    fit_model = xgbmodel.fit(X_train, y_train)

# Generate validation predictions for this fold
if USE_RGF_INSTEAD:
    # RGF does not accept NaNs, so impute with the training-fold means
    pred = rgf.predict_proba(X_valid.fillna(X_train.mean()))[:, 1]
else:
    pred = fit_model.predict(X_valid)
    # pred = fit_model.predict_proba(X_valid)[:, 1]
gini_results.append(eval_gini(y_valid, pred))
print(" Gini = ", gini_results[-1])
y_valid_pred.iloc[test_index] = pred

# Accumulate test set predictions
if USE_RGF_INSTEAD:
    probs = rgf.predict_proba(X_test.fillna(X_train.mean()))[:, 1]
try:
    # RGF leaves temporary model files on disk; remove them between folds
    subprocess.call('rm -rf /tmp/rgf/*', shell=True)
    print("Clean up is successful")
    print(glob.glob("/tmp/rgf/*"))
except Exception as e:
    print(str(e))
def train_predict(train_df, test_df, params, model_name=None):
    if model_name is None:
        # model_name = 'l1_rgf_%s' % datetime.now().strftime('%m%d%H%M')
        model_name = 'l1_rgf'
    log = Logger(os.path.join('log', '%s.log' % model_name))
    cols = [c for c in train_df.columns if c not in ['id', 'target']]
    log.info('Features:')
    for col in cols:
        log.info('- %s' % col)
    log.info('\n')
    log.info('Parameters:')
    for param_name, param_value in params.items():
        log.info('- %s: %s' % (param_name, str(param_value)))
    log.info('\n')

    X = train_df[cols].values
    y = train_df['target'].values
    X_test = test_df[cols].values
    prob_train = np.zeros(len(X))
    prob_test = np.zeros(len(X_test))

    kfold = 5
    scores = []
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=41)
    for i, (train_ind, valid_ind) in enumerate(skf.split(X, y)):
        X_train, X_valid = X[train_ind], X[valid_ind]
        y_train, y_valid = y[train_ind], y[valid_ind]
        model = RGFClassifier(**params)
        model.fit(X_train, y_train)
        prob = model.predict_proba(X_valid)[:, 1]
        prob_train[valid_ind] = prob
        score = gini_norm(prob, y_valid)
        scores.append(score)
        log.info('- Fold %d/%d score: %f' % (i + 1, kfold, score))
        prob = model.predict_proba(X_test)[:, 1]
        prob_test += prob / kfold
        # RGF writes temporary model files to disk; clean them between folds
        try:
            subprocess.call('rm -rf /tmp/rgf/*', shell=True)
            print("Clean up is successful")
            print(glob.glob("/tmp/rgf/*"))
        except Exception as e:
            print(str(e))

    mean_score = np.mean(scores)
    log.info('- Mean score: %f' % mean_score)

    prob_train_df = pd.DataFrame({'id': train_df['id'], 'target': prob_train})
    prob_train_df.to_csv(os.path.join('local_cv', '%s.csv.gz' % model_name),
                         index=False, compression='gzip')
    prob_test_df = pd.DataFrame({'id': test_df['id'], 'target': prob_test})
    prob_test_df.to_csv(os.path.join('submission', '%s.csv.gz' % model_name),
                        index=False, compression='gzip')
    return mean_score
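# --- Usage sketch (not from the original source) ---
# Illustrative call; the parameter values echo RGF settings used elsewhere in
# this collection and are not the author's tuned configuration. The paths are
# assumptions.
params = {
    "max_leaf": 1000,
    "algorithm": "RGF_Sib",
    "loss": "Log",
    "l2": 0.01,
    "min_samples_leaf": 10,
    "learning_rate": 0.5,
}
train_df = pd.read_csv("input/train.csv")  # hypothetical path
test_df = pd.read_csv("input/test.csv")    # hypothetical path
mean_score = train_predict(train_df, test_df, params, model_name="l1_rgf")
print("CV mean score: %f" % mean_score)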
else:
    blindloodata = pd.concat([blindloodata, blindtrain])

for c in highcardinality:
    test['loo' + c] = ProjectOnMean(train, test, c)
test.drop(highcardinality, inplace=True, axis=1)

train = blindloodata
train.drop(highcardinality, inplace=True, axis=1)
train = train.fillna(train.mean())
test = test.fillna(train.mean())


# In[ ]:

rgf = RGFClassifier(max_leaf=1000,  # Try increasing this as a starter
                    algorithm="RGF_Sib",
                    test_interval=250,
                    loss="Log",
                    verbose=True)
rgf.fit(train[train.columns[2:]], train.target)
x = rgf.predict_proba(train[train.columns[2:]])
print(GiniScore(train.target, x[:, 1]))


# In[ ]:

sub = pd.read_csv('../input/sample_submission.csv')
x = rgf.predict_proba(test[test.columns[2:]])
sub.target = x[:, 1]
sub.to_csv('rgfsubmission.csv', index=False)
class Level1Model(object):
    # Feature importances from a boruta-style run: actual / shadow score
    train_features = [
        "ps_car_13",      # : 1571.65 / shadow 609.23
        "ps_reg_03",      # : 1408.42 / shadow 511.15
        "ps_ind_05_cat",  # : 1387.87 / shadow 84.72
        "ps_ind_03",      # : 1219.47 / shadow 230.55
        "ps_ind_15",      # : 922.18 / shadow 242.00
        "ps_reg_02",      # : 920.65 / shadow 267.50
        "ps_car_14",      # : 798.48 / shadow 549.58
        "ps_car_12",      # : 731.93 / shadow 293.62
        "ps_car_01_cat",  # : 698.07 / shadow 178.72
        "ps_car_07_cat",  # : 694.53 / shadow 36.35
        "ps_ind_17_bin",  # : 620.77 / shadow 23.15
        "ps_car_03_cat",  # : 611.73 / shadow 50.67
        "ps_reg_01",      # : 598.60 / shadow 178.57
        "ps_car_15",      # : 593.35 / shadow 226.43
        "ps_ind_01",      # : 547.32 / shadow 154.58
        "ps_ind_16_bin",  # : 475.37 / shadow 34.17
        "ps_ind_07_bin",  # : 435.28 / shadow 28.92
        "ps_car_06_cat",  # : 398.02 / shadow 212.43
        "ps_car_04_cat",  # : 376.87 / shadow 76.98
        "ps_ind_06_bin",  # : 370.97 / shadow 36.13
        "ps_car_09_cat",  # : 214.12 / shadow 81.38
        "ps_car_02_cat",  # : 203.03 / shadow 26.67
        "ps_ind_02_cat",  # : 189.47 / shadow 65.68
        "ps_car_11",      # : 173.28 / shadow 76.45
        "ps_car_05_cat",  # : 172.75 / shadow 62.92
        "ps_calc_09",     # : 169.13 / shadow 129.72
        "ps_calc_05",     # : 148.83 / shadow 120.68
        "ps_ind_08_bin",  # : 140.73 / shadow 27.63
        "ps_car_08_cat",  # : 120.87 / shadow 28.82
        "ps_ind_09_bin",  # : 113.92 / shadow 27.05
        "ps_ind_04_cat",  # : 107.27 / shadow 37.43
        "ps_ind_18_bin",  # : 77.42 / shadow 25.97
        "ps_ind_12_bin",  # : 39.67 / shadow 15.52
        "ps_ind_14",      # : 37.37 / shadow 16.65
    ]
    combs = [
        ('ps_reg_01', 'ps_car_02_cat'),
        ('ps_reg_01', 'ps_car_04_cat'),
    ]

    def __init__(self, strat=True, splits=5, random_state=15,
                 submit=False, mean_sub=False, metric=None):
        # type: (bool, int, int, bool, bool, Callable) -> None
        self.curr_date = datetime.datetime.now()
        self._submit = submit
        self._id = ""
        self.trn = None
        self.target = None
        self.sub = None
        self.model = None
        self.metric = metric
        self.mean_submission = mean_sub
        if strat:
            self._folds = StratifiedKFold(n_splits=splits, shuffle=True,
                                          random_state=random_state)
        else:
            self._folds = KFold(n_splits=splits, shuffle=True,
                                random_state=random_state)
        self.set_model()

    def set_model(self):
        self.model = RGFClassifier(max_leaf=1000,
                                   algorithm="RGF",  # RGF_Sib, RGF_Opt
                                   loss="Log",
                                   l2=0.01,
                                   sl2=0.01,
                                   normalize=False,
                                   min_samples_leaf=10,
                                   n_iter=None,
                                   opt_interval=100,
                                   learning_rate=.5,
                                   calc_prob="sigmoid",
                                   n_jobs=-1,
                                   memory_policy="generous",
                                   verbose=0)

    @property
    def do_submission(self):
        return self._submit

    @property
    def id(self):
        return self._get_id()

    @abc.abstractmethod
    def _get_id(self):
        self._id = "rgf_full_feat_"
        if self._id == "":
            raise ValueError("Id is not set for class " + str(type(self)))
        return self._id

    def read_data(self):
        self.trn = pd.read_csv("../../input/train.csv", index_col=0)
        self.target = self.trn["target"]
        del self.trn["target"]
        if self.do_submission:
            self.sub = pd.read_csv("../../input/test.csv", index_col=0)

    def add_combinations(self):
        # type: (...) -> (pd.DataFrame, Optional[DataFrame])
        start = time.time()
        for n_c, (f1, f2) in enumerate(self.combs):
            name1 = f1 + "_plus_" + f2
            print('current feature %60s %4d in %5.1f'
                  % (name1, n_c + 1, (time.time() - start) / 60), end='')
            print('\r' * 75, end='')
            self.trn[name1] = self.trn[f1].apply(lambda x: str(x)) + "_" + self.trn[f2].apply(lambda x: str(x))
            if self.do_submission:
                self.sub[name1] = self.sub[f1].apply(lambda x: str(x)) + "_" + self.sub[f2].apply(lambda x: str(x))
                # Factorize on train, then map the test set through the same indexer
                self.trn[name1], indexer = pd.factorize(self.trn[name1])
                self.sub[name1] = indexer.get_indexer(self.sub[name1])
            else:
                self.trn[name1], _ = pd.factorize(self.trn[name1])

    def prepare_data(self):
        noisy_features = list(set(self.trn.columns) - set(self.train_features))
        # Bin continuous variables before One-Hot Encoding
        # (assumes self.sub has been loaded by read_data)
        for f in ["ps_reg_03", "ps_car_12", "ps_car_13", "ps_car_14"]:
            full_f = pd.concat([self.trn[f], self.sub[f]], axis=0)
            full_cut = np.array(pd.cut(full_f, 50, labels=False))
            self.trn[f] = full_cut[:len(self.trn)]
            self.sub[f] = full_cut[len(self.trn):]
            del full_f
            del full_cut
        self.add_combinations()
        # Remove noisy features
        self.trn.drop(noisy_features, axis=1, inplace=True)
        if self.do_submission:
            self.sub.drop(noisy_features, axis=1, inplace=True)
        print(self.trn.columns)

    def predict_oof_and_submission(self):
        self.read_data()
        self.prepare_data()
        pos_ratio = .3
        class_weight = {0: 1 / (2 * (1 - pos_ratio)), 1: 1 / (2 * pos_ratio)}
        if self.model is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if self.target is None:
            raise ValueError("Target is not set for class " + str(type(self)))
        if self.trn is None:
            raise ValueError("Train data is not set for class " + str(type(self)))
        if (self.sub is None) and self.do_submission:
            raise ValueError("Submission data is not set for class " + str(type(self)))
        # Prepare predictors
        oof_preds = np.zeros(len(self.trn))
        if self.sub is not None:
            sub_preds = np.zeros(len(self.sub))
        # Go through folds
        start = time.time()
        f_cats = [f for f in self.trn.columns if "_cat" in f]
        for i_fold, (trn_idx, val_idx) in enumerate(self._folds.split(self.target, self.target)):
            # Split data
            trn_x, trn_y = self.trn.iloc[trn_idx].copy(), self.target.iloc[trn_idx]
            val_x, val_y = self.trn.iloc[val_idx].copy(), self.target.iloc[val_idx]
            # Compute target averages
            for f in f_cats:
                ft = TargetAverageTransformation(feature_name=f,
                                                 average=TargetAverageTransformation.MEAN,
                                                 min_samples_leaf=200,
                                                 smoothing=10,
                                                 noise_level=0)
                trn_x[f + "_avg"] = ft.fit_transform(data=trn_x, target=trn_y)
                val_x[f + "_avg"] = ft.transform(data=val_x)
                if self.do_submission:
                    self.sub[f + "_avg"] = ft.transform(data=self.sub)
            # Fit model (eval_sets and sample_weight are prepared but unused
            # by RGFClassifier.fit)
            eval_sets = [(trn_x.values, trn_y.values), (val_x.values, val_y.values)]
            sample_weight = trn_y.apply(lambda x: class_weight[x]).values
            self.model.fit(trn_x.values, trn_y.values)
            # Predict OOF
            oof_preds[val_idx] = self.model.predict_proba(val_x.values)[:, 1]
            # Predict SUB if mean is requested
            if (self.sub is not None) and self.mean_submission:
                sub_preds += self.model.predict_proba(self.sub.values)[:, 1] / self._folds.n_splits
            # Print results of current fold
            print("Fold %2d score : %.6f in [%5.1f]"
                  % (i_fold + 1, self.metric(val_y, oof_preds[val_idx]),
                     (time.time() - start) / 60))
            del trn_x
            del val_x
            gc.collect()
        # Display OOF result
        oof_score = self.metric(self.target, oof_preds)
        print("Full OOF score : %.6f" % oof_score)
        # Check if we need to fit the model on the full dataset
        if (self.sub is not None) and not self.mean_submission:
            # Compute target averages
            for f in f_cats:
                ft = TargetAverageTransformation(feature_name=f,
                                                 average=TargetAverageTransformation.MEAN,
                                                 min_samples_leaf=200,
                                                 smoothing=10,
                                                 noise_level=0)
                self.trn[f + "_avg"] = ft.fit_transform(data=self.trn, target=self.target)
                self.sub[f + "_avg"] = ft.transform(data=self.sub)
            # Fit model on the full training set
            self.model.fit(self.trn, self.target)
            # Compute prediction for submission
            sub_preds = self.model.predict_proba(self.sub)[:, 1]

        if self.do_submission:
            filename = "../output_preds/" + self.id
            filename += str(int(1e6 * oof_score)) + "_"
            filename += self.curr_date.strftime("%Y_%m_%d_%Hh%M")
            # Save OOF predictions for stacking
            self.trn[self.id] = oof_preds
            self.trn[[self.id]].to_csv(filename + "_oof.csv", float_format="%.9f")
            # Save submission prediction for stacking or submission
            self.sub["target"] = sub_preds
            self.sub[["target"]].to_csv(filename + "_sub.csv", float_format="%.9f")
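# --- Usage sketch (not from the original source) ---
# How Level1Model might be driven end to end. gini_norm (seen earlier in this
# collection) stands in for the metric callable and is an assumption here, as
# is the choice of constructor arguments.
model = Level1Model(strat=True, splits=5, random_state=15,
                    submit=True, mean_sub=True, metric=gini_norm)
model.predict_oof_and_submission()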