Exemple #1
0
    def test_softmax_classifier(self):
        """Fit a ``calc_prob='Softmax'`` RGFClassifier on iris and sanity-check it.

        Every row returned by ``predict_proba`` must sum to one, and the score
        computed on the training data itself must be greater than 0.8.
        """
        model = RGFClassifier(prefix='clf', calc_prob='Softmax')
        features, labels = self.iris.data, self.iris.target
        model.fit(features, labels)

        # Class probabilities of each sample have to add up to 1.
        row_totals = model.predict_proba(features).sum(axis=1)
        expected_totals = np.ones(labels.shape[0])
        np.testing.assert_almost_equal(row_totals, expected_totals)

        fit_score = model.score(features, labels)
        print('Score: {0:.5f}'.format(fit_score))
        self.assertGreater(fit_score, 0.8,
                           "Failed with score = {0:.5f}".format(fit_score))
Exemple #2
0
    def test_softmax_classifier(self):
        """Fit a ``calc_prob='softmax'`` RGFClassifier on iris and sanity-check it.

        Every row returned by ``predict_proba`` must sum to one, and the score
        computed on the training data itself must be greater than 0.8.
        """
        model = RGFClassifier(calc_prob='softmax')
        features, labels = self.iris.data, self.iris.target
        model.fit(features, labels)

        # Class probabilities of each sample have to add up to 1.
        row_totals = model.predict_proba(features).sum(axis=1)
        expected_totals = np.ones(labels.shape[0])
        np.testing.assert_almost_equal(row_totals, expected_totals)

        fit_score = model.score(features, labels)
        print('Score: {0:.5f}'.format(fit_score))
        self.assertGreater(fit_score, 0.8,
                           "Failed with score = {0:.5f}".format(fit_score))
Exemple #3
0
    def test_bin_classifier(self):
        """Fit an RGFClassifier on a binary "is class 2" iris target and check it.

        Every row returned by ``predict_proba`` must sum to one, and the score
        computed on the training data itself must be greater than 0.8.
        """
        model = RGFClassifier(prefix='clf')
        # Collapse the iris labels into a 0/1 target: 1 where the label equals 2.
        is_class_two = (self.iris.target == 2).astype(int)
        features = self.iris.data
        model.fit(features, is_class_two)

        row_totals = model.predict_proba(features).sum(axis=1)
        expected_totals = np.ones(is_class_two.shape[0])
        np.testing.assert_almost_equal(row_totals, expected_totals)

        fit_score = model.score(features, is_class_two)
        print('Score: {0:.5f}'.format(fit_score))
        self.assertGreater(fit_score, 0.8,
                           "Failed with score = {0:.5f}".format(fit_score))
Exemple #4
0
    def test_bin_classifier(self):
        """Fit a default RGFClassifier on a binary "is class 2" iris target.

        Every row returned by ``predict_proba`` must sum to one, and the score
        computed on the training data itself must be greater than 0.8.
        """
        model = RGFClassifier()
        # Collapse the iris labels into a 0/1 target: 1 where the label equals 2.
        is_class_two = (self.iris.target == 2).astype(int)
        features = self.iris.data
        model.fit(features, is_class_two)

        row_totals = model.predict_proba(features).sum(axis=1)
        expected_totals = np.ones(is_class_two.shape[0])
        np.testing.assert_almost_equal(row_totals, expected_totals)

        fit_score = model.score(features, is_class_two)
        print('Score: {0:.5f}'.format(fit_score))
        self.assertGreater(fit_score, 0.8,
                           "Failed with score = {0:.5f}".format(fit_score))
Exemple #5
0
def rgf(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame,
        parameters: Dict):
    """Build stacked RGF class-probability features with 5-fold cross-validation.

    For every fold an ``RGFClassifier`` is fitted on the training part; its
    ``predict_proba`` output fills the out-of-fold rows of the training data,
    and its predictions for ``test`` are averaged over the folds.  The stacked
    result (training rows first, test rows after) is written to
    ``data/09_oof/rgf_3.csv``.

    Args:
        df: Training features; rows are selected positionally.
        target: Training labels aligned row-by-row with ``df``.  The output
            buffer has 9 columns, so ``predict_proba`` is expected to return
            9 class probabilities.
        test: Test features.
        parameters: Currently unused; kept for interface compatibility.

    Returns:
        Array with the fold-averaged test probabilities, one row per row of
        ``test``.
    """
    n_splits = 5
    n_classes = 9  # width of the probability matrix stored per sample
    n_train = len(target)
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Rows [0, n_train) hold out-of-fold predictions, the rest the test average.
    oof = np.zeros((df.shape[0] + test.shape[0], n_classes))
    test_x = test.values  # loop-invariant, extracted once

    for trn_idx, val_idx in folds.split(df, target):
        train_x = df.iloc[trn_idx, :].values
        val_x = df.iloc[val_idx, :].values
        # KFold yields positions, so the target must be indexed positionally
        # as well; plain ``target[idx]`` is label/column based and only works
        # by accident for a Series with a default RangeIndex.
        train_y = target.iloc[trn_idx].values
        val_y = target.iloc[val_idx].values

        classifier = RGFClassifier(
            n_jobs=14,
            algorithm="RGF",
            loss="Log",
        )
        classifier.fit(train_x, train_y)

        y_hat = classifier.predict_proba(val_x)

        print(log_loss(val_y, y_hat))
        print(oof.shape, y_hat.shape)
        oof[val_idx] = y_hat

        pred = classifier.predict_proba(test_x)
        oof[n_train:, :] += pred / n_splits

    print(oof.shape)
    oof = pd.DataFrame(oof)
    oof.to_csv("data/09_oof/rgf_{}.csv".format(3))
    return oof[n_train:].values
    def objective(max_leaf, l2, min_samples_leaf, learning_rate):
        """Return the validation ROC AUC of an RGF model for one parameter set.

        ``max_leaf`` and ``min_samples_leaf`` are converted with ``int()``
        before use (presumably because the caller supplies floats — confirm
        against the optimizer that drives this function).  The model is fitted
        on ``train_m`` / ``label_m`` and scored on ``train_val`` /
        ``label_val``, all taken from the enclosing scope.
        """
        max_leaf, min_samples_leaf = int(max_leaf), int(min_samples_leaf)

        for converted in (max_leaf, min_samples_leaf):
            assert type(converted) == int

        estimator = RGFClassifier(
            max_leaf=max_leaf,
            l2=l2,
            min_samples_leaf=min_samples_leaf,
            learning_rate=learning_rate,
            algorithm="RGF_Sib",
            test_interval=100,
        )
        estimator.fit(train_m, label_m)

        # Column 1 of predict_proba is used as the score for roc_auc_score.
        val_proba = estimator.predict_proba(train_val)
        return roc_auc_score(label_val, val_proba[:, 1])
Exemple #7
0
def train(params):
    """Train an RGFClassifier with ``params`` and log the run to MLflow.

    The hyperparameters are logged first, then ``preprocessed/dataset.npz`` is
    loaded, split 80/20 (stratified on the labels), and a classifier is fitted
    on the training part.  Log loss, precision, recall and F1 measured on the
    held-out part are logged as MLflow metrics, and the fitted model's
    ``feature_importances_`` are printed.

    Args:
        params: Mapping of keyword arguments forwarded to ``RGFClassifier``;
            every item is also logged via ``mlflow.log_param``.
    """
    # Record each hyperparameter of this run.
    for name, value in params.items():
        mlflow.log_param(name, value)

    # Load the dataset arrays.
    # NOTE: to get meta data, set allow_pickle=True for np.load, then index
    # into the dataset object with key 'meta'.
    archive = np.load('preprocessed/dataset.npz')
    features = archive['X_arr']
    labels = archive['Y_arr']

    # Stratified hold-out split: 20% of the samples are kept for evaluation.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, stratify=labels, test_size=0.2)

    # Build and fit the model from the supplied hyperparameters.
    model = RGFClassifier(**params)
    model.fit(X_train, Y_train)

    # Hard labels and class probabilities for the held-out samples.
    hard_preds = model.predict(X_test)
    proba_preds = model.predict_proba(X_test)

    # Logistic loss of the predicted probabilities.
    mlflow.log_metric('log_loss', log_loss(Y_test, proba_preds))

    # Precision, recall and F1 of the hard predictions.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=Y_test, y_pred=hard_preds, average='binary')
    for metric_name, metric_value in (('precision', precision),
                                      ('recall', recall),
                                      ('f1', f1)):
        mlflow.log_metric(metric_name, metric_value)

    # Show which features matter the most.
    print("========== FEATURE IMPORTANCES ==========")
    print(model.feature_importances_)
Exemple #8
0
        }

        num_boost_round = 190
        fit_model = lgbm.train(params,
                               dtrain,
                               num_boost_round,
                               valid_sets=dvalid,
                               feval=evalerror,
                               verbose_eval=100,
                               early_stopping_rounds=100)
    else:
        fit_model = xgbmodel.fit(X_train, y_train)

    # Generate validation predictions for this fold
    if USE_RGF_INSTEAD:
        pred = rgf.predict_proba(X_valid.fillna(X_train.mean()))[:, 1]
    else:
        pred = fit_model.predict(X_valid)
        # pred = fit_model.predict_proba(X_valid)[:, 1]
    gini_results.append(eval_gini(y_valid, pred))
    print("  Gini = ", gini_results[-1])
    y_valid_pred.iloc[test_index] = pred

    # Accumulate test set predictions
    if USE_RGF_INSTEAD:
        probs = rgf.predict_proba(X_test.fillna(X_train.mean()))[:, 1]
        try:
            subprocess.call('rm -rf /tmp/rgf/*', shell=True)
            print("Clean up is successfull")
            print(glob.glob("/tmp/rgf/*"))
        except Exception as e:
Exemple #9
0
def train_predict(train_df, test_df, params, model_name=None):
    """Cross-validate an RGFClassifier and store OOF and test predictions.

    Runs a 5-fold stratified CV.  In each fold a fresh ``RGFClassifier`` is
    fitted; its class-1 probabilities fill the out-of-fold vector for the
    validation rows and are averaged over the folds for the test rows.  The
    out-of-fold predictions are written to ``local_cv/<model_name>.csv.gz``
    and the averaged test predictions to ``submission/<model_name>.csv.gz``.

    Args:
        train_df: Training frame with ``id`` and ``target`` columns; all other
            columns are used as features.
        test_df: Test frame with an ``id`` column and the same feature columns.
        params: Keyword arguments forwarded to ``RGFClassifier``.
        model_name: Base name for the log and output files; defaults to
            ``'l1_rgf'``.

    Returns:
        Mean of the per-fold ``gini_norm`` scores.
    """
    if model_name is None:
        model_name = 'l1_rgf'
    log = Logger(os.path.join('log', '%s.log' % model_name))

    cols = [c for c in train_df.columns if c not in ['id', 'target']]

    log.info('Features:')
    for col in cols:
        log.info('- %s' % col)
    log.info('\n')

    log.info('Parameters:')
    for param_name, param_value in params.items():
        log.info('- %s: %s' % (param_name, str(param_value)))
    log.info('\n')

    X = train_df[cols].values
    y = train_df['target'].values
    X_test = test_df[cols].values

    prob_train = np.zeros(len(X))
    prob_test = np.zeros(len(X_test))

    kfold = 5
    scores = []
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=41)
    for i, (train_ind, valid_ind) in enumerate(skf.split(X, y)):
        X_train, X_valid = X[train_ind], X[valid_ind]
        y_train, y_valid = y[train_ind], y[valid_ind]

        model = RGFClassifier(**params)
        model.fit(X_train, y_train)

        # Out-of-fold class-1 probabilities for the validation rows.
        prob = model.predict_proba(X_valid)[:, 1]
        prob_train[valid_ind] = prob
        score = gini_norm(prob, y_valid)
        scores.append(score)
        log.info('- Fold %d/%d score: %f' % (i + 1, kfold, score))

        # Test predictions are averaged over the folds.
        prob = model.predict_proba(X_test)[:, 1]
        prob_test += prob / kfold

        # Best-effort removal of whatever was left under /tmp/rgf (presumably
        # RGF's temporary files - confirm).  The shell is needed for the glob.
        # A failing ``rm`` is reported via its exit code instead of being
        # announced as a success.
        try:
            exit_code = subprocess.call('rm -rf /tmp/rgf/*', shell=True)
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            print(str(e))
        else:
            if exit_code == 0:
                print("Clean up is successfull")
            else:
                print("Clean up failed with exit code %d" % exit_code)
            print(glob.glob("/tmp/rgf/*"))

    mean_score = np.mean(scores)
    log.info('- Mean score: %f' % mean_score)

    prob_train_df = pd.DataFrame({'id': train_df['id'], 'target': prob_train})
    prob_train_df.to_csv(os.path.join('local_cv', '%s.csv.gz' % model_name),
                         index=False,
                         compression='gzip')
    prob_test_df = pd.DataFrame({'id': test_df['id'], 'target': prob_test})
    prob_test_df.to_csv(os.path.join('submission', '%s.csv.gz' % model_name),
                        index=False,
                        compression='gzip')

    return mean_score
Exemple #10
0
    else:
        blindloodata = pd.concat([blindloodata, blindtrain])

# For each high-cardinality column, add a 'loo'-prefixed column to `test`
# computed by ProjectOnMean(train, test, c), then drop the raw columns.
for c in highcardinality:
    test['loo' + c] = ProjectOnMean(train, test, c)
test.drop(highcardinality, inplace=True, axis=1)

# From here on `train` is the frame accumulated in `blindloodata`; the
# in-place drop therefore also modifies `blindloodata` (same object).
train = blindloodata
train.drop(highcardinality, inplace=True, axis=1)
# Fill missing values in both frames with the training column means
# (`train` is already filled when its means are taken for `test`).
train = train.fillna(train.mean())
test = test.fillna(train.mean())

# In[ ]:

# Fit RGF on every column from the third one onwards (the first two columns
# are presumably id and target - confirm) and print the Gini score of the
# class-1 probabilities predicted for the training data itself.
rgf = RGFClassifier(
    max_leaf=1000,  # Try increasing this as a starter
    algorithm="RGF_Sib",
    test_interval=250,
    loss="Log",
    verbose=True)
rgf.fit(train[train.columns[2:]], train.target)
x = rgf.predict_proba(train[train.columns[2:]])
print(GiniScore(train.target, x[:, 1]))

# In[ ]:

# Predict class-1 probabilities for the test set and write them into the
# sample submission's `target` column.
sub = pd.read_csv('../input/sample_submission.csv')
x = rgf.predict_proba(test[test.columns[2:]])
sub.target = x[:, 1]
sub.to_csv('rgfsubmission.csv', index=False)
class Level1Model(object):
    """Level-1 stacking model built around an ``RGFClassifier``.

    The workflow is driven by :meth:`predict_oof_and_submission`: it reads the
    CSV data, bins four continuous features, adds factorized feature
    combinations, drops every original column that is not listed in
    ``train_features``, and then produces out-of-fold (OOF) predictions with
    per-fold target-average encoding of the ``*_cat`` columns.  When
    ``submit`` is set, predictions for the submission set are produced too and
    both prediction vectors are written to CSV files.
    """

    # Columns kept for training.  The trailing numbers were recorded by the
    # original author (presumably feature-importance scores against shuffled
    # "shadow" features - confirm).
    train_features = [
        "ps_car_13",  # : 1571.65 / shadow  609.23
        "ps_reg_03",  # : 1408.42 / shadow  511.15
        "ps_ind_05_cat",  # : 1387.87 / shadow   84.72
        "ps_ind_03",  # : 1219.47 / shadow  230.55
        "ps_ind_15",  # :  922.18 / shadow  242.00
        "ps_reg_02",  # :  920.65 / shadow  267.50
        "ps_car_14",  # :  798.48 / shadow  549.58
        "ps_car_12",  # :  731.93 / shadow  293.62
        "ps_car_01_cat",  # :  698.07 / shadow  178.72
        "ps_car_07_cat",  # :  694.53 / shadow   36.35
        "ps_ind_17_bin",  # :  620.77 / shadow   23.15
        "ps_car_03_cat",  # :  611.73 / shadow   50.67
        "ps_reg_01",  # :  598.60 / shadow  178.57
        "ps_car_15",  # :  593.35 / shadow  226.43
        "ps_ind_01",  # :  547.32 / shadow  154.58
        "ps_ind_16_bin",  # :  475.37 / shadow   34.17
        "ps_ind_07_bin",  # :  435.28 / shadow   28.92
        "ps_car_06_cat",  # :  398.02 / shadow  212.43
        "ps_car_04_cat",  # :  376.87 / shadow   76.98
        "ps_ind_06_bin",  # :  370.97 / shadow   36.13
        "ps_car_09_cat",  # :  214.12 / shadow   81.38
        "ps_car_02_cat",  # :  203.03 / shadow   26.67
        "ps_ind_02_cat",  # :  189.47 / shadow   65.68
        "ps_car_11",  # :  173.28 / shadow   76.45
        "ps_car_05_cat",  # :  172.75 / shadow   62.92
        "ps_calc_09",  # :  169.13 / shadow  129.72
        "ps_calc_05",  # :  148.83 / shadow  120.68
        "ps_ind_08_bin",  # :  140.73 / shadow   27.63
        "ps_car_08_cat",  # :  120.87 / shadow   28.82
        "ps_ind_09_bin",  # :  113.92 / shadow   27.05
        "ps_ind_04_cat",  # :  107.27 / shadow   37.43
        "ps_ind_18_bin",  # :   77.42 / shadow   25.97
        "ps_ind_12_bin",  # :   39.67 / shadow   15.52
        "ps_ind_14",  # :   37.37 / shadow   16.65
    ]
    # Pairs of columns that are concatenated and factorized into a new
    # "<f1>_plus_<f2>" feature by add_combinations().
    combs = [
        ('ps_reg_01', 'ps_car_02_cat'),
        ('ps_reg_01', 'ps_car_04_cat'),
    ]

    def __init__(self, strat=True, splits=5, random_state=15, submit=False, mean_sub=False, metric=None):
        # type: (bool, int, int, bool, bool, Callable) -> None
        """Configure the cross-validation scheme and create the model.

        Args:
            strat: Use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.
            splits: Number of folds.
            random_state: Seed of the (shuffled) fold generator.
            submit: Also read the test set, predict it and write output files.
            mean_sub: Average the per-fold submission predictions instead of
                refitting the model on the full training set.
            metric: Callable invoked as ``metric(y_true, y_pred)``; required
                by :meth:`predict_oof_and_submission`.
        """
        self.curr_date = datetime.datetime.now()
        self._submit = submit
        self._id = ""
        self.trn = None
        self.target = None
        self.sub = None
        self.model = None
        self.metric = metric
        self.mean_submission = mean_sub
        if strat:
            self._folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)
        else:
            self._folds = KFold(n_splits=splits, shuffle=True, random_state=random_state)
        self.set_model()

    def set_model(self):
        """Create the ``RGFClassifier`` used for every fit and store it in ``self.model``."""
        self.model = RGFClassifier(max_leaf=1000,  # 1000,
                                   algorithm="RGF",  # RGF_Sib, RGF_Opt
                                   loss="Log",
                                   l2=0.01,
                                   sl2=0.01,
                                   normalize=False,
                                   min_samples_leaf=10,
                                   n_iter=None,
                                   opt_interval=100,
                                   learning_rate=.5,
                                   calc_prob="sigmoid",
                                   n_jobs=-1,
                                   memory_policy="generous",
                                   verbose=0
                                   )

    @property
    def do_submission(self):
        """Whether submission data is read, predicted and written."""
        return self._submit

    @property
    def id(self):
        """Identifier used as OOF column name and as output file name prefix."""
        return self._get_id()

    @abc.abstractmethod
    def _get_id(self):
        """Set and return the model identifier.

        NOTE: the class does not use ``abc.ABCMeta``, so the
        ``abstractmethod`` marker does not prevent instantiation here.
        """
        self._id = "rgf_full_feat_"
        return self._id

    def read_data(self):
        """Load the training CSV (and the test CSV when submitting).

        The ``target`` column is moved out of ``self.trn`` into ``self.target``.
        """
        self.trn = pd.read_csv("../../input/train.csv", index_col=0)
        self.target = self.trn["target"]
        del self.trn["target"]
        if self.do_submission:
            self.sub = pd.read_csv("../../input/test.csv", index_col=0)

    def add_combinations(self):
        # type: (...) -> None
        """Add one factorized ``<f1>_plus_<f2>`` column per pair in ``combs``.

        The two source columns are converted to strings, joined with ``_`` and
        factorized on the training data.  When submitting, the submission
        column is encoded with the codes learned on the training data (values
        unseen in training get ``-1`` from ``Index.get_indexer``).
        """
        start = time.time()
        for n_c, (f1, f2) in enumerate(self.combs):
            name1 = f1 + "_plus_" + f2
            print('current feature %60s %4d in %5.1f'
                  % (name1, n_c + 1, (time.time() - start) / 60), end='')
            print('\r' * 75, end='')
            self.trn[name1] = self.trn[f1].apply(str) + "_" + self.trn[f2].apply(str)
            if self.do_submission:
                self.sub[name1] = self.sub[f1].apply(str) + "_" + self.sub[f2].apply(str)
                self.trn[name1], indexer = pd.factorize(self.trn[name1])
                self.sub[name1] = indexer.get_indexer(self.sub[name1])
            else:
                self.trn[name1], _ = pd.factorize(self.trn[name1])

    def prepare_data(self):
        """Bin continuous features, add combinations and drop unused columns.

        The bin edges are computed over training and submission data together
        when submission data is loaded, and over the training data alone
        otherwise (previously this step failed when ``self.sub`` was ``None``).
        """
        # Determined before the combination columns are added, so those
        # new columns are never dropped.
        noisy_features = list(set(self.trn.columns) - set(self.train_features))
        n_trn = len(self.trn)

        # Bin continuous variables before One-Hot Encoding
        for f in ["ps_reg_03", "ps_car_12", "ps_car_13", "ps_car_14"]:
            if self.sub is not None:
                full_f = pd.concat([self.trn[f], self.sub[f]], axis=0)
            else:
                full_f = self.trn[f]
            full_cut = np.array(pd.cut(full_f, 50, labels=False))
            self.trn[f] = full_cut[:n_trn]
            if self.sub is not None:
                self.sub[f] = full_cut[n_trn:]
            del full_f
            del full_cut

        self.add_combinations()
        # Remove noisy features
        self.trn.drop(noisy_features, axis=1, inplace=True)
        if self.do_submission:
            self.sub.drop(noisy_features, axis=1, inplace=True)

        print(self.trn.columns)

    @staticmethod
    def _make_target_encoder(feature_name):
        """Return the target-average transformer used for one categorical column."""
        return TargetAverageTransformation(feature_name=feature_name,
                                           average=TargetAverageTransformation.MEAN,
                                           min_samples_leaf=200,
                                           smoothing=10,
                                           noise_level=0)

    def predict_oof_and_submission(self):
        """Run the full pipeline: load, prepare, cross-validate, predict, save.

        For each fold the ``*_cat`` columns are target-average encoded on the
        fold's training part, the model is fitted and the class-1
        probabilities of the validation part are stored as OOF predictions.
        Submission predictions are either averaged over the folds
        (``mean_sub=True``) or produced by a model refitted on the full
        training set.  When submitting, OOF and submission predictions are
        written under ``../output_preds/``.

        Raises:
            ValueError: If the model or the metric is not set, or if the
                training data, target or (when submitting) the submission
                data is missing after :meth:`read_data`.
        """
        # Fail fast, before any file is read or any model is fitted.
        if self.model is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if self.metric is None:
            raise ValueError("Metric is not set for class " + str(type(self)))

        self.read_data()
        if self.target is None:
            raise ValueError("Target is not set for class " + str(type(self)))
        if self.trn is None:
            raise ValueError("Training data is not set for class " + str(type(self)))
        if (self.sub is None) and self.do_submission:
            raise ValueError("Submission data is not set for class " + str(type(self)))
        self.prepare_data()

        # Prepare predictors
        oof_preds = np.zeros(len(self.trn))
        sub_preds = None
        if self.sub is not None:
            sub_preds = np.zeros(len(self.sub))
        # Go through folds
        start = time.time()
        # Collected before any "<f>_avg" column exists, so only raw columns match.
        f_cats = [f for f in self.trn.columns if "_cat" in f]
        for i_fold, (trn_idx, val_idx) in enumerate(self._folds.split(self.target, self.target)):
            # Split data
            trn_x, trn_y = self.trn.iloc[trn_idx].copy(), self.target.iloc[trn_idx]
            val_x, val_y = self.trn.iloc[val_idx].copy(), self.target.iloc[val_idx]

            # Compute target averages on the training part of the fold only
            for f in f_cats:
                ft = self._make_target_encoder(f)
                trn_x[f + "_avg"] = ft.fit_transform(data=trn_x, target=trn_y)
                val_x[f + "_avg"] = ft.transform(data=val_x)
                if self.do_submission:
                    self.sub[f + "_avg"] = ft.transform(data=self.sub)

            # Fit model
            self.model.fit(trn_x.values,
                           trn_y.values)
            # Predict OOF
            oof_preds[val_idx] = self.model.predict_proba(val_x.values)[:, 1]

            # Predict SUB if mean is requested
            if (self.sub is not None) and self.mean_submission:
                sub_preds += self.model.predict_proba(self.sub.values)[:, 1] / self._folds.n_splits

            # Print results of current fold
            print("Fold %2d score : %.6f in [%5.1f]"
                  % (i_fold + 1,
                     self.metric(val_y, oof_preds[val_idx]),
                     (time.time() - start) / 60))
            del trn_x
            del val_x
            gc.collect()

        # display OOF result
        oof_score = self.metric(self.target, oof_preds)
        print("Full OOF score : %.6f" % oof_score)

        # Check if we need to fit the model on the full dataset
        if (self.sub is not None) and not self.mean_submission:
            # Compute target averages on the whole training set
            for f in f_cats:
                ft = self._make_target_encoder(f)
                self.trn[f + "_avg"] = ft.fit_transform(data=self.trn, target=self.target)
                self.sub[f + "_avg"] = ft.transform(data=self.sub)
            # Fit model
            self.model.fit(self.trn, self.target)
            # Compute prediction for submission
            sub_preds = self.model.predict_proba(self.sub)[:, 1]

        if self.do_submission:
            filename = "../output_preds/" + self.id
            filename += str(int(1e6 * oof_score)) + "_"
            filename += self.curr_date.strftime("%Y_%m_%d_%Hh%M")

            # Save OOF predictions for stacking
            self.trn[self.id] = oof_preds
            self.trn[[self.id]].to_csv(filename + "_oof.csv", float_format="%.9f")

            # Save submission prediction for stacking or submission
            self.sub["target"] = sub_preds
            self.sub[["target"]].to_csv(filename + "_sub.csv", float_format="%.9f")