def main():
    """Train a LightGBM fraud model on the merged transaction data,
    report train/test KS and AUC on a 70/30 split of the training rows,
    and score the hold-out rows (split == 2) into ``submit6.csv``.

    Side effects: reads ``transaction_new.csv`` and ``submit_disv1.csv``
    and writes ``submit6.csv`` in the working directory.
    """
    transaction = pd.read_csv('transaction_new.csv')
    dis = pd.read_csv('submit_disv1.csv')
    # Attach the externally computed 'score' column by TransactionID.
    transaction_new = pd.merge(transaction,
                               dis[['TransactionID', 'score']],
                               on='TransactionID')
    # All columns except the id, the split marker and the label are features.
    feature = [
        f for f in transaction_new.columns
        if f not in ('TransactionID', 'split', 'isFraud')
    ]
    # LightGBM rejects feature names containing spaces.
    fmap = {f: f.replace(' ', '_') for f in feature}
    transaction_new = transaction_new.rename(columns=fmap)
    data = transaction_new[transaction_new['split'] == 1]
    # BUGFIX: take an explicit copy so assigning the 'isFraud' predictions
    # below writes to a real frame instead of a view of transaction_new
    # (avoids SettingWithCopyWarning / potentially lost writes).
    valid = transaction_new[transaction_new['split'] == 2].copy()

    train, test = train_test_split(data, test_size=0.3, random_state=42)
    train_x = train[list(fmap.values())]
    test_x = test[list(fmap.values())]
    train_y = train['isFraud'].astype('int')
    test_y = test['isFraud'].astype('int')

    clf = LGBMClassifier(
        boosting_type='gbdt',
        colsample_bytree=0.2,
        drop_rate=0.1,
        importance_type='split',
        learning_rate=0.04,
        max_bin=500,
        max_depth=4,
        min_child_samples=50,
        min_split_gain=0.1,
        n_estimators=500,
        n_jobs=-1,
        num_leaves=9,
        objective=None,
        random_state=24,
        reg_alpha=40,
        reg_lambda=10,
        sigmoid=0.4,
        silent=True,
        #class_weight={0:1,1:10},
        #subsample=0.3,
        subsample_for_bin=24000,
        is_unbalance=True,
        subsample_freq=1)
    clf.fit(train_x, train_y)
    # KS and AUC on both splits to gauge overfitting.
    train_y_pred = clf.predict_proba(train_x)[:, 1]
    train_ks = cal_ks_scipy(train_y_pred, train_y)
    y_pred = clf.predict_proba(test_x)[:, 1]
    test_ks = cal_ks_scipy(y_pred, test_y)
    print(train_ks, test_ks)
    tr_auc = metrics.roc_auc_score(train_y, train_y_pred)
    te_auc = metrics.roc_auc_score(test_y, y_pred)
    print(tr_auc, te_auc)

    # Score the hold-out rows in the booster's own feature order.
    valid['isFraud'] = clf.predict_proba(valid[clf._Booster.feature_name()])[:,
                                                                             1]
    valid[['TransactionID', 'isFraud']].to_csv('submit6.csv', index=False)
Ejemplo n.º 2
0
def fit_lgb_model(model_spec, early_stopping_rounds=10):
    """Train the monotonically-constrained LGBMClassifier described by
    ``model_spec``, early-stop on the trailing validation slice, refit on
    the full data with the best iteration count, and save the booster.

    Parameters
    ----------
    model_spec : dict
        Spec with keys ``filename``, ``dataset`` (loader callable),
        ``features`` (name -> column mapping), ``monotone_constraints``
        and ``validation_size``.
    early_stopping_rounds : int, optional
        Patience (and logging period) for early stopping on the first fit.
    """
    spec = model_spec
    print("loading data using", spec["filename"])
    data = spec["dataset"]()
    print("converting all feature columns to float32")
    for column in spec["features"].values():
        data[column] = data[column].astype("float32")
    print(data.describe().T)
    null_targets = data["target"].isna().sum()
    print("dropping", null_targets, "rows with null in target")
    data = data[data["target"].notna()]
    # 0 (unconstrained) for any feature without an explicit constraint.
    constraints = [
        spec["monotone_constraints"].get(name, 0)
        for name in spec["features"].keys()
    ]
    print("monotone constraints:", constraints)
    model = LGBMClassifier(
        n_estimators=5_000,
        num_leaves=11,
        learning_rate=0.01,
        monotone_constraints=constraints,
        monotone_constraints_method="advanced",
    )
    X = data[spec["features"].values()]
    y = data["target"]
    # Later games get weight closer to 1; the earliest get the base factor.
    total_games = data["game_id"].max() + 1
    progress = (data["game_id"] + 1) / total_games
    weights = config.GAME_WEIGHTING_FACTOR + (1 - config.GAME_WEIGHTING_FACTOR) * progress
    holdout = spec["validation_size"]
    X_fit, X_val = X.iloc[:-holdout], X.iloc[-holdout:]
    y_fit, y_val = y.iloc[:-holdout], y.iloc[-holdout:]
    w_fit, w_val = weights.iloc[:-holdout], weights.iloc[-holdout:]
    model.fit(
        X_fit,
        y_fit,
        sample_weight=w_fit,
        eval_set=[(X_val.values, y_val.values)],
        eval_sample_weight=[w_val],
        early_stopping_rounds=early_stopping_rounds,
        verbose=early_stopping_rounds,
    )
    print("refitting model with full dataset")
    # Reuse the early-stopped tree count for the final full-data fit.
    model.set_params(n_estimators=model.best_iteration_)
    model.fit(X, y, sample_weight=weights)
    scores = pd.Series(model.predict_proba(X)[:, 1])
    print("distribution of predictions:")
    print(scores.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]))
    gains = zip(
        model.booster_.feature_name(),
        model.booster_.feature_importance(importance_type="gain"),
    )
    print("feature importance (by gain):")
    for name, gain in sorted(gains, key=lambda pair: -pair[1]):
        print(f"    {name}: {gain}")
    target_path = os.path.join(FILEPATH, "models", spec["filename"])
    print("saving to", target_path)
    model.booster_.save_model(target_path)
Ejemplo n.º 3
0
        def score(params, skf=skf, sample_weight=sample_weight):
            """Hyperopt objective: weighted CV logloss for one LightGBM
            parameter sample.

            ``skf`` and ``sample_weight`` are bound as default arguments
            so the closure captures their values at definition time.

            Returns a dict with ``loss`` (mean CV logloss), hyperopt
            ``status`` and ``localCV_acc`` = (mean, min, max) accuracy.
            """
            # hyperopt samples floats; cast the integer-valued params back.
            params = {"max_depth": int(params["max_depth"]),
                      "subsample": params["subsample"],
                      "colsample_bytree": params['colsample_bytree'],
                      "num_leaves": int(params['num_leaves']),
                      "n_jobs": -2
                      }

            clf = LGBMClassifier(n_estimators=500, learning_rate=0.05, **params)

            list_score_acc = []
            list_score_logloss = []

            for train, val in skf.split(self.X, self.y):
                X_train, X_val = self.X[train], self.X[val]
                y_train, y_val = self.y[train], self.y[val]

                # Per-row weights follow the same fold indices.
                weight_train = sample_weight[train]
                weight_val = sample_weight[val]

                clf.fit(X_train, y_train,
                        sample_weight=weight_train,
                        eval_sample_weight=[weight_val],
                        eval_set=[(X_val, y_val)],

                        eval_metric="logloss",
                        early_stopping_rounds=0,
                        verbose=False
                        )

                # Weighted fold metrics on the held-out fold.
                _score_acc = accuracy_score(y_val, clf.predict(X_val), sample_weight=weight_val)
                _score_logloss = log_loss(y_val, clf.predict_proba(X_val), sample_weight=weight_val)

                list_score_acc.append(_score_acc)
                list_score_logloss.append(_score_logloss)
                """
                ##n_estimaters=0 causes error at .fit()
                if clf.best_iteration_ != -1:
                    list_best_iter.append(clf.best_iteration_)
                else:
                    list_best_iter.append(params['n_estimators'])
                break
                """
            # logger.info("n_estimators: {}".format(list_best_iter))
            # params["n_estimators"] = np.mean(list_best_iter, dtype=int)

            score_acc = (np.mean(list_score_acc), np.min(list_score_acc), np.max(list_score_acc))
            # logger.info("score_acc %s" % np.mean(list_score_acc))

            # score_logloss = (np.mean(list_score_logloss), np.min(list_score_logloss), np.max(list_score_logloss))
            # score_f1 = (np.mean(list_score_f1), np.min(list_score_f1), np.max(list_score_f1))
            # score_auc = (np.mean(list_score_auc), np.min(list_score_auc), np.max(list_score_auc))

            logloss = np.mean(list_score_logloss)
            return {'loss': logloss, 'status': STATUS_OK, 'localCV_acc': score_acc}
Ejemplo n.º 4
0
def fit():
    """Fit the is_close classifier on the train split and report
    validation metrics (accuracy, logloss, classification report and
    per-customer top-k hit rates).

    Drops into pdb at the end for interactive inspection.
    """
    train, validation, _ = train_validation_holdout_split(read('./data/train_set.csv'))

    # Preprocessing pipeline: filters, category encoders and the
    # closeness target derived from transaction vs. work coordinates.
    steps = [
        preprocess,
        russia_only,
        rouble_only,
        with_transaction_location,
        with_job,
        (partial(fit_categories, ['mcc', 'city', 'terminal_id']), transform_categories),
        partial(calc_is_close, ['transaction_lat', 'transaction_lon'], ['work_add_lat', 'work_add_lon'])
    ]

    pipeline, train = fit_pipeline(steps, train)
    validation = pipeline(validation)

    feature_columns = ['mcc', 'city', 'amount', 'terminal_id']
    print(f'Train size: {len(train)}, Validation size: {len(validation)}')
    print(f'Features: {feature_columns}')
    model = LGBMClassifier()
    model.fit(train[feature_columns], train['is_close'])

    predictions = model.predict_proba(validation[feature_columns])
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy_value = accuracy_score(validation['is_close'], predicted_labels)
    logloss_value = log_loss(validation['is_close'], predictions)
    print(f'Accuracy: {accuracy_value:.5f}, Logloss: {logloss_value:.5f}')
    print(classification_report(validation['is_close'], predicted_labels))

    validation['probs'] = predictions[:, 1]

    def hit_rate(k):
        # Share of customers whose k highest-probability transactions
        # contain at least one true positive.
        return validation.groupby('customer_id').apply(
            lambda group: group.sort_values('probs').tail(k).is_close.max()).mean()

    top1_accuracy = hit_rate(1)
    top5_accuracy = hit_rate(5)
    top10_accuracy = hit_rate(10)
    print(f'Top1: {top1_accuracy:.5f}')
    print(f'Top5: {top5_accuracy:.5f}')
    print(f'Top10: {top10_accuracy:.5f}')

    # Per-feature contributions can be dumped for debugging via
    # model._Booster.predict(validation[feature_columns], pred_contrib=True).

    import pdb; pdb.set_trace()
Ejemplo n.º 5
0
def single_model(df_final, train_y, weight=None, metric=None):
    """Fit a 5-fold StratifiedKFold LightGBM over the 33-class 'label'
    column and return the fold-averaged class-probability matrix for the
    test rows.

    NOTE(review): the train slice is df_final[:train_num] but the test
    slice is df_final[test_num:] — confirm train_num == test_num,
    otherwise rows are dropped or duplicated between the two.
    """
    fit_values, test_values = df_final[:train_num], df_final[test_num:]
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    model = LGBMClassifier(learning_rate=0.05, n_estimators=10000, subsample=0.8,
                           subsample_freq=1, colsample_bytree=0.8, random_state=2019)
    test_pred_prob = np.zeros((test_values.shape[0], 33))
    for fold, (fit_idx, val_idx) in enumerate(folds.split(fit_values, train_y['label'])):
        print(fold, 'fold...')
        started = time.time()
        fit_x, fit_y = fit_values[fit_idx], train_y['label'][fit_idx]
        val_x, val_y = fit_values[val_idx], train_y['label'][val_idx]
        # Due amounts aligned to the fold indices (kept for parity; unused here).
        train_amt, val_amt = train_y['due_amt'][fit_idx].values, train_y['due_amt'][val_idx].values
        model.fit(fit_x, fit_y, eval_set=[(fit_x, fit_y), (val_x, val_y)],
                  sample_weight=weight, eval_metric=metric,
                  early_stopping_rounds=100, verbose=5)
        # Average each fold's best-iteration test predictions.
        test_pred_prob += model.predict_proba(
            test_values, num_iteration=model.best_iteration_) / folds.n_splits
        print('runtime: {}\n'.format(time.time() - started))
    print('单模型拟合已完成')
    return test_pred_prob
Ejemplo n.º 6
0
 def single_train(label_list,var_list):
     """Score a single feature by K-fold cross-fitting a univariate
     LightGBM model; returns out-of-fold pvalues (probability of the
     positive class) aligned with the input order, for downstream
     AUC/KS computation.
     """
     print("用该特征单独训练一个模型,得到pvalue,用来计算AUC/KS。K折交叉赋分,比较准确!")
     frame = pd.DataFrame({'label': label_list, 'var': var_list})
     frame['label'] = frame['label'].astype('int64')
     model = LGBMClassifier(
         metric='binary_logloss', is_unbalance=True, random_state=11,
         silent=True, n_jobs=10, reg_alpha=0.3, reg_lambda=0.3,
         learning_rate=0.01, n_estimators=2000, subsample=0.6,
         colsample_bytree=0.3, num_leaves=7, max_depth=3,
         min_child_samples=2000, min_split_gain=0.1, min_child_weight=0.1,
         #,is_training_metric=True, max_bin=255, subsample_for_bin=400, ,objective='binary'
         importance_type='gain',
     )
     # Shuffle before splitting so each fold keeps a similar good/bad
     # ratio; otherwise the per-fold test AUC is unstable.
     splitter = KFold(n_splits=3, shuffle=True)
     # Reset to a 0-based index: kf.split yields positional indices.
     frame = frame.reset_index(drop=True)
     for fit_idx, score_idx in splitter.split(frame):
         model.fit(frame.loc[fit_idx, 'var'].values.reshape(-1, 1),
                   frame.loc[fit_idx, 'label'])
         # Keep the out-of-fold probability of the positive class.
         frame.loc[score_idx, 'pvalue'] = model.predict_proba(
             frame.loc[score_idx, 'var'].values.reshape(-1, 1))[:, 1]
     return frame['pvalue'].tolist()
Ejemplo n.º 7
0
def main(mode, params, model_type):
    """Train the disk-failure LightGBM model for one eval month.

    mode == "test": fit on the full training data (early-stopping against
    the training set itself) and persist the model. Otherwise: fit with
    early stopping on the held-out month using a custom F1 metric,
    persist, rank/evaluate predictions and write a submission plus
    feature importances.

    Side effects: mutates the global ``F1`` accumulator, writes .pkl files
    to the working directory and CSVs under ``../offline/``.
    """

    n_estimators = params["n_estimators"]
    early_stopping_rounds = params["early_stopping_rounds"]
    eval_date = params["eval_date"]
    print("******************** eval_month: %s ********************" %
          eval_date)

    t2 = time.time()
    X_train, y_train, X_test, y_test = get_dataset(eval_date, mode, model_type)
    print("used time:{}m".format((time.time() - t2) // 60))

    gc.collect()

    print(X_train.shape, X_test.shape)
    print(y_train.value_counts())
    print(y_test.value_counts())

    print('************** training **************')
    # class weight
    clf = LGBMClassifier(
        learning_rate=0.01,
        n_estimators=n_estimators,
        num_leaves=127,  # 20, max_depth:5
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2019,
        #scale_pos_weight=50,
        metric=None)

    # Reset the global state used by the custom eval metric.
    F1.clear()

    if mode == "test":
        clf.fit(X_train,
                y_train,
                eval_set=[(X_train, y_train)],
                early_stopping_rounds=early_stopping_rounds,
                verbose=10)
        # NOTE(review): now_time is unused in this branch — confirm the
        # model filename was meant to be timestamped.
        now_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        joblib.dump(clf, 'pakdd_model{}.pkl'.format(model_type))
        return

    # Validation mode: early-stop on the held-out month with the custom F1.
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=lambda y_true, y_pred: [custom_f1_eval(y_true, y_pred)],
        early_stopping_rounds=early_stopping_rounds,
        verbose=10)
    joblib.dump(clf, 'pakdd_model_valid.pkl')

    y_pred = clf.predict_proba(X_test)[:, 1]
    test_sub = TEST_SUB.copy()
    y_ranked = rank_result(y_pred, test_sub, verbose=True)
    evaluate_classification_new(y_ranked, verbose=True)

    file_path = "../offline/"
    now_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    submit = y_ranked[['manufacturer', 'model', 'serial_number', 'dt']]
    print("submit shape", submit.shape)
    submit.to_csv(file_path + "submit_%s.csv" % now_time,
                  index=False,
                  header=None)

    # Persist sorted feature importances for offline analysis.
    feature_importaces = pd.Series(
        clf.feature_importances_,
        index=X_train.columns).sort_values(ascending=False)
    feature_importaces.to_frame().to_csv(file_path + 'lgb_feat_imp.csv',
                                         header=None)
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of 
    stable biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected

    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm

    B : int, optional (default=20)
        Block size for Block HSIC Lasso

    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso

    hsic_splits :  int, optional (default=5)
        number of folds for verifying feature stability

    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in stability check
    """
    def __init__(self,
                 n_features=30,
                 adasyn_neighbors=10,
                 B=20,
                 M=10,
                 hsic_splits=5,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        self.neighbor_threshold = feature_neighbor_threshold

    def fit(self, X, y):
        """Select features stable across stratified splits via Block HSIC
        Lasso, optionally upsample with ADASYN, then fit the LightGBM
        classifier on the selected feature subset.
        """
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(
                self.n_features, B=self.B,
                M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                # BUGFIX: neighbor scores are similarities in [0, 1];
                # casting them to int collapsed everything below 1.0 to 0,
                # making the threshold test below degenerate. Keep floats.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(all_ft_idx)
            # Keep only the features that appear in every split so far.
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # BUGFIX: was len(np.unique(y) - 1) — the elementwise minus
                # leaves the length unchanged; the intent is one extra
                # resampling pass per remaining minority class.
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:
                # Best effort: ADASYN can fail (e.g. too few minority
                # samples); fall back to the unresampled data prepared above.
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        """Class probabilities on the selected feature subset."""
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        """Class labels on the selected feature subset."""
        return self.clf_.predict(X[:, self.hsic_idx_])
Ejemplo n.º 9
0
    del test

    gc.collect()
    clf = LGBMClassifier(
        learning_rate=0.001,
        n_estimators=100,
        num_leaves=127,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2019,
        #     is_unbalenced = 'True',
        metric=None)
    print('************** training **************')
    print(train_df.shape, test_x.shape)
    # NOTE(review): eval_set is the training data itself, so early stopping
    # monitors train AUC — confirm a hold-out set was not intended.
    clf.fit(train_df,
            labels,
            eval_set=[(train_df, labels)],
            eval_metric='auc',
            early_stopping_rounds=10,
            verbose=10)

    sub['p'] = clf.predict_proba(test_x)[:, 1]
    # Flag the top 0.4% highest-scoring rows as predicted failures.
    sub['label'] = sub['p'].rank()
    sub['label'] = (sub['label'] >= sub.shape[0] * 0.996).astype(int)
    submit = sub.loc[sub.label == 1]
    # Submit only the highest-risk day per (serial_number, model); several
    # thresholds were tried online and ~100 rows seemed to work best.
    submit = submit.sort_values('p', ascending=False)
    submit = submit.drop_duplicates(['serial_number', 'model'])
    submit[['manufacturer', 'model', 'serial_number',
            'dt']].to_csv("../sub.csv", index=False, header=None)
    # NOTE(review): bare expression — its value is discarded outside a REPL.
    submit.shape
Ejemplo n.º 10
0
    # (fragment) body of one cross-validation fold: the indices come from
    # the enclosing skf loop; clf, feature_name, prob_oof, amt_oof,
    # test_values and test_pred_prob are defined outside this view.
    trn_x, trn_y = train_values[trn_idx], clf_labels[trn_idx]
    val_x, val_y = train_values[val_idx], clf_labels[val_idx]
    val_repay_amt = amt_labels[val_idx]
    val_due_amt = train_due_amt_df[val_idx]
    val_df = train_df[[
        'listing_id', 'auditing_date', 'due_date', 'due_amt', 'repay_date'
    ]].iloc[val_idx]

    clf.fit(trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            early_stopping_rounds=50,
            verbose=5,
            feature_name=list(feature_name))
    # shape = (-1, 33)
    val_pred_prob_everyday = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_)
    prob_oof[val_idx] = val_pred_prob_everyday
    # Probability assigned to the true class of each validation row.
    val_pred_prob_today = [
        val_pred_prob_everyday[i][val_y[i]]
        for i in range(val_pred_prob_everyday.shape[0])
    ]
    # Predicted repayment = amount due * probability of the true repay day.
    val_pred_repay_amt = val_due_amt * val_pred_prob_today
    print('val rmse:',
          np.sqrt(mean_squared_error(val_repay_amt, val_pred_repay_amt)))
    print('val mae:', mean_absolute_error(val_repay_amt, val_pred_repay_amt))
    print('val new rmse:', new_rmse(val_df, val_pred_prob_everyday))
    amt_oof[val_idx] = val_pred_repay_amt
    # Average this fold's test predictions into the ensemble.
    test_pred_prob += clf.predict_proba(
        test_values, num_iteration=clf.best_iteration_) / skf.n_splits

    print('runtime: {}\n'.format(time.time() - t))
Ejemplo n.º 11
0
        # (fragment) out-of-fold prediction for the first CV split only;
        # cv, x_train, y_train, params and list_score* come from the
        # enclosing scope, which is outside this view.
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv[:1]:
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]
            val_y = y_train[test]

            clf = LGBMClassifier(**params)
            clf.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                verbose=True,
                # eval_metric='logloss',
                early_stopping_rounds=30)
            pred = clf.predict_proba(val_x)[:, 1]
            all_pred[test] = pred

            _score = log_loss(val_y, pred)
            # Negated so lower is better, matching the logloss direction.
            _score2 = -roc_auc_score(val_y, pred)
            # logger.debug('   _score: %s' % _score)
            list_score.append(_score)
            list_score2.append(_score2)
            # best_iteration is -1 when early stopping never triggered.
            if clf.best_iteration != -1:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(params['n_estimators'])
            # break
            with open('train_cv_pred_base.pkl', 'wb') as f:
                pickle.dump(pred, f, -1)
Ejemplo n.º 12
0
        # (fragment) tail of a weighted CV loop; the fold indices, x-slices,
        # params and the score lists come from the enclosing scope.
        trn_y = y_train[train]
        val_y = y_train[test]

        trn_w = sample_weight[train]
        val_w = sample_weight[test]

        clf = LGBMClassifier(**params)
        clf.fit(trn_x, trn_y,
                sample_weight=trn_w,
                eval_sample_weight=[val_w],
                eval_set=[(val_x, val_y)],
                verbose=False,
                # eval_metric='logloss',
                early_stopping_rounds=100
                )
        pred = clf.predict_proba(val_x)[:, 1]
        _score = log_loss(val_y, pred, sample_weight=val_w)
        # Negated so lower is better, matching the logloss direction.
        _score2 = - roc_auc_score(val_y, pred, sample_weight=val_w)
        list_score.append(_score)
        list_score2.append(_score2)
        # best_iteration is -1 when early stopping never triggered.
        if clf.best_iteration != -1:
            list_best_iter.append(clf.best_iteration)
        else:
            list_best_iter.append(params['n_estimators'])
    logger.info('trees: {}'.format(list_best_iter))
    # Use the average early-stopped tree count for subsequent fits.
    params['n_estimators'] = np.mean(list_best_iter, dtype=int)
    score = (np.mean(list_score), np.min(list_score), np.max(list_score))
    score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2))

    logger.info('param: %s' % (params))
    logger.info('loss: {} (avg min max {})'.format(score[use_score], score))
Ejemplo n.º 13
0
            # (fragment) one CV fold using a custom callback and a dummy
            # eval metric; trn_x/val_x, callback, dummy, f1_metric, DIR and
            # cnt come from the enclosing scope, which is outside this view.
            clf = LGBMClassifier(**params)
            clf.fit(
                trn_x,
                trn_y,
                callbacks=[callback],
                # init_score=trn_sc, eval_init_score=[val_sc],
                # sample_weight=trn_w,
                # eval_sample_weight=[val_w],
                eval_set=[(val_x, val_y)],
                verbose=False,
                eval_metric=dummy,  # f1_metric,
                # early_stopping_rounds=50
                #categorical_feature=['o_product_id', 'o_user_id', 'p_aisle_id', 'p_department_id']
            )
            pred = clf.predict_proba(val_x)[:, 1]
            all_pred[test] = pred

            _score = log_loss(val_y, pred)
            _score2 = -roc_auc_score(val_y, pred)
            _, _score3, _ = f1_metric(val_y.astype(int), pred.astype(float))
            logger.debug('   _score: %s' % _score3)
            list_score.append(_score)
            list_score2.append(_score2)
            # F1 negated so lower is better, like the other scores.
            list_score3.append(-1 * _score3)
            if clf.best_iteration != -1:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(params['n_estimators'])

            # NOTE(review): the with-block body is truncated in this view.
            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
Ejemplo n.º 14
0
# (fragment) stratified CV driver for a 6-class model; clf, skf, the data
# arrays and `features` are defined outside this view.
oof = np.zeros((len(train), 6))
predictions = np.zeros((len(test), 6))
feature_importance_df = pd.DataFrame()

for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, clf_labels)):
    print(i, 'fold...')
    trn_x, trn_y = train_values[trn_idx], clf_labels[trn_idx]
    val_x, val_y = train_values[val_idx], clf_labels[val_idx]

    clf.fit(trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50)

    # Out-of-fold probabilities at the early-stopped iteration.
    oof[val_idx] = clf.predict_proba(train_values[val_idx],
                                     num_iteration=clf.best_iteration_)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importances_
    # NOTE(review): "+1" stores the constant 1 for every fold — this was
    # probably meant to be i + 1; confirm before relying on the column.
    fold_importance_df["fold"] = +1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict_proba(
        test_values, num_iteration=clf.best_iteration_) / skf.n_splits

# Evaluation: weighted AUC over one-hot labels (plus accuracy elsewhere).
clf_one_hot = pd.Series(clf_labels)
clf_one_hot = pd.get_dummies(clf_one_hot)
auc = roc_auc_score(clf_one_hot, oof, average='weighted')
Ejemplo n.º 15
0
    def ks_score(kwargs: dict) -> dict:
        """Hyperopt objective: mean KS minus its 95% confidence half-width
        over K-fold CV, negated as the loss.

        Relies on the enclosing scope for ``raw_features``, ``X_raw``,
        ``y``, ``kf``, ``params``, ``model``, ``out_file``, ``i``,
        ``logger`` and ``calc_ks``. Appends one CSV row per evaluation
        to ``out_file``.
        """
        starttime = time.time()

        # Retrieve the subsample if present otherwise set to 1.0
        subsample = kwargs["boosting_type"].get("subsample", 1.0)

        # Extract the boosting type
        kwargs["boosting_type"] = kwargs["boosting_type"]["boosting_type"]
        kwargs["subsample"] = subsample

        # Make sure parameters that need to be integers are integers
        for k in (
                "feature_num",
                "max_depth",
                "num_leaves",
                "n_estimators",
                "min_child_samples",
                "subsample_for_bin",
        ):
            if isinstance(kwargs.get(k), float):
                kwargs[k] = int(kwargs[k])

        ks_list = []
        # feature_num is itself tuned: take a prefix of the raw feature list.
        chosen_feature = raw_features[:kwargs["feature_num"]]
        X = X_raw[chosen_feature]

        logger.info("开始训练1111111111111111")
        logger.info("{}".format(kwargs))

        iter = 0
        estr = {}
        for train_idx, test_idx in kf.split(X):

            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            params.update(kwargs)
            try:
                params.pop("n_estimators")
            except KeyError:
                pass

            if model == "lgb":

                # # Retrieve the subsample if present otherwise set to 1.0
                # subsample = params["boosting_type"].get("subsample", 1.0)
                #
                # # Extract the boosting type
                # params["boosting_type"] = params["boosting_type"]["boosting_type"]
                # params["subsample"] = subsample

                train_data = lgb.Dataset(X_train, y_train, silent=True)

                # Inner CV to pick the boosting-round count by AUC.
                cv_result = lgb.cv(
                    params,
                    train_data,
                    num_boost_round=10000,  # n_estimators
                    early_stopping_rounds=100,
                    nfold=5,
                    seed=555,
                    metrics=
                    "auc",  # Evaluation metrics to be monitored while CV.
                    verbose_eval=False,
                )
                # Boosting rounds that returned the highest cv score
                print('222222222222222')
                n_estimators = int(np.argmax(cv_result["auc-mean"]) + 1)

                logger.info("n_estimators:{}".format(n_estimators))
                params["n_estimators"] = n_estimators
                clf = LGBMClassifier(**params)
                clf.fit(X_train, y_train)
                y_pred_test = clf.predict_proba(X_test)[:, 1]

                iter += 1
                estr[iter] = {n_estimators}

            # elif model == 'xgb':
            #     train_data = xgb.DMatrix(X_train, y_train, silent=False)
            #     cv_result = xgb.cv(params, train_data, num_boost_round=500, nfold=3, metrics='auc',
            #                        early_stopping_rounds=100)
            #     params['n_estimators'] = len(cv_result)
            #     clf = XGBClassifier(**params)
            #     clf.fit(X_train, y_train)
            #     y_pred_test = clf.predict_proba(X_test)[:, 1]
            # elif model == 'adaboost':
            #     clf = AdaBoostClassifier(DecisionTreeClassifier(**kwargs, min_samples_leaf=0.05))
            #     clf.fit(X_train, y_train)
            #     y_pred_test = clf.predict_proba(X_test)[:, 1]
            else:
                raise NotImplementedError("Not implemented!")
            ks_list.append(calc_ks(y_pred_test, y_test, method="crosstab"))
            # model.predict_proba(X_test) =
            # array([[0.1,0.9],   # row judged class 0 with p=0.1, class 1 with p=0.9
            #        [0.8,0.2]])  # row judged class 0 with p=0.8, class 1 with p=0.2
        ks_arr = np.asarray(ks_list)
        score = np.mean(ks_arr) - 1.96 * np.std(ks_arr) / np.sqrt(
            len(ks_arr)
        )  # lower 95% confidence bound: https://www.shuxuele.com/data/confidence-interval.html

        loss = -score
        run_time = time.time() - starttime

        # Write to the csv file ('a' means append)
        # NOTE(review): of_connection is never closed/flushed here — rows
        # may stay buffered; confirm whether this is intentional.
        of_connection = open(out_file, "a")
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, estr, run_time, i])

        return {
            "loss": loss,
            "params": params,
            "estimators": i,
            "train_time": run_time,
            "status": STATUS_OK,
        }
Ejemplo n.º 16
0


for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, clf_labels_2)):#特征数据,标签

    print(i, 'fold...')

    t = time.time()

    trn_x, trn_y = train_values.values[trn_idx], clf_labels_2[trn_idx]#训练集

    val_x, val_y = train_values.values[val_idx], clf_labels_2[val_idx]#测试集

    clf.fit(

        trn_x, trn_y,

        eval_set=[(trn_x, trn_y), (val_x, val_y)],

        early_stopping_rounds=100, verbose=5

    )#交叉验证进行训练

    # shepe = (-1, 33)


    test_pred_prob += clf.predict_proba(test_values.values, num_iteration=clf.best_iteration_) / skf.n_splits#每个测试集样本的33个类别概率

    joblib.dump(clf, '../data/paipaidai_binary_%d.pkl'%i)

    print('runtime: {}\n'.format(time.time() - t))
Ejemplo n.º 17
0
    # (fragment) per-target training loop body; trn_x, val_x, cols, y,
    # uauc_list, r_list, y_list and uAUC come from outside this view.
    print('=========', y, '=========')
    t = time.time()
    clf = LGBMClassifier(learning_rate=0.05,
                         n_estimators=5000,
                         num_leaves=63,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         random_state=2021,
                         metric='None')
    clf.fit(trn_x[cols],
            trn_x[y],
            eval_set=[(val_x[cols], val_x[y])],
            eval_metric='auc',
            early_stopping_rounds=100,
            verbose=50)
    val_x[y + '_score'] = clf.predict_proba(val_x[cols])[:, 1]
    # Per-user AUC averaged over users — the competition metric.
    val_uauc = uAUC(val_x[y], val_x[y + '_score'], val_x['userid'])
    uauc_list.append(val_uauc)
    print(val_uauc)
    r_list.append(clf.best_iteration_)
    print('runtime: {}\n'.format(time.time() - t))
# Competition weighting of the four targets.
weighted_uauc = 0.4 * uauc_list[0] + 0.3 * uauc_list[1] + 0.2 * uauc_list[
    2] + 0.1 * uauc_list[3]
print(uauc_list)
print(weighted_uauc)
##################### full-data retraining #####################
r_dict = dict(zip(y_list[:4], r_list))
for y in y_list[:4]:
    print('=========', y, '=========')
    t = time.time()
    # NOTE(review): this call is truncated in this view.
    clf = LGBMClassifier(learning_rate=0.05,
Ejemplo n.º 18
0
# (fragment) decision-surface plot over a 2-D feature grid; x_train,
# y_train, x_test, y_test and the fitted `lgb` model come from outside.
x_min, x_max = x_train[:, 0].min() - .5, x_train[:, 0].max() + .5
y_min, y_max = x_train[:, 1].min() - .5, x_train[:, 1].max() + .5
h = .02  # mesh step size
xx, yy = np.meshgrid(np.arange(x_min,x_max,h),
                     np.arange(y_min,y_max,h))
print(xx)
cm = plt.cm.RdBu
cm_bright = plt.cm.RdBu
ax = plt.subplot(1,1,1)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

# Training points (solid) and test points (translucent).
ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=cm_bright,
           edgecolors='k')
ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=cm_bright,
           alpha = .6, edgecolors='k')




# Positive-class probability at every grid point.
z = lgb.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

z=z.reshape(xx.shape)

ax.contourf(xx, yy, z, cmap=cm, alpha=.8)



plt.show()
Ejemplo n.º 19
0
                     metric=None)
# (fragment) the LGBMClassifier(...) call above and the final clf.fit(...)
# below are truncated in this view.

print('************** training **************')
clf.fit(train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        categorical_feature=cate_cols,
        early_stopping_rounds=200,
        verbose=50)
print('runtime:', time.time() - t)

print('************** validate predict **************')
# Capture the early-stopped round count / AUC from the validation fit.
best_rounds = clf.best_iteration_
best_auc = clf.best_score_['valid_0']['auc']
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

print(
    '=============================================== training predict ==============================================='
)
# Refit on the full data with n_estimators fixed to the best round count.
clf = LGBMClassifier(learning_rate=0.01,
                     n_estimators=best_rounds,
                     num_leaves=255,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     random_state=2019)

print('************** training **************')
clf.fit(train_df,
Ejemplo n.º 20
0
                                                    random_state=42)
# (fragment) the train/test split call above is truncated in this view.

print('Train classifier...')
clf = LGBMClassifier(boosting_type='gbdt',
                     objective='binary',
                     max_depth=-1,
                     num_leaves=2**7 - 1,
                     learning_rate=0.01,
                     n_estimators=2000,
                     min_split_gain=0.0,
                     min_child_weight=0.001,
                     subsample=0.8,
                     colsample_bytree=0.7,
                     random_state=888)

clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='logloss',
    early_stopping_rounds=100,
    verbose=50,
)
########################################

print('Predict...')
# Submit the positive-class probability over the TF-IDF test matrix.
pred = clf.predict_proba(tfidf_test)
make_submission(pred[:, 1])

print('Complete')
Ejemplo n.º 21
0
                                                    y,
                                                    random_state=0,
                                                    test_size=0.2)
# (fragment) the split call above is truncated; dirPath comes from outside.
#%% time
clf = LGBMClassifier(num_leaves=80,
                     objective='binary',
                     max_depth=30,
                     learning_rate=0.01,
                     min_child_samples=20,
                     random_state=2021,
                     n_estimators=1000,
                     subsample=0.9,
                     colsample_bytree=0.9)
clf.fit(X_train, y_train)
# %%
# Positive-class probability is the last column of predict_proba.
pred_y = clf.predict_proba(X_train)[:, -1]
print("train:", roc_auc_score(y_train, pred_y))

pred_y = clf.predict_proba(X_test)[:, -1]
print("test:", roc_auc_score(y_test, pred_y))

# %%
import tools
#%%
test = pd.read_csv(f"{dirPath}rawData/testB.csv")
test = tools.preprocess(test, savePath=f"{dirPath}data/test_v2.pkl")
#%%
test_sub = pd.read_csv(f"{dirPath}rawData/sample_submit.csv")
pred_y = clf.predict_proba(test)
test_sub['isDefault'] = pred_y[:, -1]
test_sub.to_csv(f"{dirPath}submits/subMay5-12.csv", index=False)
Ejemplo n.º 22
0
    val_x, val_y = train_values.values[val_idx], clf_labels[val_idx]  # validation fold: features and class labels

    val_repay_amt = amt_labels[val_idx]  # actual repayment amounts for the validation fold

    val_due_amt = train_due_amt_df.iloc[val_idx]  # amounts due for the validation fold

    clf.fit(trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            early_stopping_rounds=100,
            verbose=5)  # train this CV fold with early stopping on the validation split

    # shape = (-1, 33)

    val_pred_prob_everyday = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_
    )  # predictions of shape (-1, 33): each row holds the probability of each of the 33 classes

    prob_oof[val_idx] = val_pred_prob_everyday  # store this fold's predictions at the matching out-of-fold positions

    val_pred_prob_today = [
        val_pred_prob_everyday[i][val_y[i]]
        for i in range(val_pred_prob_everyday.shape[0])
    ]  # for sample i, pick the probability the model assigned to its TRUE class val_y[i]

    val_pred_repay_amt = val_due_amt[
        'due_amt'].values * val_pred_prob_today  # predicted repayment on the actual day = P(actual repayment day) * amount due

    print(
        'val rmse:',
        np.sqrt(mean_squared_error(
Ejemplo n.º 23
0
    early_stopping_rounds=200,
    verbose=200,
)

# Round count selected by early stopping on the preceding validation run.
best_iter = clf.best_iteration_

X_trn, Y_trn = df_train[feats], df_train.label
X_sub = df_test[feats]

# Refit on the full training data with the tuned round count.
refit_params = dict(
    objective='binary',
    learning_rate=0.05,
    n_estimators=best_iter,
    num_leaves=63,
    subsample=0.6,
    colsample_bytree=0.6,
    random_state=2020,
    n_jobs=32,
)
clf = LGBMClassifier(**refit_params)
clf.fit(X_trn, Y_trn, verbose=200)

sub = clf.predict_proba(X_sub)[:, 1]
imp = clf.feature_importances_

# Write the submission scores and the per-feature importance table.
submission = pd.DataFrame({
    'id': df_test.id,
    'probability': sub.astype('float32'),
})
submission.to_csv('submission_1.csv', index=None)

importance = pd.DataFrame({
    'feat': feats,
    'imp': imp,
})
importance.to_csv('feat_imp_1.csv', index=None)
Ejemplo n.º 24
0
def _cv_logloss(cv, features, labels, params, verbose):
    """Run one stratified-CV pass with the given LightGBM params.

    Returns (mean log-loss, max log-loss, classifier from the last fold).
    """
    scores = []
    clf = None
    for trn_idx, val_idx in cv.split(features, labels):
        trn_x, val_x = features[trn_idx], features[val_idx]
        trn_y, val_y = labels[trn_idx], labels[val_idx]

        clf = LGBMClassifier(**params)
        clf.fit(trn_x, trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                early_stopping_rounds=300)
        scores.append(log_loss(val_y, clf.predict_proba(val_x)[:, 1]))
    return np.mean(scores), np.max(scores), clf


def train_lightgbm(verbose=True, seed=0):
    """Train a boosted tree with LightGBM.

    Runs the same 5-fold stratified CV over the module-level feature sets
    ``x`` and ``x2`` (shared labels ``y``), then logs both mean log-losses
    and their gap. Returns the classifier fitted on the last fold of the
    ``x2`` pass, matching the original behaviour.
    """
    all_params = {'max_depth': [3],
                  'learning_rate': [0.06],
                  'n_estimators': [1500],
                  'min_child_weight': [0],
                  'subsample': [1],
                  'colsample_bytree': [0.5],
                  'boosting_type': ['gbdt'],
                  'seed': [2261]
                  }
    # Every hyper-parameter has a single candidate, so the grid holds exactly
    # one combination; take it explicitly instead of relying on the loop
    # variable leaking out of a pass-only `for` loop as the original did.
    params = list(ParameterGrid(all_params))[-1]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    score, max_score, _ = _cv_logloss(cv, x, y, params, verbose)
    score2, max_score2, clf = _cv_logloss(cv, x2, y, params, verbose)

    logger.info('seed: %s, score: %s %s %s %s' % (seed, score, score2, score - score2, max_score - max_score2))

    return clf
Ejemplo n.º 25
0
def train_lightgbm(verbose=True, idx=0):
    """Train a boosted tree with LightGBM on slice ``idx`` of each scan's features.

    Loads the ``idx``-th feature vector for every patient that has one
    (scans with fewer than ``idx + 1`` slices are skipped), runs 5-fold
    stratified CV with a fixed LightGBM configuration, logs the mean
    log-loss, and returns the classifier from the last fold.
    """
    logger.info("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)

    data = []
    use_idx = []
    for id in df['id'].tolist():
        nd = np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id)))
        # Scan has no idx-th slice -> no feature vector for this patient.
        if nd.shape[0] <= idx:
            continue
        data.append(nd[idx])
        use_idx.append(id)

    x = np.array(data)
    # to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    y = df[df['id'].isin(use_idx)]['cancer'].to_numpy()

    # Fixed configuration. The original wrapped this in a ParameterGrid loop
    # that immediately overwrote `params` with this dict and broke after one
    # pass, so the grid (and the unused train_test_split / min_score bookkeeping)
    # was dead code and has been removed.
    params = {
        'min_child_weight': 0,
        'num_leaves': 10,
        'learning_rate': 0.06,
        'subsample': 0.99,
        'max_depth': 5,
        'boosting_type': 'gbdt',
        'seed': 2261,
        'colsample_bytree': 0.5,
        'n_estimators': 1500
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    list_score = []
    clf = None
    for train, test in cv.split(x, y):
        trn_x = x[train]
        val_x = x[test]
        trn_y = y[train]
        val_y = y[test]

        clf = LGBMClassifier(**params)
        clf.fit(
            trn_x,
            trn_y,
            eval_set=[(val_x, val_y)],
            verbose=verbose,
            early_stopping_rounds=300)
        list_score.append(log_loss(val_y, clf.predict_proba(val_x)[:, 1]))
    score = np.mean(list_score)
    # The sklearn wrapper exposes the early-stopped round count as
    # best_iteration_ (trailing underscore); plain best_iteration raises
    # AttributeError. The rest of this file already uses the underscore form.
    params['n_estimators'] = clf.best_iteration_
    logger.info('idx:%s, score: %s' % (idx, score))

    return clf
Ejemplo n.º 26
0
                     metric=None)

print('************** training **************')
# Fit with AUC-based early stopping on the held-out validation split;
# categorical columns are passed natively to LightGBM.
clf.fit(train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        categorical_feature=cate_cols,
        early_stopping_rounds=200,
        verbose=50)
print('runtime:', time.time() - t)

print('************** validate predict **************')
# Capture the early-stopped round count and best validation AUC,
# then score the validation split and record feature importances.
best_rounds = clf.best_iteration_
best_auc = clf.best_score_['valid_0']['auc']
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

print(
    '=============================================== training predict ==============================================='
)
# Fresh classifier for the full-data refit, reusing the tuned round count.
clf = LGBMClassifier(learning_rate=0.01,
                     n_estimators=best_rounds,
                     num_leaves=255,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     random_state=2019)

print('************** training **************')
clf.fit(train_df,
Ejemplo n.º 27
0
	'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat','ps_car_09_cat','ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat']
	
    for f in encode_feature:
        tempDfTrain,tempDfTest = cat_feature_encoder(tempDfTrain,tempDfTest,f)
	
    tempDfTrain=tempDfTrain.drop(encode_feature,axis=1)
    tempDfTest=tempDfTest.drop(encode_feature,axis=1)
	
    # Fold-local train/validation matrices ('id' and 'target' are not features).
    X_train = tempDfTrain.drop(['id','target'],axis=1)
    y_train = y[train_index]
    X_valid = tempDfTest.drop(['id','target'],axis=1)
    y_valid = y[test_index]
    LGB = LGBMClassifier(n_estimators=2000,max_depth=8,learning_rate=0.01,subsample=0.7,colsample_bytree=0.7,min_child_weight=50)
    eval_set=[(X_valid, y_valid)]
    # Early stopping on the custom gini metric; stops after 100 stale rounds.
    LGB.fit(X_train, y_train,early_stopping_rounds=100,eval_metric=gini_lgb_used,eval_set=eval_set,verbose=100)
    # NOTE(review): sklearn's LGBMClassifier exposes best_iteration_ (trailing
    # underscore); plain `best_iteration` may not exist on this wrapper — confirm.
    pred = LGB.predict_proba(X_valid,num_iteration=LGB.best_iteration)[:,1]


    # Record out-of-fold predictions and this fold's validation gini.
    cv_vaild_.iloc[test_index] =pred
    score = gini_c(pred,y_valid)
    print('valid-gini:'+ str(score))
    score_list.append(score)
    # Test-set predictions are accumulated in log space and averaged over folds.
    p_test = LGB.predict_proba(test,num_iteration=LGB.best_iteration)[:,1]
    p_test = np.log(p_test)
    sub['model_lgb_submit'] += p_test/kfold
    
# Mean validation gini across the CV folds.
print('avg-valid-gini:'+ str(sum(score_list)/kfold))

# Fold predictions were accumulated in log space; exponentiate back before saving.
sub['target'] = np.e**sub['model_lgb_submit']
sub.to_csv('../output/model_lgb_submit.csv', index=False, float_format='%.5f')

# Single model tuned via the project's CV helper.
base_params = dict(
    boosting_type='gbdt',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary',
    n_jobs=4,
    random_state=66,
)
lgb1 = LGBMClassifier(**base_params)
lgbfit(lgb1, X_train, y_train, useTrainCV=True)

raw_scores = lgb1.predict_proba(X_test)[:, 1]
test_target = (np.exp(raw_scores) - 1.0).clip(0, 1).tolist()
sub = pd.DataFrame([test_id, test_target]).transpose()
sub.columns = ['id', 'target']
sub['id'] = sub['id'].astype('int32')
sub.to_csv('submission.csv', index=False, float_format='%.5f')

###############################Bagging#####################################
bag = BaggingClassifier(lgb1, max_samples=0.8, max_features=0.8)
bag.fit(X_train, y_train)

test_target = bag.predict_proba(X_test)[:, 1].tolist()
sub = pd.DataFrame([test_id, test_target]).transpose()
sub.columns = ['id', 'target']
sub['id'] = sub['id'].astype('int32')
sub.to_csv('submission.csv', index=False, float_format='%.5f')
Ejemplo n.º 29
0
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM, grid-searching by 5-fold CV log-loss.

    Mean-pools each scan's per-slice feature maps into one flat vector,
    scores every hyper-parameter combination by the cross-validated mean
    log-loss, then refits the best combination on all data and returns
    that classifier.
    """
    logger.info("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    # Mean-pool each scan's per-slice feature maps into a single flat vector.
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))),
                axis=0).flatten() for id in df['id'].tolist()
    ])

    # to_numpy() replaces Series.as_matrix(), which was removed in pandas 1.0.
    y = df['cancer'].to_numpy()

    all_params = {
        'max_depth': [5, 10, 20, 50],
        'learning_rate': [0.06, 0.1],
        'n_estimators': [1500],
        'min_child_weight': [0, 0.1],
        'subsample': [0.99, 0.8],
        'colsample_bytree': [0.8, 0.5, 1],
        'boosting_type': ['gbdt'],
        'num_leaves': [10, 21, 50],
        'seed': [2261]
    }
    min_score = 100
    min_params = None
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for params in ParameterGrid(all_params):
        list_score = []
        for train, test in cv.split(x, y):
            trn_x = x[train]
            val_x = x[test]
            trn_y = y[train]
            val_y = y[test]

            clf = LGBMClassifier(**params)
            clf.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                early_stopping_rounds=300)
            list_score.append(log_loss(val_y, clf.predict_proba(val_x)[:, 1]))
        # Select on the cross-validated mean. The original immediately
        # overwrote this with the LAST fold's log-loss, so model selection
        # silently ignored four of the five folds.
        score = np.mean(list_score)
        # sklearn wrapper attribute is best_iteration_ (trailing underscore);
        # plain best_iteration raises AttributeError.
        params['n_estimators'] = clf.best_iteration_

        logger.info('param: %s' % (params))
        logger.info('score: %s' % score)
        if min_score > score:
            min_score = score
            min_params = params
        logger.info('best score: %s' % min_score)
        logger.info('best_param: %s' % (min_params))

    # Refit the winning configuration on all of the data.
    clf = LGBMClassifier(**min_params)
    clf.fit(x, y)

    return clf
Ejemplo n.º 30
0
def _top_k_hit_rate(validation, k):
    """Share of customers whose k highest-probability rows include a close transaction."""
    return validation.groupby('customer_id').apply(
        lambda group: group.sort_values('probs').tail(k).is_close.max()).mean()


def fit(objective):
    """Train an ``is_close`` classifier for the work or home location objective.

    Builds the preprocessing pipeline on the training split, fits a default
    LGBMClassifier on the engineered feature columns, prints validation
    metrics (accuracy, log-loss, classification report, per-customer top-k
    hit rates) and returns ``(model, pipeline, feature_columns)``.
    """
    if objective == 'work':
        target_columns = ['work_add_lat', 'work_add_lon']
    else:
        target_columns = ['home_add_lat', 'home_add_lon']

    train, validation, _ = train_validation_holdout_split(
        read('./data/train_set.csv'))

    steps = [
        preprocess, russia_only, rouble_only, with_transaction_location,
        partial(with_columns, target_columns), cluster, merge_cluster_features,
        (partial(fit_categories, ['mcc']), transform_categories),
        partial(calc_is_close, ['transaction_lat', 'transaction_lon'],
                target_columns)
    ]

    pipeline, train = fit_pipeline(steps, train)
    validation = pipeline(validation)

    feature_columns = [
        # Original transaction features
        'amount',
        'mcc',
        # Cluster features
        'amount_hist_-1.0',
        'amount_hist_-2.0',
        'amount_hist_0.0',
        'amount_hist_1.0',
        'amount_hist_2.0',
        'amount_hist_3.0',
        'amount_hist_4.0',
        'amount_hist_5.0',
        'amount_hist_6.0',
        'amount_ratio',
        'area',
        'cluster_id',
        'date_ratio',
        'day_hist_0',
        'day_hist_1',
        'day_hist_2',
        'day_hist_3',
        'day_hist_4',
        'day_hist_5',
        'day_hist_6',
        'mcc_hist_4111.0',
        'mcc_hist_5261.0',
        'mcc_hist_5331.0',
        'mcc_hist_5411.0',
        'mcc_hist_5499.0',
        'mcc_hist_5541.0',
        'mcc_hist_5691.0',
        'mcc_hist_5812.0',
        'mcc_hist_5814.0',
        'mcc_hist_5912.0',
        'mcc_hist_5921.0',
        'mcc_hist_5977.0',
        'mcc_hist_6011.0',
        'mcc_hist_nan',
        'transaction_ratio'
    ]

    print(f'Train size: {len(train)}, Validation size: {len(validation)}')
    print(f'Features: {feature_columns}')
    model = LGBMClassifier()
    model.fit(train[feature_columns], train['is_close'])

    predictions = model.predict_proba(validation[feature_columns])
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy_value = accuracy_score(validation['is_close'], predicted_labels)
    logloss_value = log_loss(validation['is_close'], predictions)
    print(f'Accuracy: {accuracy_value:.5f}, Logloss: {logloss_value:.5f}')
    print(classification_report(validation['is_close'], predicted_labels))

    validation['probs'] = predictions[:, 1]
    # Per-customer ranking quality: does the top-k scored set contain a close point?
    for k in (1, 5, 10):
        print(f'Top{k}: {_top_k_hit_rate(validation, k):.5f}')

    # A leftover `import pdb; pdb.set_trace()` debugger breakpoint that halted
    # the function before returning has been removed.
    return model, pipeline, feature_columns