Exemple #1
0
def lgb_run(param):
    """Train a ``BinaryLGB`` model on the first stratified fold of a dataset.

    The dataset stored at ``param["train_file"]`` is opened through
    ``HDFDataSet``; its ``'label'`` entry is binarised with ``== 1`` and the
    first of ``param["n_fold"]`` stratified splits is used as the
    train/validation partition. Two ``ModelMetric`` objects (validation and
    training) are attached to the model before fitting.

    Args:
        param: Configuration mapping. Must provide ``"train_file"``,
            ``"model_root"`` and ``"n_fold"``; the whole mapping is also
            forwarded unchanged to ``BinaryLGB.fit``.

    Returns:
        Whatever ``BinaryLGB.fit(param, X_tr, label_tr)`` returns.

    Raises:
        KeyError: If ``param`` is a dict and one of the required keys is
            missing.
    """
    import os
    import pprint

    from sklearn.model_selection import StratifiedKFold

    from PPMoney.core.data import HDFDataSet
    from PPMoney.core.model import BinaryLGB
    from PPMoney.core.model.metrics import ModelMetric

    pprint.pprint(param)

    file_tr = param["train_file"]
    model_root = param["model_root"]
    n_fold = param["n_fold"]

    dataset_load = HDFDataSet(file_tr, chunk_size=2048)

    X, y = dataset_load['feature'], dataset_load['label']
    label = y == 1
    print(f"X.shape, label.shape: {X.shape, label.shape}")

    # `shuffle` is left at its default (False), so the split is already
    # deterministic. Passing `random_state` together with shuffle=False has
    # no effect on the folds and raises ValueError on scikit-learn >= 0.24,
    # hence no seed is given here.
    skf = StratifiedKFold(n_splits=n_fold)
    # Only the first fold is used: one train/validation partition.
    train_index, valid_index = next(skf.split(X, label))

    X_tr, X_v = X[train_index], X[valid_index]
    label_tr, label_v = label[train_index], label[valid_index]

    # Creates missing parent directories too and tolerates the directory
    # already existing (no check-then-create race).
    os.makedirs(model_root, exist_ok=True)

    # NOTE(review): the metric name is spelled "AUC" for validation and "auc"
    # for training; kept exactly as found — confirm ModelMetric treats the
    # name case-insensitively.
    v_metric = ModelMetric(feature_t=X_v,
                           label_t=label_v,
                           name="VA",
                           metrics=["AUC"],
                           minimize=False)
    tr_metric = ModelMetric(feature_t=X_tr,
                            label_t=label_tr,
                            name="TR",
                            metrics=["auc"],
                            minimize=False)

    model = BinaryLGB(
        model_root=model_root,
        # Several metrics may be supplied; the first one is the reference
        # used for model selection.
        model_metric=[v_metric, tr_metric],
        model_name=None  # for random model name
    )
    return model.fit(param, X_tr, label_tr)
# Script cell: fill missing values, extract raw arrays, build id/target
# frames, and store the train/test data through HDFDataSet.
# NOTE(review): X_train, X_test, y_train, X_train_raw, X_test_raw, data_path
# and `os` are not defined in this excerpt — presumably set up by earlier
# cells; confirm.

# Replace missing values with the sentinel -1 in both feature tables.
X_train = X_train.fillna(-1)
X_test = X_test.fillna(-1)

# Keep only the underlying `.values` of the features and labels.
X_0 = X_train.values
y_0 = y_train.values
X_1 = X_test.values

# One frame per split holding the 'id' column plus a 'target' column
# initialised to 0.
# NOTE(review): these look like placeholders for predictions — confirm.
sub = X_test_raw['id'].to_frame()
sub['target'] = 0
sub_train = X_train_raw['id'].to_frame()
sub_train['target'] = 0

# %% Goal: store the feature-engineered dataset used by the 1st-place solution as h5 via HDFDataSet
from PPMoney.core.data import HDFDataSet

# Train set: labels and features are added under the keys 'label' and 'feature'.
dataset_tr = HDFDataSet(os.path.join(data_path, 'mjahrer_1st_train.dataset'),
                        chunk_size=2048)
dataset_tr.add({'label': y_0, 'feature': X_0})
# Test set: features only.
dataset_t = HDFDataSet(os.path.join(data_path, 'mjahrer_1st_test.dataset'),
                       chunk_size=2048)
dataset_t.add({'feature': X_1})

# Read the dataset back from file
dataset_load = HDFDataSet(os.path.join(data_path, 'mjahrer_1st_train.dataset'),
                          chunk_size=2048)
# Bare key lookups; the results are not bound to any name.
dataset_load['feature']
dataset_load['label']

# %% Process the non-bin data with RankGauss and save it as h5
import os
import numpy as np
import pandas as pd
Exemple #3
0
# Script cell: seed NumPy, set up the stratified folds, and load the stored
# train/test datasets together with the raw CSV ids.
# NOTE(review): `data_path` and `base_path` are not defined in this excerpt —
# presumably set by an earlier cell; confirm.
import os  # used by os.path.join below; re-importing is harmless
from collections import Counter
import numpy as np
np.random.seed(20)  # fix NumPy's global RNG seed for this script
import pandas as pd
from PPMoney.core.data import HDFDataSet
# NOTE(review): a top-level `set_random_seed` exists only in TensorFlow 1.x
# (TF 2.x exposes `tf.random.set_seed`) — confirm the pinned TF version.
from tensorflow import set_random_seed

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
nfold = 5
# `shuffle` stays at its default (False), so the folds are deterministic.
# Passing `random_state` with shuffle=False does not change the folds and
# raises ValueError on scikit-learn >= 0.24, so no seed is passed.
skf = StratifiedKFold(n_splits=nfold)

# Data loading & preprocessing

dataset_tr = HDFDataSet(os.path.join(data_path, 'mjahrer_1st_train.dataset'),
                        chunk_size=2048)
dataset_t = HDFDataSet(os.path.join(data_path, 'mjahrer_1st_test.dataset'),
                       chunk_size=2048)
# Features are read from the 'dae_hidden_feature' entry of each dataset.
# NOTE(review): presumably hidden-layer outputs of a denoising autoencoder —
# confirm against the cell that writes this key.
X_0 = dataset_tr['dae_hidden_feature']
y_0 = dataset_tr['label']
X_1 = dataset_t['dae_hidden_feature']
print(f'shapes of X_0, X_1: {X_0.shape, X_1.shape}')

# The raw CSVs are read only for their 'id' column here; each id frame gets a
# 'target' column initialised to 0.
X_train_raw = pd.read_csv(base_path + 'train.csv')
X_test_raw = pd.read_csv(base_path + 'test.csv')
sub = X_test_raw['id'].to_frame()
sub['target'] = 0
sub_train = X_train_raw['id'].to_frame()
sub_train['target'] = 0

# %% The previous cell preprocessed the data. In this cell, the data is fed into one model