import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

# load the data into a DataFrame
df = pd.read_csv('https://query.data.world/s/67p5gkjye5vocfiqm2cuxnrkx4ijim')
# print the first five rows
df.head()
# get a basic summary of the columns
df.info()

# fill missing values in the 3P% column with 0
df['3P%'] = df['3P%'].fillna(0)

# check the class balance of the target
df['TARGET_5Yrs'].value_counts().plot.bar()

# separate the target from the features
y = df['TARGET_5Yrs']
X = df.drop(['TARGET_5Yrs', 'Name'], axis=1)

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# create a LightGBM classifier with default parameters
clf = lgb.LGBMClassifier()

# train the classifier and evaluate accuracy on the test split
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print(acc)
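# If the target distribution turns out to be imbalanced, plain accuracy can be
# misleading; a hedged follow-up sketch that also reports ROC AUC on the same
# held-out split (uses the clf/X_test/y_test defined above):
from sklearn.metrics import roc_auc_score

proba = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
print('ROC AUC:', roc_auc_score(y_test, proba))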
Example No. 2
        y_val = y_train[val_idx]
        X_test_ = X_test.copy()
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)
        with timer('target encoding'):
            cat_cols = X_train.select_dtypes(['object']).columns.tolist()
            te = TargetEncoder(cols=cat_cols)
            X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols],
                                                      y_trn)
            X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
            X_test_.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])
            X_trn = X_trn.fillna(-9999)
            X_val = X_val.fillna(-9999)
            X_test_ = X_test_.fillna(-9999)

        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn,
                      y_trn,
                      eval_set=[(X_trn, y_trn), (X_val, y_val)],
                      **fit_params)

        p = model.predict_proba(X_val)[:, 1]
        val_series.iloc[val_idx] = p
        cv_results.append(roc_auc_score(y_val, p))
        test_df[i] = model.predict_proba(X_test_)[:, 1]
        feat_df[i] = model.feature_importances_

val_df = pd.DataFrame({
    'TARGET': y_train,
    'p': val_series
})
val_df.to_csv(OUTPUT / f'{NAME}_cv_pred.csv', index=False)
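# The fold loop above is cut off at the top; a sketch of the scaffolding it
# appears to assume (cv, val_series, test_df, feat_df and the train/test
# frames are assumptions inferred from how they are used inside the loop):
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val_series = pd.Series(0.0, index=X_train.index)   # out-of-fold predictions
test_df = pd.DataFrame(index=X_test.index)         # per-fold test predictions
feat_df = pd.DataFrame(index=X_train.columns)      # per-fold feature importances
cv_results = []

for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    X_trn, y_trn = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_val = X_train.iloc[val_idx]
    # ... the loop body shown above goes here ...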
Example No. 3
# -*- coding: utf-8 -*-
# @Time    : 2018/5/9 19:34
# @Author  : LeonHardt
# @File    : predictor_lgm.py

import os
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from score import score_roc
import joblib  # sklearn.externals.joblib has been removed from recent scikit-learn

x_train = np.loadtxt(os.getcwd()+'/data_error/x_train_error93.txt', delimiter=',')
y_label = np.loadtxt(os.getcwd()+'/data_error/y_train_error93.txt', delimiter=',')
x_test = np.loadtxt(os.getcwd()+'/data_error/x_test_error93.txt', delimiter=',')
X_train, X_test, y_train, y_test = train_test_split(x_train, y_label, test_size=0.20, random_state=314)
# print(x_train_sample.shape)
gbm = lgb.LGBMClassifier(n_estimators=4000, learning_rate=0.05, objective='binary', is_unbalance=True,
                         colsample_bytree=0.8665631328558623,
                         min_child_samples=122, num_leaves=48, reg_alpha=2, reg_lambda=50,
                         subsample=0.7252600946741159, scale_pos_weight=2)

fit_params = {'early_stopping_rounds': 30,
              'eval_metric': 'auc',
              'eval_set': [(X_test, y_test)],
              'eval_names': ['valid'],
              'verbose': 100}
gbm.fit(X_train, y_train, **fit_params)

prob = gbm.predict_proba(x_test)
np.savetxt(os.getcwd()+"/prediction/lgb4000_error_kaggle.txt", prob, delimiter=',')
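# joblib is imported above but never used; if the intent was to persist the
# fitted model for later scoring, a minimal sketch (the file name is an assumption):
joblib.dump(gbm, os.getcwd() + '/prediction/lgb4000_error_kaggle.model')
# later: gbm = joblib.load(os.getcwd() + '/prediction/lgb4000_error_kaggle.model')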
Example No. 4
def complex_lightgbm():
    import lightgbm
    return lightgbm.LGBMClassifier(max_depth=5,
                                   num_leaves=11,
                                   class_weight='balanced')
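# A quick usage sketch for the factory above, fitted on a synthetic dataset
# (the toy data is an assumption, only meant to show the call pattern):
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=500, n_features=20, random_state=0)
clf_toy = complex_lightgbm()
clf_toy.fit(X_toy, y_toy)
print('train accuracy:', clf_toy.score(X_toy, y_toy))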
Example No. 5
import multiprocessing

import pandas as pd
import lightgbm as lgb

#test = pd.read_csv('./*.csv')
train = pd.read_csv('/*.csv')
#feat_cols=pd.read_csv('/*.csv')
#feat_cols=feat_cols.iloc[:,0].values.tolist()
num_cores = multiprocessing.cpu_count()
print('core: ',num_cores)
#del train['target']
#---------binary------------
myscore=None
myobj='binary'
target=train['target']

model = lgb.LGBMClassifier(boosting_type='gbdt', random_state=4590,
                           n_jobs=num_cores,
                           # max_depth=-1,
                           bagging_freq=1, bagging_seed=11, verbosity=0)
print('BINARY')
#--------------------------
#target = train['target'] 
#myobj='regression'
#myscore='neg_mean_squared_error'
#param = {'num_leaves': 31,
#         'min_data_in_leaf': 30, 
#         'objective':'regression',
#         'max_depth': 10,
#         'learning_rate': 0.01,
#         "min_child_samples": 20,
#         "boosting": "gbdt",
#         "feature_fraction": 1,
#         "bagging_fraction": 0.9 ,
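# A hedged sketch of how the binary model above might be evaluated with
# cross-validation (the feature list is an assumption, since feat_cols is
# commented out in the snippet):
from sklearn.model_selection import cross_val_score

feature_cols = [c for c in train.columns if c != 'target']
cv_auc = cross_val_score(model, train[feature_cols], target,
                         scoring='roc_auc', cv=5, n_jobs=num_cores)
print('CV AUC: %.4f +/- %.4f' % (cv_auc.mean(), cv_auc.std()))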
Example No. 6
                                                y_temp,
                                                stratify=y_temp,
                                                test_size=0.5,
                                                random_state=42)
print('Shape of X_train:', X_train.shape)
print('Shape of X_val:', X_val.shape)
print('Shape of X_test:', X_test.shape)

# # Selection of features and plotting feature importance

# In[61]:

model_sk = lgb.LGBMClassifier(boosting_type='gbdt',
                              max_depth=7,
                              learning_rate=0.01,
                              n_estimators=2000,
                              class_weight='balanced',
                              subsample=0.9,
                              colsample_bytree=0.8,
                              n_jobs=-1)
train_features, valid_features, train_y, valid_y = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42)
model_sk.fit(train_features,
             train_y,
             early_stopping_rounds=100,
             eval_set=[(valid_features, valid_y)],
             eval_metric='auc',
             verbose=200)

# In[62]:

feature_imp = pd.DataFrame(sorted(
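# The pd.DataFrame call above is cut off by the snippet boundary; a common way
# this pattern is completed (a sketch, assuming the usual pairing of
# importances with the training columns):
feature_imp = pd.DataFrame(sorted(zip(model_sk.feature_importances_, X_train.columns)),
                           columns=['importance', 'feature'])
print(feature_imp.sort_values('importance', ascending=False).head(20))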
Example No. 7
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.7,
    'subsample_freq': 2,
    'num_leaves': 16,
    'seed': 99
}

lgb_params3 = {
    'n_estimators': 110,
    'max_depth': 4,
    'learning_rate': 0.02,
    'seed': 99
}

lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model2 = lgb.LGBMClassifier(**lgb_params2)
lgb_model3 = lgb.LGBMClassifier(**lgb_params3)
xgmodel = xgb.XGBClassifier(max_depth=8,
                            n_estimators=1000,
                            min_child_weight=300,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            eta=0.3,
                            seed=42)

lgb_model4 = lgb.LGBMClassifier(boosting_type="gbdt",
                                num_leaves=15,
                                reg_alpha=0,
                                reg_lambda=0.,
                                max_depth=-1,
Example No. 8
def recursive_feature_elimination(train, from_backup=True):
    """
    conduct recursive feature elimination on the given training dataset
    :param train: training dataset
    :param from_backup: load from historical result (stored as list of strings), defaults to True
    :return: top-ranked feature names
    """
    # defaults to return backup list
    if from_backup:
        return ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2',
                'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11',
                'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14',
                'D15', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V7', 'V12', 'V13', 'V19',
                'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V43', 'V44', 'V45', 'V47', 'V48',
                'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V57', 'V58', 'V60', 'V61', 'V62', 'V69', 'V70', 'V72', 'V74',
                'V75', 'V76', 'V78', 'V81', 'V82', 'V83', 'V87', 'V90', 'V91', 'V94', 'V95', 'V96', 'V97', 'V99',
                'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V139', 'V140', 'V143', 'V145', 'V149', 'V150', 'V152',
                'V156', 'V158', 'V159', 'V160', 'V162', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V187',
                'V188', 'V189', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210',
                'V212', 'V213', 'V215', 'V216', 'V217', 'V218', 'V219', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226',
                'V228', 'V231', 'V232', 'V233', 'V234', 'V243', 'V244', 'V251', 'V254', 'V256', 'V257', 'V258', 'V261',
                'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275',
                'V276', 'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292',
                'V294', 'V303', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323',
                'V324', 'V326', 'V331', 'V332', 'V333', 'V335', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                'id_38', 'DeviceType', 'DeviceInfo', 'device_name', 'OS_id_30', 'version_id_30', 'browser_id_31',
                'version_id_31', 'screen_width', 'screen_height', 'P_emaildomain_bin', 'P_emaildomain_suffix',
                'R_emaildomain_bin', 'R_emaildomain_suffix', 'TransactionAmt_Log', 'TransactionAmt_decimal']

    # otherwise, conduct RFE from scratch...
    train = reduce_mem_usage(train)

    X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT'], axis=1)
    y = train.sort_values('TransactionDT')['isFraud']

    del train
    gc.collect()

    X.fillna(-999, inplace=True)

    # parameters chosen by BayesianOptimization
    # Credit to this notebook: https://www.kaggle.com/vincentlugat/ieee-lgb-bayesian-opt/notebook
    params = {
        'num_leaves': 491,
        'min_child_weight': 0.03454472573214212,
        'feature_fraction': 0.3797454081646243,
        'bagging_fraction': 0.4181193142567742,
        'min_data_in_leaf': 106,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.006883242363721497,
        "boosting_type": "gbdt",
        "bagging_seed": 11,
        "metric": 'auc',
        "verbosity": -1,
        'reg_alpha': 0.3899927210061127,
        'reg_lambda': 0.6485237330340494,
        'random_state': 47
    }

    import lightgbm as lgb

    clf = lgb.LGBMClassifier(**params)
    rfe = RFECV(estimator=clf, step=10, cv=KFold(n_splits=5, shuffle=False), scoring='roc_auc', verbose=2)

    rfe.fit(X, y)

    return X.columns[rfe.ranking_ == 1].tolist()
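# A hedged usage sketch for the function above (train_df is an assumed
# DataFrame containing the engineered features plus 'isFraud' and 'TransactionDT'):
top_features = recursive_feature_elimination(train_df, from_backup=True)
X_selected = train_df[top_features]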
Example No. 9
print(X_train.shape, X_test.shape, len(y_train), len(y_test))

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

model_lgb = lgb.LGBMClassifier(
    n_jobs=4,
    n_estimators=100000,
    boost_from_average='false',
    learning_rate=0.01,
    num_leaves=64,
    num_threads=4,
    max_depth=-1,
    tree_learner="serial",
    feature_fraction=0.7,
    bagging_freq=5,
    bagging_fraction=0.7,
    min_data_in_leaf=100,
    silent=-1,
    verbose=-1,
    max_bin=255,
    bagging_seed=11,
)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

auc_scores = []
models = []
for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
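    # The loop body is cut off here; a hedged sketch of a typical fold
    # iteration with the model above (the early-stopping fit arguments follow
    # the older scikit-learn API used elsewhere in these examples, and
    # roc_auc_score is assumed to be imported):
    X_trn, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_trn, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
    model_lgb.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  early_stopping_rounds=200,
                  verbose=500)
    auc_scores.append(roc_auc_score(y_val, model_lgb.predict_proba(X_val)[:, 1]))
    models.append(model_lgb)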
Example No. 10
    
    train_set = lgb.Dataset(data = train_features, label = train_labels)
    test_set = lgb.Dataset(data = test_features, label = test_labels)
    return train_set, test_set, train_features, test_features, train_labels, test_labels


# In[3]:


train_set, test_set, train_features, test_features, train_labels, test_labels = prepare_lgb_df(df)


# In[51]:


model = lgb.LGBMClassifier()
default_params = model.get_params()
del default_params['n_estimators']
print(default_params)

cv_results = lgb.cv(default_params,
                    train_set,
                    num_boost_round=10000,
                    early_stopping_rounds=100,
                    metrics='auc',
                    nfold=n_folds,
                    seed=50)
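# A hedged sketch of reading the result back; with this older lgb.cv return
# format the AUC history is keyed by 'auc-mean' (an assumption tied to the
# metrics='auc' setting above):
best_rounds = len(cv_results['auc-mean'])
best_auc = max(cv_results['auc-mean'])
print('Best CV AUC {:.5f} at {} boosting rounds'.format(best_auc, best_rounds))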


# In[84]:
Example No. 11
    # trainX = clf.mergeToOne(trainX,new_feature)
    # testX = clf.mergeToOne(testX, new_test_features)

    # # TODO: build the model
    start = time.time()
    model = lgb.LGBMClassifier(
        boosting_type="gbdt",
        num_leaves=48,
        max_depth=-1,
        learning_rate=0.05,
        n_estimators=3000,
        subsample_for_bin=50000,
        objective="binary",
        min_split_gain=0,
        min_child_weight=5,
        min_child_samples=30,  #10
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=1,
        reg_alpha=3,
        reg_lambda=5,
        feature_fraction=0.9,
        bagging_fraction=0.9,  # added this time
        seed=2019,
        n_jobs=10,
        silent=True,
        num_boost_round=3000)
    n_splits = 7
    random_seed = 2019
    skf = StratifiedKFold(shuffle=True,
                          random_state=random_seed,
                          n_splits=n_splits)
Example No. 12
if __name__ == '__main__':
    np.random.seed(2707)

    X_train, X_test, y_train = utils.load_data(data_name='log_flipped',
                                               columns=COLUMNS)

    clf = None
    metric = 'logloss' if CLASSIFIER == 'xgb' else 'binary_logloss'
    if CLASSIFIER == 'xgb':
        clf = xgb.XGBClassifier(**PARAMS)
    else:
        par = PARAMS.copy()
        par['num_leaves'] = 2**par['max_depth']
        del par['gamma']
        del par['max_depth']
        clf = lgb.LGBMClassifier(**par)

    if MODE == 'cv':
        utils.perform_cv(X_train,
                         y_train,
                         clf,
                         MODEL_NAME + '-' + CLASSIFIER,
                         fit_params={'eval_metric': metric},
                         stratify_labels=utils.load_stratify_labels())
    elif MODE == 'ensemble':
        utils.VJUH(X_train,
                   X_test,
                   y_train,
                   clf,
                   MODEL_NAME,
                   'ensemble',
Example No. 13
test = get_feature(op_test, trans_test, sub).fillna(-1)

train = train.drop(['Tag'], axis=1).fillna(-1)
label = y['Tag']

test_id = test['UID']
test = test.drop(['Tag'], axis=1).fillna(-1)

lgb_model = lgb.LGBMClassifier(boosting_type='gbdt',
                               num_leaves=100,
                               reg_alpha=3,
                               reg_lambda=5,
                               max_depth=-1,
                               n_estimators=5000,
                               objective='binary',
                               subsample=0.9,
                               colsample_bytree=0.77,
                               subsample_freq=1,
                               learning_rate=0.05,
                               random_state=1000,
                               n_jobs=16,
                               min_child_weight=4,
                               min_child_samples=5,
                               min_split_gain=0)
skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
best_score = []

oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test_id.shape[0])

for index, (train_index, test_index) in enumerate(skf.split(train, label)):
    lgb_model.fit(train.iloc[train_index],
Example No. 14
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

# drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

y = np.ravel(data_y.values)
X, X_tst, selec = preprocessing(data_x, y, data_x_tst)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
    
print("------ LightGBM...")
lgbm = lgb.LGBMClassifier(objective='regression_l1', n_estimators=200, n_jobs=2, num_leaves = 40, scale_pos_weight = 0.1)
lgbm, y_test_lgbm = validacion_cruzada(lgbm, X, y, skf)


# Retrain on the full dataset
# The score reported below is on training data, so it will be better than on test
clf = lgbm
clf = clf.fit(X,y)
plotImp(clf, selec, X.shape[1])
y_pred_tra = clf.predict(X)
print("F1 score (tra): {:.4f}".format(f1_score(y,y_pred_tra,average='micro')))

y_pred_tst = clf.predict(X_tst)

df_submission['damage_grade'] = y_pred_tst
df_submission.to_csv("../Submissions/submission_" + sys.argv[0][-5:-3] + ".csv", index=False)
Example No. 15
def stack_test(train_x, train_y, test_x, test_y):
    print("start stacking test")
    clf1 = lgb.LGBMClassifier(boosting_type='gbdt',
                              num_leaves=50,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_depth=-1,
                              n_estimators=150,
                              objective='binary',
                              min_child_weight=50,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf2 = lgb.LGBMClassifier(boosting_type='dart',
                              num_leaves=50,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_depth=-1,
                              n_estimators=150,
                              objective='binary',
                              min_child_weight=50,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf3 = lgb.LGBMClassifier(boosting_type='rf',
                              num_leaves=50,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_depth=-1,
                              n_estimators=150,
                              objective='binary',
                              min_child_weight=50,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)

    clf4 = XGBClassifier(max_depth=5,
                         learning_rate=0.1,
                         n_estimators=150,
                         objective='binary:logistic',
                         booster='gbtree',
                         n_jobs=-1,
                         min_child_weight=50,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         random_state=2018)

    stack_clf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   meta_classifier=clf4,
                                   use_probas=True,
                                   average_probas=True,
                                   verbose=1)

    stack_clf.fit(train_x, train_y)
    pred_score = stack_clf.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, pred_score)
    print("auc score is {}".format(auc_score))

    return stack_clf
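# A hedged usage sketch for stack_test (the split variable names are assumptions):
stacked = stack_test(X_train, y_train, X_test, y_test)
test_proba = stacked.predict_proba(X_test)[:, 1]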
Example No. 16
"""
# LIGHT GBM

# Instantiate classifier

classifier = lgbm.LGBMClassifier(
    objective='binary',
    #metric='binary_logloss',
    metric = 'auc',
    boosting='gbdt',
    num_leaves=10,
    learning_rate=0.01,
    n_estimators=20000,
    #max_bin=50,
    max_bin=200,
    max_depth=-1,
    min_gain_to_split = 2,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.5,
    feature_fraction_seed=7,
    verbose=-1,
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=11
)


# Fit the data
classifier.fit(X_train, y_train,)
Example No. 17
        (executors.CSharpExecutor, C_SHARP),
        (executors.PowershellExecutor, POWERSHELL),
        (executors.RExecutor, R),
        (executors.PhpExecutor, PHP),
        (executors.DartExecutor, DART),
        (executors.HaskellExecutor, HASKELL),
        (executors.RubyExecutor, RUBY),
        (executors.FSharpExecutor, F_SHARP),
        (executors.RustExecutor, RUST),
    ],

    # These models will be tested against each language specified in the previous list.
    [
        # LightGBM
        regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS)),
        classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS)),
        classification_binary(lgb.LGBMClassifier(**LIGHTGBM_PARAMS)),

        # LightGBM (DART)
        regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS_DART)),
        classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_DART)),
        classification_binary(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_DART)),

        # LightGBM (GOSS)
        regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS_GOSS)),
        classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_GOSS)),
        classification_binary(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_GOSS)),

        # LightGBM (RF)
        regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS_RF)),
        classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_RF)),
Example No. 18
                                    LossFunction = modelscore,
                                    label = 'is_trade',
                                    columnname = ColumnName[1::2], # the pattern for selection
                                    start = temp,
                                    CrossMethod = CrossMethod, # your cross term method
                                    PotentialAdd = [] # potential feature for Simulated Annealing
                                    )
    try:
        a.run()
    finally:
        with open(RecordFolder, 'a') as f:
            f.write('\n{}\n%{}%\n'.format(type,'-'*60))

if __name__ == "__main__":
    model = {'xgb': xgb.XGBClassifier(seed = 1, max_depth = 5, n_estimators = 2000, nthread = -1),
             'lgb': lgbm.LGBMClassifier(random_state=1,num_leaves = 29, n_estimators=1000),
             'lgb2': lgbm.LGBMClassifier(random_state=1,num_leaves = 29, max_depth=5, n_estimators=1000),
             'lgb3': lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=1000,max_depth=3,learning_rate = 0.09, n_jobs=30),
             'lgb4': lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=5000,max_depth=3,learning_rate = 0.095, n_jobs=30),
             'lgb5': lgbm.LGBMClassifier(random_state=1, num_leaves = 13, n_estimators=5000,max_depth=4,learning_rate = 0.05, n_jobs=30),
             'lgb6': lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=5000,max_depth=3,learning_rate = 0.05, n_jobs=8)
            } # algorithm group

    CrossMethod = {'+':add,
                   '-':substract,
                   '*':times,
                   '/':divide,}

    RecordFolder = 'record.log' # result record file
    modelselect = 'lgb6' # selected algorithm
Example No. 19
            predict['predicted_score'] = model.predict_proba(predict[features])
            # predict[['instance_id', 'predicted_score']].to_csv('result4_18_dnn_2.csv', index=False, sep=' ')
            print(logloss)

        if mf == 'lgb':  # the gbdt model performs quite poorly on the prediction set
            log_loss_list = []
            kf = KFold(n_splits=5, shuffle=True, random_state=1)
            for train_idx, test_idx in kf.split(all_train):
                train = all_train.iloc[train_idx, :]
                test = all_train.iloc[test_idx, :]
                X_train = train[features]
                y_train = train[target]
                X_test = test[features]
                y_test = test[target]

                clf = lgb.LGBMClassifier(num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.06, lambda_l2=1.0)
                clf.fit(X_train, y_train, feature_name=features)
                X_test['predicted_score'] = clf.predict_proba(X_test)[:, 1]
                # X_test['predicted_score'] = X_test['predicted_score'] - 0.001
                log_loss_value = log_loss(y_test, X_test['predicted_score'])
                log_loss_list.append(log_loss_value)
            print('the log loss of lgb model in cv with 5 splits: ', log_loss_list)

            # 0.08270811820531722
            # (num_leaves=50, max_depth=5, n_estimators=120, n_jobs=20): 0.08258190767889406
            # (num_leaves=50, max_depth=5, n_estimators=150, n_jobs=20, learning_rate=0.1, num_iterators=1000): 0.08258016897053531
            # (num_leaves=36, max_depth=5, n_estimators=150, n_jobs=20, learning_rate=0.05, lambda_l2=1.0):     0.08255235859456307
            # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.05, lambda_l2=1.0):     0.08252046284038406
            # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.06, lambda_l2=1.0):     0.08243291190872347
            # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.07, lambda_l2=1.0):     0.08246869876021136
            # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.08, lambda_l2=1.0):     0.08250570813361169
Example No. 20
    #X = data_x.values
    #X_tst = data_x_tst.values
    y = np.ravel(data_y.values)

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(proportion=0.5))

    X_sample, y_sample = oversampler.sample(X, y)

    #X, y = shuffle(X, y, random_state=76592621)
    #X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=76592621)

    print("------ LightGBM...")
    lgbm = lgb.LGBMClassifier(n_estimators=2700,
                              learning_rate=0.08,
                              max_bin=350,
                              num_leaves=34,
                              objective='multiclassova',
                              random_state=76592621,
                              n_jobs=2)
    print("------ Generando submission...")
    submission(X_sample, y_sample, X_tst, lgbm)
    '''  
  fit_alg = lgb.LGBMClassifier(class_weight={1:0.9, 2:0.8, 3:0.7},num_leaves=40, 
                               learning_rate=0.08, objective='multiclassova', 
                               random_state=76592621, n_jobs=-1)
  param_dist = {
  'n_estimators':[510, 1000]
  }
  clf = GridSearchCV(fit_alg, param_dist, verbose=1, cv=2, scoring='f1_micro', n_jobs=-1)
  clf = clf.fit(X,y)
  best_param2 = clf.best_params_['n_estimators']
Example No. 21
    def train_model(self, model='lr', balance=False):
        kfold = KFold(n_splits=10, shuffle=True, random_state=2021)
        for epoch in range(1, 3):
            train_x, train_y, test = self.load_train_x_train_y_test_x(balance)
            train_x, train_y = self.shuffle(train_x, train_y)
            # train_x, train_y = self.balance_data(train_x, train_y)
            for i, (train_idx, valid_idx) in enumerate(kfold.split(train_x, train_y)):
                train_xx, train_yy = train_x[train_idx], train_y[train_idx]
                valid_xx, valid_yy = train_x[valid_idx], train_y[valid_idx]
                if model == 'lr':
                    lr = LogisticRegression(C=10, solver='liblinear', max_iter=100, n_jobs=1)
                    lr.fit(X=train_xx, y=train_yy.reshape(-1, ))
                    joblib.dump(lr, os.path.join(daikuan_path, 'model', 'lr_epoch_{}_k_{}.model'.format(epoch, i)))
                    # predict on the validation set
                    prob = lr.predict_proba(valid_xx)
                    fpr, tpr, _ = metrics.roc_curve(valid_yy, prob[:, 1])
                    roc_auc = metrics.auc(fpr, tpr)
                    print('lr valid roc_auc is', roc_auc)
                elif model == 'svm':
                    # probability=True is required for predict_proba below
                    svc = SVC(C=10, kernel='rbf', probability=True, verbose=True, max_iter=-1, gamma='scale')
                    svc.fit(train_xx, train_yy.reshape(-1, ))
                    joblib.dump(svc, os.path.join(daikuan_path, 'model', 'svc_epoch_{}_k_{}.model'.format(epoch, i)))
                    prob = svc.predict_proba(valid_xx)
                    fpr, tpr, _ = metrics.roc_curve(valid_yy, prob[:, 1])
                    roc_auc = metrics.auc(fpr, tpr)
                    print('svc valid roc_auc is', roc_auc)
                elif model == 'lgb':
                    train_m = lgb.Dataset(train_xx, train_yy.reshape(-1, ))
                    valid_m = lgb.Dataset(valid_xx, valid_yy.reshape(-1, ))

                    params = {
                        'boosting_type': 'gbdt',
                        'objective': 'binary',
                        'learning_rate': 0.01,
                        'metric': 'auc',
                        'num_leaves': 14,
                        'max_depth': 19,
                        'min_data_in_leaf': 37,
                        'min_child_weight': 1.6,
                        'reg_lambda': 9,
                        'reg_alpha': 7,
                        'feature_fraction': 0.69,
                        'bagging_fraction': 0.98,
                        'bagging_freq': 96,
                        'min_split_gain': 0.4,
                        'nthread': 4
                    }

                    # params = {
                    #     'boosting_type': 'gbdt',
                    #     'objective': 'binary',
                    #     'learning_rate': 0.01,
                    #     'metric': 'auc',
                    #     'num_leaves': 32,
                    #     'max_depth': 6,
                    #     'min_data_in_leaf': 16,
                    #     'min_child_weight': 1.9,
                    #     # 'min_child_weight': 4.9,
                    #     'reg_lambda': 9,
                    #     'reg_alpha': 7,
                    #     'feature_fraction': 0.8,
                    #     'bagging_fraction': 0.65,
                    #     'bagging_freq': 50,
                    #     'min_split_gain': 0.4
                    # }
                    m = lgb.train(params=params, train_set=train_m, valid_sets=valid_m, num_boost_round=20000,
                                  verbose_eval=1000, early_stopping_rounds=200)
                    val_pre_lgb = m.predict(valid_xx)

                    fpr, tpr, threshold = metrics.roc_curve(valid_yy, val_pre_lgb)
                    print(fpr.shape, tpr.shape, threshold.shape)
                    roc_auc = metrics.auc(fpr, tpr)
                    joblib.dump(m, os.path.join(daikuan_path, 'model', 'lgb4_s_epoch_{}_k_{}.model'.format(epoch, i)))
                    print('Tuned LightGBM single-model AUC on the validation set: {}'.format(roc_auc))
                elif model == 'lgbm':
                    model_lgb = lgb.LGBMClassifier(
                        boosting_type='gbdt', objective='binary', metric='auc',
                        learning_rate=0.1, n_estimators=2000,
                        num_leaves=40, max_depth=4,
                        bagging_fraction=0.85,
                        feature_fraction=0.57,
                        bagging_freq=58,
                        min_data_in_leaf=25,
                        min_child_weight=4.9, min_split_gain=0.4,
                        reg_lambda=4.6, reg_alpha=9.7,
                        n_jobs=4
                    )
                    model_lgb.fit(train_xx, train_yy)
                    from sklearn.metrics import roc_auc_score
                    from sklearn.model_selection import cross_validate
                    c = cross_validate(model_lgb, train_x, train_y, cv=10)
                    print('c', c)
                    return
Example No. 22
        mean_f1Train += fper_class_train['f1'] / n_splits
    # print('mean valf1:',mean_f1)
    # print('mean trainf1:',mean_f1Train)
    return mean_f1


xlf = xgb.XGBClassifier(max_depth=7,
                        learning_rate=0.05,
                        n_estimators=55,
                        reg_alpha=0.005,
                        n_jobs=8,
                        importance_type='total_cover')
#
llf = lgb.LGBMClassifier(num_leaves=9,
                         max_depth=5,
                         learning_rate=0.05,
                         n_estimators=80,
                         n_jobs=8)

clf = cab.CatBoostClassifier(iterations=60,
                             learning_rate=0.05,
                             depth=10,
                             silent=True,
                             thread_count=8,
                             task_type='CPU',
                             cat_features=cat_features)

rf = RandomForestClassifier(oob_score=True,
                            random_state=2020,
                            n_estimators=70,
                            max_depth=13,
Example No. 23
#------------------------------------------------------------------------


from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

'''
print("------ XGB...")
clf = xgb.XGBClassifier(n_estimators = 500,objective='multi:softmax',n_jobs=8,max_depth=11,num_class=4)
clfb, y_test_clf = validacion_cruzada(clf,X_filtered,y,skf)
#'''


print("------ LightGBM...")
lgbmb = lgb.LGBMClassifier(objective='multiclass',n_estimators=1000,num_threads=8,max_depth=-1)
#lgbmb, y_test_lgbm = validacion_cruzada(lgbm,X_filtered,y,skf,'salida7lgb')
#'''

'''
print("------ MLPNN...")
X_filtered = preprocessing.normalize(X_filtered)
nn = MLPClassifier()
nnb, y_test_nn = validacion_cruzada(nn,X_filtered,y,skf,'salida5nn')
#'''

'''
print("------ SVC...")
svc = SVC()
svcb, y_test_svc = validacion_cruzada(svc,X_filtered,y,skf,'salida5svc')
#'''
Example No. 24
    def optimize(self,
                 metrics='f1_score',
                 n_splits=3,
                 cv_type=StratifiedKFold,
                 maxevals=200,
                 do_predict_proba=None,
                 model_id=0,
                 reuse_experiment=False):

        params = self.hyperparameter_space()
        extra_params = self.extra_setup()

        env = Environment(
            train_dataset=self.data,
            results_path='HyperparameterHunterAssets',
            # results_path=self.PATH,
            metrics=[metrics],
            do_predict_proba=do_predict_proba,
            cv_type=cv_type,
            cv_params=dict(n_splits=n_splits),
        )

        # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals)
        optimizer = opt.BayesianOptimization(iterations=maxevals)
        optimizer.set_experiment_guidelines(
            model_initializer=lgb.LGBMClassifier,
            model_init_params=params,
            model_extra_params=extra_params)
        optimizer.go()

        # there are a few fixes on their way, and the next few lines will soon
        # become a single call. For the moment, to access the best parameters
        # one has to read them back from disk:
        best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\
            optimizer.best_experiment+'.json'
        with open(best_experiment) as best:
            best = json.loads(
                best.read())['hyperparameters']['model_init_params']

        # The next few lines are the only ones related to mlflow
        if not Path('mlruns').exists():
            # here set the tracking_uri. If None then http://localhost:5000
            client = MlflowClient()
            n_experiments = 0
        elif not reuse_experiment:
            client = MlflowClient()
            n_experiments = len(client.list_experiments())
            experiment_name = 'experiment_' + str(n_experiments)
            client.create_experiment(name=experiment_name)
        with mlflow.start_run(experiment_id=n_experiments):
            model = lgb.LGBMClassifier(**best)
            X, y = self.data.drop('target', axis=1), self.data.target
            model.fit(X,
                      y,
                      feature_name=self.colnames,
                      categorical_feature=self.categorical_columns)
            for name, value in best.items():
                mlflow.log_param(name, value)
            mlflow.log_metric('f1_score', -optimizer.optimizer_result.fun)
            mlflow.sklearn.log_model(model, "model")

        model_fname = 'model_{}_.p'.format(model_id)
        best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
        pickle.dump(model, open('/'.join([self.PATH, model_fname]), 'wb'))
        pickle.dump(optimizer,
                    open('/'.join([self.PATH, best_experiment_fname]), 'wb'))
Example No. 25
    'objective': ['binary'],
    'random_state': [501],  # Updated from 'seed'
    'colsample_bytree': [0.65, 0.75],
    'subsample': [0.7, 0.75, 0.8],
    'reg_alpha': [0.1, 1.2],
    'reg_lambda': [0.2, 1.4],
}

# Create classifier to use
mdl = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_jobs=5,  # Updated from 'nthread'
    silent=False,
    max_depth=params['max_depth'],
    max_bin=params['max_bin'],
    subsample_for_bin=params['subsample_for_bin'],
    subsample=params['subsample'],
    subsample_freq=params['subsample_freq'],
    min_split_gain=params['min_split_gain'],
    min_child_weight=params['min_child_weight'],
    min_child_samples=params['min_child_samples'],
    scale_pos_weight=params['scale_pos_weight'])

# View the default model params:
mdl.get_params().keys()

# Create the grid
grid = RandomizedSearchCV(mdl, gridParams, verbose=2, cv=4, n_jobs=-1)

# Run the grid
grid.fit(train_early_stop_x, train_early_stop_y)
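# Once the randomized search finishes, the winning configuration is usually
# read back from the fitted search object; a minimal sketch:
print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)
best_lgb = grid.best_estimator_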
Example No. 26
y = df.sort_values('TransactionDT')['isFraud']
df = df.sort_values('TransactionDT').drop(
    ['isFraud', 'TransactionDT', 'TransactionID'], axis=1)

df = df_utils.clean_inf_nan(df)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(df)
df = pd.DataFrame(imputer.transform(df), columns=df.columns.values.tolist())

# 1) Stratifield 5 CV Training Data
scores = []
y_pred_score = np.empty(shape=[0, 2])
predicted_index = np.empty(shape=[0, ])
model = lgb.LGBMClassifier()
model.set_params(**lgb_optimal)

if gral_parameters.get('sampling') == 'Adasyn':
    ovs_model = ADASYN().set_params(**oversampling)
    X_train, y_train = ovs_model.fit_sample(df, y)

fileModel = model.fit(X_train, y_train)

save_params = {'base_model': fileModel, 'imputer': imputer}

joblib.dump(
    save_params,
    os.path.join(
        os.path.join(
            os.path.dirname(
Example No. 27
def experience_mnist(config, path, param):
    print("START MNIST")
    use_cuda = config.general.use_cuda and torch.cuda.is_available()
    torch.manual_seed(config.general.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    print("START TRAINING TARGET MODEL")
    data_train_target = custum_MNIST(True,
                                     0,
                                     config,
                                     '../data',
                                     train=True,
                                     download=True,
                                     transform=transforms.Compose([
                                         transforms.ToTensor(),
                                         transforms.Normalize((0.1307, ),
                                                              (0.3081, ))
                                     ]))
    data_test_target = custum_MNIST(True,
                                    0,
                                    config,
                                    '../data',
                                    train=False,
                                    transform=transforms.Compose([
                                        transforms.ToTensor(),
                                        transforms.Normalize((0.1307, ),
                                                             (0.3081, ))
                                    ]))
    criterion = nn.CrossEntropyLoss()
    train_loader_target = torch.utils.data.DataLoader(
        data_train_target, batch_size=config.learning.batch_size, shuffle=True)
    test_loader_target = torch.utils.data.DataLoader(
        data_test_target, batch_size=config.learning.batch_size, shuffle=True)
    dataloaders_target = {
        "train": train_loader_target,
        "val": test_loader_target
    }
    dataset_sizes_target = {
        "train": len(data_train_target),
        "val": len(data_test_target)
    }
    print("Dataset sizes", dataset_sizes_target)
    model_target = Net_mnist().to(device)
    optimizer = optim.SGD(model_target.parameters(),
                          lr=config.learning.learning_rate,
                          momentum=config.learning.momentum)
    # Add DP noise!
    privacy_engine = PrivacyEngine(
        model_target,
        batch_size=config.learning.batch_size,
        sample_size=len(train_loader_target.dataset),
        alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
        noise_multiplier=1.0,  # sigma
        max_grad_norm=1.0,  # Clip per-sample gradients to this norm
    )
    privacy_engine.attach(optimizer)
    exp_lr_scheduler = lr_scheduler.StepLR(
        optimizer,
        step_size=config.learning.decrease_lr_factor,
        gamma=config.learning.decrease_lr_every)
    model_target, best_acc_target, data_test_set, label_test_set, class_test_set = train_model(
        model_target,
        criterion,
        optimizer,
        exp_lr_scheduler,
        dataloaders_target,
        dataset_sizes_target,
        num_epochs=config.learning.epochs)
    np.save(path + "/res_train_target_" + str(param) + ".npy", best_acc_target)
    print("START TRAINING SHADOW MODEL")
    all_shadow_models = []
    all_dataloaders_shadow = []
    data_train_set = []
    label_train_set = []
    class_train_set = []
    for num_model_sahdow in range(config.general.number_shadow_model):
        criterion = nn.CrossEntropyLoss()

        data_train_shadow = custum_MNIST(False,
                                         num_model_sahdow,
                                         config,
                                         '../data',
                                         train=True,
                                         download=True,
                                         transform=transforms.Compose([
                                             transforms.ToTensor(),
                                             transforms.Normalize((0.1307, ),
                                                                  (0.3081, ))
                                         ]))
        data_test_shadow = custum_MNIST(False,
                                        num_model_sahdow,
                                        config,
                                        '../data',
                                        train=False,
                                        transform=transforms.Compose([
                                            transforms.ToTensor(),
                                            transforms.Normalize((0.1307, ),
                                                                 (0.3081, ))
                                        ]))
        train_loader_shadow = torch.utils.data.DataLoader(
            data_train_shadow,
            batch_size=config.learning.batch_size,
            shuffle=True)
        test_loader_shadow = torch.utils.data.DataLoader(
            data_test_shadow,
            batch_size=config.learning.batch_size,
            shuffle=True)
        dataloaders_shadow = {
            "train": train_loader_shadow,
            "val": test_loader_shadow
        }
        dataset_sizes_shadow = {
            "train": len(data_train_shadow),
            "val": len(data_test_shadow)
        }
        print("Dataset sizes", dataset_sizes_shadow)
        model_shadow = Net_mnist().to(device)
        optimizer = optim.SGD(model_shadow.parameters(),
                              lr=config.learning.learning_rate,
                              momentum=config.learning.momentum)
        exp_lr_scheduler = lr_scheduler.StepLR(
            optimizer,
            step_size=config.learning.decrease_lr_factor,
            gamma=config.learning.decrease_lr_every)
        model_shadow, best_acc_sh, data_train_set_unit, label_train_set_unit, class_train_set_unit = train_model(
            model_shadow,
            criterion,
            optimizer,
            exp_lr_scheduler,
            dataloaders_target,
            dataset_sizes_target,
            num_epochs=config.learning.epochs)
        data_train_set.append(data_train_set_unit)
        label_train_set.append(label_train_set_unit)
        class_train_set.append(class_train_set_unit)
        np.save(
            path + "/res_train_shadow_" + str(num_model_sahdow) + "_" +
            str(param) + ".npy", best_acc_sh)
        all_shadow_models.append(model_shadow)
        all_dataloaders_shadow.append(dataloaders_shadow)
    print("START GETTING DATASET ATTACK MODEL")
    data_train_set = np.concatenate(data_train_set)
    label_train_set = np.concatenate(label_train_set)
    class_train_set = np.concatenate(class_train_set)
    #data_test_set, label_test_set, class_test_set = get_data_for_final_eval([model_target], [dataloaders_target], device)
    #data_train_set, label_train_set, class_train_set = get_data_for_final_eval(all_shadow_models, all_dataloaders_shadow, device)
    data_train_set, label_train_set, class_train_set = shuffle(
        data_train_set,
        label_train_set,
        class_train_set,
        random_state=config.general.seed)
    data_test_set, label_test_set, class_test_set = shuffle(
        data_test_set,
        label_test_set,
        class_test_set,
        random_state=config.general.seed)
    print("Train set size", len(label_train_set))
    print("Test set size", len(label_test_set))
    print("START FITTING ATTACK MODEL")
    model = lgb.LGBMClassifier(objective='binary',
                               reg_lambda=config.learning.ml.reg_lambd,
                               n_estimators=config.learning.ml.n_estimators)
    model.fit(data_train_set, label_train_set)
    y_pred_lgbm = model.predict(data_test_set)
    precision_general, recall_general, _, _ = precision_recall_fscore_support(
        y_pred=y_pred_lgbm, y_true=label_test_set, average="macro")
    accuracy_general = accuracy_score(y_true=label_test_set,
                                      y_pred=y_pred_lgbm)
    precision_per_class, recall_per_class, accuracy_per_class = [], [], []
    for idx_class, classe in enumerate(data_train_target.classes):
        all_index_class = np.where(class_test_set == idx_class)
        precision, recall, _, _ = precision_recall_fscore_support(
            y_pred=y_pred_lgbm[all_index_class],
            y_true=label_test_set[all_index_class],
            average="macro")
        accuracy = accuracy_score(y_true=label_test_set[all_index_class],
                                  y_pred=y_pred_lgbm[all_index_class])
        precision_per_class.append(precision)
        recall_per_class.append(recall)
        accuracy_per_class.append(accuracy)
    print("END MNIST")
    return (precision_general, recall_general, accuracy_general,
            precision_per_class, recall_per_class, accuracy_per_class)
Example No. 28
print('    Training accuracy of sgdc :', pipe_sgdc.score(X_train, y_train))
print('    Valid Accuracy : %.8f' % pipe_sgdc.score(X_valid, y_valid))
print('    AUC value of sgdc : ',AUC_calculate(pipe_sgdc,valid,X_valid))
print('End sgdc model .')

## LGB
print('Start lgb model :')       
import lightgbm as lgb

lgb_ = lgb.LGBMClassifier(
                    learning_rate = 0.005,
                    boosting_type = 'gbdt',
                    objective = 'binary',
                    metric = 'logloss',
                    max_depth = 7,
                    sub_feature = 0.7,
                    num_leaves = 10,
                    colsample_bytree = 0.7,
                    min_data_in_leaf =10,
                    n_estimators = 500,
                    early_stop = 50,
                    verbose = -1,
                    feature_fraction= 0.7)

pipe_lgb = make_pipeline(StandardScaler(),lgb_)
                          
pipe_lgb.fit(X_train, y_train)
y_pred_lgb = pipe_lgb.predict(X_valid)
print('    Training accuracy of lgb: ', pipe_lgb.score(X_train, y_train))
print('    Valid Accuracy of lgb :  %.10f' % pipe_lgb.score(X_valid, y_valid))
print('    AUC value of lgb  : ',AUC_calculate(pipe_lgb,valid,X_valid))
print('End lgb model .')
Example No. 29
    'bagging_freq': 5,
    'verbose': 0,
    'min_sum_hessian_in_leaf': 100
}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=10)  # 10-fold cross-validation
#y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
#print(y_pred)

# specify your configurations as a dict

#'metric': 'binary_logloss',    #'num_iterations':500,
clf4 = lgb.LGBMClassifier(
    criterion="rmse")  # n_estimators: number of trees; criterion: optimization objective

model4 = clf4.fit(x_train, y_train)
y_hat4 = model4.predict(x_test)
sum(y_hat4 == y_test) / y_test.count()
c = confusion_matrix(y_hat4, y_test)
acc1 = c[0, 0] / sum(c[0, :])
acc2 = c[1, 1] / sum(c[1, :])
print('4-1:%.2f%%' % (acc1 * 100))
print('4-2:%.2f%%' % (acc2 * 100))
#print('confuse_matrix')
#print(c)  # print the confusion matrix

train_x = x_train
train_y = y_train
validation_x = x_test
Example No. 30
def train_model(X, y, params, exp_path):
    fold_params = params['fold']
    model_params = params['model']
    fit_params = params['fit']

    # set mlflow experiment
    try:
        mlflow.create_experiment(exp_path)
    except (mlflow.exceptions.RestException,
            mlflow.exceptions.MlflowException):
        print('The specified experiment ({}) already exists.'.format(exp_path))

    mlflow.set_experiment(exp_path)

    skf = StratifiedKFold(**fold_params)
    models = []
    metrics = []

    y_proba = np.zeros(len(X))
    y_pred = np.zeros(len(X))

    feature_importances_split = np.zeros(X.shape[1])
    feature_importances_gain = np.zeros(X.shape[1])

    scores = defaultdict(int)

    with mlflow.start_run() as run:
        corr = pd.concat((X, y), axis=1).corr()
        log_plot(corr, pf.corr_matrix, 'correlation_matrix.png')
        log_plot(y.value_counts(), pf.label_share, 'label_share.png')

        for fold_no, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
            print_devider(f'Fold: {fold_no}')

            X_train, X_valid = X.iloc[idx_train, :], X.iloc[idx_valid, :]
            y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

            # train model
            model = lgbm.LGBMClassifier(**model_params)
            model.fit(X_train,
                      y_train,
                      **fit_params,
                      eval_set=[(X_valid, y_valid)],
                      eval_names=['valid'])
            metrics.append({
                'name': model.metric,
                'values': model.evals_result_['valid'][model.metric],
                'best_iteration': model.best_iteration_
            })
            models.append(model)

            # feature importance
            feature_importances_split += devide_by_sum(
                model.booster_.feature_importance(
                    importance_type='split')) / skf.n_splits
            feature_importances_gain += devide_by_sum(
                model.booster_.feature_importance(
                    importance_type='gain')) / skf.n_splits

            # predict
            y_valid_proba = model.predict_proba(
                X_valid, num_iteration=model.best_iteration_)[:, 1]
            y_valid_pred = model.predict(X_valid,
                                         num_iteration=model.best_iteration_)
            y_proba[idx_valid] = y_valid_proba
            y_pred[idx_valid] = y_valid_pred

            # evaluate
            scores_valid = get_scores(y_valid, y_valid_pred)

            mlflow.log_metrics(
                {
                    **scores_valid,
                    'best_iteration': model.best_iteration_,
                },
                step=fold_no)

            print('\nScores')
            print(scores_valid)

            # record scores
            for k, v in scores_valid.items():
                scores[k] += v / skf.n_splits

        # log training parameters
        mlflow.log_params({
            **fold_params,
            **model_params,
            **fit_params, 'cv': skf.__class__.__name__,
            'model': model.__class__.__name__
        })

        print_devider('Saving plots')

        # scores
        log_plot(scores, pf.scores, 'scores.png')

        # feature importance
        features = np.array(model.booster_.feature_name())
        log_plot(
            (features, feature_importances_split, 'Feature Importance: split'),
            pf.feature_importance, 'feature_importance_split.png')
        log_plot(
            (features, feature_importances_gain, 'Feature Importance: gain'),
            pf.feature_importance, 'feature_importance_gain.png')

        # metric history
        log_plot(metrics, pf.metric, 'metric_history.png')

        # confusion matrix
        cm = confusion_matrix(y, y_pred)
        log_plot(cm, pf.confusion_matrix, 'confusion_matrix.png')

        # roc curve
        fpr, tpr, _ = roc_curve(y, y_proba)
        roc_auc = roc_auc_score(y, y_proba)  # use probabilities, matching the curve above
        log_plot((fpr, tpr, roc_auc), pf.roc_curve, 'roc_curve.png')

        # precision-recall curve
        pre, rec, _ = precision_recall_curve(y, y_proba)
        pr_auc = average_precision_score(y, y_proba)  # use probabilities, matching the curve above
        log_plot((pre, rec, pr_auc), pf.pr_curve, 'pr_curve.png')

        # pickle trained models
        models_path = 'models.pkl'
        with open(models_path, 'wb') as f:
            pickle.dump(models, f)
            mlflow.log_artifact(models_path)
            mlflow.log_param('model_path',
                             os.path.join(run.info.artifact_uri, models_path))
            os.remove(models_path)

    return run.info.experiment_id, run.info.run_uuid
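# A hedged sketch of how train_model might be invoked, matching the
# 'fold'/'model'/'fit' keys it reads from params (X, y, the concrete values
# and the experiment path are assumptions):
params = {
    'fold': {'n_splits': 5, 'shuffle': True, 'random_state': 42},
    'model': {'n_estimators': 1000, 'learning_rate': 0.05, 'metric': 'auc'},
    'fit': {'early_stopping_rounds': 100, 'verbose': 200},
}
experiment_id, run_id = train_model(X, y, params, 'experiments/lgbm_baseline')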