Esempio n. 1
0
def handler(context):
    """ABEJA training handler: run LightGBM cross-validation and persist
    the per-fold boosters plus the run-environment metadata.

    Parameters
    ----------
    context : object
        Training context passed in by the ABEJA platform (logged only).
    """
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    # Load the training split from the datalake and wrap it for LightGBM.
    X_train, y_train, cols_train = train_data_loader(DATALAKE_CHANNEL_ID,
                                                     DATALAKE_TRAIN_FILE_ID,
                                                     LABEL_FIELD, INPUT_FIELDS)
    dtrain = lgb.Dataset(X_train, y_train)

    # The validation split is optional: only loaded when a file id is set.
    if DATALAKE_VAL_FILE_ID:
        X_val, y_val, _ = train_data_loader(DATALAKE_CHANNEL_ID,
                                            DATALAKE_VAL_FILE_ID, LABEL_FIELD,
                                            INPUT_FIELDS)
    else:
        X_val, y_val = None, None

    # extraction_cb captures the raw boosters of each CV fold;
    # tensorboard_cb streams fold metrics to the writer / statistics sink.
    extraction_cb = ModelExtractionCallback()
    tensorboard_cb = TensorBoardCallback(statistics, writer)
    tensorboard_cb.set_valid(X_val, y_val, Parameters.IS_CLASSIFICATION,
                             IS_MULTI, Parameters.NUM_CLASS)
    callbacks = [
        extraction_cb,
        tensorboard_cb,
    ]

    lgb.cv(PARAMS,
           dtrain,
           nfold=Parameters.NFOLD,
           early_stopping_rounds=Parameters.EARLY_STOPPING_ROUNDS,
           verbose_eval=Parameters.VERBOSE_EVAL,
           stratified=STRATIFIED,
           callbacks=callbacks,
           metrics=Parameters.METRIC,
           seed=Parameters.SEED)

    # Persist one model file per CV fold.
    models = extraction_cb.raw_boosters
    for i, model in enumerate(models):
        model.save_model(
            os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{i}.txt'))

    # Dump the run parameters plus the training column order so inference
    # can reproduce the exact feature layout.
    di = {**(Parameters.as_dict()), 'cols_train': cols_train}
    # Context manager guarantees the handle is closed even if json.dump
    # raises (the original open()/close() pair leaked it on error).
    with open(os.path.join(ABEJA_TRAINING_RESULT_DIR, 'lgb_env.json'),
              'w') as lgb_env:
        json.dump(di, lgb_env)
    writer.close()
Esempio n. 2
0
       dtrain,
       nfold=Parameters.NFOLD,
       early_stopping_rounds=Parameters.EARLY_STOPPING_ROUNDS,
       verbose_eval=Parameters.VERBOSE_EVAL,
       stratified=STRATIFIED,
       callbacks=callbacks,
       metrics=Parameters.METRIC,
       seed=Parameters.SEED)

# In[10]:

# Persist each CV booster captured by the extraction callback.
models = extraction_cb.raw_boosters
for i, model in enumerate(models):
    model.save_model(os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{i}.txt'))

# Record the run parameters plus the training column order for inference.
di = {**(Parameters.as_dict()), 'cols_train': cols_train}
# Context manager guarantees the handle is closed even if json.dump raises
# (the original open()/close() pair leaked it on error).
with open(os.path.join(ABEJA_TRAINING_RESULT_DIR, 'lgb_env.json'), 'w') as lgb_env:
    json.dump(di, lgb_env)
writer.close()

# In[ ]:


def handler(context):
    """No-op training handler: only reports completion to stdout."""
    message = "finish."
    print(message)


# In[ ]:

# NOTE(review): presumably exports the hyperparameters in environment-variable
# form (name suggests so; `as_env` is defined elsewhere) — confirm against
# the Parameters class. Module-level side effect kept from the notebook export.
parameters = Parameters.as_env()
def handler(context):
    """ABEJA training handler: manual stratified K-fold training loop.

    Trains one classifier per fold, logs per-fold and aggregate metrics to
    TensorBoard / the statistics sink, pickles each fold model, and writes
    the run environment to ``skf_env.json`` under ABEJA_TRAINING_RESULT_DIR.

    Parameters
    ----------
    context : object
        Training context passed in by the ABEJA platform (logged only).
    """
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    X_train, y_train, cols_train = train_data_loader(DATALAKE_CHANNEL_ID,
                                                     DATALAKE_TRAIN_FILE_ID,
                                                     LABEL_FIELD, INPUT_FIELDS)
    models = []
    # Out-of-fold predictions over the full training set.
    pred = np.zeros(len(X_train))

    if DATALAKE_VAL_FILE_ID:
        X_val, y_val, _ = train_data_loader(DATALAKE_CHANNEL_ID,
                                            DATALAKE_VAL_FILE_ID, LABEL_FIELD,
                                            INPUT_FIELDS)
        # Multi-class: accumulate one-hot votes per class; otherwise
        # accumulate raw predictions to be averaged after the loop.
        if IS_MULTI:
            pred_val = np.zeros((len(X_val), NUM_CLASS))
        else:
            pred_val = np.zeros(len(X_val))
    else:
        X_val, y_val, pred_val = None, None, None

    for i, (train_index, valid_index) in enumerate(skf.split(X_train,
                                                             y_train)):
        model = classifier(**PARAMS)
        model.fit(X_train.iloc[train_index], y_train[train_index])
        pred[valid_index] = model.predict(X_train.iloc[valid_index])

        score, loss = evaluator(y_train[valid_index], pred[valid_index])
        score_val = 0.0
        loss_val = 0.0

        filename = os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{i}.pkl')
        # Context manager closes the handle even if pickling raises
        # (the original `pickle.dump(model, open(...))` leaked the file).
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        models.append(model)

        if DATALAKE_VAL_FILE_ID:
            pred_val_cv = model.predict(X_val)
            if IS_MULTI:
                # Vote: one-hot encode this fold's class predictions.
                pred_val += np.identity(NUM_CLASS)[pred_val_cv]
            else:
                pred_val += pred_val_cv
            score_val, loss_val = evaluator(y_val, pred_val_cv)

        print('-------------')
        print(
            'cv {} || score:{:.4f} || loss:{:.4f} || val_score:{:.4f} || val_loss:{:.4f}'
            .format(i + 1, score, loss, score_val, loss_val))
        writer.add_scalar('main/acc', score, i + 1)
        writer.add_scalar('main/loss', loss, i + 1)
        writer.add_scalar('test/acc', score_val, i + 1)
        writer.add_scalar('test/loss', loss_val, i + 1)
        statistics(i + 1, loss, score, loss_val, score_val)
        writer.flush()

    # Aggregate out-of-fold metrics over the whole training set.
    score, loss = evaluator(y_train, pred)
    score_val = 0.0
    loss_val = 0.0

    if DATALAKE_VAL_FILE_ID:
        if IS_MULTI:
            # Final class = argmax of the accumulated one-hot votes.
            pred_val = np.argmax(pred_val, axis=1)
        else:
            # Ensemble by averaging the per-fold predictions.
            pred_val /= len(models)
        score_val, loss_val = evaluator(y_val, pred_val)

    print('-------------')
    print(
        'cv total score:{:.4f} || cv total loss:{:.4f} || cv total val_score:{:.4f} || cv total val_loss:{:.4f}'
        .format(score, loss, score_val, loss_val))
    statistics(Parameters.NFOLD, None, score, None, score_val)
    writer.add_scalar('main/acc', score, Parameters.NFOLD)
    writer.add_scalar('main/loss', loss, Parameters.NFOLD)
    writer.add_scalar('test/acc', score_val, Parameters.NFOLD)
    writer.add_scalar('test/loss', loss_val, Parameters.NFOLD)
    writer.close()

    # Persist run parameters plus the training column order for inference.
    di = {**(Parameters.as_dict()), 'cols_train': cols_train}
    # Context manager guarantees the handle is closed even if json.dump
    # raises (the original open()/close() pair leaked it on error).
    with open(os.path.join(ABEJA_TRAINING_RESULT_DIR, 'skf_env.json'),
              'w') as skf_env:
        json.dump(di, skf_env)
    return