def train_xgboost(X, y, w, **options):
    """Train an XGBoost model on a single random train/validation split.

    Args:
        X, y, w: features, labels and per-sample weights (parallel).
        **options: passed through as xgboost params; also read here:
            test_size (default 0.01), seed (default 31), num_round (required).

    Returns:
        (model, progress, quality): the trained booster, the per-iteration
        eval history, and a quality report with weighted log-loss/AUC,
        ROC points, reliability curves and the 100 worst errors per split.
    """
    test_size = options.get('test_size') or 0.01
    random_state = options.get('seed') or 31

    # Split features, labels, weights and original row indices together.
    split = train_test_split(X, y, w, range(len(y)),
                             test_size=test_size,
                             random_state=random_state)
    X_train, X_valid, y_train, y_valid, w_train, w_valid, idx_train, idx_valid = split

    logging.info('Train size: %s', X_train.shape)
    logging.info('Test size: %s', X_valid.shape)

    dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)

    progress = {}
    model = xgb.train(options, dtrain, options['num_round'],
                      [(dtrain, 'train'), (dvalid, 'valid')],
                      evals_result=progress)

    p_train = model.predict(dtrain)
    p_valid = model.predict(dvalid)

    roc_keys = ['fpr', 'tpr', 't']
    rel_keys = ['avg_label', 'avg_pred']
    quality = {
        'll': {'train': log_loss(y_train, p_train, sample_weight=w_train),
               'valid': log_loss(y_valid, p_valid, sample_weight=w_valid)},
        'auc': {'train': roc_auc_score(y_train, p_train, sample_weight=w_train),
                'valid': roc_auc_score(y_valid, p_valid, sample_weight=w_valid)},
        'roc': {'train': dict(zip(roc_keys, roc_curve(y_train, p_train))),
                'valid': dict(zip(roc_keys, roc_curve(y_valid, p_valid)))},
        'reliability': {
            'train': dict(zip(rel_keys, reliability_curve(y_train, p_train, nbins=50, sample_weights=w_train))),
            'valid': dict(zip(rel_keys, reliability_curve(y_valid, p_valid, nbins=50, sample_weights=w_valid))),
        },
    }

    logging.info('Log-loss: %s', quality['ll'])

    def _worst_errors(preds, labels, row_idx):
        # type_i: highest-scored negatives; type_ii: lowest-scored positives.
        negatives = [(p, row_idx[j]) for j, p in enumerate(preds) if labels[j] == 0]
        positives = [(p, row_idx[j]) for j, p in enumerate(preds) if labels[j] == 1]
        return {'type_i': heapq.nlargest(100, negatives),
                'type_ii': heapq.nsmallest(100, positives)}

    quality['errors'] = {'train': _worst_errors(p_train, y_train, idx_train),
                         'valid': _worst_errors(p_valid, y_valid, idx_valid)}

    return model, progress, quality
def train_ff(X, y, skf, **options):
    """Cross-validated training of a feed-forward (Keras Sequential) classifier.

    One model per `skf` fold: a cached dump is loaded from `dump_dir` when
    present, otherwise a new network is built from `layers`/`activations`,
    trained, and dumped. Per-fold metrics are collected and out-of-fold
    predictions are stored in logit space.

    Args:
        X, y: features and binary labels.
        skf: splitter with a split(X, y) method yielding (train_idx, valid_idx).
        **options: dump_dir ('.'), layers, activations (parallel lists),
            method ('adam'), epochs (10), batch_size (100).

    Returns:
        (quality, predictions): per-fold quality dicts, and logit(p) for
        every row of X as predicted by the fold that held it out.
    """
    quality = dict(folds=[], full=dict())
    predictions = np.zeros(len(y))
    input_dim = X.shape[1]

    dump_dir = options.get('dump_dir') or '.'

    layers = options.get('layers')
    activations = options.get('activations')
    assert len(layers) == len(activations)

    method = options.get('method', 'adam')
    epochs = options.get('epochs', 10)
    batch_size = options.get('batch_size', 100)

    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        logging.info('Cross-validation fold: %d', i)
        X_train = X[train_idx]
        y_train = y[train_idx]

        dump_file = join_path(dump_dir, 'model_%d.pkl' % i)
        try:
            logging.info('Loading model for fold %d', i)
            f = load_model(dump_file)
        except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
            logging.info('Training model on fold %d', i)
            logging.info('Input dimensions: %d', input_dim)

            f = Sequential()
            f.add(
                Dense(layers[0],
                      activation=activations[0],
                      input_dim=input_dim))

            # BUG FIX: hidden layer k must use activations[k]. The original
            # enumerated from 0 and indexed activations[layer], re-using
            # activations[0] for the second layer and never using the last one.
            for layer, layer_size in enumerate(layers[1:], start=1):
                f.add(Dense(layer_size, activation=activations[layer]))

            f.add(Dense(1, activation='sigmoid'))

            f.compile(loss='binary_crossentropy',
                      optimizer=method,
                      metrics=['accuracy'])
            f.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

            logging.info('Writing model dump')
            save_model(f, dump_file)

        p_train = f.predict_proba(X_train).flatten()

        ll_train = log_loss(y_train, p_train)
        auc_train = roc_auc_score(y_train, p_train)

        logging.info('Train LL=%s AUC=%s', ll_train, auc_train)

        fpr_train, tpr_train, _ = roc_curve(y_train, p_train, pos_label=1)
        y_avg_train, p_avg_train = reliability_curve(y_train,
                                                     p_train,
                                                     nbins=50)

        X_valid = X[valid_idx]
        y_valid = y[valid_idx]

        p_valid = f.predict_proba(X_valid).flatten()
        ll_valid = log_loss(y_valid, p_valid)
        auc_valid = roc_auc_score(y_valid, p_valid)

        logging.info('Validation LL=%s AUC=%s', ll_valid, auc_valid)

        fpr_valid, tpr_valid, _ = roc_curve(y_valid, p_valid, pos_label=1)
        y_avg_valid, p_avg_valid = reliability_curve(y_valid,
                                                     p_valid,
                                                     nbins=50)

        # Out-of-fold predictions kept in logit space
        # (presumably consumed by a stacking layer — confirm with caller).
        predictions[valid_idx] = logit(p_valid)

        quality['folds'].append(
            dict(fold=i,
                 dump=dump_file,
                 ll=dict(train=ll_train, valid=ll_valid),
                 auc=dict(train=auc_train, valid=auc_valid),
                 roc=dict(train=dict(fpr=fpr_train, tpr=tpr_train),
                          valid=dict(fpr=fpr_valid, tpr=tpr_valid)),
                 reliability=dict(train=dict(avg_label=y_avg_train,
                                             avg_pred=p_avg_train),
                                  valid=dict(avg_label=y_avg_valid,
                                             avg_pred=p_avg_valid))))

    return quality, predictions
# Example no. 3
def train_xgb(X, y, skf, **options):
    """Cross-validated training of XGBoost classifiers.

    One booster per `skf` fold: a cached dump is loaded from `dump_dir`
    when present, otherwise a booster is trained and saved. Per-fold
    metrics are collected and out-of-fold predictions are stored in
    logit space.

    Args:
        X, y: features and binary labels.
        skf: splitter with a split(X, y) method yielding (train_idx, valid_idx).
        **options: passed through as xgboost params; also read here:
            dump_dir ('.'), num_round (required).

    Returns:
        (quality, predictions): per-fold quality dicts, and logit(p) for
        every row of X as predicted by the fold that held it out.
    """
    quality = dict(folds=[], full=dict())
    predictions = np.zeros(len(y))

    dump_dir = options.get('dump_dir') or '.'

    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        logging.info('Cross-validation fold: %d', i)
        X_train = X[train_idx]
        y_train = y[train_idx]

        X_valid = X[valid_idx]
        y_valid = y[valid_idx]

        # Built unconditionally: Booster.predict below requires DMatrix
        # input, so the matrices are needed on the load path too.
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)

        dump_file = join_path(dump_dir, 'model_%d.bin' % i)
        try:
            logging.info('Loading model for fold %d', i)
            f = xgb.Booster({'nthread': 4})
            f.load_model(dump_file)
        except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
            logging.info('Training model on fold %d', i)

            eval_list = [(dtrain, 'train'), (dvalid, 'valid')]

            f = xgb.train(options, dtrain, options['num_round'], eval_list)
            # BUG FIX: Booster.save_model takes only the file path; the
            # original called f.save_model(f, dump_file).
            f.save_model(dump_file)

        # BUG FIX: predict on the DMatrix wrappers, not the raw arrays.
        p_train = f.predict(dtrain)

        ll_train = log_loss(y_train, p_train)
        auc_train = roc_auc_score(y_train, p_train)

        logging.info('Train LL=%s AUC=%s', ll_train, auc_train)

        fpr_train, tpr_train, _ = roc_curve(y_train, p_train, pos_label=1)
        y_avg_train, p_avg_train = reliability_curve(y_train,
                                                     p_train,
                                                     nbins=50)

        p_valid = f.predict(dvalid)
        ll_valid = log_loss(y_valid, p_valid)
        auc_valid = roc_auc_score(y_valid, p_valid)

        logging.info('Validation LL=%s AUC=%s', ll_valid, auc_valid)

        fpr_valid, tpr_valid, _ = roc_curve(y_valid, p_valid, pos_label=1)
        y_avg_valid, p_avg_valid = reliability_curve(y_valid,
                                                     p_valid,
                                                     nbins=50)

        # Out-of-fold predictions kept in logit space
        # (presumably consumed by a stacking layer — confirm with caller).
        predictions[valid_idx] = logit(p_valid)

        quality['folds'].append(
            dict(fold=i,
                 dump=dump_file,
                 ll=dict(train=ll_train, valid=ll_valid),
                 auc=dict(train=auc_train, valid=auc_valid),
                 roc=dict(train=dict(fpr=fpr_train, tpr=tpr_train),
                          valid=dict(fpr=fpr_valid, tpr=tpr_valid)),
                 reliability=dict(train=dict(avg_label=y_avg_train,
                                             avg_pred=p_avg_train),
                                  valid=dict(avg_label=y_avg_valid,
                                             avg_pred=p_avg_valid))))

    return quality, predictions
def quality(labels, pred):
    """Summarize prediction quality as log-loss, ROC AUC and a 100-bin reliability curve."""
    curves = reliability_curve(labels, pred, nbins=100)
    return {
        'll': log_loss(labels, pred),
        'auc': roc_auc_score(labels, pred),
        'reliability': [curve.tolist() for curve in curves],
    }
def train(X, y, skf, class_weight, **options):
    """Cross-validated logistic regression, plus two full-data models.

    Trains one LogisticRegression per `skf` fold (cached via joblib dumps
    in `dump_dir`), then an unweighted and a class-weighted model on the
    full data. Fold models are always unweighted; `class_weight` is used
    only for the final weighted model and its weighted metrics.

    Args:
        X, y: features and binary labels.
        skf: splitter with a split(X, y) method yielding (train_idx, valid_idx).
        class_weight: dict mapping label -> weight.
        **options: solver ('lbfgs'), penalty ('l2'), alpha (C, 1.0),
            max_iter (200), seed (None), dump_dir ('.').

    Returns:
        (quality, predictions): per-fold and full-model quality dicts, and
        logit(p) for every row of X from the fold that held it out.
    """
    quality = dict(folds=[], full=dict())

    predictions = np.zeros(len(y))
    solver = options.get('solver') or 'lbfgs'
    penalty = options.get('penalty') or 'l2'
    alpha = options.get('alpha') or 1.0
    max_iter = options.get('max_iter') or 200
    # BUG FIX: `options.get('seed') or None` turned an explicit seed=0 into
    # None (non-deterministic); plain .get() preserves 0 and defaults to None.
    random_state = options.get('seed')
    dump_dir = options.get('dump_dir') or '.'

    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        X_train = X[train_idx]
        y_train = y[train_idx]

        dump_file = join_path(dump_dir, 'model_%d.pkl' % i)
        try:
            logging.info('Loading model for fold %d', i)
            f = joblib.load(dump_file)
        except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
            logging.info('Training model on fold %d', i)
            f = LogisticRegression(solver=solver,
                                   penalty=penalty,
                                   C=alpha,
                                   max_iter=max_iter,
                                   random_state=random_state)
            f.fit(X_train, y_train)
            joblib.dump(f, dump_file)

        # Column 1 of predict_proba is P(y=1).
        p_train = f.predict_proba(X_train)[:, 1]
        ll_train = log_loss(y_train, p_train)
        auc_train = roc_auc_score(y_train, p_train)

        logging.info('Train LL=%s AUC=%s', ll_train, auc_train)

        fpr_train, tpr_train, _ = roc_curve(y_train, p_train, pos_label=1)
        y_avg_train, p_avg_train = reliability_curve(y_train,
                                                     p_train,
                                                     nbins=50)

        X_valid = X[valid_idx]
        y_valid = y[valid_idx]

        p_valid = f.predict_proba(X_valid)[:, 1]
        ll_valid = log_loss(y_valid, p_valid)
        auc_valid = roc_auc_score(y_valid, p_valid)

        logging.info('Validation LL=%s AUC=%s', ll_valid, auc_valid)

        fpr_valid, tpr_valid, _ = roc_curve(y_valid, p_valid, pos_label=1)
        y_avg_valid, p_avg_valid = reliability_curve(y_valid,
                                                     p_valid,
                                                     nbins=50)

        # Out-of-fold predictions kept in logit space
        # (presumably consumed by a stacking layer — confirm with caller).
        predictions[valid_idx] = logit(p_valid)

        quality['folds'].append(
            dict(fold=i,
                 dump=dump_file,
                 ll=dict(train=ll_train, valid=ll_valid),
                 auc=dict(train=auc_train, valid=auc_valid),
                 roc=dict(train=dict(fpr=fpr_train.tolist(),
                                     tpr=tpr_train.tolist()),
                          valid=dict(fpr=fpr_valid.tolist(),
                                     tpr=tpr_valid.tolist())),
                 reliability=dict(train=dict(y=y_avg_train.tolist(),
                                             p=p_avg_train.tolist()),
                                  valid=dict(y=y_avg_valid.tolist(),
                                             p=p_avg_valid.tolist()))))

    # Train full model
    dump_file = join_path(dump_dir, 'model_full.pkl')

    try:
        logging.info('Loading full model')
        f_full = joblib.load(dump_file)
    except Exception:  # narrowed from bare except
        logging.info('Training full model')
        f_full = LogisticRegression(solver=solver,
                                    penalty=penalty,
                                    C=alpha,
                                    max_iter=max_iter,
                                    random_state=random_state)
        f_full.fit(X, y)
        joblib.dump(f_full, dump_file)

    p_full_train = f_full.predict_proba(X)[:, 1]
    ll_full_train = log_loss(y, p_full_train)
    auc_full_train = roc_auc_score(y, p_full_train)

    logging.info('Full LL=%s AUC=%s', ll_full_train, auc_full_train)

    quality['full']['unweighted'] = dict(dump=dump_file,
                                         ll=dict(train=ll_full_train),
                                         auc=dict(train=auc_full_train))

    # Train full model with estimated class weights
    dump_file = join_path(dump_dir, 'model_full_weighted.pkl')

    try:
        logging.info('Loading full weighted model')
        f_full_weighted = joblib.load(dump_file)
    except Exception:  # narrowed from bare except
        logging.info('Training full weighted model')
        f_full_weighted = LogisticRegression(solver=solver,
                                             penalty=penalty,
                                             C=alpha,
                                             max_iter=max_iter,
                                             random_state=random_state,
                                             class_weight=class_weight)
        f_full_weighted.fit(X, y)
        joblib.dump(f_full_weighted, dump_file)

    p_full_train_weighted = f_full_weighted.predict_proba(X)[:, 1]
    # Per-sample weight = class weight of each sample's label.
    sample_weight = np.vectorize(class_weight.get)(y)
    ll_full_train_weighted = log_loss(y,
                                      p_full_train_weighted,
                                      sample_weight=sample_weight)
    auc_full_train_weighted = roc_auc_score(y,
                                            p_full_train_weighted,
                                            sample_weight=sample_weight)

    quality['full']['weighted'] = dict(dump=dump_file,
                                       ll=dict(train=ll_full_train_weighted),
                                       auc=dict(train=auc_full_train_weighted))

    return quality, predictions