def train_xgboost(X, y, w, **options):
    """Train one XGBoost model on a random holdout split and collect quality stats.

    Parameters:
        X, y, w: features, binary labels and per-sample weights (index-aligned).
        **options: passed through to ``xgb.train`` as the parameter dict; also
            consulted for ``test_size`` (default 0.01), ``seed`` (default 31)
            and the required ``num_round``.

    Returns:
        (model, progress, quality): the trained booster, the per-round eval
        history filled in by ``evals_result``, and a nested metrics dict.
    """
    # NOTE(review): `or` defaults mean an explicit falsy value (0, 0.0) would be
    # silently replaced by the default — presumably never intended here.
    test_size = options.get('test_size') or 0.01
    random_state = options.get('seed') or 31
    # Splitting range(len(y)) alongside the data keeps the original row indices
    # of each partition, used below to report worst-error examples.
    X_train, X_valid, y_train, y_valid, w_train, w_valid, idx_train, idx_valid = \
        train_test_split(X, y, w, range(len(y)), test_size=test_size, random_state=random_state)
    logging.info('Train size: %s', X_train.shape)
    logging.info('Test size: %s', X_valid.shape)
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)
    eval_list = [(dtrain, 'train'), (dvalid, 'valid')]
    progress = dict()  # populated in place by xgb.train via evals_result
    model = xgb.train(options, dtrain, options['num_round'], eval_list, evals_result=progress)
    p_train = model.predict(dtrain)
    p_valid = model.predict(dvalid)
    quality = dict(
        # Weighted log-loss / AUC on both partitions.
        ll=dict(
            train=log_loss(y_train, p_train, sample_weight=w_train),
            valid=log_loss(y_valid, p_valid, sample_weight=w_valid)
        ),
        auc=dict(
            train=roc_auc_score(y_train, p_train, sample_weight=w_train),
            valid=roc_auc_score(y_valid, p_valid, sample_weight=w_valid)
        ),
        # NOTE(review): roc curves are computed UNweighted while ll/auc use
        # sample weights — confirm this asymmetry is intentional.
        roc=dict(
            train=dict(zip(['fpr', 'tpr', 't'], roc_curve(y_train, p_train))),
            valid=dict(zip(['fpr', 'tpr', 't'], roc_curve(y_valid, p_valid)))
        ),
        # Calibration curve: mean label vs mean prediction over 50 bins.
        reliability=dict(
            train=dict(zip(['avg_label', 'avg_pred'], reliability_curve(y_train, p_train, nbins=50, sample_weights=w_train))),
            valid=dict(zip(['avg_label', 'avg_pred'], reliability_curve(y_valid, p_valid, nbins=50, sample_weights=w_valid)))
        )
    )
    logging.info('Log-loss: %s', quality['ll'])
    # Worst errors for manual inspection, reported as (score, original row index):
    #   type_i  — negatives (y == 0) with the HIGHEST predicted probability,
    #   type_ii — positives (y == 1) with the LOWEST predicted probability.
    quality['errors'] = dict(
        train=dict(
            type_i=heapq.nlargest(100, [(p, idx_train[j]) for j, p in enumerate(p_train) if y_train[j] == 0]),
            type_ii=heapq.nsmallest(100, [(p, idx_train[j]) for j, p in enumerate(p_train) if y_train[j] == 1])
        ),
        valid=dict(
            type_i=heapq.nlargest(100, [(p, idx_valid[j]) for j, p in enumerate(p_valid) if y_valid[j] == 0]),
            type_ii=heapq.nsmallest(100, [(p, idx_valid[j]) for j, p in enumerate(p_valid) if y_valid[j] == 1])
        )
    )
    return model, progress, quality
def train_ff(X, y, skf, **options):
    """Cross-validate a feed-forward Keras network and collect fold metrics.

    Parameters:
        X, y: features and binary labels (index-aligned).
        skf: a splitter exposing ``split(X, y)`` (e.g. StratifiedKFold).
        **options: ``layers`` and ``activations`` (equal-length sequences),
            ``method`` (optimizer, default 'adam'), ``epochs`` (default 10),
            ``batch_size`` (default 100), ``dump_dir`` (model cache dir,
            default '.').

    Returns:
        (quality, predictions): per-fold metrics dict and an array of
        out-of-fold logit-transformed validation predictions.
    """
    quality = dict(folds=[], full=dict())
    predictions = np.zeros(len(y))
    input_dim = X.shape[1]
    dump_dir = options.get('dump_dir') or '.'
    layers = options.get('layers')
    activations = options.get('activations')
    assert len(layers) == len(activations)
    method = options.get('method', 'adam')
    epochs = options.get('epochs', 10)
    batch_size = options.get('batch_size', 100)
    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        logging.info('Cross-validation fold: %d', i)
        X_train = X[train_idx]
        y_train = y[train_idx]
        dump_file = join_path(dump_dir, 'model_%d.pkl' % i)
        # Best-effort cache: load a previously trained fold model, otherwise train.
        try:
            logging.info('Loading model for fold %d', i)
            f = load_model(dump_file)
        except Exception:  # was bare `except:` — don't swallow SystemExit/KeyboardInterrupt
            logging.info('Training model on fold %d', i)
            logging.info('Input dimensions: %d', input_dim)
            f = Sequential()
            f.add(Dense(layers[0], activation=activations[0], input_dim=input_dim))
            # BUG FIX: original used activations[layer] while enumerating
            # layers[1:] from 0, which re-applied activations[0] to the second
            # layer and never used the last activation. Enumerate from 1 so
            # layer k pairs with activations[k].
            for layer, layer_size in enumerate(layers[1:], start=1):
                f.add(Dense(layer_size, activation=activations[layer]))
            f.add(Dense(1, activation='sigmoid'))
            f.compile(loss='binary_crossentropy', optimizer=method, metrics=['accuracy'])
            f.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
            logging.info('Writing model dump')
            save_model(f, dump_file)
        # Train-partition metrics.
        p_train = f.predict_proba(X_train).flatten()
        ll_train = log_loss(y_train, p_train)
        auc_train = roc_auc_score(y_train, p_train)
        logging.info('Train LL=%s AUC=%s', ll_train, auc_train)
        fpr_train, tpr_train, _ = roc_curve(y_train, p_train, pos_label=1)
        y_avg_train, p_avg_train = reliability_curve(y_train, p_train, nbins=50)
        # Validation-partition metrics.
        X_valid = X[valid_idx]
        y_valid = y[valid_idx]
        p_valid = f.predict_proba(X_valid).flatten()
        ll_valid = log_loss(y_valid, p_valid)
        auc_valid = roc_auc_score(y_valid, p_valid)
        logging.info('Validation LL=%s AUC=%s', ll_valid, auc_valid)
        fpr_valid, tpr_valid, _ = roc_curve(y_valid, p_valid, pos_label=1)
        y_avg_valid, p_avg_valid = reliability_curve(y_valid, p_valid, nbins=50)
        # Out-of-fold predictions are stored on the logit scale (stacking input).
        predictions[valid_idx] = logit(p_valid)
        quality['folds'].append(
            dict(fold=i,
                 dump=dump_file,
                 ll=dict(train=ll_train, valid=ll_valid),
                 auc=dict(train=auc_train, valid=auc_valid),
                 roc=dict(train=dict(fpr=fpr_train, tpr=tpr_train),
                          valid=dict(fpr=fpr_valid, tpr=tpr_valid)),
                 reliability=dict(train=dict(avg_label=y_avg_train, avg_pred=p_avg_train),
                                  valid=dict(avg_label=y_avg_valid, avg_pred=p_avg_valid))))
    return quality, predictions
def train_xgb(X, y, skf, **options):
    """Cross-validate an XGBoost classifier and collect fold metrics.

    Parameters:
        X, y: features and binary labels (index-aligned).
        skf: a splitter exposing ``split(X, y)`` (e.g. StratifiedKFold).
        **options: passed through to ``xgb.train`` as the parameter dict; also
            consulted for the required ``num_round`` and for ``dump_dir``
            (model cache dir, default '.').

    Returns:
        (quality, predictions): per-fold metrics dict and an array of
        out-of-fold logit-transformed validation predictions.
    """
    quality = dict(folds=[], full=dict())
    predictions = np.zeros(len(y))
    dump_dir = options.get('dump_dir') or '.'
    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        logging.info('Cross-validation fold: %d', i)
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_valid = X[valid_idx]
        y_valid = y[valid_idx]
        # BUG FIX: DMatrix construction moved out of the except branch —
        # Booster.predict below needs DMatrix inputs on BOTH the cache-hit
        # and the freshly-trained path (originally dtrain/dvalid were
        # undefined after a successful load, and predict() was called with
        # raw arrays, which Booster does not accept).
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        dump_file = join_path(dump_dir, 'model_%d.bin' % i)
        # Best-effort cache: load a previously trained fold model, otherwise train.
        try:
            logging.info('Loading model for fold %d', i)
            f = xgb.Booster({'nthread': 4})
            f.load_model(dump_file)
        except Exception:  # was bare `except:` — don't swallow SystemExit/KeyboardInterrupt
            logging.info('Training model on fold %d', i)
            eval_list = [(dtrain, 'train'), (dvalid, 'valid')]
            f = xgb.train(options, dtrain, options['num_round'], eval_list)
            # BUG FIX: save_model is an instance method taking only the path;
            # the original `f.save_model(f, dump_file)` passed the booster as
            # the filename and would raise a TypeError.
            f.save_model(dump_file)
        # Train-partition metrics.
        p_train = f.predict(dtrain)
        ll_train = log_loss(y_train, p_train)
        auc_train = roc_auc_score(y_train, p_train)
        logging.info('Train LL=%s AUC=%s', ll_train, auc_train)
        fpr_train, tpr_train, _ = roc_curve(y_train, p_train, pos_label=1)
        y_avg_train, p_avg_train = reliability_curve(y_train, p_train, nbins=50)
        # Validation-partition metrics.
        p_valid = f.predict(dvalid)
        ll_valid = log_loss(y_valid, p_valid)
        auc_valid = roc_auc_score(y_valid, p_valid)
        logging.info('Validation LL=%s AUC=%s', ll_valid, auc_valid)
        fpr_valid, tpr_valid, _ = roc_curve(y_valid, p_valid, pos_label=1)
        y_avg_valid, p_avg_valid = reliability_curve(y_valid, p_valid, nbins=50)
        # Out-of-fold predictions are stored on the logit scale (stacking input).
        predictions[valid_idx] = logit(p_valid)
        quality['folds'].append(
            dict(fold=i,
                 dump=dump_file,
                 ll=dict(train=ll_train, valid=ll_valid),
                 auc=dict(train=auc_train, valid=auc_valid),
                 roc=dict(train=dict(fpr=fpr_train, tpr=tpr_train),
                          valid=dict(fpr=fpr_valid, tpr=tpr_valid)),
                 reliability=dict(train=dict(avg_label=y_avg_train, avg_pred=p_avg_train),
                                  valid=dict(avg_label=y_avg_valid, avg_pred=p_avg_valid))))
    return quality, predictions
def quality(labels, pred):
    """Summarize prediction quality.

    Returns a dict with the log-loss, ROC AUC, and a 100-bin reliability
    curve (each curve array converted to a plain list for serialization).
    """
    reliability = [curve.tolist() for curve in reliability_curve(labels, pred, nbins=100)]
    return {
        'll': log_loss(labels, pred),
        'auc': roc_auc_score(labels, pred),
        'reliability': reliability,
    }
def train(X, y, skf, class_weight, **options):
    """Cross-validate a logistic regression and train two full-data models.

    Parameters:
        X, y: features and binary labels (index-aligned).
        skf: a splitter exposing ``split(X, y)`` (e.g. StratifiedKFold).
        class_weight: mapping label -> weight, used only by the final
            weighted full-data model (and as sample weights for its metrics).
        **options: ``solver`` ('lbfgs'), ``penalty`` ('l2'), ``alpha``
            (inverse regularization C, 1.0), ``max_iter`` (200), ``seed``,
            ``dump_dir`` (model cache dir, default '.').

    Returns:
        (quality, predictions): metrics dict (per-fold and full-data) and an
        array of out-of-fold logit-transformed validation predictions.
    """
    quality = dict(folds=[], full=dict())
    predictions = np.zeros(len(y))
    # NOTE(review): `or` defaults replace explicit falsy values (e.g. alpha=0).
    solver = options.get('solver') or 'lbfgs'
    penalty = options.get('penalty') or 'l2'
    alpha = options.get('alpha') or 1.0
    max_iter = options.get('max_iter') or 200
    random_state = options.get('seed') or None
    dump_dir = options.get('dump_dir') or '.'
    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        X_train = X[train_idx]
        y_train = y[train_idx]
        dump_file = join_path(dump_dir, 'model_%d.pkl' % i)
        # Best-effort cache: load a previously trained fold model, otherwise train.
        # NOTE(review): fold models deliberately(?) ignore class_weight — confirm.
        try:
            logging.info('Loading model for fold %d', i)
            f = joblib.load(dump_file)
        except Exception:  # was bare `except:` — don't swallow SystemExit/KeyboardInterrupt
            logging.info('Training model on fold %d', i)
            f = LogisticRegression(solver=solver, penalty=penalty, C=alpha,
                                   max_iter=max_iter, random_state=random_state)
            f.fit(X_train, y_train)
            joblib.dump(f, dump_file)
        # Train-partition metrics (probability of the positive class).
        p_train = f.predict_proba(X_train)[:, 1]
        ll_train = log_loss(y_train, p_train)
        auc_train = roc_auc_score(y_train, p_train)
        logging.info('Train LL=%s AUC=%s', ll_train, auc_train)
        fpr_train, tpr_train, _ = roc_curve(y_train, p_train, pos_label=1)
        y_avg_train, p_avg_train = reliability_curve(y_train, p_train, nbins=50)
        # Validation-partition metrics.
        X_valid = X[valid_idx]
        y_valid = y[valid_idx]
        p_valid = f.predict_proba(X_valid)[:, 1]
        ll_valid = log_loss(y_valid, p_valid)
        auc_valid = roc_auc_score(y_valid, p_valid)
        logging.info('Validation LL=%s AUC=%s', ll_valid, auc_valid)
        fpr_valid, tpr_valid, _ = roc_curve(y_valid, p_valid, pos_label=1)
        y_avg_valid, p_avg_valid = reliability_curve(y_valid, p_valid, nbins=50)
        # Out-of-fold predictions are stored on the logit scale (stacking input).
        predictions[valid_idx] = logit(p_valid)
        quality['folds'].append(
            dict(fold=i,
                 dump=dump_file,
                 ll=dict(train=ll_train, valid=ll_valid),
                 auc=dict(train=auc_train, valid=auc_valid),
                 roc=dict(train=dict(fpr=fpr_train.tolist(), tpr=tpr_train.tolist()),
                          valid=dict(fpr=fpr_valid.tolist(), tpr=tpr_valid.tolist())),
                 reliability=dict(train=dict(y=y_avg_train.tolist(), p=p_avg_train.tolist()),
                                  valid=dict(y=y_avg_valid.tolist(), p=p_avg_valid.tolist()))))
    # Train full model
    dump_file = join_path(dump_dir, 'model_full.pkl')
    try:
        logging.info('Loading full model')
        f_full = joblib.load(dump_file)
    except Exception:  # was bare `except:`
        logging.info('Training full model')
        f_full = LogisticRegression(solver=solver, penalty=penalty, C=alpha,
                                    max_iter=max_iter, random_state=random_state)
        f_full.fit(X, y)
        joblib.dump(f_full, dump_file)
    p_full_train = f_full.predict_proba(X)[:, 1]
    ll_full_train = log_loss(y, p_full_train)
    auc_full_train = roc_auc_score(y, p_full_train)
    logging.info('Full LL=%s AUC=%s', ll_full_train, auc_full_train)
    quality['full']['unweighted'] = dict(dump=dump_file,
                                         ll=dict(train=ll_full_train),
                                         auc=dict(train=auc_full_train))
    # Train full model with estimated class weights
    dump_file = join_path(dump_dir, 'model_full_weighted.pkl')
    try:
        logging.info('Loading full weighted model')
        f_full_weighted = joblib.load(dump_file)
    except Exception:  # was bare `except:`
        logging.info('Training full weighted model')
        f_full_weighted = LogisticRegression(solver=solver, penalty=penalty, C=alpha,
                                             max_iter=max_iter, random_state=random_state,
                                             class_weight=class_weight)
        f_full_weighted.fit(X, y)
        joblib.dump(f_full_weighted, dump_file)
    p_full_train_weighted = f_full_weighted.predict_proba(X)[:, 1]
    # Per-sample weights derived from the class-weight mapping for the metrics.
    sample_weight = np.vectorize(class_weight.get)(y)
    ll_full_train_weighted = log_loss(y, p_full_train_weighted, sample_weight=sample_weight)
    auc_full_train_weighted = roc_auc_score(y, p_full_train_weighted, sample_weight=sample_weight)
    quality['full']['weighted'] = dict(dump=dump_file,
                                       ll=dict(train=ll_full_train_weighted),
                                       auc=dict(train=auc_full_train_weighted))
    return quality, predictions