Example 1
def train_logistic_model():
    X_train, y_train = load_processed_data(pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)
    # After one-hot encoding, all features are binary (0/1).

    params = {
        'penalty':'l2',
        'C':100.0,
        'class_weight':'balanced',
        'solver':'saga',
        'max_iter':500,
        'verbose':1,
        'n_jobs':-1
    }
    lr = Pipeline([
        ('scaler', Normalizer()),
        ('lr', LogisticRegression(**params))
    ])
    lr.fit(X_train, y_train)

    y_pred = lr.predict_proba(X_val)[:, 1]
    
    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(lr, pathify('models', 'avazu-lr.pickle'))
    return lr
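A minimal sketch of the metric helpers used above, assuming cal_auc and cal_logloss are thin wrappers around sklearn.metrics (the project's actual helpers may differ):

from sklearn.metrics import log_loss, roc_auc_score


def cal_auc(y_true, y_pred_proba):
    # AUC of the ROC curve, computed on the positive-class probabilities.
    return roc_auc_score(y_true, y_pred_proba)


def cal_logloss(y_true, y_pred_proba):
    # Binary cross-entropy (log loss) on the positive-class probabilities.
    return log_loss(y_true, y_pred_proba)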
Example 2
def train_fm_model():
    X_train, y_train = load_processed_data(pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')
    
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)


    X_train = csr_matrix(X_train)
    X_val = csr_matrix(X_val)
    # fastFM's MCMC classifier expects labels in {-1, +1}.
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    fm = mcmc.FMClassification(n_iter=50, init_stdev=0.1, random_state=123, rank=2)
    y_pred = fm.fit_predict_proba(X_train, y_train, X_val)

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(fm, pathify('models', 'avazu-fm.pickle'))
    return fm
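load_processed_data is another project helper that is not shown here. A plausible sketch, assuming the processed CSVs are flat tables with the label stored in label_col (hypothetical, for illustration only):

import pandas as pd


def load_processed_data(csv_path, label_col):
    # Hypothetical helper: read a processed CSV and split it into a feature
    # frame X and a label series y.
    df = pd.read_csv(csv_path)
    y = df[label_col]
    X = df.drop(columns=[label_col])
    return X, y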
Example 3
def train_ftrl_model():
    X_train, y_train = load_processed_data(pathify('data', 'processed',
                                                   'avazu-cv-train.csv'),
                                           label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed',
                                               'avazu-cv-val.csv'),
                                       label_col='click')

    params = {
        'alpha': 0.1,  # learning rate
        'beta': 1,  # smoothing parameter for the adaptive learning rate
        'L1': 1,  # L1 regularization; larger values mean stronger regularization
        'L2': 1,  # L2 regularization; larger values mean stronger regularization
        'num_categories': 2**16,  # must match the value used in make_features.py
    }
    ftrl = ftrl_proximal(**params)
    ftrl.fit(X_train, y_train, X_val, y_val)

    y_pred = []
    for x_val in X_val.values:
        p = ftrl.predict(x_val)
        y_pred.append(p)
    y_pred = np.array(y_pred)
    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(ftrl, pathify('models', 'avazu-ftrl.pickle'))
    return ftrl
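ftrl_proximal is a custom class in the project. The alpha, beta, L1 and L2 parameters above correspond to the standard per-coordinate FTRL-proximal update (McMahan et al.); a simplified sketch of that update, not the project's actual implementation:

import numpy as np


class FTRLProximalSketch:
    """Simplified FTRL-proximal logistic regression for hashed binary features.
    Illustrative only; the project's ftrl_proximal class may differ."""

    def __init__(self, alpha, beta, L1, L2, num_categories):
        self.alpha, self.beta, self.L1, self.L2 = alpha, beta, L1, L2
        self.z = np.zeros(num_categories)  # accumulated adjusted gradients
        self.n = np.zeros(num_categories)  # accumulated squared gradients
        self.w = np.zeros(num_categories)  # lazily recomputed weights

    def predict(self, idx):
        # idx: indices of the hashed categorical features that are active (value 1).
        for i in idx:
            if abs(self.z[i]) <= self.L1:
                self.w[i] = 0.0
            else:
                sign = 1.0 if self.z[i] > 0 else -1.0
                self.w[i] = -(self.z[i] - sign * self.L1) / (
                    (self.beta + np.sqrt(self.n[i])) / self.alpha + self.L2)
        wx = self.w[idx].sum()
        return 1.0 / (1.0 + np.exp(-wx))

    def update(self, idx, p, y):
        # Call after predict(); p is the predicted probability, y is 0 or 1.
        g = p - y  # log-loss gradient for each active binary feature
        for i in idx:
            sigma = (np.sqrt(self.n[i] + g * g) - np.sqrt(self.n[i])) / self.alpha
            self.z[i] += g - sigma * self.w[i]
            self.n[i] += g * g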
Example 4
def train_gradientboosting_model():
    x_train, y_train = load_processed_data(pathify('data', 'processed',
                                                   'avazu-cv-train.csv'),
                                           label_col='click')
    x_val, y_val = load_processed_data(pathify('data', 'processed',
                                               'avazu-cv-val.csv'),
                                       label_col='click')

    params = {
        'learning_rate': 0.1,
        'colsample_bytree': 0.8,
        'n_estimators': 100,
        'gamma': 1,
        'max_depth': 6,
        'lambda': 1,
        'min_child_weight': 5
    }

    gb = xgb.XGBClassifier(**params)
    gb.fit(x_train,
           y_train,
           eval_metric='auc',
           verbose=True,
           eval_set=[(x_val, y_val)])
    y_pred = gb.predict_proba(x_val)[:, 1]

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(gb, pathify('models', 'avazu-gb.pickle'))
    return gb
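Because each trained model is pickled under models/, scoring held-out data later only requires reloading it. A short sketch, assuming load_pickle is the counterpart of the save_pickle helper used above:

# Illustrative only: reload the pickled booster and score a held-out set.
gb = load_pickle(pathify('models', 'avazu-gb.pickle'))
x_val, y_val = load_processed_data(
    pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')
y_pred = gb.predict_proba(x_val)[:, 1]
log.info("auc_score: {:.4f}".format(cal_auc(y_val, y_pred)))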
Example 5
def make_features(input_file, output_file, mode):
    count_filename = pathify('data', 'interim',
                             'avazu-cv-train-count-features.pickle')
    if mode in ['test', 'val']:
        count_features = load_pickle(count_filename)
    else:
        count_features = prepare_count_features(input_file)
        save_pickle(count_features, count_filename)

    fields = make_output_headers() + list(count_features.keys())
    with open(output_file, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fields)
        writer.writeheader()
        for i, row in iter_as_dict(input_file):
            if is_million(i):
                log.info('Wrote {} rows to {}'.format(i + 1, output_file))
            row_to_write = add_count_features_to_row(row, count_features)
            row_to_write['hour'] = make_hour_from_row(row)
            if mode == 'test':
                row_to_write['click'] = -1
            writer.writerow(row_to_write)
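iter_as_dict and is_million are small helpers from the project; plausible sketches of what they might do (assumptions, the real implementations may differ):

import csv


def iter_as_dict(csv_path):
    # Hypothetical helper: yield (row_index, row_dict) pairs from a CSV file.
    with open(csv_path) as f:
        for i, row in enumerate(csv.DictReader(f)):
            yield i, row


def is_million(i):
    # Hypothetical helper: True once per million rows, used for progress logging.
    return (i + 1) % 1_000_000 == 0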
Example 6
def make(is_debug=False):
    csv_folder = pathify('data', 'raw', 'avazu')
    if is_debug:
        csv_folder = pathify(csv_folder, 'sample')

    split_for_validation(pathify(csv_folder, 'train'), is_debug)

    make_features(input_file=pathify('data', 'interim', 'avazu-train.csv'),
                  output_file=pathify('data', 'interim',
                                      'avazu-train-feature.csv'),
                  mode='train')

    make_features(input_file=pathify('data', 'interim', 'avazu-val.csv'),
                  output_file=pathify('data', 'interim',
                                      'avazu-val-feature.csv'),
                  mode='val')

    feature_names = (
        'C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,'
        'app_category,device_id,device_ip,device_model,device_type,'
        'device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,'
        'device_id_count,device_ip_count,user_id_count,hour_count').split(',')
    label_name = 'click'
    preprocess(input_path=pathify('data', 'interim',
                                  'avazu-train-feature.csv'),
               output_path=pathify('data', 'processed', 'avazu-cv-train.csv'),
               feature_names=feature_names,
               label_name=label_name,
               num_categories=2**16)
    preprocess(input_path=pathify('data', 'interim', 'avazu-val-feature.csv'),
               output_path=pathify('data', 'processed', 'avazu-cv-val.csv'),
               feature_names=feature_names,
               label_name=label_name,
               num_categories=2**16)
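preprocess maps every categorical value into a fixed number of integer buckets, which is why num_categories=2**16 here must match the value assumed in the FTRL example. A minimal sketch of the hashing-trick idea (the project's actual preprocess may use a different hash or encoding):

import hashlib


def hash_feature(field_name, value, num_categories=2**16):
    # Hashing trick: map a categorical (field, value) pair to one of
    # num_categories buckets. A stable hash keeps train and validation rows
    # on the same indices across runs; both sets must share num_categories.
    key = '{}={}'.format(field_name, value).encode('utf-8')
    return int(hashlib.md5(key).hexdigest(), 16) % num_categories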
Example 7
# -*- coding: utf-8 -*-
from ajctr.helpers import timing, log, mkdir, pathify
from ajctr.data import make_dataset
from ajctr.features import make_features
from ajctr.models import train_model

mkdir(pathify('data', 'interim'))
mkdir(pathify('data', 'processed'))


@timing
def main():
    make_dataset.make(is_debug=False)
    make_features.make(is_debug=False)
    train_model.train()


if __name__ == '__main__':
    main()