Ejemplo n.º 1
0
def train_fm_model():
    """Train an MCMC factorization-machine classifier on the Avazu CV split.

    Loads the processed train/val CSVs, one-hot encodes the features,
    remaps labels from {0, 1} to {-1, +1} (the convention fastFM expects),
    logs validation AUC and log-loss, pickles the model, and returns it.
    """
    X_train, y_train = load_processed_data(pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    # Fit the encoder on train only; unseen validation categories are ignored.
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = csr_matrix(encoder.transform(X_train))
    X_val = csr_matrix(encoder.transform(X_val))

    # fastFM wants labels in {-1, +1}, so rewrite the zeros in place first.
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    fm = mcmc.FMClassification(n_iter=50, init_stdev=0.1, random_state=123, rank=2)
    # MCMC variant fits and predicts in a single pass over train + val.
    y_pred = fm.fit_predict_proba(X_train, y_train, X_val)

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(fm, pathify('models', 'avazu-fm.pickle'))
    return fm
Ejemplo n.º 2
0
def train_logistic_model():
    """Train a normalized logistic-regression pipeline on the Avazu CV split.

    One-hot encodes the features, fits the pipeline, logs validation AUC and
    log-loss, pickles the fitted pipeline, and returns it.
    """
    X_train, y_train = load_processed_data(pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    # Encoder is fit on train only; unknown validation categories are ignored.
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)
    # After one-hot encoding every feature is 0/1, so a Normalizer suffices.

    lr = Pipeline([
        ('scaler', Normalizer()),
        ('lr', LogisticRegression(penalty='l2',
                                  C=100.0,
                                  class_weight='balanced',
                                  solver='saga',
                                  max_iter=500,
                                  verbose=1,
                                  n_jobs=-1)),
    ])
    lr.fit(X_train, y_train)

    # Probability of the positive class (click = 1).
    y_pred = lr.predict_proba(X_val)[:, 1]

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(lr, pathify('models', 'avazu-lr.pickle'))
    return lr
Ejemplo n.º 3
0
def train_ftrl_model():
    """Train an FTRL-proximal classifier on the Avazu CV split.

    Fits the model with online updates against the validation set, then
    scores the validation split (AUC, log-loss), pickles the model, and
    returns it.
    """
    X_train, y_train = load_processed_data(pathify('data', 'processed',
                                                   'avazu-cv-train.csv'),
                                           label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed',
                                               'avazu-cv-val.csv'),
                                       label_col='click')

    params = {
        'alpha': 0.1,  # learning rate
        'beta': 1,  # smoothing parameter for adaptive learning rate
        'L1': 1,  # L1 regularization, larger value means more regularized
        'L2': 1,  # L2 regularization, larger value means more regularized
        'num_categories':
        2**16,  # make sure it is the same value with make_features.py
    }
    ftrl = ftrl_proximal(**params)
    ftrl.fit(X_train, y_train, X_val, y_val)

    # Predict row by row (the model exposes a per-example API); the previous
    # version wrapped X_val.values in a throwaway list() and grew the result
    # with append — a comprehension over the array does the same work directly.
    y_pred = np.array([ftrl.predict(x_val) for x_val in X_val.values])

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(ftrl, pathify('models', 'avazu-ftrl.pickle'))
    return ftrl
Ejemplo n.º 4
0
def train_gradientboosting_model():
    """Train an XGBoost classifier on the Avazu CV split.

    Fits with AUC-monitored evaluation on the validation set, logs
    validation AUC and log-loss, pickles the model, and returns it.
    """
    x_train, y_train = load_processed_data(pathify('data', 'processed',
                                                   'avazu-cv-train.csv'),
                                           label_col='click')
    x_val, y_val = load_processed_data(pathify('data', 'processed',
                                               'avazu-cv-val.csv'),
                                       label_col='click')

    # Hyperparameters passed straight through to XGBClassifier.
    gb = xgb.XGBClassifier(learning_rate=0.1,
                           colsample_bytree=0.8,
                           n_estimators=100,
                           gamma=1,
                           max_depth=6,
                           min_child_weight=5,
                           **{'lambda': 1})  # 'lambda' is a Python keyword

    # Monitor AUC on the validation split while boosting.
    gb.fit(x_train,
           y_train,
           eval_metric='auc',
           verbose=True,
           eval_set=[(x_val, y_val)])

    # Probability of the positive class (click = 1).
    y_pred = gb.predict_proba(x_val)[:, 1]

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(gb, pathify('models', 'avazu-gb.pickle'))
    return gb
Ejemplo n.º 5
0
def make_features(input_file, output_file, mode):
    """Write a feature CSV for *input_file* to *output_file*.

    In 'val'/'test' mode the count features precomputed from the training
    split are loaded from disk; otherwise they are built from *input_file*
    and cached for later runs. In 'test' mode the (unknown) label column
    is filled with -1.
    """
    count_filename = pathify('data', 'interim',
                             'avazu-cv-train-count-features.pickle')
    if mode in ['test', 'val']:
        # Reuse the counts derived from the training split.
        count_features = load_pickle(count_filename)
    else:
        count_features = prepare_count_features(input_file)
        save_pickle(count_features, count_filename)

    fields = make_output_headers() + list(count_features.keys())
    with open(output_file, 'w') as csv_file:
        out = csv.DictWriter(csv_file, fields)
        out.writeheader()
        for row_idx, raw_row in iter_as_dict(input_file):
            # Progress logging once per million rows.
            if is_million(row_idx):
                log.info('Write {} mil.rows to {}'.format(row_idx + 1, output_file))
            enriched = add_count_features_to_row(raw_row, count_features)
            enriched['hour'] = make_hour_from_row(raw_row)
            if mode == 'test':
                enriched['click'] = -1
            out.writerow(enriched)