Example #1
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder


def train_logistic_model():
    X_train, y_train = load_processed_data(pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')
    # One-hot encode the categorical features; ignore categories
    # unseen at fit time.
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)
    # After one-hot encoding, every feature is binary (0/1).

    params = {
        'penalty':'l2',
        'C':100.0,
        'class_weight':'balanced',
        'solver':'saga',
        'max_iter':500,
        'verbose':1,
        'n_jobs':-1
    }
    lr = Pipeline([
        ('scaler', Normalizer()),  # scale each row to unit L2 norm
        ('lr', LogisticRegression(**params))
    ])
    lr.fit(X_train, y_train)

    # Predicted probability of the positive (click) class.
    y_pred = lr.predict_proba(X_val)[:, 1]
    
    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(lr, pathify('models', 'avazu-lr.pickle'))
    return lr
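The metric helpers cal_auc and cal_logloss are project-local; a minimal sketch of what they plausibly wrap, assuming thin wrappers around sklearn.metrics (hypothetical, not the project's confirmed implementation):

# Hypothetical sketch of the project-local metric helpers, assuming
# they are thin wrappers around sklearn.metrics.
from sklearn.metrics import log_loss, roc_auc_score


def cal_auc(y_true, y_pred):
    # Area under the ROC curve of the predicted click probabilities.
    return roc_auc_score(y_true, y_pred)


def cal_logloss(y_true, y_pred):
    # Binary cross-entropy between labels and predicted probabilities.
    return log_loss(y_true, y_pred)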
Example #2
import numpy as np
from fastFM import mcmc
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder


def train_fm_model():
    X_train, y_train = load_processed_data(pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')
    
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)

    # fastFM expects sparse CSR input and labels in {-1, +1}.
    X_train = csr_matrix(X_train)
    X_val = csr_matrix(X_val)
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    # With the MCMC solver, training and prediction happen in one pass;
    # fit_predict_proba averages predictions over the posterior samples.
    fm = mcmc.FMClassification(n_iter=50, init_stdev=0.1, random_state=123, rank=2)
    y_pred = fm.fit_predict_proba(X_train, y_train, X_val)

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(fm, pathify('models', 'avazu-fm.pickle'))
    return fm
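Because the MCMC solver has no standalone predict step, the pickled model cannot score new data later the way the other models can. A hedged alternative sketch using fastFM's SGD solver, which does support a separate fit/predict_proba cycle (the hyperparameters here are illustrative, not tuned values from this project):

# Alternative sketch with fastFM's SGD solver; labels must already be
# in {-1, +1} and X must be sparse CSR, as above.
from fastFM import sgd

fm = sgd.FMClassification(n_iter=1000, init_stdev=0.1, rank=2,
                          l2_reg_w=0.1, l2_reg_V=0.1, step_size=0.01)
fm.fit(X_train, y_train)
y_pred = fm.predict_proba(X_val)  # probability of the +1 class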
Example #3
def split_for_validation(train_filename, is_debug):
    # Hold out the last day of the training data (Oct 30, hour prefix
    # '141030') as the validation set.
    date_val = '141030'
    fields = 'id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,device_id_count,device_ip_count,user_id_count,hour_count\n'
    cv_train_path = 'data/interim/avazu-train.csv'
    cv_val_path = 'data/interim/avazu-val.csv'

    with open(cv_train_path, 'w') as train_file:
        train_file.write(fields)
    with open(cv_val_path, 'w') as val_file:
        val_file.write(fields)

    with open(train_filename) as csv_file, \
            open(cv_train_path, 'a') as train_file, \
            open(cv_val_path, 'a') as val_file:
        for i, line in enumerate(csv_file):
            if i == 0:  # skip the header row
                continue
            if is_debug:
                val_file.write(line)
                train_file.write(line)
            else:
                # The 'hour' field (column 2) is formatted YYMMDDHH;
                # dropping the last two characters leaves the date.
                if line.split(',')[2][:-2] == date_val:
                    val_file.write(line)
                else:
                    train_file.write(line)

            if is_million(i):
                log.info('Split {} mil.rows'.format(i + 1))
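is_million is another project helper used for progress logging throughout; a plausible sketch, assuming it fires once per million rows (hypothetical, not confirmed from this codebase):

# Hypothetical sketch of is_million: true once every million rows,
# where i is the 0-based row index.
def is_million(i):
    return (i + 1) % 1000000 == 0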
Example #4
import xgboost as xgb


def train_gradientboosting_model():
    x_train, y_train = load_processed_data(pathify('data', 'processed',
                                                   'avazu-cv-train.csv'),
                                           label_col='click')
    x_val, y_val = load_processed_data(pathify('data', 'processed',
                                               'avazu-cv-val.csv'),
                                       label_col='click')

    params = {
        'learning_rate': 0.1,
        'colsample_bytree': 0.8,
        'n_estimators': 100,
        'gamma': 1,
        'max_depth': 6,
        'lambda': 1,  # L2 regularization term (alias of reg_lambda)
        'min_child_weight': 5
    }

    gb = xgb.XGBClassifier(**params)
    # Note: passing eval_metric to fit() is the pre-1.6 xgboost API;
    # newer versions take it in the constructor.
    gb.fit(x_train,
           y_train,
           eval_metric='auc',
           verbose=True,
           eval_set=[(x_val, y_val)])
    y_pred = gb.predict_proba(x_val)[:, 1]

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))

    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(gb, pathify('models', 'avazu-gb.pickle'))
    return gb
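Each trainer persists its fitted model with save_pickle; a minimal scoring sketch, assuming the load_pickle counterpart used in make_features below and an already-preprocessed feature matrix X_test (a hypothetical name):

# Hypothetical scoring sketch: reload the pickled model and predict
# click probabilities for an already-preprocessed matrix X_test.
gb = load_pickle(pathify('models', 'avazu-gb.pickle'))
y_test_pred = gb.predict_proba(X_test)[:, 1]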
Example #5
from collections import defaultdict


def prepare_count_features(path_to_file):
    # Frequency tables, keyed by the raw value of each feature.
    count_features = {}
    count_features['device_id_count'] = defaultdict(int)
    count_features['device_ip_count'] = defaultdict(int)
    count_features['user_id_count'] = defaultdict(int)
    count_features['hour_count'] = defaultdict(int)

    for i, row in iter_as_dict(path_to_file):
        count_features['device_id_count'][row['device_id']] += 1
        count_features['device_ip_count'][row['device_ip']] += 1
        count_features['user_id_count'][make_userid_from_row(row)] += 1
        count_features['hour_count'][make_hour_from_row(row)] += 1
        if is_million(i):
            log.info('Count {} mil.rows in {}'.format(i + 1, path_to_file))
    return count_features
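iter_as_dict yields (index, row) pairs over a CSV file; a minimal sketch, assuming it wraps csv.DictReader (hypothetical, not the project's confirmed implementation):

# Hypothetical sketch of iter_as_dict: enumerate the rows of a CSV
# file as dicts keyed by the header row.
import csv


def iter_as_dict(path_to_file):
    with open(path_to_file) as csv_file:
        for i, row in enumerate(csv.DictReader(csv_file)):
            yield i, row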
Example #6
import csv


def preprocess(input_path, output_path, feature_names, label_name,
               num_categories):
    fields = [label_name] + feature_names
    with open(output_path, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fields)
        writer.writeheader()
        for i, row in iter_as_dict(input_path):
            if is_million(i):
                log.info('Preprocessed {} mil.rows'.format(i + 1))
            hashed_features = {label_name: row[label_name]}
            for feature in feature_names:
                # Hash each 'feature-value' pair into one of
                # num_categories buckets (the hashing trick).
                str_to_hash = '{}-{}'.format(feature, row[feature])
                hashed_features[feature] = categorize_by_hash(
                    str_to_hash, num_categories)
            writer.writerow(hashed_features)
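categorize_by_hash is the project's hashing-trick primitive; a minimal sketch, assuming a stable md5-based hash (Python's built-in hash() is salted per process, so it would not give reproducible buckets):

# Hypothetical sketch of categorize_by_hash: map an arbitrary string
# to a stable bucket id in [0, num_categories).
import hashlib


def categorize_by_hash(str_to_hash, num_categories):
    digest = hashlib.md5(str_to_hash.encode('utf-8')).hexdigest()
    return int(digest, 16) % num_categories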
Example #7
import csv


def make_features(input_file, output_file, mode):
    count_filename = pathify('data', 'interim',
                             'avazu-cv-train-count-features.pickle')
    # Count features are computed on the training split only and reused
    # for the validation and test splits to avoid leakage.
    if mode in ['test', 'val']:
        count_features = load_pickle(count_filename)
    else:
        count_features = prepare_count_features(input_file)
        save_pickle(count_features, count_filename)

    fields = make_output_headers() + list(count_features.keys())
    with open(output_file, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fields)
        writer.writeheader()
        for i, row in iter_as_dict(input_file):
            if is_million(i):
                log.info('Write {} mil.rows to {}'.format(i + 1, output_file))
            row_to_write = add_count_features_to_row(row, count_features)
            row_to_write['hour'] = make_hour_from_row(row)
            if mode == 'test':
                # The test split has no labels; use -1 as a placeholder.
                row_to_write['click'] = -1
            writer.writerow(row_to_write)
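The row helpers make_userid_from_row, make_hour_from_row, and add_count_features_to_row are project-local; plausible sketches under stated assumptions (the user-id and hour rules below follow common Avazu conventions and are not confirmed from this codebase):

# Hypothetical sketches of the row helpers used above.
def make_userid_from_row(row):
    # Assumption: a pseudo user id composed from device fields.
    return '{}-{}'.format(row['device_id'], row['device_ip'])


def make_hour_from_row(row):
    # Assumption: the raw 'hour' field is YYMMDDHH; keep only HH.
    return row['hour'][-2:]


def add_count_features_to_row(row, count_features):
    # Attach the precomputed frequency of each key to the row.
    row = dict(row)
    row['device_id_count'] = count_features['device_id_count'][row['device_id']]
    row['device_ip_count'] = count_features['device_ip_count'][row['device_ip']]
    row['user_id_count'] = count_features['user_id_count'][make_userid_from_row(row)]
    row['hour_count'] = count_features['hour_count'][make_hour_from_row(row)]
    return row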