def train_logistic_model():
    """Fit a logistic-regression CTR model on the Avazu CV split.

    Loads the processed train/val csv files, one-hot encodes the features,
    trains a scaled LogisticRegression pipeline, logs validation AUC and
    log-loss, pickles the fitted pipeline, and returns it.
    """
    train_x, train_y = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    val_x, val_y = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    onehot = OneHotEncoder(handle_unknown='ignore').fit(train_x)
    train_x = onehot.transform(train_x)
    val_x = onehot.transform(val_x)

    # After one-hot encoding every feature is 0/1, so row normalization
    # is sufficient as the scaling step.
    lr = Pipeline([
        ('scaler', Normalizer()),
        ('lr', LogisticRegression(
            penalty='l2',
            C=100.0,
            class_weight='balanced',
            solver='saga',
            max_iter=500,
            verbose=1,
            n_jobs=-1,
        )),
    ])
    lr.fit(train_x, train_y)

    val_pred = lr.predict_proba(val_x)[:, 1]
    auc_score = cal_auc(val_y, val_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(val_y, val_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(lr, pathify('models', 'avazu-lr.pickle'))
    return lr
def train_fm_model():
    """Fit an MCMC factorization-machine CTR model on the Avazu CV split.

    One-hot encodes the features, converts them to CSR sparse matrices,
    relabels the targets to {-1, +1} as required by the FM classifier,
    trains/predicts in one pass, logs validation AUC and log-loss,
    pickles the model, and returns it.
    """
    train_x, train_y = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    val_x, val_y = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    onehot = OneHotEncoder(handle_unknown='ignore').fit(train_x)
    train_x = csr_matrix(onehot.transform(train_x))
    val_x = csr_matrix(onehot.transform(val_x))

    # The FM classifier expects labels in {-1, +1}, not {0, 1}.
    train_y[train_y == 0] = -1
    val_y[val_y == 0] = -1
    train_y = np.array(train_y)
    val_y = np.array(val_y)

    fm = mcmc.FMClassification(
        n_iter=50, init_stdev=0.1, random_state=123, rank=2)
    val_pred = fm.fit_predict_proba(train_x, train_y, val_x)

    auc_score = cal_auc(val_y, val_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(val_y, val_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(fm, pathify('models', 'avazu-fm.pickle'))
    return fm
def train_ftrl_model():
    """Fit an FTRL-proximal CTR model on the Avazu CV split.

    Trains the online FTRL learner, predicts each validation row,
    logs validation AUC and log-loss, pickles the model, and returns it.
    """
    X_train, y_train = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    params = {
        'alpha': 0.1,  # learning rate
        'beta': 1,  # smoothing parameter for adaptive learning rate
        'L1': 1,  # L1 regularization, larger value means more regularized
        'L2': 1,  # L2 regularization, larger value means more regularized
        'num_categories': 2**16,  # make sure it is the same value with make_features.py
    }
    ftrl = ftrl_proximal(**params)
    ftrl.fit(X_train, y_train, X_val, y_val)

    # Predict row by row (the FTRL learner scores one example at a time).
    # Comprehension over X_val.values directly — the previous list() copy
    # was a needless materialization. Assumes X_val is a DataFrame-like
    # object exposing .values — TODO confirm against load_processed_data.
    y_pred = np.array([ftrl.predict(row) for row in X_val.values])

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(ftrl, pathify('models', 'avazu-ftrl.pickle'))
    return ftrl
def train_gradientboosting_model():
    """Fit an XGBoost gradient-boosting CTR model on the Avazu CV split.

    Trains with the validation set as an eval set (AUC metric), logs
    validation AUC and log-loss, pickles the model, and returns it.
    """
    features_train, labels_train = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    features_val, labels_val = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    gb = xgb.XGBClassifier(
        learning_rate=0.1,
        colsample_bytree=0.8,
        n_estimators=100,
        gamma=1,
        max_depth=6,
        min_child_weight=5,
        **{'lambda': 1},  # 'lambda' is a Python keyword, so pass it via dict
    )
    gb.fit(
        features_train,
        labels_train,
        eval_metric='auc',
        verbose=True,
        eval_set=[(features_val, labels_val)],
    )

    probs = gb.predict_proba(features_val)[:, 1]
    auc_score = cal_auc(labels_val, probs)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(labels_val, probs)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(gb, pathify('models', 'avazu-gb.pickle'))
    return gb
def make_features(input_file, output_file, mode):
    """Build the count-feature-augmented csv for one data split.

    For the train split, count features are computed from input_file and
    cached; for 'val'/'test' the cached train-split counts are reused so
    no information leaks from evaluation data. Each input row is written
    out with count features and an 'hour' feature added; test rows get a
    placeholder click label of -1.

    :param input_file: path of the raw/interim csv to read.
    :param output_file: path of the feature csv to write.
    :param mode: one of 'train', 'val', 'test'.
    """
    count_filename = pathify(
        'data', 'interim', 'avazu-cv-train-count-features.pickle')
    if mode in ['test', 'val']:
        count_features = load_pickle(count_filename)
    else:
        count_features = prepare_count_features(input_file)
        save_pickle(count_features, count_filename)

    fields = make_output_headers() + list(count_features.keys())
    # newline='' is required by the csv module; without it the writer
    # emits \r\r\n line endings on Windows.
    with open(output_file, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fields)
        writer.writeheader()
        # iter_as_dict appears to yield (index, row_dict) pairs — the
        # unpacking and the progress log both rely on that; confirm
        # against its definition.
        for i, row in iter_as_dict(input_file):
            if is_million(i):
                log.info('Write {} mil.rows to {}'.format(i + 1, output_file))
            row_to_write = add_count_features_to_row(row, count_features)
            row_to_write['hour'] = make_hour_from_row(row)
            if mode == 'test':
                # Test data has no ground-truth label; use -1 placeholder.
                row_to_write['click'] = -1
            writer.writerow(row_to_write)
def make(is_debug=False):
    """Run the full Avazu feature pipeline: split, featurize, preprocess.

    Splits the raw training data into train/val, builds count features
    for both splits, then preprocesses each into the final processed csv.

    :param is_debug: when True, use the small sample folder instead of
        the full raw dataset.
    """
    csv_folder = pathify('data', 'raw', 'avazu')
    if is_debug:
        csv_folder = pathify(csv_folder, 'sample')
    split_for_validation(pathify(csv_folder, 'train'), is_debug)

    make_features(
        input_file=pathify('data', 'interim', 'avazu-train.csv'),
        output_file=pathify('data', 'interim', 'avazu-train-feature.csv'),
        mode='train')
    make_features(
        input_file=pathify('data', 'interim', 'avazu-val.csv'),
        output_file=pathify('data', 'interim', 'avazu-val-feature.csv'),
        mode='val')

    feature_names = [
        'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
        'device_model', 'device_type', 'device_conn_type', 'C14', 'C15',
        'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'device_id_count',
        'device_ip_count', 'user_id_count', 'hour_count',
    ]
    label_name = 'click'

    for split in ('train', 'val'):
        preprocess(
            input_path=pathify(
                'data', 'interim', 'avazu-{}-feature.csv'.format(split)),
            output_path=pathify(
                'data', 'processed', 'avazu-cv-{}.csv'.format(split)),
            feature_names=feature_names,
            label_name=label_name,
            # must match the num_categories used at training time
            num_categories=2**16)
# -*- coding: utf-8 -*-
from ajctr.helpers import timing, log, mkdir, pathify
from ajctr.data import make_dataset
from ajctr.features import make_features
from ajctr.models import train_model

# Ensure the working directories exist before any pipeline step runs.
mkdir(pathify('data', 'interim'))
mkdir(pathify('data', 'processed'))


@timing
def main():
    """Run the end-to-end pipeline: dataset build, features, training."""
    for step in (
        lambda: make_dataset.make(is_debug=False),
        lambda: make_features.make(is_debug=False),
        train_model.train,
    ):
        step()


if __name__ == '__main__':
    main()