def train_logistic_model():
    """Train an L2 logistic-regression CTR model on the Avazu CV split.

    Loads the processed train/validation CSVs, one-hot encodes the features
    (categories unseen at fit time are ignored), fits a Normalizer +
    LogisticRegression pipeline, logs validation AUC and log-loss, pickles
    the fitted pipeline and returns it.
    """
    X_train, y_train = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    # Fit the encoder on train only; unknown validation categories map to all-zeros.
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)

    params = {
        'penalty': 'l2',
        'C': 100.0,
        'class_weight': 'balanced',
        'solver': 'saga',
        'max_iter': 500,
        'verbose': 1,
        'n_jobs': -1,
    }
    # Row-wise Normalizer is cheap here since every one-hot feature is 0/1.
    model = Pipeline([
        ('scaler', Normalizer()),
        ('lr', LogisticRegression(**params)),
    ])
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(model, pathify('models', 'avazu-lr.pickle'))
    return model
def train_fm_model():
    """Train an MCMC factorization-machine CTR model on the Avazu CV split.

    One-hot encodes the features, converts them to CSR sparse matrices,
    remaps labels from {0, 1} to {-1, 1}, fits/predicts in one MCMC pass,
    logs validation AUC and log-loss, pickles the model and returns it.
    """
    X_train, y_train = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = csr_matrix(encoder.transform(X_train))
    X_val = csr_matrix(encoder.transform(X_val))

    # Remap the 0 class to -1 (the FM classifier presumably expects {-1, 1}
    # targets — confirm against the fastFM API), then force numpy arrays.
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    fm = mcmc.FMClassification(
        n_iter=50, init_stdev=0.1, random_state=123, rank=2)
    # MCMC variant trains and predicts in a single call.
    y_pred = fm.fit_predict_proba(X_train, y_train, X_val)

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(fm, pathify('models', 'avazu-fm.pickle'))
    return fm
def split_for_validation(train_filename, is_debug):
    """Split the raw training CSV into CV train and validation files.

    Rows whose date equals 141030 (the last day in the train data) go to the
    validation file; all other rows go to the CV train file. In debug mode
    every row is written to BOTH files so each output stays populated on
    tiny inputs.

    Args:
        train_filename: path to the raw training CSV; its header row is skipped.
        is_debug: if True, duplicate every row into both outputs instead of
            splitting by date.
    """
    # Use date 30 in train data as validation data
    date_val = '141030'
    fields = 'id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,device_id_count,device_ip_count,user_id_count,hour_count\n'
    cv_train_path = 'data/interim/avazu-train.csv'
    cv_val_path = 'data/interim/avazu-val.csv'
    # Open each output exactly once ('w' truncates), write the header, then
    # stream rows — avoids the write-header/close/reopen-append round trip.
    with open(train_filename) as csv_file, \
            open(cv_train_path, 'w') as train_file, \
            open(cv_val_path, 'w') as val_file:
        train_file.write(fields)
        val_file.write(fields)
        for i, line in enumerate(csv_file):
            if i == 0:
                continue  # skip the input header row
            if is_debug:
                val_file.write(line)
                train_file.write(line)
            # Column 2 is the hour field; [:-2] strips the trailing hour
            # digits, leaving the date portion to compare (assumes a
            # YYMMDDHH-style value — matches date_val's format).
            elif line.split(',')[2][:-2] == date_val:
                val_file.write(line)
            else:
                train_file.write(line)
            if is_million(i):
                log.info('Splited {} mil.rows'.format(i + 1))
def train_gradientboosting_model():
    """Train an XGBoost gradient-boosting CTR model on the Avazu CV split.

    Fits an XGBClassifier on the processed train set while monitoring AUC on
    the validation set, logs final validation AUC and log-loss, pickles the
    model and returns it.
    """
    x_train, y_train = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    x_val, y_val = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    booster_params = {
        'learning_rate': 0.1,
        'colsample_bytree': 0.8,
        'n_estimators': 100,
        'gamma': 1,
        'max_depth': 6,
        'lambda': 1,
        'min_child_weight': 5,
    }
    model = xgb.XGBClassifier(**booster_params)
    # eval_set lets xgboost report per-round validation AUC during training.
    model.fit(x_train, y_train, eval_metric='auc', verbose=True,
              eval_set=[(x_val, y_val)])

    y_pred = model.predict_proba(x_val)[:, 1]
    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(model, pathify('models', 'avazu-gb.pickle'))
    return model
def prepare_count_features(path_to_file):
    """Build per-key frequency tables from the rows of a CSV file.

    Counts occurrences of device_id, device_ip, the derived user id, and the
    derived hour across all rows, logging progress every million rows.

    Returns:
        dict mapping feature name ('device_id_count', 'device_ip_count',
        'user_id_count', 'hour_count') to a defaultdict(int) of counts.
    """
    feature_names = ('device_id_count', 'device_ip_count',
                     'user_id_count', 'hour_count')
    counters = {name: defaultdict(int) for name in feature_names}
    for i, row in iter_as_dict(path_to_file):
        counters['device_id_count'][row['device_id']] += 1
        counters['device_ip_count'][row['device_ip']] += 1
        counters['user_id_count'][make_userid_from_row(row)] += 1
        counters['hour_count'][make_hour_from_row(row)] += 1
        if is_million(i):
            log.info('Count {} mil.rows in {}'.format(i + 1, path_to_file))
    return counters
def preprocess(input_path, output_path, feature_names, label_name, num_categories):
    """Hash-encode categorical features from one CSV into another.

    Streams rows from input_path, keeps the label column verbatim, replaces
    each listed feature with a hash bucket in [0, num_categories) derived
    from the "<feature>-<value>" string, and writes the result to
    output_path. Logs progress every million rows.
    """
    fields = [label_name] + feature_names
    with open(output_path, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fields)
        writer.writeheader()
        for i, row in iter_as_dict(input_path):
            if is_million(i):
                log.info('Preprocessed {} mil.rows'.format(i + 1))
            # Label passes through untouched; every feature is hashed.
            out_row = {label_name: row[label_name]}
            for feature in feature_names:
                out_row[feature] = categorize_by_hash(
                    '{}-{}'.format(feature, row[feature]), num_categories)
            writer.writerow(out_row)
def make_features(input_file, output_file, mode):
    """Append count-based features to each row and write the result as CSV.

    For the training mode the count tables are computed from input_file and
    pickled for later reuse; for 'val'/'test' modes the previously pickled
    train-split counts are loaded instead (so val/test never leak their own
    statistics). Test rows get a dummy click label of -1.
    """
    count_filename = pathify('data', 'interim',
                             'avazu-cv-train-count-features.pickle')
    if mode in ('test', 'val'):
        # Reuse counts computed on the training split.
        count_features = load_pickle(count_filename)
    else:
        count_features = prepare_count_features(input_file)
        save_pickle(count_features, count_filename)

    fields = make_output_headers() + list(count_features.keys())
    with open(output_file, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fields)
        writer.writeheader()
        for i, row in iter_as_dict(input_file):
            if is_million(i):
                log.info('Write {} mil.rows to {}'.format(i + 1, output_file))
            out_row = add_count_features_to_row(row, count_features)
            out_row['hour'] = make_hour_from_row(row)
            if mode == 'test':
                out_row['click'] = -1  # test data carries no real label
            writer.writerow(out_row)