def train_model():
    train = pd.read_csv('data/train_new_features.csv')
    train = train.drop(columns=['city', 'reference', 'action_type', 'hotel_cat'])
    y_train = train[['label']]
    x_train = train.drop(columns=['label'])
    # group sizes must be computed while session_id is still present
    groups = group_lengths(x_train["session_id"].values)
    x_train = x_train.drop(columns=['user_id', 'session_id'])
    # unpack the params dict; passing it positionally would set boosting_type
    ranker = LGBMRanker(**PARAMS)
    ranker.fit(x_train, y_train.values.ravel(), group=groups, verbose=1)
    return ranker  # return the fitted model instead of discarding it
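# group_lengths is a project helper, not a library function; a minimal sketch of
# what it is assumed to do: given per-row group ids (already sorted so each group
# is contiguous), return the length of each run, which is the format
# LGBMRanker.fit expects for `group`. The implementation below is an assumption.
import numpy as np

def group_lengths(group_ids):
    group_ids = np.asarray(group_ids)
    # indices where the group id changes, i.e. run boundaries
    change = np.where(group_ids[1:] != group_ids[:-1])[0] + 1
    boundaries = np.concatenate([[0], change, [len(group_ids)]])
    return np.diff(boundaries)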
def parse_model_instance(model_config):
    model_class = model_config["model_class"]
    model_params = model_config["model_params"]
    if model_class == "LGBMRanker":
        model_instance = LGBMRanker(**model_params)
    elif model_class == "LGBMRankerMRR":
        model_instance = LGBMRankerMRR(**model_params)
    elif model_class == "LGBMRankerMRR2":
        model_instance = LGBMRankerMRR2(**model_params)
    elif model_class == "LGBMRankerMRR3":
        model_instance = LGBMRankerMRR3(**model_params)
    else:
        # fail loudly on unknown classes instead of `assert False`,
        # which is silently skipped under `python -O`
        raise ValueError(f"Unknown model_class: {model_class!r}")
    return model_instance
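# Usage sketch with a hypothetical config dict (the two keys mirror what the
# function reads; the parameter values are illustrative only):
model = parse_model_instance({
    "model_class": "LGBMRanker",
    "model_params": {"n_estimators": 500, "learning_rate": 0.05},
})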
def get_predictions(df: pd.DataFrame, model: lgbm.LGBMRanker):
    """Score the test set and write a Kaggle submission file.

    :param df: test features, must contain 'srch_id' and 'prop_id' columns
    :param model: a fitted LGBMRanker
    """
    print('\tPredicting relevance')
    test_pred = model.predict(df)
    df['relevance'] = test_pred
    # within each search, order properties from most to least relevant
    df.sort_values(by=['srch_id', 'relevance'], ascending=[True, False], inplace=True)
    kaggle_answer = pd.DataFrame({
        'srch_id': df['srch_id'],
        'prop_id': df['prop_id'],
    })
    print('\tWriting answers to csv')
    kaggle_answer.to_csv('expedia_answer.csv', index=False)
def train_lgbm(df: pd.DataFrame, gbm: lgbm.LGBMRanker = None, cv: bool = False):
    """Train an LGBMRanker on the Expedia data, optionally with cross-validation.

    :param df: training data with a 'relevance' target column
    :param gbm: optional pre-built ranker (currently overwritten before fitting)
    :param cv: if True, cross-validate and save the model; otherwise fit on all
        data and return the fitted ranker
    """
    # leakage columns: available at training time, never at serving time
    df.drop(['click_bool', 'booking_bool', 'position'], axis=1, inplace=True)
    categorical_values = [
        x for x in [
            'prop_country_id', 'srch_id', 'site_id',
            'visitor_location_country_id', 'prop_id', 'srch_destination_id'
        ] if x in df.columns.values
    ]
    if cv:
        print("\tSplitting data")
        train_ids, val_ids = split_data(df)
        cv_scores = []
        for i, train_id in enumerate(train_ids):
            train_data = df.loc[df['srch_id'].isin(train_id)]
            val_data = df.loc[df['srch_id'].isin(val_ids[i])]
            y_train, y_val = train_data['relevance'], val_data['relevance']
            X_train = train_data.drop('relevance', axis=1)
            X_val = val_data.drop('relevance', axis=1)
            # per-query group sizes, in row order
            train_queries = list(Counter(np.asarray(X_train['srch_id'])).values())
            val_queries = list(Counter(np.asarray(X_val['srch_id'])).values())
            gbm = lgbm.LGBMRanker(n_estimators=700)
            print(f"\tTraining LGBM Ranker, fold {i + 1}")
            gbm.fit(X_train, y_train,
                    group=train_queries,
                    eval_set=[(X_val, y_val)],
                    eval_group=[val_queries],
                    eval_at=[5, 38],
                    early_stopping_rounds=50,
                    categorical_feature=categorical_values)
            feature_importance = zip(X_train.columns.values, gbm.feature_importances_)
            print(f"Feature importance: {sorted(feature_importance, key=lambda x: x[1])}")
            print('\n')
            cv_scores.append(gbm.best_score_['valid_0']['ndcg@5'])
        print(cv_scores)
        save_model(gbm, cv_scores)
        return
    else:
        X_train = df.drop('relevance', axis=1)
        y_train = df['relevance']
        train_queries = list(Counter(np.asarray(X_train['srch_id'])).values())
        gbm = lgbm.LGBMRanker(n_estimators=700)
        gbm.fit(X_train, y_train, group=train_queries,
                categorical_feature=categorical_values)
        return gbm
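# split_data is a project helper; a minimal sketch of what it is assumed to do:
# build K train/validation folds over *query ids* (srch_id), so all rows of a
# search land in the same fold. The name, signature, and fold count are assumptions.
from sklearn.model_selection import KFold

def split_data(df, n_folds=5, seed=42):
    ids = df['srch_id'].unique()
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    train_ids, val_ids = [], []
    for train_idx, val_idx in kf.split(ids):
        train_ids.append(ids[train_idx])
        val_ids.append(ids[val_idx])
    return train_ids, val_ids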
train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0][:split_idx]
# val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
val_ind = np.arange(split_idx, 4868466)  # hard-coded end of the validation slice
print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")

meta_train = meta.iloc[train_ind]
meta_val = meta.iloc[val_ind].copy()  # copy so the column assignment below is safe

# both index ranges are contiguous, so the feature matrix can be sliced directly
X_train = mat[train_ind.min():(train_ind.max() + 1)]
X_val = mat[val_ind.min():(val_ind.max() + 1)]
del mat
gc.collect()

with timer("model fitting"):
    model = LGBMRanker(**BEST_PARAMS)
    model.fit(X_train,
              meta_train["was_clicked"].values,
              group=group_lengths(meta_train["clickout_id"].values))

    val_pred = model.predict(X_val)
    train_pred = model.predict(X_train)
    logger.info("Train AUC {:.4f}".format(
        roc_auc_score(meta_train["was_clicked"].values, train_pred)))
    logger.info("Val AUC {:.4f}".format(
        roc_auc_score(meta_val["was_clicked"].values, val_pred)))

    meta_val["click_proba"] = val_pred
    logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
    githash = get_git_hash()
    meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False)
    joblib.dump(model, "model_val.joblib")
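# mrr_fast is a project helper; a minimal sketch of the metric it is assumed to
# compute: mean reciprocal rank of the clicked item within each clickout, when
# items are ordered by the given score column. Names are assumptions.
def mrr_fast(df, score_col):
    ranks = (df.groupby("clickout_id")[score_col]
               .rank(ascending=False, method="first"))
    # reciprocal rank of the actually clicked row, averaged over clickouts
    return (1.0 / ranks[df["was_clicked"] == 1]).mean()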
def build_pairwise_model():
    model = make_pipeline(
        data_pipeline(),
        LGBMRanker(n_estimators=200),
    )
    return model
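# In an sklearn pipeline, fit parameters are routed to a step with the
# `<step>__<param>` prefix; make_pipeline names each step after its class,
# lowercased, so the ranker step is "lgbmranker". Usage sketch (the data frame
# and the group_lengths helper are assumptions carried over from the snippets above):
model = build_pairwise_model()
model.fit(df_train, df_train["was_clicked"],
          lgbmranker__group=group_lengths(df_train["clickout_id"].values))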
## Create pairs, see comment in original code
# x_pairs, y_pairs = get_pairs(pd.DataFrame(y_train).join(x_train),
#                              granularity=10, cutoff_ratio=2,
#                              MAX_ITER=np.inf, MAX_GROUP=50000, verbose=True)
# x_pairs.to_csv('newx.csv', index=False)
# y_pairs.to_csv('newy.csv', index=False)

# Load pairs from file
x_pairs = pd.read_csv("x_pairs.csv")
x_pairs.set_index('Unnamed: 0', inplace=True)
y_pairs = pd.read_csv("y_pairs.csv", header=None)
y_pairs.set_index(0, inplace=True)
y_pairs.columns = ['CTR']

## Train LGBMRanker
lgbr_multi = LGBMRanker(objective="regression",
                        learning_rate=0.248,
                        num_leaves=300,
                        # num_trees=325,
                        num_trees=10,
                        max_depth=20)
# every pair of rows forms one two-item ranking group; pass y as a 1-D series
lgbr_multi.fit(x_pairs, y_pairs['CTR'],
               group=[2 for _ in range(len(x_pairs) // 2)],
               categorical_feature=cat_ft_list)

## Getting all paths
adprofile_allwx_all_path = lgtplus.get_all_paths(lgbr_multi)

## Getting high-frequency patterns
feature_names = lgbr_multi._Booster.dump_model()["feature_names"]
print(adprofile_allwx_all_path[9])
L2 = lgtplus.get_hi_freq_pattern(adprofile_allwx_all_path, 2, 50, True, feature_names)
L3 = lgtplus.get_hi_freq_pattern(adprofile_allwx_all_path, 3, 50, True, feature_names)
L4 = lgtplus.get_hi_freq_pattern(adprofile_allwx_all_path, 4, 50, True, feature_names)
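# get_pairs is a project helper; a minimal sketch of the pairing idea it is
# assumed to implement: sort rows by CTR and pair the i-th highest with the
# i-th lowest, so every consecutive pair forms a two-item ranking group (which
# is why fit() above gets group=[2, 2, ...]). The name and the exact pairing
# rule here are assumptions, not the original code.
def make_ctr_pairs(df, label_col="CTR"):
    df = df.sort_values(label_col).reset_index(drop=True)
    half = len(df) // 2
    lo = df.iloc[:half].reset_index(drop=True)
    hi = df.iloc[-half:].reset_index(drop=True)
    # interleave hi/lo rows: hi[0], lo[0], hi[1], lo[1], ...
    pairs = pd.concat([hi, lo]).sort_index(kind="stable").reset_index(drop=True)
    return pairs.drop(columns=[label_col]), pairs[label_col]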
import numpy as np
from scipy.stats import kendalltau, rankdata
from sklearn.model_selection import LeaveOneGroupOut
from tqdm import tqdm
from lightgbm import LGBMRanker

# Convert to numpy
groups = np.array(groups)
y = np.array(y)
X = np.array(X)
unique_groups = np.unique(groups)

# Rank data: rescale per-group ranks to integers in [0, 1000]
ranked_y = np.zeros_like(y)
for g in unique_groups:
    indices = groups == g
    ranks = rankdata(y[indices])
    ranked_y[indices] = np.array(ranks / np.max(ranks) * 1000).astype(int)

# Ranker. The linear label_gain is required because labels go up to 1000:
# LightGBM's default lambdarank gain (2**label - 1) only covers labels below 31.
ranker = LGBMRanker(n_estimators=500,
                    learning_rate=0.05,
                    num_leaves=16,
                    label_gain=np.arange(0, 1001, 1))

logo = LeaveOneGroupOut()
correlations = []
for train_index, test_index in tqdm(logo.split(X, y, groups)):
    # per-group sizes of the training fold; assumes rows are already sorted by
    # group, so the sorted-unique counts line up with row order
    unique, counts = np.unique(groups[train_index], return_counts=True)
    ranker.fit(X[train_index], ranked_y[train_index], group=counts)
    predictions = ranker.predict(X[test_index])
    correlation, p_value = kendalltau(ranked_y[test_index], predictions)
    print(np.unique(groups[test_index]), correlation)
    correlations.append(correlation)

print("Mean correlation: ", np.mean(correlations))
df_train, df_val = read_data()
vectorizer = make_vectorizer_1()
mat_train = vectorizer.fit_transform(df_train, df_train["was_clicked"])
print(mat_train.shape)
mat_val = vectorizer.transform(df_val)
print(mat_val.shape)

def mrr_metric(y_true, y_pred):
    # custom sklearn-API eval metric: returns (name, value, is_higher_better).
    # It is only ever applied to the validation eval_set, so grouping by
    # df_val's clickout ids is safe here.
    mrr = mrr_fast_v2(y_true, y_pred, df_val["clickout_id"].values)
    return "mrr", mrr, True

model = LGBMRanker(learning_rate=0.05,
                   n_estimators=900,
                   min_child_samples=5,
                   min_child_weight=0.00001,
                   n_jobs=-2)
model.fit(
    mat_train,
    df_train["was_clicked"],
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"]==1, 2, 1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    eval_metric=mrr_metric,
)
df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)
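# With click_proba attached, the validation MRR can be read off directly,
# mirroring the earlier snippet (mrr_fast as sketched above; both it and
# mrr_fast_v2 are project helpers, assumed):
print("Val MRR: {:.4f}".format(mrr_fast(df_val, "click_proba")))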