def eval_routine(DATA_PATH, TEST_PATH, ITEM_FEATURES_PATH, USER_FEATURES_PATH, TEST_SIZE_WEEKS, N_POPULAR_ITEMS, INIT_NUM_RECS, N_FIN_RECS):
    """Run the offline evaluation pipeline and report money precision@5.

    Loads the train/validation splits, builds the lookup tables needed by the
    post-filter, fits the first-level recommender, generates and post-filters
    recommendations for every validation user, and averages
    ``money_precision_at_k`` (k=5) over users.  Optionally dumps the final
    recommendations to ``YN_recs.csv``.

    Returns:
        The mean money precision@5 over all validation users.
    """
    # Raw splits and feature tables.
    train_df = get_raw_data_splits(DATA_PATH, mode=0)
    val_df = get_raw_data_splits(TEST_PATH, mode=0)
    item_features, user_features = preprare_features(ITEM_FEATURES_PATH,
                                                     USER_FEATURES_PATH)

    # Keep only the N most popular items in both splits.
    train_df = prefilter_items(train_df, N_POPULAR_ITEMS)
    val_df = prefilter_items(val_df, N_POPULAR_ITEMS)

    # Lookup tables consumed by the post-filtering step.
    itemid_to_price = get_price_list(train_df, val_df)
    user_bought_history = get_bought_ever_list(train_df)
    item_to_commodity = get_item_commodities_list(item_features)

    recommender = MainRecommender(train_df, itemid_to_price)

    # One row per validation user with the items they actually bought.
    eval_df = val_df.groupby('user_id')['item_id'].unique().reset_index()
    eval_df.columns = ['user_id', 'actual']
    eval_df['base_rec'] = eval_df['user_id'].apply(
        lambda uid: recommender.get_own_recommendations(uid, N=INIT_NUM_RECS))

    # Trim INIT_NUM_RECS candidates down to N_FIN_RECS final recommendations.
    eval_df = postfilter_items(eval_df, recommender.overall_top_purchases,
                               item_to_commodity, itemid_to_price,
                               user_bought_history, n=N_FIN_RECS)

    res = eval_df.apply(
        lambda row: money_precision_at_k(row['result'], row['actual'],
                                         itemid_to_price, k=5),
        axis=1).mean()
    print(f"Result money precision @ 5 metric: {res}")

    # NOTE(review): SAVE_RESULTS is assumed to be a module-level flag defined
    # elsewhere in this file — confirm it exists before running.
    if SAVE_RESULTS:
        eval_df[['user_id', 'result']].to_csv('YN_recs.csv', index=False)
    return res
def fit(self, data_train, data_val, item_sub_comm, prices, N=5):
    """Grid-search recommender hyper-parameters against a validation split.

    For each ``top_n`` popularity cutoff and each weighting scheme, fits a
    ``MainRecommender`` with the first parameter set, then refits with every
    remaining parameter set, scoring post-filtered recommendations with
    ``self.scoring_func`` and tracking the best combination.

    Args:
        data_train: transactions used to fit candidate models.
        data_val: transactions used to build the per-user 'actual' item lists.
        item_sub_comm: item_id -> sub-commodity mapping for the post-filter.
        prices: item_id -> price mapping for the post-filter.
        N: number of final recommendations to keep and score at.

    Returns:
        Tuple ``(best_score, best_params)`` where best_params is
        ``[top_n, weighting, param_dict]``.
    """
    param_grid_list = self._get_param_grid(self.model_params)

    # One row per validation user with the items they actually bought.
    result_lvl_1 = data_val.groupby(
        'user_id')['item_id'].unique().reset_index()
    result_lvl_1.columns = ['user_id', 'actual']

    best_params = []
    best_score = 0
    if self.recommender == 'MainRecommender':
        for top_n in self.top_n_list:
            X = prefilter_items(data_train, take_n_popular=top_n)
            for weighting in self.weighting_list:
                model = MainRecommender(X, weighting=weighting,
                                        **param_grid_list[0])
                result_lvl_1['recs'] = result_lvl_1['user_id'].apply(
                    lambda x: model.get_main_model_recommendations(
                        x, N=top_n))
                result_lvl_1 = postfilter_items(result_lvl_1, 'recs',
                                                item_sub_comm, prices, N=N)
                # BUG FIX: this first pass used k=200 while the refit passes
                # below use k=N; both scores feed the same best_score, so
                # they must be computed at the same cutoff to be comparable.
                score = result_lvl_1.apply(lambda row: self.scoring_func(
                    row['postfilter_recs'], row['actual'], k=N),
                    axis=1).mean()
                if score > best_score:
                    best_score = score
                    best_params = [top_n, weighting, param_grid_list[0]]

                # Refit the same model with each remaining parameter set.
                for param in param_grid_list[1:]:
                    model.model = model.fit(model.user_item_matrix, **param)
                    result_lvl_1['recs'] = result_lvl_1['user_id'].apply(
                        lambda x: model.get_main_model_recommendations(
                            x, N=top_n))
                    result_lvl_1 = postfilter_items(result_lvl_1, 'recs',
                                                    item_sub_comm, prices,
                                                    N=N)
                    score = result_lvl_1.apply(
                        lambda row: self.scoring_func(
                            row['postfilter_recs'], row['actual'], k=N),
                        axis=1).mean()
                    if score > best_score:
                        best_score = score
                        best_params = [top_n, weighting, param]
    return best_score, best_params
def calc_precision_take_n_popular(data_train, data_test, item_features, take_n_popular):
    """Fit a recommender on popularity-filtered data and score precision@5.

    Filters the training data down to ``take_n_popular`` items, fits a
    ``MainRecommender``, produces 5 own-purchase recommendations per test
    user, and computes mean precision over users.

    Returns:
        Tuple ``(res, result)`` — a metric dict keyed by recommendation type,
        and the per-user DataFrame with 'actual' and recommendation columns.
    """
    filtered_train = prefilter_items(data_train,
                                     item_features=item_features,
                                     take_n_popular=take_n_popular)
    als_model = MainRecommender(filtered_train)

    # One row per test user with the items they actually bought.
    result = data_test.groupby('user_id')['item_id'].unique().reset_index()
    result.columns = ['user_id', 'actual']
    result['own_recommendations'] = result['user_id'].apply(
        lambda uid: als_model.get_own_recommendations(uid, N=5))

    res = {
        'own_recommendations': result.apply(
            lambda row: precision_at_k(row['own_recommendations'],
                                       row['actual']),
            axis=1).mean(),
    }
    return res, result
def get_results(main_data_path, item_features_path, user_features_path,
                val_lvl_1_size_weeks, val_lvl_2_size_weeks,
                model_lvl_1_path, model_lvl_2_path, test_data_path):
    """Build final two-level recommendations and report money precision@5.

    Either loads pre-trained level-1/level-2 models from pickle paths, or
    trains them from scratch (ALS-style recommender + CatBoost reranker),
    then produces a recommendation DataFrame for the test data (if given)
    or the level-2 validation window, prints the metric, and writes
    'Final_recommendations.csv'.

    Args:
        main_data_path: CSV of transactions (must contain 'week_no').
        item_features_path / user_features_path: CSVs of item/user features.
        val_lvl_1_size_weeks / val_lvl_2_size_weeks: sizes of the two
            validation windows, counted back from the last week.
        model_lvl_1_path / model_lvl_2_path: optional pickled model paths;
            both must be set to skip training.
        test_data_path: optional CSV to score instead of the validation split.

    Returns:
        The final recommendations DataFrame.
    """
    print('Reading data...')
    data = pd.read_csv(main_data_path)
    item_features = pd.read_csv(item_features_path)
    user_features = pd.read_csv(user_features_path)
    # Normalize column names to the project's snake_case convention.
    item_features.columns = [col.lower() for col in item_features.columns]
    user_features.columns = [col.lower() for col in user_features.columns]
    item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
    user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
    # Time-based split: [train_lvl_1 | val_lvl_1 (= train_lvl_2) | val_lvl_2].
    data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() -
                            (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
    data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() -
                           (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                          (data['week_no'] < data['week_no'].max() -
                           (val_lvl_2_size_weeks))]
    data_train_lvl_2 = data_val_lvl_1.copy()
    data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() -
                          val_lvl_2_size_weeks]
    # Average unit price per item; inf (quantity == 0) is zeroed out.
    prices = data.groupby('item_id')[['sales_value',
                                      'quantity']].sum().reset_index()
    prices['price'] = prices['sales_value'] / prices['quantity']
    prices.replace(np.inf, 0, inplace=True)
    prices = dict(zip(prices['item_id'], prices['price']))
    items_sub_comm = dict(
        zip(item_features['item_id'], item_features['sub_commodity_desc']))
    trans = DataTransformer()
    if test_data_path:
        test_data = pd.read_csv(test_data_path)
    if model_lvl_1_path and model_lvl_2_path:
        # Both pre-trained models supplied — skip training entirely.
        # NOTE(review): pickle.load on user-supplied paths executes arbitrary
        # code; only load trusted model files.
        print('Reading models...')
        with open(model_lvl_1_path, 'rb') as f:
            model_lvl_1 = pickle.load(f)
        with open(model_lvl_2_path, 'rb') as f:
            model_lvl_2 = pickle.load(f)
        if test_data_path:
            result = get_recommendation_df(test_data, model_lvl_1,
                                           model_lvl_2, items_sub_comm,
                                           prices, item_features,
                                           user_features)
        else:
            result = get_recommendation_df(data_val_lvl_2, model_lvl_1,
                                           model_lvl_2, items_sub_comm,
                                           prices, item_features,
                                           user_features)
    else:
        # Train both levels from scratch.
        print('Prepare data for level 1 model fit...')
        data_train_lvl_1 = prefilter_items(data_train_lvl_1, 6000)
        print('Level 1 model fit...')
        # Use roughly half the cores (at least one) for model fitting.
        num_threads = multiprocessing.cpu_count() // 2 + 1
        model_lvl_1 = MainRecommender(data_train_lvl_1, weighting=None,
                                      n_factors=100, regularization=0.01,
                                      iterations=100,
                                      num_threads=num_threads)
        print('Construct level 1 recommendations DataFrame...')
        # 200 level-1 candidates per user, post-filtered before reranking.
        result_lvl_1 = get_recommendation_lvl_1(data_val_lvl_1, model_lvl_1)
        result_lvl_1 = postfilter_items(result_lvl_1, 'lvl_1_recs',
                                        items_sub_comm, prices, N=200)
        result_lvl_1.rename(
            columns={'postfilter_lvl_1_recs': 'recommendations'},
            inplace=True)
        print('Prepare data for level 2 model fit...')
        data_train_lvl_2 = trans.fit_transform(result_lvl_1, data_val_lvl_1,
                                               item_features, user_features,
                                               with_targets=True)
        y = data_train_lvl_2['target']
        X = data_train_lvl_2.drop('target', axis=1)
        cat_features = [
            'department', 'brand', 'commodity_desc', 'sub_commodity_desc',
            'curr_size_of_product', 'age_desc', 'marital_status_code',
            'income_desc', 'homeowner_desc', 'hh_comp_desc',
            'household_size_desc', 'kid_category_desc'
        ]
        # Upweight the positive class by the negative/positive ratio to
        # counter class imbalance.
        class_1_weight = len(y[y == 0]) / len(y[y == 1])
        model_lvl_2 = CatBoostClassifier(n_estimators=300, max_depth=7,
                                         class_weights=[1, class_1_weight],
                                         cat_features=cat_features)
        print('Level 2 model fit...')
        model_lvl_2.fit(X, y)
        if test_data_path:
            result = get_recommendation_df(test_data, model_lvl_1,
                                           model_lvl_2, items_sub_comm,
                                           prices, item_features,
                                           user_features)
        else:
            result = get_recommendation_df(data_val_lvl_2, model_lvl_1,
                                           model_lvl_2, items_sub_comm,
                                           prices, item_features,
                                           user_features)
    print('Calculating final metric...')
    money_precision_at_5 = result.apply(
        lambda row: money_precision_at_k(row['postfilter_catboost_recs'], row[
            'actual'], row['recommend_prices']), axis=1).mean()
    print(
        f'Money precision@5 for final recommendations:{money_precision_at_5}')
    result.to_csv('Final_recommendations.csv')
    return result
from src.utils import prefilter_items, postfilter, create_dataset, dataset_processing
from src.recommenders import MainRecommender, Ranking

# --- Script section: build the level-2 training dataset ---
# NOTE(review): `pd` (pandas) is assumed to be imported earlier in the file.
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# column processing: normalize to snake_case and the project's id names
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# data filtering: keep only the 5000 most popular items
data = prefilter_items(data, item_features=item_features, take_n_popular=5000)

# One row per user; candidates will be attached per user below.
candidates = pd.DataFrame(data['user_id'].unique())
candidates = candidates.rename(columns={0: 'user_id'})

recommender = MainRecommender(data)

# Candidate recommendations via BM25 weighting (100 per user)
candidates['candidates'] = candidates['user_id'].apply(lambda x: recommender.get_bm25_recommendations(x, N=100))

# Build the target-variable dataset and join user and item features
targets = create_dataset(data=data, data_candidates=candidates, users_info=user_features, items_info=item_features)

# Generate additional engineered features
targets = dataset_processing(dataset=targets, data=data, items_info=item_features)
def __init__(self, data, item_info, weighting=True, first_model_weeks=6, second_model_weeks=3, take_n_popular=7000):
    """Precompute purchase statistics, fit the base models, and split data.

    Args:
        data: transactions DataFrame with 'user_id', 'item_id', 'quantity'
            and 'week_no' columns.
        item_info: item features passed to prefilter_items.
        weighting: if truthy, apply BM25 weighting to the user-item matrix.
        first_model_weeks / second_model_weeks: sizes (in weeks) of the two
            validation windows, counted back from the last week.
        take_n_popular: popularity cutoff for the level-1 training split.
    """
    # Per-user item purchase counts, most frequent first; 999999 is the
    # project's placeholder/filtered-item id and is excluded.
    self.top_purchases = data.groupby(
        ['user_id', 'item_id'])['quantity'].count().reset_index()
    self.top_purchases.sort_values('quantity', ascending=False,
                                   inplace=True)
    self.top_purchases = self.top_purchases[
        self.top_purchases['item_id'] != 999999]

    # Overall item purchase counts (all users), kept as a sorted id list.
    self.overall_top_purchases = data.groupby(
        'item_id')['quantity'].count().reset_index()
    self.overall_top_purchases.sort_values('quantity', ascending=False,
                                           inplace=True)
    self.overall_top_purchases = self.overall_top_purchases[
        self.overall_top_purchases['item_id'] != 999999]
    self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
    )

    # One row per user with the unique items they bought ('actual').
    self.user_buyses = data.groupby(
        'user_id')['item_id'].unique().reset_index()
    self.user_buyses.columns = ['user_id', 'actual']

    # Sparse user-item matrix plus the id <-> matrix-index mappings.
    self.user_item_matrix = self._prepare_matrix(data)
    self.id_to_itemid, self.id_to_userid, \
        self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

    if weighting:
        # bm25_weight operates on items-x-users, hence the double transpose.
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    # Fit the main model and the own-purchases recommender up front.
    self.model = self.fit(self.user_item_matrix)
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    self.item_info = item_info
    self.first_model_weeks = first_model_weeks
    self.second_model_weeks = second_model_weeks
    # Aliases used by the time-based split below.
    self.val_lvl_1_size_weeks = first_model_weeks
    self.val_lvl_2_size_weeks = second_model_weeks

    # Time-based split: [train_lvl_1 | val_lvl_1 (= train_lvl_2) | val_lvl_2].
    self.data_train_lvl_1 = data[
        data['week_no'] < data['week_no'].max() -
        (self.val_lvl_1_size_weeks + self.val_lvl_2_size_weeks)]
    self.data_val_lvl_1 = data[
        (data['week_no'] >= data['week_no'].max() -
         (self.val_lvl_1_size_weeks + self.val_lvl_2_size_weeks)) &
        (data['week_no'] < data['week_no'].max() -
         (self.val_lvl_2_size_weeks))]
    self.data_train_lvl_1 = prefilter_items(self.data_train_lvl_1,
                                            item_features=self.item_info,
                                            take_n_popular=take_n_popular)
    self.data_train_lvl_2 = self.data_val_lvl_1.copy()
    self.data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() -
                               self.val_lvl_2_size_weeks]

    # Level-1 actuals and 25 precomputed ALS recommendations per user.
    self.user_buyses_lvl_1 = self.data_train_lvl_1.groupby(
        'user_id')['item_id'].unique().reset_index()
    self.user_buyses_lvl_1.columns = ['user_id', 'actual']
    self.users_recommendations_lvl_1 = self.get_als_recommendations_users(
        self.user_buyses_lvl_1["user_id"], N=25)