def extract_features_as_df(shard_id: int, predictor: TwoStagePredictor):
    part_dfs = []
    aux = []
    md5_path = md5_hex(str(shard_id))
    for js in tqdm(iterate_shard(shard_id)):
        profile = ClientProfile(
            product_info_map=predictor.product_info_map,
            product_encoder=predictor.product_encoder,
            actual_product_encoder=predictor.actual_product_encoder,
            client_js=js,
        )
        precalc = predictor.feature_extractor.build_precalc(profile)
        candidates = predictor.candidate_selector.get_candidates(profile, precalc)
        rows = predictor.feature_extractor.build_features(
            profile, precalc, candidates, js["target"][0]["datetime"]
        )
        features = pd.DataFrame(rows)
        # Keep a deterministic column order so per-shard frames concatenate safely.
        features = features[sorted(features.columns)]
        group_id = "{}:{}:0".format(md5_path, js["client_id"])
        features["_groupId"] = group_id
        # Label each candidate by whether it appears in the ground-truth basket.
        gt = set(js["target"][0]["product_ids"])
        features["_label"] = [int(x in gt) for x in candidates]
        part_dfs.append(features)
        aux.append({"gt": list(gt), "candidates": candidates, "groupId": group_id})
    return part_dfs, aux
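# Usage sketch (not from the original source): drive extract_features_as_df
# over several shards and stack the per-client frames into one training
# DataFrame. Assumes `predictor` is an already-constructed TwoStagePredictor.
def build_training_frame(shard_ids, predictor):
    all_dfs, all_aux = [], []
    for shard_id in shard_ids:
        part_dfs, aux = extract_features_as_df(shard_id, predictor)
        all_dfs.extend(part_dfs)
        all_aux.extend(aux)
    return pd.concat(all_dfs, ignore_index=True), all_aux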
def estimate_global_top(n_shards=3):
    # Count how many times each product appears across all transactions.
    cnt = defaultdict(int)
    for shard_id in range(n_shards):
        for js in tqdm(iterate_shard(shard_id)):
            for trans in js["transaction_history"]:
                for product in trans["products"]:
                    cnt[product["product_id"]] += 1
    # Product ids sorted by descending purchase count.
    return sorted(cnt, key=lambda pid: -cnt[pid])
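# Usage sketch (assumption, not from the source): the head of the ranked list
# can serve as a popularity-based fallback for cold-start clients.
global_top = estimate_global_top(n_shards=3)
fallback_candidates = global_top[:100]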
def estimate_times(ext_products_df, n_shards):
    # Sentinel values: first_seen_day starts high and last_seen_day starts low,
    # so the first real observation overwrites both.
    stats = {}
    for pid in ext_products_df.product_id.values:
        stats[pid] = {"first_seen_day": 200, "last_seen_day": -200, "cnt": 0}
    for shard_id in range(n_shards):
        for js in tqdm(iterate_shard(shard_id)):
            for trans in js["transaction_history"]:
                curr_date = get_date(trans["datetime"])
                days = days_between(REF_DATE, curr_date)
                for product_item in trans["products"]:
                    pid = product_item["product_id"]
                    stats[pid]["cnt"] += 1
                    stats[pid]["first_seen_day"] = min(stats[pid]["first_seen_day"], days)
                    stats[pid]["last_seen_day"] = max(stats[pid]["last_seen_day"], days)
    stats_df = pd.DataFrame.from_dict(stats, orient="index").reset_index()
    return stats_df
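# Usage sketch (not from the original source): derive a simple "days observed"
# feature from the first/last seen days. Assumes `ext_products_df` is loaded
# elsewhere; the "index" column is the product_id produced by reset_index().
stats_df = estimate_times(ext_products_df, n_shards=3)
stats_df = stats_df.rename(columns={"index": "product_id"})
stats_df["days_observed"] = stats_df["last_seen_day"] - stats_df["first_seen_day"]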
def collect_train_data(
    shard_ids: List[int], product_encoder: ProductEncoderMini, is_train: bool = False
) -> List[TrainingSample]:
    samples = []
    for shard_id in shard_ids:
        for js in tqdm(iterate_shard(shard_id)):
            row = make_coo_row_mini(js["transaction_history"], product_encoder)
            target_items = product_encoder.toIdxWithFilter(js["target"][0]["product_ids"])
            # In train mode, fold every later target transaction into the label set.
            if is_train and len(js["target"]) > 1:
                for target in js["target"][1:]:
                    target_items.extend(product_encoder.toIdxWithFilter(target["product_ids"]))
            # Skip clients with an empty history row or an empty target.
            if row is None or len(target_items) == 0:
                continue
            samples.append(
                TrainingSample(row=row, target_items=set(target_items), client_id=js["client_id"])
            )
    return samples
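# Usage sketch (not from the original source): build train/validation samples
# from disjoint shard ranges. Assumes `product_encoder` is a ProductEncoderMini
# constructed elsewhere.
train_samples = collect_train_data(shard_ids=[0, 1, 2], product_encoder=product_encoder, is_train=True)
valid_samples = collect_train_data(shard_ids=[3], product_encoder=product_encoder)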
def extract_batch(shard_id: int, predictor: TwoStagePredictor):
    # Apply the module-level _extract helper to every client in the shard.
    map_func = partial(_extract, predictor=predictor)
    return list(map(map_func, tqdm(iterate_shard(shard_id))))
import os
import pickle

import implicit
import numpy as np
import pandas as pd
from scipy import sparse as sp
from tqdm import tqdm

import src.config as cfg
from src.utils import ProductEncoder, iterate_shard, make_coo_row

if __name__ == "__main__":
    product_encoder = ProductEncoder(cfg.PRODUCT_PARQUET_PATH)

    # Build a sparse client-product matrix over all eight shards.
    rows = []
    for shard_id in range(8):
        for js in tqdm(iterate_shard(shard_id)):
            rows.append(make_coo_row(js["transaction_history"], product_encoder, lvl="level_4"))
    train_mat = sp.vstack(rows)

    # Item-item cosine model on level_4 categories; implicit's fit expects an
    # items x users matrix, hence the transpose.
    model, tag = (implicit.nearest_neighbours.CosineRecommender(K=10), "L4_cosine10")
    model.fit(train_mat.T)

    out_dir = "../tmp/implicit_full/{}/".format(tag)
    os.makedirs(out_dir, exist_ok=True)
    print("Dump model to " + out_dir)
    with open(os.path.join(out_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)
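# Usage sketch (not from the original source): reload the dumped model for
# inference; the path mirrors the dump directory above. In implicit 0.4.x,
# per-client recommendations would then come from
# knn_model.recommend(userid, user_items_csr, N=30); the exact signature
# depends on the installed implicit version.
with open("../tmp/implicit_full/L4_cosine10/model.pkl", "rb") as f:
    knn_model = pickle.load(f)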