from collections import defaultdict
from functools import partial
from typing import List

import pandas as pd
from tqdm import tqdm

# Project-level names used below (TwoStagePredictor, ClientProfile,
# ProductEncoderMini, TrainingSample, md5_hex, iterate_shard,
# make_coo_row_mini, get_date, days_between, REF_DATE, _extract) are assumed
# to come from the repo's own modules; iterate_shard, for instance, is
# imported from src.utils in the training script below.


def extract_features_as_df(shard_id: int, predictor: TwoStagePredictor):
    """Build per-client candidate feature frames for one shard, together with
    auxiliary ground-truth records for offline evaluation."""
    part_dfs = []
    aux = []
    md5_path = md5_hex(str(shard_id))
    for js in tqdm(iterate_shard(shard_id)):
        profile = ClientProfile(
            product_info_map=predictor.product_info_map,
            product_encoder=predictor.product_encoder,
            actual_product_encoder=predictor.actual_product_encoder,
            client_js=js,
        )

        # Stage one: select candidates; stage two: build ranking features for
        # each candidate as of the first target transaction's timestamp.
        precalc = predictor.feature_extractor.build_precalc(profile)
        candidates = predictor.candidate_selector.get_candidates(profile, precalc)
        rows = predictor.feature_extractor.build_features(
            profile, precalc, candidates, js["target"][0]["datetime"]
        )

        features = pd.DataFrame(rows)
        features = features[sorted(features.columns)]  # deterministic column order

        # Group key ties all candidate rows of one client together, e.g. for
        # group-wise ranking losses.
        groupId = "{}:{}:0".format(md5_path, js["client_id"])
        features["_groupId"] = groupId

        # Binary label: 1 iff the candidate occurs in the client's target basket.
        gt = set(js["target"][0]["product_ids"])
        features["_label"] = [int(x in gt) for x in candidates]

        part_dfs.append(features)
        aux.append({"gt": list(gt), "candidates": candidates, "groupId": groupId})
    return part_dfs, aux
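

# A minimal usage sketch (hypothetical wiring: constructing the predictor is
# outside this snippet). The per-client frames of a shard are concatenated
# into one training table; _groupId keeps each client's candidates together.
def build_shard_dataset_example(predictor: TwoStagePredictor, shard_id: int = 0) -> pd.DataFrame:
    part_dfs, _aux = extract_features_as_df(shard_id, predictor)
    return pd.concat(part_dfs, ignore_index=True)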


def estimate_global_top(n_shards=3):
    """Count how often every product occurs in the first `n_shards` shards and
    return product ids sorted by global popularity, most frequent first."""
    cnt = defaultdict(int)
    for shard_id in range(n_shards):
        for js in tqdm(iterate_shard(shard_id)):
            for trans in js["transaction_history"]:
                for product in trans["products"]:
                    cnt[product["product_id"]] += 1

    return sorted(cnt, key=cnt.get, reverse=True)
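

# Hedged example: a popularity baseline. The N most frequent products can
# serve as fallback candidates for clients with little history (N=100 is an
# illustrative choice, not from the source).
def popularity_fallback_example(n: int = 100):
    return estimate_global_top(n_shards=3)[:n]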


def estimate_times(ext_products_df, n_shards):
    """For every product in ext_products_df, record the first and last day
    (relative to REF_DATE) it appears in transactions, plus its total
    occurrence count."""
    # Sentinel values lie outside the observed day range, so the first real
    # observation always overwrites them via min/max below.
    stats = {}
    for pid in ext_products_df.product_id.values:
        stats[pid] = {"first_seen_day": 200, "last_seen_day": -200, "cnt": 0}

    for shard_id in range(n_shards):
        for js in tqdm(iterate_shard(shard_id)):
            for trans in js["transaction_history"]:
                curr_date = get_date(trans["datetime"])
                days = days_between(REF_DATE, curr_date)
                for product_item in trans["products"]:
                    pid = product_item["product_id"]
                    stats[pid]["cnt"] += 1
                    stats[pid]["first_seen_day"] = min(stats[pid]["first_seen_day"], days)
                    stats[pid]["last_seen_day"] = max(stats[pid]["last_seen_day"], days)

    # The product id ends up in the "index" column after reset_index().
    stats_df = pd.DataFrame.from_dict(stats, orient="index").reset_index()
    return stats_df
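

# Hedged follow-up sketch: derive an "active span" feature from the stats
# (assuming, as noted above, that the product id sits in the "index" column).
def add_active_span_example(stats_df: pd.DataFrame) -> pd.DataFrame:
    out = stats_df.rename(columns={"index": "product_id"})
    out["active_days"] = out["last_seen_day"] - out["first_seen_day"]
    return out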


def collect_train_data(
    shard_ids: List[int], product_encoder: ProductEncoderMini, is_train: bool = False
) -> List[TrainingSample]:
    samples = []
    for shard_id in shard_ids:
        for js in tqdm(iterate_shard(shard_id)):
            row = make_coo_row_mini(js["transaction_history"], product_encoder)
            target_items = product_encoder.toIdxWithFilter(js["target"][0]["product_ids"])

            # In train mode, add every subsequent target transaction as well.
            if is_train and len(js["target"]) > 1:
                for target in js["target"][1:]:
                    target_items.extend(product_encoder.toIdxWithFilter(target["product_ids"]))

            # Skip clients with no transaction history or an empty target.
            if row is None or len(target_items) == 0:
                continue

            samples.append(
                TrainingSample(row=row, target_items=set(target_items), client_id=js["client_id"])
            )
    return samples
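

# Hedged usage sketch: stack the per-client sparse rows into one user-item
# matrix for model fitting (assumes TrainingSample.row is a 1 x n_items scipy
# sparse row, matching how make_coo_row_mini is used above).
def samples_to_matrix_example(samples: List[TrainingSample]):
    from scipy import sparse as sp  # local import keeps the sketch self-contained
    return sp.vstack([s.row for s in samples]).tocsr()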


def extract_batch(shard_id: int, predictor: TwoStagePredictor):
    # `_extract` (defined elsewhere in the repo) processes a single client
    # record; partial binds the shared predictor so the mapped function is unary.
    map_func = partial(_extract, predictor=predictor)
    return list(map(map_func, tqdm(iterate_shard(shard_id))))
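

# Hedged parallel variant (not in the source): the unary map_func makes it easy
# to swap the built-in map for a process pool. The predictor must be picklable
# for this to work; the pool size is an illustrative choice.
def extract_batch_parallel_example(shard_id: int, predictor: TwoStagePredictor, n_workers: int = 4):
    from multiprocessing import Pool

    map_func = partial(_extract, predictor=predictor)
    with Pool(n_workers) as pool:
        return list(tqdm(pool.imap(map_func, iterate_shard(shard_id))))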


import os
import pickle

import implicit
import numpy as np
import pandas as pd
from scipy import sparse as sp
from tqdm import tqdm

import src.config as cfg
from src.utils import ProductEncoder, iterate_shard, make_coo_row

if __name__ == "__main__":

    product_encoder = ProductEncoder(cfg.PRODUCT_PARQUET_PATH)

    # Build the client x item matrix over all 8 shards at category level_4.
    rows = []
    for shard_id in range(8):
        for js in tqdm(iterate_shard(shard_id)):
            rows.append(
                make_coo_row(js["transaction_history"], product_encoder, lvl="level_4")
            )
    train_mat = sp.vstack(rows)

    # The classic implicit API fits nearest-neighbour models on an
    # item x user matrix, hence the transpose.
    model, tag = implicit.nearest_neighbours.CosineRecommender(K=10), "L4_cosine10"
    model.fit(train_mat.T)

    out_dir = "../tmp/implicit_full/{}/".format(tag)
    os.makedirs(out_dir, exist_ok=True)
    print("Dump model to " + out_dir)
    with open(os.path.join(out_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)
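
    # Hedged sanity check (assuming the classic pre-0.5 implicit API, where
    # similar_items returns (item_index, score) pairs): eyeball the nearest
    # neighbours of item 0 in the freshly fitted model.
    for item_idx, score in model.similar_items(0, N=5):
        print(item_idx, score)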