Code Example #1
def extract_features_as_df(shard_id: int, predictor: TwoStagePredictor):
    part_dfs = []
    aux = []
    md5_path = md5_hex(str(shard_id))  # shard-level prefix used to build unique group ids
    for js in tqdm(iterate_shard(shard_id)):

        profile = ClientProfile(
            product_info_map=predictor.product_info_map,
            product_encoder=predictor.product_encoder,
            actual_product_encoder=predictor.actual_product_encoder,
            client_js=js,
        )

        precalc = predictor.feature_extractor.build_precalc(profile)
        candidates = predictor.candidate_selector.get_candidates(profile, precalc)

        rows = predictor.feature_extractor.build_features(
            profile, precalc, candidates, js["target"][0]["datetime"]
        )

        features = pd.DataFrame(rows)
        features = features[sorted(features.columns)]

        groupId = "{}:{}:0".format(md5_path, js["client_id"])
        features["_groupId"] = groupId
        gt = set(js["target"][0]["product_ids"])

        features["_label"] = [int(x in gt) for x in candidates]

        part_dfs.append(features)
        aux.append({"gt": list(gt), "candidates": candidates, "groupId": groupId})
    return part_dfs, aux
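A minimal usage sketch, assuming a TwoStagePredictor instance (called predictor here) is already constructed elsewhere; the per-client frames are simply concatenated into one table that keeps _groupId and _label for downstream use.

import pandas as pd

# Hypothetical usage: build one feature table for shard 0.
part_dfs, aux = extract_features_as_df(0, predictor)
shard_df = pd.concat(part_dfs, ignore_index=True)
print(shard_df.shape, shard_df["_label"].mean())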
Code Example #2
def estimate_global_top(n_shards=3):
    cnt = defaultdict(int)
    for shard_id in range(n_shards):
        for js in tqdm(iterate_shard(shard_id)):
            for trans in js["transaction_history"]:
                for product in trans["products"]:
                    cnt[product["product_id"]] += 1

    # product ids sorted by descending purchase count
    return sorted(cnt, key=cnt.get, reverse=True)
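A hedged usage sketch; the cut-off of 100 below is arbitrary and only illustrates taking the head of the popularity ranking.

# Hypothetical usage: keep the most frequently purchased products as a popularity fallback.
global_top = estimate_global_top(n_shards=3)
top_products = global_top[:100]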
Code Example #3
def estimate_times(ext_products_df, n_shards):
    stats = {}
    for pid in ext_products_df.product_id.values:
        stats[pid] = {"first_seen_day": 200, "last_seen_day": -200, "cnt": 0}

    for shard_id in range(n_shards):
        for js in tqdm(iterate_shard(shard_id)):
            for trans in js["transaction_history"]:
                curr_date = get_date(trans["datetime"])
                days = days_between(REF_DATE, curr_date)
                for product_item in trans["products"]:
                    pid = product_item["product_id"]
                    stats[pid]["cnt"] += 1
                    stats[pid]["first_seen_day"] = min(stats[pid]["first_seen_day"], days)
                    stats[pid]["last_seen_day"] = max(stats[pid]["last_seen_day"], days)
    stats_df = pd.DataFrame.from_dict(stats, orient="index").reset_index()
    return stats_df
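A small sketch of how the returned frame could be joined back onto the product table; ext_products_df is assumed to be the same frame passed into the function, and the lifetime_days column is purely illustrative.

# Hypothetical usage: attach first/last-seen statistics to the product table.
stats_df = estimate_times(ext_products_df, n_shards=3)
stats_df = stats_df.rename(columns={"index": "product_id"})
stats_df["lifetime_days"] = stats_df["last_seen_day"] - stats_df["first_seen_day"]
products_ext = ext_products_df.merge(stats_df, on="product_id", how="left")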
Code Example #4
def collect_train_data(
    shard_ids: List[int], product_encoder: ProductEncoderMini, is_train: bool = False
) -> List[TrainingSample]:
    samples = []
    for shard_id in shard_ids:
        for js in tqdm(iterate_shard(shard_id)):
            row = make_coo_row_mini(js["transaction_history"], product_encoder)
            target_items = product_encoder.toIdxWithFilter(js["target"][0]["product_ids"])

            # during training, also add products from any later target transactions
            if is_train and len(js["target"]) > 1:
                for target in js["target"][1:]:
                    target_items.extend(product_encoder.toIdxWithFilter(target["product_ids"]))

            # skip users with empty target
            if row is None or len(target_items) == 0:
                continue

            samples.append(
                TrainingSample(row=row, target_items=set(target_items), client_id=js["client_id"])
            )
    return samples
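A usage sketch assuming product_encoder is an already-fitted ProductEncoderMini; the particular train/validation shard split shown here is illustrative, not taken from the source.

# Hypothetical split: shards 0-5 for training, shard 6 for validation.
train_samples = collect_train_data([0, 1, 2, 3, 4, 5], product_encoder, is_train=True)
valid_samples = collect_train_data([6], product_encoder, is_train=False)
print(len(train_samples), len(valid_samples))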
Code Example #5
def extract_batch(shard_id: int, predictor: TwoStagePredictor):
    map_func = partial(_extract, predictor=predictor)
    result = map(map_func, tqdm(iterate_shard(shard_id)))
    return list(result)
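The partial/map structure above also lends itself to process-level parallelism; the variant below is only an assumption about how it could be parallelized (and requires predictor to be picklable), not part of the original code.

from functools import partial
from multiprocessing import Pool

# Hypothetical parallel variant of extract_batch using a worker pool.
def extract_batch_parallel(shard_id: int, predictor: TwoStagePredictor, n_workers: int = 4):
    map_func = partial(_extract, predictor=predictor)
    with Pool(n_workers) as pool:
        return list(tqdm(pool.imap(map_func, iterate_shard(shard_id))))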
Code Example #6
import os
import pickle

import implicit
import numpy as np
import pandas as pd
from scipy import sparse as sp
from tqdm import tqdm

import src.config as cfg
from src.utils import ProductEncoder, iterate_shard, make_coo_row

if __name__ == "__main__":

    product_encoder = ProductEncoder(cfg.PRODUCT_PARQUET_PATH)

    rows = []
    for shard_id in range(8):
        for js in tqdm(iterate_shard(shard_id)):
            rows.append(
                make_coo_row(js["transaction_history"],
                             product_encoder,
                             lvl="level_4"))
    train_mat = sp.vstack(rows)

    model, tag = (implicit.nearest_neighbours.CosineRecommender(K=10),
                  "L4_cosine10")
    model.fit(train_mat.T)
    out_dir = "../tmp/implicit_full/{}/".format(tag)
    os.makedirs(out_dir, exist_ok=True)
    print("Dump model to " + out_dir)
    pickle.dump(model, open(out_dir + "/model.pkl", "wb"))
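A hedged sketch of reading the dump back in; similar_items is part of implicit's item-item recommenders, though its exact return type differs between implicit versions.

import pickle

# Hypothetical usage: reload the dumped model and query neighbours of one item index.
with open("../tmp/implicit_full/L4_cosine10/model.pkl", "rb") as f:
    model = pickle.load(f)

# Item indices here are level_4 category indices, matching make_coo_row(..., lvl="level_4");
# index 0 is used only for illustration.
neighbours = model.similar_items(0, N=10)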