Beispiel #1
0
def collect_train_data(
        jsons: List[str],
        product_encoder: ProductEncoder) -> List[TrainingSample]:
    """Build training samples from a list of JSON-lines files.

    Every line of every file is parsed with ``json.loads`` and turned into
    one ``TrainingSample``.

    Args:
        jsons: Paths of the files to read; each line must be one JSON record
            with the keys "transaction_history", "target" and "client_id".
        product_encoder: Encoder passed to ``make_coo_row`` and used (via
            ``toIdx``) to convert the target product ids.

    Returns:
        The samples in reading order (files in the given order, lines in
        file order). An empty ``jsons`` list yields an empty list.
    """
    samples = []
    for js_path in jsons:
        print("Load samples from {}".format(js_path))
        # The context manager closes the file deterministically once it has
        # been consumed instead of leaving the handle to the garbage collector.
        with open(js_path) as shard_file:
            for js in tqdm((json.loads(s) for s in shard_file)):
                samples.append(
                    TrainingSample(
                        row=make_coo_row(js["transaction_history"],
                                         product_encoder),
                        # Only the first entry of "target" is used.
                        target_items=set(
                            product_encoder.toIdx(
                                js["target"][0]["product_ids"])),
                        client_id=js["client_id"],
                    ))
    return samples
Beispiel #2
0
def get_train_data(max_rows=None):
    """Stack the normalized transaction rows of all shards into one matrix.

    Iterates over shards ``0 .. cfg.NUM_SHARDS - 1``, parses every line of
    each shard file as JSON and converts its "transaction_history" into a
    row with ``make_coo_row(..., normalize=True)``.

    Args:
        max_rows: Optional cap on the number of rows. Reading stops as soon
            as exactly this many rows were collected. A falsy value
            (``None`` or ``0``) means "no limit".

    Returns:
        The result of ``sp.vstack`` over the collected rows.
    """
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    for shard_idx in tqdm(range(cfg.NUM_SHARDS)):
        # ``with`` guarantees the shard file is closed, including on the
        # early return below.
        with open(get_shard_path(shard_idx)) as shard_file:
            for js in tqdm(json.loads(s) for s in shard_file):
                rows.append(
                    make_coo_row(js["transaction_history"],
                                 product_encoder,
                                 normalize=True))

                # len(rows) is the running row count across all shards.
                if max_rows and len(rows) == max_rows:
                    return sp.vstack(rows)

    return sp.vstack(rows)
Beispiel #3
0
                storage[key] = (storage[key] + item_cost) / 2.0


if __name__ == "__main__":
    # Script entry point: scans every shard, accumulates per-item cost and
    # purchase counts, and writes them to "products_misc.csv" in
    # cfg.ASSETS_DIR.
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
    num_products = product_encoder.num_products

    items_cost = defaultdict(int)
    rows = []
    num_transactions = 0
    for i in tqdm(range(cfg.NUM_SHARDS)):
        # Close each shard file as soon as it has been read instead of
        # leaking one open handle per shard.
        with open(get_shard_path(i)) as shard_file:
            for js in tqdm((json.loads(s) for s in shard_file)):
                update_item_cost(
                    js["transaction_history"], product_encoder, items_cost
                )
                rows.append(
                    make_coo_row(
                        js["transaction_history"],
                        product_encoder,
                        normalize=False,
                    )
                )
                num_transactions += len(js["transaction_history"])
    trans_mat = sp.vstack(rows)

    # Per-column totals of the stacked (non-normalized) rows.
    items_cnt = trans_mat.sum(axis=0).A[0]
    df_top_items = (
        pd.Series(items_cnt, name="items_cnt").sort_values(ascending=False).to_frame()
    )
    df_items_cost = pd.Series(items_cost, name="cost").to_frame()
    df_misc_features = df_top_items.join(df_items_cost)
    # The frame is ordered by descending count, so position 0 is the most
    # frequent item. NOTE(review): assumes the matrix has exactly
    # num_products columns; confirm against make_coo_row.
    df_misc_features["popularity_position"] = range(num_products)

    df_misc_features.to_csv(cfg.ASSETS_DIR / "products_misc.csv")
 def predict(self, transactions_history):
     """Return recommended product ids for one transaction history.

     The history is converted to a CSR row with ``make_coo_row`` and passed
     to ``self.model.recommend`` (N=30, already-seen items are not filtered
     out, user factors are recalculated). The recommended indices are mapped
     back through ``self.product_encoder.toPid``.
     """
     user_row = make_coo_row(transactions_history, self.product_encoder)
     user_row = user_row.tocsr()
     recommendations = self.model.recommend(
         userid=0,
         user_items=user_row,
         N=30,
         filter_already_liked_items=False,
         recalculate_user=True,
     )
     # Each recommendation is an (index, score) pair; only the index is kept.
     recommended_indices = [item_idx for item_idx, _score in recommendations]
     return self.product_encoder.toPid(recommended_indices)
from utils import (
    ProductEncoder,
    get_shard_path,
    make_coo_row,
    normalized_average_precision,
)

if __name__ == "__main__":
    # Script entry point: fits an implicit nearest-neighbour model on the
    # normalized transaction rows and pickles it into cfg.ASSETS_DIR.
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    # Every shard except the last one is read here.
    for i in range(cfg.NUM_SHARDS - 1):
        # Close each shard file once it has been read instead of leaking
        # the handle.
        with open(get_shard_path(i)) as shard_file:
            for js in tqdm((json.loads(s) for s in shard_file)):
                rows.append(
                    make_coo_row(js["transaction_history"],
                                 product_encoder,
                                 normalize=True))
    train_mat = sp.vstack(rows)

    model = implicit.nearest_neighbours.CosineRecommender(K=2)
    # model = implicit.nearest_neighbours.CosineRecommender(K=50)
    # model = implicit.nearest_neighbours.TFIDFRecommender(K=100)

    # ALS should be trained with normalize = False
    # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12)
    model.fit(train_mat.T)

    out_dir = cfg.ASSETS_DIR
    os.makedirs(out_dir, exist_ok=True)
    print(f"Dump model to {out_dir}")
    # Write through a context manager so the pickle is flushed and the
    # handle closed before the script exits.
    with open(out_dir / "model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
Beispiel #6
0
    def get_gbm_features(self,
                         js,
                         train=False,
                         drop_null_target_records=False,
                         add_target_records=False):
        """Build one feature record per candidate product for a single client.

        Args:
            js: Client record (dict). Reads "transaction_history" (optional,
                treated as empty when missing), "age" and "gender"; when
                ``train`` is true also "target" and "client_id". The
                "transaction_history" entry is replaced in place by a copy
                sorted by each transaction's "datetime".
            train: When true, every record additionally gets "target" and
                "client_id", and the ground-truth products are returned too.
            drop_null_target_records: Not used in this method body.
            add_target_records: Not used in this method body.

        Returns:
            ``gbm_records`` (a list of dicts, one per item from
            ``self.get_items_pool``) when ``train`` is false; otherwise the
            tuple ``(gbm_records, gt_products)`` where ``gt_products`` is
            ``dict(client_id=..., products=[...])``.

        Raises:
            TypeError: from ``dict(**...)`` if two feature groups produce the
                same key.
        """
        # Sort the history chronologically so that the last element is the
        # most recent transaction. A missing history is treated as empty,
        # consistent with the empty-history branch below; previously this
        # line raised KeyError before that branch could run.
        transaction_history = sorted(js.get("transaction_history", []),
                                     key=lambda x: x["datetime"])
        js["transaction_history"] = transaction_history

        if train:
            # Only the first entry of "target" is used.
            target_products = set(
                self.product_encoder.toIdx(js["target"][0]["product_ids"]))

        if transaction_history:
            last_transaction_date = self.to_date(
                transaction_history[-1].get("datetime"))
            # num days from 2000/1/1
            last_transaction_timestamp = (last_transaction_date.toordinal() -
                                          self.datetime_stamp_2000)
        else:
            last_transaction_date = None
            last_transaction_timestamp = None
        num_days_from_last_transaction = self.get_num_days_from_last_transaction(
            js, last_transaction_date)

        # Two views of the same history: raw (normalize=False) and the
        # make_coo_row default.
        product_ind_csr_not_normed = make_coo_row(transaction_history,
                                                  self.product_encoder,
                                                  normalize=False).tocsr()
        product_ind_csr = make_coo_row(transaction_history,
                                       self.product_encoder).tocsr()

        # Candidate items; one output record is produced for each of them.
        selected_items = list(
            self.get_items_pool(product_ind_csr, product_ind_csr_not_normed))

        cosine50_scores = self.get_implicit_scores(self.model,
                                                   product_ind_csr,
                                                   selected_items,
                                                   model_prefix="cosine50")
        tf_idf_scores = self.get_implicit_scores(self.implicit_tfidf,
                                                 product_ind_csr,
                                                 selected_items,
                                                 model_prefix="tfidf")
        cosine2_scores = self.get_implicit_scores(
            self.implicit_cosine2,
            product_ind_csr,
            selected_items,
            model_prefix="cosine2",
        )
        # The ALS model is scored on the non-normalized row.
        als_scores = self.get_implicit_scores(
            self.implicit_als,
            product_ind_csr_not_normed,
            selected_items,
            model_prefix="als",
        )

        # co occurrence features
        # all user items purchases
        purchased_items = product_ind_csr.indices
        cooc_weights = product_ind_csr.data
        cooc_purchased_all_scores = self.item_co_occurrence[:, purchased_items]

        # Both results are consumed with next() once per selected item in the
        # loop below, so they must yield in selected_items order.
        cooc_norm_item_features, cooc_scores_norm_co_item_features = self.get_cooc_features(
            cooc_purchased_all_scores, selected_items, purchased_items,
            cooc_weights)

        # scores per each transaction
        cooc_purchased_scores = cooc_purchased_all_scores[purchased_items].A
        cooc_purchased_scores_norm_co_item = (
            cooc_purchased_scores / self.item_occurrence[purchased_items])
        purchased_item2pos = {
            pid: pos
            for pos, pid in enumerate(purchased_items)
        }

        product_history_features = self.get_product_feat_from_history(
            transaction_history,
            cooc_purchased_scores_norm_co_item,
            purchased_item2pos,
        )

        high_level_features = self.get_highlevel_feat_from_history(
            transaction_history)

        # faiss_features = self.get_faiss_features(product_ind_csr, selected_items)
        umap_scores = self.get_umap_scores(product_ind_csr, selected_items)

        gbm_records = []
        for product_idx in selected_items:
            # dict(**a, **b, ...) is kept on purpose: unlike {**a, **b} it
            # raises TypeError on duplicate keys instead of overwriting.
            record = dict(
                **{
                    "idx":
                    product_idx,
                    "age":
                    js["age"],
                    "gender":
                    self.feature_extractor["gender"][js["gender"]],
                    "num_transactions":
                    len(transaction_history),
                    "popularity_position":
                    self.pos_in_top[product_idx],
                    "last_transaction_timestamp":
                    last_transaction_timestamp,
                    "num_days_from_last_transaction":
                    num_days_from_last_transaction,
                },
                **high_level_features,
                **product_history_features.get(
                    product_idx, self.get_default_history_feat(product_idx)),
                **self.product_features.product_features(
                    self.product_encoder.toPid(int(product_idx))),
                **self.get_implicit_features(product_idx),
                **als_scores[product_idx],
                **tf_idf_scores[product_idx],
                **cosine50_scores[product_idx],
                **cosine2_scores[product_idx],
                **next(cooc_norm_item_features),
                **next(cooc_scores_norm_co_item_features),
                # **faiss_features[product_idx],
                **umap_scores[product_idx],
            )

            # The denominator is clamped to at least 1 to avoid dividing by
            # zero.
            record["item_pct_spent"] = record.get("item_spent", 0) / max(
                record.get("purchase_sum", 1), 1)

            if train:
                record["target"] = int(product_idx in target_products)
                record["client_id"] = js["client_id"]

            gbm_records.append(record)

        if train:
            gt_products = dict(client_id=js["client_id"],
                               products=list(target_products))
            return gbm_records, gt_products

        return gbm_records
    ProductEncoder,
    TrainingSample,
    coo_to_pytorch_sparse,
    get_shard_path,
    make_coo_row,
    normalized_average_precision,
)

if __name__ == "__main__":

    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    for i in range(15):
        for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))):
            rows.append(make_coo_row(js["transaction_history"], product_encoder))
    train_mat = sp.vstack(rows)

    model = implicit.nearest_neighbours.CosineRecommender(K=1)
    model.fit(train_mat.T)
    out_dir = "../tmp/implicit_cosine1/"
    os.makedirs(out_dir, exist_ok=True)
    print("Dump model to " + out_dir)
    pickle.dump(model, open(out_dir + "/model.pkl", "wb"))

    print("Estimate quiality...")
    scores = []
    for js in tqdm((json.loads(s) for s in open(get_shard_path(15)))):
        row = make_coo_row(js["transaction_history"], product_encoder).tocsr()
        raw_recs = model.recommend(
            userid=0, user_items=row, N=30, filter_already_liked_items=False, recalculate_user=True