def collect_train_data(
    jsons: List[str], product_encoder: ProductEncoder
) -> List[TrainingSample]:
    """Build training samples from JSON-lines files.

    Every line of every file listed in ``jsons`` is parsed as one JSON
    record. A record must provide ``"transaction_history"``,
    ``"client_id"`` and ``"target"`` (whose first element carries
    ``"product_ids"``).

    Args:
        jsons: Paths of the JSON-lines files to read, processed in order.
        product_encoder: Encoder passed to ``make_coo_row`` and used
            (via ``toIdx``) to convert target product ids.

    Returns:
        One ``TrainingSample`` per input line, in file/line order.
    """
    samples = []
    for js_path in jsons:
        print("Load samples from {}".format(js_path))
        # Open under a context manager so the handle is closed even if a
        # malformed line makes json.loads raise part-way through the file.
        with open(js_path) as js_file:
            for js in tqdm(json.loads(line) for line in js_file):
                samples.append(
                    TrainingSample(
                        row=make_coo_row(
                            js["transaction_history"], product_encoder
                        ),
                        target_items=set(
                            product_encoder.toIdx(
                                js["target"][0]["product_ids"]
                            )
                        ),
                        client_id=js["client_id"],
                    )
                )
    return samples
def get_train_data(max_rows=None):
    """Load transaction histories from all shards into one sparse matrix.

    Each JSON line of each shard contributes one row, built by
    ``make_coo_row(..., normalize=True)`` from its
    ``"transaction_history"``.

    Args:
        max_rows: If truthy, stop reading as soon as this many rows have
            been collected. ``None`` (or ``0``) reads every shard fully.

    Returns:
        The result of ``sp.vstack`` over the collected rows.
    """
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    for shard_idx in tqdm(range(cfg.NUM_SHARDS)):
        # The context manager closes the shard on normal exhaustion and
        # also on the early return below.
        with open(get_shard_path(shard_idx)) as shard_file:
            for js in tqdm(json.loads(line) for line in shard_file):
                rows.append(
                    make_coo_row(
                        js["transaction_history"],
                        product_encoder,
                        normalize=True,
                    )
                )
                # len(rows) is the number of rows read so far.
                if max_rows and len(rows) == max_rows:
                    return sp.vstack(rows)

    return sp.vstack(rows)
# NOTE(review): the statement below is the tail of a definition whose header
# is outside this view; it is reproduced unchanged.
storage[key] = (storage[key] + item_cost) / 2.0


if __name__ == "__main__":
    # Build "products_misc.csv": per-product purchase counts, cost and
    # popularity position, aggregated over every shard.
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
    num_products = product_encoder.num_products

    items_cost = defaultdict(int)
    rows = []
    num_transactions = 0
    for i in tqdm(range(cfg.NUM_SHARDS)):
        # Context manager guarantees each shard file is closed after use.
        with open(get_shard_path(i)) as shard_file:
            for js in tqdm(json.loads(line) for line in shard_file):
                # update_item_cost accumulates into items_cost in place.
                update_item_cost(
                    js["transaction_history"], product_encoder, items_cost
                )
                rows.append(
                    make_coo_row(
                        js["transaction_history"],
                        product_encoder,
                        normalize=False,
                    )
                )
                num_transactions += len(js["transaction_history"])

    trans_mat = sp.vstack(rows)
    # Column sums of the stacked (non-normalized) rows, one value per column.
    items_cnt = trans_mat.sum(axis=0).A[0]

    # Sorted descending, so row order in the frame is popularity order.
    df_top_items = (
        pd.Series(items_cnt, name="items_cnt")
        .sort_values(ascending=False)
        .to_frame()
    )
    df_items_cost = pd.Series(items_cost, name="cost").to_frame()

    df_misc_features = df_top_items.join(df_items_cost)
    # Position 0 is assigned to the first (highest-count) row.
    df_misc_features["popularity_position"] = range(num_products)
    df_misc_features.to_csv(cfg.ASSETS_DIR / "products_misc.csv")
def predict(self, transactions_history):
    """Recommend products for a single transaction history.

    The history is encoded with ``make_coo_row`` and converted to CSR,
    then ``self.model.recommend`` is asked for 30 items with
    ``recalculate_user=True`` and without filtering already-liked items.

    Args:
        transactions_history: History passed straight to ``make_coo_row``.

    Returns:
        Whatever ``self.product_encoder.toPid`` returns for the list of
        recommended item indices, in the order the model returned them.
    """
    user_row = make_coo_row(transactions_history, self.product_encoder)
    recommendations = self.model.recommend(
        userid=0,
        user_items=user_row.tocsr(),
        N=30,
        filter_already_liked_items=False,
        recalculate_user=True,
    )
    # Each recommendation is an (index, score) pair; only the index is kept.
    item_indices = [item_idx for item_idx, _score in recommendations]
    return self.product_encoder.toPid(item_indices)
from utils import (
    ProductEncoder,
    get_shard_path,
    make_coo_row,
    normalized_average_precision,
)

if __name__ == "__main__":
    # Fit an item-item cosine model on normalized transaction rows and
    # pickle it into cfg.ASSETS_DIR.
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    # NOTE(review): the last shard is skipped here, presumably held out for
    # evaluation; confirm against the evaluation script.
    for i in range(cfg.NUM_SHARDS - 1):
        # Context manager guarantees each shard file is closed after use.
        with open(get_shard_path(i)) as shard_file:
            for js in tqdm(json.loads(line) for line in shard_file):
                rows.append(
                    make_coo_row(
                        js["transaction_history"],
                        product_encoder,
                        normalize=True,
                    )
                )
    train_mat = sp.vstack(rows)

    model = implicit.nearest_neighbours.CosineRecommender(K=2)
    # Alternative models kept for reference:
    # model = implicit.nearest_neighbours.CosineRecommender(K=50)
    # model = implicit.nearest_neighbours.TFIDFRecommender(K=100)
    # ALS should be trained with normalize = False
    # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12)
    model.fit(train_mat.T)

    out_dir = cfg.ASSETS_DIR
    os.makedirs(out_dir, exist_ok=True)
    print(f"Dump model to {out_dir}")
    # Close (and thereby flush) the output file deterministically instead of
    # relying on garbage collection of an anonymous handle.
    with open(out_dir / "model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
def get_gbm_features(self, js, train=False, drop_null_target_records=False, add_target_records=False):
    """Build one GBM feature record per candidate product for a client.

    Candidates come from ``self.get_items_pool``. Each record combines
    client-level fields, history-derived features, per-product features
    and scores from the implicit models, co-occurrence features and UMAP
    scores.

    Side effect: ``js["transaction_history"]`` is replaced by a copy
    sorted by ``"datetime"`` (an empty list if the key was absent).

    Args:
        js: Client record. Reads ``"transaction_history"`` (optional),
            ``"age"``, ``"gender"`` and, when ``train`` is true,
            ``"target"`` and ``"client_id"``.
        train: When true, add ``"target"`` and ``"client_id"`` to every
            record and also return the ground-truth products.
        drop_null_target_records: Not used by this method body.
        add_target_records: Not used by this method body.

    Returns:
        ``gbm_records`` (a list of dicts) when ``train`` is false;
        otherwise ``(gbm_records, gt_products)`` where ``gt_products`` is
        a dict with ``client_id`` and the list of target product indices.
    """
    # Sort history as in the public data. Read it with .get() once, so a
    # record without history takes the empty-history path that the rest of
    # the method already supports, instead of raising KeyError here.
    transaction_history = sorted(
        js.get("transaction_history", []), key=lambda x: x["datetime"]
    )
    js["transaction_history"] = transaction_history

    if train:
        target_products = set(
            self.product_encoder.toIdx(list(js["target"][0]["product_ids"]))
        )

    if transaction_history:
        last_transaction_date = self.to_date(
            transaction_history[-1].get("datetime")
        )
        # Day ordinal relative to self.datetime_stamp_2000
        # (num days from 2000/1/1).
        last_transaction_timestamp = (
            last_transaction_date.toordinal() - self.datetime_stamp_2000
        )
    else:
        last_transaction_date = None
        last_transaction_timestamp = None

    num_days_from_last_transaction = self.get_num_days_from_last_transaction(
        js, last_transaction_date
    )

    product_ind_csr_not_normed = make_coo_row(
        transaction_history, self.product_encoder, normalize=False
    ).tocsr()
    product_ind_csr = make_coo_row(
        transaction_history, self.product_encoder
    ).tocsr()

    selected_items = list(
        self.get_items_pool(product_ind_csr, product_ind_csr_not_normed)
    )

    cosine50_scores = self.get_implicit_scores(
        self.model,
        product_ind_csr,
        selected_items,
        model_prefix="cosine50",
    )
    tf_idf_scores = self.get_implicit_scores(
        self.implicit_tfidf,
        product_ind_csr,
        selected_items,
        model_prefix="tfidf",
    )
    cosine2_scores = self.get_implicit_scores(
        self.implicit_cosine2,
        product_ind_csr,
        selected_items,
        model_prefix="cosine2",
    )
    # ALS is the only model scored on the non-normalized row.
    als_scores = self.get_implicit_scores(
        self.implicit_als,
        product_ind_csr_not_normed,
        selected_items,
        model_prefix="als",
    )

    # Co-occurrence features over all items the user purchased.
    purchased_items = product_ind_csr.indices
    cooc_weights = product_ind_csr.data
    cooc_purchased_all_scores = self.item_co_occurrence[:, purchased_items]
    cooc_norm_item_features, cooc_scores_norm_co_item_features = (
        self.get_cooc_features(
            cooc_purchased_all_scores,
            selected_items,
            purchased_items,
            cooc_weights,
        )
    )

    # Scores per each transaction.
    cooc_purchased_scores = cooc_purchased_all_scores[purchased_items].A
    cooc_purchased_scores_norm_co_item = (
        cooc_purchased_scores / self.item_occurrence[purchased_items]
    )
    purchased_item2pos = {
        pid: pos for pos, pid in enumerate(purchased_items)
    }

    product_history_features = self.get_product_feat_from_history(
        transaction_history,
        cooc_purchased_scores_norm_co_item,
        purchased_item2pos,
    )
    high_level_features = self.get_highlevel_feat_from_history(
        transaction_history
    )

    # faiss_features = self.get_faiss_features(product_ind_csr, selected_items)
    umap_scores = self.get_umap_scores(product_ind_csr, selected_items)

    gbm_records = []
    for product_idx in selected_items:
        # dict(**a, **b) is kept on purpose: unlike a dict literal it
        # raises on duplicate keys across the merged feature groups.
        record = dict(
            **{
                "idx": product_idx,
                "age": js["age"],
                "gender": self.feature_extractor["gender"][js["gender"]],
                "num_transactions": len(transaction_history),
                "popularity_position": self.pos_in_top[product_idx],
                "last_transaction_timestamp": last_transaction_timestamp,
                "num_days_from_last_transaction": num_days_from_last_transaction,
            },
            **high_level_features,
            **product_history_features.get(
                product_idx, self.get_default_history_feat(product_idx)
            ),
            **self.product_features.product_features(
                self.product_encoder.toPid(int(product_idx))
            ),
            **self.get_implicit_features(product_idx),
            **als_scores[product_idx],
            **tf_idf_scores[product_idx],
            **cosine50_scores[product_idx],
            **cosine2_scores[product_idx],
            # The two co-occurrence iterators are advanced once per
            # candidate; presumably they yield in selected_items order --
            # confirm against get_cooc_features.
            **next(cooc_norm_item_features),
            **next(cooc_scores_norm_co_item_features),
            # **faiss_features[product_idx],
            **umap_scores[product_idx],
        )
        # Denominator is clamped to at least 1.
        record["item_pct_spent"] = record.get("item_spent", 0) / max(
            record.get("purchase_sum", 1), 1
        )

        if train:
            record["target"] = int(product_idx in target_products)
            record["client_id"] = js["client_id"]

        gbm_records.append(record)

    if train:
        gt_products = dict(
            client_id=js["client_id"], products=list(target_products)
        )
        return gbm_records, gt_products

    return gbm_records
ProductEncoder, TrainingSample, coo_to_pytorch_sparse, get_shard_path, make_coo_row, normalized_average_precision, ) if __name__ == "__main__": product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) rows = [] for i in range(15): for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))): rows.append(make_coo_row(js["transaction_history"], product_encoder)) train_mat = sp.vstack(rows) model = implicit.nearest_neighbours.CosineRecommender(K=1) model.fit(train_mat.T) out_dir = "../tmp/implicit_cosine1/" os.makedirs(out_dir, exist_ok=True) print("Dump model to " + out_dir) pickle.dump(model, open(out_dir + "/model.pkl", "wb")) print("Estimate quiality...") scores = [] for js in tqdm((json.loads(s) for s in open(get_shard_path(15)))): row = make_coo_row(js["transaction_history"], product_encoder).tocsr() raw_recs = model.recommend( userid=0, user_items=row, N=30, filter_already_liked_items=False, recalculate_user=True