def make_data(self, agg_column, filtr=None, full=False):
    """Populate ``self.user_item_matrix`` with user-item / item-user matrices.

    Builds the raw (weighted) and binarized user-item matrices plus tf-idf
    and bm25 weighted item-user variants, records the build parameters, and
    returns the populated dict.
    """
    self.full = full
    uim = self.prepare_matrix(agg_column=agg_column, full=full, filtr=filtr)
    uim_w = uim.copy()  # keep raw counts before binarizing below

    def as_csr(m):
        # normalize any matrix-like input to CSR
        return csr_matrix(m).tocsr()

    store = self.user_item_matrix
    store['uim_matrix_w'] = as_csr(uim_w)
    uim[uim > 0] = 1  # binary preference matrix
    store['uim_matrix'] = as_csr(uim)
    # "ium" variants are item-user (transposed) matrices
    store['ium_matrix_w_tfidf'] = tfidf_weight(as_csr(uim_w.T))
    store['ium_matrix_tfidf'] = tfidf_weight(as_csr(uim.T))
    store['ium_matrix_w_bm25'] = bm25_weight(as_csr(uim_w.T))
    store['ium_matrix_bm25'] = bm25_weight(as_csr(uim.T))
    store['status'] = True
    store['params'] = {
        'agg_column': agg_column,
        'filtr': filtr,
        'full': full
    }
    return store
def __init__(self, data, values='weight', aggfunc='count', weighting='bm25'):
    """Build per-user and global item tops, id mappings and a (optionally
    tf-idf / bm25 weighted) user-item matrix from the interaction log."""
    real_items = data[data['item_id'] != FAKE_ITEM]

    # per-user item ranking, strongest interactions first
    self.users_top = real_items.groupby(['user_id', 'item_id']).agg({values: aggfunc}).reset_index()
    self.users_top.sort_values(values, ascending=False, inplace=True)

    # global item ranking as a plain list of item ids
    self.top = real_items.groupby('item_id').agg({values: aggfunc}).reset_index()
    self.top = self.top.sort_values(values, ascending=False).item_id.tolist()

    pivot = pd.pivot_table(data,
                           index='user_id', columns='item_id',
                           values=values, aggfunc=aggfunc,
                           fill_value=0).astype(float)

    self.userids = pivot.index.values
    self.itemids = pivot.columns.values
    user_positions = np.arange(len(self.userids))
    item_positions = np.arange(len(self.itemids))
    # bidirectional mappings between matrix positions and raw ids
    self.id_to_itemid = dict(zip(item_positions, self.itemids))
    self.id_to_userid = dict(zip(user_positions, self.userids))
    self.itemid_to_id = dict(zip(self.itemids, item_positions))
    self.userid_to_id = dict(zip(self.userids, user_positions))
    self.FAKE_ITEM_ID = self.itemid_to_id[FAKE_ITEM]
    self.__userTop = {}  # per-user top cache, filled lazily elsewhere

    # weighting is applied item-wise, hence the transposes
    if weighting == 'tfidf':
        self.user_item_matrix = tfidf_weight(pivot.T).T
    elif weighting == 'bm25':
        self.user_item_matrix = bm25_weight(pivot.T).T
    else:
        self.user_item_matrix = pivot
    self.csr_matrix = csr_matrix(self.user_item_matrix).T.tocsr()
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Train the chosen model on MovieLens and write, for each movie, its
    most similar movies as a TSV of (title, other_title, score)."""
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # number of users per movie, via CSR row-pointer differences
    user_count = np.ediff1d(ratings.indptr)
    # most-rated movies first
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4 meaning we've filtered out all data for it.
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
def __init__(self, data, weighting=True):
    """Build purchase tops, the user-item matrix, id mappings and fit the
    ALS and bm25 recommenders."""
    # Top purchases per (user, item) pair
    self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
    # 999999 is presumably a dummy/service item id — excluded from tops
    self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

    # Top purchases over the whole dataset
    self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
    self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
    self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
    self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

    # User-item matrix
    self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
    self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(
        self.user_item_matrix)

    # Matrix weighting (item-wise, hence the transposes)
    if weighting:
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    # Training and recommendations
    self.model = self.fit(self.user_item_matrix)
    # self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
    # self.cosin_recommender = self.fit_cosin_recommender(self.user_item_matrix)
    # self.tfidf_recommender = self.fit_tfidf_recommender(self.user_item_matrix)
    # self.tfidf100_recommender = self.fit_tfidf100_recommender(self.user_item_matrix)
    # NOTE(review): when weighting=True the matrix handed to
    # fit_bm25_recommender is already bm25-weighted — confirm the double
    # weighting is intended.
    self.bm25_recommender = self.fit_bm25_recommender(self.user_item_matrix)
def calculate_recommendations(input_filename, output_filename, model_name="als"): """ Generates artist recommendations for each user in the dataset """ # train the model based off input params df, plays = read_data(input_filename) # create a model from the input data model = get_model(model_name) # if we're training an ALS based model, weight input for last.fm # by bm25 if issubclass(model.__class__, AlternatingLeastSquares): # lets weight these models by bm25weight. logging.debug("weighting matrix by bm25_weight") plays = bm25_weight(plays, K1=100, B=0.8) # also disable building approximate recommend index model.approximate_similar_items = False logging.debug("training model %s", model_name) start = time.time() model.fit(plays) logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start) # generate recommendations for each user and write out to a file artists = dict(enumerate(df['artist'].cat.categories)) start = time.time() user_plays = plays.T.tocsr() with open(output_filename, "w") as o: for userid, username in enumerate(df['user'].cat.categories): for artistid, score in model.recommend(userid, user_plays): o.write("%s\t%s\t%s\n" % (username, artists[artistid], score)) logging.debug("generated recommendations in %0.2fs", time.time() - start)
def __init__(self, data, item_features, weighting=True, n_factors=50):
    """Build purchase tops, id mappings and CTM flags; fit the own
    recommender on the raw matrix, then bm25-weight and fit the ALS model.

    :param data: interaction log with user_id / item_id / quantity columns
    :param item_features: item metadata with item_id / brand columns
    :param weighting: apply bm25 weighting before fitting the main model
    :param n_factors: latent factor count for the ALS model
    """
    # Top purchases per user (999999 is the dummy item, excluded)
    self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
    self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

    # Top purchases over the whole dataset
    self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
    self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
    self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
    self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

    self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
    self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(
        self.user_item_matrix)

    # {item_id: 0/1} — whether the item is a private-label ("СТМ") product
    self.item_id_to_ctm = dict(zip(item_features["item_id"], item_features["brand"] == "Private"))

    # BUGFIX: the own recommender must be trained on the *raw* matrix,
    # i.e. before bm25 weighting — the original fit it after weighting,
    # contradicting its own comment and the sibling implementations in
    # this file.
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    if weighting:
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    self.n_factors = n_factors
    self.model = self.fit(self.user_item_matrix, n_factors=n_factors)

    self.items_emb_df, self.users_emb_df = self.prepare_embeddings(self)
def __init__(self, data, data_product, weighting=True):
    """Build purchase tops, id mappings and CTM flags, fit the own
    recommender on the raw matrix, then optionally bm25-weight it and fit
    the main model."""
    # top purchases per user (dummy item 999999 excluded)
    per_user = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    per_user.sort_values('quantity', ascending=False, inplace=True)
    self.top_purchases = per_user[per_user['item_id'] != 999999]

    # top purchases over the whole dataset
    overall = data.groupby('item_id')['quantity'].count().reset_index()
    overall.sort_values('quantity', ascending=False, inplace=True)
    overall = overall[overall['item_id'] != 999999]
    self.overall_top_purchases = overall.item_id.tolist()

    self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
    (self.id_to_itemid, self.id_to_userid,
     self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(self.user_item_matrix)

    # {item_id: 0/1} — private-label ("СТМ") membership flag
    self.item_id_to_ctm = self.prepare_ctm(data_product)

    # the own recommender is trained before weighting, on raw counts
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    if weighting:
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    self.model = self.fit(self)
def __init__(self, data, top_popular, item_features, item_mean_cost, popular_exp_item, weighting=True):
    """Store auxiliary lookups, build the (optionally bm25-weighted)
    user-item matrix, fit both recommenders and precompute candidates."""
    # plain passthrough attributes
    passthrough = (('top_popular', top_popular),
                   ('item_features', item_features),
                   ('item_mean_cost', item_mean_cost),
                   ('popular_exp_item', popular_exp_item))
    for attr_name, value in passthrough:
        setattr(self, attr_name, value)

    self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
    (self.id_to_itemid, self.id_to_userid,
     self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(self.user_item_matrix)

    if weighting:
        # bm25 in item-user orientation, then back to user-item
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T, K1=12, B=0.165).T

    self.model = self.fit(self.user_item_matrix)
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
    self.all_recommendations = self.get_all_recommendations(self.model, N=200)
def __init__(self, data, weighting='tfidf'):
    """Prepare purchase tops, the user-item matrix and id dicts, weight the
    matrix (tf-idf by default, bm25 otherwise) and fit both recommenders."""
    print('Preparing tops...')
    # top purchases per user, dummy item 999999 excluded
    per_user = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    per_user.sort_values('quantity', ascending=False, inplace=True)
    self.top_purchases = per_user[per_user['item_id'] != 999999]

    # top purchases over the whole dataset, as a list of item ids
    overall = data.groupby('item_id')['quantity'].count().reset_index()
    overall.sort_values('quantity', ascending=False, inplace=True)
    self.overall_top_purchases = overall[overall['item_id'] != 999999].item_id.tolist()

    print('Preparing matrix...')
    self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame

    print('Preparing dicts...')
    (self.id_to_itemid, self.id_to_userid,
     self.itemid_to_id, self.userid_to_id) = self._prepare_dicts(self.user_item_matrix)

    print('Weighting...')
    weigh = tfidf_weight if weighting == 'tfidf' else bm25_weight
    self.user_item_matrix = weigh(self.user_item_matrix.T).T

    print('Fitting als...')
    self.model = self.fit(self.user_item_matrix)

    print('Fitting own recommender...')
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
    print('Complete.')
def __init__(self, data, values='quantity', weighting=True):
    """Build the overall top-items list, the (optionally bm25-weighted)
    user-item matrix, id dictionaries, and fit both recommenders."""
    # overall top purchased items; dummy item 999999 excluded
    overall = data.groupby('item_id')['quantity'].count().reset_index()
    overall.sort_values('quantity', ascending=False, inplace=True)
    self.overall_top_purchases = overall[overall['item_id'] != 999999].item_id.tolist()

    # user-item matrix
    self.user_item_matrix = self._prepare_matrix(data, values)  # pd.DataFrame

    # helper id dictionaries
    (self.id_to_itemid, self.id_to_userid,
     self.itemid_to_id, self.userid_to_id) = self._prepare_dicts(self.user_item_matrix)

    if weighting:
        # weight item-wise, then transpose back
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    # ALS model
    self.model = self.fit(self.user_item_matrix)
    # ItemItemRecommender model
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
def setup_model(self, k1=1.2, b=0.75, factors=64, regularization=0.001, use_native=True, use_cg=True):
    """Assemble the bm25-weighted content x person CSR matrix and create an
    (unfitted) ALS model with the given hyperparameters."""
    # COO is the cheapest format to assemble from (value, (row, col)) triplets
    strengths = self.data['eventStrength'].astype(float)
    coords = (self.data['contentId'], self.data['personId'])
    content_person = sparse.coo_matrix((strengths, coords))
    content_person = bm25_weight(content_person, K1=k1, B=b)
    self.sparse_matrix = content_person.tocsr()
    self.model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                      regularization=regularization,
                                                      use_native=use_native,
                                                      use_cg=use_cg)
def __init__(self, data, weighting=True):
    """Compute purchase tops, build the (optionally bm25-weighted)
    user-item matrix and fit both recommenders."""
    # top purchases per user; dummy item 999999 excluded
    per_user = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    per_user.sort_values('quantity', ascending=False, inplace=True)
    self.top_purchases = per_user[per_user['item_id'] != 999999]

    # overall top purchases as a list of item ids
    overall = data.groupby('item_id')['quantity'].count().reset_index()
    overall.sort_values('quantity', ascending=False, inplace=True)
    self.overall_top_purchases = overall[overall['item_id'] != 999999].item_id.tolist()

    self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
    (self.id_to_itemid, self.id_to_userid,
     self.itemid_to_id, self.userid_to_id) = self._prepare_dicts(self.user_item_matrix)

    if weighting:
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    self.model = self.fit(self.user_item_matrix)
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
def __init__(self, data, weighted=True):
    """Build purchase tops, the user-item matrix and id mappings, fit the
    own recommender on the raw matrix, then optionally bm25-weight it."""
    # id of the service/dummy item excluded from the tops
    self.FILTER_ID = FILTER_ID
    # top purchases per (user, item)
    self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
    self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != self.FILTER_ID]

    # top purchases over the whole dataset, as a plain list of ids
    self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
    self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
    self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != self.FILTER_ID]
    self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

    self._user_item_matrix = self.prepare_matrix(data)
    # NOTE(review): self.user_item_matrix (no underscore) is read below —
    # presumably a property exposing _user_item_matrix; confirm it exists
    # on the class.
    (self.id_to_item_id, self.id_to_user_id,
     self.item_id_to_id, self.user_id_to_id) = self.prepare_dicts(self.user_item_matrix)

    # own recommender is fit *before* weighting, on the raw matrix
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
    if weighted:
        self._user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    # model state initialized lazily elsewhere
    self._model = None
    self._user_factors = None
    self._item_factors = None
def train_als(train_df, test_df, min_rating=4.0):
    """Train an implicit ALS model on binarized, bm25-weighted ratings.

    :param train_df: DataFrame with 'user_id', 'item_id', 'rating' columns
    :param test_df: unused here; kept for call-site compatibility
    :param min_rating: ratings below this threshold are dropped before
        binarizing
    :return: (model, users, items, ratings_csr) where ratings_csr is the
        items x users bm25-weighted training matrix
    """
    # BUGFIX: work on a copy — the original mutated the caller's DataFrame
    # in place by overwriting its columns with categoricals.
    train_df = train_df.copy()

    # map each user/item to a unique numeric value
    train_df['user_id'] = train_df['user_id'].astype("category")
    train_df['item_id'] = train_df['item_id'].astype("category")
    ratings = coo_matrix((train_df['rating'].astype(np.float32),
                          (train_df['item_id'].cat.codes.copy(),
                           train_df['user_id'].cat.codes.copy()))).tocsr()
    items = np.array(train_df['item_id'].cat.categories)
    users = np.array(train_df['user_id'].cat.categories)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    model = AlternatingLeastSquares()

    # lets weight these models by bm25weight.
    ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    # train the model
    start = time.time()
    model.fit(ratings)
    print("Training time: {}".format(time.time() - start))
    return model, users, items, ratings
def fit_trainset(self, raw_train_dataset):
    """Encode the raw trainset and build bm25-weighted interaction matrices.

    Side effects: sets the id mapping dicts, ``mapped_trainset``, the item
    and user index counts, and the ``item_users`` / ``user_items`` sparse
    matrices (kept as mutual transposes).
    """
    # deep-copy so encoding never mutates the caller's dataset
    trainset = copy.deepcopy(raw_train_dataset)
    self.mapping_dict, self.inv_mapping_dict = fit_coder(
        trainset, 'user', 'item', 'rating')
    self.mapped_trainset = code(copy.deepcopy(trainset), 'user', 'item',
                                'rating', self.mapping_dict)
    self.max_index_of_item = len(self.mapped_trainset.item.unique())
    self.max_index_of_user = len(self.mapped_trainset.user.unique())

    row = self.mapped_trainset.item.values
    col = self.mapped_trainset.user.values
    data = self.mapped_trainset.rating.values
    self.item_users = csr_matrix(
        (data, (row, col)),
        shape=(self.max_index_of_item, self.max_index_of_user))
    # bm25-weight in user-item orientation, then keep both orientations in
    # sync. FIX: the original ended with a redundant
    # `self.user_items = self.item_users.T.tocsr()` recomputation left over
    # from a commented-out experiment — removed.
    self.user_items = bm25_weight(self.item_users.T.tocsr(), B=0.7).tocsr() * 5
    self.item_users = self.user_items.T.tocsr()
def dump_factors():
    """HTTP handler: train ALS on the in-memory purchase counts and dump
    user/item factor matrices and id mappings to CSV files.

    Reads the factor count from the request query string and the purchase
    data from module-level globals.
    """
    numfactors = int(request.args['numfactors'].strip())
    model = AlternatingLeastSquares(factors=numfactors, dtype=np.float32, use_gpu=False, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False
    # flatten the nested {userid: {productid: count}} global into columns
    data = {'userid': [], 'productid': [], 'purchase_count': []}
    for userid in purchases:
        for productid in purchases[userid]:
            data['userid'].append(userid)
            data['productid'].append(productid)
            data['purchase_count'].append(purchases[userid][productid])
    df = pd.DataFrame(data)
    df['userid'] = df['userid'].astype("category")
    df['productid'] = df['productid'].astype("category")
    userids = list(df['userid'].cat.categories)
    userids_reverse = dict(zip(userids, list(range(len(userids)))))
    productids = list(df['productid'].cat.categories)
    productids_reverse = dict(zip(productids, list(range(len(productids)))))
    # products x users sparse matrix of purchase counts
    purchases_matrix = coo_matrix((df['purchase_count'].astype(np.float32),
                                   (df['productid'].cat.codes.copy(),
                                    df['userid'].cat.codes.copy())))
    print("Matrix shape: %s, max value: %.2f" % (np.shape(purchases_matrix), np.max(purchases_matrix)))
    purchases_matrix = bm25_weight(purchases_matrix, K1=2.0, B=0.25)
    purchases_matrix_T = purchases_matrix.T.tocsr()
    purchases_matrix = purchases_matrix.tocsr()  # to support indexing in recommend/similar_items functions
    model.fit(purchases_matrix)
    np.savetxt('item_factors.csv', model.item_factors, delimiter=',')
    np.savetxt('user_factors.csv', model.user_factors, delimiter=',')
    # NOTE(review): `recommendation.sub(r',', ' ', ...)` looks like a
    # regex-style substitution stripping commas before CSV output —
    # confirm what the module-level `recommendation` object is.
    with open('item_ids.csv', 'w') as f:
        for pid in productids_reverse:
            f.write("%s,%d,%s\n" % (pid, productids_reverse[pid], recommendation.sub(r',', ' ', productnames[pid])))
    with open('user_ids.csv', 'w') as f:
        for uid in userids_reverse:
            f.write("%s,%d,%s\n" % (uid, userids_reverse[uid], recommendation.sub(r',', ' ', usernames[uid])))
    return 'OK\n'
def __init__(self, data, item_features, weighting=True):
    """Prefilter the data, build the (optionally bm25-weighted) user-item
    matrix, fit the ALS and own recommenders, and precompute per-user
    purchase popularity for get_similar_items_recommendation."""
    data = self.prefilter_items(data, item_features)

    # predefined code
    self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
    self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(
        self.user_item_matrix)

    if weighting:
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    self.model = self.fit(self.user_item_matrix)
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    # popularity is grouped once here so get_similar_items_recommendation
    # does not regroup on every call
    self.popularity = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    self.popularity.sort_values('quantity', ascending=False, inplace=True)  # "popularity" = purchase count
    # FIX: the original guarded this filter with `if item_filter:` where
    # item_filter was the constant 999999 — an always-true, dead
    # conditional. The filter is now applied unconditionally.
    DUMMY_ITEM_ID = 999999  # dummy item injected by prefiltering
    self.popularity = self.popularity[self.popularity['item_id'] != DUMMY_ITEM_ID]
    self.popularity = self.popularity.groupby('user_id')  # pre-grouped by user
def normalize_X(X, norm_type):
    """Normalize the user-collection matrix *X* with tf-idf or bm25.

    Thin wrapper over the tfidf_weight and bm25_weight functions from the
    :mod:`implicit:implicit.nearest_neighbours` module.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        Sparse matrix of shape (n_users, n_collections).
    norm_type : str
        Either "bm25" or "tfidf".

    Returns
    -------
    scipy.sparse.csr_matrix
        The normalized matrix in CSR format.

    References
    ----------
    .. [1] bm25 and tfidf explanation: https://www.benfrederickson.com/distance-metrics/
    .. [2] https://github.com/benfred/implicit/blob/master/implicit/evaluation.pyx
    """
    _sparse_checker(X, '`X`')

    if norm_type == "tfidf":
        weighted = tfidf_weight(X)
    elif norm_type == "bm25":
        weighted = bm25_weight(X, K1=100, B=0.8)
    else:
        raise ValueError("Unknown `norm_type` parameter.")
    return weighted.tocsr()
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant='20m'):
    """Train the chosen model on MovieLens and write, for each movie, its
    most similar movies as a TSV of (title, other_title, score)."""
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # number of users per movie, via CSR row-pointer differences
    user_count = np.ediff1d(ratings.indptr)
    # most-rated movies first
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4 meaning we've filtered out all data for it.
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    """Train the chosen model on ratings read from *input_path* and write
    each movie's most similar movies as a TSV of (title, other, score)."""
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()
        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    # NOTE: the generator variable `m` below shadows the ratings matrix,
    # but only inside this comprehension.
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    # most-rated movies first
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
            # no ratings > 4 meaning we've filtered out all data for it.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def _set_implib_train_mat(self, train_mat):
    """Cache the training matrix in implicit's items x users orientation,
    optionally bm25-weighted, and scale regularization by its nnz."""
    # implib ALS expects matrix in items x users format
    mat = train_mat.T
    params = self.fit_params
    if params['use_bm25']:
        mat = bm25_weight(mat, K1=params['bm25_k1'], B=params['bm25_b'])
    self.implib_train_mat = mat
    # regularization is scaled by the number of stored interactions
    self.model.regularization = params['regularization'] * self.implib_train_mat.nnz
def __init__(self, data, weighting=True):
    """Build the user-item matrix and id dictionaries, optionally
    bm25-weight the matrix, and fit both recommenders."""
    self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
    (self.id_to_itemid, self.id_to_userid,
     self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(self.user_item_matrix)

    if weighting:
        # weight item-wise, then transpose back to user x item
        self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    self.model = self.fit(self.user_item_matrix)
    self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
def experiment(B, K1, conf, variant='20m', min_rating=3.0):
    """Run one hold-out experiment on MovieLens: hide random (movie, user)
    cells, train ALS (optionally bm25-weighted with the given B/K1), then
    print precision/recall at confidence threshold *conf* as CSV.

    Pass B="NA" to skip bm25 weighting entirely.
    """
    # read in the input data file
    _, ratings = get_movielens(variant)
    ratings = ratings.tocsr()
    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    training = ratings.tolil()  # makes a copy
    # remove some implicit ratings (make them zeros, i.e., missing)
    # (these ratings might have already been missing, in fact)
    movieids = np.random.randint(low=0, high=np.shape(ratings)[0], size=100000)
    userids = np.random.randint(low=0, high=np.shape(ratings)[1], size=100000)
    training[movieids, userids] = 0
    model = FaissAlternatingLeastSquares(factors=128, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False
    model.show_progress = False
    # possibly recalculate scores by bm25weight.
    if B != "NA":
        training = bm25_weight(training, B=B, K1=K1).tocsr()
    # train the model
    model.fit(training)
    # compute the predicted ratings: row-wise dot product of the sampled
    # item and user factor vectors
    moviescores = np.einsum('ij,ij->i', model.item_factors[movieids], model.user_factors[userids])
    # using confidence threshold, find boolean predictions
    preds = (moviescores >= conf)
    true_ratings = np.ravel(ratings[movieids, userids])
    # both model predicted True and user rated movie
    tp = true_ratings[preds].sum()
    #tp = ratings[:,userids][preds][movieids].sum()
    # model predicted True but user did not rate movie
    fp = preds.sum() - tp
    # model predicted False but user did rate movie
    fn = true_ratings.sum() - tp
    # guard the divisions against empty prediction / ground-truth sets
    if tp + fp == 0:
        prec = float('nan')
    else:
        prec = float(tp) / float(tp + fp)
    if tp + fn == 0:
        recall = float('nan')
    else:
        recall = float(tp) / float(tp + fn)
    if B != "NA":
        print("%.2f,%.2f,%.2f,%d,%d,%d,%.2f,%.2f" % (B, K1, conf, tp, fp, fn, prec, recall))
    else:
        print("NA,NA,%.2f,%d,%d,%d,%.2f,%.2f" % (conf, tp, fp, fn, prec, recall))
def calculate_similar_businesses(input_filename, output_filename, model_name="als", factors=50, regularization=0.01, iterations=15, exact=False, trees=20, use_native=True, dtype=numpy.float64, cg=False): logging.debug("Calculating similar businesses. This might take a while") # read in the input data file logging.debug("reading data from %s", input_filename) start = time.time() df, ratings = read_data(input_filename) logging.debug("read data file in %s", time.time() - start) # generate a recommender model based off the input params if exact: model = AlternatingLeastSquares(factors=factors, regularization=regularization, use_native=use_native, use_cg=cg, dtype=dtype) else: model = AnnoyAlternatingLeastSquares(factors=factors, regularization=regularization, use_native=use_native, use_cg=cg, dtype=dtype) # lets weight these models by bm25weight. logging.debug("weighting matrix by bm25_weight") ratings = bm25_weight(ratings, K1=100, B=0.8) # train the model logging.debug("training model %s", model_name) start = time.time() model.fit(ratings) logging.debug("trained model '%s' in %s", model_name, time.time() - start) # write out similar businesses by popularity logging.debug("calculating top businesses") user_count = df.groupby('business').size() businesses = dict(enumerate(df['business'].cat.categories)) to_generate = sorted(list(businesses), key=lambda x: -user_count[x]) # write out as a TSV of businessid, otherbusinessid, score with open(output_filename, "w") as o: for businessid in to_generate: business = businesses[businessid] for other, score in model.similar_items(businessid, 11): o.write("%s\t%s\t%s\n" % (business, businesses[other], score))
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    """
    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: which model to use
    :param min_rating: rating threshold used for filtering
    :return:
    """
    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    # pick the model; ALS additionally bm25-weights the input matrix
    if model_name == "als":
        model = AlternatingLeastSquares()
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender()
    else:
        raise NotImplementedError("TODU: model %s" % model_name)

    m = m.tocsr()
    logging.debug("Training model :%s" % model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = rating_data.groupby("movieId").size()
    # NOTE: the generator variable `m` below shadows the ratings matrix,
    # but only inside this comprehension.
    movie_lookup = dict((i, m) for i, m in zip(movies_data['movieId'], movies_data['title']))
    # most-rated movies first
    to_generate = sorted(list(movies_data['movieId']), key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            # movies whose CSR row is empty were fully filtered out — skip
            if(m.indptr[movieid] == m.indptr[movieid + 1]):
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_recommendations(input_filename, output_filename, model_name):
    """ Generates track_uri recommendations for each pid in the dataset """
    df, track_count = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        track_count = bm25_weight(track_count, K1=100, B=0.8)

        # disable building approximate recommend index
        model.approximate_similar_items = False

    # transpose the training_matrix
    track_count = track_count.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(track_count)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each pid and creating submission file
    first_line = 'team_info,team_name,main,[email protected]'
    # NOTE(review): recs starts with an empty string, so each joined line
    # begins with a leading comma — confirm the submission format expects
    # this.
    recs = ['']
    tracks = dict(enumerate(df['track_uri'].cat.categories))
    start = time.time()
    # playlists x tracks orientation for recommend()
    pid_track_counts = track_count.T.tocsr()
    with codecs.open(output_filename, "w") as o:
        o.write("%s \n" % (first_line))
        o.write("\n")
        for playlist_id, pid in enumerate(df['pid'].cat.categories):
            for track_id, score in model.recommend(playlist_id, pid_track_counts, N=500):
                recs.append(tracks[track_id])
            # only write out challenge playlists (pid >= 1000000)
            if int(pid) >= 1000000:
                o.write("%s" % (pid))
                recs = ','.join(map(str, recs))
                o.write(recs)
                o.write("\n")
                o.write("\n")
            recs = ['']
    logging.debug("generated recommendations in %0.2fs", time.time() - start)
def calculateSimilarArtists(output_filename, dataset, modelName="als"):
    """Write a TSV of (artist, similar artist, score) rows for every artist
    in the chosen dataset, using the model's 'similar_items' API."""
    print(f"getting dataset {dataset}")
    loader = dataSets.get(dataset)
    if not loader:
        raise ValueError(f"Unknown Model {dataset}")
    artists, users, plays = loader()

    model = getModel(modelName)

    # ALS-based models train on BM25-weighted input; the approximate
    # recommend index is not needed here, so skip building it.
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        model.approximate_recommend = False

    # converting to CSR is expensive — do it exactly once
    plays = plays.tocsr()

    logging.debug("training model %s", modelName)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", modelName, time.time() - start)

    # rank artists by listener count (CSR row lengths) so popular ones come first
    start = time.time()
    logging.debug("calculating top artists")
    listeners = np.ediff1d(plays.indptr)
    ordering = sorted(np.arange(len(artists)), key=lambda idx: -listeners[idx])

    # emit a TSV of artistid, otherartistid, score
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(ordering)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for artist_id in ordering:
                name = artists[artist_id]
                for neighbour, score in model.similar_items(artist_id, 11):
                    o.write("%s\t%s\t%s\n" % (name, artists[neighbour], score))
                progress.update(1)
    logging.debug("generated similar artists in %0.2fs", time.time() - start)
def run_benchmark(args):
    """Benchmark QMF against implicit on the same BM25-weighted input matrix
    and print the elapsed times plus their ratio."""
    plays = bm25_weight(scipy.io.mmread(args.inputfile))

    qmf_elapsed = benchmark_qmf(args.qmfpath, plays, args.factors,
                                args.regularization, args.iterations)
    implicit_elapsed = benchmark_implicit(plays, args.factors,
                                          args.regularization, args.iterations)

    print("QMF finished in", qmf_elapsed)
    print("Implicit finished in", implicit_elapsed)
    print("Implicit is %s times faster" % (qmf_elapsed / implicit_elapsed))
def __init__(self, data, weighting=True): # your_code. Это не обязательная часть. Но если вам удобно что-либо посчитать тут - можно это сделать self.user_item_matrix = self.prepare_matrix(data) # pd.DataFrame self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix) if weighting: self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T self.model = self.fit(self.user_item_matrix) self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
def __init__(self, data:pd.DataFrame, weighting:bool=True): self.user_item_matrix = self.prepare_matrix(data) self.sparse_user_item = csr_matrix(self.user_item_matrix) self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix) if weighting: self.bm25_user_item_matrix = bm25_weight(self.sparse_user_item.T).T # csr-matrix self.model = self.fit(self.bm25_user_item_matrix) self.own_recommender = self.fit_own_recommender(self.bm25_user_item_matrix)
def fit(self, X, y=None): self._reset() self.item_info = X.groupby('item_id').agg({ 'price': 'max', 'SUB_COMMODITY_DESC': 'first' }) self.user_history = pd.DataFrame( X.groupby('user_id').item_id.unique().rename('history')) self.top_purchases = X.groupby(['user_id', 'item_id' ])['quantity'].count().reset_index() self.top_purchases.sort_values('quantity', ascending=False, inplace=True) self.top_purchases = self.top_purchases[ self.top_purchases['item_id'] != self.filter_item_id] # Топ покупок по всему датасету self.overall_top_purchases = X.groupby( 'item_id')['quantity'].count().reset_index() self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True) self.overall_top_purchases = self.overall_top_purchases[ self.overall_top_purchases['item_id'] != self.filter_item_id] self.overall_top_purchases = self.overall_top_purchases.item_id.tolist( ) self.user_item_matrix = self._prepare_matrix(X, self.matrix_values, self.matrix_aggfunc) self.id_to_itemid, self.id_to_userid, \ self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix) if self.weighting: self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T self.model = AlternatingLeastSquares( factors=self.factors, regularization=self.regularization, iterations=self.iterations, dtype=np.float32, use_native=self.use_native, use_gpu=self.use_gpu, ) self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr()) self.model_own_recommender = ItemItemRecommender(K=1) self.model_own_recommender.fit( csr_matrix(self.user_item_matrix).T.tocsr()) self._fit = True
def calculate_similar_artists(output_filename, model_name="als"):
    """Generate a list of similar artists in last.fm by utilizing the
    'similar_items' API of the models; output is a TSV of
    artistid, otherartistid, score."""
    artists, users, plays = get_lastfm()

    model = get_model(model_name)

    if issubclass(model.__class__, AlternatingLeastSquares):
        # weight the last.fm input for ALS models by BM25
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        # only similar_items is used, so don't build the approximate
        # recommend index
        model.approximate_recommend = False

    plays = plays.tocsr()  # disturbingly expensive — done exactly once

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    start = time.time()
    logging.debug("calculating top artists")
    # users per artist == CSR row lengths; emit popular artists first
    popularity = np.ediff1d(plays.indptr)
    queue = sorted(np.arange(len(artists)), key=lambda a: -popularity[a])

    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(queue)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for aid in queue:
                current = artists[aid]
                for other, score in model.similar_items(aid, 11):
                    o.write("%s\t%s\t%s\n" % (current, artists[other], score))
                progress.update(1)
    logging.debug("generated similar artists in %0.2fs", time.time() - start)
def calculate_recommendations(output_filename, model_name="als"):
    """Generate artist recommendations for every user in the last.fm dataset
    and write them out as a TSV of username, artist, score."""
    artists, users, plays = get_lastfm()

    model = get_model(model_name)

    if issubclass(model.__class__, AlternatingLeastSquares):
        # ALS-based models train better on BM25-weighted input
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        # only recommend() is used below, so skip building the
        # approximate similar-items index
        model.approximate_similar_items = False

    plays = plays.tocsr()  # disturbingly expensive — done exactly once

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # write recommendations for each user to the output file
    start = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for uid, username in enumerate(users):
                for aid, score in model.recommend(uid, user_plays):
                    o.write("%s\t%s\t%s\n" % (username, artists[aid], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)
                                  data['user'].cat.codes.copy())))
    # NOTE(review): the start of read_data() — and the matrix constructor
    # these closing parens belong to — is outside this view; left untouched.
    return data, plays


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generates file for ann-benchmarks",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--factors', type=int, dest='factors',
                        help='# of factors to use', default=50)
    parser.add_argument('--input', type=str, dest='inputfile',
                        help='last.fm dataset file', required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG)

    # train a basic ALS model on the last.fm dataset
    data, plays = read_data(args.inputfile)
    plays = bm25_weight(plays, K1=100, B=0.8)
    model = implicit.als.AlternatingLeastSquares(factors=args.factors, regularization=0.8)
    model.fit(plays)

    # transform the item factors into a form appropriate for inner-product search
    training_data = implicit.approximate_als.augment_inner_product_matrix(model.item_factors)

    # generate queries from the user factors, setting the extra dimension to 0
    queries = numpy.append(model.user_factors,
                           numpy.zeros((model.user_factors.shape[0], 1)), axis=1)

    # dump out data in the format ann-benchmarks expects; the first 10k query
    # rows double as the test set
    filename = "lastfm%s-10000--1-3.npz" % args.factors
    numpy.savez(filename, train=training_data[1], test=queries[:10000], queries=queries)
# NOTE(review): `parser` is created earlier in the file, outside this view.
parser.add_argument('--input', type=str, required=True,
                    dest='inputfile', help='dataset file in matrix market format')
parser.add_argument('--graph', help='generates graphs', action="store_true")
parser.add_argument('--loss', help='test training loss', action="store_true")
parser.add_argument('--speed', help='test training speed', action="store_true")
args = parser.parse_args()

if not (args.speed or args.loss):
    # nothing to benchmark: at least one mode flag is required
    print("must specify at least one of --speed or --loss")
    parser.print_help()
else:
    # BM25-weight the matrix-market input and convert to CSR once up front
    plays = bm25_weight(scipy.io.mmread(args.inputfile)).tocsr()
    logging.basicConfig(level=logging.DEBUG)

    if args.loss:
        acc = benchmark_accuracy(plays)
        json.dump(acc, open("als_accuracy.json", "w"))
        if args.graph:
            generate_loss_graph(acc, "als_accuracy.png")

    if args.speed:
        speed = benchmark_times(plays)
        json.dump(speed, open("als_speed.json", "w"))
        if args.graph:
            generate_speed_graph(speed, "als_speed.png")