コード例 #1
0
    def make_data(self, agg_column, filtr=None, full=False):
        """Build user-item interaction matrices and cache them in
        ``self.user_item_matrix``.

        Stores the weighted and binarized user-item CSR matrices plus four
        item-user (transposed) variants — tf-idf and BM25 weighting, each on
        the weighted and the binarized data — together with the call
        parameters and a ``status`` flag.

        Returns the populated ``self.user_item_matrix`` dict.
        """
        self.full = full
        uim = self.prepare_matrix(agg_column=agg_column, full=full,
                                  filtr=filtr)
        # Keep a weighted copy before binarizing the original in place.
        weighted = uim.copy()
        self.user_item_matrix['uim_matrix_w'] = csr_matrix(weighted).tocsr()
        uim[uim > 0] = 1
        self.user_item_matrix['uim_matrix'] = csr_matrix(uim).tocsr()

        # Item-user (transposed) variants with tf-idf and BM25 weighting.
        self.user_item_matrix.update({
            'ium_matrix_w_tfidf': tfidf_weight(csr_matrix(weighted.T).tocsr()),
            'ium_matrix_tfidf': tfidf_weight(csr_matrix(uim.T).tocsr()),
            'ium_matrix_w_bm25': bm25_weight(csr_matrix(weighted.T).tocsr()),
            'ium_matrix_bm25': bm25_weight(csr_matrix(uim.T).tocsr()),
        })

        self.user_item_matrix['status'] = True
        self.user_item_matrix['params'] = {
            'agg_column': agg_column,
            'filtr': filtr,
            'full': full,
        }
        return self.user_item_matrix
コード例 #2
0
ファイル: recommenders.py プロジェクト: SergeAA/rs
    def __init__(self, data, values='weight', aggfunc='count', weighting='bm25'):
        """Build per-user and overall item tops, the user-item matrix with
        optional tf-idf / BM25 weighting, and the id <-> index mappings.
        """
        # Interactions excluding the placeholder item.
        real = data[data['item_id'] != FAKE_ITEM]

        # Per-user item aggregation, strongest interactions first.
        self.users_top = real.groupby(['user_id', 'item_id']).agg(
            {values: aggfunc}).reset_index()
        self.users_top.sort_values(values, ascending=False, inplace=True)

        # Overall item popularity as a plain ordered list of item ids.
        overall = real.groupby('item_id').agg({values: aggfunc}).reset_index()
        self.top = overall.sort_values(values, ascending=False).item_id.tolist()

        # Dense user x item pivot (zeros for missing interactions).
        user_item_matrix = pd.pivot_table(data, index='user_id', columns='item_id',
                                          values=values, aggfunc=aggfunc,
                                          fill_value=0).astype(float)

        self.userids = user_item_matrix.index.values
        self.itemids = user_item_matrix.columns.values
        user_idx = np.arange(len(self.userids))
        item_idx = np.arange(len(self.itemids))

        # Bidirectional raw-id <-> matrix-index mappings.
        self.id_to_itemid = dict(zip(item_idx, self.itemids))
        self.id_to_userid = dict(zip(user_idx, self.userids))
        self.itemid_to_id = dict(zip(self.itemids, item_idx))
        self.userid_to_id = dict(zip(self.userids, user_idx))

        self.FAKE_ITEM_ID = self.itemid_to_id[FAKE_ITEM]
        self.__userTop = {}

        # Weighting operates item-wise, hence the double transpose.
        if weighting == 'tfidf':
            self.user_item_matrix = tfidf_weight(user_item_matrix.T).T
        elif weighting == 'bm25':
            self.user_item_matrix = bm25_weight(user_item_matrix.T).T
        else:
            self.user_item_matrix = user_item_matrix

        self.csr_matrix = csr_matrix(self.user_item_matrix).T.tocsr()
コード例 #3
0
ファイル: movielens.py プロジェクト: yang0110/implicit
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Write similar movies for every MovieLens title to `output_filename`.

    Output is one tab-separated (title, similar_title, score) line per
    neighbour. `model_name` selects the recommender ("als", "bpr", "lmf",
    "tfidf", "cosine", "bm25"); any other value raises NotImplementedError.
    Ratings below `min_rating` are dropped and the rest binarized, treating
    the data as implicit feedback.
    """
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # number of stored ratings per movie (row) -> most-rated movies first
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4, meaning we've filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
コード例 #4
0
    def __init__(self, data, weighting=True):
        """Build per-user and overall purchase tops, the user-item matrix
        (optionally BM25-weighted), and fit the main and BM25 recommenders.
        """
        # Per-user purchase counts, most purchased first; 999999 is the
        # dummy item id and is excluded.
        user_top = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        user_top.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = user_top[user_top['item_id'] != 999999]

        # Dataset-wide purchase counts, kept as an ordered item-id list.
        overall = data.groupby('item_id')['quantity'].count().reset_index()
        overall.sort_values('quantity', ascending=False, inplace=True)
        overall = overall[overall['item_id'] != 999999]
        self.overall_top_purchases = overall.item_id.tolist()

        # user-item matrix and the id <-> index dictionaries
        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(
            self.user_item_matrix)

        # BM25 weighting runs item-wise, hence the double transpose.
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        # Fit the models on the (possibly weighted) matrix.
        self.model = self.fit(self.user_item_matrix)
        self.bm25_recommender = self.fit_bm25_recommender(self.user_item_matrix)
コード例 #5
0
ファイル: lastfm.py プロジェクト: whikwon/implicit
def calculate_recommendations(input_filename,
                              output_filename,
                              model_name="als"):
    """ Generates artist recommendations for each user in the dataset """
    # train the model based off input params
    df, plays = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # disable building the approximate similar-items index (not needed
        # for recommendation output)
        model.approximate_similar_items = False

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name,
                  time.time() - start)

    # generate recommendations for each user and write out to a file
    # as tab-separated (username, artist, score) lines
    artists = dict(enumerate(df['artist'].cat.categories))
    start = time.time()
    # presumably `plays` is artist x user, so the transpose gives the
    # user x artist matrix recommend() expects — confirm against read_data
    user_plays = plays.T.tocsr()
    with open(output_filename, "w") as o:
        for userid, username in enumerate(df['user'].cat.categories):
            for artistid, score in model.recommend(userid, user_plays):
                o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
    logging.debug("generated recommendations in %0.2fs", time.time() - start)
コード例 #6
0
	def __init__(self, data, item_features, weighting=True, n_factors=50):
		"""Build purchase tops, the user-item matrix, id mappings and the
		private-label flag dict, then fit the ALS model, the own recommender
		and the embedding DataFrames.
		"""
		# Top purchases of each user (dummy item 999999 excluded)
		self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
		self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
		self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

		# Top purchases over the whole dataset, kept as an ordered item list
		self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
		self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
		self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
		self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

		self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
		self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

		# Dict {item_id: 0/1}; 1 marks the item as private-label ("Private" brand)
		self.item_id_to_ctm = dict(zip(item_features["item_id"], item_features["brand"] == "Private"))

		# BM25 weighting runs item-wise, hence the double transpose
		if weighting:
			self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

		self.n_factors = n_factors
		self.model = self.fit(self.user_item_matrix, n_factors=n_factors)
		# NOTE(review): the original comment claimed the own recommender is
		# trained BEFORE the matrix is weighted, but here it is fitted after
		# bm25_weight has already been applied — confirm which is intended.
		self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

		# NOTE(review): passing `self` as an explicit argument means
		# prepare_embeddings receives the instance twice — verify its signature.
		self.items_emb_df, self.users_emb_df = self.prepare_embeddings(self)
コード例 #7
0
    def __init__(self, data, data_product, weighting=True):
        """Build purchase tops, the user-item matrix, id mappings and the
        private-label dict; the own recommender is fitted on the raw matrix,
        the main model on the (optionally BM25-weighted) matrix.
        """
        # Top purchases of each user (dummy item 999999 excluded)
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset, kept as an ordered item list
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid,self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

        # Dict {item_id: 0/1}; 1 marks the item as a private-label product
        self.item_id_to_ctm = self.prepare_ctm(data_product)

        # The own recommender is trained BEFORE the matrix is weighted
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        # NOTE(review): `self.fit(self)` passes the instance rather than the
        # matrix, unlike sibling classes that call
        # `self.fit(self.user_item_matrix)` — verify the signature of `fit`.
        self.model = self.fit(self)
コード例 #8
0
    def __init__(self,
                 data,
                 top_popular,
                 item_features,
                 item_mean_cost,
                 popular_exp_item,
                 weighting=True):
        """Store caller-precomputed lookups, build the user-item matrix and
        fit the ALS model, the own recommender and the top-200
        recommendations for all users.
        """
        # Precomputed auxiliary data supplied by the caller.
        self.top_popular = top_popular
        self.item_features = item_features
        self.item_mean_cost = item_mean_cost
        self.popular_exp_item = popular_exp_item

        # user-item matrix and the id <-> index dictionaries
        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(
            self.user_item_matrix)

        # BM25 with tuned parameters; the transpose runs weighting item-wise.
        if weighting:
            self.user_item_matrix = bm25_weight(
                self.user_item_matrix.T, K1=12, B=0.165).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
        self.all_recommendations = self.get_all_recommendations(
            self.model, N=200)
コード例 #9
0
    def __init__(self, data, weighting='tfidf'):
        """Build purchase tops, the user-item matrix and id mappings; weight
        the matrix (tf-idf by default, BM25 otherwise) and fit the ALS model
        and the own recommender, logging progress to stdout.
        """
        print('Preparing tops...')
        # Per-user purchase counts, most purchased first (dummy 999999 dropped).
        user_top = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        user_top.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = user_top[user_top['item_id'] != 999999]

        # Dataset-wide purchase counts, kept as an ordered item-id list.
        overall = data.groupby('item_id')['quantity'].count().reset_index()
        overall.sort_values('quantity', ascending=False, inplace=True)
        overall = overall[overall['item_id'] != 999999]
        self.overall_top_purchases = overall.item_id.tolist()

        print('Preparing matrix...')
        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        print('Preparing dicts...')
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self._prepare_dicts(
            self.user_item_matrix)

        print('Weighting...')
        # Weighting runs item-wise, hence the double transpose.
        weigh = tfidf_weight if weighting == 'tfidf' else bm25_weight
        self.user_item_matrix = weigh(self.user_item_matrix.T).T

        print('Fitting als...')
        self.model = self.fit(self.user_item_matrix)
        print('Fitting own recommender...')
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
        print('Complete.')
コード例 #10
0
    def __init__(self, data, values='quantity', weighting=True):
        """Build the overall top-items list and the user-item matrix (from
        the `values` column), optionally BM25-weight it, and fit the ALS and
        item-item models.
        """
        # Dataset-wide purchase counts, most purchased first; 999999 is the
        # dummy item id and is excluded.
        overall = data.groupby('item_id')['quantity'].count().reset_index()
        overall.sort_values('quantity', ascending=False, inplace=True)
        overall = overall[overall['item_id'] != 999999]
        self.overall_top_purchases = overall.item_id.tolist()

        # user-item matrix built from the requested values column
        self.user_item_matrix = self._prepare_matrix(data, values)  # pd.DataFrame
        # auxiliary id <-> index dictionaries
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self._prepare_dicts(
            self.user_item_matrix)

        # BM25 weighting runs item-wise, hence the double transpose.
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        # ALS model
        self.model = self.fit(self.user_item_matrix)
        # ItemItemRecommender model
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
コード例 #11
0
    def setup_model(self,
                    k1=1.2,
                    b=0.75,
                    factors=64,
                    regularization=0.001,
                    use_native=True,
                    use_cg=True):
        """Build the BM25-weighted content x person CSR matrix from
        ``self.data`` and create an (unfitted) ALS model.

        `k1` and `b` are the BM25 parameters; the remaining arguments are
        forwarded to AlternatingLeastSquares.
        """
        # COO format is space-efficient for assembling the sparse matrix.
        strengths = self.data['eventStrength'].astype(float)
        coords = (self.data['contentId'], self.data['personId'])
        content_person = sparse.coo_matrix((strengths, coords))
        content_person = bm25_weight(content_person, K1=k1, B=b)

        # CSR supports the row indexing the model needs.
        self.sparse_matrix = content_person.tocsr()

        self.model = implicit.als.AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            use_native=use_native,
            use_cg=use_cg)
コード例 #12
0
    def __init__(self, data, weighting=True):
        """Build per-user and overall purchase tops, the user-item matrix
        (optionally BM25-weighted), id mappings, and fit the ALS model and
        the own recommender.
        """
        # Per-user purchase counts, most purchased first; 999999 is the
        # dummy item id and is excluded.
        user_top = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        user_top.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = user_top[user_top['item_id'] != 999999]

        # Dataset-wide purchase counts, kept as an ordered item-id list.
        overall = data.groupby('item_id')['quantity'].count().reset_index()
        overall.sort_values('quantity', ascending=False, inplace=True)
        overall = overall[overall['item_id'] != 999999]
        self.overall_top_purchases = overall.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self._prepare_dicts(
            self.user_item_matrix)

        # BM25 weighting runs item-wise, hence the double transpose.
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
コード例 #13
0
    def __init__(self, data, weighted=True):
        """Build purchase tops, the user-item matrix and id mappings.

        The own recommender is fitted on the raw matrix BEFORE the optional
        BM25 weighting; the main model is created lazily (`_model` starts
        as None).
        """
        # Dummy item id excluded from all tops
        self.FILTER_ID = FILTER_ID

        # Per-user purchase counts, most purchased first
        self.top_purchases = data.groupby(
            ['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity',
                                       ascending=False,
                                       inplace=True)
        self.top_purchases = self.top_purchases[
            self.top_purchases['item_id'] != self.FILTER_ID]

        # Dataset-wide purchase counts, kept as an ordered item-id list
        self.overall_top_purchases = data.groupby(
            'item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity',
                                               ascending=False,
                                               inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != self.FILTER_ID]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
        )

        self._user_item_matrix = self.prepare_matrix(data)

        # NOTE(review): the code below reads `self.user_item_matrix` (no
        # underscore) — presumably a property exposing `_user_item_matrix`
        # defined elsewhere in the class; confirm it exists.
        (self.id_to_item_id, self.id_to_user_id, self.item_id_to_id,
         self.user_id_to_id) = self.prepare_dicts(self.user_item_matrix)

        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        # BM25 weighting runs item-wise, hence the double transpose
        if weighted:
            self._user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        # Lazily-initialized model state
        self._model = None
        self._user_factors = None
        self._item_factors = None
コード例 #14
0
def train_als(train_df, test_df, min_rating=4.0):
    """Train an implicit ALS model on the ratings in `train_df`.

    Ratings below `min_rating` are dropped and the rest binarized (implicit
    feedback), then BM25-weighted and scaled by 5. Returns
    (model, users, items, ratings) where `ratings` is the item x user CSR
    matrix used for fitting.

    NOTE(review): `test_df` is accepted but never used here — confirm
    whether evaluation on the test split was intended.
    NOTE: mutates `train_df` in place (category dtype conversion).
    """
    # map each user/item to a unique numeric value
    train_df['user_id'] = train_df['user_id'].astype("category")
    train_df['item_id'] = train_df['item_id'].astype("category")

    # item x user sparse matrix of raw ratings
    ratings_csr = coo_matrix((train_df['rating'].astype(np.float32),
                              (train_df['item_id'].cat.codes.copy(),
                               train_df['user_id'].cat.codes.copy()))).tocsr()

    items = np.array(train_df['item_id'].cat.categories)
    users = np.array(train_df['user_id'].cat.categories)
    ratings = ratings_csr

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    model = AlternatingLeastSquares()
    # lets weight these models by bm25weight.
    ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    # train the model
    start = time.time()
    model.fit(ratings)
    print("Training time: {}".format(time.time() - start))
    return model, users, items, ratings
コード例 #15
0
    def fit_trainset(self, raw_train_dataset):
        """Encode the raw train set and build BM25-weighted sparse matrices.

        Maps raw user/item ids to contiguous indices, then builds
        ``self.item_users`` (items x users) and ``self.user_items``
        (users x items) CSR matrices, BM25-weighted (B=0.7) and scaled by 5.

        The input dataset is deep-copied and left unmodified.
        """
        trainset = copy.deepcopy(raw_train_dataset)

        # Map raw user/item ids to contiguous integer indices.
        self.mapping_dict, self.inv_mapping_dict = fit_coder(
            trainset, 'user', 'item', 'rating')
        self.mapped_trainset = code(copy.deepcopy(trainset), 'user', 'item',
                                    'rating', self.mapping_dict)

        self.max_index_of_item = len(self.mapped_trainset.item.unique())
        self.max_index_of_user = len(self.mapped_trainset.user.unique())

        row = self.mapped_trainset.item.values
        col = self.mapped_trainset.user.values
        data = self.mapped_trainset.rating.values

        self.item_users = csr_matrix(
            (data, (row, col)),
            shape=(self.max_index_of_item, self.max_index_of_user))
        # BM25-weight on the users x items orientation, scale by 5, and keep
        # both orientations in sync. (A leftover redundant reassignment of
        # user_items after an abandoned experiment was removed here.)
        self.user_items = bm25_weight(self.item_users.T.tocsr(),
                                      B=0.7).tocsr() * 5
        self.item_users = self.user_items.T.tocsr()
コード例 #16
0
def dump_factors():
    """Fit ALS factors on the in-memory `purchases` mapping and dump the
    item/user factor matrices and id mappings to CSV files.

    Reads `numfactors` from the request query string. Returns 'OK\n'.
    """
    numfactors = int(request.args['numfactors'].strip())
    model = AlternatingLeastSquares(factors=numfactors, dtype=np.float32, use_gpu=False, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False

    # Flatten the nested {userid: {productid: count}} mapping into columns.
    data = {'userid': [], 'productid': [], 'purchase_count': []}
    for userid in purchases:
        for productid in purchases[userid]:
            data['userid'].append(userid)
            data['productid'].append(productid)
            data['purchase_count'].append(purchases[userid][productid])
    df = pd.DataFrame(data)
    df['userid'] = df['userid'].astype("category")
    df['productid'] = df['productid'].astype("category")

    # Raw id <-> matrix index mappings.
    userids = list(df['userid'].cat.categories)
    userids_reverse = dict(zip(userids, list(range(len(userids)))))
    productids = list(df['productid'].cat.categories)
    productids_reverse = dict(zip(productids, list(range(len(productids)))))

    # product x user matrix, BM25-weighted before fitting.
    purchases_matrix = coo_matrix((df['purchase_count'].astype(np.float32),
                                   (df['productid'].cat.codes.copy(),
                                    df['userid'].cat.codes.copy())))
    print("Matrix shape: %s, max value: %.2f" % (np.shape(purchases_matrix), np.max(purchases_matrix)))
    purchases_matrix = bm25_weight(purchases_matrix, K1=2.0, B=0.25)
    # (A dead `purchases_matrix_T` local that was never used was removed.)
    purchases_matrix = purchases_matrix.tocsr() # to support indexing in recommend/similar_items functions
    model.fit(purchases_matrix)

    np.savetxt('item_factors.csv', model.item_factors, delimiter=',')
    np.savetxt('user_factors.csv', model.user_factors, delimiter=',')
    # NOTE(review): `recommendation.sub(r',', ' ', ...)` strips commas from
    # names before writing CSV — presumably `recommendation` is a compiled
    # regex or the `re` module under an alias; confirm.
    with open('item_ids.csv', 'w') as f:
        for pid in productids_reverse:
            f.write("%s,%d,%s\n" % (pid, productids_reverse[pid], recommendation.sub(r',', ' ', productnames[pid])))
    with open('user_ids.csv', 'w') as f:
        for uid in userids_reverse:
            f.write("%s,%d,%s\n" % (uid, userids_reverse[uid], recommendation.sub(r',', ' ', usernames[uid])))
    return 'OK\n'
コード例 #17
0
    def __init__(self, data, item_features, weighting=True):
        """Prefilter the data, build the user-item matrix and id mappings,
        fit the ALS model and the own recommender, and pre-group per-user
        purchase popularity for later recommendation calls.
        """
        # Optional pre-filtering step (item selection, dummy item, etc.).
        data = self.prefilter_items(data, item_features)

        # user-item matrix and the id <-> index dictionaries
        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        (self.id_to_itemid, self.id_to_userid,
         self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(
            self.user_item_matrix)

        # BM25 weighting runs item-wise, hence the double transpose.
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        # Purchase counts per (user, item), computed once here instead of on
        # every get_similar_items_recommendation call.
        counts = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        # "Popularity" by number of purchases, most purchased first.
        counts.sort_values('quantity', ascending=False, inplace=True)
        # 999999 is the dummy item used for filtered-out products.
        counts = counts[counts['item_id'] != 999999]
        # Pre-group by user for fast per-user lookups.
        self.popularity = counts.groupby('user_id')
コード例 #18
0
ファイル: preprocess.py プロジェクト: qxmd/ImplicitMF
def normalize_X(X, norm_type):
    """
    Normalizes the X matrix using either tfidf or bm25.
    Wrapper for tfidf_weight and bm25_weight functions from
    the :mod:`implicit:implicit.nearest_neighbours` module.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        sparse matrix of shape (n_users, n_collections)
    norm_type : str
        can be either "bm25" or "tfidf"

    Returns
    -------
    scipy.sparse.csr_matrix
        Normalized sparse csr matrix

    Raises
    ------
    ValueError
        If `norm_type` is not "bm25" or "tfidf".

    References
    ----------
    .. [1] bm25 and tfidf explanation: https://www.benfrederickson.com/distance-metrics/
    .. [2] https://github.com/benfred/implicit/blob/master/implicit/evaluation.pyx
    """
    _sparse_checker(X, '`X`')
    if norm_type == "bm25":
        X = bm25_weight(X, K1=100, B=0.8)
    elif norm_type == "tfidf":
        X = tfidf_weight(X)
    else:
        # Include the offending value to make the failure self-explanatory.
        raise ValueError(
            "Unknown `norm_type` parameter: %r (expected 'bm25' or 'tfidf')"
            % (norm_type,))
    return X.tocsr()
コード例 #19
0
ファイル: movielens.py プロジェクト: bananemure/implicit
def calculate_similar_movies(output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    """Write similar movies for every MovieLens title to `output_filename`.

    One tab-separated (title, similar_title, score) line per neighbour.
    `model_name` selects the recommender ("als", "bpr", "tfidf", "cosine",
    "bm25"); any other value raises NotImplementedError. Ratings below
    `min_rating` are dropped and the rest binarized (implicit feedback).
    """
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings,  B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # number of stored ratings per movie (row) -> most-rated movies first
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4, meaning we've filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
コード例 #20
0
ファイル: movielens.py プロジェクト: viparitakarani/implicit
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    """Write similar movies for every title to `output_filename`.

    One tab-separated (title, similar_title, score) line per neighbour,
    using the recommender selected by `model_name` ("als", "bpr", "tfidf",
    "cosine", "bm25"); any other value raises NotImplementedError.
    """
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    # rating counts per movieId, used to order output most-rated first
    user_count = ratings.groupby('movieId').size()
    # (the `m` bound inside the generator is local to it and does not
    # clobber the matrix `m`)
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
            # no ratings > 4, meaning we've filtered out all data for it)
            # NOTE(review): raw movieId values are used directly as row
            # indices into `m` — assumes read_data aligned matrix rows with
            # movieId; confirm.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
コード例 #21
0
 def _set_implib_train_mat(self, train_mat):
     """Store `train_mat` in implicit ALS's items x users orientation,
     optionally BM25-weighted, and scale model regularization by nnz."""
     # implib ALS expects matrix in items x users format
     mat = train_mat.T
     if self.fit_params['use_bm25']:
         mat = bm25_weight(mat,
                           K1=self.fit_params['bm25_k1'],
                           B=self.fit_params['bm25_b'])
     self.implib_train_mat = mat
     # Effective regularization grows with the number of stored interactions.
     self.model.regularization = (
         self.fit_params['regularization'] * self.implib_train_mat.nnz)
コード例 #22
0
 def __init__(self, data, weighting=True):
     """Build the user-item matrix and id mappings, optionally BM25-weight
     the matrix, then fit the main model and the own recommender."""
     self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
     (self.id_to_itemid, self.id_to_userid,
      self.itemid_to_id, self.userid_to_id) = self.prepare_dicts(
         self.user_item_matrix)

     # BM25 weighting runs item-wise, hence the double transpose.
     if weighting:
         self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

     self.model = self.fit(self.user_item_matrix)
     self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
コード例 #23
0
def experiment(B, K1, conf, variant='20m', min_rating=3.0):
    """Evaluate BM25-weighted ALS on MovieLens with randomly held-out cells.

    Binarizes the ratings (rating >= ``min_rating`` -> 1), zeroes out 100k
    random (movie, user) cells for training, fits ALS, then thresholds the
    predicted scores at ``conf`` and prints a CSV line with
    tp/fp/fn/precision/recall. ``B == "NA"`` disables BM25 reweighting.
    """
    # load ratings and convert to a binary implicit-preference matrix
    _, ratings = get_movielens(variant)
    ratings = ratings.tocsr()
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    # copy, then blank out a random sample of cells (some of which may
    # already have been zero) to use as evaluation targets
    training = ratings.tolil()
    movieids = np.random.randint(low=0, high=np.shape(ratings)[0], size=100000)
    userids = np.random.randint(low=0, high=np.shape(ratings)[1], size=100000)
    training[movieids, userids] = 0

    model = FaissAlternatingLeastSquares(factors=128, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False
    model.show_progress = False

    if B != "NA":  # "NA" means: train on the unweighted matrix
        training = bm25_weight(training, B=B, K1=K1).tocsr()

    model.fit(training)

    # predicted score for each sampled (movie, user) pair
    moviescores = np.einsum('ij,ij->i',
                            model.item_factors[movieids],
                            model.user_factors[userids])
    preds = moviescores >= conf
    true_ratings = np.ravel(ratings[movieids, userids])

    tp = true_ratings[preds].sum()  # predicted True and user rated
    fp = preds.sum() - tp           # predicted True, user did not rate
    fn = true_ratings.sum() - tp    # predicted False, user did rate
    prec = float(tp) / float(tp + fp) if tp + fp else float('nan')
    recall = float(tp) / float(tp + fn) if tp + fn else float('nan')

    if B != "NA":
        print("%.2f,%.2f,%.2f,%d,%d,%d,%.2f,%.2f"
              % (B, K1, conf, tp, fp, fn, prec, recall))
    else:
        print("NA,NA,%.2f,%d,%d,%d,%.2f,%.2f"
              % (conf, tp, fp, fn, prec, recall))
コード例 #24
0
def calculate_similar_businesses(input_filename,
                                 output_filename,
                                 model_name="als",
                                 factors=50,
                                 regularization=0.01,
                                 iterations=15,
                                 exact=False,
                                 trees=20,
                                 use_native=True,
                                 dtype=numpy.float64,
                                 cg=False):
    """Train an ALS model on business ratings and write the 11 most similar
    businesses per business as a TSV of (business, other, score).

    ``exact`` selects brute-force ALS; otherwise an Annoy-backed approximate
    model is used. ``trees`` and ``iterations`` are accepted for API
    compatibility but unused in this body.
    """
    logging.debug("Calculating similar businesses. This might take a while")

    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, ratings = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    als_kwargs = dict(factors=factors,
                      regularization=regularization,
                      use_native=use_native,
                      use_cg=cg,
                      dtype=dtype)
    model_cls = AlternatingLeastSquares if exact else AnnoyAlternatingLeastSquares
    model = model_cls(**als_kwargs)

    # ALS trains better on BM25-weighted confidences
    logging.debug("weighting matrix by bm25_weight")
    ratings = bm25_weight(ratings, K1=100, B=0.8)

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # write out similar businesses, most-reviewed first
    logging.debug("calculating top businesses")
    user_count = df.groupby('business').size()
    businesses = dict(enumerate(df['business'].cat.categories))
    # NOTE(review): assumes user_count is indexed by the same ids as the
    # `businesses` keys — confirm against read_data
    to_generate = sorted(list(businesses), key=lambda x: -user_count[x])

    # TSV of businessid, otherbusinessid, score
    with open(output_filename, "w") as o:
        for businessid in to_generate:
            business = businesses[businessid]
            for other, score in model.similar_items(businessid, 11):
                o.write("%s\t%s\t%s\n" % (business, businesses[other], score))
コード例 #25
0
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """Train a similarity model on MovieLens data and write similar movies.

    Output is a TSV of (movie title, other movie title, score) with the 11
    nearest items per movie, processed in descending rating-count order.

    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: model to use ("als", "tfidf", "cosine" or "bm25")
    :param min_rating: rating threshold used when filtering the data
    :return: None
    :raises NotImplementedError: for an unknown ``model_name``
    """
    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()

        # ALS trains better on BM25-weighted confidences
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender()

    else:
        # fixed typo in the error message ("TODU" -> "TODO")
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("Training model :%s" % model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = rating_data.groupby("movieId").size()
    # renamed the loop variable so it no longer shadows the matrix `m`
    movie_lookup = dict((mid, title) for mid, title in
                        zip(movies_data['movieId'], movies_data['title']))
    to_generate = sorted(list(movies_data['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            # skip movies whose matrix row is empty (all ratings filtered out)
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]

            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
コード例 #26
0
def calculate_recommendations(input_filename, output_filename, model_name):
    """ Generates track_uri recommendations for each pid in the dataset """
    df, track_count = read_data(input_filename)

    model = get_model(model_name)

    # ALS-family models: train on BM25-weighted counts; the approximate
    # similar-items index is not needed when only recommending
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        track_count = bm25_weight(track_count, K1=100, B=0.8)
        model.approximate_similar_items = False

    track_count = track_count.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(track_count)
    logging.debug("trained model '%s' in %0.2fs", model_name,
                  time.time() - start)

    # submission header, then one recommendation line per qualifying playlist
    first_line = 'team_info,team_name,main,[email protected]'
    recs = ['']

    tracks = dict(enumerate(df['track_uri'].cat.categories))
    start = time.time()
    pid_track_counts = track_count.T.tocsr()  # playlists x tracks view

    with codecs.open(output_filename, "w") as o:
        o.write("%s \n" % (first_line))
        o.write("\n")
        for playlist_id, pid in enumerate(df['pid'].cat.categories):
            for track_id, score in model.recommend(playlist_id,
                                                   pid_track_counts,
                                                   N=500):
                recs.append(tracks[track_id])

            # only playlists with pid >= 1000000 are written out; the
            # leading '' in recs produces the "pid,track,track,..." shape
            if int(pid) >= 1000000:
                o.write("%s" % (pid))
                o.write(','.join(map(str, recs)))
                o.write("\n")
                o.write("\n")

            recs = ['']
    logging.debug("generated recommendations in %0.2fs", time.time() - start)
コード例 #27
0
ファイル: lastfmV2.py プロジェクト: mw0/MLnotebooks
def calculateSimilarArtists(output_filename, dataset, modelName="als"):
    """Write a TSV of the 11 most similar last.fm artists per artist.

    Looks up the loader for ``dataset`` in ``dataSets``, trains the model
    named by ``modelName``, and processes artists in descending listener
    count. Raises ValueError for an unknown dataset name.
    """
    print(f"getting dataset {dataset}")
    getdata = dataSets.get(dataset)
    if not getdata:
        raise ValueError(f"Unknown Model {dataset}")
    artists, users, plays = getdata()

    model = getModel(modelName)

    # ALS-family models: train on BM25-weighted play counts and skip
    # building the approximate recommend index (not needed here)
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        model.approximate_recommend = False

    plays = plays.tocsr()  # expensive conversion; do it once

    logging.debug("training model %s", modelName)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", modelName,
                  time.time() - start)

    start = time.time()
    logging.debug("calculating top artists")

    # listeners per artist = CSR row lengths
    user_count = np.ediff1d(plays.indptr)
    to_generate = sorted(np.arange(len(artists)),
                         key=lambda idx: -user_count[idx])

    # TSV of artistid, otherartistid, score
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in model.similar_items(artistid, 11):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
                progress.update(1)

    logging.debug("generated similar artists in %0.2fs", time.time() - start)
コード例 #28
0
ファイル: benchmark_qmf.py プロジェクト: bananemure/implicit
def run_benchmark(args):
    """Benchmark QMF against implicit ALS on the same BM25-weighted input."""
    plays = bm25_weight(scipy.io.mmread(args.inputfile))

    # both backends get identical hyperparameters
    common = (args.factors, args.regularization, args.iterations)
    qmf_time = benchmark_qmf(args.qmfpath, plays, *common)
    implicit_time = benchmark_implicit(plays, *common)

    print("QMF finished in", qmf_time)
    print("Implicit finished in", implicit_time)
    print("Implicit is %s times faster" % (qmf_time / implicit_time))
コード例 #29
0
ファイル: recommenders.py プロジェクト: hellge83/AI_rec_sys
 def __init__(self, data, weighting=True):
     """Prepare the user-item matrix and id mappings, then fit both models.

     Args:
         data: raw transactions consumed by ``prepare_matrix``.
         weighting: if True, apply BM25 weighting (via a double transpose,
             so the weighting is item-wise) before fitting.
     """
     self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
     mappings = self.prepare_dicts(self.user_item_matrix)
     (self.id_to_itemid, self.id_to_userid,
      self.itemid_to_id, self.userid_to_id) = mappings

     if weighting:
         self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

     self.model = self.fit(self.user_item_matrix)
     self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
コード例 #30
0
def run_benchmark(args):
    plays = bm25_weight(scipy.io.mmread(args.inputfile))

    qmf_time = benchmark_qmf(args.qmfpath, plays, args.factors,
                             args.regularization, args.iterations)

    implicit_time = benchmark_implicit(plays, args.factors,
                                       args.regularization, args.iterations)

    print("QMF finished in", qmf_time)
    print("Implicit finished in", implicit_time)
    print("Implicit is %s times faster" % (qmf_time / implicit_time))
コード例 #31
0
    def __init__(self, data:pd.DataFrame, weighting:bool=True):
        """Build user-item matrices and id mappings, then fit both models.

        Args:
            data: transactions DataFrame consumed by ``prepare_matrix``.
            weighting: when True, BM25-weight the matrix (item-wise, via a
                double transpose) before fitting.
        """
        self.user_item_matrix = self.prepare_matrix(data)
        self.sparse_user_item = csr_matrix(self.user_item_matrix)

        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

        # BUG FIX: self.bm25_user_item_matrix used to be assigned only when
        # weighting=True, so weighting=False crashed with AttributeError in
        # the fit() call below. Fall back to the unweighted matrix instead.
        if weighting:
            self.bm25_user_item_matrix = bm25_weight(self.sparse_user_item.T).T # csr-matrix
        else:
            self.bm25_user_item_matrix = self.sparse_user_item

        self.model = self.fit(self.bm25_user_item_matrix)

        self.own_recommender = self.fit_own_recommender(self.bm25_user_item_matrix)
コード例 #32
0
    def fit(self, X, y=None):
        """Fit the recommender on transaction data ``X``.

        Builds item metadata, per-user purchase history, per-user and overall
        top-purchase rankings, the user-item matrix (optionally BM25-weighted),
        an ALS model, and a K=1 item-item recommender, storing everything on
        ``self``.

        Args:
            X: transactions DataFrame; this body reads the columns
                ``user_id``, ``item_id``, ``quantity``, ``price`` and
                ``SUB_COMMODITY_DESC``.
            y: unused; kept for a scikit-learn-style ``fit(X, y)`` signature.
        """
        self._reset()
        # per-item metadata: max observed price and first description
        self.item_info = X.groupby('item_id').agg({
            'price': 'max',
            'SUB_COMMODITY_DESC': 'first'
        })
        # per-user list of unique purchased items
        self.user_history = pd.DataFrame(
            X.groupby('user_id').item_id.unique().rename('history'))

        # per-user purchase counts, most-purchased first, dropping rows whose
        # item_id equals self.filter_item_id
        self.top_purchases = X.groupby(['user_id', 'item_id'
                                        ])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity',
                                       ascending=False,
                                       inplace=True)
        self.top_purchases = self.top_purchases[
            self.top_purchases['item_id'] != self.filter_item_id]

        # top purchases over the whole dataset, same filtering, kept as a
        # plain list of item ids
        self.overall_top_purchases = X.groupby(
            'item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity',
                                               ascending=False,
                                               inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != self.filter_item_id]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist(
        )

        self.user_item_matrix = self._prepare_matrix(X, self.matrix_values,
                                                     self.matrix_aggfunc)

        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if self.weighting:
            # BM25-weight item-wise: transpose, weight, transpose back
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            dtype=np.float32,
            use_native=self.use_native,
            use_gpu=self.use_gpu,
        )

        # implicit expects items x users, hence the transpose before fitting
        self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        # K=1 nearest-neighbour item-item model (the "own recommender"),
        # trained on the same items x users matrix
        self.model_own_recommender = ItemItemRecommender(K=1)
        self.model_own_recommender.fit(
            csr_matrix(self.user_item_matrix).T.tocsr())

        self._fit = True
コード例 #33
0
ファイル: lastfm.py プロジェクト: bananemure/implicit
def calculate_similar_artists(output_filename, model_name="als"):
    """Write a TSV of the 11 most similar last.fm artists for every artist.

    Artists are processed in descending listener-count order, using the
    'similar_items' API of the chosen model.
    """
    artists, users, plays = get_lastfm()

    model = get_model(model_name)

    # ALS-family models train on BM25-weighted play counts; the approximate
    # recommend index is unnecessary for similar-items queries
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        model.approximate_recommend = False

    plays = plays.tocsr()  # expensive conversion, done once up front

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    start = time.time()
    logging.debug("calculating top artists")

    # listeners per artist = CSR row lengths
    user_count = np.ediff1d(plays.indptr)
    order = sorted(np.arange(len(artists)), key=lambda i: -user_count[i])

    # TSV of artistid, otherartistid, score
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(order)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for artistid in order:
                artist = artists[artistid]
                for other, score in model.similar_items(artistid, 11):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
                progress.update(1)

    logging.debug("generated similar artists in %0.2fs", time.time() - start)
コード例 #34
0
ファイル: lastfm.py プロジェクト: bananemure/implicit
def calculate_recommendations(output_filename, model_name="als"):
    """Write per-user artist recommendations for the last.fm dataset as TSV."""
    artists, users, plays = get_lastfm()

    model = get_model(model_name)

    # ALS-family models train on BM25-weighted play counts; the approximate
    # similar-items index is not needed when only recommending
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        model.approximate_similar_items = False

    plays = plays.tocsr()  # expensive conversion, done once

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # recommend from the transposed (users x artists) view of the matrix
    start = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for userid, username in enumerate(users):
                for artistid, score in model.recommend(userid, user_plays):
                    o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)
コード例 #35
0
                        data['user'].cat.codes.copy())))
    return data, plays


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generates file for ann-benchmarks",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--factors', type=int,
                        dest='factors', help='# of factors to use', default=50)
    parser.add_argument('--input', type=str,
                        dest='inputfile', help='last.fm dataset file', required=True)
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # fit a plain ALS model on BM25-weighted last.fm play counts
    data, plays = read_data(args.inputfile)
    plays = bm25_weight(plays, K1=100, B=0.8)
    model = implicit.als.AlternatingLeastSquares(factors=args.factors,
                                                 regularization=0.8)
    model.fit(plays)

    # augment the item factors so inner-product search can be used
    training_data = implicit.approximate_als.augment_inner_product_matrix(
        model.item_factors)

    # pad the user factors with a zero column to match the augmented dimension
    queries = numpy.append(model.user_factors,
                           numpy.zeros((model.user_factors.shape[0], 1)),
                           axis=1)

    # dump in the layout ann-benchmarks expects, plus an extra queries column
    out_name = "lastfm%s-10000--1-3.npz" % args.factors
    numpy.savez(out_name, train=training_data[1], test=queries[:10000],
                queries=queries)
コード例 #36
0
ファイル: benchmark_als.py プロジェクト: bananemure/implicit
    parser.add_argument('--input', type=str, required=True,
                        dest='inputfile', help='dataset file in matrix market format')
    parser.add_argument('--graph', help='generates graphs',
                        action="store_true")
    parser.add_argument('--loss', help='test training loss',
                        action="store_true")
    parser.add_argument('--speed', help='test training speed',
                        action="store_true")

    args = parser.parse_args()
    # at least one benchmark mode must be requested
    if not (args.speed or args.loss):
        print("must specify at least one of --speed or --loss")
        parser.print_help()

    else:
        # BM25-weight the matrix-market input once; both benchmarks share it
        plays = bm25_weight(scipy.io.mmread(args.inputfile)).tocsr()
        logging.basicConfig(level=logging.DEBUG)

        if args.loss:
            # accuracy benchmark -> JSON dump, optionally a loss graph
            acc = benchmark_accuracy(plays)
            json.dump(acc, open("als_accuracy.json", "w"))
            if args.graph:
                generate_loss_graph(acc, "als_accuracy.png")

        if args.speed:
            # timing benchmark -> JSON dump, optionally a speed graph
            speed = benchmark_times(plays)
            json.dump(speed, open("als_speed.json", "w"))
            if args.graph:
                generate_speed_graph(speed, "als_speed.png")