Example #1
class ALSRecommender(BaseRecommender):
    """
    implement alternating least squares algorithm implementation based on implicit library
    """
    def fit(self,
            train_df,
            col_user=cfg.USER_COL,
            col_item=cfg.ITEM_COL,
            col_rating=cfg.DEFAULT_RATING_COL,
            factors=100,
            confidence=5,
            regularization=0.1):
        """
        Trains implicit ALS recommender on train data
        :param train_df: pandas DataFrame with train data
        :param col_user: str column name for user
        :param col_item: str column name for item
        :param col_rating: str column name for ratings
        :param factors: int number of factors to use in ALS model
        :param confidence: int as described in implicit documentation
        :param regularization: float higher values mean stronger regularization
        :return: None
        """
        BaseRecommender.fit(self, train_df, col_user, col_item, col_rating)
        self.train_df[self.col_rating] = train_df[self.col_rating] * confidence
        self.uii_matrix = self.get_uii_matrix()
        self.als = AlternatingLeastSquares(factors=factors,
                                           use_gpu=False,
                                           regularization=regularization)
        self.als.fit(self.uii_matrix.T)

    def predict(self, test_df, k=cfg.DEFAULT_K):
        """
        recommend k items for each user in test_df
        :param test_df: pandas DataFrame with test_users and truth recommendations
        :param k: int number of items to recommend
        :return: pandas DataFrame with k recommendations for each user in test_df
        """
        test_users_indices = [
            self.users.index(user) for user in test_df[self.col_user].values
            if user in self.users
        ]

        prediction_records = []
        for user_idx in test_users_indices:
            doc = {
                self.col_user: self.users[user_idx],
                self.col_item: [
                    self.items[it[0]] for it in self.als.recommend(
                        user_idx,
                        self.uii_matrix,
                        k,
                        filter_already_liked_items=False)
                ]
            }
            prediction_records.append(doc)
        prediction = pd.DataFrame.from_records(prediction_records)

        return prediction
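
A minimal usage sketch for this class. It assumes BaseRecommender supplies the self.users/self.items index lists and the get_uii_matrix() method that fit and predict rely on; the frame below and its column names are hypothetical.

import pandas as pd

# hypothetical interactions; column names are passed explicitly instead of
# relying on the cfg defaults
train_df = pd.DataFrame({
    "user": [1, 1, 2, 2, 3],
    "item": [10, 20, 10, 30, 20],
    "rating": [1.0, 1.0, 1.0, 1.0, 1.0],
})
test_df = pd.DataFrame({"user": [1, 2, 3]})

rec = ALSRecommender()
rec.fit(train_df, col_user="user", col_item="item", col_rating="rating",
        factors=32, confidence=5, regularization=0.1)
top_k = rec.predict(test_df, k=5)  # one row per known test user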
Example #2
def train_als(train_df, test_df, min_rating=4.0):
    # map each user/item to a unique numeric value
    train_df['user_id'] = train_df['user_id'].astype("category")
    train_df['item_id'] = train_df['item_id'].astype("category")

    ratings_csr = coo_matrix((train_df['rating'].astype(np.float32),
                              (train_df['item_id'].cat.codes.copy(),
                               train_df['user_id'].cat.codes.copy()))).tocsr()

    items = np.array(train_df['item_id'].cat.categories)
    users = np.array(train_df['user_id'].cat.categories)
    ratings = ratings_csr

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    model = AlternatingLeastSquares()
    # let's weight the ratings matrix with bm25_weight
    ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    # train the model
    start = time.time()
    model.fit(ratings)
    print("Training time: {}".format(time.time() - start))
    return model, users, items, ratings
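
A hedged sketch of calling train_als on a toy frame; note that the test_df parameter is accepted but never used inside the function as written.

import pandas as pd

ratings_df = pd.DataFrame({
    "user_id": ["a", "a", "b", "c"],
    "item_id": ["x", "y", "x", "z"],
    "rating": [5.0, 4.0, 5.0, 3.0],
})
# the 3.0 rating falls below min_rating and is dropped before fitting
model, users, items, ratings = train_als(ratings_df, test_df=None, min_rating=4.0)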
Example #3
 def train_and_evaluate(self):
     self.model = AlternatingLeastSquares(factors=16, iterations=100)
     print('fitting model...')
     self.model.fit(self.train.transpose())
     print('model is ready!')
     self.evaluate()
Example #4
class ALS(Model):
    def __init__(self):
        """ Model inicialization 
        """
        self.model = AlternatingLeastSquares()
        self.trainset = None

    def fit(self, X, y):
        # build a COO matrix from X (user, item index pairs) and y (values)
        data = coo_matrix((y, (X[:, 0], X[:, 1])))
        self.trainset = data
        # implicit's older fit() expects rows=items, columns=users;
        # transpose() returns a new matrix, so pass its result to fit
        self.model.fit(data.transpose().tocsr())

    def recommend(self, user_id, N=1):
        # older implicit API: returns a list of (item_id, score) tuples
        n_recommendations = self.model.recommend(user_id, self.trainset.tocsr(), N=N)
        # keep only the item ids
        return np.array([item_id for item_id, _ in n_recommendations], dtype=int)

    def get_params(self, deep=True):
        return dict()
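
A small sketch of how this wrapper might be exercised, assuming X holds integer (user index, item index) pairs and y the interaction strengths; all values are made up.

import numpy as np

X = np.array([[0, 0], [0, 1], [1, 1], [2, 2]])  # (user_idx, item_idx) rows
y = np.array([1.0, 1.0, 1.0, 1.0])

als = ALS()
als.fit(X, y)
print(als.recommend(user_id=0, N=2))  # item indices recommended for user 0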
Example #5
    def __init__(self,
                 user_df,
                 song_df,
                 k=100,
                 knn_frac=0.5,
                 max_overlap=0.2,
                 cf_weighting_alpha=1,
                 min_songs=5,
                 mode='popular'):

        self.user_df = user_df
        self.song_df = song_df
        self.cf_weighting_alpha = cf_weighting_alpha
        self.knn_frac = knn_frac
        self.k = k
        self.max_overlap = max_overlap
        self.min_songs = min_songs
        self.mode = mode

        user_df_subset = user_df.loc[user_df['num_songs'] > (min_songs - 1)]
        self.kdtree = KDTree(user_df_subset['MUSIC'].tolist())

        # build the collaborative filtering model with hardcoded params
        als_params = {
            'factors': 16,
            'dtype': np.float32,
            'iterations': 2,
            'calculate_training_loss': True
        }
        self.cf_model = AlternatingLeastSquares(**als_params)
Example #6
    def test_cg_nan(self):
        # test issue with CG code that was causing NaN values in output:
        # https://github.com/benfred/implicit/issues/19#issuecomment-283164905
        raw = [[0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25, 1.2],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333, 1.25],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5, 1.33333333],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.5],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0],
               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
        counts = csr_matrix(raw, dtype=np.float64)
        for use_native in (True, False):
            model = AlternatingLeastSquares(factors=3,
                                            regularization=0.01,
                                            dtype=np.float64,
                                            use_native=use_native,
                                            use_cg=True,
                                            use_gpu=False)
            model.fit(counts, show_progress=False)
            rows, cols = model.item_factors, model.user_factors

            self.assertFalse(np.isnan(np.sum(cols)))
            self.assertFalse(np.isnan(np.sum(rows)))
Example #7
def make_latent_feature(df: pd.DataFrame,
                        index_col: str,
                        value_col: str,
                        n_factors: int,
                        n_iterations: int,
                        sum_col: Optional[str] = None):
    if sum_col is None:
        csr = make_count_csr(df, index_col=index_col, value_col=value_col)
    else:
        csr = make_sum_csr(
            df,
            index_col=index_col,
            value_col=value_col,
            col_to_sum=sum_col,
        )

    model = AlternatingLeastSquares(
        factors=n_factors,
        dtype=np.float32,
        iterations=n_iterations,
        regularization=0.1,
        use_gpu=False,  # True if n_factors >= 32 else False,
    )
    np.random.seed(RANDOM_STATE)
    model.fit(csr.T)

    return model.user_factors
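
make_count_csr, make_sum_csr, and RANDOM_STATE are project helpers not shown here. Assuming make_count_csr builds an index-by-value count matrix, usage might look like this (hypothetical frame and column names):

import pandas as pd

events = pd.DataFrame({
    "user_idx": [0, 0, 1, 2],
    "item_idx": [5, 7, 5, 9],
})
# one n_factors-dimensional embedding per distinct index_col value
user_vectors = make_latent_feature(events, index_col="user_idx",
                                   value_col="item_idx",
                                   n_factors=8, n_iterations=3)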
Example #8
class ALSEstimator(BaseEstimator, TransformerMixin):
    def __init__(self,
                 factors=50,
                 regularization=0.01,
                 iterations=10,
                 filter_seen=True):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen

    def fit(self, X, y=None):
        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            dtype=np.float64,
            use_native=True,
            use_cg=True)
        self.model.fit(X)
        if self.filter_seen:
            self.fit_X = X
        return self

    def predict(self, X, y=None):
        predictions = np.dot(self.model.item_factors,
                             self.model.user_factors.T)
        if self.filter_seen:
            predictions[self.fit_X.nonzero()] = -99
        return predictions
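
Because predict multiplies item_factors by user_factors.T and masks the nonzero entries of the fitted matrix, X is expected to be item-by-user here. A hedged sketch on synthetic data:

import numpy as np
from scipy.sparse import csr_matrix

# hypothetical item-by-user interactions (rows = items, columns = users)
item_users = csr_matrix(np.array([[1, 0, 2],
                                  [0, 1, 0],
                                  [3, 1, 0]], dtype=np.float64))

est = ALSEstimator(factors=2, iterations=5)
est.fit(item_users)
scores = est.predict(item_users)  # dense scores; seen entries forced to -99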
Example #9
    def test_cg_nan2(self):
        # test NaN values appearing in CG code (from https://github.com/benfred/implicit/issues/106)
        Ciu = random(m=100,
                     n=100,
                     density=0.0005,
                     format='coo',
                     dtype=np.float32,
                     random_state=42,
                     data_rvs=None).T.tocsr()

        configs = [{
            'use_native': True,
            'use_gpu': False
        }, {
            'use_native': False,
            'use_gpu': False
        }]
        if HAS_CUDA:
            configs.append({'use_gpu': True})

        for options in configs:
            model = AlternatingLeastSquares(factors=32,
                                            regularization=10,
                                            iterations=10,
                                            dtype=np.float32,
                                            **options)
            model.fit(Ciu, show_progress=False)

            self.assertTrue(np.isfinite(model.item_factors).all())
            self.assertTrue(np.isfinite(model.user_factors).all())
Example #10
def calculate_similar_event(path, output_filename):
    model = AlternatingLeastSquares()

    a, b = read_event_data(path)
    event, users = hfd5_from_dataframe(a, b, output_filename)

    users.eliminate_zeros()
    users.data = np.ones(len(users.data))

    log.info("Start fitting")
    model.fit(users)

    user_count = np.ediff1d(users.indptr)
    to_generate = sorted(np.arange(len(event)), key=lambda x: -user_count[x])

    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf-8") as o:
            for eventid in to_generate:
                if users.indptr[eventid] != users.indptr[eventid + 1]:
                    name = event[eventid]
                    for other, score in model.similar_items(
                        eventid, int(len(event) * 2 / 3)
                    ):
                        o.write(f"{name},{event[other]},{score}\n")
                progress.update(1)
Example #11
    def test_factorize(self):
        counts = csr_matrix(
            [
                [1, 1, 0, 1, 0, 0],
                [0, 1, 1, 1, 0, 0],
                [1, 0, 1, 0, 0, 0],
                [1, 1, 0, 0, 0, 0],
                [0, 0, 1, 1, 0, 1],
                [0, 1, 0, 0, 0, 1],
                [0, 0, 0, 0, 1, 1],
            ],
            dtype=np.float64,
        )
        user_items = counts * 2

        # try all 8 variants of native/python, cg/cholesky, and
        # 64 vs 32 bit factors
        options = [(dtype, cg, native, False)
                   for dtype in (np.float32, np.float64)
                   for cg in (False, True) for native in (False, True)]

        # also try out GPU support if available
        if HAS_CUDA:
            options.append((np.float32, False, False, True))

        for dtype, use_cg, use_native, use_gpu in options:
            try:
                model = AlternatingLeastSquares(
                    factors=6,
                    regularization=0,
                    dtype=dtype,
                    use_native=use_native,
                    use_cg=use_cg,
                    use_gpu=use_gpu,
                    random_state=42,
                )
                model.fit(user_items, show_progress=False)
                rows, cols = model.item_factors, model.user_factors

                if use_gpu:
                    rows, cols = rows.to_numpy(), cols.to_numpy()

            except Exception as e:
                self.fail(msg="failed to factorize matrix. Error=%s"
                          " dtype=%s, cg=%s, native=%s gpu=%s" %
                          (e, dtype, use_cg, use_native, use_gpu))

            reconstructed = rows.dot(cols.T)
            for i in range(counts.shape[0]):
                for j in range(counts.shape[1]):
                    self.assertAlmostEqual(
                        counts[i, j],
                        reconstructed[i, j],
                        delta=0.0001,
                        msg="failed to reconstruct row=%s, col=%s,"
                        " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s" %
                        (i, j, reconstructed[i, j], dtype, use_cg, use_native,
                         use_gpu),
                    )
Example #12
def benchmark_implicit(matrix, factors, reg, iterations):
    start = time.time()
    model = AlternatingLeastSquares(factors,
                                    regularization=reg,
                                    iterations=iterations,
                                    use_cg=True)
    model.fit(matrix)
    return time.time() - start
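
One way to exercise this benchmark on synthetic data; the matrix shape, density, and orientation below are assumptions, not part of the original.

import numpy as np
from scipy.sparse import random as sparse_random

matrix = sparse_random(1000, 500, density=0.01, format="csr",
                       dtype=np.float32, random_state=0)
elapsed = benchmark_implicit(matrix, factors=64, reg=0.01, iterations=5)
print("implicit ALS fit took %.2fs" % elapsed)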
Example #13
 def _prep_for_fit(self, train_obs, **fit_params):
     # self.toggle_mkl_blas_1_thread(True)
     self._set_data(train_obs)
     self.set_params(**fit_params)
     self.model = AlternatingLeastSquares(**self.model_params)
     self.model.cg_steps = self.fit_params['cg_steps']  # not passable to __init__()
     self._set_implib_train_mat(self.train_mat)
Example #14
class AlsRecommender(OwnRecommender):
    """Модель, обученная ALS

    Input
    -----
    ds: RecommenderDataset
        подготовленный RecommenderDataset обьект
    """

    def fit(self, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(self.ds.csr_matrix)

        return self

    def _similarItems(self, userId, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        def _get_similar_item(item_id):
            """Находит товар, похожий на item_id"""
            recs = self.model.similar_items(self.ds.itemid_to_id[item_id], N=2)
            if len(recs) > 1:
                top_rec = recs[1][0]
                return self.ds.id_to_itemid[top_rec]
            return item_id

        res = [_get_similar_item(item) for item in self.ds.userTop(userId, N)]
        return self.extend(res, N)

    def _similarUsers(self, userId, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        res = []
        similar_users = [rec[0] for rec in self.model.similar_users(self.ds.userid_to_id[userId], N=N+1)]
        similar_users = similar_users[1:]

        for user in similar_users:
            res.extend(self.ds.userTop(user, 1))

        return self.extend(res, N)

    def items_embeddings(self):
        emb = pd.DataFrame(data=self.model.item_factors).add_prefix('itm')
        emb['item_id'] = self.ds.itemids
        return emb

    def users_embeddings(self):
        emb = pd.DataFrame(data=self.model.user_factors).add_prefix('usr')
        emb['user_id'] = self.ds.userids
        return emb
Example #15
    def fit(user_item_matrix, factors=20, regularization=0.001, iterations=15):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=factors,
                                        regularization=regularization,
                                        iterations=iterations)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model
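
The function expects a dense user-by-item pivot, which it wraps in csr_matrix and transposes before fitting. A sketch of preparing such a pivot with pandas, with hypothetical column names, calling the function as if extracted from its class:

import pandas as pd

purchases = pd.DataFrame({
    "user_id": [1, 1, 2, 3],
    "item_id": [10, 20, 10, 30],
    "quantity": [2, 1, 5, 1],
})
user_item_matrix = purchases.pivot_table(index="user_id", columns="item_id",
                                         values="quantity", aggfunc="sum",
                                         fill_value=0)
model = fit(user_item_matrix.values.astype(float), factors=20)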
Example #16
 def __init__(self, k, reg=1e-4, n_iters=15):
     """"""
     super().__init__()
     self.k = k
     self.reg = reg
     self.n_iters = n_iters
     self.als = AlternatingLeastSquares(
         k, regularization=reg, iterations=n_iters
     )
Example #17
    def _add_als_recs(self,
                      n_factors=20,
                      regularization=0.001,
                      iterations=20,
                      num_threads=0):

        als_model = AlternatingLeastSquares(factors=n_factors,
                                            regularization=regularization,
                                            iterations=iterations,
                                            num_threads=num_threads)

        als_model.fit(csr_matrix(self.user_item_matrix).T.tocsr())
        self.als_model = als_model

        als_recs = lambda i: [
            self.id_to_itemid[rec[0]] for rec in als_model.recommend(
                userid=int(i),
                user_items=csr_matrix(self.user_item_matrix).tocsr(),
                N=self.first_model_rec_limit,
                filter_items=[self.itemid_to_id[999999]],
                recalculate_user=True,
                filter_already_liked_items=False)
        ]
        self.df_users['als_recommender'] = None
        self.df_users.loc[~self.df_users['id'].isnull(),
                          'als_recommender'] = self.df_users.loc[
                              ~self.df_users['id'].isnull(),
                              'id'].map(als_recs)
        self.df_users['als_recommender'] = self.df_users[
            'als_recommender'].map(lambda val: val if isinstance(val, list) else [])

        # adding embeddings to df_users and df_items as features
        als_user_factors = pd.DataFrame(
            self.als_model.user_factors,
            columns=[
                f'als_user_factor_{i}'
                for i in range(self.als_model.user_factors.shape[1])
            ])
        als_user_factors['id'] = als_user_factors.index
        self.df_users = pd.merge(left=self.df_users,
                                 right=als_user_factors,
                                 on='id',
                                 how='left')

        als_item_factors = pd.DataFrame(
            self.als_model.item_factors,
            columns=[
                f'als_item_factor_{i}'
                for i in range(self.als_model.item_factors.shape[1])
            ])
        als_item_factors['id'] = als_item_factors.index
        self.df_items = pd.merge(left=self.df_items,
                                 right=als_item_factors,
                                 on='id',
                                 how='left')
Example #18
def main(params):
    """Main function."""
    # check for mandatory params
    if 'reference_repo' not in params:
        return {'error': 'Mandatory param reference_repo not present'}

    reference_repo = params['reference_repo']
    LOGGER.info('reference_repo %s', reference_repo)

    # get data
    LOGGER.info('read GBQ data')
    _GC_SVC_ACCOUNT['private_key_id'] = params['GC_SVC_PRIVATE_KEY_ID']
    _GC_SVC_ACCOUNT['private_key'] = params['GC_SVC_PRIVATE_KEY']
    data = pd.io.gbq.read_gbq(_QUERY,
                              dialect="standard",
                              project_id=_GC_SVC_ACCOUNT['project_id'],
                              private_key=json.dumps(_GC_SVC_ACCOUNT))

    # map each repo and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['repo'] = data['repo'].astype("category")

    # dictionaries to translate names to ids and vice-versa
    repos = dict(enumerate(data['repo'].cat.categories))
    repo_ids = {r: i for i, r in repos.items()}

    if reference_repo not in repo_ids:
        return {"message": "No result. Reference repo not in training set."}

    # create a sparse matrix of all the users/repos
    stars = coo_matrix(
        (np.ones(data.shape[0]), (data['repo'].cat.codes.copy(),
                                  data['user'].cat.codes.copy())))

    # train model
    LOGGER.info('training model')
    model = AlternatingLeastSquares(
        factors=50,
        regularization=0.01,
        dtype=np.float64,  # pylint: disable=no-member
        iterations=50)
    confidence = 40
    model.fit(confidence * stars)

    similar_ids = model.similar_items(repo_ids[reference_repo])
    LOGGER.info('found %d similar repos', len(similar_ids))

    similar_repos = []
    for idx in range(1, len(similar_ids)):
        similar_repos.append(repos[similar_ids[idx][0]])

    return {
        'reference_repo': reference_repo,
        'similar_repos': similar_repos,
        'error': ''
    }
Example #19
    def fit(user_item_matrix):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=100,
                                        regularization=0.01,
                                        iterations=15,
                                        num_threads=4)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model
Example #20
    def fit(user_item_matrix, n_factors=20, regularization=0.1, iterations=40, num_threads=0):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model
Example #21
def _train_als(hyperparameters, train):
    h = hyperparameters
    model = AlternatingLeastSquares(factors=h['factors'],
                                    iterations=h['n_iter'],
                                    num_threads=nproc)

    model.fit(train)
    #    test_eval = {'p@k': precision_at_k(model, train.T.tocsr(), factorization.T.tocsr(), K=10)}
    #    val_eval = {'p@k': precision_at_k(model, train.T.tocsr(), validation.T.tocsr(), K=10)}
    return model
Example #22
 def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4, show_progress=False):
     """Trains the ALS model"""

     model = AlternatingLeastSquares(factors=n_factors,
                                     regularization=regularization,
                                     iterations=iterations,
                                     num_threads=num_threads)
     model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=show_progress)

     return model
Example #23
 def __init__(self, params=None, nunique_feature=None):
     # avoid a mutable default argument and don't mutate the caller's dict
     params = dict(params) if params is not None else {"c": None}
     self.params = params.copy()
     self.c = params.pop("c")
     self.model = ALS(**params)
     self.song_model = ALS(**params)
     self.tag_model = ALS(**params)
     self.song_rec_csr = None
     self.tag_rec_csr = None
     self.nunique_feature = nunique_feature
Example #24
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight the ratings matrix with bm25_weight
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge'
                # has no ratings > 4, meaning we've filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
Example #25
def collab_filter(song_id, user_song_df, num_songs=5):
    '''
    song_id: Spotify id for an individual song
    user_song_df: DataFrame with users, songs, play counts, etc.
    Filtering by key/tempo is disabled for now (not enough songs),
    but may be enabled in the future.
    '''

    song_num = user_song_df[user_song_df.spotify_id ==
                            song_id].song_nums.values[0]
    print(song_num)
    print(type(song_num))
    #orig_key = song_list[song_list.spotify_id==song_id].key.values[0]
    #orig_tempo= song_list[song_list.spotify_id==song_id].tempo.values[0]

    #check if you want songs of same key
    #if same_key=='yes':
    #if yes then filter out other keys
    #    print(f'key:{orig_key}')
    #    song_list = song_list[song_list.key ==orig_key]

    #can also enter number to specify what key you want
    # elif type(same_key) !=str:
    #     song_list = song_list[song_list.key==same_key]

    # check if you want similar tempo
    #  if similar_tempo=='yes':
    #     print(f'tempo:{orig_tempo}')
    #if yes can also specify how similar you want it
    #     lower= int(orig_tempo)-margin
    #    higher=int(orig_tempo)+margin
    #    song_list=song_list[song_list.tempo.between(lower,higher)]

    #elif type(similar_tempo) !=str:
    #can also specify a specific tempo that you want
    #   song_list = song_list[song_list.tempo.between(int(similar_tempo)-margin,int(similar_tempo)+margin)]

    # refined_ids=song_list.spotify_id
    #this will be updated
    user_song_refined = user_song_df
    #[user_song_df.spotify_id.isin(
    #    refined_ids)].copy()

    plays = user_song_refined['size']
    user_nums = user_song_refined.user_nums
    song_nums = user_song_refined.song_nums

    B = coo_matrix((plays, (song_nums, user_nums))).tocsr()

    model = AlternatingLeastSquares(factors=30)
    model.fit(B)
    songs_inds = model.similar_items(song_num, N=num_songs)
    songs_inds = [tup[0] for tup in songs_inds]

    return user_song_df[user_song_df.song_nums.isin(songs_inds)]
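
A hypothetical frame with just the columns the function actually reads (spotify_id, user_nums, song_nums, size) is enough to run it:

import pandas as pd

user_song_df = pd.DataFrame({
    "spotify_id": ["s1", "s2", "s1", "s3"],
    "user_nums": [0, 0, 1, 1],
    "song_nums": [0, 1, 0, 2],
    "size": [3, 1, 2, 4],
})
similar = collab_filter("s1", user_song_df, num_songs=2)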
Example #26
def load_recommender(als_model_file: str,
                     index_file: str,
                     item_feature_file: str = None,
                     **kwargs) -> ImplicitRecommender:
    log.info("Loading als model")
    data = np.load(als_model_file, allow_pickle=True)
    model = AlternatingLeastSquares(
        factors=data['model.item_factors'].shape[1])
    model.item_factors = data['model.item_factors']
    model.YtY  # This will initialize the _YtY instance variable which is used directly in internal methods
    if 'model.user_factors' in data:
        model.user_factors = data['model.user_factors']

    user_labels = data['user_labels']
    item_labels = data['item_labels']

    if index_file is None:
        return ImplicitRecommender(model, user_labels, item_labels)

    elif index_file.endswith('.ann'):
        import annoy
        log.info("Loading annoy recommendation index")
        max_norm, extra = augment_inner_product_matrix(model.item_factors)
        recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        recommend_index.load(
            index_file)  # prefault=load_to_memory does not seem to work

        if item_feature_file is None:
            from .annoy import ImplicitAnnoyRecommender
            return ImplicitAnnoyRecommender(model, recommend_index, max_norm,
                                            user_labels, item_labels)
        else:
            log.info("Loading item features for recommendation")
            item_feature_data = pickle.load(open(item_feature_file, "rb"))
            tag_tfidf_transformer = item_feature_data['tag_tfidf_transformer']
            tag_lookup = item_feature_data['tag_lookup']
            item_embedding_weight = item_feature_data['item_embedding_weight']
            from .annoy_item_features import ImplicitAnnoyItemFeatureRecommender
            return ImplicitAnnoyItemFeatureRecommender(
                model, recommend_index, max_norm, user_labels, item_labels,
                tag_tfidf_transformer, tag_lookup, item_embedding_weight)
    elif index_file.endswith('.hnsw'):
        import hnswlib
        from .hnsw import ImplicitHNSWRecommender
        log.info("Loading hnsw recommendation index")
        # we build the index in l2 space and load it in inner product space on purpose.
        # This space change gives us 0.96 recall
        l2_recommend_index = hnswlib.Index(space='ip',
                                           dim=model.item_factors.shape[1])
        l2_recommend_index.load_index(index_file)
        l2_recommend_index.set_ef(kwargs.get('ef', 2000))
        return ImplicitHNSWRecommender(model, l2_recommend_index, user_labels,
                                       item_labels)
    else:
        raise RecommenderException("Unsupported file type: " + index_file)
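
A sketch of producing the .npz archive this loader expects. The key names come from the function body; the model, user_labels, and item_labels variables are assumed to exist from a prior training run, and ImplicitRecommender comes from the same module.

import numpy as np

np.savez("als_model.npz",
         **{"model.item_factors": model.item_factors,
            "model.user_factors": model.user_factors,
            "user_labels": user_labels,
            "item_labels": item_labels})
recommender = load_recommender("als_model.npz", index_file=None)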
Example #27
    def fit(self, user_item_matrix, n_factors, regularization=0.001, iterations=50, num_threads=1, use_gpu=False):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads,
                                        use_gpu=use_gpu)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model
Example #28
 def fit(user_item_matrix, n_factors=32, regularization=0.001, iterations=15, num_threads=16):
     """Trains the ALS model"""

     model = AlternatingLeastSquares(factors=n_factors,
                                     regularization=regularization,
                                     iterations=iterations,
                                     calculate_training_loss=True,
                                     num_threads=num_threads)
     model.fit(csr_matrix(user_item_matrix).T.tocsr())

     return model
Example #29
    def test_factorize(self):
        counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                             [0, 1, 1, 1, 0, 0],
                             [1, 0, 1, 0, 0, 0],
                             [1, 1, 0, 0, 0, 0],
                             [0, 0, 1, 1, 0, 1],
                             [0, 1, 0, 0, 0, 1],
                             [0, 0, 0, 0, 1, 1]], dtype=np.float64)
        user_items = counts * 2

        # try all 8 variants of native/python, cg/cholesky, and
        # 64 vs 32 bit factors
        options = [(dtype, cg, native, False)
                   for dtype in (np.float32, np.float64)
                   for cg in (False, True)
                   for native in (False, True)]

        # also try out GPU support if available
        if HAS_CUDA:
            options.append((np.float32, False, False, True))

        for dtype, use_cg, use_native, use_gpu in options:
            try:
                model = AlternatingLeastSquares(factors=6,
                                                regularization=0,
                                                dtype=dtype,
                                                use_native=use_native,
                                                use_cg=use_cg,
                                                use_gpu=use_gpu)
                np.random.seed(23)
                model.fit(user_items, show_progress=False)
                rows, cols = model.item_factors, model.user_factors

            except Exception as e:
                self.fail(msg="failed to factorize matrix. Error=%s"
                              " dtype=%s, cg=%s, native=%s gpu=%s"
                              % (e, dtype, use_cg, use_native, use_gpu))

            reconstructed = rows.dot(cols.T)
            for i in range(counts.shape[0]):
                for j in range(counts.shape[1]):
                    self.assertAlmostEqual(counts[i, j], reconstructed[i, j],
                                           delta=0.0001,
                                           msg="failed to reconstruct row=%s, col=%s,"
                                               " value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s"
                                               % (i, j, reconstructed[i, j], dtype, use_cg,
                                                  use_native, use_gpu))
Example #30
def calculate_similar_movies(output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight the ratings matrix with bm25_weight
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings,  B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge'
                # has no ratings > 4, meaning we've filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
Example #31
    def test_explain(self):
        counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                             [0, 1, 1, 1, 0, 0],
                             [1, 4, 1, 0, 7, 0],
                             [1, 1, 0, 0, 0, 0],
                             [9, 0, 4, 1, 0, 1],
                             [0, 1, 0, 0, 0, 1],
                             [0, 0, 2, 0, 1, 1]], dtype=np.float64)
        user_items = counts * 2
        item_users = user_items.T

        model = AlternatingLeastSquares(factors=4,
                                        regularization=20,
                                        use_native=False,
                                        use_cg=False,
                                        iterations=100)
        np.random.seed(23)
        model.fit(user_items, show_progress=False)

        userid = 0

        # Assert recommendation is the same if we recompute user vectors
        recs = model.recommend(userid, item_users, N=10)
        recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True)
        for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
            self.assertEqual(item1, item2)
            self.assertAlmostEqual(score1, score2, 4)

        # Assert explanation makes sense
        top_rec, score = recalculated_recs[0]
        score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec)
        scores = [s for _, s in contributions]
        items = [i for i, _ in contributions]
        self.assertAlmostEqual(score, score_explained, 4)
        self.assertAlmostEqual(score, sum(scores), 4)
        self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
        self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

        # Assert explanation with precomputed user weights is correct
        top_score_explained, top_contributions, W = model.explain(
            userid, item_users, itemid=top_rec, user_weights=W, N=2)
        top_scores = [s for _, s in top_contributions]
        top_items = [i for i, _ in top_contributions]
        self.assertEqual(2, len(top_contributions))
        self.assertAlmostEqual(score, top_score_explained, 4)
        self.assertEqual(scores[:2], top_scores)
        self.assertEqual(items[:2], top_items)
Example #32
def benchmark_accuracy(plays):
    output = defaultdict(list)

    def store_loss(model, name):
        def inner(iteration, elapsed):
            loss = calculate_loss(plays, model.item_factors, model.user_factors, 0)
            print("model %s iteration %i loss %.5f" % (name, iteration, loss))
            output[name].append(loss)
        return inner

    for steps in [2, 3, 4]:
        model = AlternatingLeastSquares(factors=100, use_native=True, use_cg=True, regularization=0,
                                        iterations=25)
        model.cg_steps = steps
        model.fit_callback = store_loss(model, 'cg%i' % steps)
        model.fit(plays)

    if has_cuda:
        model = AlternatingLeastSquares(factors=100, use_native=True, use_gpu=True,
                                        regularization=0, iterations=25)
        model.fit_callback = store_loss(model, 'gpu')
        model.use_gpu = True
        model.fit(plays)

    model = AlternatingLeastSquares(factors=100, use_native=True, use_cg=False, regularization=0,
                                    iterations=25)
    model.fit_callback = store_loss(model, 'cholesky')
    model.fit(plays)

    return output
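
benchmark_accuracy relies on calculate_loss and has_cuda being imported as in the original benchmark script; given those, a synthetic plays matrix (items as rows, users as columns, as implicit's older fit() expects) is enough to run it.

import numpy as np
from scipy.sparse import random as sparse_random

plays = sparse_random(2000, 500, density=0.005, format="csr",
                      dtype=np.float32, random_state=1)
losses = benchmark_accuracy(plays)  # solver name -> loss per iteration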
Example #33
def benchmark_times(plays, iterations=3):
    times = defaultdict(lambda: defaultdict(list))

    def store_time(model, name):
        def inner(iteration, elapsed):
            print(name, model.factors, iteration, elapsed)
            times[name][model.factors].append(elapsed)
        return inner

    output = defaultdict(list)
    for factors in range(32, 257, 32):
        for steps in [2, 3, 4]:
            model = AlternatingLeastSquares(factors=factors, use_native=True, use_cg=True,
                                            regularization=0, iterations=iterations)
            model.fit_callback = store_time(model, 'cg%i' % steps)
            model.cg_steps = steps
            model.fit(plays)

        model = AlternatingLeastSquares(factors=factors, use_native=True, use_cg=False,
                                        regularization=0, iterations=iterations)
        model.fit_callback = store_time(model, 'cholesky')
        model.fit(plays)

        if has_cuda:
            model = AlternatingLeastSquares(factors=factors, use_native=True, use_gpu=True,
                                            regularization=0, iterations=iterations)
            model.fit_callback = store_time(model, 'gpu')
            model.fit(plays)

        # take the min time for the output
        output['factors'].append(factors)
        for name, stats in times.items():
            output[name].append(min(stats[factors]))

    return output