Ejemplo n.º 1
0
class BPRRecommender(BaseRecommender):
    """Recommender backed by implicit's Bayesian Personalized Ranking model."""

    def __init__(self):
        super().__init__()
        # All of these are populated by fit().
        self.URM = None
        self.item_factors = None
        self.user_factors = None
        self.model = BayesianPersonalizedRanking(factors=1000, num_threads=8, verify_negative_samples=True)

    def fit(self, URM):
        """Train the BPR model on the (item x user) transpose of `URM`."""
        self.URM = URM
        self.model.fit(self.URM.T)
        self.user_factors = self.model.user_factors
        self.item_factors = self.model.item_factors

    def get_expected_ratings(self, user_id):
        """Return a dense score for every item for the given user."""
        return np.squeeze(self.user_factors[user_id].dot(self.item_factors.T))

    def recommend(self, user_id, at=10):
        """Return the top-`at` unseen items for `user_id`, best first."""
        ratings = self.get_expected_ratings(user_id)

        # Rank all items by descending predicted score.
        ranking = np.argsort(ratings)[::-1]

        # Drop items the user already interacted with.
        seen = self.URM[user_id].indices
        keep_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)

        return ranking[keep_mask][:at]
Ejemplo n.º 2
0
    def train(self):
        """Train the BPR model or reload a saved index, then expose an Annoy ANN index.

        Reload mode: item ids are read back from ``item_vec_file`` and the
        prebuilt Annoy index is loaded from ``index_file_file``.
        Train mode: a BPR model is fit on ``train_file``, the item vectors are
        dumped to ``item_vec_file``, and the Annoy index is built and saved.
        """
        b_time = time.time()
        self.item_idx, self.item_idx_reverse = {}, {}

        if self.reload:
            with open(self.config['item_vec_file'], 'r') as in_f:
                # Header line: "<num_items> <dim>"
                num_items, dim = in_f.readline().strip().split()
                print(f'Num of items : {num_items}, dim : {dim}')

                for idx, line in tqdm(enumerate(in_f)):
                    tmp = line.split()
                    item = tmp[0]
                    self.item_idx[item] = idx
                    self.item_idx_reverse[idx] = item

                self.t = AnnoyIndex(int(dim), 'angular')
                # BUG FIX: `file_name` was read before assignment here; the
                # saved index path comes from the config.
                file_name = self.config['index_file_file']
                self.t.load(f'{file_name}.ann')
        else:
            Y = []
            with open(self.config['train_file'], 'r') as in_f:
                for idx, line in tqdm(enumerate(in_f)):
                    items_list = line.strip().split()
                    Y.append([self.__get_id(item) for item in items_list])
            # construct the sparse user x item matrix in CSR form
            indptr = np.fromiter(chain((0,), map(len, Y)), int, len(Y) + 1).cumsum()
            indices = np.fromiter(chain.from_iterable(Y), int, indptr[-1])
            data = np.ones_like(indices)
            user_item_table_csr = csr_matrix((data, indices, indptr))
            item_user_table_csr = user_item_table_csr.T.tocsr()
            print('Matrix size : ', item_user_table_csr.shape)
            print("Train finished ... : ", time.time() - b_time)

            # Train the matrix-factorization model.
            model_name = "bpr"
            self.model = BayesianPersonalizedRanking(num_threads=20)
            # BUG FIX: printf-style args were passed to print(); format explicitly.
            print(f"training model {model_name}")
            start = time.time()
            self.model.fit(item_user_table_csr)
            print(f"trained model '{model_name}' in {time.time() - start}")

            items_count, dim = self.model.item_factors.shape
            # Build the Annoy index over the learned item embeddings.
            self.t = AnnoyIndex(int(dim), 'angular')

            # BUG FIX: vectors are written here, so the file must be opened for
            # writing — and via self.config, not a module-level `config`.
            with open(self.config['item_vec_file'], 'w') as out_f:
                print(f"{items_count} {dim}", file=out_f)
                for idx, vec in tqdm(enumerate(self.model.item_factors)):
                    self.t.add_item(idx, vec)
                    print(f"{self.item_idx_reverse[idx]} {' '.join(vec.astype(str))}", file=out_f)

            file_name = self.config['index_file_file']
            self.t.build(30)  # number of Annoy trees
            self.t.save(f'{file_name}.ann')

        print(f"Train finished ...{time.time() - b_time}")
Ejemplo n.º 3
0
def _train_bpr(hyperparameters, train):
    """Fit a BPR model on `train` using the given hyperparameter dict."""
    params = hyperparameters
    bpr = BayesianPersonalizedRanking(
        factors=params['factors'],
        iterations=params['n_iter'],
        num_threads=nproc,
    )
    bpr.fit(train)
    return bpr
Ejemplo n.º 4
0
    def fit(user_item_matrix,
            n_factors=20,
            regularization=0.001,
            iterations=15,
            num_threads=4):
        """Train a BPR model on the given user-item interaction matrix."""
        bpr = BayesianPersonalizedRanking(factors=n_factors,
                                          regularization=regularization,
                                          iterations=iterations,
                                          num_threads=num_threads)
        # implicit expects an item x user CSR matrix, hence the transpose.
        bpr.fit(csr_matrix(user_item_matrix).T.tocsr())
        return bpr
Ejemplo n.º 5
0
def get_aucs_vs_factors():
    """Return test-set AUCs for BPR models over a range of factor sizes."""
    results = []
    for factor in [8, 16, 32, 64, 128]:
        model = BayesianPersonalizedRanking(factors=factor)
        model.fit(comments)
        results.append(
            auc(test_set[:20000], model.user_factors, model.item_factors,
                subreddits, users))
    return results
Ejemplo n.º 6
0
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Write the 10 most similar movies for every MovieLens title to a TSV file.

    Each output line is "<title>\t<similar title>\t<score>".

    :param output_filename: path of the tab-separated output file
    :param model_name: one of "als", "bpr", "lmf", "tfidf", "cosine", "bm25"
    :param min_rating: ratings below this threshold are dropped before binarizing
    :param variant: MovieLens dataset variant passed to get_movielens
    """
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # rank movies by popularity so the most-rated ones are processed first
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4 meaning we've filtered out all data for it.
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
Ejemplo n.º 7
0
 def __init__(self, *args, **kwargs):
     """
     Construct a BPR recommender.  The arguments are passed as-is to
     :py:class:`implicit.bpr.BayesianPersonalizedRanking`.
     """
     from implicit.bpr import BayesianPersonalizedRanking
     super().__init__(BayesianPersonalizedRanking(*args, **kwargs))
Ejemplo n.º 8
0
def train_implicit_bpr(
    train_df: pd.DataFrame, params: Dict[str, Union[str, float, int]],
    shape: Tuple[int,
                 int]) -> Tuple[BayesianPersonalizedRanking, sp.csr.csr_matrix]:
    """Train an implicit BPR model on interactions built from `train_df`.

    :param train_df: raw interaction dataframe
    :param params: candidate hyperparameters; only keys that match the
        BayesianPersonalizedRanking constructor signature are forwarded
    :param shape: (n_users, n_items) shape of the training matrix
    :return: (fitted model, CSR training matrix)
    """
    train_matrix = create_implicit_train_matrix(train_df, shape)

    # Keep only the params the BPR constructor actually accepts.
    accepted = inspect.getfullargspec(BayesianPersonalizedRanking.__init__)[0]
    model_params = {k: v for k, v in params.items() if k in accepted}

    model = BayesianPersonalizedRanking(**model_params)
    model.fit(train_matrix, show_progress=False)

    return model, train_matrix
Ejemplo n.º 9
0
def evaluate_bpr_model(hyperparameters, train, test, validation):
    """Train a BPR model on `train` and report p@10 on test and validation."""
    params = hyperparameters
    model = BayesianPersonalizedRanking(factors=params['factors'],
                                        iterations=params['n_iter'],
                                        num_threads=nproc)
    model.fit(model and train)

    # Hoist the shared user x item CSR view of the training data.
    train_csr = train.T.tocsr()
    test_eval = {'p@k': precision_at_k(model, train_csr, test.T.tocsr(), K=10)}
    val_eval = {'p@k': precision_at_k(model, train_csr, validation.T.tocsr(), K=10)}
    return test_eval, val_eval
Ejemplo n.º 10
0
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    """Write the 10 most similar movies for every movie in the dataset to a TSV file.

    Each output line is "<title>\t<similar title>\t<score>".

    :param input_path: path of the ratings data file read by read_data
    :param output_filename: path of the tab-separated output file
    :param model_name: one of "als", "bpr", "tfidf", "cosine", "bm25"
    :param min_rating: ratings below this threshold are dropped by read_data
    """
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    # popularity ranking: most-rated movies are processed first
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
            # no ratings > 4 meaning we've filtered out all data for it.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
Ejemplo n.º 11
0
 def initialize_components(self):
     """Instantiate the component recommenders used by the ensemble."""
     # BPR matrix factorization (no regularization).
     self.bpr_mf = BPR_matrix_factorization(factors=200,
                                            regularization=0.00000,
                                            learning_rate=0.01,
                                            iterations=65)
     # Implicit ALS solved by conjugate gradient.
     self.ials_cg_mf = IALS_CG(iterations=15,
                               calculate_training_loss=True,
                               factors=500,
                               use_cg=True,
                               regularization=1e-3)
Ejemplo n.º 12
0
    def fit(user_item_matrix,
            n_factors=20,
            regularization=0.001,
            iterations=15,
            num_threads=0):
        """Train a BPR model on the given user-item matrix and return it.

        NOTE: the original docstring claimed this trains ALS, but the code
        instantiates BayesianPersonalizedRanking.
        """
        model = BayesianPersonalizedRanking(
            factors=n_factors,
            regularization=regularization,
            iterations=iterations,
            num_threads=num_threads,
        )

        # implicit expects an item x user CSR matrix, hence the transpose.
        model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=False)
        return model
Ejemplo n.º 13
0
    def BPR_train(self, inputs, rating_df):
        """Train BPR on `inputs` and return item embeddings indexed by movie title."""
        model = BayesianPersonalizedRanking(factors=60)
        model.fit(inputs)

        item_vectors = model.item_factors

        # Map category codes back to their movie titles.
        id_to_title = dict(self.movie_df['title'].items())
        titles = [
            id_to_title[movie_id]
            for movie_id in rating_df['movie_id'].cat.categories
        ]

        # One embedding row per movie, keyed by title.
        return pd.DataFrame(item_vectors, index=titles)
Ejemplo n.º 14
0
    def bpr(self, database, **kwargs):
        """Benchmark implicit's BPR on `database`; returns (elapsed, mem_info)."""
        from implicit.bpr import BayesianPersonalizedRanking

        options = self.get_option('implicit', 'bpr', **kwargs)
        model = BayesianPersonalizedRanking(**options)
        ratings = self.get_database(database, **kwargs)
        if kwargs.get('return_instance_before_train'):
            return (model, ratings)

        elapsed, mem_info = self.run(model.fit, ratings)
        # Drop the trained model so its factors don't skew memory accounting.
        model = None
        return elapsed, mem_info
Ejemplo n.º 15
0
    def initialize_components(self):
        """Instantiate every component recommender used by the ensemble."""

        # Re-weight interactions by how recently they were inserted.
        self.train = self.rescale_wrt_insertion_order(self.train)

        # Item/user collaborative filtering via cosine similarity.
        self.item_cosineCF_recommender = Cosine_Similarity(self.train, topK=200, shrink=15, normalize=True, mode='cosine')
        self.user_cosineCF_recommender = Cosine_Similarity(self.train.T, topK=200, shrink=15, normalize=True, mode='cosine')
        self.svd_recommender = PureSVDRecommender(self.train)
        # Content-based components built on the ICM.
        self.cbf_bpr_recommender = SLIM_BPR_Cython(self.icm.T, positive_threshold=0)
        self.cbf_recommender = Cosine_Similarity(self.icm.T, topK=50, shrink=10, normalize=True, mode='cosine')
        self.item_rp3b_recommender = RP3betaRecommender(self.train)
        self.user_rp3b_recommender = RP3betaRecommender(self.train.T)
        # Matrix-factorization components.
        self.bpr_mf = BPR_matrix_factorization(factors=800, regularization=0.01, learning_rate=0.01, iterations=300)
        self.ials_cg_mf = IALS_CG(iterations=15, calculate_training_loss=True, factors=500, use_cg=True, regularization=1e-3)
        self.lightfm = LightFM_Recommender(self.train, self.icm, no_components=200)
Ejemplo n.º 16
0
def BPR(A: sp.coo_matrix, factors: int, lr: float, regularization: float,
        iterations: int):
    '''
    Run BPR (Bayesian Personalized Ranking) from implicit feedback.

    :param A: user x item interaction matrix
    :param factors: embedding size
    :param lr: learning rate
    :param regularization: regularization parameter
    :param iterations: number of training updates
    :return: dense user x item score matrix (user_factors . item_factors^T)
    '''
    bpr = BayesianPersonalizedRanking(factors=factors,
                                      learning_rate=lr,
                                      regularization=regularization,
                                      use_gpu=True,
                                      iterations=iterations,
                                      verify_negative_samples=True,
                                      num_threads=10)
    # implicit expects item x user orientation, hence the transpose.
    bpr.fit(A.T)

    # Last one is the bias term. However user_bias is 1 (not used) so a simple dot product works.
    item_factors = bpr.item_factors
    user_factors = bpr.user_factors
    return user_factors.dot(item_factors.T)
Ejemplo n.º 17
0
 def __init__(self):
     """Initialize the recommender with an untrained BPR model."""
     super().__init__()
     # These are populated by fit().
     self.URM = None
     self.item_factors = None
     self.user_factors = None
     self.model = BayesianPersonalizedRanking(factors=1000, num_threads=8, verify_negative_samples=True)
Ejemplo n.º 18
0
class MF_kNN(Model):
    """Matrix-factorization + Annoy kNN recommender.

    Trains implicit-BPR item factors, indexes them with Annoy, and
    recommends by blending similarity scores of the user's last-N items.
    """

    def __init__(self, config):
        # Keys that must be present in `config`.
        self.requirement = ['test_file', 'lastN', 'topN', 'train_file', 'index_file_file']
        self.config = config
        miss = set()
        for item in self.requirement:
            if item not in self.config:
                miss.add(item)
        if len(miss) > 0:
            raise Exception(f"Miss the key : {miss}")

        Model.__init__(self,
                self.config['test_file'],
                self.config['lastN'],
                self.config['topN']
            )

        # When True, train() reloads a previously saved index instead of training.
        self.reload = self.config.get('reload', False)

    def __get_id(self, item):
        """Return the integer id for a raw item, assigning a fresh one if unseen."""
        if item in self.item_idx:
            _id = self.item_idx[item]
        else:
            _id = len(self.item_idx)
            self.item_idx[item] = _id
            self.item_idx_reverse[_id] = item

        return _id

    def train(self):
        """Train the BPR model or reload a saved index, then expose an Annoy ANN index."""
        b_time = time.time()
        self.item_idx, self.item_idx_reverse = {}, {}

        if self.reload:
            with open(self.config['item_vec_file'], 'r') as in_f:
                # Header line: "<num_items> <dim>"
                num_items, dim = in_f.readline().strip().split()
                print(f'Num of items : {num_items}, dim : {dim}')

                for idx, line in tqdm(enumerate(in_f)):
                    tmp = line.split()
                    item = tmp[0]
                    self.item_idx[item] = idx
                    self.item_idx_reverse[idx] = item

                self.t = AnnoyIndex(int(dim), 'angular')
                # BUG FIX: `file_name` was read before assignment here; the
                # saved index path comes from the config.
                file_name = self.config['index_file_file']
                self.t.load(f'{file_name}.ann')
        else:
            Y = []
            with open(self.config['train_file'], 'r') as in_f:
                for idx, line in tqdm(enumerate(in_f)):
                    items_list = line.strip().split()
                    Y.append([self.__get_id(item) for item in items_list])
            # Construct the sparse user x item matrix in CSR form.
            indptr = np.fromiter(chain((0,), map(len, Y)), int, len(Y) + 1).cumsum()
            indices = np.fromiter(chain.from_iterable(Y), int, indptr[-1])
            data = np.ones_like(indices)
            user_item_table_csr = csr_matrix((data, indices, indptr))
            item_user_table_csr = user_item_table_csr.T.tocsr()
            print('Matrix size : ', item_user_table_csr.shape)
            print("Train finished ... : ", time.time() - b_time)

            # Train the matrix-factorization model.
            model_name = "bpr"
            self.model = BayesianPersonalizedRanking(num_threads=20)
            # BUG FIX: printf-style args were passed to print(); format explicitly.
            print(f"training model {model_name}")
            start = time.time()
            self.model.fit(item_user_table_csr)
            print(f"trained model '{model_name}' in {time.time() - start}")

            items_count, dim = self.model.item_factors.shape
            # Build the Annoy index over the learned item embeddings.
            self.t = AnnoyIndex(int(dim), 'angular')

            # BUG FIX: vectors are written here, so the file must be opened for
            # writing — and via self.config, not a module-level `config`.
            with open(self.config['item_vec_file'], 'w') as out_f:
                print(f"{items_count} {dim}", file=out_f)
                for idx, vec in tqdm(enumerate(self.model.item_factors)):
                    self.t.add_item(idx, vec)
                    print(f"{self.item_idx_reverse[idx]} {' '.join(vec.astype(str))}", file=out_f)

            file_name = self.config['index_file_file']
            self.t.build(30)  # number of Annoy trees
            self.t.save(f'{file_name}.ann')

        print(f"Train finished ...{time.time() - b_time}")

    def predict(self, last_n_events, topN):
        """Recommend `topN` items from the user's most recent events.

        Entries of `last_n_events` look like "<prefix>:<item>" with the newest
        event last. Returns a list of raw item ids, best first (possibly empty).
        """
        item_similar = list()
        candidate_items = set()

        # Newest first; keep only items seen during training.
        last_n_items = [self.item_idx[e.split(':', 1)[1]] for e in last_n_events[::-1] if e.split(':', 1)[1] in self.item_idx]

        if len(last_n_items) == 0:
            return []

        for item_idx in last_n_items:
            similar_res = self.__item_topK_similar(item_idx, topN)
            item_similar.append(similar_res)
            candidate_items.update(set(similar_res.keys()))

        candidate_list = list(candidate_items)
        score_matrix = np.zeros((len(last_n_items), len(candidate_list)))
        for i, item_id in enumerate(last_n_items):
            score_matrix[i] = self.__item_item_arr_norm_score(item_id, candidate_list, item_similar[i])

        # More recent items get a larger (log-discounted) weight.
        rank_weight = np.array([1 / np.log2(rank + 2) for rank in range(len(last_n_items))])
        final_score = rank_weight.dot(score_matrix).tolist()

        final_items = sorted(zip(candidate_list, final_score), key=lambda x: x[1], reverse=True)
        return [item for item, score in final_items[:topN]]

    def __item_topK_similar(self, given_idx, topK):
        """Top-K Annoy neighbours of `given_idx`, mapped raw-id -> cosine score."""
        item_idx_arr, score_arr = self.t.get_nns_by_item(given_idx, topK, include_distances=True)
        res = {}
        for idx, score in zip(item_idx_arr, score_arr):
            # BUG FIX: narrow the bare `except:` to the lookup that can fail.
            try:
                item_raw = self.item_idx_reverse[idx]
            except KeyError:
                continue
            if item_raw not in res:
                # Annoy's angular distance d satisfies d^2 = 2 - 2*cos,
                # so convert back to cosine similarity.
                res[item_raw] = 1 - score**2 / 2

        return res

    def __item_item_arr_norm_score(self, item, candidate_item_arr, similar_items):
        """Scores of `item` against each candidate, L2-normalized."""
        res = np.zeros(len(candidate_item_arr))
        for _item in similar_items:
            _score = similar_items[_item]
            if _item in candidate_item_arr:
                res[candidate_item_arr.index(_item)] = float(_score)
        norm = np.linalg.norm(res)
        # BUG FIX: guard against an all-zero row (division would yield NaNs).
        return res / norm if norm > 0 else res
Ejemplo n.º 19
0
def calculate_similar_movies(input_filename,
                             output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    """Train a recommender on user-item data and append top-k similar items to a CSV.

    :param input_filename: path of the user-item data file
    :param output_filename: CSV file the similarity batches are appended to
    :param model_name: one of "als", "bpr", "lmf", "tfidf", "cosine", "bm25"
    :param min_rating: kept for interface compatibility (filtering is disabled here)
    :param variant: kept for interface compatibility with the MovieLens variant
    """
    # read in the input data file
    start = time.time()

    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)

    # Build the item x user ratings matrix.
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'], (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True, iterations=20, calculate_training_loss=True)

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    k = 10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)

    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            # BUG FIX: write to the function's own `output_filename` parameter,
            # not a module-level `args` namespace that may not exist.
            similar_df_slice.to_csv(output_filename, mode='a', header=False, index=False)
            print("finsih a batch")
            progress.update(1)

    '''
Ejemplo n.º 20
0
 def _get_model(self):
     """Return a tiny CPU-only BPR model (3 factors, no regularization)."""
     model_kwargs = dict(factors=3, regularization=0, use_gpu=False)
     return BayesianPersonalizedRanking(**model_kwargs)
Ejemplo n.º 21
0
class HHimmlerEnsemble:
    """Ensemble combining BPR matrix factorization and IALS-CG recommenders."""

    def __init__(self, urm_train, urm_test, icm, parameters=None):
        """Store the (CSR) matrices and build the component models.

        :param urm_train: user-rating matrix used for training
        :param urm_test: user-rating matrix used for evaluation
        :param icm: item-content matrix
        :param parameters: optional per-component weight dict; defaults below
        """
        if parameters is None:
            parameters = {
                "USER_CF": 0.8,
                "USER_BPR": 0.7,
                "ITEM_CF": 1,
                "ITEM_BPR": 0.8,
                "CBF": 0.3,
                "IALS": 1.0,
                "CBF_BPR": 1
            }

        self.ensemble_weights = parameters
        self.train = urm_train.tocsr()
        self.test = urm_test.tocsr()
        self.icm = icm.tocsr()

        self.initialize_components()

    def initialize_components(self):
        """Instantiate the two component recommenders."""
        self.bpr_mf = BPR_matrix_factorization(factors=200,
                                               regularization=0.00000,
                                               learning_rate=0.01,
                                               iterations=65)
        self.ials_cg_mf = IALS_CG(iterations=15,
                                  calculate_training_loss=True,
                                  factors=500,
                                  use_cg=True,
                                  regularization=1e-3)

    def fit(self):
        """Train both components and snapshot their latent factor matrices."""
        self.bpr_mf.fit(self.train.T.tocoo())
        # IALS gets a confidence-scaled copy of the training matrix.
        self.ials_cg_mf.fit(40 * self.train.T)
        self.bpr_mf_latent_x = self.bpr_mf.user_factors.copy()
        self.bpr_mf_latent_y = self.bpr_mf.item_factors.copy()
        self.ials_cg_mf_latent_x = self.ials_cg_mf.user_factors.copy()
        self.ials_cg_mf_latent_y = self.ials_cg_mf.item_factors.copy()

    def recommend(self, user_id, combiner, at=10):
        """Combine component scores for `user_id` and return the top-`at` items."""
        bpr_mf_r = np.dot(self.bpr_mf_latent_x[user_id],
                          self.bpr_mf_latent_y.T).ravel()
        ials_cg_mf_r = np.dot(self.ials_cg_mf_latent_x[user_id],
                              self.ials_cg_mf_latent_y.T).ravel()

        scores = [
            # [bpr_mf_r, self.ensemble_weights["BPR_MF"], "BPR_MF"],
            [ials_cg_mf_r, 1, "IALS_CG"]
        ]

        # Mask items the user has already interacted with.
        for r in scores:
            self.filter_seen(user_id, r[0])

        return combiner.combine(scores, at)

    def filter_seen(self, user_id, scores):
        """Set the score of every item `user_id` already saw to a large negative value."""
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])

        user_profile = self.train.indices[start_pos:end_pos]

        scores[user_profile] = -1000000  #-np.inf
        return scores

    def recommend_batch(self, user_list, combiner, at=10):
        """Return an array with one row per user: [user_id, top-10 IALS items]."""
        res = np.array([])
        for user in user_list:
            # NOTE: the original also computed (and discarded) BPR-MF
            # recommendations here; that dead call was removed.
            ials = self.ials_cg_mf.recommend(userid=user,
                                             user_items=self.train,
                                             N=10)
            # BUG FIX: locals previously shadowed the builtins `list`/`tuple`.
            rec_items = np.array([pair[0] for pair in ials])
            row = np.concatenate(([user], rec_items))
            if res.size == 0:
                res = row
            else:
                res = np.vstack([res, row])
        return res

    def get_component_data(self):
        print('cyka')
Ejemplo n.º 22
0
class BMussoliniEnsemble:
    """Weighted ensemble of collaborative, content-based and latent-factor
    recommenders whose per-item score vectors are mixed by a `combiner`.

    Components: item/user cosine CF, PureSVD, SLIM-BPR on content, cosine
    CBF, IALS (conjugate gradient), BPR matrix factorization, item/user
    RP3beta and LightFM. Each contributes a score vector weighted by
    `ensemble_weights`.
    """

    def __init__(self, urm_train, urm_test, icm, parameters=None):
        """
        :param urm_train: training user-rating matrix (converted to CSR)
        :param urm_test:  held-out user-rating matrix (converted to CSR)
        :param icm:       item-content matrix (converted to CSR)
        :param parameters: optional dict of per-component mixing weights;
                           a weight of 0 disables that component's score.
        """
        if parameters is None:
            # Default weights — presumably from a hyper-parameter search.
            parameters = {
                "USER_CF" : 7,
                "SVD" : 26,
                "ITEM_CF" : 0,
                "ITEM_BPR" : 16,
                "CBF" : 7,
                "IALS" : 26,
                "CBF_BPR" : 64,
                "BPR_MF": 6,
                "ITEM_RP3B": 16,
                "USER_RP3B": 0,
                "FM": 10
            }

        self.ensemble_weights = parameters
        self.train = urm_train.tocsr()
        self.test = urm_test.tocsr()
        self.icm = icm.tocsr()
        # Ordered playlists used to weight interactions by insertion order
        # (the redundant `= None` pre-assignment was removed).
        self.sequential_playlists = load_sequential.load_train_sequential()
        self.initialize_components()

    def initialize_components(self):
        """Instantiate every component recommender (training happens in fit)."""
        # Reweight the URM so early-inserted songs count more; every
        # component then trains on this rescaled matrix.
        self.train = self.rescale_wrt_insertion_order(self.train)

        self.item_cosineCF_recommender = Cosine_Similarity(self.train, topK=200, shrink=15, normalize=True, mode='cosine')
        self.user_cosineCF_recommender = Cosine_Similarity(self.train.T, topK=200, shrink=15, normalize=True, mode='cosine')
        self.svd_recommender = PureSVDRecommender(self.train)
        self.cbf_bpr_recommender = SLIM_BPR_Cython(self.icm.T, positive_threshold=0)
        self.cbf_recommender = Cosine_Similarity(self.icm.T, topK=50, shrink=10, normalize=True, mode='cosine')
        self.item_rp3b_recommender = RP3betaRecommender(self.train)
        self.user_rp3b_recommender = RP3betaRecommender(self.train.T)
        self.bpr_mf = BPR_matrix_factorization(factors=800, regularization=0.01, learning_rate=0.01, iterations=300)
        self.ials_cg_mf = IALS_CG(iterations=15, calculate_training_loss=True, factors=500, use_cg=True, regularization=1e-3)
        self.lightfm = LightFM_Recommender(self.train, self.icm, no_components=200)

    def fit(self):
        """Train all components and cache their similarity/factor matrices."""
        self.svd_latent_x, self.svd_latent_y = self.svd_recommender.fit(num_factors=500)
        # Smallest SVD score, kept to shift ratings to a non-negative range.
        self.min_svd = np.dot(self.svd_latent_x, self.svd_latent_y).min()
        self.cbf_bpr_w = self.cbf_bpr_recommender.fit(epochs=10, topK=200, batch_size=20, sgd_mode='adagrad', learning_rate=1e-2)
        self.item_cosineCF_w = self.item_cosineCF_recommender.compute_similarity()
        self.user_cosineCF_w = self.user_cosineCF_recommender.compute_similarity()
        self.cbf_w = self.cbf_recommender.compute_similarity()
        self.item_rp3b_w = self.item_rp3b_recommender.fit()
        self.user_rp3b_w = self.user_rp3b_recommender.fit()
        # The 40x factor presumably acts as an implicit-feedback confidence
        # scaling for IALS — TODO confirm against the IALS_CG implementation.
        self.ials_cg_mf.fit(40*self.train.T)
        self.ials_latent_x = self.ials_cg_mf.user_factors.copy()
        self.ials_latent_y = self.ials_cg_mf.item_factors.copy()
        self.min_ials = np.dot(self.ials_latent_x, self.ials_latent_y.T).min()
        self.bpr_mf.fit(self.train.T.tocoo())
        self.bpr_mf_latent_x = self.bpr_mf.user_factors.copy()
        self.bpr_mf_latent_y = self.bpr_mf.item_factors.copy()
        self.lightfm.fit(100)

    def recommend(self, user_id, combiner, at=10):
        """Compute every component's score vector for `user_id`, mask items
        already seen, and let `combiner` merge them into the final ranking."""
        user_profile = self.train[user_id, :]

        svd_r = self.svd_latent_x[user_id, :].dot(self.svd_latent_y)
        item_cosineCF_r = user_profile.dot(self.item_cosineCF_w).toarray().ravel()
        user_cosineCF_r = self.user_cosineCF_w[user_id].dot(self.train).toarray().ravel()
        cbf_r = user_profile.dot(self.cbf_w).toarray().ravel()
        cbf_bpr_r = user_profile.dot(self.cbf_bpr_w).toarray().ravel()
        # FIX: add the min_ials shift to the resulting scores instead of
        # folding it into the factor matrix before the dot product (the old
        # form scaled the shift by the sum of the user's latent vector).
        # get_component_data already applies the shift this way.
        ials_r = np.dot(self.ials_latent_x[user_id], self.ials_latent_y.T).ravel() + self.min_ials
        bpr_mf_r = np.dot(self.bpr_mf_latent_x[user_id], self.bpr_mf_latent_y.T).ravel()
        item_rp3b_r = user_profile.dot(self.item_rp3b_w).toarray().ravel()
        user_rp3b_r = self.user_rp3b_w[user_id].dot(self.train).toarray().ravel()
        lightfm_r = self.lightfm.scores(user_id)

        scores = [
            # [item_bpr_r, self.ensemble_weights["ITEM_BPR"], "ITEM_BPR" ],
            # [user_bpr_r, self.ensemble_weights["USER_BPR"], "USER_BPR" ],
            [svd_r, self.ensemble_weights["SVD"], "SVD"],
            [item_cosineCF_r, self.ensemble_weights["ITEM_CF"], "ITEM_CF" ],
            [user_cosineCF_r, self.ensemble_weights["USER_CF"], "USER_CF" ],
            [ials_r, self.ensemble_weights["IALS"], "IALS" ],
            [cbf_r, self.ensemble_weights["CBF"], "CBF" ],
            [cbf_bpr_r, self.ensemble_weights["CBF_BPR"], "CBF_BPR"],
            [bpr_mf_r, self.ensemble_weights["BPR_MF"], "BPR_MF"],
            [item_rp3b_r, self.ensemble_weights["ITEM_RP3B"], "ITEM_RP3B"],
            [user_rp3b_r, self.ensemble_weights["USER_RP3B"], "USER_RP3B"],
            [lightfm_r, self.ensemble_weights["FM"], "FM"]
            ]

        # Mask already-seen items in place on every component's vector.
        for r in scores:
            self.filter_seen(user_id, r[0])

        R = combiner.combine(scores, at)
        return R

    def rescale_wrt_insertion_order(self, R):
        """Return a copy of R where songs of sequential playlists are weighted
        by position: song k gets factor 1/k**POPULARITY_SCALING_EXP (a
        module-level constant), so earlier insertions count more."""
        R = R.copy()
        R = R.tolil()  # LIL supports the per-cell assignments below
        R = R*0.8
        for i in self.sequential_playlists:
            pl = i["id"]
            k = 1  # 1-based position of the song inside the playlist
            for j in i["songs"]:
                factor = 1/(k**POPULARITY_SCALING_EXP)
                R[pl, j] = factor*(R[pl,j] + 0.2)
                k += 1
        R = R.tocsr()
        return R

    def filter_seen(self, user_id, scores):
        """Set the score of every item in the user's training profile to a
        large negative value so it cannot be recommended again (in place)."""
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])

        user_profile = self.train.indices[start_pos:end_pos]

        scores[user_profile] = -1000000 #-np.inf
        return scores

    def recommend_batch(self, user_list, combiner, at=10):
        """Stack [user_id, rec_1, ..., rec_at] rows for every user in
        `user_list`; a single user yields a 1-D row, as before."""
        res = np.array([])
        for i in user_list:
            recList = self.recommend(i, combiner, at).T
            row = np.concatenate(([i], recList))  # renamed: no builtin shadowing
            if (res.size == 0):
                res = row
            else:
                res = np.vstack([res, row])
        return res

    def get_component_data(self):
        """Return min/max/mean statistics of each weighted component's full
        rating matrix — useful to compare score scales before combining."""
        item_cf_rating = self.ensemble_weights["ITEM_CF"]*self.train.dot(self.item_cosineCF_w)

        item_cf = {

                "min" : item_cf_rating.min(),
                "max" : item_cf_rating.max(),
                "mean" : item_cf_rating.mean(),

            }
        del item_cf_rating  # free the dense intermediate before the next one

        user_cf_rating = self.ensemble_weights["USER_CF"]*self.user_cosineCF_w.dot(self.train)

        user_cf = {
                "min": user_cf_rating.min(),
                "max": user_cf_rating.max(),
                "mean": user_cf_rating.mean(),
            }
        del user_cf_rating
        ials_rating =  self.ensemble_weights["IALS"]*(np.dot(self.ials_latent_x, self.ials_latent_y.T)+self.min_ials)

        ials = {

                "min": ials_rating.min(),
                "max": ials_rating.max(),
                "mean": np.mean(ials_rating),
            }
        del ials_rating
        cbf_rating = self.ensemble_weights["CBF"]*self.train.dot(self.cbf_w)
        cbf = {

                "min": cbf_rating.min(),
                "max": cbf_rating.max(),
                "mean": cbf_rating.mean(),
            }
        del cbf_rating
        cbf_bpr_rating = self.ensemble_weights["CBF_BPR"]*self.train.dot(self.cbf_bpr_w)
        cbf_bpr = {

                "min": cbf_bpr_rating.min(),
                "max": cbf_bpr_rating.max(),
                "mean": cbf_bpr_rating.mean(),
            }
        del cbf_bpr_rating
        svd_ratings = self.ensemble_weights["SVD"] * (np.dot(self.svd_latent_x, self.svd_latent_y) + self.min_svd)

        svd = {

            "min": svd_ratings.min(),
            "max": svd_ratings.max(),
            "mean": svd_ratings.mean(),
        }
        del svd_ratings


        return {
            "ITEM_CF" : item_cf,
            "USER_CF": user_cf ,
            "SVD" : svd ,
            "IALS" : ials,
            "CBF" : cbf,
            "CBF_BPR" : cbf_bpr
        }
Ejemplo n.º 23
0
while (time.time() - start) / 60 / 60 < RUN_LIMIT_HOURS:
    print(str(timedelta(seconds=time.time() - start)), ' -- config #',
          len(performance_list) + 1, ' >> training starting...')
    aux_time = time.time()

    # hyperparameters
    factors = 25 * np.random.randint(1, 31)  # 25, 50, 75, ... , 750
    learning_rate = (10**(-np.random.randint(2, 5))) * np.random.randint(1, 10)
    regularization = (10**(-np.random.randint(2, 5))) * np.random.randint(
        1, 10)
    iterations = 25 * np.random.randint(1, 31)  # 25, 50, 75, ... , 750

    alg = BayesianPersonalizedRanking(num_threads=NUM_THREADS,
                                      factors=factors,
                                      learning_rate=learning_rate,
                                      regularization=regularization,
                                      iterations=iterations)
    alg.fit(data_to_fit)

    perf_ndcg_at_100 = []
    rec_list = []

    print(' >> took ', str(timedelta(seconds=time.time() - aux_time)))
    print(str(timedelta(seconds=time.time() - start)), ' -- config #',
          len(performance_list) + 1, ' >> evaluation starting...')
    aux_time = time.time()

    with Pool(NUM_THREADS) as p:
        perf_ndcg_at_100 = p.map(
            paralelize_ndcg,
Ejemplo n.º 24
0
# Build the user/item playtime matrix from disk.
df_matrix = read_user_item_playtime(DATA_FILEPATH)

# Map positional item indices back to the original column labels.
# NOTE(review): `.cat.categories` returns labels in SORTED order, not in the
# DataFrame's column order — verify this matches how the model indexes items.
index2item = pd.Series(list(df_matrix.columns.values), dtype="category").cat.categories

# Normalize raw hours so play time is comparable across users/items.
df_scaled_matrix = normalize_hours_matrix(df_matrix)

# compress matrix
csr_df_matrix = csr_matrix(df_scaled_matrix)
np.random.seed()

# Split interactions and train BPR on the transposed (item x user) matrix.
user_item_train, user_item_test = train_test_split(csr_df_matrix, train_percentage=train_percent)
bpr = BayesianPersonalizedRanking(iterations=train_interactions)
bpr.fit(user_item_train.T.tocsr())

print(user_item_train[user_id])
# Column indices of the items this user has already interacted with.
interacted_ids = user_item_train[user_id].nonzero()[1]
index2item = index2item.astype('int32')

# Resolve interacted indices to catalogue items, skipping unknown ids.
interacted_items = [item_mapping[index2item[index]] for index in interacted_ids if
                    index2item[index] in item_mapping.keys()]

# it returns the recommended index and their corresponding score
reco = bpr.recommend(user_id, user_item_train, N=topn)
print(reco)

# map the index to Item
reco_items = [item_mapping[index2item[index]] for index, _ in reco if index2item[index] in item_mapping.keys()]
Ejemplo n.º 25
0
 def test_fit_almost_empty_matrix(self):
     """Training on a matrix containing a single interaction must not crash."""
     interactions = [[0, 0, 0], [0, 1, 0], [0, 0, 0]]
     model = BayesianPersonalizedRanking(use_gpu=False)
     return model.fit(csr_matrix(interactions), show_progress=False)
Ejemplo n.º 26
0
#%% [markdown]
# ### Bayesian Personalized Ranking

#%%
from implicit.bpr import BayesianPersonalizedRanking

# Hyper-parameters for the BPR model (only the factor count is overridden).
params = {"factors": 63}

#%%
import logging
import tqdm
import time
import codecs

#%%
model = BayesianPersonalizedRanking(**params)

#%%
model_name = 'bpr'
output_filename = 'subreddits_recs_bpr'

#%%
# Train on `comments` (defined in an earlier notebook cell). NOTE(review):
# implicit expects an item x user matrix here — confirm its orientation.
model.fit(comments)

#%%
def bpr_related_subreddits(subreddit):
    found = np.where(subreddits == subreddit)
    if len(found[0]) == 0:
        raise ValueError("Subreddit doesn't exist in the dataset.")
    _id = found[0][0]
Ejemplo n.º 27
0
 def _get_model(self):
     """Return a tiny seeded BPR model for GPU tests: 3 factors, no
     regularization, fixed random_state for reproducible runs."""
     return BayesianPersonalizedRanking(factors=3,
                                        regularization=0,
                                        use_gpu=True,
                                        random_state=42)
Ejemplo n.º 28
0
def test_implicit_bpr(Rtr, Rts, k=20):
    """Fit a k-factor BPR model on the transposed training matrix and return it.

    `Rts` is accepted for signature compatibility but not used here.
    """
    from implicit.bpr import BayesianPersonalizedRanking
    model = BayesianPersonalizedRanking(k)
    model.fit(Rtr.T)
    return model