Example 1
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight these models with bm25_weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over it (for instance 'Graffiti Bridge'
                # has no ratings > 4, meaning we've filtered out all data for it).
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
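
The loop above relies on the older similar_items return format (an iterable of (item, score) tuples). Below is a minimal, self-contained sketch of the same similar-items lookup under the assumption of implicit >= 0.5, where fit() takes a user-item CSR matrix and similar_items() returns parallel arrays of ids and scores; the toy data and parameter values are illustrative only.

import numpy as np
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking

# toy implicit-feedback matrix: rows are users, columns are items
rng = np.random.default_rng(42)
user_items = csr_matrix((rng.random((50, 20)) > 0.8).astype(np.float32))

model = BayesianPersonalizedRanking(factors=8, iterations=10, random_state=42)
model.fit(user_items, show_progress=False)

item_id = 0
ids, scores = model.similar_items(item_id, N=5)  # parallel arrays in implicit >= 0.5
for other, score in zip(ids, scores):
    print("%s\t%s\t%s" % (item_id, other, score))
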
Example 2
 def __init__(self, *args, **kwargs):
     """
     Construct a BPR recommender.  The arguments are passed as-is to
     :py:class:`implicit.bpr.BayesianPersonalizedRanking`.
     """
     from implicit.bpr import BayesianPersonalizedRanking
     super().__init__(BayesianPersonalizedRanking(*args, **kwargs))
Example 3
    def train(self):
        b_time = time.time()
        self.item_idx, self.item_idx_reverse = {}, {}

        if self.reload:
            with open(self.config['item_vec_file'], 'r') as in_f:
                num_items, dim = in_f.readline().strip().split()
                print(f'Num of items : {num_items}, dim : {dim}')
                self.t = AnnoyIndex(int(dim), 'angular')
                
                for idx, line in tqdm(enumerate(in_f)):
                    tmp = line.split()
                    item = tmp[0]
                    self.item_idx[item] = idx
                    self.item_idx_reverse[idx] = item

                file_name = self.config['index_file_file']
                self.t.load(f'{file_name}.ann')
        else:
            Y = []
            with open(self.config['train_file'], 'r') as in_f:
                for idx, line in tqdm(enumerate(in_f)):
                    items_list = line.strip().split()
                    Y.append([self.__get_id(item) for item in items_list])
            # construct the sparse matrix
            indptr = np.fromiter(chain((0,), map(len, Y)), int, len(Y) + 1).cumsum()
            indices = np.fromiter(chain.from_iterable(Y), int, indptr[-1])
            data = np.ones_like(indices)
            user_item_table_csr = csr_matrix((data, indices, indptr))
            item_user_table_csr = user_item_table_csr.T.tocsr()
            print('Matrix size : ', item_user_table_csr.shape)
            print("Train finished ... : ", time.time() - b_time)

            # Train MF
            model_name = "bpr"
            self.model = BayesianPersonalizedRanking(num_threads=20)
            print("training model %s", model_name)
            start = time.time()
            self.model.fit(item_user_table_csr)
            print("trained model '%s' in %s", model_name, time.time() - start)
            print("calculating top movies")

            items_count, dim = self.model.item_factors.shape
            # Build Ann
            self.t = AnnoyIndex(int(dim), 'angular')
            
            with open(self.config['item_vec_file'], 'w') as out_f:
                print(f"{items_count} {dim}", file=out_f)
                for idx, vec in tqdm(enumerate(self.model.item_factors)):
                    self.t.add_item(idx, vec)
                    print(f"{self.item_idx_reverse[idx]} {' '.join(vec.astype(str))}", file=out_f)

            print("Read file finished ...")
            file_name = self.config['index_file_file']

            self.t.build(30)  # 30 trees
            self.t.save(f'{file_name}.ann')

        print(f"Train finished ...{time.time() - b_time}")
Example 4
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # let's weight these models with bm25_weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict(
        (i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']),
                         key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over it (for instance 'Graffiti Bridge'
            # has no ratings > 4, meaning we've filtered out all data for it).
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
Example 5
def _train_bpr(hyperparameters, train):
    h = hyperparameters
    model = BayesianPersonalizedRanking(factors=h['factors'],
                                        iterations=h['n_iter'],
                                        num_threads=nproc)

    model.fit(train)
    #    test_eval = {'p@k': precision_at_k(model, train.T.tocsr(), factorization.T.tocsr(), K=10)}
    #    val_eval = {'p@k': precision_at_k(model, train.T.tocsr(), validation.T.tocsr(), K=10)}
    return model
Example 6
    def bpr(self, database, **kwargs):
        from implicit.bpr import BayesianPersonalizedRanking
        opts = self.get_option('implicit', 'bpr', **kwargs)
        model = BayesianPersonalizedRanking(
            **opts
        )
        ratings = self.get_database(database, **kwargs)
        if kwargs.get('return_instance_before_train'):
            return (model, ratings)

        elapsed, mem_info = self.run(model.fit, ratings)
        model = None
        return elapsed, mem_info
Example 7
    def fit(user_item_matrix,
            n_factors=20,
            regularization=0.001,
            iterations=15,
            num_threads=4):
        """Обучает BPR"""

        model = BayesianPersonalizedRanking(factors=n_factors,
                                            regularization=regularization,
                                            iterations=iterations,
                                            num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model
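
The .T.tocsr() above reflects the older implicit API in which fit() expected an item-user matrix. A minimal sketch of the equivalent call, assuming implicit >= 0.5 where fit() takes the user-item matrix directly (toy data for illustration):

import numpy as np
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking

user_item_matrix = np.array([[1, 0, 1],
                             [0, 1, 1],
                             [1, 1, 0]])  # rows = users, columns = items

model = BayesianPersonalizedRanking(factors=4, regularization=0.001, iterations=15)
model.fit(csr_matrix(user_item_matrix), show_progress=False)
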
Example 8
def get_aucs_vs_factors():
    factors = [8, 16, 32, 64, 128]
    params_list = [{"factors": factor} for factor in factors]

    aucs = []

    for params in params_list:
        model = BayesianPersonalizedRanking(**params)
        model.fit(comments)
        aucs.append(
            auc(test_set[:20000], model.user_factors, model.item_factors,
                subreddits, users))

    return aucs
Example 9
def train_implicit_bpr(
    train_df: pd.DataFrame, params: Dict[str, Union[str, float, int]],
    shape: Tuple[int, int]
) -> Tuple[BayesianPersonalizedRanking, sp.csr.csr_matrix]:

    train_matrix = create_implicit_train_matrix(train_df, shape)

    model_params = dict()
    args = inspect.getfullargspec(BayesianPersonalizedRanking.__init__)[0]
    for param, param_val in params.items():
        if param in args:
            model_params[param] = param_val

    model = BayesianPersonalizedRanking(**model_params)
    model.fit(train_matrix, show_progress=False)

    return model, train_matrix
Example 10
def evaluate_bpr_model(hyperparameters, train, test, validation):
    h = hyperparameters

    model = BayesianPersonalizedRanking(factors=h['factors'],
                                        iterations=h['n_iter'],
                                        num_threads=nproc)

    model.fit(train)
    test_eval = {
        'p@k': precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10)
    }
    val_eval = {
        'p@k': precision_at_k(model,
                              train.T.tocsr(),
                              validation.T.tocsr(),
                              K=10)
    }
    return test_eval, val_eval
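
A hedged, self-contained sketch of how this evaluator might be driven, assuming the older implicit API where fit() takes an item-user matrix while implicit.evaluation.precision_at_k takes user-item matrices (which is why the function transposes). The toy data, split ratios, and the nproc value are illustrative assumptions, and precision_at_k is the import the function above relies on.

import numpy as np
from scipy.sparse import csr_matrix
from implicit.evaluation import train_test_split, precision_at_k

nproc = 4  # placeholder for the global thread count used by evaluate_bpr_model

rng = np.random.default_rng(0)
item_users = csr_matrix((rng.random((200, 1000)) > 0.95).astype(np.float32))

train, rest = train_test_split(item_users, train_percentage=0.8)
test, validation = train_test_split(rest, train_percentage=0.5)

test_eval, val_eval = evaluate_bpr_model({'factors': 32, 'n_iter': 50},
                                         train, test, validation)
print(test_eval, val_eval)
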
Example 11
    def fit(user_item_matrix,
            n_factors=20,
            regularization=0.001,
            iterations=15,
            num_threads=0):
        """Обучает ALS"""

        # model = AlternatingLeastSquares(factors=n_factors,
        model = BayesianPersonalizedRanking(
            factors=n_factors,
            regularization=regularization,
            iterations=iterations,
            # calculate_training_loss=True,
            num_threads=num_threads,
        )

        model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=False)
        print('LOG = 005')
        return model
Example 12
    def BPR_train(self, inputs, rating_df):
        model = BayesianPersonalizedRanking(factors=60)
        model.fit(inputs)

        # user_embeddings = model.user_factors
        movie_embeddings = model.item_factors

        # map movie ids to titles
        id_title_dict = {k: v for k, v in self.movie_df['title'].items()}
        title = [
            id_title_dict[movie_id]
            for movie_id in rating_df['movie_id'].cat.categories
        ]

        # movie embedding
        movie_embedding_df = pd.DataFrame(movie_embeddings, index=title)
        # user_names = [user_id for user_id in rating_df['user_id'].cat.categories]
        # user_embedding_df = pd.DataFrame(user_embeddings, index=user_names)

        return movie_embedding_df  # , user_embedding_df
Example 13
def BPR(A: sp.coo_matrix, factors: int, lr: float, regularization: float,
        iterations: int):
    '''
    Run BayesianPersonalizedRanking - BPR: Bayesian Personalized Ranking from Implicit Feedback.
    :param A: user x item matrix
    :param factors: embedding size
    :param lr: learning rate
    :param regularization: regularization parameter
    :param iterations: how many training updates
    '''
    bpr = BayesianPersonalizedRanking(factors=factors,
                                      learning_rate=lr,
                                      regularization=regularization,
                                      use_gpu=True,
                                      iterations=iterations,
                                      verify_negative_samples=True,
                                      num_threads=10)
    bpr.fit(A.T)

    # The last column of item_factors is the item bias term; the matching column of
    # user_factors is all ones, so a plain dot product already includes the bias.
    item_factors = bpr.item_factors
    user_factors = bpr.user_factors
    return user_factors.dot(item_factors.T)
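
The dense user x item score matrix returned above is usually reduced to per-user top-N lists. A small illustrative helper (hypothetical, not part of the snippet above) that masks already-seen interactions and keeps the N highest-scoring item indices per user:

import numpy as np
import scipy.sparse as sp

def top_n_from_scores(scores: np.ndarray, A: sp.coo_matrix, n: int = 10) -> np.ndarray:
    """Return an (n_users, n) array with the top-n item indices per user."""
    masked = scores.copy()
    masked[A.tocsr().nonzero()] = -np.inf      # never recommend items already interacted with
    return np.argsort(-masked, axis=1)[:, :n]  # highest scores first
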
Example 14
while (time.time() - start) / 60 / 60 < RUN_LIMIT_HOURS:
    print(str(timedelta(seconds=time.time() - start)), ' -- config #',
          len(performance_list) + 1, ' >> training starting...')
    aux_time = time.time()

    # hyperparameters
    factors = 25 * np.random.randint(1, 31)  # 25, 50, 75, ... , 750
    learning_rate = (10**(-np.random.randint(2, 5))) * np.random.randint(1, 10)
    regularization = (10**(-np.random.randint(2, 5))) * np.random.randint(
        1, 10)
    iterations = 25 * np.random.randint(1, 31)  # 25, 50, 75, ... , 750

    alg = BayesianPersonalizedRanking(num_threads=NUM_THREADS,
                                      factors=factors,
                                      learning_rate=learning_rate,
                                      regularization=regularization,
                                      iterations=iterations)
    alg.fit(data_to_fit)

    perf_ndcg_at_100 = []
    rec_list = []

    print(' >> took ', str(timedelta(seconds=time.time() - aux_time)))
    print(str(timedelta(seconds=time.time() - start)), ' -- config #',
          len(performance_list) + 1, ' >> evaluation starting...')
    aux_time = time.time()

    with Pool(NUM_THREADS) as p:
        perf_ndcg_at_100 = p.map(
            paralelize_ndcg,
Example 15
def calculate_similar_movies(input_filename,
                             output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    # read in the input data file
    start = time.time()
    # titles, ratings = get_movielens(variant)

    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)

    #user_item_df = user_item_df.sort_values(by=['user_index','item_index'])
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'], (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)
    '''
    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    '''

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True, iterations=20, calculate_training_loss=True)

        # let's weight these models with bm25_weight (left commented out below).
        log.debug("weighting matrix by bm25_weight")
        # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    k = 10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)

    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            similar_df_slice.to_csv(output_filename, mode='a', header=False, index=False)
            print("finsih a batch")
            progress.update(1)

    '''
Example 16
 def _get_model(self):
     return BayesianPersonalizedRanking(factors=3,
                                        regularization=0,
                                        use_gpu=False)
Example 17
 def __init__(self):
     super().__init__()
     self.URM = None
     self.item_factors = None
     self.user_factors = None
     self.model = BayesianPersonalizedRanking(factors=1000, num_threads=8, verify_negative_samples=True)
Example 18
 def test_fit_almost_empty_matrix(self):
     raw = [[0, 0, 0], [0, 1, 0], [0, 0, 0]]
     return BayesianPersonalizedRanking(use_gpu=False).fit(
         csr_matrix(raw), show_progress=False)
Example 19
# Read matrix item/playtime
df_matrix = read_user_item_playtime(DATA_FILEPATH)

# Create index for items
index2item = pd.Series(list(df_matrix.columns.values), dtype="category").cat.categories

# Create normalized hours matrix
df_scaled_matrix = normalize_hours_matrix(df_matrix)

# compress matrix
csr_df_matrix = csr_matrix(df_scaled_matrix)
np.random.seed()

# Train
user_item_train, user_item_test = train_test_split(csr_df_matrix, train_percentage=train_percent)
bpr = BayesianPersonalizedRanking(iterations=train_interactions)
bpr.fit(user_item_train.T.tocsr())

print(user_item_train[user_id])
interacted_ids = user_item_train[user_id].nonzero()[1]
index2item = index2item.astype('int32')

interacted_items = [item_mapping[index2item[index]] for index in interacted_ids if
                    index2item[index] in item_mapping.keys()]

# it returns the recommended index and their corresponding score
reco = bpr.recommend(user_id, user_item_train, N=topn)
print(reco)

# map the index to Item
reco_items = [item_mapping[index2item[index]] for index, _ in reco if index2item[index] in item_mapping.keys()]
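
The recommend() call above follows the older implicit API, which takes the full training matrix and yields (index, score) tuples. Under the assumption of implicit >= 0.5, the same lookup would pass only the user's own row and unpack two parallel arrays; a sketch reusing the names defined above (bpr, user_id, user_item_train, index2item, item_mapping, topn):

# sketch only; not part of the original script
ids, scores = bpr.recommend(user_id, user_item_train[user_id], N=topn)
reco_items = [item_mapping[index2item[i]] for i in ids if index2item[i] in item_mapping]
print(list(zip(ids, scores)))
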
Example 20
#%% [markdown]
# ### Bayesian Personalized Ranking

#%%
from implicit.bpr import BayesianPersonalizedRanking

params = {"factors": 63}

#%%
import logging
import tqdm
import time
import codecs

#%%
model = BayesianPersonalizedRanking(**params)

#%%
model_name = 'bpr'
output_filename = 'subreddits_recs_bpr'

#%%
model.fit(comments)


#%%
def bpr_related_subreddits(subreddit):
    found = np.where(subreddits == subreddit)
    if len(found[0]) == 0:
        raise ValueError("Subreddit doesn't exist in the dataset.")
    _id = found[0][0]
Example 21
 def _get_model(self):
     return BayesianPersonalizedRanking(factors=3,
                                        regularization=0,
                                        use_gpu=True,
                                        random_state=42)
Example 22
def test_implicit_bpr(Rtr, Rts, k=20):
    from implicit.bpr import BayesianPersonalizedRanking
    bpr = BayesianPersonalizedRanking(k)  # k latent factors
    bpr.fit(Rtr.T)
    return bpr