def fit_bm25_recommender(user_item_matrix):
    """Fit a BM25 recommender on the transpose of ``user_item_matrix``.

    Translated from the original docstring: trains a model that recommends
    items from among the items the user has already bought.

    :param user_item_matrix: input handed to ``csr_matrix``; it is transposed
        and converted to CSR again before being passed to ``fit``.
    :return: the ``BM25Recommender`` instance after ``fit`` has been called.
    """
    recommender = BM25Recommender(K=6, K1=1.2, B=.76, num_threads=0)

    # The model is trained on the transposed matrix, stored in CSR form.
    transposed = csr_matrix(user_item_matrix).T.tocsr()
    recommender.fit(transposed)

    return recommender
Ejemplo n.º 2
0
def main():
    """Evaluate a BM25 and an ALS model, blend the results and write ``sub.csv``.

    Settings come from ``get_args()``. The data returned by ``read_data`` is
    deep-copied before being handed to ``evaluate``; the evaluation result is
    blended by ``mix_solutions`` and the per-user predictions are written as a
    two-column CSV (``user_id``, space-joined ``predictions``).
    """
    args = get_args()
    set_seeds(args.seed)

    data = read_data(root=args.data_path)

    # evaluate() works on its own copy of the data.
    data_copy = c.deepcopy(data)
    candidate_models = [
        BM25Recommender(K=args.bm25_k),
        AlternatingLeastSquares(factors=args.als_factors,
                                iterations=args.als_iters),
    ]
    result = evaluate(data=data_copy,
                      smoothing=args.smoothing,
                      models_list=candidate_models,
                      r=args.rating,
                      N=args.top_k)

    blend_rates = [5.5, 6]
    top_k = 100

    predictions = mix_solutions(result=result,
                                rates=blend_rates,
                                pictures_num_to_leave=top_k)

    # One row per user; the former index becomes the ``user_id`` column.
    submission = pd.DataFrame.from_dict(predictions).T.reset_index()
    submission = submission.rename(columns={'index': 'user_id'})
    submission = submission.sort_values('user_id')

    def join_row(row):
        return ' '.join(map(str, row))

    # Columns 0..top_k-1 hold the ranked predictions for each user.
    prediction_columns = list(range(top_k))
    submission['predictions'] = submission[prediction_columns].apply(
        join_row, axis=1)
    submission[['user_id', 'predictions']].to_csv('sub.csv', index=False)
Ejemplo n.º 3
0
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Train a recommender on MovieLens ratings and dump similar movies as TSV.

    Ratings below ``min_rating`` are removed and every remaining entry is set
    to 1 before training. Movies are visited in descending order of the number
    of stored entries in their row of the ratings matrix; for each movie that
    still has entries, the pairs returned by ``model.similar_items(movieid, 11)``
    are written as ``title<TAB>other title<TAB>score`` lines.

    :param output_filename: path of the UTF-8 file to write
    :param model_name: "als", "bpr", "lmf", "tfidf", "cosine" or "bm25"
    :param min_rating: ratings strictly below this value are dropped
    :param variant: dataset variant forwarded to ``get_movielens``
    :raises NotImplementedError: for any other ``model_name``
    """
    # Load the data set.
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # Turn explicit ratings into a binary preference: zero out everything
    # below the threshold, drop those zeros, then flatten the rest to 1.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # Pick the model that matches the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # Only the ALS model is trained on a bm25-weighted matrix.
        log.debug("weighting matrix by bm25_weight")
        weighted = bm25_weight(ratings, B=0.9) * 5
        ratings = weighted.tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Fit the model.
    log.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - fit_started)
    log.debug("calculating top movies")

    # Number of stored entries per row, used to order the output.
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda idx: -user_count[idx])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as out:
        for movieid in to_generate:
            row_start = ratings.indptr[movieid]
            row_end = ratings.indptr[movieid + 1]
            # A movie whose row is empty lost all of its ratings to the
            # threshold filter (e.g. 'Graffiti Bridge'), so nothing is written.
            if row_start != row_end:
                title = titles[movieid]
                for other, score in model.similar_items(movieid, 11):
                    out.write("%s\t%s\t%s\n" % (title, titles[other], score))
            progress.update(1)
Ejemplo n.º 4
0
    def get_model(self):
        """Log the initialization and return a new ``BM25Recommender``.

        The recommender is constructed with ``self.model_params`` unpacked as
        keyword arguments.
        """
        self.app_logger.info(msg='Initializing the nearest neighbors model')

        params = self.model_params
        return BM25Recommender(**params)
Ejemplo n.º 5
0
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    """Train a recommender on movie ratings and write similar movies as TSV.

    ``read_data`` supplies the ratings frame, the movies frame and the matrix
    used for training. Movies are processed in descending order of their
    rating count; movies whose matrix row is empty are skipped. Each output
    line is ``title<TAB>other title<TAB>score``.

    :param input_path: location of the input data, forwarded to ``read_data``
    :param output_filename: path of the UTF-8 file to write
    :param model_name: "als", "bpr", "tfidf", "cosine" or "bm25"
    :param min_rating: threshold forwarded to ``read_data``
    :raises NotImplementedError: for any other ``model_name``
    """
    # Load the input data.
    logging.debug("reading data from %s", input_path)
    read_started = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - read_started)

    # Select the model for the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # Only ALS is trained on the bm25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Fit on the CSR form of the matrix.
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name,
                  time.time() - fit_started)
    logging.debug("calculating top movies")

    ratings_per_movie = ratings.groupby('movieId').size()
    movie_lookup = dict(zip(movies['movieId'], movies['title']))
    to_generate = sorted(
        movies['movieId'],
        key=lambda movie_id: -ratings_per_movie.get(movie_id, 0))

    with codecs.open(output_filename, "w", "utf8") as out:
        for movieid in to_generate:
            # An empty row means every rating of this movie was filtered out
            # (e.g. 'Graffiti Bridge'), so there is nothing to report.
            if m.indptr[movieid] != m.indptr[movieid + 1]:
                movie = movie_lookup[movieid]
                for other, score in model.similar_items(movieid, 11):
                    out.write("%s\t%s\t%s\n"
                              % (movie, movie_lookup[other], score))
def calculate_similar_movies(input_path, output_filename,
                             model_name="als", min_rating=4.0):
    """Train a recommender on movie ratings and write similar movies as TSV.

    :param input_path: path of the training data set
    :param output_filename: name of the output file (written as UTF-8)
    :param model_name: model to use: "als", "tfidf", "cosine" or "bm25"
    :param min_rating: threshold used for filtering, forwarded to ``read_data``
    :return: None
    :raises NotImplementedError: if ``model_name`` is not a supported name
    """

    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()

        # Only ALS is trained on the bm25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender()

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    # Pass the argument to the logger instead of pre-formatting the message.
    logging.debug("Training model :%s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict(zip(movies_data['movieId'], movies_data['title']))
    # Most-rated movies first; movies without ratings count as 0.
    to_generate = sorted(movies_data['movieId'],
                         key=lambda x: -user_count.get(x, 0))

    # Titles may contain non-ASCII characters, so the encoding is pinned
    # rather than left to the platform default.
    with open(output_filename, "w", encoding="utf-8") as o:
        for movieid in to_generate:
            # Skip movies whose matrix row holds no entries.
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]

            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
Ejemplo n.º 7
0
def calculate_similar_beers(input_path, output_filename, model_name="cosine"):
    """Train a recommender on beer ratings and write similar beers to a file.

    ``read_data`` supplies the ratings frame, the beers frame and the training
    matrix. Beers are processed in descending order of their rating count and
    beers whose matrix row is empty are skipped. Each output line has the form
    ``name,other name,score``.

    :param input_path: location of the input data, forwarded to ``read_data``
    :param output_filename: path of the file to write
    :param model_name: "als", "bpr", "tfidf", "cosine" or "bm25"
    :raises NotImplementedError: for any other ``model_name``
    """
    # Load the input data.
    logging.debug("reading data from %s", input_path)
    read_started = time.time()
    ratings, beers, m = read_data(input_path)
    logging.debug("read data file in %s", time.time() - read_started)

    # Select the model for the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # Only ALS is trained on the bm25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Fit on the CSR form of the matrix.
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name,
                  time.time() - fit_started)
    logging.debug("calculating top beers")

    ratings_per_beer = ratings.groupby('beerId').size()
    beer_lookup = dict(zip(beers['beerId'], beers['name']))
    to_generate = sorted(
        beers['beerId'],
        key=lambda beer_id: -ratings_per_beer.get(beer_id, 0))

    with open(output_filename, "w") as out:
        for beerId in to_generate:
            # Only beers with at least one stored entry in their row are written.
            if m.indptr[beerId] != m.indptr[beerId + 1]:
                beer = beer_lookup[beerId]
                for other, score in model.similar_items(beerId, 11):
                    out.write("%s,%s,%s\n" % (beer, beer_lookup[other], score))
Ejemplo n.º 8
0
def load_recommender(item_to_item_model_file: str) -> ItemToItemRecommender:
    """Rebuild an item-to-item BM25 recommender from a saved array archive.

    The archive must provide the keys ``model.K``, ``model.bm25.K1``,
    ``model.bm25.B``, the four ``model.similarity.*`` CSR components,
    ``user_labels`` and ``item_labels``.

    :param item_to_item_model_file: path handed to ``np.load``
    :return: an ``ItemToItemRecommender`` wrapping the restored model and the
        user/item label arrays
    """
    log.info("Loading item to item bm25 model")
    # The keyed access below implies an .npz archive; np.load keeps such an
    # archive open until it is closed, so read everything inside a ``with``
    # block to release the file handle deterministically.
    with np.load(item_to_item_model_file) as data:
        k = data['model.K'][0]
        k1 = data['model.bm25.K1'][0]
        b = data['model.bm25.B'][0]
        similarity = sparse.csr_matrix(
            (data['model.similarity.data'], data['model.similarity.indices'],
             data['model.similarity.indptr']),
            shape=data['model.similarity.shape'])
        user_labels = data['user_labels']
        item_labels = data['item_labels']

    model = BM25Recommender(K=k, K1=k1, B=b)
    # Restore the precomputed similarity matrix instead of refitting.
    model.similarity = similarity
    model.scorer = NearestNeighboursScorer(model.similarity)
    return ItemToItemRecommender(model, user_labels, item_labels)
Ejemplo n.º 9
0
def calculate_similar_movies(input_path,
                             output_filename,
                             model_name="als",
                             min_rating=4.0):
    """Train a recommender on movie ratings and write similar movies as TSV.

    ``read_data`` supplies the ratings frame, the movies frame and the training
    matrix. Every movie id in the movies frame is processed, in descending
    order of its rating count. Each output line is
    ``title<TAB>other title<TAB>score``.

    :param input_path: location of the input data, forwarded to ``read_data``
    :param output_filename: path of the file to write
    :param model_name: "als", "tfidf", "cosine" or "bm25"
    :param min_rating: threshold forwarded to ``read_data``
    :raises NotImplementedError: for any other ``model_name``
    """
    # Load the input data.
    logging.debug("reading data from %s", input_path)
    read_started = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - read_started)

    # Select the model for the requested name.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # Only ALS is trained on the bm25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Fit the model.
    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name,
                  time.time() - fit_started)
    logging.debug("calculating top movies")

    ratings_per_movie = ratings.groupby('movieId').size()
    movie_lookup = {
        movie_id: title
        for movie_id, title in zip(movies['movieId'], movies['title'])
    }
    to_generate = sorted(
        movies['movieId'],
        key=lambda movie_id: -ratings_per_movie.get(movie_id, 0))

    with open(output_filename, "w") as out:
        for movieid in to_generate:
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                line = "%s\t%s\t%s\n" % (movie, movie_lookup[other], score)
                out.write(line)
Ejemplo n.º 10
0
def calculate_recommendations(train_filename,
                              test_filename,
                              output_filename,
                              dir,
                              model_name="als",
                              factors=80,
                              regularization=0.8,
                              iterations=10,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    """Train a recommender, score it on a test file and persist the metrics.

    The model is trained on the data read from ``dir + train_filename``. For
    every test user that also appears in the training data, the user's test
    items (ordered by descending ``cnt``) are compared with the model's
    recommendations using NDCG, ERR, MAP and MP. The four scores are written
    to ``"%siALS_result_%s.txt" % (dir, train_filename)``.

    :param train_filename: training file name, appended to ``dir``
    :param test_filename: tab-separated test file with user, item and count in
        its first three columns
    :param output_filename: not used by this function
    :param dir: prefix for the training file and the result file
    :param model_name: "als", "tfidf", "cosine" or "bm25"
    :param factors: ALS option, forwarded to the ALS constructor
    :param regularization: ALS option, forwarded to the ALS constructor
    :param iterations: ALS option, forwarded to the ALS constructor
    :param exact: use ``AlternatingLeastSquares`` when true, otherwise
        ``AnnoyAlternatingLeastSquares``
    :param use_native: ALS option, forwarded to the ALS constructor
    :param dtype: ALS option, forwarded to the ALS constructor
    :param cg: forwarded to the ALS constructor as ``use_cg``
    :return: tuple ``(ndcg, err, map, mp)``
    :raises NotImplementedError: for any other ``model_name``
    """
    logging.debug("Calculating similar items. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", dir + train_filename)
    start = time.time()
    df, cnts = read_data(dir + train_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based on the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg,
                                            iterations=iterations,
                                            dtype=dtype)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg,
                                                 iterations=iterations,
                                                 dtype=dtype)

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        cnts = bm25_weight(cnts, K1=100, B=0.8)

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(cnts)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # NOTE(review): the test file is read from ``test_filename`` as given,
    # without the ``dir`` prefix used for the training file - confirm intended.
    test_data = pandas.read_csv(test_filename,
                                sep="\t",
                                usecols=[0, 1, 2],
                                names=['user', 'item', 'cnt'])
    # Collapse repeated (user, item) rows by summing their counts.
    test_data = test_data.groupby(["user", "item"], as_index=False).sum()
    users_test = set(test_data['user'])
    users_train = set(df['user'])

    # position is important for recommendation list and actual list
    dict_actual = {}
    for user in users_test:
        if user not in users_train:
            continue
        matched_df = test_data.loc[test_data["user"] == user]
        # DataFrame.sort() no longer exists in pandas; sort_values() returns a
        # new frame, which also avoids sorting a filtered slice in place.
        matched_df = matched_df.sort_values(["cnt"], ascending=[False])
        dict_actual[user] = list(matched_df["item"])

    user_items = cnts.T.tocsr()

    # recommend items for each user that was seen during training
    dict_recommended = {}  # for computing MAP and MP

    for user in users_test:
        if user not in users_train:
            continue
        recommendations = model.recommend(user, user_items)
        # Use a dedicated name so the training frame ``df`` is not clobbered.
        rec_df = pandas.DataFrame(recommendations, columns=["item", "score"])
        dict_recommended[user] = list(rec_df["item"])

    ndcg = NDCG(dict_actual, dict_recommended)

    err = ERR(dict_actual, dict_recommended)

    # Named ``map_score`` to avoid shadowing the ``map`` builtin.
    map_score = MAP(dict_actual, dict_recommended)

    mp = MP(dict_actual, dict_recommended)

    with open("%siALS_result_%s.txt" % (dir, train_filename), "w") as o:
        o.write("NDCG\tERR\tMAP\tMP\n")
        o.write("%s\t%s\t%s\t%s\n" % (ndcg, err, map_score, mp))

    return (ndcg, err, map_score, mp)
Ejemplo n.º 11
0
def calculate_similar_movies(input_filename,
                             output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    """Train a recommender on user/item scores and append similar items to a CSV.

    The data read from ``input_filename`` is turned into a sparse CSR matrix
    indexed by (item_index, user_index) and used to fit the selected model.
    Batches produced by ``similar_to_csv`` are then appended to a CSV file.

    :param input_filename: forwarded to ``read_user_item_data``
    :param output_filename: not referenced in the visible part of this function
    :param model_name: "als", "bpr", "lmf", "tfidf", "cosine" or "bm25"
    :param min_rating: only referenced inside the disabled code below
    :param variant: only referenced inside a commented-out call
    :raises NotImplementedError: for any other ``model_name``
    """
    # read in the input data file
    start = time.time()
    # titles, ratings = get_movielens(variant)

    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)

    #user_item_df = user_item_df.sort_values(by=['user_index','item_index'])
    # Rows are item_index, columns are user_index, values are score.
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'], (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)
    # The string literal below is disabled code kept as a no-op expression.
    '''
    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    '''

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True, iterations=20, calculate_training_loss=True)

        # NOTE(review): the message below is still logged, but the actual
        # bm25 weighting is commented out, so the matrix is used unweighted.
        log.debug("weighting matrix by bm25_weight")
        # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    # Arguments forwarded to similar_to_csv; ``iterations`` is also used as
    # the divisor for the progress-bar total below.
    k=10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)

    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            # NOTE(review): writes to ``args.outputfile`` (a name not defined
            # in this function) instead of the ``output_filename`` parameter -
            # confirm which one is intended. Each slice is appended (mode='a').
            similar_df_slice.to_csv(args.outputfile, mode='a', header=False, index=False)
            print("finsih a batch")
            progress.update(1)

    '''
Ejemplo n.º 12
0
    def fit_bm25_recommender(user_item_matrix):
        """Create a ``BM25Recommender`` and fit it on the transposed input.

        :param user_item_matrix: input handed to ``csr_matrix``; its transpose,
            converted to CSR, is what the model is fitted on.
        :return: the ``BM25Recommender`` instance after ``fit`` has been called.
        """
        model = BM25Recommender(K=6, K1=1.2, B=.76, num_threads=0)

        # Transpose first, then convert back to CSR for fitting.
        fit_input = csr_matrix(user_item_matrix).T.tocsr()
        model.fit(fit_input)

        return model
Ejemplo n.º 13
0
# regularization_levels = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
# num_factors_levels = [10, 50, 100]

K1_levels = [10, 20, 50, 100, 200]
B_levels = [0, 0.25, 0.5, 0.75, 1]
filter_already_liked_items_levels = [True, False]

K = 20

runs = []
for K1, B, filter_already_liked_items in itertools.product(
        K1_levels, B_levels, filter_already_liked_items_levels):
    print((K1, B, filter_already_liked_items))
    start_time = time()

    model = BM25Recommender(K1, B)
    model.fit(implicit_matrix)

    brec = recommenders.MyBM25Recommender(model, implicit_matrix)
    brecs = brec.recommend_all(
        userids,
        K,
        u2i=u2i,
        n2i=n2i,
        i2p=i2p,
        filter_already_liked_items=filter_already_liked_items,
    )
    print("Computing metrics...")
    metrics = wr.get_recs_metrics(
        histories_test,
        brecs,
Ejemplo n.º 14
0
    def get_model(self):
        """Return a new ``BM25Recommender`` built from ``self.model_params``."""
        params = self.model_params
        return BM25Recommender(**params)
Ejemplo n.º 15
0
def calculate_similar_artists(input_filename,
                              output_filename,
                              model_name="als",
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    """Train a recommender on play counts and write similar artists as TSV.

    ``read_data`` supplies the data frame and the plays matrix. After fitting,
    each artist is processed in the order defined by ``user_count`` and the
    pairs returned by ``model.similar_items(artistid, 11)`` are written as
    ``artist<TAB>other artist<TAB>score`` lines.

    :param input_filename: forwarded to ``read_data``
    :param output_filename: path of the file to write
    :param model_name: "als", "tfidf", "cosine" or "bm25"
    :param factors: ALS option, forwarded to the ALS constructor
    :param regularization: ALS option, forwarded to the ALS constructor
    :param iterations: ALS option, forwarded to the ALS constructor
    :param exact: use ``AlternatingLeastSquares`` when true, otherwise
        ``AnnoyAlternatingLeastSquares``
    :param use_native: ALS option, forwarded to the ALS constructor
    :param dtype: ALS option, forwarded to the ALS constructor
    :param cg: forwarded to the ALS constructor as ``use_cg``
    :raises NotImplementedError: for any other ``model_name``
    """
    logging.debug("Calculating similar artists. This might take a while")

    # Load the input data.
    logging.debug("reading data from %s", input_filename)
    read_started = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - read_started)

    # Select the model for the requested name.
    if model_name == "als":
        # Both ALS variants take the same keyword arguments.
        als_class = (AlternatingLeastSquares if exact
                     else AnnoyAlternatingLeastSquares)
        model = als_class(factors=factors,
                          regularization=regularization,
                          use_native=use_native,
                          use_cg=cg,
                          dtype=dtype,
                          iterations=iterations)

        # Only ALS is trained on the bm25-weighted matrix.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Fit the model.
    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %s", model_name,
                  time.time() - fit_started)

    # Decide the order in which artists are written out.
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(artists, key=lambda idx: -user_count[idx])

    # One TSV line per (artist, similar artist, score) triple.
    with open(output_filename, "w") as out:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                line = "%s\t%s\t%s\n" % (artist, artists[other], score)
                out.write(line)
Ejemplo n.º 16
0
def calculate_similar_artists(input_filename,
                              output_filename,
                              model="als",
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    """Compute similar artists from play counts and write them as TSV.

    For ``model == "als"`` the plays matrix is bm25-weighted, factorized with
    ``alternating_least_squares`` and related artists are looked up with
    ``TopRelated`` (``exact``) or ``ApproximateTopRelated``. For "bm25",
    "tfidf" and "cosine" the matching recommender is fitted with ``K=11`` and
    queried through ``similar_items``. Each output line is
    ``artist<TAB>other artist<TAB>score``.

    :param input_filename: forwarded to ``read_data``
    :param output_filename: path of the file to write
    :param model: "als", "bm25", "tfidf" or "cosine"
    :param factors: forwarded to ``alternating_least_squares``
    :param regularization: forwarded to ``alternating_least_squares``
    :param iterations: forwarded to ``alternating_least_squares``
    :param exact: choose ``TopRelated`` over ``ApproximateTopRelated``
    :param trees: second argument of ``ApproximateTopRelated``
    :param use_native: forwarded to ``alternating_least_squares``
    :param dtype: forwarded to ``alternating_least_squares``
    :param cg: forwarded to ``alternating_least_squares`` as ``use_cg``
    :raises NotImplementedError: if ``model`` is not one of the names above
    """
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    start = time.time()

    if model == "als":
        logging.debug("weighting matrix by bm25")
        weighted = bm25_weight(plays, K1=100, B=0.8)

        logging.debug("calculating factors")
        artist_factors, user_factors = alternating_least_squares(
            weighted,
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            use_native=use_native,
            dtype=dtype,
            use_cg=cg)
        logging.debug("calculated factors in %s", time.time() - start)

        if exact:
            calc = TopRelated(artist_factors)
        else:
            calc = ApproximateTopRelated(artist_factors, trees)
        logging.debug("writing top related to %s", output_filename)
        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in calc.get_related(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    elif model in ("bm25", "tfidf", "cosine", "smoothed_cosine", "ochiai",
                   "overlap"):
        if model == "bm25":
            scorer = BM25Recommender(K1=100, B=0.5)

        elif model == "tfidf":
            scorer = TFIDFRecommender()

        elif model == "cosine":
            scorer = CosineRecommender()

        else:
            # Recognised names without an implementation yet.
            raise NotImplementedError("TODO: model %s" % model)

        logging.debug("calculating similar items")
        start = time.time()
        scorer.fit(plays, K=11)
        logging.debug("calculated all_pairs_knn in %s", time.time() - start)

        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in scorer.similar_items(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    else:
        # An unrecognised name used to fall through both branches and return
        # without writing any output; fail loudly instead.
        raise NotImplementedError("TODO: model %s" % model)
Ejemplo n.º 17
0
 def _get_model(self):
     """Log the recommender's module name and return a new ``BM25Recommender``.

     The recommender is constructed with ``self.model_params`` unpacked as
     keyword arguments.
     """
     module_name = BM25Recommender.__dict__["__module__"]
     self.app_logger.info("Initializing {} model".format(module_name))

     params = self.model_params
     return BM25Recommender(**params)