Ejemplo n.º 1
0
def _train_lmf(hyperparameters, train):
    """Fit a LogisticMatrixFactorization model on ``train`` and return it.

    Args:
        hyperparameters: mapping that provides ``'factors'`` and ``'n_iter'``;
            they are forwarded as the model's ``factors`` and ``iterations``.
        train: training interaction matrix, passed unchanged to ``model.fit``.

    Returns:
        The fitted model. No evaluation is performed here.
    """
    # The thread count comes from the module-level ``nproc``.
    fitted = LogisticMatrixFactorization(
        factors=hyperparameters['factors'],
        iterations=hyperparameters['n_iter'],
        num_threads=nproc,
    )
    fitted.fit(train)
    return fitted
Ejemplo n.º 2
0
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Train a recommender on MovieLens data and dump similar movies to a file.

    Args:
        output_filename: path of the UTF-8 text file to write. Each line is
            ``<title>\\t<similar title>\\t<score>``.
        model_name: which model to train; one of ``"als"``, ``"bpr"``,
            ``"lmf"``, ``"tfidf"``, ``"cosine"`` or ``"bm25"``.
        min_rating: ratings strictly below this value are discarded.
        variant: dataset variant handed to ``get_movielens``.

    Raises:
        NotImplementedError: if ``model_name`` is not one of the names above.
    """
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # Convert explicit ratings to implicit binary feedback: drop everything
    # under the threshold, then set every surviving entry to 1.
    below_threshold = ratings.data < min_rating
    ratings.data[below_threshold] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # Pick the recommender implementation requested by the caller.
    if model_name == "als":
        model = AlternatingLeastSquares()

        # Only the ALS model gets its input re-weighted with BM25.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    log.debug("training model %s", model_name)
    train_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - train_started)
    log.debug("calculating top movies")

    # Number of stored entries per matrix row; rows are indexed by movie id
    # below, so movies with the most remaining ratings are processed first.
    ratings_per_movie = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)),
                         key=lambda idx: -ratings_per_movie[idx])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as out:
        for movieid in to_generate:
            row_start = ratings.indptr[movieid]
            row_end = ratings.indptr[movieid + 1]
            # An empty row means every rating for this movie was filtered
            # out above (e.g. 'Graffiti Bridge' has no rating > 4); nothing
            # is written for it, but the progress bar still advances.
            if row_start != row_end:
                title = titles[movieid]
                # 11 results are requested -- presumably the movie itself
                # plus its 10 nearest neighbours.
                for other, score in model.similar_items(movieid, 11):
                    out.write("%s\t%s\t%s\n" % (title, titles[other], score))
            progress.update(1)
Ejemplo n.º 3
0
def evaluate_lmf_model(hyperparameters, train, test, validation):
    """Fit an LMF model on ``train`` and score it on two held-out sets.

    Args:
        hyperparameters: mapping that provides ``'factors'`` and ``'n_iter'``.
        train: training matrix; fitted as-is and transposed for evaluation.
        test: held-out test matrix (transposed before evaluation).
        validation: held-out validation matrix (transposed before evaluation).

    Returns:
        A ``(test_eval, val_eval)`` tuple of dicts, each holding the
        precision@10 score under the key ``'p@k'``.
    """
    model = LogisticMatrixFactorization(
        factors=hyperparameters['factors'],
        iterations=hyperparameters['n_iter'],
        num_threads=nproc,
    )
    model.fit(train)

    def _precision_report(holdout):
        # Both matrices are transposed to CSR right before each evaluation.
        score = precision_at_k(model, train.T.tocsr(), holdout.T.tocsr(), K=10)
        return {'p@k': score}

    return _precision_report(test), _precision_report(validation)
Ejemplo n.º 4
0
class LMF:
    """Convenience wrapper around ``LogisticMatrixFactorization``.

    Keeps the fitted model together with the transposed interaction matrix
    and an optional per-user collection of already-liked items, and offers
    user-based and item-based recommendation helpers.
    """

    def __init__(self, factors=200, iterations=100, regularization=1, neg_prop=10, already_liked=None):
        """Create the underlying model; nothing is trained yet.

        Args:
            factors, iterations, regularization, neg_prop: forwarded to
                ``LogisticMatrixFactorization``.
            already_liked: optional container indexed by user that yields the
                items to exclude when ranking a caller-supplied candidate list.
        """
        self.model = LogisticMatrixFactorization(factors, iterations=iterations,
                                                 regularization=regularization, neg_prop=neg_prop)
        self._already_liked = already_liked
        self._fitted = False
        self._user_items = None

    def fit(self, data_st, data_item, len_st_set, len_item_set):
        """Build the interaction matrix from index pairs and train the model.

        Args:
            data_st: sequence of column indices (presumably user ids -- TODO
                confirm what "st" stands for).
            data_item: sequence of item (row) indices, parallel to ``data_st``.
            len_st_set: number of columns of the matrix.
            len_item_set: number of rows (items) of the matrix.
        """
        data = csr_matrix(([1] * len(data_st), (data_item, data_st)),
                          shape=(len_item_set, len_st_set))
        # The transposed matrix is what recommend()/rank_items() receive.
        self._user_items = data.T.tocsr()
        self.model.fit(data, show_progress=False)
        self._fitted = True

    def _require_fitted(self):
        """Raise ``AssertionError`` unless ``fit`` has completed."""
        # Raised explicitly (instead of via ``assert``) so the guard is not
        # stripped under ``python -O``; type and message are unchanged.
        if not self._fitted:
            raise AssertionError('Model is not fitted')

    @staticmethod
    def _as_lookup(items):
        """Return ``items`` in a form with fast membership tests."""
        if isinstance(items, (set, frozenset, dict)):
            return items
        try:
            return set(items)
        except TypeError:
            # Unhashable elements: fall back to the container's own ``in``.
            return items

    def recommend(self, user, k, selected_items=None):
        """Return up to ``k`` recommendations for ``user``.

        When both ``selected_items`` and the ``already_liked`` container are
        available, only the not-yet-liked candidates from ``selected_items``
        are ranked; an empty candidate list yields ``[]``. Otherwise the
        model's own ``recommend`` is used with already-liked filtering on.

        NOTE(review): ``selected_items`` is ignored when ``already_liked`` was
        not supplied -- confirm this is intended.

        Raises:
            AssertionError: if the model has not been fitted.
        """
        self._require_fitted()
        if selected_items is None or self._already_liked is None:
            return self.model.recommend(user, self._user_items, k, filter_already_liked_items=True)
        if len(selected_items) == 0:
            return []
        # Look the user's liked items up once instead of once per candidate.
        liked = self._as_lookup(self._already_liked[user])
        candidates = [item for item in selected_items if item not in liked]
        return self.model.rank_items(user, self._user_items, candidates)[:k]

    def recommend_item_based(self, k, choice, selected_items=None):
        """Recommend up to ``k`` items similar to the items in ``choice``.

        For every chosen item the ``5 * k`` most similar items are fetched;
        items that are themselves in ``choice`` (and, when ``selected_items``
        is given, items outside it) are dropped, and the remaining scores are
        summed per item across all chosen items.

        Returns:
            A list of ``(item, summed_score)`` pairs, highest score first.

        Raises:
            AssertionError: if the model is not fitted or ``choice`` is empty.
        """
        self._require_fitted()
        if len(choice) == 0:
            raise AssertionError('Given an empty list of chosen items')

        excluded = self._as_lookup(choice)
        allowed = None if selected_items is None else self._as_lookup(selected_items)

        results = {}
        for chosen in choice:
            # One dict per chosen item, so a repeated id inside a single
            # similarity list keeps only its last score, as before.
            per_item = {
                other: score
                for other, score in self.model.similar_items(chosen, 5 * k)
                if other not in excluded and (allowed is None or other in allowed)
            }
            for other, score in per_item.items():
                results[other] = results.get(other, 0) + score
        return sorted(results.items(), key=lambda pair: pair[1], reverse=True)[:k]
Ejemplo n.º 5
0
 def _get_model(self):
     """Build the small, CPU-only LMF model with a fixed random seed."""
     settings = {
         'factors': 3,
         'regularization': 0,
         'use_gpu': False,
         'random_state': 43,
     }
     return LogisticMatrixFactorization(**settings)
Ejemplo n.º 6
0
 def __init__(self, factors=200, iterations=100, regularization=1, neg_prop=10, already_liked=None):
     """Create the underlying LMF model and reset the wrapper's state.

     ``factors``, ``iterations``, ``regularization`` and ``neg_prop`` are
     forwarded to ``LogisticMatrixFactorization``; ``already_liked`` is
     stored unchanged for later use.
     """
     lmf = LogisticMatrixFactorization(
         factors,
         iterations=iterations,
         regularization=regularization,
         neg_prop=neg_prop,
     )
     self.model = lmf
     # Nothing has been trained yet, so no interaction matrix is cached.
     self._user_items = None
     self._fitted = False
     self._already_liked = already_liked
Ejemplo n.º 7
0
def calculate_similar_movies(input_filename,
                             output_filename,
                             model_name="als", min_rating=4.0,
                             variant='20m'):
    # read in the input data file
    start = time.time()
    # titles, ratings = get_movielens(variant)

    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)

    #user_item_df = user_item_df.sort_values(by=['user_index','item_index'])
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'], (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)
    '''
    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    '''

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(
            factors=128, regularization=0.01, use_native=True, iterations=20, calculate_training_loss=True)

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()

    elif model_name == "lmf":
        model = LogisticMatrixFactorization()

    elif model_name == "tfidf":
        model = TFIDFRecommender()

    elif model_name == "cosine":
        model = CosineRecommender()

    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)

    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    k=10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)

    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            similar_df_slice.to_csv(args.outputfile, mode='a', header=False, index=False)
            print("finsih a batch")
            progress.update(1)

    '''