def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Fit a recommender on MovieLens ratings and dump similar movies to a file.

    Ratings below ``min_rating`` are discarded and every remaining rating is
    replaced by 1.0, so the model only sees a binary "liked" signal. For each
    movie that still has at least one rating, ``model.similar_items(movieid, 11)``
    is queried and one tab-separated line ``title, other title, score`` is
    written per result. Movies are processed in descending order of their
    number of remaining ratings.

    Args:
        output_filename: path of the utf8 text file to (over)write.
        model_name: one of "als", "bpr", "lmf", "tfidf", "cosine", "bm25".
        min_rating: ratings strictly below this value are dropped.
        variant: dataset variant handed to ``get_movielens``.

    Raises:
        NotImplementedError: if ``model_name`` is not one of the names above.
    """
    load_started = time.time()
    titles, ratings = get_movielens(variant)

    # Turn explicit ratings into implicit feedback: zero out low ratings,
    # drop the zeros, then flatten whatever is left to a constant 1.0.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - load_started)

    # Pick the recommender implementation requested by the caller.
    if model_name == "als":
        model = AlternatingLeastSquares()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # Only the ALS model is trained on a BM25-weighted (and x5 scaled) matrix.
    if model_name == "als":
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    log.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - fit_started)
    log.debug("calculating top movies")

    # Number of stored entries per row of the ratings matrix; rows are
    # indexed by movie id below. sorted() is stable, so ties keep id order.
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(
        np.arange(len(titles)), key=lambda idx: user_count[idx], reverse=True
    )

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress, \
            codecs.open(output_filename, "w", "utf8") as out:
        for movieid in to_generate:
            row_begin = ratings.indptr[movieid]
            row_end = ratings.indptr[movieid + 1]
            # An empty row means every rating for this movie was filtered
            # out by min_rating, so there is nothing to base similarity on.
            if row_begin != row_end:
                title = titles[movieid]
                for other, score in model.similar_items(movieid, 11):
                    out.write("%s\t%s\t%s\n" % (title, titles[other], score))
            progress.update(1)
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant='20m'):
    """Train the chosen model on MovieLens and write similar-movie pairs.

    The ratings matrix is reduced to binary preferences (entries below
    ``min_rating`` removed, the rest set to 1.0), the model named by
    ``model_name`` is fitted on it, and for every movie with at least one
    remaining rating the result of ``model.similar_items(movieid, 11)`` is
    written to ``output_filename`` as ``title<TAB>other title<TAB>score``
    lines. Movies with more remaining ratings are written first.

    Args:
        output_filename: destination file, opened for writing as utf8.
        model_name: "als", "bpr", "tfidf", "cosine" or "bm25".
        min_rating: threshold below which a rating is treated as missing.
        variant: MovieLens variant passed through to ``get_movielens``.

    Raises:
        NotImplementedError: for any other ``model_name``.
    """
    # Factories are wrapped in lambdas so a model class is only looked up
    # (and instantiated) when that model is actually requested.
    factories = {
        "als": lambda: AlternatingLeastSquares(),
        "bpr": lambda: BayesianPersonalizedRanking(),
        "tfidf": lambda: TFIDFRecommender(),
        "cosine": lambda: CosineRecommender(),
        "bm25": lambda: BM25Recommender(B=0.2),
    }

    t0 = time.time()
    titles, ratings = get_movielens(variant)

    # Keep only ratings >= min_rating and treat each as a plain "1".
    too_low = ratings.data < min_rating
    ratings.data[too_low] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - t0)

    make_model = factories.get(model_name)
    if make_model is None:
        raise NotImplementedError("TODO: model %s" % model_name)
    model = make_model()

    if model_name == "als":
        # ALS is fitted on BM25-weighted values scaled by 5 rather than on
        # the raw binary matrix.
        log.debug("weighting matrix by bm25_weight")
        weighted = bm25_weight(ratings, B=0.9) * 5
        ratings = weighted.tocsr()

    log.debug("training model %s", model_name)
    t0 = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - t0)
    log.debug("calculating top movies")

    # Entries stored per matrix row; used to rank movie ids, busiest first.
    user_count = np.ediff1d(ratings.indptr)

    def _by_descending_count(movie_index):
        return -user_count[movie_index]

    to_generate = sorted(np.arange(len(titles)), key=_by_descending_count)

    log.debug("calculating similar movies")
    indptr = ratings.indptr
    with tqdm.tqdm(total=len(to_generate)) as bar:
        with codecs.open(output_filename, "w", "utf8") as sink:
            for movieid in to_generate:
                if indptr[movieid] == indptr[movieid + 1]:
                    # No ratings survived the min_rating filter for this
                    # movie, so skip it (the progress bar still advances).
                    bar.update(1)
                    continue

                title = titles[movieid]
                for other, score in model.similar_items(movieid, 11):
                    sink.write("%s\t%s\t%s\n" % (title, titles[other], score))
                bar.update(1)
def experiment(B, K1, conf, variant='20m', min_rating=3.0):
    """Run one hold-out evaluation of ALS on MovieLens and print a CSV row.

    The ratings are reduced to binary preferences, 100000 random
    (movie, user) cells are blanked in a training copy, a
    ``FaissAlternatingLeastSquares`` model is fitted on that copy, and the
    dot product of the item and user factors for each blanked cell is
    thresholded at ``conf`` to get a boolean prediction. True/false
    positives, false negatives, precision and recall against the original
    matrix are printed as one comma-separated line.

    Args:
        B: BM25 ``B`` parameter, or the string "NA" to skip BM25 weighting.
        K1: BM25 ``K1`` parameter (unused when ``B`` is "NA").
        conf: score threshold at or above which a cell is predicted True.
        variant: MovieLens variant passed to ``get_movielens``.
        min_rating: ratings strictly below this value are dropped.

    Returns:
        None; the result row is written to stdout.
    """
    apply_bm25 = B != "NA"

    _, ratings = get_movielens(variant)
    ratings = ratings.tocsr()

    # Binarise: drop ratings under the threshold, set the survivors to 1.0.
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    # Separate LIL copy for training so `ratings` keeps the ground truth.
    training = ratings.tolil()

    # Sample cells to hide from training. Row indices are drawn first, then
    # column indices; sampled cells may already be empty and may repeat.
    n_movies, n_users = ratings.shape[0], ratings.shape[1]
    movieids = np.random.randint(low=0, high=n_movies, size=100000)
    userids = np.random.randint(low=0, high=n_users, size=100000)
    training[movieids, userids] = 0

    model = FaissAlternatingLeastSquares(factors=128, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False
    model.show_progress = False

    if apply_bm25:
        training = bm25_weight(training, B=B, K1=K1).tocsr()

    model.fit(training)

    # Row-wise dot product of the paired item/user factor vectors.
    item_vecs = model.item_factors[movieids]
    user_vecs = model.user_factors[userids]
    moviescores = np.einsum('ij,ij->i', item_vecs, user_vecs)

    preds = moviescores >= conf
    true_ratings = np.ravel(ratings[movieids, userids])

    # tp: predicted True and actually rated.
    tp = true_ratings[preds].sum()
    # fp: predicted True but not rated.
    fp = preds.sum() - tp
    # fn: predicted False but actually rated.
    fn = true_ratings.sum() - tp

    # Report NaN rather than dividing by zero when a denominator is empty.
    prec = float('nan') if tp + fp == 0 else float(tp) / float(tp + fp)
    recall = float('nan') if tp + fn == 0 else float(tp) / float(tp + fn)

    if apply_bm25:
        row = "%.2f,%.2f,%.2f,%d,%d,%d,%.2f,%.2f" % (B, K1, conf, tp, fp, fn, prec, recall)
    else:
        row = "NA,NA,%.2f,%d,%d,%d,%.2f,%.2f" % (conf, tp, fp, fn, prec, recall)
    print(row)