def _get_model(self):
    return FaissAlternatingLeastSquares(nlist=1,
                                        nprobe=1,
                                        factors=2,
                                        regularization=0,
                                        use_gpu=False,
                                        random_state=23)
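This factory appears to come from a test class. A standalone smoke test built around the same constructor arguments might look like the sketch below; it is illustrative only, and it assumes faiss is installed and an implicit version whose fit() accepts an item-by-user matrix (as the other examples on this page do).

# Minimal smoke test for the model configured above (illustrative sketch):
# build a tiny random item-by-user matrix, fit the model, and query similar items.
import numpy as np
from scipy.sparse import csr_matrix
from implicit.approximate_als import FaissAlternatingLeastSquares

item_users = csr_matrix(np.random.binomial(1, 0.3, size=(10, 20)).astype(np.float32))
model = FaissAlternatingLeastSquares(nlist=1, nprobe=1, factors=2,
                                     regularization=0, use_gpu=False, random_state=23)
model.fit(item_users, show_progress=False)
print(model.similar_items(0, N=3))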
Example #2
def _get_model(self):
    return FaissAlternatingLeastSquares(
        nlist=1,
        nprobe=1,
        factors=32,
        regularization=self.__regularization,
        use_gpu=True)
Example #3
import numpy as np

from implicit.approximate_als import FaissAlternatingLeastSquares
from implicit.datasets.movielens import get_movielens
from implicit.nearest_neighbours import bm25_weight


def experiment(B, K1, conf, variant='20m', min_rating=3.0):
    # read in the input data file
    _, ratings = get_movielens(variant)
    ratings = ratings.tocsr()

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    training = ratings.tolil() # makes a copy

    # remove some implicit ratings (make them zeros, i.e., missing)
    # (these ratings might have already been missing, in fact)
    movieids = np.random.randint(low=0, high=np.shape(ratings)[0], size=100000)
    userids = np.random.randint(low=0, high=np.shape(ratings)[1], size=100000)
    training[movieids, userids] = 0

    model = FaissAlternatingLeastSquares(factors=128, iterations=30)
    model.approximate_recommend = False
    model.approximate_similar_items = False
    model.show_progress = False

    # possibly recalculate scores by bm25weight.
    if B != "NA":
        training = bm25_weight(training, B=B, K1=K1).tocsr()

    # train the model
    model.fit(training)

    # compute the predicted ratings
    moviescores = np.einsum('ij,ij->i', model.item_factors[movieids], model.user_factors[userids])
    # using confidence threshold, find boolean predictions
    preds = (moviescores >= conf)
    true_ratings = np.ravel(ratings[movieids,userids])
    # both model predicted True and user rated movie
    tp = true_ratings[preds].sum()
    #tp = ratings[:,userids][preds][movieids].sum()
    # model predicted True but user did not rate movie
    fp = preds.sum() - tp
    # model predicted False but user did rate movie
    fn = true_ratings.sum() - tp
    if tp+fp == 0:
        prec = float('nan')
    else:
        prec = float(tp)/float(tp+fp)
    if tp+fn == 0:
        recall = float('nan')
    else:
        recall = float(tp)/float(tp+fn)
    if B != "NA":
        print("%.2f,%.2f,%.2f,%d,%d,%d,%.2f,%.2f" % (B, K1, conf, tp, fp, fn, prec, recall))
    else:
        print("NA,NA,%.2f,%d,%d,%d,%.2f,%.2f" % (conf, tp, fp, fn, prec, recall))
Example #4

def fit_model():
    global model, userids, userids_reverse, productids, productids_reverse, purchases_matrix, purchases_matrix_T
    with model_lock:
        app.logger.info("Fitting model...")
        start = time.time()
        with open(purchases_pickle, 'wb') as f:
            pickle.dump((purchases, usernames, userids, userids_reverse, productnames, productids, productids_reverse), f)

        # fall back to the plain AlternatingLeastSquares below if faiss is not installed
        # model = AlternatingLeastSquares(factors=64, dtype=np.float32)
        model = FaissAlternatingLeastSquares(factors=128, dtype=np.float32, iterations=30)

        model.approximate_recommend = False
        model.approximate_similar_items = False
        data = {'userid': [], 'productid': [], 'purchase_count': []}
        for userid in purchases:
            for productid in purchases[userid]:
                data['userid'].append(userid)
                data['productid'].append(productid)
                data['purchase_count'].append(purchases[userid][productid])
        app.logger.info("Gathered %d data points in %0.2fs" % (len(data['purchase_count']), time.time() - start))
        start = time.time()
        df = pd.DataFrame(data)
        df['userid'] = df['userid'].astype("category")
        df['productid'] = df['productid'].astype("category")
        userids = list(df['userid'].cat.categories)
        userids_reverse = dict(zip(userids, list(range(len(userids)))))
        productids = list(df['productid'].cat.categories)
        productids_reverse = dict(zip(productids, list(range(len(productids)))))
        purchases_matrix = coo_matrix((df['purchase_count'].astype(np.float32),
                                       (df['productid'].cat.codes.copy(),
                                        df['userid'].cat.codes.copy())))
        print("Matrix shape: %s, max value: %.2f" % (np.shape(purchases_matrix), np.max(purchases_matrix)))
        app.logger.info("Matrix shape: %s, max value: %.2f" % (np.shape(purchases_matrix), np.max(purchases_matrix)))
        app.logger.info("Built matrix in %0.2fs" % (time.time() - start))
        start = time.time()
        purchases_matrix = bm25_weight(purchases_matrix, K1=2.0, B=0.25)
        purchases_matrix_T = purchases_matrix.T.tocsr()
        app.logger.info("BM25 weighted matrix in %0.2fs" % (time.time() - start))
        start = time.time()
        purchases_matrix = purchases_matrix.tocsr() # to support indexing in recommend/similar_items functions
        app.logger.info("Converted to CSR matrix in %0.2fs" % (time.time() - start))
        start = time.time()
        model.fit(purchases_matrix)
        app.logger.info("Fitted model in %0.2fs" % (time.time() - start))
Example #5
def _get_model(self):
    return FaissAlternatingLeastSquares(nlist=1, nprobe=1, factors=2, regularization=0)