Beispiel #1
0
def test_feature_inference_fails():
    """Predicting with ids beyond the fitted feature count must be rejected.

    The model is fitted with explicit user and item feature matrices that
    have ``n_features`` columns. ``predict`` is then called without feature
    matrices, using ``n_features`` itself as both user id and item id, and
    the test expects an ``AssertionError``.
    """
    n_users, n_items, n_features = 10, 100, 20

    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)
    feature_kwargs = {
        "user_features": sp.csr_matrix((n_users, n_features), dtype=np.int32),
        "item_features": sp.csr_matrix((n_items, n_features), dtype=np.int32),
    }

    estimator = LightFM()
    estimator.fit_partial(interactions, **feature_kwargs)

    with pytest.raises(AssertionError):
        too_large_user = np.array([n_features], dtype=np.int32)
        too_large_item = np.array([n_features], dtype=np.int32)
        estimator.predict(too_large_user, too_large_item)
Beispiel #2
0
def test_input_dtypes():
    """Fit and predict with COO inputs of several integer and float dtypes.

    No value is asserted; the test passes when none of the calls raises.
    """
    n_users, n_items, n_features = 10, 100, 20

    for value_type in (np.int32, np.int64, np.float32, np.float64):
        interactions = sp.coo_matrix((n_users, n_items), dtype=value_type)
        user_feats = sp.coo_matrix((n_users, n_features), dtype=value_type)
        item_feats = sp.coo_matrix((n_items, n_features), dtype=value_type)

        estimator = LightFM()
        estimator.fit_partial(interactions,
                              user_features=user_feats,
                              item_features=item_feats)

        # Ten random (user, item) index pairs, cast to int32 before predict.
        sampled_users = np.random.randint(0, n_users, 10).astype(np.int32)
        sampled_items = np.random.randint(0, n_items, 10).astype(np.int32)
        estimator.predict(sampled_users,
                          sampled_items,
                          user_features=user_feats,
                          item_features=item_feats)
Beispiel #3
0
def test_predict(num_threads=2):
    """Check that ``predict`` agrees with itself across calling conventions.

    For every user, scores over all items are computed with an array of
    repeated user ids, with a scalar user id, with ``num_threads`` threads,
    and with ``precompute_representations=False`` (threaded and with one
    thread). Each variant is compared using ``np.allclose``.
    """
    n_users, n_items = 10, 100

    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)
    estimator = LightFM()
    estimator.fit_partial(interactions)

    for user in range(n_users):
        from_array = estimator.predict(np.repeat(user, n_items),
                                       np.arange(n_items))
        from_scalar = estimator.predict(user, np.arange(n_items))
        assert np.allclose(from_array, from_scalar)

        threaded = estimator.predict(np.repeat(user, n_items),
                                     np.arange(n_items),
                                     num_threads=num_threads)
        assert np.allclose(threaded, from_array)

        threaded_lazy = estimator.predict(np.repeat(user, n_items),
                                          np.arange(n_items),
                                          num_threads=num_threads,
                                          precompute_representations=False)
        assert np.allclose(threaded, threaded_lazy)

        serial_lazy = estimator.predict(np.repeat(user, n_items),
                                        np.arange(n_items),
                                        num_threads=1,
                                        precompute_representations=False)
        assert np.allclose(threaded, serial_lazy)
Beispiel #4
0
def test_feature_inference_fails():
    """Predicting with ids beyond the fitted feature count must be rejected.

    The model is fitted with explicit user and item feature matrices that
    have ``n_features`` columns. ``predict`` is then called without feature
    matrices, using ``n_features`` itself as both user id and item id, and
    the test expects a ``ValueError``.
    """
    n_users, n_items, n_features = 10, 100, 20

    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)
    feature_kwargs = {
        "user_features": sp.csr_matrix((n_users, n_features), dtype=np.int32),
        "item_features": sp.csr_matrix((n_items, n_features), dtype=np.int32),
    }

    estimator = LightFM()
    estimator.fit_partial(interactions, **feature_kwargs)

    with pytest.raises(ValueError):
        too_large_user = np.array([n_features], dtype=np.int32)
        too_large_item = np.array([n_features], dtype=np.int32)
        estimator.predict(too_large_user, too_large_item)
Beispiel #5
0
def test_input_dtypes():
    """Smoke-test fit and predict for int32, int64, float32 and float64 inputs.

    Interactions and both feature matrices are built as COO matrices of the
    dtype under test. Nothing is asserted beyond the calls not raising.
    """
    shape_users, shape_items, shape_features = 10, 100, 20
    candidate_dtypes = (np.int32, np.int64, np.float32, np.float64)

    for current in candidate_dtypes:
        interactions = sp.coo_matrix((shape_users, shape_items), dtype=current)
        u_feats = sp.coo_matrix((shape_users, shape_features), dtype=current)
        i_feats = sp.coo_matrix((shape_items, shape_features), dtype=current)

        fm = LightFM()
        fm.fit_partial(interactions, user_features=u_feats, item_features=i_feats)

        # Random ids are drawn users-first, then items, and cast to int32.
        user_sample = np.random.randint(0, shape_users, 10).astype(np.int32)
        item_sample = np.random.randint(0, shape_items, 10).astype(np.int32)
        fm.predict(user_sample,
                   item_sample,
                   user_features=u_feats,
                   item_features=i_feats)
Beispiel #6
0
def test_matrix_types():
    """Smoke-test fit and predict for every sparse format / dtype combination.

    Formats: COO, LIL, CSR and CSC. Dtypes: int32, int64, float32, float64.
    Nothing is asserted beyond the calls not raising.
    """
    formats = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)
    value_types = (np.int32, np.int64, np.float32, np.float64)
    n_users, n_items, n_features = 10, 100, 20

    # Same iteration order as a nested format-then-dtype loop.
    combinations = ((fmt, vt) for fmt in formats for vt in value_types)

    for make_matrix, value_type in combinations:
        interactions = make_matrix((n_users, n_items), dtype=value_type)
        user_feats = make_matrix((n_users, n_features), dtype=value_type)
        item_feats = make_matrix((n_items, n_features), dtype=value_type)

        estimator = LightFM()
        estimator.fit_partial(interactions,
                              user_features=user_feats,
                              item_features=item_feats)

        sampled_users = np.random.randint(0, n_users, 10).astype(np.int32)
        sampled_items = np.random.randint(0, n_items, 10).astype(np.int32)
        estimator.predict(sampled_users,
                          sampled_items,
                          user_features=user_feats,
                          item_features=item_feats)
Beispiel #7
0
def test_matrix_types():
    """Run fit and predict once per (sparse matrix class, dtype) pair.

    The test only checks that no call raises for COO, LIL, CSR and CSC
    matrices holding int32, int64, float32 or float64 values.
    """
    n_users, n_items = 10, 100
    n_features = 20

    matrix_classes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)
    element_types = (np.int32, np.int64, np.float32, np.float64)

    for matrix_cls in matrix_classes:
        for element_type in element_types:

            def build(n_rows, n_cols):
                # All three inputs share the format and dtype under test.
                return matrix_cls((n_rows, n_cols), dtype=element_type)

            interactions = build(n_users, n_items)
            user_feats = build(n_users, n_features)
            item_feats = build(n_items, n_features)

            fm = LightFM()
            fm.fit_partial(interactions, user_features=user_feats, item_features=item_feats)

            random_users = np.random.randint(0, n_users, 10).astype(np.int32)
            random_items = np.random.randint(0, n_items, 10).astype(np.int32)
            fm.predict(
                random_users,
                random_items,
                user_features=user_feats,
                item_features=item_feats,
            )
Beispiel #8
0
def test_predict(num_threads=2):
    """Compare ``predict`` output across equivalent ways of calling it.

    Per user, the baseline (array of repeated user ids, default settings) is
    compared with: a scalar user id; a multi-threaded call; and calls with
    ``precompute_representations=False`` using ``num_threads`` threads and
    a single thread. All comparisons use ``np.allclose``.
    """
    user_count, item_count = 10, 100

    empty_interactions = sp.coo_matrix((user_count, item_count), dtype=np.int32)
    fm = LightFM()
    fm.fit_partial(empty_interactions)

    for uid in range(user_count):
        baseline = fm.predict(np.repeat(uid, item_count), np.arange(item_count))
        scalar_uid = fm.predict(uid, np.arange(item_count))
        assert np.allclose(baseline, scalar_uid)

        parallel = fm.predict(np.repeat(uid, item_count),
                              np.arange(item_count),
                              num_threads=num_threads)
        assert np.allclose(parallel, baseline)

        parallel_no_precompute = fm.predict(np.repeat(uid, item_count),
                                            np.arange(item_count),
                                            num_threads=num_threads,
                                            precompute_representations=False)
        assert np.allclose(parallel, parallel_no_precompute)

        serial_no_precompute = fm.predict(np.repeat(uid, item_count),
                                          np.arange(item_count),
                                          num_threads=1,
                                          precompute_representations=False)
        assert np.allclose(parallel, serial_no_precompute)
Beispiel #9
0
def test_movielens_accuracy_fit():
    """Train with ``fit`` for 10 epochs and check ROC AUC floors.

    Uses the externally defined ``train`` and ``test`` interaction data.
    """
    estimator = LightFM()
    estimator.fit(train, epochs=10)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.76
Beispiel #10
0
def test_movielens_accuracy():
    """Train with ``fit_partial`` for 10 epochs (seeded) and check ROC AUC floors.

    Uses the externally defined ``train`` and ``test`` interaction data.
    """
    estimator = LightFM(random_state=SEED)
    estimator.fit_partial(train, epochs=10)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.76
Beispiel #11
0
def test_hogwild_accuracy():
    """Training and predicting with 2 threads should reach comparable accuracy.

    Uses the externally defined ``train`` and ``test`` interaction data.
    """
    thread_count = 2

    estimator = LightFM(random_state=SEED)
    estimator.fit_partial(train, epochs=10, num_threads=thread_count)

    predicted_train = estimator.predict(train.row, train.col, num_threads=thread_count)
    predicted_test = estimator.predict(test.row, test.col, num_threads=thread_count)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.76
Beispiel #12
0
def test_movielens_excessive_regularization():
    """With very strong regularization the model should perform poorly.

    Both the train and the test ROC AUC are required to stay below 0.6.
    """
    heavy_penalty = 1.0

    estimator = LightFM(no_components=10,
                        item_alpha=heavy_penalty,
                        user_alpha=heavy_penalty)
    estimator.fit_partial(train, epochs=10)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc < 0.6
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc < 0.6
Beispiel #13
0
def test_regularization():
    """A lightly regularized 50-component model should still score well.

    Trains for 30 epochs and checks ROC AUC floors on train and test.
    """
    light_penalty = 0.0001

    estimator = LightFM(no_components=50,
                        item_alpha=light_penalty,
                        user_alpha=light_penalty)
    estimator.fit_partial(train, epochs=30)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.80
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.75
Beispiel #14
0
def test_movielens_accuracy_pickle():
    """A fitted model must keep its accuracy after a pickle round trip."""
    fitted = LightFM(random_state=SEED)
    fitted.fit(train, epochs=10)

    # Serialize and restore; all predictions below come from the restored copy.
    serialized = pickle.dumps(fitted)
    restored = pickle.loads(serialized)

    predicted_train = restored.predict(train.row, train.col)
    predicted_test = restored.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.76
Beispiel #15
0
def test_movielens_accuracy_resume():
    """Ten single-epoch ``fit_partial`` calls should reach the usual accuracy."""
    estimator = LightFM()

    rounds = 10
    for _round in range(rounds):
        estimator.fit_partial(train, epochs=1)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.76
Beispiel #16
0
def test_zeros_negative_accuracy():
    """Accuracy should be unchanged when negatives are encoded as 0, not -1.

    Note that this rewrites the externally defined ``train`` data in place.
    """
    negative_mask = train.data == -1
    train.data[negative_mask] = 0

    estimator = LightFM(random_state=SEED)
    estimator.fit_partial(train, epochs=10)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, predicted_train)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, predicted_test)
    assert test_auc > 0.76
Beispiel #17
0
def test_predict():
    """Scalar and array user ids must yield the same scores for every user."""
    n_users, n_items = 10, 100

    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)
    estimator = LightFM()
    estimator.fit_partial(interactions)

    for user in range(n_users):
        from_array = estimator.predict(np.repeat(user, n_items),
                                       np.arange(n_items))
        from_scalar = estimator.predict(user, np.arange(n_items))
        assert np.allclose(from_array, from_scalar)
Beispiel #18
0
def test_movielens_accuracy_fit():
    """Seeded 10-epoch ``fit`` should clear the train and test ROC AUC floors."""
    fm = LightFM(random_state=SEED)
    fm.fit(train, epochs=10)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.84
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.76
Beispiel #19
0
def test_movielens_accuracy():
    """10 epochs of ``fit_partial`` should clear the train and test ROC AUC floors."""
    fm = LightFM()
    fm.fit_partial(train, epochs=10)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.84
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.76
Beispiel #20
0
def test_overfitting():
    """A 50-component model trained for 30 epochs should massively overfit.

    Expected outcome: near-perfect train ROC AUC and a test ROC AUC below 0.75.
    """
    estimator = LightFM(no_components=50, random_state=SEED)
    estimator.fit_partial(train, epochs=30)

    predicted_train = estimator.predict(train.row, train.col)
    predicted_test = estimator.predict(test.row, test.col)

    # Both metrics are computed before either assertion runs.
    train_auc = roc_auc_score(train.data, predicted_train)
    test_auc = roc_auc_score(test.data, predicted_test)

    assert train_auc > 0.99
    assert test_auc < 0.75
Beispiel #21
0
def test_predict():
    """``predict`` must treat a scalar user id like an array of that id."""
    user_count, item_count = 10, 100

    empty_interactions = sp.coo_matrix((user_count, item_count), dtype=np.int32)
    fm = LightFM()
    fm.fit_partial(empty_interactions)

    for uid in range(user_count):
        array_scores = fm.predict(np.repeat(uid, item_count), np.arange(item_count))
        scalar_scores = fm.predict(uid, np.arange(item_count))
        assert np.allclose(array_scores, scalar_scores)
class LightFM_Recommender:
    """Thin wrapper around a LightFM model for top-N item recommendation.

    The wrapper keeps the training interactions (``train``) and an item
    feature matrix (``icm``), fits the model on them, and ranks all items
    for a user after masking the items found in that user's training row.
    """

    # Score written over already-seen items so they sort last.
    _SEEN_SCORE = -1000000

    def __init__(self, train, icm, no_components=10, k=5, n=10, item_alpha=0.0, user_alpha=0.0, loss='warp',
                 learning_rate=0.05, rho=0.95, epsilon=1e-6, max_sampled=10, learning_schedule='adagrad'):
        """Store the data and build the (not yet fitted) LightFM model.

        ``train`` is read through ``shape``, ``indptr`` and ``indices`` by the
        other methods (CSR-style access). ``icm`` is passed to LightFM as
        ``item_features``. All remaining arguments are forwarded to the
        ``LightFM`` constructor unchanged.
        """
        self.train = train
        self.icm = icm
        self.model = LightFM(loss=loss, k=k, n=n, item_alpha=item_alpha, user_alpha=user_alpha,
                             no_components=no_components, learning_rate=learning_rate, rho=rho,
                             epsilon=epsilon, max_sampled=max_sampled, learning_schedule=learning_schedule)

        # One id per column of ``train``; reused for every prediction call.
        self.pid_array = np.arange(train.shape[1], dtype=np.int32)

    def fit(self, epochs):
        """Fit the model on ``self.train`` with ``self.icm`` as item features."""
        self.model.fit(epochs=epochs, interactions=self.train, item_features=self.icm, verbose=True)

    def filter_seen(self, user_id, scores):
        """Overwrite the scores of items in the user's training row.

        ``scores`` is modified in place and also returned.
        """
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])
        user_profile = self.train.indices[start_pos:end_pos]

        scores[user_profile] = self._SEEN_SCORE
        return scores

    def scores(self, user_id):
        """Return the model scores of every item for ``user_id``."""
        return self.model.predict(user_id, self.pid_array, item_features=self.icm)

    def recommend(self, user_id, at=10):
        """Return the ids of the ``at`` best-scoring unseen items, best first."""
        scores = self.filter_seen(user_id, self.scores(user_id))

        # argsort is ascending, so reverse it to rank from highest score down.
        ranking = scores.argsort()[::-1]
        return ranking[:at]

    def recommendALL(self, userList, at=10):
        """Recommend for every entry of ``userList``.

        Each entry must be a sequence whose first element is the user id.
        The entry itself is concatenated with its recommendations to form
        one output row.

        Returns an empty array for an empty ``userList``, a 1-D row for a
        single entry, and a 2-D array (one row per entry) otherwise.
        """
        rows = [
            np.concatenate((entry, self.recommend(entry[0], at)))
            for entry in userList
        ]
        if not rows:
            return np.array([])
        if len(rows) == 1:
            # A single result is returned as a bare 1-D row, not a 1-row matrix.
            return rows[0]
        # Stack once at the end instead of re-stacking on every iteration.
        return np.vstack(rows)
Beispiel #23
0
def test_movielens_accuracy_resume():
    """Resuming training epoch by epoch should match the usual accuracy.

    A seeded model is trained through ten separate single-epoch
    ``fit_partial`` calls before the ROC AUC floors are checked.
    """
    fm = LightFM(random_state=SEED)

    epochs_done = 0
    while epochs_done < 10:
        fm.fit_partial(train, epochs=1)
        epochs_done += 1

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.84
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.76
Beispiel #24
0
def test_predict_not_fitted():
    """Every prediction/representation accessor must raise before fitting."""
    unfitted = LightFM()

    # Checked in this order: predict, predict_rank, user and item representations.
    calls_requiring_fit = (
        lambda: unfitted.predict(np.arange(10), np.arange(10)),
        lambda: unfitted.predict_rank(1),
        lambda: unfitted.get_user_representations(),
        lambda: unfitted.get_item_representations(),
    )

    for call in calls_requiring_fit:
        with pytest.raises(ValueError):
            call()
Beispiel #25
0
def test_movielens_accuracy_pickle():
    """Accuracy must survive serializing and restoring the fitted model."""
    original = LightFM()
    original.fit(train, epochs=10)

    # Everything below uses the copy rebuilt from the pickled bytes.
    payload = pickle.dumps(original)
    fm = pickle.loads(payload)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.84
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.76
Beispiel #26
0
def peuimportelenom():
    """Handle the artist-selection form and render ten suggested artists.

    The artists chosen in the ``dblst_artists`` form field are turned into a
    new profile row (median scaled play count per selected artist). The row
    is appended to ``ratings``, a WARP LightFM model is trained on a random
    train split of the resulting matrix, and the ten best-scoring artist
    names for the new profile are passed to ``page.html``.

    Relies on names defined outside this function: ``request``, ``ap``,
    ``vecteur``, ``ratings``, ``artist_names`` and ``render_template``.
    """
    noms = request.form.getlist("dblst_artists")

    # Work on a private float copy: writing into the shared ``vecteur``
    # would leak one request's selection into every later request.
    profil = np.array(vecteur, dtype=float)

    for el in noms:
        artiste = ap[ap.name == el]
        if artiste.empty:
            # Unknown name submitted by the client: ignore it instead of
            # failing with an IndexError on the empty selection.
            continue
        lind = list(artiste.artistID)[0] - 1  # artist ids are 1-based here
        profil[lind] = artiste.playCountScaled.median()

    # Build the interaction matrix; the new profile is its last row.
    X = np.vstack((ratings, profil))
    n_users, n_items = X.shape

    Xcoo = csr_matrix(X).tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, _weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, _test = random_train_test_split(interactions)

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # Score every item for the appended profile (last row). ``predict``
    # expects item indices, not the rating vector itself.
    scores = model.predict(n_users - 1, np.arange(n_items))
    top_items = ap["name"].unique()[np.argsort(-scores)]

    sugg = top_items[:10]

    return render_template("page.html", artist_names=artist_names, noms=noms, sugg=sugg)
Beispiel #27
0
def recommendSOAnswers(i_train, i_test, i_user_graph, i_item_graph, n_users,
                       n_items, n_tags):
    """Fit a logistic-loss LightFM model and print its ranking precision.

    Interactions, user features, item features and the test triples are
    loaded through the ``load*`` helpers. The model is trained for 5 epochs
    on 10 threads, the test pairs are scored, and the raw scores, the
    lengths of the wrapped score/label arrays and the label ranking average
    precision are printed. Nothing is returned.
    """
    interactions = loadInteractions(i_train, n_users, n_items)
    user_feats = loadUserFeatures(i_user_graph, n_users)
    item_feats = loadItemFeatures(i_item_graph, n_items, n_tags)
    test_users, test_items, labels = loadTest(i_test)

    thread_count = 10
    estimator = LightFM(learning_rate=0.05, loss='logistic')
    estimator.fit(interactions,
                  user_features=user_feats,
                  item_features=item_feats,
                  epochs=5,
                  verbose=True,
                  num_threads=thread_count)

    predicted = estimator.predict(test_users,
                                  test_items,
                                  item_features=item_feats,
                                  user_features=user_feats,
                                  num_threads=thread_count)

    # Wrap scores and labels in a leading axis of length 1, i.e. treat the
    # whole test set as a single ranking sample.
    y_score = np.array([predicted])
    y_true = np.array([labels])

    print(predicted)
    print(len(y_score))
    print(len(y_true))

    print(label_ranking_average_precision_score(y_true, y_score))
Beispiel #28
0
class Warp(RecSys):
    """WARP-loss LightFM recommender that returns a sparse score matrix."""

    def __init__(self,
                 NUM_TRACKS,
                 no_components=10,
                 learning_rate=0.05,
                 epochs=1,
                 top_k=1000):
        """Configure the recommender and build the LightFM model.

        Args:
            NUM_TRACKS: number of track ids to score (ids ``0 .. NUM_TRACKS - 1``).
            no_components: forwarded to ``LightFM``.
            learning_rate: forwarded to ``LightFM``.
            epochs: number of epochs used by ``get_scores`` when fitting.
            top_k: number of highest scores kept per target; all other
                scores are set to 0. Must be at least 1.

        Raises:
            ValueError: if ``top_k`` is smaller than 1.
        """
        super().__init__()
        if top_k < 1:
            raise ValueError("top_k must be at least 1")

        self.NUM_TRACKS = NUM_TRACKS
        self.no_components = no_components
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.top_k = top_k

        self.model = LightFM(no_components=self.no_components,
                             learning_schedule='adagrad',
                             loss='warp',
                             learning_rate=self.learning_rate)

    def get_scores(self, dataset, targets):
        """Fit on ``dataset`` and return the top-k scores for each target.

        Args:
            dataset: interaction matrix passed to ``LightFM.fit``; its column
                count sets the width of the result.
            targets: sized iterable of user ids to score.

        Returns:
            A float32 CSR matrix with one row per target. Only the ``top_k``
            largest scores of each row are kept; the rest are 0.
        """
        self.model.fit(interactions=dataset,
                       epochs=self.epochs,
                       num_threads=mp.cpu_count(),
                       verbose=True)

        scores = np.empty((len(targets), dataset.shape[1]), dtype=np.float32)
        tracks = np.arange(self.NUM_TRACKS, dtype=np.int32)

        for i, target in enumerate(targets):
            new_row = self.model.predict(target, tracks)
            # argpartition raises when kth is out of bounds, so only trim
            # rows that actually hold more than top_k scores.
            if new_row.shape[0] > self.top_k:
                discard = np.argpartition(new_row, -self.top_k)[:-self.top_k]
                new_row[discard] = 0
            scores[i] = new_row

        return sparse.csr_matrix(scores, dtype=np.float32)
Beispiel #29
0
def get_recommendations(users_ids):
    """Print the top-3 artist recommendations per user for several losses.

    For each of the 'warp', 'bpr' and 'warp-kos' losses a LightFM model is
    trained for 10 epochs on ``full_data['coo_matrix']``. Every user in
    ``users_ids`` is scored against all items and the names of the three
    best-scoring artists are printed. Nothing is returned.

    Relies on the externally defined ``full_data`` mapping (keys
    ``'coo_matrix'`` and ``'artists'``).
    """
    losses = ['warp', 'bpr', 'warp-kos']

    n_items = full_data['coo_matrix'].shape[1]
    all_items = np.arange(n_items)

    # Index the artist records by id once, instead of scanning every artist
    # for every recommended item. Records sharing an id stay in their
    # original iteration order.
    artists_by_id = {}
    for dict_artist in full_data['artists'].values():
        artists_by_id.setdefault(dict_artist['id'], []).append(dict_artist)

    for loss in losses:
        model = LightFM(loss=loss)

        # The dataset is presented to the algorithm ``epochs`` times.
        # num_threads should not exceed the number of physical cores.
        model.fit(full_data['coo_matrix'], epochs=10, num_threads=2)

        print('********* With {} algorithm *********\n'.format(loss))
        for user in users_ids:
            scores = model.predict(user, all_items)
            top_scores = np.argsort(-scores)[:3]

            print('Recommendations for user {}:'.format(user))

            for x in top_scores.tolist():
                for dict_artist in artists_by_id.get(int(x), ()):
                    print('   - {}'.format(dict_artist['name']))

            print('\n')  # blank lines between users for readability
Beispiel #30
0
def sample_recommendation(
        model: LightFM,
        dataset: pd.DataFrame,
        raw_data: pd.DataFrame,
        item_features,
        user_ids,
        recommendations_num: int = 10) -> Tuple[List[str], List[str]]:
    """Return recommended and already-known product codes for the first user.

    Only the first entry of ``user_ids`` is processed, because the function
    returns from inside the loop. It returns ``None`` when ``user_ids`` is
    empty. Scores are computed only for the internal item ids
    ``0 .. recommendations_num - 1``.

    Args:
        model: LightFM model used for scoring.
        dataset: object whose ``mapping()[2]`` maps product codes to
            internal item ids. NOTE(review): annotated as ``pd.DataFrame``
            but ``mapping()`` is called on it - confirm the real type.
        raw_data: frame with ``cac`` and ``product_code`` columns.
        item_features: forwarded to ``model.predict``.
        user_ids: iterable of user ids; ``'cac_' + str(user_id)`` is matched
            against ``raw_data.cac``.
        recommendations_num: number of leading item ids that are scored.

    Returns:
        ``(recommended, known)``: product codes ordered by descending score
        with known items removed, and up to five product codes the user
        already has in ``raw_data``.
    """
    for user_id in user_ids:
        # Frame of product codes indexed by internal item id.
        item_id_by_code = dataset.mapping()[2]
        items = pd.DataFrame(list(item_id_by_code.keys()),
                             index=list(item_id_by_code.values()),
                             columns=['product_code'])

        # Up to five products this user already has.
        known_items = raw_data[raw_data.cac == 'cac_' +
                               str(user_id)]['product_code'][:5].values
        known_item_ids = set(
            items[items['product_code'].isin(known_items)].index.tolist())

        # Score only the current user: passing the whole ``user_ids``
        # collection would make the user and item arrays differ in length.
        scores = model.predict(user_id,
                               np.arange(recommendations_num),
                               item_features=item_features)

        # Best-first item ids, without the ones the user already has.
        ranked_ids = [x for x in np.argsort(-scores) if x not in known_item_ids]
        top_items = items[~items['product_code'].isin(known_items)].loc[ranked_ids]

        return top_items['product_code'].values.tolist(), known_items.tolist()
def test_get_representations():
    """Predictions must equal the dot product of the exposed representations.

    Checked twice: without feature matrices, and with identity-plus-random
    item and user feature matrices. In both cases
    ``user_latent . item_latent + user_bias + item_bias`` has to match
    ``predict`` to within 1e-6, and the latent arrays must be float32.
    """
    estimator = LightFM(random_state=SEED)
    estimator.fit_partial(train, epochs=10)

    num_users, num_items = train.shape

    # Item features are built before user features in the second case.
    feature_cases = (
        (None, None),
        (sp.identity(num_items) + sp.random(num_items, num_items),
         sp.identity(num_users) + sp.random(num_users, num_users)),
    )

    for item_feats, user_feats in feature_cases:
        expected = estimator.predict(test.row,
                                     test.col,
                                     user_features=user_feats,
                                     item_features=item_feats)

        item_biases, item_latent = estimator.get_item_representations(item_feats)
        user_biases, user_latent = estimator.get_user_representations(user_feats)

        assert item_latent.dtype == np.float32
        assert user_latent.dtype == np.float32

        interaction_term = (user_latent[test.row] * item_latent[test.col]).sum(axis=1)
        rebuilt = interaction_term + user_biases[test.row] + item_biases[test.col]

        assert np.allclose(expected, rebuilt, atol=0.000001)
Beispiel #32
0
def func():
    """Build a user-by-film rating matrix, fit a BPR LightFM model, pickle it.

    One row is built per user and one column per film; a missing rating is
    stored as 0. The first 20 entries of every row are printed, a sample
    prediction is printed after training, and the fitted model is written to
    ``model.p`` in the current working directory.

    NOTE(review): one database query is issued per (user, film) pair, which
    grows quickly with the table sizes.
    """
    films = list(Film.objects.all())
    users = list(User.objects.all())

    interactions_matrix = []

    for user in users:
        rating_of_user = []
        for film in films:
            rating = Rating.objects.filter(user_id=user.id,
                                           movie_id=film.movie_id)
            # Use the first matching rating, or 0 when the user has none.
            rating_of_user.append(int(rating[0].rating) if rating else 0)
        print(rating_of_user[:20])
        interactions_matrix.append(rating_of_user)

    interactions_matrix = coo_matrix(interactions_matrix)

    model = LightFM(learning_rate=0.02, loss='bpr')
    model.fit(interactions_matrix, epochs=10)
    print(model.predict(np.int32([4, 5, 6]), np.int32([0, 1, 2])))

    # Close the file deterministically instead of leaking the handle.
    with open("model.p", "wb") as model_file:
        pickle.dump(model, model_file)
Beispiel #33
0
def test_movielens_excessive_regularization():
    """Heavy regularization should cap both ROC AUC values below 0.6."""
    fm = LightFM(no_components=10, item_alpha=1.0, user_alpha=1.0)
    fm.fit_partial(train, epochs=10)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_ceiling = 0.6
    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train < auc_ceiling
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test < auc_ceiling
Beispiel #34
0
def test_zeros_negative_accuracy():
    """Encoding negative interactions as 0 should give the same accuracy.

    The externally defined ``train`` data is rewritten in place (-1 -> 0)
    before the model is trained.
    """
    is_negative = train.data == -1
    train.data[is_negative] = 0

    fm = LightFM()
    fm.fit_partial(train, epochs=10)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.84
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.76
Beispiel #35
0
def test_regularization():
    """Mild regularization on a 50-component model should keep accuracy high."""
    fm = LightFM(no_components=50, item_alpha=0.0001, user_alpha=0.0001)
    fm.fit_partial(train, epochs=30)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.80
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.75
Beispiel #36
0
def test_overfitting():
    """An unregularized 50-component model should massively overfit.

    After 30 epochs the train ROC AUC must exceed 0.99 while the test ROC
    AUC stays below 0.75.
    """
    fm = LightFM(no_components=50)
    fm.fit_partial(train, epochs=30)

    scores_on_train = fm.predict(train.row, train.col)
    scores_on_test = fm.predict(test.row, test.col)

    # Compute both metrics first, then assert on them.
    auc_train = roc_auc_score(train.data, scores_on_train)
    auc_test = roc_auc_score(test.data, scores_on_test)

    assert auc_train > 0.99
    assert auc_test < 0.75
def pred_i(df, user_id):
    """
    Rank all workouts for one external user id with a freshly fitted model.

    A WARP LightFM model is fitted on ``df['all_ui_matrix']`` with default
    ``fit`` settings. Every internal workout index is then scored for the
    user and the result is sorted by descending score.

    Returns a pair ``(external_workout_ids, scores)``, both ordered from the
    highest to the lowest score.

    Note: this function is deployed to web application
    """
    estimator = LightFM(loss='warp')
    estimator.fit(df['all_ui_matrix'])

    # One internal index per distinct workout id in the interaction table.
    workout_count = df['user_item_interactions']['workout_id'].nunique()
    internal_workouts = np.asarray(list(range(workout_count)))

    # Translate the external user id, then score every workout for that user.
    internal_user = get_internal_user_id(df['user_map'], user_id)
    raw_scores = estimator.predict(internal_user, internal_workouts)

    # Negate so that argsort yields descending order.
    order = np.argsort(-raw_scores)
    ordered_scores = raw_scores[order]

    # Map the ranked internal indices back to external workout ids.
    ordered_external_ids = []
    for internal_index in order:
        ordered_external_ids.append(
            get_external_workout_id(df['item_map'], internal_index))

    return ordered_external_ids, ordered_scores
Beispiel #38
0
def test_zero_weights_accuracy():
    """With all sample weights at zero, accuracy should be no better than random.

    Checked for the 'logistic', 'bpr' and 'warp' losses: both ROC AUC values
    must lie strictly between 0.45 and 0.55.
    """
    zero_weights = train.copy()
    zero_weights.data = np.zeros(train.getnnz(), dtype=np.float32)

    lower, upper = 0.45, 0.55

    for loss_name in ('logistic', 'bpr', 'warp'):
        estimator = LightFM(loss=loss_name, random_state=SEED)
        estimator.fit_partial(train, sample_weight=zero_weights, epochs=10)

        predicted_train = estimator.predict(train.row, train.col)
        predicted_test = estimator.predict(test.row, test.col)

        train_auc = roc_auc_score(train.data, predicted_train)
        assert lower < train_auc < upper
        test_auc = roc_auc_score(test.data, predicted_test)
        assert lower < test_auc < upper
Beispiel #39
0
def fit_model_and_create_predictions():
    """Fit a WARP model on the user/category matrix and store per-user predictions.

    For every user returned by ``get_users()``, the categories reported by
    ``u_never_bought_cats`` are scored, reduced with ``get_top_100`` and
    handed to ``save_user_prediction``.
    """
    estimator = LightFM(loss='warp')
    all_users = get_users()
    user_category_matrix = create_item_matrix(all_users)
    estimator.fit(user_category_matrix, epochs=30, num_threads=2)

    for current_user in all_users:
        candidate_categories = u_never_bought_cats(current_user, user_category_matrix)
        category_scores = estimator.predict(current_user, candidate_categories)
        best_categories = get_top_100(category_scores)
        save_user_prediction(best_categories)
Beispiel #40
0
def get_recommendations(user_id, artist_name, n_items, X):
    """Return the ten best-scoring artist names for ``user_id``.

    A BPR LightFM model (seed 42) is trained on ``X`` for 10 epochs with two
    threads. Items ``0 .. n_items - 1`` are then scored for the user and
    ``artist_name`` is indexed by the items sorted from best to worst.
    """
    estimator = LightFM(learning_rate=0.05, loss='bpr', random_state=42)
    estimator.fit(X, epochs=10, num_threads=2)

    item_scores = estimator.predict(user_id, np.arange(n_items))

    # Negate the scores so that argsort orders from highest to lowest.
    best_first = np.argsort(-item_scores)
    ranked_names = artist_name[best_first]
    return ranked_names[:10]
Beispiel #41
0
def test_hogwild_accuracy():
    """Two-thread training and prediction should reach comparable accuracy."""
    workers = 2

    fm = LightFM()
    fm.fit_partial(train, epochs=10, num_threads=workers)

    scores_on_train = fm.predict(train.row, train.col, num_threads=workers)
    scores_on_test = fm.predict(test.row, test.col, num_threads=workers)

    auc_train = roc_auc_score(train.data, scores_on_train)
    assert auc_train > 0.84
    auc_test = roc_auc_score(test.data, scores_on_test)
    assert auc_test > 0.76
Beispiel #42
0
class LightFMRecommender(Recommender):
    """Pure collaborative-filtering recommender backed by a LightFM WARP model.

    Usage is two-step: ``fit`` only records the hyper-parameters, ``train``
    builds and fits the actual model on ``self.URM_train``.
    ``self.URM_train`` and ``self.n_tracks`` are not set in this class;
    presumably the ``Recommender`` base class provides them.
    """

    N_CONFIG = 0

    def __init__(self,
                 train,
                 test,
                 validation,
                 targets,
                 subfolder="../",
                 log_filename='lightfmcf_config.txt'):
        """Forward all arguments to the base class and set the config label."""
        super(LightFMRecommender, self).__init__(
            train, test, validation, targets, subfolder, log_filename)
        self.configuration_txt = "PURE LIGHTFM COLLABORATIVE FILTERING"

    def fit(self,
            item_alpha=1e-5,
            user_alpha=1e-4,
            learning_schedule='adadelta',
            num_components=250,
            epochs=30,
            threads=2):
        """Store the hyper-parameters used later by ``train``.

        No model is built or trained here.
        """
        self.item_alpha, self.user_alpha = item_alpha, user_alpha
        self.learning_schedule = learning_schedule
        self.num_components = num_components
        self.epochs, self.threads = epochs, threads

    def train(self, verbose=True):
        """Build a WARP model from the stored hyper-parameters and fit it.

        When ``verbose`` is true, a start message and the elapsed
        wall-clock time are printed.
        """
        start_time = time.time()

        if verbose:
            print("LightFM training started!")

        hyper_params = {
            'item_alpha': self.item_alpha,
            'user_alpha': self.user_alpha,
            'learning_schedule': self.learning_schedule,
            'no_components': self.num_components,
        }
        self.model = LightFM(loss='warp', **hyper_params)

        # Fit for the configured number of epochs; the result of ``fit``
        # is stored back on the instance.
        self.model = self.model.fit(self.URM_train,
                                    epochs=self.epochs,
                                    num_threads=self.threads)

        if verbose:
            elapsed = time.time() - start_time
            print("LightFM training model fitted in {:.2f} seconds".format(
                elapsed))

    def compute_predicted_ratings(self, playlist_id):
        """Score every track id ``0 .. self.n_tracks - 1`` for one playlist."""
        all_tracks = np.arange(self.n_tracks)
        return self.model.predict(user_ids=playlist_id,
                                  item_ids=all_tracks,
                                  item_features=None,
                                  user_features=None,
                                  num_threads=self.threads)
Beispiel #43
0
def fit_lightfm_model():
    """Fit a LightFM WARP model and score every test coupon for every user.

    The training interactions and the user / item feature matrices are read
    from Matrix Market files under ``../Data/Data_translated``.  After
    fitting, each user's scores over the test coupons are min-max scaled
    to [0, 1] (for blending later on) and the result is pickled to
    ``../Data/Data_translated/d_pred_lightfm.pickle``.

    Returns:
        d_user_pred: dict mapping a user id to the scaled scores of the
            coupons, in the order of ``list_coupon``.
        list_user: array of user ids (``USER_ID_hash`` column).
        list_coupon: array of test coupon ids (``COUPON_ID_hash`` column).
    """

    # Load data
    Mui_train = spi.mmread("../Data/Data_translated/biclass_user_item_train_mtrx.mtx")
    uf        = spi.mmread("../Data/Data_translated/user_feat_mtrx.mtx")
    itrf      = spi.mmread("../Data/Data_translated/train_item_feat_mtrx.mtx")
    itef      = spi.mmread("../Data/Data_translated/test_item_feat_mtrx.mtx")

    # Print shapes as a check (single parenthesised argument, so this is
    # valid on both Python 2 and Python 3)
    print("user_features shape: %s,\nitem train features shape: %s,\nitem test features shape: %s"   % (uf.shape, itrf.shape, itef.shape))

    # Load test coupon and user lists
    cplte       = pd.read_csv("../Data/Data_translated/coupon_list_test_translated.csv")
    ulist       = pd.read_csv("../Data/Data_translated/user_list_translated.csv")
    list_coupon = cplte["COUPON_ID_hash"].values
    list_user   = ulist["USER_ID_hash"].values

    # Build model
    no_comp, lr, ep = 10, 0.01, 5
    model = LightFM(no_components=no_comp, learning_rate=lr, loss='warp')
    model.fit_partial(Mui_train, user_features = uf, item_features = itrf, epochs = ep, num_threads = 4, verbose = True)

    no_users  = len(list_user)
    no_items  = len(list_coupon)
    pid_array = np.arange(no_items, dtype=np.int32)

    # Create and initialise dict to store predictions
    d_user_pred = {user: [] for user in list_user}

    # Loop over users and compute predictions
    for user_id in range(no_users):
        sys.stdout.write("\rProcessing user " + str(user_id)+"/ "+str(no_users))
        sys.stdout.flush()
        uid_array   = np.full(no_items, user_id, dtype=np.int32)
        predictions = model.predict(uid_array, pid_array,user_features = uf, item_features = itef, num_threads=4)
        user        = str(list_user[user_id])
        # Apply MinMaxScaler for blending later on.  The scaler expects a 2-D
        # (n_samples, n_features) array, so scale the scores as one column
        # and flatten the result again.
        scaled      = MinMaxScaler().fit_transform(np.reshape(predictions, (-1, 1)))
        d_user_pred[user] = np.ravel(scaled)

    # Pickle the predictions for future use.  Binary pickle protocols need
    # a file opened in binary mode.
    d_pred = {"list_coupon" : list_coupon.tolist(), "d_user_pred" : d_user_pred}
    with open("../Data/Data_translated/d_pred_lightfm.pickle", "wb") as f:
        pickle.dump(d_pred, f, protocol = pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
Beispiel #44
0
class RecSys(object):
    """
    Recommender built on a LightFM BPR model.

    Arbitrary user and item labels are mapped to consecutive integer ids
    (in order of first appearance); the mappings persist for the lifetime
    of the instance.
    """
    def __init__(self,
                 user_column_name: str = "user",
                 item_column_name: str = "item",
                 rating_column_name: str = "rating"):
        """Remember the DataFrame column names and create an unfitted model."""
        self.__user = user_column_name
        self.__item = item_column_name
        self.__rating = rating_column_name

        self.__model = LightFM(learning_rate=0.05, loss='bpr')

        # label -> integer id, plus the next id to hand out
        self.__users = {}
        self.__current_user_num = 0

        self.__items = {}
        self.__current_item_num = 0

    def get_user(self, user):
        """Return the integer id of ``user``, assigning a new one if unseen."""
        if user not in self.__users:
            self.__users[user] = self.__current_user_num
            self.__current_user_num += 1

        return self.__users[user]

    def get_item(self, item):
        """Return the integer id of ``item``, assigning a new one if unseen."""
        if item not in self.__items:
            self.__items[item] = self.__current_item_num
            self.__current_item_num += 1

        return self.__items[item]

    def __df_to_sparsematrix(
            self, df: pandas.DataFrame) -> scipy.sparse.coo_matrix:
        """Convert a (user, item, rating) frame into an int32 COO matrix.

        When a (user, item) pair occurs more than once, the last rating
        wins.  Ratings are cast to int32.
        """
        # Resolve every label first: this registers unseen users/items so
        # that the id counters cover every index written below.  Columns are
        # addressed by name rather than by position within a row.
        triples = [
            (self.get_user(user), self.get_item(item), rating)
            for user, item, rating in zip(df[self.__user],
                                          df[self.__item],
                                          df[self.__rating])
        ]

        # Size the matrix from the id counters, not from the number of
        # distinct labels in ``df``: ids handed out earlier (a previous
        # ``fit`` or ``predict`` call) may exceed the latter.
        sparsematrix = scipy.sparse.dok_matrix(
            (self.__current_user_num, self.__current_item_num),
            dtype=numpy.int32)

        for user_id, item_id, rating in triples:
            sparsematrix[user_id, item_id] = rating

        return sparsematrix.tocoo(copy=True)

    def fit(self, df: pandas.DataFrame):
        """Fit the model for 20 epochs on the interactions in ``df``."""
        self.__model.fit(self.__df_to_sparsematrix(df), epochs=20)

    def predict(self,
                users: numpy.ndarray,
                items: numpy.ndarray,
                num_threads: int = 1) -> numpy.ndarray:
        """Score (user, item) label pairs with the fitted model.

        ``users`` and ``items`` are iterables of labels, paired
        element-wise.  Labels not seen before are assigned fresh ids by
        ``get_user`` / ``get_item``.
        NOTE(review): such ids were not part of the fitted interaction
        matrix -- confirm how the model handles them.
        """
        return self.__model.predict(
            numpy.array([self.get_user(x) for x in users]),
            numpy.array([self.get_item(x) for x in items]),
            num_threads=num_threads)
Beispiel #45
0
class LightFMRecommender(object):
    """Top-N recommender over a user-rating matrix (URM) using LightFM.

    The ``urm`` object passed to ``fit`` must provide ``shape``,
    ``getCSR()`` and ``extractTracksFromPlaylist(user_id)``.
    """

    def __init__(self,
                 n_comp=30,
                 loss='warp-kos',
                 learning='adagrad',
                 alpha=1e-3):
        """Create the underlying LightFM model.

        Args:
            n_comp: number of latent components.
            loss: LightFM loss name.
            learning: LightFM learning schedule name.
            alpha: regularisation strength applied to both user and item
                features.
        """
        # The constructor arguments are honoured; the defaults are the
        # values that used to be hard-coded here.
        self.model = LightFM(no_components=n_comp,
                             loss=loss,
                             learning_schedule=learning,
                             user_alpha=alpha,
                             item_alpha=alpha)

    def fit(self, urm, epochs=100):
        """Train for ``epochs`` single-epoch ``fit_partial`` passes on ``urm``."""
        self.urm = urm
        self.n_tracks = urm.shape[1]
        # Fetch the CSR view once instead of once per epoch.
        interactions = urm.getCSR()
        for _ in range(epochs):
            self.model.fit_partial(interactions, epochs=1)

    def get_pred_row(self, user_id):
        """Return the scores of all ``self.n_tracks`` items for ``user_id``."""
        return self.model.predict(user_id, np.arange(self.n_tracks))

    def s_recommend(self, user_id, nRec=10):
        """Return up to ``nRec`` unseen item ids for one user, best first."""
        scores = self.get_pred_row(user_id)
        top_items = np.argsort(-scores)

        recommended_items = self._filter_seen(user_id, top_items)
        return recommended_items[0:nRec]

    def _filter_seen(self, user_id, ranking):
        """Drop from ``ranking`` the items ``self.urm`` reports for ``user_id``.

        The relative order of the remaining items is preserved.
        ``assume_unique=True`` relies on neither ``ranking`` nor the
        extracted items containing duplicates.
        """
        seen = self.urm.extractTracksFromPlaylist(user_id)
        unseen_mask = np.isin(ranking, seen, assume_unique=True, invert=True)
        return ranking[unseen_mask]

    def m_recommend(self, target_ids, nRec=10):
        """Return one ``s_recommend`` result per id in ``target_ids``."""
        return [self.s_recommend(tid, nRec) for tid in target_ids]
Beispiel #46
0
def test_user_supplied_features_accuracy():
    """Explicit user and item feature matrices should not hurt accuracy.

    The model is fitted with the train feature matrices and evaluated with
    the train and the test feature matrices respectively.
    """
    model = LightFM()

    train_features = dict(user_features=train_user_features,
                          item_features=train_item_features)
    model.fit_partial(train, epochs=10, **train_features)
    train_predictions = model.predict(train.row, train.col, **train_features)

    test_features = dict(user_features=test_user_features,
                         item_features=test_item_features)
    test_predictions = model.predict(test.row, test.col, **test_features)

    train_auc = roc_auc_score(train.data, train_predictions)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, test_predictions)
    assert test_auc > 0.76
Beispiel #47
0
def test_movielens_genre_accuracy():
    """A model using only the non-id item metadata should reach a lower AUC bar.

    The metadata is loaded with ``use_item_ids=False`` and must have fewer
    feature columns than there are items.
    """
    item_features = movielens_data.get_movielens_item_metadata(
        use_item_ids=False)

    n_items, n_features = item_features.shape[0], item_features.shape[1]
    assert n_features < n_items

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=10)

    feature_kwargs = {'item_features': item_features}
    train_predictions = model.predict(train.row, train.col, **feature_kwargs)
    test_predictions = model.predict(test.row, test.col, **feature_kwargs)

    train_auc = roc_auc_score(train.data, train_predictions)
    assert train_auc > 0.75
    test_auc = roc_auc_score(test.data, test_predictions)
    assert test_auc > 0.69
Beispiel #48
0
def test_movielens_genre_accuracy():
    """A seeded model using genre features only should reach a lower AUC bar.

    The features are fetched with ``indicator_features=False`` and
    ``genre_features=True`` and must have fewer columns than there are
    items.
    """
    dataset = fetch_movielens(indicator_features=False,
                              genre_features=True)
    item_features = dataset['item_features']

    n_items, n_features = item_features.shape[0], item_features.shape[1]
    assert n_features < n_items

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=10)

    feature_kwargs = {'item_features': item_features}
    train_predictions = model.predict(train.row, train.col, **feature_kwargs)
    test_predictions = model.predict(test.row, test.col, **feature_kwargs)

    train_auc = roc_auc_score(train.data, train_predictions)
    assert train_auc > 0.75
    test_auc = roc_auc_score(test.data, test_predictions)
    assert test_auc > 0.69
Beispiel #49
0
def test_zero_weights_accuracy():
    """With all-zero sample weights the AUC should stay close to 0.5.

    Checked for the logistic, BPR and WARP losses on both the train and
    the test interactions.
    """
    # Same sparsity pattern as ``train``, but every stored weight is zero.
    weights = train.copy()
    weights.data = np.zeros(train.getnnz(), dtype=np.float32)

    lower, upper = 0.45, 0.55

    for loss in ('logistic', 'bpr', 'warp'):
        model = LightFM(loss=loss, random_state=SEED)
        model.fit_partial(train, sample_weight=weights, epochs=10)

        train_predictions = model.predict(train.row, train.col)
        test_predictions = model.predict(test.row, test.col)

        train_auc = roc_auc_score(train.data, train_predictions)
        assert lower < train_auc < upper
        test_auc = roc_auc_score(test.data, test_predictions)
        assert lower < test_auc < upper
Beispiel #50
0
def test_movielens_both_accuracy():
    """
    Accuracy with both genre metadata and item-specific
    features should be no worse than with just item-specific
    features (though more training may be necessary).
    """
    item_features = movielens_data.get_movielens_item_metadata(
        use_item_ids=True)

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=15)

    feature_kwargs = {'item_features': item_features}
    train_predictions = model.predict(train.row, train.col, **feature_kwargs)
    test_predictions = model.predict(test.row, test.col, **feature_kwargs)

    train_auc = roc_auc_score(train.data, train_predictions)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, test_predictions)
    assert test_auc > 0.75
Beispiel #51
0
def test_movielens_both_accuracy():
    """
    Accuracy with both genre metadata and item-specific
    features should be no worse than with just item-specific
    features (though more training may be necessary).
    """
    dataset = fetch_movielens(indicator_features=True,
                              genre_features=True)
    item_features = dataset['item_features']

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=15)

    feature_kwargs = {'item_features': item_features}
    train_predictions = model.predict(train.row, train.col, **feature_kwargs)
    test_predictions = model.predict(test.row, test.col, **feature_kwargs)

    train_auc = roc_auc_score(train.data, train_predictions)
    assert train_auc > 0.84
    test_auc = roc_auc_score(test.data, test_predictions)
    assert test_auc > 0.75
Beispiel #52
0
def test_predict_scores(num_threads=2):
    """Check ``predict_score`` / ``predict_rank`` for internal consistency.

    Verifies that:
    * ``predict_score`` on an all-ones mask equals per-user ``predict``;
    * the result does not depend on the thread count or on
      ``precompute_representations``;
    * ``predict_rank`` agrees with ranks derived from the scores;
    * scores are all zero when every pair is passed as a train interaction;
    * scores are all zero when embeddings and biases are zeroed;
    * a wrongly shaped input raises ``ValueError``.
    """

    no_users, no_items = (10, 100)

    # Random sparse interactions to fit on.
    train = sp.rand(no_users, no_items, format='csr')

    model = LightFM()
    model.fit_partial(train)

    # Compute scores and check if results equal to model.predict
    predict_input = sp.csr_matrix(np.ones((no_users, no_items)))
    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()
    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        score_slice = np.array(scores)[uid, :]
        assert np.array_equal(score_slice, scores_arr)

    # check if precompute and parallelization work correctly
    scores_serial = model.predict_score(predict_input,
                                        num_threads=1).todense()
    scores_no_prec = model.predict_score(predict_input,
                                         num_threads=num_threads,
                                         precompute_representations=False
                                         ).todense()
    scores_ser_no_prec = model.predict_score(predict_input,
                                             num_threads=1,
                                             precompute_representations=False
                                             ).todense()
    assert np.array_equal(scores, scores_serial)
    assert np.array_equal(scores, scores_no_prec)
    assert np.array_equal(scores, scores_ser_no_prec)

    # Compute ranks and compares with ranks computed from scores
    ranks = model.predict_rank(predict_input,
                               num_threads=num_threads).todense()

    def rank_scores(s):
        # ranks from scores as in http://stackoverflow.com/a/14672797/5251962
        # (only the inverse index of the unique values is needed)
        _, v = np.unique(s, return_inverse=True)
        return len(s) - 1 - (np.cumsum(np.bincount(v)) - 1)[v]

    check_ranks = np.apply_along_axis(rank_scores, 1, scores)
    assert np.array_equal(ranks, check_ranks)

    # Train set exclusions. All scores should be zero
    # if train interactions is dense.
    scores = model.predict_score(predict_input,
                                 train_interactions=predict_input).todense()
    assert np.all(scores == 0)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    assert np.all(scores.min(axis=1) == 0)
    assert np.all(scores.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_score(sp.csr_matrix((5, 5)), num_threads=num_threads)
def do_fiber_training(visualization=False):
    """Train (or load a cached) LightFM WARP model and print strong matches.

    If either cache file (``rc.RECOMMENDER_TRAINING`` /
    ``rc.RECOMMENDER_MODEL``) is missing, the data matrix is loaded from
    ``rc.YARN_DATA_MATRIX``, split row-wise into a train and a test half,
    a model is fitted on the train half, train/test AUC are printed and
    both the matrix and the model are pickled to the cache files.
    Otherwise matrix and model are loaded from the cache.

    Afterwards, for every row index of the matrix the normalised
    predictions are computed and every entry scoring above 0.9 is printed
    together with its translation.

    Args:
        visualization: when true, print the sizes of the train/test split
            (only on the training path).

    NOTE: the files read here are pickles; unpickling executes arbitrary
    code, so they must come from a trusted source.
    """

    if not os.path.isfile(rc.RECOMMENDER_TRAINING) or not os.path.isfile(rc.RECOMMENDER_MODEL):

        with open(rc.YARN_DATA_MATRIX, "rb") as matrix_file:
            yarn_data_matrix = pickle.load(matrix_file)

        # First half of the rows is used for training, second half for
        # testing; "> 0" binarises the interactions.
        split = int(len(yarn_data_matrix)*0.5)
        yarn_data_train = sps.coo_matrix(yarn_data_matrix[:split]) > 0
        yarn_data_test = sps.coo_matrix(yarn_data_matrix[split:]) > 0
        if visualization:
            print("%s %s %s" % (yarn_data_train.shape[0],
                                yarn_data_test.shape[0],
                                len(yarn_data_matrix)))

        # Taken from: https://github.com/lyst/lightfm/blob/master/examples/stackexchange/hybrid_crossvalidated.ipynb
        # Set the number of threads; you can increase this
        # if you have more physical cores available.
        NUM_THREADS = 2
        NUM_COMPONENTS = 30
        NUM_EPOCHS = 3
        ITEM_ALPHA = 1e-6

        # Let's fit a WARP model: these generally have the best performance.
        model = LightFM(loss='warp',
                        item_alpha=ITEM_ALPHA,
                        no_components=NUM_COMPONENTS)

        # Run NUM_EPOCHS epochs.
        model = model.fit(yarn_data_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

        # Compute and print the AUC score
        train_auc = auc_score(model, yarn_data_train, num_threads=NUM_THREADS).mean()
        print('Collaborative filtering train AUC: %s' % train_auc)

        # We pass in the train interactions to exclude them from predictions.
        # This is to simulate a recommender system where we do not
        # re-recommend things the user has already interacted with in the train
        # set.
        test_auc = auc_score(model, yarn_data_test, train_interactions=yarn_data_train, num_threads=NUM_THREADS).mean()
        print('Collaborative filtering test AUC: %s' % test_auc)

        # Cache matrix and model; "with" guarantees the files are flushed
        # and closed.
        with open(rc.RECOMMENDER_TRAINING, 'wb') as training_file:
            pickle.dump(yarn_data_matrix, training_file)
        with open(rc.RECOMMENDER_MODEL, 'wb') as model_file:
            pickle.dump(model, model_file)
    else:
        with open(rc.RECOMMENDER_TRAINING, 'rb') as training_file:
            yarn_data_matrix = pickle.load(training_file)
        with open(rc.RECOMMENDER_MODEL, 'rb') as model_file:
            model = pickle.load(model_file)

    with open(rc.YARN_TRANSLATION_DATA, 'rb') as translation_file:
        translation_dict = pickle.load(translation_file)

    print(len(yarn_data_matrix))
    for matrix_id in range(len(yarn_data_matrix)):
        print(matrix_id)
        # NOTE(review): the matrix row itself is passed as ``item_ids``;
        # since the result is later indexed by position, an index range may
        # have been intended -- confirm before relying on these scores.
        predictions = model.predict(matrix_id, yarn_data_matrix[matrix_id])
        matches = []
        predictions += abs(np.min(predictions)) # make non-negative
        _max = np.max(predictions) # find max for normalization
        if _max > 0:
            # Normalize predictions; skipped when every score is zero to
            # avoid dividing by zero (nothing can exceed 0.9 then anyway).
            predictions /= _max
        for index, score in enumerate(predictions):
            if score > 0.9:
                matches.append([translation_dict[index], index, score])

        print("%s %s" % (translation_dict[matrix_id], matches))
Beispiel #54
0
def fit_model(week_ID, no_comp, lr, ep):
    """Fit a LightFM WARP model for one validation week and score its test coupons.

    The training interactions and the user / item feature matrices are read
    from ``../Data/Validation/<week_ID>/``.  The predictions are pickled to
    ``../Data/Validation/<week_ID>/d_pred_lightfm_<week_ID>.pickle``.

    Args:
        week_ID: identifier of the validation week (used in the file paths).
        no_comp: number of latent components of the model.
        lr: learning rate.
        ep: number of training epochs.

    Returns:
        d_user_pred: dict mapping a user id to the predicted scores of the
            coupons, in the order of ``list_coupon``.
        list_user: array of user ids (``USER_ID_hash`` column).
        list_coupon: array of test coupon ids (``COUPON_ID_hash`` column).
    """

    print("Fit lightfm model for %s" % week_ID)

    # Load data
    Mui_train = spi.mmread("../Data/Validation/%s/biclass_user_item_train_mtrx_%s.mtx" % (week_ID, week_ID))
    uf        = spi.mmread("../Data/Validation/%s/user_feat_mtrx_%s.mtx" % (week_ID, week_ID))
    itrf      = spi.mmread("../Data/Validation/%s/train_item_feat_mtrx_%s.mtx" % (week_ID, week_ID))
    itef      = spi.mmread("../Data/Validation/%s/test_item_feat_mtrx_%s.mtx" % (week_ID, week_ID))

    # Print shapes as a check
    print("user_features shape: %s,\nitem train features shape: %s,\nitem test features shape: %s"   % (uf.shape, itrf.shape, itef.shape))

    # Load test coupon and user lists
    cplte       = pd.read_csv("../Data/Validation/" + week_ID + "/coupon_list_test_validation_" + week_ID +".csv")
    ulist       = pd.read_csv("../Data/Validation/" + week_ID + "/user_list_validation_" + week_ID +".csv")
    list_coupon = cplte["COUPON_ID_hash"].values
    list_user   = ulist["USER_ID_hash"].values

    # Build model with the hyper-parameters supplied by the caller
    model = LightFM(no_components=no_comp, learning_rate=lr, loss='warp')
    model.fit_partial(Mui_train, user_features = uf, item_features = itrf, epochs = ep, num_threads = 4, verbose = True)

    no_users  = len(list_user)
    no_items  = len(list_coupon)
    pid_array = np.arange(no_items, dtype=np.int32)

    # Create and initialise dict to store predictions
    d_user_pred = {user: [] for user in list_user}

    # Loop over users and compute predictions
    for user_id in range(no_users):
        sys.stdout.write("\rProcessing user " + str(user_id)+"/ "+str(no_users))
        sys.stdout.flush()
        uid_array         = np.full(no_items, user_id, dtype=np.int32)
        predictions       = model.predict(uid_array, pid_array,user_features = uf, item_features = itef, num_threads=4)
        user              = str(list_user[user_id])
        d_user_pred[user] = predictions

    # Pickle the predictions for future use.  Binary pickle protocols need
    # a file opened in binary mode.
    d_pred = {"list_coupon" : list_coupon.tolist(), "d_user_pred" : d_user_pred}
    with open("../Data/Validation/%s/d_pred_lightfm_%s.pickle" % (week_ID, week_ID), "wb") as f:
        pickle.dump(d_pred, f, protocol = pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon