Example #1
def train_fastai(params, data):
    model = collab_learner(
        data, n_factors=params["n_factors"], y_range=params["y_range"], wd=params["wd"]
    )
    with Timer() as t:
        model.fit_one_cycle(cyc_len=params["epochs"], max_lr=params["max_lr"])
    return model, t
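
A hedged usage sketch for this helper: the params keys mirror those read inside the function, and Timer is assumed to come from the Microsoft recommenders utilities (an assumption, since the snippet's imports are not shown).

# Sketch only: assumes fastai v1 and the recommenders Timer utility.
from fastai.collab import CollabDataBunch, collab_learner
from recommenders.utils.timer import Timer  # assumed source of Timer

params = {"n_factors": 40, "y_range": [0, 5.5], "wd": 1e-1,
          "epochs": 5, "max_lr": 5e-3}
data = CollabDataBunch.from_df(ratings_df, seed=42, valid_pct=0.1)
model, t = train_fastai(params, data)
print(f"Training took {t.interval:.1f} seconds.")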
Example #3
def train(self, dataset: RecommendationDataset) -> None:
    self.data_pd = dataset.data
    self.data = CollabDataLoaders.from_df(
        dataset.data,
        user_name=dataset.user_col,
        item_name=dataset.item_col,
        rating_name=dataset.score_col,
        valid_pct=0)
    self.learner = collab_learner(self.data, n_factors=N_FACTORS, y_range=[0, 5.5], wd=1e-1)
    self.learner.fit_one_cycle(self.epochs)
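
Hypothetical usage of this method; the RecommendationDataset fields and the wrapper class holding train() are assumptions inferred from the attribute names above.

# Sketch only: the names below are assumptions inferred from train().
dataset = RecommendationDataset(
    data=ratings_df, user_col="user_id", item_col="item_id", score_col="rating"
)
recommender = FastaiRecommender(epochs=5)  # hypothetical owner of train()
recommender.train(dataset)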
Example #4
        #st.subheader("data loaded")

        data_train, data_test = python_random_split(data, ratio=0.7)

        split =data_train.shape[0], data_test.shape[0]
        st.write("Splitting_Ratio:",split)

        data = CollabDataBunch.from_df(data_train, seed=42, valid_pct=0.1)

        y_range = [0.5,5.5]
        st.write(y_range)

        factor=N_FACTORS
        st.write("No. of factors:",factor)

        learn = collab_learner(data, n_factors=factor, y_range=y_range, wd=1e-1)
        
        learn.model

        #st.subheader("data loaded")

        fit_onecycle=learn.fit_one_cycle(5, 3e-4)
        st.write(fit_onecycle)


        #st.subheader("data loaded")

        learn.recorder.plot()
        st.pyplot()

Example #5

# 1A) Books items
books_dataset = pd.read_csv("rcsystem/static/books_dataset.csv")
books_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
books_tfidf_matrix = books_tf.fit_transform(books_dataset['soup'])
books_cosine_sim = cosine_similarity(books_tfidf_matrix, books_tfidf_matrix)

# 1B) Books users
books_ratings = pd.read_csv("rcsystem/static/books_ratings.csv")

# surprise model
_, user_based_book_algo = dump.load("rcsystem/static/user_based_book.dump")
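
A one-line sketch of querying the loaded Surprise model with its standard predict API; the user and item ids are placeholders.

# Estimated rating for a (user, item) pair; the ids are hypothetical.
est = user_based_book_algo.predict(uid=42, iid=100).est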

# fastai model
# Important: seed=1 is used so the DataLoaders builds the same user/item vocabulary
# as when the model was trained; recreating it on different data may break the
# saved model's embedding sizes.
books_dls = CollabDataLoaders.from_df(books_ratings, item_name='title', seed=1)
books_collab_filtering = collab_learner(books_dls, y_range=(0.5, 5.5))
books_collab_filtering.model_dir = "."
books_collab_filtering = books_collab_filtering.load("rcsystem/static/books_collab_filtering")

# 2A) Movies items
"""
movies_count_vec = CountVectorizer(stop_words='english')
movies_count_matrix = movies_count_vec.fit_transform(movies_dataset['soup'][:10000])
movies_cosine_sim = cosine_similarity(movies_count_matrix, movies_count_matrix)
"""

movies_dataset = pd.read_csv("rcsystem/static/movies_dataset.csv")
movies_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
movies_tfidf_matrix = movies_tf.fit_transform(movies_dataset['soup'][:10000])
movies_cosine_sim = cosine_similarity(movies_tfidf_matrix, movies_tfidf_matrix)
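
A minimal sketch of how these cosine-similarity matrices might be queried for recommendations; the "title" column name is an assumption.

# Sketch only: assumes books_dataset has a "title" column.
def similar_books(title, top_n=5):
    idx = books_dataset.index[books_dataset["title"] == title][0]
    scores = books_cosine_sim[idx]
    best = scores.argsort()[::-1][1:top_n + 1]  # skip the book itself
    return books_dataset.iloc[best]["title"].tolist()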
Example #6
            continue
        if anime_name != correct_anime:
            use_name = input(f"Did you mean '{correct_anime}'? [y/n]")
            while use_name.lower() not in ["y", "n", "yes", "no"]:
                print("Please input y or n")
                use_name = input(f"Did you mean '{correct_anime}'? [y/n]")
            if use_name.lower() in ["n", "no"]:
                continue
        entries[correct_anime] = rating
    return entries


path = Path("data")
# Despite the .feather extension, this file appears to be a pickled object
# saved with torch.save, hence the matching torch.load call.
data = torch.load(path / "data.feather")
y_range = [-0.5, 10.5]
learner = fc.collab_learner(data, n_factors=50, use_nn=False, y_range=y_range)
learner.load('anime')

anime2id = torch.load(path / "anime2id.feather")

input_dict = {
    "psychopass": 9,
    "durarara!!": 8,
    "shiki": 7,
    "shigatsu wa kimi no uso": 10,
    "boku dake ga inai machi": 9,
    "kimi no na wa": 9,
    "Ansatsu Kyoushitsu": 8,
    "Shokugeki no Soma": 6,
    "clannad": 8,
    "clannad after story": 2,
Example #7
# Absence of a rating does not mean that the user did not like the book.
# Actual rating values vary between 1 and 10. If we left 0 ratings unchanged,
# the model could implicitly assume that a 0 rating is worse than the lowest actual rating.
# 0 ratings prevail in the dataset and outnumber all other rating values, so
# assigning average or median values to the majority of samples would distort the data
# and lead to an inaccurate and unreliable recommendation model.
# To avoid this, we drop all rows with 0 ratings before passing the data to the collaborative model.
data_col = data[data['Book-Rating'] > 0]
print(f'Number of samples before dropping 0 values: {len(data)}\n'
      f'Number of samples after dropping 0 values: {len(data_col)}')

# Prepare data for the model
dls = CollabDataLoaders.from_df(
    data_col[['User-ID', 'Content_ID', 'Book-Rating']], bs=64, valid_pct=0.1)

# Create a collaborative model and train for 5 epochs
learner = collab_learner(dls, y_range=(1, 10))
learner.fine_tune(5)

# After 5 epochs of training, the validation loss stalls. If training continues,
# the model just overfits the training data without any considerable improvement on the validation set.
# The validation error is about 3.7, which is high given that ratings vary between 1 and 10.
# Errors of this magnitude could easily result in recommending books that the user wouldn't like
# and failing to recommend books that would in reality get high ratings.

# Display predicted and actual ratings for each user-book pair
learner.show_results()

# Batch predictions: Learner.predict expects a single row, so use
# get_preds with a DataLoader to predict many user-book pairs at once.
preds, targets = learner.get_preds(dl=dls.valid)
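
As a follow-up, a minimal sketch turning those batch predictions into an RMSE figure (plain PyTorch; nothing beyond the calls above is assumed).

import torch

# Root-mean-squared error between predicted and actual ratings.
rmse = torch.sqrt(((preds.squeeze() - targets.squeeze().float()) ** 2).mean())
print(f"Validation RMSE: {rmse.item():.3f}")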
Example #8
    def handle(self, *args, **options):
        if options["which"] == "books":
            """
            rec_models.books_count_vec = CountVectorizer(stop_words='english')
            rec_models.books_count_matrix = rec_models.books_count_vec.fit_transform(rec_models.books_dataset['soup'])
            rec_models.books_cosine_sim = cosine_similarity(rec_models.books_count_matrix, rec_models.books_count_matrix)
            """

            rec_models.books_dataset = pd.read_csv("rcsystem/static/books_dataset.csv")
            rec_models.books_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
            rec_models.books_tfidf_matrix = rec_models.books_tf.fit_transform(rec_models.books_dataset['soup'])
            rec_models.books_cosine_sim = cosine_similarity(rec_models.books_tfidf_matrix, rec_models.books_tfidf_matrix)

            self.stdout.write(self.style.SUCCESS('books updated'))

        if options["which"] == "books_rated":
            # surprise model
            rec_models.books_ratings = pd.read_csv("rcsystem/static/books_ratings.csv")
            reader = Reader()
            data = Dataset.load_from_df(rec_models.books_ratings[['user_id', 'book_id', 'rating']], reader)
            kf = KFold(n_splits=5)
            svd = SVD()

            for trainset, testset in kf.split(data):
                svd.fit(trainset)
                predictions = svd.test(testset)
                accuracy.rmse(predictions, verbose=True)

            trainset = data.build_full_trainset()
            svd.fit(trainset)

            dump.dump("rcsystem/static/user_based_book.dump", algo=svd)
            rec_models.user_based_book_algo = svd

            # fastai model
            rec_models.books_ratings_title = pd.read_csv("rcsystem/static/books_ratings_with_title.csv")
            rec_models.books_dls = CollabDataLoaders.from_df(rec_models.books_ratings_title, item_name='title', seed=1)
            rec_models.books_collab_filtering = collab_learner(rec_models.books_dls, y_range=(0.5, 5.5))
            rec_models.books_collab_filtering.model_dir = "."
            rec_models.books_collab_filtering.fine_tune(1, wd=0.1)  # could be more epochs
            rec_models.books_collab_filtering.save("rcsystem/static/books_collab_filtering")

            self.stdout.write(self.style.SUCCESS('books collaborative filtering updated'))

        if options["which"] == "movies":
            """
            rec_models.movies_count_vec = CountVectorizer(stop_words='english')
            rec_models.movies_count_matrix = rec_models.movies_count_vec.fit_transform(rec_models.movies_dataset['soup'][:10000])
            rec_models.movies_cosine_sim = cosine_similarity(rec_models.movies_count_matrix, rec_models.movies_count_matrix)
            """

            rec_models.movies_dataset = pd.read_csv("rcsystem/static/movies_dataset.csv")
            rec_models.movies_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
            rec_models.movies_tfidf_matrix = rec_models.movies_tf.fit_transform(rec_models.movies_dataset['soup'])
            rec_models.movies_cosine_sim = cosine_similarity(rec_models.movies_tfidf_matrix, rec_models.movies_tfidf_matrix)

            self.stdout.write(self.style.SUCCESS('movies updated'))

        if options["which"] == "movies_rated":
            # surprise model
            rec_models.books_ratings = pd.read_csv("rcsystem/static/movies_ratings.csv")
            reader = Reader()
            data = Dataset.load_from_df(rec_models.books_ratings[['user_id', 'movie_id', 'rating']], reader)
            kf = KFold(n_splits=5)
            svd = SVD()

            for trainset, testset in kf.split(data):
                svd.fit(trainset)
                predictions = svd.test(testset)
                accuracy.rmse(predictions, verbose=True)

            trainset = data.build_full_trainset()
            svd.fit(trainset)

            dump.dump("rcsystem/static/user_based_movie.dump", algo=svd)
            rec_models.user_based_movie_algo = svd

            # fastai model
            rec_models.movies_ratings_title = pd.read_csv("rcsystem/static/movies_ratings_with_title.csv")
            rec_models.movies_dls = CollabDataLoaders.from_df(rec_models.movies_ratings_title, item_name='title', seed=1)
            rec_models.movies_collab_filtering = collab_learner(rec_models.movies_dls, y_range=(0.5, 5.5))
            rec_models.movies_collab_filtering.model_dir = "."
            rec_models.movies_collab_filtering.fine_tune(1, wd=0.1)  # could be more epochs
            rec_models.movies_collab_filtering.save("rcsystem/static/movies_collab_filtering")

            self.stdout.write(self.style.SUCCESS('movies collaborative filtering updated'))
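
The handler above assumes a positional "which" option; a minimal sketch of how it might be declared on the same Command class (standard Django management-command API).

    # Sketch only: declares the "which" argument consumed in handle() above.
    def add_arguments(self, parser):
        parser.add_argument(
            "which",
            choices=["books", "books_rated", "movies", "movies_rated"],
            help="Which model group to rebuild",
        )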
Example #9

!git clone https://github.com/SonaliDasgupta/goodbooks-10k

import pandas as pd

!pip install fastai torch  # the PyPI name for PyTorch is "torch", not "pytorch"

ratings_df = pd.read_csv('goodbooks-10k/ratings.csv')

from fastai.collab import CollabDataBunch
from fastai.collab import collab_learner

import torch.optim as optim
from fastai.metrics import exp_rmspe
data = CollabDataBunch.from_df(ratings_df)  # no separate test set here
wd = 1e-3
# 50 latent factors for now; might try other sizes later.
# Ratings go from 1 to 5, hence y_range.
m = collab_learner(data, n_factors=50, y_range=(1, 5), metrics=[exp_rmspe])

from fastai.train import lr_find
lr_find(m)

# Plot the learning-rate finder results (loss vs. learning rate)
m.recorder.plot()

# fastai wraps the optimizer in an OptimWrapper, so set opt_func instead of
# assigning a raw torch optimizer to m.opt (momentum would silently not apply).
m.opt_func = optim.Adam
m.fit(3, lr=0.03, wd=1e-5)
# TODO: compare Adam and SGD; also experiment with weight decay and momentum.
Example #10
                               item_name=ITEM,
                               rating_name=RATING,
                               valid_pct=0)
# data.show_batch()
"""Now we will create a `collab_learner` for the data, which by default uses 
the `EmbeddingDotBias` model. We will be using 40 latent factors. This will 
create an embedding for the users and the items that will map each of these 
to 40 floats as can be seen below. Note that the embedding parameters are not 
predefined, but are learned by the model.

Although ratings can only range from 1-5, we are setting the range of possible 
ratings to a range from 0 to 5.5 -- that will allow the model to predict values 
around 1 and 5, which improves accuracy. Lastly, we set a value for weight-decay 
for regularization."""

learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0, 5.5], wd=1e-1)
# learn.model

# Now train the model for 5 epochs, setting the maximal learning rate.
# The one-cycle policy first increases the learning rate toward max_lr,
# then anneals it back down via cosine annealing.
learn.fit_one_cycle(EPOCHS, max_lr=5e-3)

# save the learner
learn.export('movielens_model.pkl')

## EVALUATION

# load the learner
learner = load_learner(path=".", file='movielens_model.pkl')

# get all users and items that the model knows
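
A sketch of what might follow, assuming fastai v1, where the CollabDataBunch stores the user and item vocabularies on its training dataset.

# Sketch only (fastai v1 attribute layout).
total_users, total_items = learner.data.train_ds.x.classes.values()
total_users = total_users[1:]  # drop the '#na#' placeholder class
total_items = total_items[1:]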