def train_fastai(params, data):
    model = collab_learner(
        data,
        n_factors=params["n_factors"],
        y_range=params["y_range"],
        wd=params["wd"],
    )
    with Timer() as t:
        model.fit_one_cycle(cyc_len=params["epochs"], max_lr=params["max_lr"])
    return model, t
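# Usage sketch (assumes Timer is a context-manager timer, e.g. the recommenders
# utility, and that ratings_df holds user/item/rating columns):
params = {"n_factors": 40, "y_range": [0, 5.5], "wd": 1e-1, "epochs": 5, "max_lr": 5e-3}
data = CollabDataBunch.from_df(ratings_df, seed=42, valid_pct=0.1)
model, t = train_fastai(params, data)
print(f"Training took {t.interval:.1f}s")  # 'interval' attribute is an assumption about Timer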
def train(self, dataset: RecommendationDataset) -> None:
    self.data_pd = dataset.data
    self.data = CollabDataLoaders.from_df(
        dataset.data,
        user_name=dataset.user_col,
        item_name=dataset.item_col,
        rating_name=dataset.score_col,
        valid_pct=0,
    )
    self.learner = collab_learner(self.data, n_factors=N_FACTORS, y_range=[0, 5.5], wd=1e-1)
    self.learner.fit_one_cycle(self.epochs)
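# Illustrative companion sketch (not in the original): batch-scoring user/item pairs
# with the trained learner through a fastai test DataLoader; the method name `predict`
# and the shape handling are assumptions.
def predict(self, pairs):
    # pairs must be a DataFrame with the same user/item columns used in training
    dl = self.learner.dls.test_dl(pairs)
    preds, _ = self.learner.get_preds(dl=dl)
    return preds.squeeze()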
# st.subheader("data loaded")
data_train, data_test = python_random_split(data, ratio=0.7)
split = data_train.shape[0], data_test.shape[0]
st.write("Train/test split sizes:", split)
data = CollabDataBunch.from_df(data_train, seed=42, valid_pct=0.1)
y_range = [0.5, 5.5]
st.write("Rating range:", y_range)
factor = N_FACTORS
st.write("No. of factors:", factor)
learn = collab_learner(data, n_factors=factor, y_range=y_range, wd=1e-1)
st.write(learn.model)  # display the EmbeddingDotBias architecture
# fit_one_cycle trains in place and returns None, so there is nothing to st.write
learn.fit_one_cycle(5, 3e-4)
learn.recorder.plot()
st.pyplot()
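# To launch this demo (assuming the script above is saved as app.py):
#   streamlit run app.py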
# 1A) Books items
books_dataset = pd.read_csv("rcsystem/static/books_dataset.csv")
books_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
books_tfidf_matrix = books_tf.fit_transform(books_dataset['soup'])
books_cosine_sim = cosine_similarity(books_tfidf_matrix, books_tfidf_matrix)

# 1B) Books users
books_ratings = pd.read_csv("rcsystem/static/books_ratings.csv")

# surprise model
_, user_based_book_algo = dump.load("rcsystem/static/user_based_book.dump")

# fastai model
# Important: seed=1 keeps the DataLoaders vocabulary identical to the one used at
# training time; rebuilding it with a different seed or different data can make the
# saved embedding weights fail to load.
books_dls = CollabDataLoaders.from_df(books_ratings, item_name='title', seed=1)
books_collab_filtering = collab_learner(books_dls, y_range=(0.5, 5.5))
books_collab_filtering.model_dir = "."
books_collab_filtering = books_collab_filtering.load("rcsystem/static/books_collab_filtering")

# 2A) Movies items
"""
movies_count_vec = CountVectorizer(stop_words='english')
movies_count_matrix = movies_count_vec.fit_transform(movies_dataset['soup'][:10000])
movies_cosine_sim = cosine_similarity(movies_count_matrix, movies_count_matrix)
"""
movies_dataset = pd.read_csv("rcsystem/static/movies_dataset.csv")
movies_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
movies_tfidf_matrix = movies_tf.fit_transform(movies_dataset['soup'][:10000])
movies_cosine_sim = cosine_similarity(movies_tfidf_matrix, movies_tfidf_matrix)
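# Illustrative sketch (not part of the loading code above): the cosine-similarity
# matrices support a simple content-based lookup; the 'title' column is an assumption.
import numpy as np

def most_similar_books(idx, k=10):
    order = np.argsort(books_cosine_sim[idx])[::-1]  # indices sorted by similarity, descending
    order = order[order != idx][:k]                  # drop the query book itself
    return books_dataset.iloc[order]['title']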
            continue
        if anime_name != correct_anime:
            use_name = input(f"Did you mean '{correct_anime}'? [y/n]")
            while use_name.lower() not in ["y", "n", "yes", "no"]:
                print("Please input y or n")
                use_name = input(f"Did you mean '{correct_anime}'? [y/n]")
            if use_name.lower() in ["n", "no"]:
                continue
        entries[correct_anime] = rating
    return entries


path = Path("data")
data = torch.load(path / "data.feather")
y_range = [-0.5, 10.5]
learner = fc.collab_learner(data, n_factors=50, use_nn=False, y_range=y_range)
learner.load('anime')
anime2id = torch.load(path / "anime2id.feather")
input_dict = {
    "psychopass": 9,
    "durarara!!": 8,
    "shiki": 7,
    "shigatsu wa kimi no uso": 10,
    "boku dake ga inai machi": 9,
    "kimi no na wa": 9,
    "Ansatsu Kyoushitsu": 8,
    "Shokugeki no Soma": 6,
    "clannad": 8,
    "clannad after story": 2,
# Absence of a rating does not mean that the user did not like the book.
# Actual rating values vary between 1 and 10. If we left 0 ratings unchanged,
# the model could implicitly assume that a 0 rating is worse than the lowest actual rating.
# 0 ratings prevail in the dataset and outnumber all other rating values, so imputing
# the average or median for the majority of samples would distort the data and lead to
# an inaccurate, unreliable recommendation model.
# To avoid this, we drop all rows with 0 ratings before passing the data to the collaborative model.
data_col = data[data['Book-Rating'] > 0]
print(f'Number of samples before dropping 0 values: {len(data)}\n'
      f'Number of samples after dropping 0 values: {len(data_col)}')

# Prepare data for the model
dls = CollabDataLoaders.from_df(
    data_col[['User-ID', 'Content_ID', 'Book-Rating']],
    bs=64, valid_pct=0.1)

# Create a collaborative model and train for 5 epochs
learner = collab_learner(dls, y_range=(1, 10))
learner.fine_tune(5)

# After 5 epochs of training, validation loss stalls. If training continues, the model
# just overfits the training data without any considerable improvement on the
# validation set. The validation error is about 3.7, which is high given that ratings
# vary between 1 and 10. Errors of this magnitude could easily result in recommending
# books the user wouldn't like and missing books that would in reality get high ratings.

# Display predicted and actual ratings for a sample of user-book pairs
learner.show_results()

# Predictions for all validation pairs: Learner.predict expects a single row,
# so batch predictions go through get_preds
result, _ = learner.get_preds(dl=dls.valid)
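# Sketch of per-user recommendation on top of the trained learner (the user choice,
# column names and top-10 cut are illustrative): build a test DataLoader over
# (user, item) pairs and rank the predicted ratings.
import pandas as pd

user = data_col['User-ID'].iloc[0]
items = data_col['Content_ID'].unique()
pairs = pd.DataFrame({'User-ID': user, 'Content_ID': items})
dl = learner.dls.test_dl(pairs)
preds, _ = learner.get_preds(dl=dl)
top10 = pairs.assign(pred=preds.squeeze().numpy()).nlargest(10, 'pred')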
def handle(self, *args, **options):
    if options["which"] == "books":
        """
        rec_models.books_count_vec = CountVectorizer(stop_words='english')
        rec_models.books_count_matrix = rec_models.books_count_vec.fit_transform(rec_models.books_dataset['soup'])
        rec_models.books_cosine_sim = cosine_similarity(rec_models.books_count_matrix, rec_models.books_count_matrix)
        """
        rec_models.books_dataset = pd.read_csv("rcsystem/static/books_dataset.csv")
        rec_models.books_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
        rec_models.books_tfidf_matrix = rec_models.books_tf.fit_transform(rec_models.books_dataset['soup'])
        rec_models.books_cosine_sim = cosine_similarity(rec_models.books_tfidf_matrix, rec_models.books_tfidf_matrix)
        self.stdout.write(self.style.SUCCESS('books updated'))

    if options["which"] == "books_rated":
        # surprise model
        rec_models.books_ratings = pd.read_csv("rcsystem/static/books_ratings.csv")
        reader = Reader()
        data = Dataset.load_from_df(rec_models.books_ratings[['user_id', 'book_id', 'rating']], reader)
        kf = KFold(n_splits=5)
        svd = SVD()
        for trainset, testset in kf.split(data):
            svd.fit(trainset)
            predictions = svd.test(testset)
            accuracy.rmse(predictions, verbose=True)
        trainset = data.build_full_trainset()
        svd.fit(trainset)
        dump.dump("rcsystem/static/user_based_book.dump", algo=svd)
        rec_models.user_based_book_algo = svd

        # fastai model
        rec_models.books_ratings_title = pd.read_csv("rcsystem/static/books_ratings_with_title.csv")
        rec_models.books_dls = CollabDataLoaders.from_df(rec_models.books_ratings, item_name='title', seed=1)
        rec_models.books_collab_filtering = collab_learner(rec_models.books_dls, y_range=(0.5, 5.5))
        rec_models.books_collab_filtering.model_dir = "."
        rec_models.books_collab_filtering.fine_tune(1, wd=0.1)  # could be more epochs
        rec_models.books_collab_filtering.save("rcsystem/static/books_collab_filtering")
        self.stdout.write(self.style.SUCCESS('books collaborative filtering updated'))

    if options["which"] == "movies":
        """
        rec_models.movies_count_vec = CountVectorizer(stop_words='english')
        rec_models.movies_count_matrix = rec_models.movies_count_vec.fit_transform(rec_models.movies_dataset['soup'][:10000])
        rec_models.movies_cosine_sim = cosine_similarity(rec_models.movies_count_matrix, rec_models.movies_count_matrix)
        """
        rec_models.movies_dataset = pd.read_csv("rcsystem/static/movies_dataset.csv")
        rec_models.movies_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
        rec_models.movies_tfidf_matrix = rec_models.movies_tf.fit_transform(rec_models.movies_dataset['soup'])
        rec_models.movies_cosine_sim = cosine_similarity(rec_models.movies_tfidf_matrix, rec_models.movies_tfidf_matrix)
        self.stdout.write(self.style.SUCCESS('movies updated'))

    if options["which"] == "movies_rated":
        # surprise model
        rec_models.movies_ratings = pd.read_csv("rcsystem/static/movies_ratings.csv")
        reader = Reader()
        data = Dataset.load_from_df(rec_models.movies_ratings[['user_id', 'movie_id', 'rating']], reader)
        kf = KFold(n_splits=5)
        svd = SVD()
        for trainset, testset in kf.split(data):
            svd.fit(trainset)
            predictions = svd.test(testset)
            accuracy.rmse(predictions, verbose=True)
        trainset = data.build_full_trainset()
        svd.fit(trainset)
        dump.dump("rcsystem/static/user_based_movie.dump", algo=svd)
        rec_models.user_based_movie_algo = svd

        # fastai model
        rec_models.movies_ratings_title = pd.read_csv("rcsystem/static/movies_ratings_with_title.csv")
        rec_models.movies_dls = CollabDataLoaders.from_df(rec_models.movies_ratings, item_name='title', seed=1)
        rec_models.movies_collab_filtering = collab_learner(rec_models.movies_dls, y_range=(0.5, 5.5))
        rec_models.movies_collab_filtering.model_dir = "."
        rec_models.movies_collab_filtering.fine_tune(1, wd=0.1)  # could be more epochs
        rec_models.movies_collab_filtering.save("rcsystem/static/movies_collab_filtering")
        self.stdout.write(self.style.SUCCESS('movies collaborative filtering updated'))
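# Usage sketch (the command name 'update_recs' is hypothetical; in Django it comes
# from the file name under management/commands/, and the 'which' argument from
# this command's add_arguments):
#   python manage.py update_recs books
#   python manage.py update_recs books_rated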
!git clone https://github.com/SonaliDasgupta/goodbooks-10k
!pip install fastai torch  # the PyPI package is 'torch', not 'pytorch'

import pandas as pd
import torch.optim as optim
from fastai.collab import CollabDataBunch, collab_learner
from fastai.metrics import exp_rmspe
from fastai.train import lr_find

ratings_df = pd.read_csv('goodbooks-10k/ratings.csv')

data = CollabDataBunch.from_df(ratings_df)  # no test dataset here
wd = 1e-3
# choosing 50 factors as of now, might try with half the size of the dataset later;
# ratings go from 1 to 5, hence the y_range
m = collab_learner(data, n_factors=50, y_range=(1, 5), metrics=[exp_rmspe])
# m.opt_fn = optim.Adam(params=m.parameters, lr=0.5)

lr_find(m)
# m.recorder.plot_metrics()  # only meaningful after training
m.recorder.plot()

m.opt = optim.Adam(params=m.model.parameters(), lr=0.03)
m.opt.mom = 0.9
m.fit(3, lr=0.03, wd=1e-5)
# PULL CODE IN SPYDER AND SEE OPTIM WRAPPER USAGE, TRY WITH ADAM LATER AND ALSO WEIGHT DECAY AND MOMENTUM
# TRY WITH BOTH ADAM AND SGD
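# Sketch (same learner `m`): fastai's one-cycle policy schedules the learning rate
# and momentum automatically, which usually replaces the manual optimizer swap above.
m.fit_one_cycle(3, max_lr=3e-3, wd=1e-5)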
                                item_name=ITEM,
                                rating_name=RATING,
                                valid_pct=0)
# data.show_batch()

"""Now we will create a `collab_learner` for the data, which by default uses the
`EmbeddingDotBias` model. We will use 40 latent factors: the learner creates an
embedding for the users and one for the items that maps each of them to 40 floats,
as can be seen below. Note that the embedding parameters are not predefined, but
are learned by the model.

Although ratings can only range from 1 to 5, we set the range of possible
predictions to 0-5.5; that allows the model to predict values around 1 and 5,
which improves accuracy. Lastly, we set a weight-decay value for regularization."""
learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0, 5.5], wd=1e-1)
# learn.model

# Now train the model for 5 epochs, setting the maximal learning rate.
# The learner will reduce the learning rate with each epoch using cosine annealing.
learn.fit_one_cycle(EPOCHS, max_lr=5e-3)

# save the learner
learn.export('movielens_model.pkl')

## EVALUATION

# load the learner
learner = load_learner(path=".", file='movielens_model.pkl')

# get all users and items that the model knows
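# A sketch of the fastai v1 pattern for reading off the known users and items
# (the first entry of each class list is the '#na#' placeholder, so it is dropped;
# the exact attribute chain is an assumption about this fastai version):
total_users, total_items = learner.data.train_ds.x.classes.values()
total_users = total_users[1:]
total_items = total_items[1:]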