import ast
import datetime
import os

import numpy as np
import pandas as pd
from IPython.display import display
from scipy.stats import lognorm

# Project-local helpers referenced below (storage, filters, TEST_USER,
# get_recommendation, get_user_recommendations, get_scoring_metrics,
# __convert_to_rating) are assumed importable from this package; the
# sklearn and Surprise classes used by the model functions are assumed
# to be imported at the top of their own modules.


def setup_test_data(min_reviews=2, local=False):
    """ Creates a dataframe per user with the inputs and target for testing """

    # Fetching the review dataframe
    print("Fetching the review dataframe...")
    if local:
        input_csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        data = pd.read_csv(f'{input_csv_path}/review_pp.csv')
    else:
        data = storage.import_file('data/preprocessed', 'review_pp.csv')

    # Creating user / reviews dict
    print("Creating user and reviews dict...")
    user_reviews = data.groupby('user_id') \
                       .agg({'recipe_id': list, 'rating': list, 'liked': list}) \
                       .reset_index()

    # Selecting only users with at least min_reviews (2 by default)
    print(f"Selecting only users with at least {min_reviews} reviews...")
    selected_user_reviews = user_reviews[user_reviews['recipe_id'].str.len() >= min_reviews]

    print("Creating and filling the evaluation dataframe...")
    # The last review of each user is held out as the target; the rest are inputs.
    rows = []
    for _, row in selected_user_reviews.iterrows():
        inputs = {row['recipe_id'][i]: row['liked'][i]
                  for i in range(len(row['recipe_id']) - 1)}
        rows.append({'user_id': row['user_id'],
                     'inputs': inputs,
                     'target': row['recipe_id'][-1],
                     'rating': row['rating'][-1],
                     'liked': row['liked'][-1]})
    # Build the dataframe in one go; row-by-row DataFrame.append was
    # deprecated and removed in pandas 2.x.
    test_df = pd.DataFrame(rows, columns=['user_id', 'inputs', 'target', 'rating', 'liked'])

    print("Saving test input dataframe...")
    # timestamp = '{:%Y%m%d_%H%M}'.format(datetime.datetime.now())
    if local:
        output_csv_path = os.path.join(os.path.dirname(__file__), "data/test")
        test_df.to_csv(f'{output_csv_path}/test_inputs.csv', index=False)
    else:
        storage.upload_file(test_df, 'data/test', 'test_inputs.csv')

    return test_df
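# Example usage (illustrative sketch; assumes data/preprocessed/review_pp.csv
# exists relative to this file when run with local=True):
#   test_df = setup_test_data(min_reviews=3, local=True)
#   test_df.head()  # one row per user: input recipes plus the held-out target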
def run_recommendations(user_id=None, collaborative=0.5, clear_neg=False,
                        vectorizer='count', dimred='svd', ngram=(1, 1),
                        min_df=1, max_df=1.0, local=False):

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data")
        test_df = pd.read_csv(f'{csv_path}/test/test_inputs.csv')
        recipe_df = pd.read_csv(f"{csv_path}/preprocessed/recipe_pp.csv")
    else:
        test_df = storage.import_file('data/test', 'test_inputs.csv')
        recipe_df = storage.import_file('data/preprocessed', 'recipe_pp.csv')

    if user_id:
        test_case = test_df[test_df.user_id == user_id]
    else:
        test_case = test_df.sample()

    # The inputs column holds a dict serialized as a string;
    # literal_eval parses it back without the risks of eval.
    inputs = ast.literal_eval(test_case.inputs.values[0])

    input_df = pd.DataFrame({'recipe_id': list(inputs.keys()),
                             'liked': list(inputs.values())})
    input_df = input_df.merge(recipe_df, on='recipe_id', how='left') \
                       [['recipe_id', 'name', 'liked']]
    display(input_df)

    # Forward the vectorization settings this function received.
    recommendations = get_user_recommendations(user_inputs=inputs,
                                               collaborative=collaborative,
                                               clear_neg=clear_neg,
                                               vectorizer=vectorizer,
                                               dimred=dimred,
                                               ngram=ngram,
                                               min_df=min_df,
                                               max_df=max_df)

    output_df = recommendations.merge(recipe_df, on='recipe_id', how='left') \
                               [['recipe_id', 'name', 'content', 'collaborative',
                                 'hybrid', 'rec_score']]
    display(output_df.head(10))

    return input_df, output_df
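# Example usage (illustrative sketch; assumes test_inputs.csv was produced by
# setup_test_data beforehand; without user_id a random test user is drawn):
#   input_df, output_df = run_recommendations(collaborative=0.7, local=True)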
def run_test(predict=True, sample=None, vectorizer='count', dimred='svd',
             ngram=(1, 1), min_df=1, max_df=1.0, local=False):
    """ Runs the test by computing predictions and preparing the result dataframe """

    pd.options.mode.chained_assignment = None

    print("Fetching the test inputs...")
    if local:
        input_csv_path = os.path.join(os.path.dirname(__file__), "data/test")
        input_data = pd.read_csv(f'{input_csv_path}/test_inputs.csv')
    else:
        input_data = storage.import_file('data/test', 'test_inputs.csv')

    print("Calculating predictions...")
    if sample is None:
        data = input_data.copy()
    else:
        data = input_data.sample(sample, random_state=42)

    if predict:
        predictions = list()
        for index, test in data.iterrows():
            # Forward the vectorization settings configured for this run.
            prediction_matrix = get_user_recommendations(
                user_inputs=ast.literal_eval(test['inputs']),
                clear_neg=False,
                user_id=test['user_id'],
                forced_recipes=[test['target']],
                vectorizer=vectorizer,
                dimred=dimred,
                ngram=ngram,
                min_df=min_df,
                max_df=max_df)
            prediction_row = prediction_matrix[prediction_matrix.index == test['target']]
            if len(prediction_row) > 0:
                pred = float(prediction_row['rec_score'].iloc[0])
                predictions.append(np.round(pred, 3))
                print(f"> ({index}) Prediction for user {test['user_id']} done: {pred}!")
            else:
                predictions.append(None)
                print(f"> ({index}) Prediction for user {test['user_id']} not found!")
    else:
        # Baseline mode: random scores drawn from a log-normal distribution,
        # rescaled to [0, 1].
        ln = lognorm.rvs(0.2, size=data.shape[0])
        predictions = (ln - ln.min()) / (ln.max() - ln.min())

    print("Preparing results dataframe...")
    data['rec_score'] = predictions

    print("Cleaning up failed scores...")
    data = data[data['rec_score'] >= 0]
    data['rec_rating'] = __convert_to_rating(data[['rating', 'rec_score']])
    data['rec_liked'] = 0
    data['rec_classify'] = ''

    print("Iterating and filling results dataframe...")
    for index, row in data.iterrows():
        if data.loc[index, 'rec_rating'] >= 4:
            data.loc[index, 'rec_liked'] = 1
        actual = data.loc[index, 'liked'] == 1
        predicted = data.loc[index, 'rec_liked'] == 1  # avoid shadowing the predict argument
        if predicted and actual:
            data.loc[index, 'rec_classify'] = 'TP'
        elif predicted and not actual:
            data.loc[index, 'rec_classify'] = 'FP'
        elif not predicted and not actual:
            data.loc[index, 'rec_classify'] = 'TN'
        elif not predicted and actual:
            data.loc[index, 'rec_classify'] = 'FN'

    print("Saving results dataframe...")
    timestamp = '{:%Y%m%d_%H%M}'.format(datetime.datetime.now())
    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/test")
        data.to_csv(f'{csv_path}/test_outputs_{timestamp}.csv', index=False)
    else:
        storage.upload_file(data, 'data/test', f'test_outputs_{timestamp}.csv')

    print("Calculating metrics for tests...")
    metrics = get_scoring_metrics(data)

    # Returns tuple of data and metrics
    return data, metrics
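# Example usage (illustrative sketch; `sample` caps the number of evaluated
# users, and predict=False swaps in the random log-normal baseline):
#   results, metrics = run_test(predict=True, sample=50, local=True)
#   results['rec_classify'].value_counts()  # TP / FP / TN / FN breakdown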
def __create_latent_matrices(pool=2000, content_reduction=250, rating_reduction=800,
                             user_inputs=None, user_id=None, forced_recipes=[],
                             goal='', diet='', allergies=[], dislikes=[],
                             custom_dsl='', time=None, steps=None,
                             vectorizer='count', dimred='svd', ngram=(1, 1),
                             min_df=1, max_df=1.0, local=False):
    ''' Generates the latent dataframes used for the prediction model '''

    #### First the data needs to be loaded
    if user_inputs is None:
        user_inputs = TEST_USER
    user_recipes = list(user_inputs.keys())

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        recipes_df_raw = pd.read_csv(f"{csv_path}/recipe_pp.csv")
        reviews_df_raw = pd.read_csv(f"{csv_path}/review_pp.csv")
    else:
        recipes_df_raw = storage.import_file('data/preprocessed', 'recipe_pp.csv')
        reviews_df_raw = storage.import_file('data/preprocessed', 'review_pp.csv')

    # For test purposes only: drop the forced recipes' reviews by this user
    # so the held-out target does not leak into the rating matrix.
    if forced_recipes and user_id:
        for fr in forced_recipes:
            reviews_df_raw = reviews_df_raw[~((reviews_df_raw['recipe_id'] == fr) &
                                              (reviews_df_raw['user_id'] == user_id))]

    user_recipe_df = recipes_df_raw[recipes_df_raw.recipe_id.isin(user_recipes)]
    other_recipes_df = recipes_df_raw[~recipes_df_raw.recipe_id.isin(user_recipes + forced_recipes)]
    forced_recipes_df = recipes_df_raw[recipes_df_raw.recipe_id.isin(forced_recipes)]

    sample = np.min([pool, (len(other_recipes_df) + len(forced_recipes_df))])
    target_df = pd.concat([other_recipes_df.sample(sample - len(forced_recipes_df),
                                                   random_state=42),
                           forced_recipes_df], axis=0)

    ### Filter method here:
    filtered_df = filters.all_filters(target_df, goal=goal, diet=diet,
                                      allergies=allergies, dislikes=dislikes,
                                      custom_dsl=custom_dsl, time=time, steps=steps)

    input_df = pd.concat([user_recipe_df, filtered_df], axis=0)

    merge_df = pd.merge(input_df[['recipe_id', 'metadata']], reviews_df_raw,
                        on="recipe_id", how="left").dropna()
    recipes_df = merge_df[['recipe_id', 'metadata']].groupby(by="recipe_id").first().reset_index()
    reviews_df = merge_df.drop(['metadata'], axis="columns").reset_index()

    #######################################################################
    #### Vectorizing the metadata to create the content latent matrix ####
    #### and reducing its dimensions with TruncatedSVD (or NMF)       ####
    #######################################################################

    if vectorizer == 'count':
        vector = CountVectorizer(stop_words='english', ngram_range=ngram,
                                 min_df=min_df, max_df=max_df)
    elif vectorizer == 'tfidf':
        vector = TfidfVectorizer(stop_words='english', ngram_range=ngram,
                                 min_df=min_df, max_df=max_df)
    vector_matrix = vector.fit_transform(recipes_df['metadata'])

    vector_df = pd.DataFrame(vector_matrix.toarray(), index=recipes_df.recipe_id.tolist())

    if dimred == 'svd':
        # Fit a wide SVD first, then keep the components explaining up to
        # 80% of the variance (at least 100).
        base_case = TruncatedSVD(n_components=min(vector_df.shape[1] - 1, 1000))
        base_case.fit_transform(vector_df)
        cumsum = base_case.explained_variance_ratio_.cumsum()
        content_reduction = max(100, len(cumsum[cumsum <= 0.8]))
        redutor = TruncatedSVD(n_components=content_reduction)
    elif dimred == 'nmf':
        # NMF does not expose explained_variance_ratio_, so the variance
        # heuristic cannot be applied; keep the requested component count.
        redutor = NMF(n_components=content_reduction)

    latent_df = redutor.fit_transform(vector_df)
    latent_df = pd.DataFrame(latent_df[:, 0:content_reduction],
                             index=recipes_df.recipe_id.tolist())

    #####################################################################
    #### Using user ratings to create the collaborative latent      ####
    #### matrix, reduced with TruncatedSVD (or NMF)                  ####
    #####################################################################

    ratings_basis = pd.merge(recipes_df[['recipe_id']], reviews_df,
                             on="recipe_id", how="right")
    ratings = ratings_basis.pivot(index='recipe_id', columns='user_id',
                                  values='rating').fillna(0)

    if dimred == 'svd':
        base_case = TruncatedSVD(n_components=min(ratings.shape[1] - 1, 1000))
        base_case.fit_transform(ratings)
        cumsum = base_case.explained_variance_ratio_.cumsum()
        rating_reduction = max(100, len(cumsum[cumsum <= 0.8]))
        redutor = TruncatedSVD(n_components=rating_reduction)
    elif dimred == 'nmf':
        # Same caveat as above: no explained variance available for NMF.
        redutor = NMF(n_components=rating_reduction)

    latent_df_2 = redutor.fit_transform(ratings)
    # The pivot's index already holds the recipe ids in order.
    latent_df_2 = pd.DataFrame(latent_df_2, index=ratings.index)

    #####################################
    #### Exporting latent DataFrames ####
    #####################################

    return latent_df, latent_df_2
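# Example usage (illustrative sketch; this is a module-private helper normally
# reached through get_user_recommendations, and the recipe ids are made up):
#   content, collab = __create_latent_matrices(pool=500,
#                                              user_inputs={48678: 1, 56829: 0},
#                                              local=True)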
def train(self, algo='SVD', like=True, test='cv', local=False):

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
        self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
    else:
        self.recipes = storage.import_file('data/preprocessed', 'recipe_pp.csv')
        self.reviews = storage.import_file('data/preprocessed', 'review_pp.csv')

    if like:
        self.target = 'liked'
        self.s_min = 0
        self.s_max = 1
    else:
        self.target = 'rating'
        self.s_min = 1
        self.s_max = 5

    reader = Reader(rating_scale=(self.s_min, self.s_max))

    self.relevant_data = self.reviews[['user_id', 'recipe_id', self.target]]
    model_data = Dataset.load_from_df(self.relevant_data, reader)

    # Select the Surprise algorithm requested via algo
    if algo == 'NormalPredictor':
        self.algorithm = NormalPredictor()
    elif algo == 'BaselineOnly':
        self.algorithm = BaselineOnly()
    elif algo == 'KNNBasic':
        self.algorithm = KNNBasic()
    elif algo == 'KNNWithMeans':
        self.algorithm = KNNWithMeans()
    elif algo == 'KNNWithZScore':
        self.algorithm = KNNWithZScore()
    elif algo == 'KNNBaseline':
        self.algorithm = KNNBaseline()
    elif algo == 'SVD':
        params = {'n_epochs': 20, 'n_factors': 100,
                  'lr_all': 0.002, 'reg_all': 0.02}
        self.algorithm = SVD(**params)  # Tuned with svd_grid
    elif algo == 'SVDpp':
        self.algorithm = SVDpp()
    elif algo == 'NMF':
        self.algorithm = NMF()
    elif algo == 'SlopeOne':
        self.algorithm = SlopeOne()
    elif algo == 'CoClustering':
        self.algorithm = CoClustering()

    if test == 'cv':
        cv_results = cross_validate(self.algorithm, model_data,
                                    measures=['RMSE', 'MAE'], cv=5, verbose=True)
        rmse = np.round(cv_results['test_rmse'].mean(), 3)
        mae = np.round(cv_results['test_mae'].mean(), 3)
        train_data = model_data.build_full_trainset()
        self.algorithm.fit(train_data)

    elif test == 'svd_grid':
        param_grid = {'n_epochs': [10, 20], 'n_factors': [100, 200],
                      'lr_all': [0.001, 0.002], 'reg_all': [0.01, 0.02]}
        gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
        gs.fit(model_data)
        rmse = gs.best_score['rmse']
        mae = gs.best_score['mae']
        print(gs.best_params['rmse'], gs.best_params['mae'])
        self.algorithm = gs.best_estimator['rmse']
        train_data = model_data.build_full_trainset()
        self.algorithm.fit(train_data)

    else:
        # Hold-out split; named trainset/testset to avoid shadowing the
        # method name and the test argument.
        trainset, testset = train_test_split(model_data, test_size=0.3, random_state=42)
        self.algorithm.fit(trainset)
        predictions = self.algorithm.test(testset)
        rmse = np.round(accuracy.rmse(predictions), 3)
        mae = np.round(accuracy.mae(predictions), 3)

    return rmse, mae
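# Example usage (illustrative sketch; assumes this method lives on the model
# class of this module, named Model here purely for the example):
#   model = Model()
#   rmse, mae = model.train(algo='SVD', like=True, test='cv', local=True)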
def create_latent_matrices(vectorizer='tfidf', dimred='svd', ngram=(1, 1),
                           min_df=1, max_df=1.0, local=False):
    ''' Generates the latent dataframes used for the prediction model '''

    print("\n***** Creating Latent Matrices *****")

    print("Loading preprocessed data for recipes and reviews...")
    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        recipes_df = pd.read_csv(f"{csv_path}/recipe_pp.csv")
        reviews_df = pd.read_csv(f"{csv_path}/review_pp.csv")
    else:
        recipes_df = storage.import_file('data/preprocessed', 'recipe_pp.csv')
        reviews_df = storage.import_file('data/preprocessed', 'review_pp.csv')

    # Test purposes:
    # recipes_df = recipes_df.sample(100)
    # reviews_df = reviews_df[reviews_df['recipe_id'].isin(recipes_df['recipe_id'])]

    print(f"Vectorizing metadata using {vectorizer.upper()} approach...")
    print(f"> Applying ngram {ngram}, min_df {min_df} and max_df {max_df}")
    if vectorizer == 'count':
        vector = CountVectorizer(stop_words='english', ngram_range=ngram,
                                 min_df=min_df, max_df=max_df)
    elif vectorizer == 'tfidf':
        vector = TfidfVectorizer(stop_words='english', ngram_range=ngram,
                                 min_df=min_df, max_df=max_df)
    vector_matrix = vector.fit_transform(recipes_df['metadata'])

    vector_df = pd.DataFrame(vector_matrix.toarray(), index=recipes_df.recipe_id.tolist())

    print(f"Reducing metadata vector dimensions using the {dimred.upper()} approach...")
    if dimred == 'svd':
        m_base_case = TruncatedSVD(n_components=min(vector_df.shape[1] - 1, 1000))
        m_base_case.fit_transform(vector_df)
        m_cumsum = m_base_case.explained_variance_ratio_.cumsum()
        # Keep at least one component even if the 80% threshold selects none.
        content_reduction = max(1, len(m_cumsum[m_cumsum <= 0.8]))
        print(f"> {content_reduction} components considered...")
        m_redutor = TruncatedSVD(n_components=content_reduction)
    elif dimred == 'nmf':
        # NMF does not expose explained_variance_ratio_, so the variance
        # heuristic is unavailable; fall back to a fixed component count
        # (matching the module's 250-component default).
        content_reduction = min(vector_df.shape[1] - 1, 250)
        print(f"> {content_reduction} components considered...")
        m_redutor = NMF(n_components=content_reduction)

    print("Creating metadata's latent dataframe...")
    m_latent_matrix = m_redutor.fit_transform(vector_df)
    content_latent = pd.DataFrame(m_latent_matrix[:, 0:content_reduction],
                                  index=recipes_df.recipe_id.tolist())

    print("Pivoting ratings to user/recipe matrix...")
    ratings_basis = reviews_df.sort_values(by="recipe_id")
    ratings = (ratings_basis.groupby(['recipe_id', 'user_id'])
                            .rating.first().unstack()).fillna(0)

    print(f"Reducing rating vector dimensions using the {dimred.upper()} approach...")
    if dimred == 'svd':
        r_base_case = TruncatedSVD(n_components=min(ratings.shape[1] - 1, 1000))
        r_base_case.fit_transform(ratings)
        r_cumsum = r_base_case.explained_variance_ratio_.cumsum()
        rating_reduction = max(1, len(r_cumsum[r_cumsum <= 0.8]))
        r_redutor = TruncatedSVD(n_components=rating_reduction)
    elif dimred == 'nmf':
        # Same caveat as above: no explained variance available for NMF,
        # so fall back to the module's 800-component default.
        rating_reduction = min(ratings.shape[1] - 1, 800)
        r_redutor = NMF(n_components=rating_reduction)

    print("Creating rating's latent dataframe...")
    r_latent_matrix = r_redutor.fit_transform(ratings)
    # The unstacked pivot's index already holds the recipe ids.
    rating_latent = pd.DataFrame(r_latent_matrix, index=ratings.index)

    print("Exporting latent matrices as CSV...")
    storage.upload_file(content_latent, 'data/models', 'content_latent.csv')
    storage.upload_file(rating_latent, 'data/models', 'rating_latent.csv')
    print("Latent matrix preparation and exporting done!")

    return content_latent, rating_latent
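# Example usage (illustrative sketch; uploads both latent matrices to storage,
# so run it with storage pointed at a test bucket when experimenting):
#   content_latent, rating_latent = create_latent_matrices(vectorizer='tfidf',
#                                                          dimred='svd',
#                                                          min_df=2, max_df=0.8)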
def get_user_recommendations(user_inputs=None, n_recommendations=None,
                             collaborative=0.5, clear_neg=False,
                             content_latent=None, rating_latent=None):
    ''' Gets the recommendations for one user by taking all of the user's liked
    and disliked dishes, getting the recommendation based on each recipe, and
    then summing the scores '''

    print("\n***** Calculating Recommendations *****")
    if user_inputs is None:
        user_inputs = TEST_USER

    print("Loading latent matrices from CSV...")
    if content_latent is None:
        content_latent = storage.import_file('data/models', 'content_latent.csv') \
                                .rename(columns={'Unnamed: 0': 'recipe_id'}) \
                                .set_index("recipe_id")
    if rating_latent is None:
        rating_latent = storage.import_file('data/models', 'rating_latent.csv') \
                               .rename(columns={'Unnamed: 0': 'recipe_id'}) \
                               .set_index("recipe_id")

    print("Listing likes/dislikes and running individual recommendations...")
    user_likes = [recipe for recipe, liked in user_inputs.items() if liked == 1]
    user_dislikes = [recipe for recipe, liked in user_inputs.items() if liked == 0]

    if user_likes:
        recommendations = [get_recommendation(recipe, content_latent,
                                              rating_latent, collaborative)
                           for recipe in user_likes]
        recommendations_df = pd.concat(recommendations)

    if user_dislikes:
        dislikes = [get_recommendation(recipe, content_latent,
                                       rating_latent, collaborative)
                    for recipe in user_dislikes]
        dislike_df = pd.concat(dislikes)
        # Disliked recipes contribute negatively to the summed scores.
        dislike_df[['content', 'collaborative', 'hybrid']] = \
            dislike_df[['content', 'collaborative', 'hybrid']] * (-1)

    print("Grouping and summing recommendation matrices...")
    if user_likes and user_dislikes:
        complete_recs = pd.concat([recommendations_df, dislike_df], axis=0)
    elif user_likes:
        complete_recs = recommendations_df
    elif user_dislikes:
        complete_recs = dislike_df
    else:
        raise ValueError("user_inputs must contain at least one liked or disliked recipe")

    grouped_recommendations = complete_recs.groupby(by="recipe_id").sum() \
                                           .sort_values(by="hybrid", ascending=False)
    grouped_recommendations = grouped_recommendations[
        ~grouped_recommendations.index.isin(user_likes + user_dislikes)]
    if clear_neg:
        grouped_recommendations = grouped_recommendations[
            grouped_recommendations['hybrid'] > 0]

    print("Generating recommendation scores...")
    # Min-max scale the hybrid score to [0, 1]; guard against a zero spread.
    score_min = grouped_recommendations['hybrid'].min()
    score_max = grouped_recommendations['hybrid'].max()
    score_dif = (score_max - score_min) or 1.0
    grouped_recommendations['rec_score'] = np.round(
        (grouped_recommendations['hybrid'] - score_min) / score_dif, 3)
    grouped_recommendations.sort_values(by='rec_score', ascending=False, inplace=True)

    print("Returning final recommendation matrix!")
    if n_recommendations:
        grouped_recommendations = grouped_recommendations.head(n_recommendations)
    return grouped_recommendations
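# Example usage (illustrative sketch; recipe ids are made up for the demo,
# with 1 meaning liked and 0 meaning disliked):
#   recs = get_user_recommendations(user_inputs={1234: 1, 5678: 0},
#                                   n_recommendations=10,
#                                   collaborative=0.6,
#                                   clear_neg=True)
#   recs[['hybrid', 'rec_score']].head()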