def test_fitting():
    """With identity features enabled (the Dataset default), fitting n users
    and m items yields square identity feature matrices and empty interaction
    matrices of the expected shape."""
    n_users, n_items = 10, 100
    ds = Dataset()
    ds.fit(range(n_users), range(n_items))

    assert ds.interactions_shape() == (n_users, n_items)
    # Identity features: one feature column per user / per item.
    assert ds.user_features_shape() == (n_users, n_users)
    assert ds.item_features_shape() == (n_items, n_items)

    interactions, _weights = ds.build_interactions([])
    assert interactions.shape == (n_users, n_items)
    # The identity diagonal contributes exactly one nonzero per row.
    assert ds.build_user_features([]).getnnz() == n_users
    assert ds.build_item_features([]).getnnz() == n_items
def test_fitting_no_identity():
    """When identity features are disabled, the feature matrices have zero
    columns and (without normalization) no nonzero entries."""
    n_users, n_items = 10, 100
    ds = Dataset(user_identity_features=False, item_identity_features=False)
    ds.fit(range(n_users), range(n_items))

    assert ds.interactions_shape() == (n_users, n_items)
    # No identity block: the feature dimension collapses to zero.
    assert ds.user_features_shape() == (n_users, 0)
    assert ds.item_features_shape() == (n_items, 0)

    interactions, _weights = ds.build_interactions([])
    assert interactions.shape == (n_users, n_items)
    # normalize=False avoids dividing by an all-zero row sum.
    assert ds.build_user_features([], normalize=False).getnnz() == 0
    assert ds.build_item_features([], normalize=False).getnnz() == 0
# NOTE(review): this function has the same name as an earlier test_fitting in
# this module; at import time this later definition shadows the earlier one,
# so only one of the two is ever collected and run. Consider renaming or
# removing one of them.
def test_fitting():
    """Fitting users/items with default identity features produces square
    feature matrices and correctly shaped (empty) interaction matrices."""
    users, items = 10, 100
    dataset = Dataset()
    dataset.fit(range(users), range(items))
    assert dataset.interactions_shape() == (users, items)
    # Default Dataset() adds one identity feature per user and per item.
    assert dataset.user_features_shape() == (users, users)
    assert dataset.item_features_shape() == (items, items)
    assert dataset.build_interactions([])[0].shape == (users, items)
    # Identity diagonal => exactly one nonzero entry per user/item.
    assert dataset.build_user_features([]).getnnz() == users
    assert dataset.build_item_features([]).getnnz() == items
# NOTE(review): duplicate of an earlier test_fitting_no_identity in this
# module; this later definition shadows the earlier one, so only one of the
# two is collected and run. Consider renaming or removing one of them.
def test_fitting_no_identity():
    """Disabling identity features yields zero-width feature matrices with no
    nonzero entries (when normalization is off)."""
    users, items = 10, 100
    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(users), range(items))
    assert dataset.interactions_shape() == (users, items)
    # No identity block: feature dimension is zero.
    assert dataset.user_features_shape() == (users, 0)
    assert dataset.item_features_shape() == (items, 0)
    assert dataset.build_interactions([])[0].shape == (users, items)
    # normalize=False avoids dividing by an all-zero row sum.
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
def main():
    """Incrementally register user/item ids from staged click logs.

    Reads the train and test click CSVs for every stage up to
    ``current_stage`` and feeds the training clicks' ids into the LightFM
    Dataset via ``fit_partial``, then reports the resulting dimensions.
    """
    current_stage = 6
    # NOTE(review): `model` is created but not used within this block —
    # presumably trained by code outside this view; confirm.
    model = LightFM(no_components=30)
    dataset = Dataset()
    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        # NOTE(review): click_test is loaded but not used in this block —
        # either consumed further down (outside this view) or dead; confirm.
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        # Only the training clicks contribute to the id mappings.
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
    num_users, num_items = dataset.interactions_shape()
    log('Num users: {}, num_items {}.'.format(num_users, num_items))
def load_parameter():
    """Load ratings/books/users, build a LightFM dataset with author item
    features, train an explicit-feedback WARP model, and return the artifacts.

    Returns:
        Tuple of (trained model, dataset, interactions COO matrix,
        weights COO matrix, item feature matrix, books as a DataFrame).
    """
    ratings = get_ratings()
    books = get_books()
    users = get_users()
    books_pd = convert_pd(books)
    id_users_books = StoreValue()
    # Collect the (user_id, book_id) columns from the raw rating rows.
    for x in ratings:
        id_users_books._user_id.append(x[0])
        id_users_books._book_id.append(x[1])
    # Built following the guide at
    # https://making.lyst.com/lightfm/docs/examples/dataset.html
    dataset_explicit = Dataset()
    dataset_explicit.fit(id_users_books._user_id, id_users_books._book_id)
    num_users, num_items = dataset_explicit.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    # Register additional item ids plus one feature per book's column 7.
    # NOTE(review): x[7] appears to be the author field (it is used as the
    # per-book feature below) — confirm against get_books().
    dataset_explicit.fit_partial(items=(x[0] for x in books), item_features=(x[7] for x in books))
    dataset_explicit.fit_partial(users=(x[0] for x in users))
    # Build the interactions as a COO matrix from (user_id, book_id) pairs;
    # weights carry the interaction weights.
    (interactions_explicit, weights_explicit) = dataset_explicit.build_interactions((id_users_books._user_id[i], id_users_books._book_id[i]) for i in range(len(ratings)))
    # Item features derived from each book's column-7 value.
    item_features = dataset_explicit.build_item_features(((x[0], [x[7]]) for x in books))
    # user_features = dataset_explicit.build_user_features(((x[0], [x[1]]) for x in users))
    model_explicit_ratings = LightFM_ext(loss='warp')
    (train, test) = random_train_test_split(interactions=interactions_explicit, test_percentage=0.02)
    # NOTE(review): weights_explicit is returned but not passed to fit() —
    # training ignores the rating weights; confirm this is intentional.
    model_explicit_ratings.fit(train, item_features=item_features, epochs=2, num_threads=4)
    return model_explicit_ratings, dataset_explicit, interactions_explicit, weights_explicit, item_features, books_pd
def run_learning_curve(test_fraction, max_epoch):
    """Trace precision/recall learning curves for a cold-start (cs) model vs
    a warm-start (ws) model with user features, saving results to CSV.

    Args:
        test_fraction: fraction of interactions held out for testing.
        max_epoch: upper bound on epochs; models are evaluated at epoch
            counts 0, 2, 4, ... up to max_epoch (exclusive).
    """
    # create data_train
    data = Dataset(user_identity_features=True)
    # user features
    user_features, user_feature_names = get_user_features()
    # create map between user_id, post_id, user_features and internal indices
    data.fit((x['user_id'] for x in get_data()), (x['post_id'] for x in get_data()), user_features=user_features)
    # print shape
    num_users, num_items = data.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    # ---------------------------
    # Building the interactions matrix
    # ---------------------------
    # create interaction matrix to optimize
    (interactions, weights) = data.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
    print(repr(interactions))
    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data.mapping()
    # split test and train
    interaction_train, interaction_test = cross_validation.random_train_test_split(interactions, test_fraction)
    # ---------------------------
    # train model
    # ---------------------------
    model_cs = LightFM(learning_rate=0.05, loss='warp')
    model_ws = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))
    precision_cs = []
    precision_ws = []
    recall_cs = []
    recall_ws = []
    # NOTE(review): each iteration re-fits both models from scratch with an
    # ever larger epoch count (0, 2, 4, ...) — quadratic total work, but it
    # keeps each data point independent. The epoch-0 point evaluates an
    # untrained model (the curve's baseline).
    for epoch in range(int(max_epoch/2)):
        model_cs.fit(interaction_train, epochs=int(epoch*2))
        model_ws.fit(interaction_train, user_features=user_features, epochs=int(epoch*2))
        # calculate precision and recall for each epoch
        precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test, interaction_train)
        precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)
        recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test, interaction_train)
        recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)
        # append the per-user mean of each metric to the results
        precision_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
        precision_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
        recall_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
        recall_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
    df_result = pd.DataFrame({
        "precision_cs": precision_cs,
        "precision_ws": precision_ws,
        "recall_cs": recall_cs,
        "recall_ws": recall_ws,
    })
    # save to file
    df_result.to_csv("data/validation/df.epoch.csv", index=False)
    return
def run_lightfm(ratings, train, test, k_items, dataset):
    """Train a hybrid LightFM (BPR) model with movie metadata features and
    report precision/recall at each k in ``k_items``.

    Args:
        ratings: unused here — presumably consumed by the caller; confirm.
        train: DataFrame of training interactions with userId/movieId columns.
        test: DataFrame of test interactions with userId/movieId/rating.
        k_items: iterable of cutoff values k to evaluate at.
        dataset: DataFrame of movie metadata (budget, gross, awards,
            nominations, votes, genres, rating).

    Returns:
        (prec_list, rec_list): dicts mapping each k to its mean metric.
    """
    def create_interaction_matrix(df, user_col, item_col, rating_col, norm=False, threshold=None):
        '''
        Function to create an interaction matrix dataframe from transactional
        type interactions

        Required Input -
            - df = Pandas DataFrame containing user-item interactions
            - user_col = column name containing user's identifier
            - item_col = column name containing item's identifier
            - rating_col = column name containing user feedback on interaction
              with a given item
            - norm (optional) = True if a normalization of ratings is needed
            - threshold (required if norm = True) = value above which the
              rating is favorable

        Expected output -
            - Pandas dataframe with user-item interactions ready to be fed in
              a recommendation algorithm
        '''
        interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
        if norm:
            interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
        return interactions

    test_interactions = create_interaction_matrix(df=test, user_col='userId', item_col='movieId', rating_col='rating')
    # Collect the unique values of each metadata column — these become the
    # vocabulary of item features the Dataset must know about.
    budget_l = dataset.budget.unique().tolist()
    gross_l = dataset.gross.unique().tolist()
    awards_l = dataset.awards.unique().tolist()
    nom_l = dataset.nominations.unique().tolist()
    votes_l = dataset.votes.unique().tolist()
    item_ids = np.unique(train.movieId.astype(int))
    print(f'length dataset: {len(dataset)}')
    # Keep only the metadata rows for movies that appear in the training set.
    dataset = dataset[dataset.movieId.isin(item_ids)]
    print(f'length dataset: {len(dataset)}')
    item_features_list = [f'rating_{f}' for f in range(11)]
    gen = [
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
    ]  # 'unknown' add unknown for movielens100k
    item_features_list += gen
    item_features_list += budget_l
    item_features_list += gross_l
    item_features_list += awards_l
    item_features_list += nom_l
    item_features_list += votes_l
    # Build (movie_id, [feature, ...]) rows: numeric metadata plus one entry
    # per genre.
    item_features = []
    for y, x in dataset.iterrows():
        genres = x['genres']
        tmp_row = (int(x['movieId']), [
            x['rating'], x['budget'], x['gross'], x['awards'],
            x['nominations'], x['votes']
        ])
        for g in genres:
            tmp_row[1].append(g)
        item_features.append(tmp_row)
    #item_features = [(int(x['movieId']), [x['rating'], z, x['budget'], x['gross'], x['awards'], x['votes']]) for y, x in dataset.iterrows() for z in x['genres']] #x['nominations']
    user_ids = np.unique(train.userId)
    built_dif = Dataset()
    built_dif.fit_partial(users=user_ids)
    built_dif.fit_partial(items=item_ids)
    built_dif.fit_partial(item_features=item_features_list)
    dataset_item_features = built_dif.build_item_features(item_features)
    (interactions, weights) = built_dif.build_interactions(
        ((int(x['userId']), int(x['movieId'])) for y, x in train.iterrows()))
    modelx = LightFM(no_components=30, loss='bpr', k=15, random_state=1)
    modelx.fit(interactions,
               epochs=30,
               num_threads=4,
               item_features=dataset_item_features
               )  #item_features=dataset_item_features
    # Convert the pivoted test matrix to the sparse COO form the evaluation
    # functions expect.
    test = sparse.csr_matrix(test_interactions.values)
    test = test.tocoo()
    num_users, num_items = built_dif.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    prec_list = dict()
    rec_list = dict()
    for num_k in k_items:
        trainprecision = precision_at_k(
            modelx, test, k=num_k, item_features=dataset_item_features).mean(
            )  #item_features=dataset_item_features,
        print('Hybrid training set precision: %s' % trainprecision)
        trainrecall = recall_at_k(modelx,
                                  test,
                                  k=num_k,
                                  item_features=dataset_item_features).mean(
                                  )  #item_features=dataset_item_features
        print('Hybrid training set recall: %s' % trainrecall)
        # NOTE(review): on the first occurrence of a k the dict stores a
        # scalar, but on a repeated k the code calls .append() on that scalar,
        # which would raise AttributeError. Harmless while k_items contains
        # unique values — confirm, or store lists consistently.
        if num_k in prec_list:
            prec_list[num_k].append(trainprecision)
        else:
            prec_list[num_k] = trainprecision
        if num_k in rec_list:
            rec_list[num_k].append(trainrecall)
        else:
            rec_list[num_k] = trainrecall
    return prec_list, rec_list
def main(train_file, val_file, test_file, weight, output_file):
    """Grid-search a LightFM WARP model over rank/regularization using
    parquet interaction data, then report test precision@500.

    Args:
        train_file, val_file, test_file: parquet files with columns
            user_id, book_id, rating.
        weight: the string 'True' to use rating+1 as interaction weights,
            anything else for unweighted interactions.
        output_file: path to append human-readable results to.
    """
    # Read data from parquet
    print('Reading data ...')
    train_df = pd.read_parquet(train_file)
    val_df = pd.read_parquet(val_file)
    test_df = pd.read_parquet(test_file)
    train_df = train_df[['user_id', 'book_id', 'rating']]
    val_df = val_df[['user_id', 'book_id', 'rating']]
    test_df = test_df[['user_id', 'book_id', 'rating']]
    # Build the ID mappings
    print('Building the ID mappings ...')
    train = Dataset()
    train.fit((x for x in train_df.user_id), (x for x in train_df.book_id))
    # mapping() returns (user id map, user feature map, item id map,
    # item feature map); we only need the id maps.
    user_map = train.mapping()[0]
    item_map = train.mapping()[2]
    train_size = train.interactions_shape()
    with open(output_file, "a") as f:
        f.write(
            'There are {} interactions in the training data, including {} users and {} items \n'
            .format(len(train_df), train_size[0], train_size[1]))
    print(
        'There are {} interactions in the training data, including {} users and {} items'
        .format(len(train_df), train_size[0], train_size[1]))
    # Build the interactions matrix
    print('Building the interactions and weights matrix ...')
    if weight == 'True':
        train_df.rating = train_df.rating + 1  # use rating +1 as weights
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1], i[1][2]) for i in train_df.iterrows()))
    else:
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1]) for i in train_df.iterrows()))
    # filter out interactions with rating >= 3 as true label
    val_df = val_df[val_df.rating >= 3].reset_index(drop=True)
    val_user = np.array([user_map[i] for i in val_df.user_id])
    val_item = np.array([item_map[i] for i in val_df.book_id])
    val_data = val_df.rating
    # Sparse validation/test matrices in the training matrix's index space.
    val_int = coo_matrix((val_data, (val_user, val_item)), shape=train_size)
    test_df = test_df[test_df.rating >= 3].reset_index(drop=True)
    test_user = np.array([user_map[i] for i in test_df.user_id])
    test_item = np.array([item_map[i] for i in test_df.book_id])
    test_data = test_df.rating
    test_int = coo_matrix((test_data, (test_user, test_item)), shape=train_size)
    print('Running grid search on ranks and regularizations ...')
    ranks = [10, 20, 30]
    regs = [0, 1e-5, 5e-5]
    max_precision = -1
    best_rank = None
    best_reg = None
    best_training_time = None
    best_eval_time = None
    best_model = None
    # Do grid search on ranks and regularizations using training and
    # validation data
    for rank in ranks:
        for reg in regs:
            start_time = time.time()
            model = LightFM(no_components=rank,
                            item_alpha=reg,
                            user_alpha=reg,
                            loss='warp',
                            random_state=1211)
            # OPTIMIZE: precision@k
            model.fit(train_int, sample_weight=train_weight, epochs=10)
            train_end_time = time.time()
            val_precision = precision_at_k(model,
                                           val_int,
                                           train_interactions=train_int,
                                           k=500).mean()
            eval_end_time = time.time()
            with open(output_file, "a") as f:
                f.write(
                    'Rank %2d & Reg %.5f Validation Precision@500: %.5f \n'
                    % (rank, reg, val_precision))
            print('Rank %2d & Reg %.5f Validation Precision@500: %.5f'
                  % (rank, reg, val_precision))
            if val_precision > max_precision:
                max_precision = val_precision
                best_rank = rank
                best_reg = reg
                best_training_time = train_end_time - start_time
                best_eval_time = eval_end_time - train_end_time
                best_model = model
    # Evaluate best model performance on test set
    test_precision = precision_at_k(best_model,
                                    test_int,
                                    train_interactions=train_int,
                                    k=500).mean()
    with open(output_file, "a") as f:
        f.write(
            'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f \n'
            % (best_rank, best_reg, test_precision))
        f.write('The training takes %ss and evaluation takes %ss \n'
                % (best_training_time, best_eval_time))
    print(
        'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f'
        % (best_rank, best_reg, test_precision))
    print('The training takes %ss and evaluation takes %ss'
          % (best_training_time, best_eval_time))
def main():
    """Flask route handler for the group-recommendation UI.

    Dispatches on request method and form contents:
      POST 'run-mf-model'       -> matrix-factorization recommendations
      POST 'run-siamese-model'  -> LightFM-based recommendations
      POST 'person-select-gender-0' -> collect member ages/genders
      POST 'people-select'      -> set group size, init the vote array
      POST 'person-select-0'    -> record one round of votes
      GET                       -> reset the session and show trending movies
    """
    if request.method == 'POST':
        global df_movies
        # global top_trending_ids
        # print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
        print(request.form)
        # Get recommendations!
        if 'run-mf-model' in request.form:
            # Drop the last two (unrated) slots, then truncate every user's
            # vector to the number of movies actually rated.
            for i, user_rating in enumerate(session['arr']):
                session['arr'][i] = user_rating[:-2]
            session['movieIds'] = session['movieIds'][:-2]
            rated_movies = min(len(session['arr'][0]), len(session['movieIds']))
            for i, user_rating in enumerate(session['arr']):
                session['arr'][i] = user_rating[:rated_movies]
            session['movieIds'] = session['movieIds'][:rated_movies]
            pu = recommendation_mf(session['arr'], session['members'], session['movieIds'])
            # Reset the session for the next group, re-seeding the trending list.
            session.clear()
            top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
            session['counter'] = 0
            session['members'] = 0
            session['userAges'] = []
            session['userGenders'] = []
            session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml)
            session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title)
            session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
            session['arr'] = None
            return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
        if 'run-siamese-model' in request.form:
            # global df
            global friends
            global ratings
            global new_friend_id
            # The group is modelled as a single synthetic "friend" whose
            # rating for each movie is the median of the members' votes.
            new_ratings = []
            for mid, movie_real_id in enumerate(session['movieIds']):
                avg_mv_rating = np.median(np.array([user_ratings[mid] for user_ratings in session['arr']]))
                new_ratings.append({'movie_id_ml':movie_real_id, 'rating': avg_mv_rating, 'friend_id': new_friend_id})
            new_friend = {'friend_id': new_friend_id, 'friends_age': np.mean(np.array(session['userAges'])), 'friends_gender': np.mean(np.array(session['userGenders']))}
            friends.append(new_friend)
            ratings.extend(new_ratings)
            dataset = LightFMDataset()
            # NOTE(review): feature tuples are built by eval()-ing these
            # template strings against each row `x`. The strings are
            # module-constant (not user input), but eval is fragile and worth
            # replacing with plain tuple-building functions.
            item_str_for_eval = "x['title'],x['release'], x['unknown'], x['action'], x['adventure'],x['animation'], x['childrens'], x['comedy'], x['crime'], x['documentary'], x['drama'], x['fantasy'], x['noir'], x['horror'], x['musical'],x['mystery'], x['romance'], x['scifi'], x['thriller'], x['war'], x['western'], *soup_movie_features[x['soup_id']]"
            friend_str_for_eval = "x['friends_age'], x['friends_gender']"
            dataset.fit(users=(int(x['friend_id']) for x in friends),
                        items=(int(x['movie_id_ml']) for x in movies),
                        item_features=(eval("("+item_str_for_eval+")") for x in movies),
                        user_features=((eval(friend_str_for_eval)) for x in friends))
            num_friends, num_items = dataset.interactions_shape()
            print(f'Num friends: {num_friends}, num_items {num_items}. {datetime.datetime.now()}')
            (interactions, weights) = dataset.build_interactions(((int(x['friend_id']), int(x['movie_id_ml'])) for x in ratings))
            item_features = dataset.build_item_features(((x['movie_id_ml'], [eval("("+item_str_for_eval+")")]) for x in movies) )
            user_features = dataset.build_user_features(((x['friend_id'], [eval(friend_str_for_eval)]) for x in friends) )
            print(f"Item and User features created {datetime.datetime.now()}")
            epochs = 50 #150
            lr = 0.015
            max_sampled = 11
            loss_type = "warp" # "bpr"
            model = LightFM(learning_rate=lr, loss=loss_type, max_sampled=max_sampled)
            model.fit_partial(interactions, epochs=epochs, user_features=user_features, item_features=item_features)
            train_precision = precision_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()
            train_auc = auc_score(model, interactions, user_features=user_features, item_features=item_features).mean()
            print(f'Precision: {train_precision}, AUC: {train_auc}, {datetime.datetime.now()}')
            k = 18
            top_movie_ids, scores = predict_top_k_movies(model, new_friend_id, k, num_items, user_features=user_features, item_features=item_features, use_features = False)
            top_movies = df_movies[df_movies.movie_id_ml.isin(top_movie_ids)]
            pu = recommendation_siamese(top_movies, scores)
            return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
        # Collect friends info
        elif 'person-select-gender-0' in request.form:
            for i in range(session['members']):
                session['userAges'].append(int(request.form.get(f'age-{i}')))
                session['userGenders'].append(int(request.form.get(f'person-select-gender-{i}')))
            return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': session['members'], 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
        # Choose number of people in the group
        elif 'people-select' in request.form:
            count = int(request.form.get('people-select'))
            session['members'] = count
            # One 15-slot vote vector per group member.
            session['arr'] = [[0 for x in range(15)] for y in range(count)]
            return(render_template('main.html', settings = {'friendsInfo':True, 'showVote': False, 'people': count, 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
        # All people voting
        elif 'person-select-0' in request.form:
            for i in range(session['members']):
                session['arr'][i][session['counter']] = int(request.form.get(f'person-select-{i}'))
            session['counter'] += 1
            # Keep voting until all 15 movies have been rated.
            if session['counter'] < 15:
                return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
            else:
                return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':True, 'recommendation': None}))
    elif request.method == 'GET':
        # Fresh visit: reset session state and sample 15 trending movies.
        session.clear()
        top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
        print(top_trending_ids)
        print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
        session['counter'] = 0
        session['members'] = 0
        session['userAges'] = []
        session['userGenders'] = []
        session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml)
        session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title)
        session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
        session['arr'] = None
        return(render_template('main.html', settings = {'showVote': False, 'people': 0, 'buttonDisable': False, 'recommendation': None}))
#The first thing we need to do is to create a mapping between the user and item ids from our input data to indices that will be used internally by our model. #We do this because LightFM works with user and item ids that are consecutive non-negative integers. The `Dataset` class allow us to create a mapping between the IDs we use in our systems and the consecutive indices preferred by the model. #To do this, we create a dataset and call its `fit` method. The first argument is an iterable of all user ids in our data, and the second is an iterable of all item ids. In this case, we use generator expressions to lazily iterate over our data and yield user and item ids: dataset = Dataset() dataset.fit((x['User-ID'] for x in get_ratings()), (x['ISBN'] for x in get_ratings())) #This call will assign an internal numerical id to every user and item id we pass in. These will be contiguous (from 0 to however many users and items we have), and will also determine the dimensions of the resulting LightFM model. #We can check that the mappings have been created by querying the dataset on how many users and books it knows about: num_users, num_items = dataset.interactions_shape() print('Num users: {}, num_items {}.'.format(num_users, num_items)) #Note that if we don't have all user and items ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings: dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), item_features=(x['Book-Author'] for x in get_book_features())) #This will create a feature for every unique author name in the dataset. #(Note that we fit some more item ids: this is to make sure our mappings are complete even if there are items in the features dataset that are not in the interactions set.) ## Building the interactions matrix
print("Load in movie ratings file") # reader2 = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1) # ratingsDS = Dataset.load_from_file('./data/movie/ratings.csv', reader2) # ratingsDF = pd.read_csv('./data/movie/ratings.csv') # ratingsDF = ratingsDF.rename(columns={'userId': 'user_id', 'movieId': 'movie_id', 'rating': 'rating'}) newUser = pd.DataFrame(data=[[60000, 264, 5, 'heroic'], [60000, 18, 3, 'historic'], [60000, 70, 4, 'sci-fi']], columns=['user_id', 'book_id', 'rating', 'tag_name']) userDS = Dataset() userDS.fit((x['user_id'] for _, x in newUser.iterrows()), (x['book_id'] for _, x in newUser.iterrows()), item_features=(x['tag_name'] for _, x in newUser.iterrows())) print("Building training set") # ratingsTrain = ratingsDS.build_full_trainset() num_users, num_items = userDS.interactions_shape() (interactions, weights) = userDS.build_interactions( ((x['user_id'], x['book_id']) for _, x in newUser.iterrows())) print(newUser.head()) print("Starting fit") # movieAlgo.fit(ratingsTrain) lightFMAlgo.fit_partial(interactions, sample_weight=weights) print("Finished fit") print("Sending to dump file") # joblib.dump(lightFMAlgo, "./RecommenderDump/algorithm2_dump") print("Sent to dump file")
# print out the ratings #for line in islice(ratings, 2): #print(json.dumps(line, indent=4)) # print out the book features #for line in islice(book_features, 1): #print(json.dumps(line, indent=4)) # create a dataset and build the ID mappings dataset = Dataset() dataset.fit((x['User-ID'] for x in get_ratings()), (x['ISBN'] for x in get_ratings())) # query the dataset to check how many users and items (i.e. books) it knows num_users, num_items = dataset.interactions_shape() print('Num users : {}, num_items {}.'.format(num_users, num_items)) # add some item feature mappings, and creates a unique feature for each author # NOTE: more item ids are fitted than usual, to make sure our mappings are complete # even if there are items in the features dataset that are not in the interaction set dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), item_features=(x['Book-Author'] for x in get_book_features())) # build the interaction matrix which is a main input to the LightFM model # it encodes the interactions between the users and the items (interactions, weights) = dataset.build_interactions( ((x['User-ID'], x['ISBN']) for x in get_ratings())) # item_features matrix can also be created
def run_validation(test_fraction, max_val):
    """Cross-validate cold-start (cs) vs warm-start (ws) LightFM models.

    Runs ``max_val`` independent train/test splits; for each split trains a
    model without user features (cold start) and one with user features
    (warm start), records mean precision@k, recall@k and AUC for both, and
    writes the per-iteration results to data/validation/df.csv.

    Args:
        test_fraction: fraction of interactions held out for testing.
        max_val: number of validation iterations to run.
    """
    # containers to hold results
    ave_precision_at_k_cs = []
    ave_recall_at_k_cs = []
    ave_auc_score_cs = []
    ave_precision_at_k_ws = []
    ave_recall_at_k_ws = []
    ave_auc_score_ws = []
    # perform validation
    validation_itr = 0
    while (validation_itr < max_val):
        print("Start validating cold, warm start, iteration %s" % validation_itr)
        # prevent random failure to abort entire job
        try:
            # count
            validation_itr += 1
            # create data_train
            data_cs = Dataset()
            data_ws = Dataset(user_identity_features=True)
            # user features
            user_features, user_feature_names = get_user_features()
            print(user_feature_names)
            # create map between user_id, post_id, user_features and internal indices
            data_cs.fit((x['user_id'] for x in get_data()), (x['post_id'] for x in get_data()))
            data_ws.fit((x['user_id'] for x in get_data()), (x['post_id'] for x in get_data()), user_features=user_features)
            # print shape
            num_users, num_items = data_ws.interactions_shape()
            print('Num users: {}, num_items {}.'.format(num_users, num_items))
            # ---------------------------
            # Building the interactions matrix
            # ---------------------------
            # create interaction matrix to optimize
            (interactions_cs, weights_cs) = data_cs.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            (interactions_ws, weights_ws) = data_ws.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            print(repr(interactions_ws))
            # retrieve mapping from dataset
            user_id_map_cs, user_feature_map_cs, item_id_map_cs, item_feature_map_cs = data_cs.mapping()
            user_id_map_ws, user_feature_map_ws, item_id_map_ws, item_feature_map_ws = data_ws.mapping()
            # split test and train
            interaction_train_cs, interaction_test_cs = cross_validation.random_train_test_split(interactions_cs, test_fraction)
            interaction_train_ws, interaction_test_ws = cross_validation.random_train_test_split(interactions_ws, test_fraction)
            # ---------------------------
            # train model
            # ---------------------------
            model_cs = LightFM(learning_rate=0.05, loss='warp')
            model_ws = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))
            model_cs.fit(interaction_train_cs, epochs=30)
            model_ws.fit(interaction_train_ws, user_features=user_features, epochs=30)
            # ---------------------------
            # make predictions
            # ---------------------------
            precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            auc_score_cs = evaluation.auc_score(model_cs, interaction_test_cs, interaction_train_cs)
            precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            auc_score_ws = evaluation.auc_score(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            # append score from each iteration to results
            ave_precision_at_k_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
            ave_recall_at_k_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
            ave_auc_score_cs.append(sum(auc_score_cs) / len(auc_score_cs))
            ave_precision_at_k_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
            ave_recall_at_k_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
            ave_auc_score_ws.append(sum(auc_score_ws) / len(auc_score_ws))
        # BUGFIX: was a bare `except:` which also swallows SystemExit and
        # KeyboardInterrupt; narrowed to Exception to keep the deliberate
        # best-effort behavior without trapping interpreter-exit signals.
        except Exception:
            # BUGFIX: message read "teration %s failed" — typo corrected.
            print("Iteration %s failed. Skipping.." % validation_itr)
    print("Validation score for test")
    print(ave_precision_at_k_cs)
    print(ave_recall_at_k_cs)
    print(ave_auc_score_cs)
    print(ave_precision_at_k_ws)
    print(ave_recall_at_k_ws)
    print(ave_auc_score_ws)
    df_result = pd.DataFrame({
        'precision_at_k_cs': ave_precision_at_k_cs,
        'recall_at_k_cs': ave_recall_at_k_cs,
        'auc_score_cs': ave_auc_score_cs,
        'precision_at_k_ws': ave_precision_at_k_ws,
        'recall_at_k_ws': ave_recall_at_k_ws,
        'auc_score_ws': ave_auc_score_ws,
    })
    # save to file
    df_result.to_csv("data/validation/df.csv", index=False)
    return
def preprocess():
    """Load the Yelp user/business/review CSVs, build a LightFM dataset with
    user and item features, and return a train/test interaction split.

    Returns:
        (train, test, data_business, dataset, user_features, item_features)

    Raises:
        AssertionError: if the train and test interaction matrices overlap.
    """
    import pandas as pd
    import math
    import numpy as np

    data_users = pd.read_csv('users_tag.csv', index_col=0)
    data_business = pd.read_csv('business_Nora.csv', index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv', index_col=0)

    # log(1 + x) transform to reduce skew in the count-like user columns.
    data_users.review_count = pd.Series([math.log(x + 1) for x in data_users.review_count])
    data_users.useful = pd.Series([math.log(x + 1) for x in data_users.useful])
    # clean business skewness
    data_business.review_count = pd.Series([math.log(x + 1) for x in data_business.review_count])

    from lightfm.data import Dataset

    # Build the user/item id mappings from the review log, then register
    # the feature names with fit_partial.
    dataset = Dataset()
    dataset.fit(data_review.user_id, data_review.business_id)
    num_users, num_items = dataset.interactions_shape()

    dataset.fit_partial(items=data_business.business_id, item_features=['stars'])
    dataset.fit_partial(items=data_business.business_id, item_features=['review_count'])
    # Columns 24+ are the per-business tag (tf-idf) columns — TODO confirm
    # against business_Nora.csv's actual layout.
    tar_cols = [x for x in data_business.columns[24:]]
    dataset.fit_partial(items=data_business.business_id, item_features=tar_cols)

    # Iterating a DataFrame yields its column names, so this both validates
    # that the columns exist and produces the list of user feature names.
    user_cols = [x for x in data_users[['review_count', 'useful',
                                        'Ice Cream & Frozen Yogurt', 'Korean',
                                        'Tapas/Small Plates', 'Vietnamese', 'Vegan',
                                        'Caribbean', 'Food Delivery Services',
                                        'Lounges', 'Pubs', 'Greek', 'Cocktail Bars',
                                        'Mexican', 'Wine Bars', 'Tea Rooms', 'Delis',
                                        'Vegetarian', 'Ethnic Food', 'Salad',
                                        'Seafood', 'Beer', 'American (New)',
                                        'Juice Bars & Smoothies', 'Shopping',
                                        'Barbeque', 'Sports Bars', 'French',
                                        'Chicken Wings', 'Gastropubs', 'Diners',
                                        'Gluten-Free', 'Thai', 'Comfort Food',
                                        'Health Markets', 'Halal', 'Caterers',
                                        'Arts & Entertainment']]]
    dataset.fit_partial(users=data_users.user_id, user_features=user_cols)

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        [(x['user_id'], x['business_id'], x['stars'])
         for index, x in data_review.iterrows()])
    print("Interactions Build")

    def build_dict(df, tar_cols, val_list):
        """Scale one row's tag values so they and val_list share a fixed budget.

        NOTE(review): the scale factor is (2 - sum(val_list)) / sum_val, so the
        scaled tags plus val_list actually sum to 2, not 1 as the original
        comment claimed — preserved as-is; confirm the intent.
        The original file contained an identical `user_build_dict` copy of this
        function; both call sites now share this single helper.
        """
        rst = {col: df[col] for col in tar_cols}
        sum_val = sum(rst.values())  # sum of all the tf-idf values
        if sum_val == 0:
            return rst
        w = (2 - sum(val_list)) / sum_val  # weight applied to every tag
        return {key: value * w for key, value in rst.items()}

    # get max of each column to regularize values to [0, 1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)

    # Give CF info weight 0.5, all other 0.5; within "other", (star,
    # review_count) get 0.25 each and the tags share the rest.
    # NOTE: the **merge comes last, so if tar_cols ever contained 'stars' or
    # 'review_count' the merged values would win (mirrors the original).
    item_features = dataset.build_item_features(
        ((x['business_id'],
          {'stars': 0.5 * x['stars'] / max_star,
           'review_count': 0.5 * x['review_count'] / max_b_rc,
           **build_dict(x, tar_cols, [0.5 * x['stars'] / max_star,
                                      0.5 * x['review_count'] / max_b_rc])})
         for index, x in data_business.iterrows()))

    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    # user_cols includes 'review_count' and 'useful', and the **merge comes
    # last, so the scaled-tag values override the explicit 0.35-weighted
    # entries — exactly as in the original expression.
    user_features = dataset.build_user_features(
        ((x['user_id'],
          {'review_count': 0.35 * x['review_count'] / max_u_rc,
           'useful': 0.35 * x['useful'] / max_useful,
           **build_dict(x, user_cols, [0.35 * x['review_count'] / max_u_rc,
                                       0.35 * x['useful'] / max_useful])})
         for index, x in data_users.iterrows()))

    # train-test split; alternative seeds (12345, 101, 186) were used to
    # check for split bias.
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train, test = random_train_test_split(
        interactions, test_percentage=0.2,
        random_state=np.random.RandomState(seed))
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    # Fix: this check was a bare expression whose result was silently
    # discarded; assert so an overlapping split actually fails.
    assert train.multiply(test).nnz == 0  # train and test must be disjoint
    return train, test, data_business, dataset, user_features, item_features
item_meta = pd.read_csv('data/books.csv') item_meta = item_meta[['book_id', 'authors', 'average_rating', 'original_title']] item_features_source = [(item_meta['book_id'][i], [item_meta['authors'][i], item_meta['average_rating'][i]]) for i in range(item_meta.shape[0])] # Construct Data-set # set, list, pandas series 모두 가능하다. # 먼저 User/Item Index를 mapping하고, User Features/Item Features를 추가한 후 # occurence 데이터를 fit한다. # 혹은 scipy.csr_matrix를 바로 fit하는 것도 가능하다. # 주의: Null 값은 다 채운 후여야 한다. dataset = Dataset() dataset.fit(users=ratings['user_id'].unique(), items=ratings['book_id'].unique(), item_features=item_meta[item_meta.columns[1:]].values.flatten() ) print("Num Users: {}, Num Items: {}".format(*dataset.interactions_shape())) print(dataset.user_features_shape(), dataset.item_features_shape()) interactions, weights = dataset.build_interactions(ratings_source) item_features = dataset.build_item_features(item_features_source) # mappings = dataset.mapping() # Save # mmwrite('data/interactions.mtx', interactions) # mmwrite('data/item_features.mtx', item_features) # mmwrite('data/weights.mtx', weights)
def main():
    """Refit a pickled LightFM model stored in S3 with new interaction data
    and upload per-user recommendations plus the updated model back to S3.

    NOTE(review): the source was whitespace-mangled; loop/body boundaries
    below were reconstructed from statement order — verify against the
    original before relying on the exact S3 write placement.
    """
    # n = len(sys.argv)
    # if n > 0:
    #     f = sys.argv[0]
    # else:
    #     f = 'new_sample.csv'
    # Start imports from s3
    bucket_name = 'forumrecbucket'
    samplecsv_key = 'new_sample.csv'
    pickle_key = 'savefile.pickle'
    item_features_key = 'item_features.npz'
    post_mappings_key = 'post_mappings.csv'
    client = boto3.client(
        's3')  #, aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
    # New interaction sample, fetched as CSV text from S3.
    csv_obj = client.get_object(
        Bucket=bucket_name, Key=samplecsv_key)['Body'].read().decode('utf-8')
    new = pd.read_csv(StringIO(csv_obj))
    s3 = S3FileSystem()
    user_indicies_key = 'user_indicies.npy'
    post_indicies_key = 'post_indicies.npy'
    # Index arrays the existing model was trained with.
    user_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, user_indicies_key)))
    post_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, post_indicies_key)))
    # ParentId -> internal post index mapping (Series indexed by ParentId).
    post_mappings_obj = client.get_object(
        Bucket=bucket_name,
        Key=post_mappings_key)['Body'].read().decode('utf-8')
    post_mappings = pd.read_csv(StringIO(post_mappings_obj))
    post_mappings.columns = ['ParentId', 'post_indicies']
    post_mappings.index = post_mappings['ParentId']
    post_mappings = post_mappings['post_indicies']
    post_ind = lambda x: post_mappings.loc[x]
    # NOTE(review): pickle.loads on bytes fetched from S3 — this trusts the
    # bucket's contents completely; a tampered object executes arbitrary code.
    model_client = client.get_object(Bucket=bucket_name,
                                     Key=pickle_key)['Body'].read()
    model = pickle.loads(model_client)
    print('user_indicies length: ', len(user_indicies))
    print('post_indicies length: ', len(post_indicies))
    # item_features_npz = client.get_object(Bucket=bucket_name, Key=item_features_key)['Body'].read()
    # item_features_npz = csr_matrix(item_features_npz)
    # user_indicies = np.load('user_indicies.npy')
    # print(max(user_indicies))
    # post_indicies = np.load('post_indicies.npy')
    # print(max(post_indicies))
    # model = pickle.load(open("savefile.pickle", "rb"))
    dataset = Dataset()
    dataset.fit((x for x in user_indicies), (x for x in post_indicies))
    # Reserve spare user ids for previously unseen users.
    # NOTE(review): 876 is a magic upper bound — presumably the model's total
    # user capacity; confirm against the training script.
    dummies = range(max(user_indicies) + 1, 876)
    dataset.fit_partial((x for x in dummies))
    print(dataset.interactions_shape())
    # new = pd.read_csv(f)
    new['post_indicies'] = new['ParentId'].apply(post_ind)
    # Assign each new OwnerUserId one of the reserved dummy user indices.
    new_user_indicies = dict()
    for i in range(len(new.OwnerUserId.unique())):
        new_user_indicies[new.OwnerUserId.unique()[i]] = dummies[i]
    new['user_indicies'] = new.OwnerUserId.apply(
        lambda x: new_user_indicies[x])
    print(new['user_indicies'].values)
    # NOTE(review): the block below is an exact duplicate of the assignment
    # above and recomputes the same mapping — kept byte-identical here;
    # candidate for removal in a behavioral change.
    new_user_indicies = dict()
    for i in range(len(new.OwnerUserId.unique())):
        new_user_indicies[new.OwnerUserId.unique()[i]] = dummies[i]
    new['user_indicies'] = new.OwnerUserId.apply(
        lambda x: new_user_indicies[x])
    #user_indicies = np.append(user_indicies, new.user_indicies.unique())
    #######
    #np.save('user_indicies.npy', user_indicies)
    #######
    new = new[[
        'user_indicies', 'post_indicies', 'Score', 'OwnerUserId', 'ParentId'
    ]]
    dataset.fit_partial((x for x in new.user_indicies.values),
                        (x for x in new.post_indicies.values))
    # Columns 0..2 are user_indicies, post_indicies, Score after the
    # reordering above.
    (new_interactions, new_weights) = dataset.build_interactions(
        ((x[0], x[1], x[2]) for x in new.values))
    print(new_interactions.shape)
    #interactions = sparse.load_npz("interactions.npz")
    # NOTE(review): loads item features from the *local* file, not from the
    # S3 key declared above — confirm this is intentional.
    item_features = sparse.load_npz("item_features.npz")
    print(item_features.shape)
    # item_features = sparse.load_npz(item_features_npz)
    for i in new.user_indicies.unique():
        print(i, 'mean user embedding before refitting :',
              np.mean(model.user_embeddings[i]))
    print(new_interactions.shape)
    # Incremental refit on the new interactions only.
    model = model.fit_partial(new_interactions,
                              item_features=item_features,
                              sample_weight=new_weights,
                              epochs=10,
                              verbose=True)
    for i in new.user_indicies.unique():
        print(i, 'mean user embedding after refitting:',
              np.mean(model.user_embeddings[i]))
    nq = pd.read_csv('new_questions.csv')
    csv_buffer = StringIO()
    s3_resource = boto3.resource('s3')
    for i in new.user_indicies.unique():
        # Score every candidate question for this user.
        scores = pd.Series(
            model.predict(int(i),
                          nq.post_indicies.values,
                          item_features=item_features))
        temp = nq.copy()
        temp['reccomendation'] = scores.values
        # NOTE(review): csv_buffer is never reset between iterations, so each
        # put uploads the accumulated CSVs of all users processed so far,
        # always to the same 'new_recs.csv' key — likely unintended.
        temp.to_csv(csv_buffer, index=False)
        s3_resource.Object(bucket_name,
                           'new_recs.csv').put(Body=csv_buffer.getvalue())
    # with open('savefile.pickle', 'wb') as fle:
    #     pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)
    # Persist the refitted model back to the same S3 key.
    s3_resource.Object(bucket_name, pickle_key).put(
        Body=pickle.dumps(model))  #, protocol=pickle.HIGHEST_PROTOCOL))
def _fetch_table(connection, table_name):
    """Return every row of *table_name* as a DataFrame.

    Rows arrive as dicts because the connection uses DictCursor, so the
    DataFrame columns come from the table's column names.  *table_name* is
    always one of this module's own constants, never user input.
    """
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from " + table_name)
        for row in cur:
            dict_list.append(row)
    return pd.DataFrame(dict_list)


def _encode_features(df, id_column):
    """Namespace feature values in place and build LightFM feature inputs.

    Every non-id value becomes "column:value" so identical raw values from
    different columns map to distinct LightFM feature ids.

    Args:
        df: DataFrame to encode; mutated in place (all non-id columns become
            strings).  Must have a default RangeIndex — true here because it
            was built from a list of dicts.
        id_column: name of the entity-id column to leave untouched.

    Returns:
        (feature_values, feature_tuples) where feature_values is the flat
        list of unique "column:value" strings for Dataset.fit_partial, and
        feature_tuples is [(entity id, [row's feature values]), ...] for
        Dataset.build_user_features / build_item_features.
    """
    for column_name in df.columns:
        if column_name != id_column:
            df[column_name] = str(column_name) + ":" + df[column_name].astype(str)
    features_df = df.drop([id_column], axis=1)
    # 'split' yields positional row lists; with a RangeIndex, iterrows'
    # labels line up with those positions.
    features_data = features_df.to_dict('split')['data']
    feature_values = []
    for column_name in features_df.columns:
        feature_values.extend(features_df[column_name].unique())
    feature_tuples = [(row[id_column], features_data[index])
                      for index, row in df.iterrows()]
    return feature_values, feature_tuples


def lambda_handler(event, context):
    """AWS Lambda entry point: train a LightFM hybrid recommender from the
    RDS tables and upload the pickled model to S3.

    *event* and *context* are the standard Lambda arguments; neither is used.
    Raises whatever pymysql raises if the database connection fails.
    """
    try:
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            # NOTE(review): credentials are hardcoded in source; move them to
            # environment variables or Secrets Manager.
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)
        print("Connection successful")
    except Exception:
        print("Connection error")
        # Fix: the original swallowed the failure and then crashed later with
        # a NameError on `connection`; fail fast instead.
        raise

    # Food metadata, minus display-only columns that are not features.
    food_df = _fetch_table(connection, "food_dataset")
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ], axis=1, inplace=True)
    print('Food Dataframe imported')

    # User profiles, minus auth/token columns.
    user_df = _fetch_table(connection, "tblUserData")
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ], axis=1, inplace=True)
    print('User Dataframe imported')

    # Explicit user->food ratings: the interaction data.
    userItem_df = _fetch_table(connection, "tblUserRating")
    print('UserItem Dataframe imported')

    # The food and user sides previously duplicated this encode logic inline;
    # both now share one helper.
    food_feature_values, food_tuples = _encode_features(food_df, 'food_ID')
    user_feature_values, user_tuples = _encode_features(user_df, 'userID')

    print("Creating LightFm dataset")
    dataset = Dataset()
    dataset.fit(users=user_df['userID'], items=food_df['food_ID'])
    print("Dataset Created")

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    print("fittng item partial features")
    dataset.fit_partial(items=food_df['food_ID'],
                        item_features=food_feature_values)
    print("fittng user partial features")
    dataset.fit_partial(users=user_df['userID'],
                        user_features=user_feature_values)

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        (x['userID'], x['food_ID'], x['rating'])
        for _, x in userItem_df.iterrows())

    print("Building item features")
    item_features = dataset.build_item_features(food_tuples)
    user_features = dataset.build_user_features(user_tuples)

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions,
              item_features=item_features,
              user_features=user_features)
    print("Model trained!!")

    print("Pickle started!!")
    # Fix: use a context manager so the handle is closed before the upload
    # (the original passed an unclosed open() straight to pickle.dump).
    # protocol=2 is kept for compatibility with the existing consumer.
    with open("/tmp/model.pkl", 'wb') as model_file:
        pickle.dump(model, model_file, protocol=2)

    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file("/tmp/model.pkl", "fitbook-lambda-packages", "model.pkl")
    print("Upload done")

    os.remove("/tmp/model.pkl")
    print("Pickle file deleted")
    print("Successssss!!!!!")
mov_features = ((row[0], row[2].split('|') + [row[3], row[0]]) for rid, row in movies.iterrows()) # print(mov_features[0]) item_features = dataset.build_item_features(mov_features) model = LightFM(loss='warp', no_components=28, item_alpha=0.0001, learning_rate=0.05) model.fit(interactions, item_features=item_features, num_threads=16) movie2name = {} for rid, row in movies.iterrows(): movie2name[row[0]] = row[1] n_users, n_items = dataset.interactions_shape() # Adjust using base ratings base_mat = model.predict(0, np.arange(n_items), num_threads=16) base_mat = (base_mat + np.min(base_mat)) # base_mat = np.log2(base_mat + np.min(base_mat)) def sample_recommendation(model, interations, user_ids): n_users, n_items = dataset.interactions_shape() for user_id in user_ids: user_id = int(user_id) known_positives = [ movie2name[rev_item_mapping[x]] for x in interactions.tocsr()[user_id].indices ]