Example #1
def test_fitting():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, users)
    assert dataset.item_features_shape() == (items, items)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([]).getnnz() == users
    assert dataset.build_item_features([]).getnnz() == items
Example #2
def test_fitting_no_identity():

    users, items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, 0)
    assert dataset.item_features_shape() == (items, 0)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
def main():
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()

    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        num_users, num_items = dataset.interactions_shape()
        log('Num users: {}, num_items {}.'.format(num_users, num_items))
Example #6
def load_parameter():
    ratings = get_ratings()
    books = get_books()
    users = get_users()
    books_pd = convert_pd(books)

    id_users_books = StoreValue()

    for x in ratings:
        id_users_books._user_id.append(x[0])
        id_users_books._book_id.append(x[1])

    # Created following the guide at https://making.lyst.com/lightfm/docs/examples/dataset.html
    dataset_explicit = Dataset()
    dataset_explicit.fit(id_users_books._user_id,
                id_users_books._book_id)

    num_users, num_items = dataset_explicit.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    dataset_explicit.fit_partial(items=(x[0] for x in books),
                        item_features=(x[7] for x in books))
    
    dataset_explicit.fit_partial(users=(x[0] for x in users))


    # create the mappings
    # interactions: a COO matrix whose entries are the (user_id, book_id) pairs
    # weights: the voting weights
    (interactions_explicit, weights_explicit) = dataset_explicit.build_interactions((id_users_books._user_id[i], id_users_books._book_id[i]) for i in range(len(ratings)))

    # Item features extracted from the items (books), based on the author of each book
    item_features = dataset_explicit.build_item_features(((x[0], [x[7]]) for x in books))
    # user_features = dataset_explicit.build_user_features(((x[0], [x[1]]) for x in users))

    model_explicit_ratings = LightFM_ext(loss='warp')

    (train, test) = random_train_test_split(interactions=interactions_explicit, test_percentage=0.02)

    model_explicit_ratings.fit(train, item_features=item_features, epochs=2, num_threads=4)
    return model_explicit_ratings, dataset_explicit, interactions_explicit, weights_explicit, item_features, books_pd
Example #7
def run_learning_curve(test_fraction, max_epoch):

    # create data_train
    data  = Dataset(user_identity_features=True)
    
    # user features
    user_features, user_feature_names = get_user_features()
    
    # create map between user_id, post_id, user_features and internal indices
    data.fit((x['user_id'] for x in get_data()),(x['post_id'] for x in get_data()), user_features=user_features)
    
    # print shape
    num_users, num_items = data.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    
    #---------------------------
    # Building the interactions matrix
    #---------------------------
    # create interaction matrix to optimize
    (interactions, weights) = data.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
    print(repr(interactions))
    
    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data.mapping()
    
    # split test and train
    interaction_train, interaction_test = cross_validation.random_train_test_split(interactions, test_fraction)
    
    #---------------------------
    # train model
    #---------------------------
    model_cs  = LightFM(learning_rate=0.05, loss='warp')
    model_ws  = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))

    precision_cs = []
    precision_ws = []

    recall_cs = []
    recall_ws = []

    for epoch in range(int(max_epoch/2)):

        model_cs.fit(interaction_train, epochs=int(epoch*2))
        model_ws.fit(interaction_train, user_features=user_features, epochs=int(epoch*2))
   
        # calculate precision and recall for each epoch
        precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test, interaction_train)
        precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)

        recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test, interaction_train)
        recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)

        # append to result
        precision_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
        precision_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
        recall_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
        recall_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))

    df_result = pd.DataFrame({
        "precision_cs": precision_cs,
        "precision_ws": precision_ws,
        "recall_cs": recall_cs,
        "recall_ws": recall_ws,
        })

    # save to file
    df_result.to_csv("data/validation/df.epoch.csv", index=False)

    return
Example #8
def run_lightfm(ratings, train, test, k_items, dataset):
    def create_interaction_matrix(df,
                                  user_col,
                                  item_col,
                                  rating_col,
                                  norm=False,
                                  threshold=None):
        '''
        Function to create an interaction matrix dataframe from transactional type interactions
        Required Input -
            - df = Pandas DataFrame containing user-item interactions
            - user_col = column name containing user's identifier
            - item_col = column name containing item's identifier
            - rating_col = column name containing user feedback on interaction with a given item
            - norm (optional) = True if a normalization of ratings is needed
            - threshold (required if norm = True) = value above which the rating is favorable
        Expected output -
            - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
        '''
        interactions = df.groupby([user_col, item_col])[rating_col] \
                .sum().unstack().reset_index(). \
                fillna(0).set_index(user_col)
        if norm:
            interactions = interactions.applymap(lambda x: 1
                                                 if x > threshold else 0)
        return interactions

    test_interactions = create_interaction_matrix(df=test,
                                                  user_col='userId',
                                                  item_col='movieId',
                                                  rating_col='rating')

    budget_l = dataset.budget.unique().tolist()
    gross_l = dataset.gross.unique().tolist()
    awards_l = dataset.awards.unique().tolist()
    nom_l = dataset.nominations.unique().tolist()
    votes_l = dataset.votes.unique().tolist()
    item_ids = np.unique(train.movieId.astype(int))
    print(f'length dataset: {len(dataset)}')
    dataset = dataset[dataset.movieId.isin(item_ids)]
    print(f'length dataset: {len(dataset)}')
    item_features_list = [f'rating_{f}' for f in range(11)]
    gen = [
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
    ]  # 'unknown' add unknown for movielens100k
    item_features_list += gen
    item_features_list += budget_l
    item_features_list += gross_l
    item_features_list += awards_l
    item_features_list += nom_l
    item_features_list += votes_l
    item_features = []
    for y, x in dataset.iterrows():
        genres = x['genres']
        tmp_row = (int(x['movieId']), [
            x['rating'], x['budget'], x['gross'], x['awards'],
            x['nominations'], x['votes']
        ])
        for g in genres:
            tmp_row[1].append(g)
        item_features.append(tmp_row)
    #item_features = [(int(x['movieId']), [x['rating'], z, x['budget'], x['gross'], x['awards'], x['votes']]) for y, x in dataset.iterrows() for z in x['genres']] #x['nominations']
    user_ids = np.unique(train.userId)
    built_dif = Dataset()
    built_dif.fit_partial(users=user_ids)
    built_dif.fit_partial(items=item_ids)
    built_dif.fit_partial(item_features=item_features_list)
    dataset_item_features = built_dif.build_item_features(item_features)
    (interactions, weights) = built_dif.build_interactions(
        ((int(x['userId']), int(x['movieId'])) for y, x in train.iterrows()))
    modelx = LightFM(no_components=30, loss='bpr', k=15, random_state=1)
    modelx.fit(interactions,
               epochs=30,
               num_threads=4,
               item_features=dataset_item_features
               )  #item_features=dataset_item_features
    test = sparse.csr_matrix(test_interactions.values)
    test = test.tocoo()
    num_users, num_items = built_dif.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    prec_list = dict()
    rec_list = dict()

    for num_k in k_items:
        trainprecision = precision_at_k(
            modelx, test, k=num_k, item_features=dataset_item_features).mean(
            )  #item_features=dataset_item_features,
        print('Hybrid training set precision: %s' % trainprecision)
        trainrecall = recall_at_k(modelx,
                                  test,
                                  k=num_k,
                                  item_features=dataset_item_features).mean(
                                  )  #item_features=dataset_item_features
        print('Hybrid training set recall: %s' % trainrecall)
        if num_k in prec_list:
            prec_list[num_k].append(trainprecision)
        else:
            prec_list[num_k] = trainprecision

        if num_k in rec_list:
            rec_list[num_k].append(trainrecall)
        else:
            rec_list[num_k] = trainrecall

    return prec_list, rec_list
def main(train_file, val_file, test_file, weight, output_file):

    # Read data from parquet
    print('Reading data ...')
    train_df = pd.read_parquet(train_file)
    val_df = pd.read_parquet(val_file)
    test_df = pd.read_parquet(test_file)

    train_df = train_df[['user_id', 'book_id', 'rating']]
    val_df = val_df[['user_id', 'book_id', 'rating']]
    test_df = test_df[['user_id', 'book_id', 'rating']]

    # Build the ID mappings
    print('Building the ID mappings ...')
    train = Dataset()
    train.fit((x for x in train_df.user_id), (x for x in train_df.book_id))
    user_map = train.mapping()[0]
    item_map = train.mapping()[2]
    train_size = train.interactions_shape()
    with open(output_file, "a") as f:
        f.write(
            'There are {} interactions in the training data, including {} users and {} items \n'
            .format(len(train_df), train_size[0], train_size[1]))
    print(
        'There are {} interactions in the training data, including {} users and {} items'
        .format(len(train_df), train_size[0], train_size[1]))

    # Build the interactions matrix
    print('Building the interactions and weights matrix ...')
    if weight == 'True':
        train_df.rating = train_df.rating + 1  # use rating +1 as weights
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1], i[1][2]) for i in train_df.iterrows()))
    else:
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1]) for i in train_df.iterrows()))

    # filter out interactions with rating >= 3 as true label
    val_df = val_df[val_df.rating >= 3].reset_index(drop=True)
    val_user = np.array([user_map[i] for i in val_df.user_id])
    val_item = np.array([item_map[i] for i in val_df.book_id])
    val_data = val_df.rating
    val_int = coo_matrix((val_data, (val_user, val_item)), shape=train_size)

    test_df = test_df[test_df.rating >= 3].reset_index(drop=True)
    test_user = np.array([user_map[i] for i in test_df.user_id])
    test_item = np.array([item_map[i] for i in test_df.book_id])
    test_data = test_df.rating
    test_int = coo_matrix((test_data, (test_user, test_item)),
                          shape=train_size)

    print('Running grid search on ranks and regularizations ...')
    ranks = [10, 20, 30]
    regs = [0, 1e-5, 5e-5]
    max_precision = -1
    best_rank = None
    best_reg = None
    best_training_time = None
    best_eval_time = None
    best_model = None

    # Do grid search on ranks and regularizations using training and validation data
    for rank in ranks:
        for reg in regs:
            start_time = time.time()
            model = LightFM(no_components=rank,
                            item_alpha=reg,
                            user_alpha=reg,
                            loss='warp',
                            random_state=1211)  # OPTIMIZE: precision@k
            model.fit(train_int, sample_weight=train_weight, epochs=10)
            train_end_time = time.time()

            val_precision = precision_at_k(model,
                                           val_int,
                                           train_interactions=train_int,
                                           k=500).mean()
            eval_end_time = time.time()

            with open(output_file, "a") as f:
                f.write(
                    'Rank %2d & Reg %.5f Validation Precision@500: %.5f \n' %
                    (rank, reg, val_precision))
            print('Rank %2d & Reg %.5f Validation Precision@500: %.5f' %
                  (rank, reg, val_precision))

            if val_precision > max_precision:
                max_precision = val_precision
                best_rank = rank
                best_reg = reg
                best_training_time = train_end_time - start_time
                best_eval_time = eval_end_time - train_end_time
                best_model = model

    # Evaluate best model performance on test set
    test_precision = precision_at_k(best_model,
                                    test_int,
                                    train_interactions=train_int,
                                    k=500).mean()

    with open(output_file, "a") as f:
        f.write(
            'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f \n'
            % (best_rank, best_reg, test_precision))
        f.write('The training takes %ss and evaluation takes %ss \n' %
                (best_training_time, best_eval_time))
    print(
        'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f'
        % (best_rank, best_reg, test_precision))
    print('The training takes %ss and evaluation takes %ss' %
          (best_training_time, best_eval_time))
Example #10
def main():

	if request.method == 'POST':
		global df_movies
		# global top_trending_ids
		# print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
		print(request.form)
		# Get recommendations!
		if 'run-mf-model' in request.form:
			
			for i, user_rating in enumerate(session['arr']):
				session['arr'][i] = user_rating[:-2]
			session['movieIds'] = session['movieIds'][:-2]
			rated_movies = min(len(session['arr'][0]), len(session['movieIds']))
			for i, user_rating in enumerate(session['arr']):
				session['arr'][i] = user_rating[:rated_movies]
			session['movieIds'] = session['movieIds'][:rated_movies]

			pu = recommendation_mf(session['arr'], session['members'], session['movieIds'])


			session.clear()
			top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
			session['counter'] = 0
			session['members'] = 0
			session['userAges'] = []
			session['userGenders'] = []
			session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml)
			session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) 
			session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
			session['arr'] = None
			return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
		
		if 'run-siamese-model' in request.form:
			# global df
			global friends
			global ratings
			global new_friend_id
			new_ratings = []
			for mid, movie_real_id in enumerate(session['movieIds']):
				avg_mv_rating = np.median(np.array([user_ratings[mid] for user_ratings in session['arr']]))
				new_ratings.append({'movie_id_ml':movie_real_id, 
									'rating': avg_mv_rating,
									'friend_id': new_friend_id}) 
			new_friend = {'friend_id': new_friend_id, 'friends_age': np.mean(np.array(session['userAges'])), 'friends_gender': np.mean(np.array(session['userGenders']))}	

			friends.append(new_friend)
			ratings.extend(new_ratings)

			dataset = LightFMDataset()
			item_str_for_eval = "x['title'],x['release'], x['unknown'], x['action'], x['adventure'],x['animation'], x['childrens'], x['comedy'], x['crime'], x['documentary'], x['drama'],  x['fantasy'], x['noir'], x['horror'], x['musical'],x['mystery'], x['romance'], x['scifi'], x['thriller'], x['war'], x['western'], *soup_movie_features[x['soup_id']]"
			friend_str_for_eval = "x['friends_age'], x['friends_gender']"

			dataset.fit(users=(int(x['friend_id']) for x in friends),
						items=(int(x['movie_id_ml']) for x in movies),
						item_features=(eval("("+item_str_for_eval+")") for x in movies),
						user_features=((eval(friend_str_for_eval)) for x in friends))
			num_friends, num_items = dataset.interactions_shape()
			print(f'Num friends: {num_friends}, num_items {num_items}. {datetime.datetime.now()}')

			(interactions, weights) = dataset.build_interactions(((int(x['friend_id']), int(x['movie_id_ml']))
													  for x in ratings))
			item_features = dataset.build_item_features(((x['movie_id_ml'], 
											  [eval("("+item_str_for_eval+")")]) for x in movies) )
			user_features = dataset.build_user_features(((x['friend_id'], 
											  [eval(friend_str_for_eval)]) for x in friends) )

			print(f"Item and User features created {datetime.datetime.now()}")

			epochs = 50 #150
			lr = 0.015
			max_sampled = 11

			loss_type = "warp"  # "bpr"


			model = LightFM(learning_rate=lr, loss=loss_type, max_sampled=max_sampled)

			model.fit_partial(interactions, epochs=epochs, user_features=user_features, item_features=item_features)
			train_precision = precision_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()

			train_auc = auc_score(model, interactions, user_features=user_features, item_features=item_features).mean()

			print(f'Precision: {train_precision}, AUC: {train_auc}, {datetime.datetime.now()}')

			k = 18
			top_movie_ids, scores = predict_top_k_movies(model, new_friend_id, k, num_items, user_features=user_features, item_features=item_features, use_features = False)
			top_movies = df_movies[df_movies.movie_id_ml.isin(top_movie_ids)]

			pu = recommendation_siamese(top_movies, scores)

			return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
		
		# Collect friends info
		elif 'person-select-gender-0' in request.form:
			for i in range(session['members']):
				session['userAges'].append(int(request.form.get(f'age-{i}')))
				session['userGenders'].append(int(request.form.get(f'person-select-gender-{i}')))

			return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': session['members'], 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))

		# Choose number of people in the group
		elif 'people-select' in request.form:
			count = int(request.form.get('people-select'))
			session['members'] = count
			session['arr'] = [[0 for x in range(15)] for y in range(count)] 
			return(render_template('main.html', settings = {'friendsInfo':True, 'showVote': False, 'people': count, 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))

		# All people voting
		elif 'person-select-0' in request.form:
			for i in range(session['members']):
				session['arr'][i][session['counter']] = int(request.form.get(f'person-select-{i}'))
			
			session['counter'] += 1 
			if session['counter'] < 15:     
				return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
			else:
				return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':True,  'recommendation': None}))

	elif request.method == 'GET':
		session.clear()
		top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
		print(top_trending_ids)
		print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
		session['counter'] = 0
		session['members'] = 0
		session['userAges'] = []
		session['userGenders'] = []
		session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml) 
		session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) 
		session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
		session['arr'] = None

		return(render_template('main.html', settings = {'showVote': False, 'people': 0, 'buttonDisable': False, 'recommendation': None}))
#The first thing we need to do is to create a mapping between the user and item ids from our input data to indices that will be used internally by our model.

#We do this because LightFM works with user and item ids that are consecutive non-negative integers. The `Dataset` class allows us to create a mapping between the IDs we use in our systems and the consecutive indices preferred by the model.

#To do this, we create a dataset and call its `fit` method. The first argument is an iterable of all user ids in our data, and the second is an iterable of all item ids. In this case, we use generator expressions to lazily iterate over our data and yield user and item ids:

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

#This call will assign an internal numerical id to every user and item id we pass in. These will be contiguous (from 0 to however many users and items we have), and will also determine the dimensions of the resulting LightFM model.
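
#(Aside, not part of the original walkthrough: the id-to-index dictionaries that `fit` creates can
#also be inspected directly via the dataset's `mapping()` method, which several later examples in
#this section use as well.)

user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
print('Mapped {} user ids and {} item ids to internal indices.'.format(
    len(user_id_map), len(item_id_map)))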

#We can check that the mappings have been created by querying the dataset on how many users and books it knows about:

num_users, num_items = dataset.interactions_shape()

print('Num users: {}, num_items {}.'.format(num_users, num_items))

#Note that if we don't have all user and item ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings:

dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author']
                                   for x in get_book_features()))

#This will create a feature for every unique author name in the dataset.

#(Note that we fit some more item ids: this is to make sure our mappings are complete even if there are items in the features dataset that are not in the interactions set.)

## Building the interactions matrix
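
#(The code for this step is cut off at this point in the excerpt; as a minimal sketch, it would
#mirror the `build_interactions` call shown further down for the same book-ratings data:)

(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))

print(repr(interactions))
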
print("Load in movie ratings file")
# reader2 = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# ratingsDS = Dataset.load_from_file('./data/movie/ratings.csv', reader2)
# ratingsDF = pd.read_csv('./data/movie/ratings.csv')
# ratingsDF = ratingsDF.rename(columns={'userId': 'user_id', 'movieId': 'movie_id', 'rating': 'rating'})

newUser = pd.DataFrame(data=[[60000, 264, 5, 'heroic'],
                             [60000, 18, 3, 'historic'],
                             [60000, 70, 4, 'sci-fi']],
                       columns=['user_id', 'book_id', 'rating', 'tag_name'])
userDS = Dataset()
userDS.fit((x['user_id'] for _, x in newUser.iterrows()),
           (x['book_id'] for _, x in newUser.iterrows()),
           item_features=(x['tag_name'] for _, x in newUser.iterrows()))

print("Building training set")
# ratingsTrain = ratingsDS.build_full_trainset()
num_users, num_items = userDS.interactions_shape()
(interactions, weights) = userDS.build_interactions(
    ((x['user_id'], x['book_id']) for _, x in newUser.iterrows()))

print(newUser.head())
print("Starting fit")
# movieAlgo.fit(ratingsTrain)
lightFMAlgo.fit_partial(interactions, sample_weight=weights)
print("Finished fit")
print("Sending to dump file")
# joblib.dump(lightFMAlgo, "./RecommenderDump/algorithm2_dump")
print("Sent to dump file")
# print out the ratings
#for line in islice(ratings, 2):
#print(json.dumps(line, indent=4))

# print out the book features
#for line in islice(book_features, 1):
#print(json.dumps(line, indent=4))

# create a dataset and build the ID mappings
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

# query the dataset to check how many users and items (i.e. books) it knows
num_users, num_items = dataset.interactions_shape()
print('Num users : {}, num_items {}.'.format(num_users, num_items))

# add some item feature mappings, and creates a unique feature for each author
# NOTE: more item ids are fitted than usual, to make sure our mappings are complete
# even if there are items in the features dataset that are not in the interaction set
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author']
                                   for x in get_book_features()))

# build the interaction matrix which is a main input to the LightFM model
# it encodes the interactions between the users and the items
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))

# item_features matrix can also be created
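
# A hedged sketch of that step (not from the original snippet), following the pattern of the
# build_item_features calls used in the other examples and the Book-Author feature fitted above:
item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))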
Example #14
def run_validation(test_fraction, max_val):

    # containers to hold results
    ave_precision_at_k_cs   = []
    ave_recall_at_k_cs      = []
    ave_auc_score_cs        = []

    ave_precision_at_k_ws   = []
    ave_recall_at_k_ws      = []
    ave_auc_score_ws        = []
   

    # perform validation
    validation_itr = 0

    while (validation_itr < max_val):

        print("Start validating cold, warm start, iteration %s" %validation_itr)

        # prevent a random failure from aborting the entire job
        try:

            # count
            validation_itr += 1

            # create data_train
            data_cs = Dataset()
            data_ws = Dataset(user_identity_features=True)

            # user features
            user_features, user_feature_names = get_user_features()
            print(user_feature_names)

            # create map between user_id, post_id, user_features and internal indices
            data_cs.fit((x['user_id'] for x in get_data()),(x['post_id'] for x in get_data()))
            data_ws.fit((x['user_id'] for x in get_data()),(x['post_id'] for x in get_data()), user_features=user_features)
            
            # print shape
            num_users, num_items = data_ws.interactions_shape()
            print('Num users: {}, num_items {}.'.format(num_users, num_items))
            
            #---------------------------
            # Building the interactions matrix
            #---------------------------
            # create interaction matrix to optimize
            (interactions_cs, weights_cs) = data_cs.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            (interactions_ws, weights_ws) = data_ws.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            print(repr(interactions_ws))

            # retrieve mapping from dataset
            user_id_map_cs, user_feature_map_cs, item_id_map_cs, item_feature_map_cs = data_cs.mapping()
            user_id_map_ws, user_feature_map_ws, item_id_map_ws, item_feature_map_ws = data_ws.mapping()

            # split test and train
            interaction_train_cs, interaction_test_cs = cross_validation.random_train_test_split(interactions_cs, test_fraction)
            interaction_train_ws, interaction_test_ws = cross_validation.random_train_test_split(interactions_ws, test_fraction)

            #---------------------------
            # train model
            #---------------------------
            model_cs  = LightFM(learning_rate=0.05, loss='warp')
            model_ws  = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))

            model_cs.fit(interaction_train_cs, epochs=30)
            model_ws.fit(interaction_train_ws, user_features=user_features, epochs=30)

            #---------------------------
            # make predictions
            #---------------------------
            precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            auc_score_cs = evaluation.auc_score(model_cs, interaction_test_cs, interaction_train_cs)

            precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            auc_score_ws = evaluation.auc_score(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)

            # append score from each iteration to results
            ave_precision_at_k_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
            ave_recall_at_k_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
            ave_auc_score_cs.append(sum(auc_score_cs) / len(auc_score_cs))

            ave_precision_at_k_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
            ave_recall_at_k_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
            ave_auc_score_ws.append(sum(auc_score_ws) / len(auc_score_ws))


        except Exception:
            print("Iteration %s failed. Skipping.." % validation_itr)


    print("Validation score for test")
    print(ave_precision_at_k_cs  )
    print(ave_recall_at_k_cs     )
    print(ave_auc_score_cs )
    print(ave_precision_at_k_ws  )
    print(ave_recall_at_k_ws     )
    print(ave_auc_score_ws )

    df_result = pd.DataFrame({
        'precision_at_k_cs': ave_precision_at_k_cs,
        'recall_at_k_cs': ave_recall_at_k_cs,
        'auc_score_cs': ave_auc_score_cs,
        'precision_at_k_ws': ave_precision_at_k_ws,
        'recall_at_k_ws': ave_recall_at_k_ws,
        'auc_score_ws': ave_auc_score_ws,
        })

    # save to file
    df_result.to_csv("data/validation/df.csv", index=False)

    return
def preprocess():
    import pandas as pd
    import math
    import numpy as np 
            
    data_users = pd.read_csv('users_tag.csv',index_col=0)
    data_business = pd.read_csv('business_Nora.csv',index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv',index_col = 0)        
            
    data_users.review_count = pd.Series([math.log(x+1) for x in data_users.review_count])
    data_users.useful =  pd.Series([math.log(x+1) for x in data_users.useful])  
            
    # clean business skewness
    data_business.review_count =  pd.Series([math.log(x+1) for x in data_business.review_count])        
            
    from lightfm.data import Dataset        
            
    #model establishment
    dataset = Dataset()
    dataset.fit(data_review.user_id,data_review.business_id)
    type(dataset)
    num_users, num_items = dataset.interactions_shape()        
            
    # fit item and user features. 
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['stars'])
            
            
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['review_count'])        
            
    tar_cols = [x for x in data_business.columns[24:]] 
            
    dataset.fit_partial(items = data_business.business_id,
                       item_features = tar_cols)        
            
    user_cols = [x for x in data_users[['review_count', 'useful',
                                       'Ice Cream & Frozen Yogurt', 'Korean', 'Tapas/Small Plates',
           'Vietnamese', 'Vegan', 'Caribbean', 'Food Delivery Services', 'Lounges',
           'Pubs', 'Greek', 'Cocktail Bars', 'Mexican', 'Wine Bars', 'Tea Rooms',
           'Delis', 'Vegetarian', 'Ethnic Food', 'Salad', 'Seafood', 'Beer',
           'American (New)', 'Juice Bars & Smoothies', 'Shopping', 'Barbeque',
           'Sports Bars', 'French', 'Chicken Wings', 'Gastropubs', 'Diners',
           'Gluten-Free', 'Thai', 'Comfort Food', 'Health Markets', 'Halal',
           'Caterers', 'Arts & Entertainment']]]        
            
    dataset.fit_partial(users=data_users.user_id,
                        user_features = user_cols)  
          
    print("Building Interactions")        
    (interactions, weights) = dataset.build_interactions([(x['user_id'],
                                                           x['business_id'],
                                                           x['stars']) for index,x in data_review.iterrows()])   
    print("Interactions Build")        
    # build user and item features
    
    def build_dict(df,tar_cols,val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        
        if(sum_val == 0):
            return rst
        else:
            
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
        return rst
    
    def user_build_dict(df,tar_cols,val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        
        if(sum_val == 0):
            return rst
        else:
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
        return rst
    
    # get max of each column to regularize value to [0,1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)
    
    # give CF info weight 0.5, all other 0.5. Then in others, give (star, review count) 0.25 and tags 0.25
    item_features = dataset.build_item_features(((x['business_id'], 
                                                  {'stars':0.5*x['stars']/max_star,
                                                   'review_count':0.5*x['review_count']/max_b_rc,
                                                   **build_dict(x,tar_cols,[0.5*x['stars']/max_star,
                                                               0.5*x['review_count']/max_b_rc])})
                                                  for index,x in data_business.iterrows()))
    
    
    # user_features = dataset.build_user_features(((x['user_id'],
    #                                              [x['is_elite'],x['year']])
    #                                            for index, x in data_users.iterrows()))
    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(((x['user_id'],
                                                 {'review_count':0.35*x['review_count']/max_u_rc,
                                                  'useful':0.35*x['useful']/max_useful,
                                                 **user_build_dict(x,user_cols,[0.35*x['review_count']/max_u_rc,0.35*x['useful']/max_useful])}) for index, x in data_users.iterrows()))
            
    #train-test split
    
    # seed = 12345 #has multiple seeds set up to account for split biases
    # seed = 101
    # seed = 186
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train,test=random_train_test_split(interactions,test_percentage=0.2,random_state=np.random.RandomState(seed))
    
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    
    assert train.multiply(test).nnz == 0  # make sure train and test are truly disjoint
    return train,test,data_business,dataset,user_features,item_features   
Example #16
item_meta = pd.read_csv('data/books.csv')
item_meta = item_meta[['book_id', 'authors', 'average_rating', 'original_title']]

item_features_source = [(item_meta['book_id'][i],
                        [item_meta['authors'][i],
                         item_meta['average_rating'][i]]) for i in range(item_meta.shape[0])]

# Construct Data-set
# A set, a list, or a pandas Series can all be passed in.
# First map the user/item indices, add the user features/item features,
# and then fit the occurrence data.
# Alternatively, a scipy csr_matrix can be fitted directly.
# Note: all null values must be filled in beforehand.
dataset = Dataset()
dataset.fit(users=ratings['user_id'].unique(),
            items=ratings['book_id'].unique(),
            item_features=item_meta[item_meta.columns[1:]].values.flatten()
            )

print("Num Users: {}, Num Items: {}".format(*dataset.interactions_shape()))
print(dataset.user_features_shape(), dataset.item_features_shape())

interactions, weights = dataset.build_interactions(ratings_source)
item_features = dataset.build_item_features(item_features_source)
# mappings = dataset.mapping()

# Save
# mmwrite('data/interactions.mtx', interactions)
# mmwrite('data/item_features.mtx', item_features)
# mmwrite('data/weights.mtx', weights)
Example #17
def main():
    #     n = len(sys.argv)
    #     if n > 0:
    #         f = sys.argv[0]
    #     else:
    #         f = 'new_sample.csv'

    # Start imports from s3
    bucket_name = 'forumrecbucket'
    samplecsv_key = 'new_sample.csv'
    pickle_key = 'savefile.pickle'
    item_features_key = 'item_features.npz'
    post_mappings_key = 'post_mappings.csv'

    client = boto3.client(
        's3')  #, aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
    csv_obj = client.get_object(
        Bucket=bucket_name, Key=samplecsv_key)['Body'].read().decode('utf-8')
    new = pd.read_csv(StringIO(csv_obj))

    s3 = S3FileSystem()
    user_indicies_key = 'user_indicies.npy'
    post_indicies_key = 'post_indicies.npy'

    user_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, user_indicies_key)))
    post_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, post_indicies_key)))
    post_mappings_obj = client.get_object(
        Bucket=bucket_name,
        Key=post_mappings_key)['Body'].read().decode('utf-8')
    post_mappings = pd.read_csv(StringIO(post_mappings_obj))

    post_mappings.columns = ['ParentId', 'post_indicies']
    post_mappings.index = post_mappings['ParentId']
    post_mappings = post_mappings['post_indicies']
    post_ind = lambda x: post_mappings.loc[x]

    model_client = client.get_object(Bucket=bucket_name,
                                     Key=pickle_key)['Body'].read()
    model = pickle.loads(model_client)
    print('user_indicies length:  ', len(user_indicies))
    print('post_indicies length:  ', len(post_indicies))
    # item_features_npz = client.get_object(Bucket=bucket_name, Key=item_features_key)['Body'].read()
    # item_features_npz = csr_matrix(item_features_npz)
    # user_indicies = np.load('user_indicies.npy')
    # print(max(user_indicies))
    # post_indicies = np.load('post_indicies.npy')
    # print(max(post_indicies))
    # model = pickle.load(open("savefile.pickle", "rb"))
    dataset = Dataset()
    dataset.fit((x for x in user_indicies), (x for x in post_indicies))
    dummies = range(max(user_indicies) + 1, 876)
    dataset.fit_partial((x for x in dummies))
    print(dataset.interactions_shape())
    # new = pd.read_csv(f)
    new['post_indicies'] = new['ParentId'].apply(post_ind)
    new_user_indicies = dict()
    for i in range(len(new.OwnerUserId.unique())):
        new_user_indicies[new.OwnerUserId.unique()[i]] = dummies[i]
    new['user_indicies'] = new.OwnerUserId.apply(
        lambda x: new_user_indicies[x])
    print(new['user_indicies'].values)
    new_user_indicies = dict()
    for i in range(len(new.OwnerUserId.unique())):
        new_user_indicies[new.OwnerUserId.unique()[i]] = dummies[i]
    new['user_indicies'] = new.OwnerUserId.apply(
        lambda x: new_user_indicies[x])
    #user_indicies = np.append(user_indicies, new.user_indicies.unique())
    #######
    #np.save('user_indicies.npy', user_indicies)
    #######
    new = new[[
        'user_indicies', 'post_indicies', 'Score', 'OwnerUserId', 'ParentId'
    ]]
    dataset.fit_partial((x for x in new.user_indicies.values),
                        (x for x in new.post_indicies.values))
    (new_interactions, new_weights) = dataset.build_interactions(
        ((x[0], x[1], x[2]) for x in new.values))
    print(new_interactions.shape)
    #interactions = sparse.load_npz("interactions.npz")
    item_features = sparse.load_npz("item_features.npz")
    print(item_features.shape)
    # item_features = sparse.load_npz(item_features_npz)
    for i in new.user_indicies.unique():
        print(i, 'mean user embedding before refitting :',
              np.mean(model.user_embeddings[i]))
    print(new_interactions.shape)
    model = model.fit_partial(new_interactions,
                              item_features=item_features,
                              sample_weight=new_weights,
                              epochs=10,
                              verbose=True)
    for i in new.user_indicies.unique():
        print(i, 'mean user embedding after refitting:',
              np.mean(model.user_embeddings[i]))

    nq = pd.read_csv('new_questions.csv')

    csv_buffer = StringIO()
    s3_resource = boto3.resource('s3')

    for i in new.user_indicies.unique():
        scores = pd.Series(
            model.predict(int(i),
                          nq.post_indicies.values,
                          item_features=item_features))
        temp = nq.copy()
        temp['reccomendation'] = scores.values

        temp.to_csv(csv_buffer, index=False)
        s3_resource.Object(bucket_name,
                           'new_recs.csv').put(Body=csv_buffer.getvalue())

    # with open('savefile.pickle', 'wb') as fle:
    #     pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

    s3_resource.Object(bucket_name, pickle_key).put(
        Body=pickle.dumps(model))  #, protocol=pickle.HIGHEST_PROTOCOL))
Example #18
def lambda_handler(event, context):
    try:
        ## Fetch data from RDS code
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)

        print("Connection successful")
    except:
        print("Connection error")

    # In[3]:

    #Get Food DataFrame
    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)

    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ],
                 axis=1,
                 inplace=True)
    # food_df.head()
    print('Food Dataframe imported')

    # In[4]:

    # # TODO: Perform Binning
    # food_30_bins = ['Alcohol', 'Calories', 'Saturated_Fats']
    # for each_column in food_30_bins:
    #     bins = np.linspace(food_df[each_column].min(), food_df[each_column].max(), 30)
    #     food_df[each_column+'bin'] = pd.cut(food_df[each_column], bins, labels=np.arange(0,len(bins)-1))
    # food_df

    # In[5]:

    # for each_column in food_30_bins:
    #     print(food_df[each_column].min())

    # In[6]:

    #Get User Dataframe
    # user_df = pd.read_csv('user_db_try.csv')
    # user_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)

    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ],
                 axis=1,
                 inplace=True)
    # user_df.head()

    print('User Dataframe imported')

    # In[7]:

    #Get userItem DataFrame
    # userItem_df = pd.read_csv('userItem_db_try_new.csv')
    # userItem_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)

    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    # userItem_df.head()
    print('UserItem Dataframe imported')

    # In[8]:

    #Make all the feature values unique
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(
                column_name) + ":" + food_df[column_name].astype(str)
    # food_df.head()

    # In[9]:

    # This dict will be useful while creating tuples
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')
    # food_features_dict

    # In[10]:

    food_feature_values = []

    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())

    # food_feature_values

    # In[11]:

    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(
                column_name) + ":" + user_df[column_name].astype(str)

    user_features_df = user_df.drop(['userID'], axis=1).copy()

    user_features_dict = user_features_df.to_dict('split')
    # user_features_dict

    # In[12]:

    user_feature_values = []

    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())

    # user_feature_values

    # In[13]:

    user_tuples = []
    food_tuples = []

    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))

    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))

    # food_tuples

    # In[14]:

    print("Creating LightFm dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))

    print("Dataset Created")
    # In[15]:

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # In[16]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((each_feature for each_feature in food_features)for food_features in food_features_dict['data']))

    # In[17]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((row['Milk'], row['Meats'], row['Alcohol'], row['Calories'])for index,row in food_df.iterrows()))

    # In[18]:

    print("fittng item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value
                                       for each_value in food_feature_values))

    # In[19]:

    # dataset.fit_partial(users=(user_id for user_id in user_df['Id']),
    #                     user_features=((each_feature for each_feature in user_features)for user_features in user_features_dict['data']))

    # In[20]:
    print("fittng user partial features")

    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value
                                       for each_value in user_feature_values))

    # In[21]:

    # dataset.item_features_shape()
    # dataset.user_features_shape()

    # In[22]:

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating'])
         for y, x in userItem_df.iterrows()))

    # print(repr(interactions))
    # print(weights)

    # In[23]:

    # interactions.shape

    # In[24]:

    print("Building item features")
    item_features = dataset.build_item_features(each_tuple
                                                for each_tuple in food_tuples)
    # print(item_features)

    # In[25]:

    user_features = dataset.build_user_features(each_tuple
                                                for each_tuple in user_tuples)
    # print(user_features)

    # In[26]:

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions,
              item_features=item_features,
              user_features=user_features)

    print("Model trained!!")

    print("Pickle started!!")
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)

    bucketName = "fitbook-lambda-packages"
    Key = "/tmp/model.pkl"
    outPutname = "model.pkl"

    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(Key, bucketName, outPutname)
    print("Upload done")
    os.remove("/tmp/model.pkl")

    print("Pickle file deleted")
    print("Successssss!!!!!")
Example #19
mov_features = ((row[0], row[2].split('|') + [row[3], row[0]])
                for rid, row in movies.iterrows())
# print(mov_features[0])
item_features = dataset.build_item_features(mov_features)

model = LightFM(loss='warp',
                no_components=28,
                item_alpha=0.0001,
                learning_rate=0.05)
model.fit(interactions, item_features=item_features, num_threads=16)

movie2name = {}
for rid, row in movies.iterrows():
    movie2name[row[0]] = row[1]

n_users, n_items = dataset.interactions_shape()
# Adjust using base ratings
base_mat = model.predict(0, np.arange(n_items), num_threads=16)
base_mat = (base_mat + np.min(base_mat))


# base_mat = np.log2(base_mat + np.min(base_mat))
def sample_recommendation(model, interactions, user_ids):
    n_users, n_items = dataset.interactions_shape()

    for user_id in user_ids:
        user_id = int(user_id)
        known_positives = [
            movie2name[rev_item_mapping[x]]
            for x in interactions.tocsr()[user_id].indices
        ]