def prepareData(df, tags):
    # Keep only click events.
    df = df[df.actionCategory == "WebNei clicked"]
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(0).unique().values
    )].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()

    dataset = Dataset()
    dataset.fit(list(actionByUsers.index.get_level_values(0)),
                list(actionByUsers.index.get_level_values(1)))

    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)
    dataset.fit_partial(items=rowM, item_features=colM,
                        users=rowU, user_features=colU)

    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))

    # build_item_features/build_user_features expect one (id, features) pair
    # per entity, so pair each id with its own feature list (this assumes
    # prepareJson/prepareUserFeatures return one feature list per id).
    item_features = dataset.build_item_features(zip(rowM, colM))
    user_features = dataset.build_user_features(zip(rowU, colU))
    return interactions, item_features, user_features
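# A minimal, self-contained sketch (an addition, with made-up ids and features,
# not the output of prepareJson/prepareUserFeatures) of the input that
# build_item_features expects: one (item_id, features) pair per item, where
# features is a list of feature names or a {feature: weight} dict.
from lightfm.data import Dataset

demo = Dataset()
demo.fit(users=['u1', 'u2'],
         items=['i1', 'i2'],
         item_features=['tag:news', 'tag:sports'])
demo_item_features = demo.build_item_features([
    ('i1', ['tag:news']),
    ('i2', ['tag:sports']),
])
print(repr(demo_item_features))  # 2 x (2 identity + 2 feature) sparse matrix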
def build_lightfm_dataset(self) -> None:
    """
    Builds final datasets for user-variant and variant-variant recommendations.
    """
    logging.info("Creating LightFM matrices...")
    lightfm_dataset = LFMDataset()
    ratings_list = self.interaction_list
    logging.info('#' * 60)
    lightfm_dataset.fit_partial(
        (rating['user_id'] for rating in ratings_list),
        (rating['product_id'] for rating in ratings_list)
    )

    item_feature_names = self.item_df.columns
    logging.info(f'Logging item_feature_names - with product_id: \n{item_feature_names}')
    item_feature_names = item_feature_names[~item_feature_names.isin(['product_id'])]
    logging.info(f'Logging item_feature_names - without product_id: \n{item_feature_names}')
    for item_feature_name in item_feature_names:
        lightfm_dataset.fit_partial(
            items=(item['product_id'] for item in self.item_list),
            item_features=(item[item_feature_name] for item in self.item_list),
        )

    item_features_data = []
    for item in self.item_list:
        item_features_data.append(
            (
                item['product_id'],
                [item['product_name'], item['aisle'], item['department']],
            )
        )
    logging.info(f'Logging item_features_data @build_lightfm_dataset: \n{item_features_data}')
    self.item_features = lightfm_dataset.build_item_features(item_features_data)

    self.interactions, self.weights = lightfm_dataset.build_interactions(
        ((rating['user_id'], rating['product_id']) for rating in ratings_list)
    )
    self.n_users, self.n_items = self.interactions.shape
    logging.info(f'Logging self.interactions @build_lightfm_dataset: \n{self.interactions}')
    logging.info(f'Logging self.weights @build_lightfm_dataset: \n{self.weights}')
    logging.info(
        f'The shape of self.interactions {self.interactions.shape} '
        f'and self.weights {self.weights.shape} represent the user-item matrix.')
def test_exceptions():
    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    # Unknown user or item ids must raise.
    with pytest.raises(ValueError):
        dataset.build_interactions([(users + 1, 0)])
    with pytest.raises(ValueError):
        dataset.build_interactions([(0, items + 1)])

    # After extending the mappings, the same ids are accepted.
    dataset.fit_partial([users + 1], [items + 1])
    dataset.build_interactions([(users + 1, 0)])
    dataset.build_interactions([(0, items + 1)])
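# A sketch of how application code might use the behaviour the test above pins
# down: catch the ValueError for unseen ids and extend the mappings on demand.
# add_interaction is a hypothetical helper, not part of the test suite.
def add_interaction(dataset, user, item):
    try:
        return dataset.build_interactions([(user, item)])
    except ValueError:
        # Unseen id: extend the mappings, then retry. Note that this grows
        # interactions_shape(), so a model trained on the old shape must be
        # re-fitted before it can score the new row/column.
        dataset.fit_partial(users=[user], items=[item])
        return dataset.build_interactions([(user, item)])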
def main():
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()
    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        num_users, num_items = dataset.interactions_shape()
        log('Num users: {}, num_items {}.'.format(num_users, num_items))
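# A sketch (not in the original main) of how the loop might be completed once
# every stage's ids are fitted: concatenate the click logs, build the
# interaction matrix, and train the model declared above. train_path is
# assumed to be the same module-level path main() reads from.
def build_and_train(dataset, model, current_stage):
    frames = [
        pd.read_csv(train_path + "/underexpose_train_click-{}.csv".format(c),
                    header=None, names=["user_id", "item_id", "time"])
        for c in range(0, current_stage + 1)
    ]
    clicks = pd.concat(frames)
    interactions, _weights = dataset.build_interactions(
        zip(clicks["user_id"], clicks["item_id"]))
    model.fit(interactions, epochs=10, num_threads=4)
    return model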
def fit_data(self, matrix, user_features=None, item_features=None):
    """
    Create datasets for the .fit() method.

    Args:
        matrix: User-item interactions matrix (weighted)
        user_features: User-features pandas dataframe whose index contains
            user_ids (crd_no)
        item_features: Item-features pandas dataframe whose index contains
            good_ids (plu_id)

    Returns:
        Model with fitted (mapped) datasets
    """
    matrix.sort_index(inplace=True)
    matrix.sort_index(inplace=True, axis=1)

    dataset = Dataset()
    dataset.fit((x for x in matrix.index), (x for x in matrix.columns))

    interactions = pd.melt(
        matrix.replace(0, np.nan).reset_index(),
        id_vars='index',
        value_vars=list(matrix.columns[1:]),
        var_name='plu_id',
        value_name='rating').dropna().sort_values('index')
    interactions.columns = ['crd_no', 'plu_id', 'rating']
    self.interactions, self.weights = dataset.build_interactions(
        [tuple(x) for x in interactions.values])

    if user_features is not None:
        user_features.sort_index(inplace=True)
        # Iterating a dataframe yields its column names, i.e. the feature names.
        dataset.fit_partial(users=user_features.index,
                            user_features=user_features)
        self.user_features = dataset.build_user_features(
            ((index, dict(row)) for index, row in user_features.iterrows()))
    else:
        self.user_features = None

    if item_features is not None:
        item_features.sort_index(inplace=True)
        dataset.fit_partial(items=item_features.index,
                            item_features=item_features)
        self.item_features = dataset.build_item_features(
            ((index, dict(row)) for index, row in item_features.iterrows()))
    else:
        self.item_features = None
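# A sketch (an addition) of translating external ids with dataset.mapping(),
# which is needed at prediction time since model.predict() takes LightFM's
# internal integer indices. internal_ids is a hypothetical helper operating on
# a Dataset fitted as in fit_data above.
def internal_ids(dataset, crd_no, plu_id):
    user_id_map, _, item_id_map, _ = dataset.mapping()
    # Raises KeyError for ids that were never fitted.
    return user_id_map[crd_no], item_id_map[plu_id]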
def load_parameter():
    ratings = get_ratings()
    books = get_books()
    users = get_users()
    books_pd = convert_pd(books)

    id_users_books = StoreValue()
    for x in ratings:
        id_users_books._user_id.append(x[0])
        id_users_books._book_id.append(x[1])

    # Built following the guide at
    # https://making.lyst.com/lightfm/docs/examples/dataset.html
    dataset_explicit = Dataset()
    dataset_explicit.fit(id_users_books._user_id, id_users_books._book_id)
    num_users, num_items = dataset_explicit.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    dataset_explicit.fit_partial(items=(x[0] for x in books),
                                 item_features=(x[7] for x in books))
    dataset_explicit.fit_partial(users=(x[0] for x in users))

    # Create the mappings.
    # interactions: a COO matrix whose entries are (user_id, book_id) pairs.
    # weights: the rating weights.
    (interactions_explicit, weights_explicit) = dataset_explicit.build_interactions(
        (id_users_books._user_id[i], id_users_books._book_id[i])
        for i in range(len(ratings)))

    # Item features extracted from the books, based on each book's author.
    item_features = dataset_explicit.build_item_features(
        ((x[0], [x[7]]) for x in books))
    # user_features = dataset_explicit.build_user_features(((x[0], [x[1]]) for x in users))

    model_explicit_ratings = LightFM_ext(loss='warp')
    (train, test) = random_train_test_split(interactions=interactions_explicit,
                                            test_percentage=0.02)
    model_explicit_ratings.fit(train, item_features=item_features,
                               epochs=2, num_threads=4)
    return (model_explicit_ratings, dataset_explicit, interactions_explicit,
            weights_explicit, item_features, books_pd)
def predict(user_id: int) -> List[str]:
    model_file = Path(BASE_DIR).joinpath(MODEL_FILE_NAME)
    data_file = Path(BASE_DIR).joinpath(DATA_FILE_NAME)
    if not model_file.exists():
        return None
    if not data_file.exists():
        return None

    model: LightFM = pickle.load(open(model_file, "rb"))
    data: pd.DataFrame = pd.read_csv(data_file)

    dataset = Dataset()
    dataset.fit((cac for cac in data.cac.unique()),
                (product for product in data.product_code.unique()))

    features = ['product_code', 'country_code', 'cost_bin']
    for product_feature in features:
        dataset.fit_partial(
            users=(cac for cac in data.cac.unique()),
            items=(product for product in data.product_code.unique()),
            item_features=(feature for feature in data[product_feature].unique()))

    item_features = dataset.build_item_features(
        ((getattr(row, 'product_code'),
          [getattr(row, product_feature) for product_feature in features
           if product_feature != 'product_code'])
         for row in data[features].itertuples()))

    predicted_products: List[str] = sample_recommendation(
        model=model,
        dataset=dataset,
        raw_data=data,
        item_features=item_features,
        user_ids=user_id)
    return predicted_products
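# sample_recommendation is not defined in this snippet; below is a plausible
# sketch (hypothetical -- the ranking logic is an assumption, not the original
# implementation). It maps the external user id to LightFM's internal index,
# scores every known product, and returns the top product codes.
import numpy as np

def sample_recommendation(model, dataset, raw_data, item_features, user_ids, top_n=5):
    user_id_map, _, item_id_map, _ = dataset.mapping()
    internal_user = user_id_map[user_ids]
    scores = model.predict(internal_user,
                           np.arange(len(item_id_map)),
                           item_features=item_features)
    inverse_item_map = {v: k for k, v in item_id_map.items()}
    ranked = np.argsort(-scores)[:top_n]  # highest score first
    return [inverse_item_map[i] for i in ranked]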
# Keep only posts that have both an author_id and a cat_id.
records = [d for d in dict if "author_id" in d and "cat_id" in d]
print(records)

from lightfm.data import Dataset

print("Build the dataset...")
dataset = Dataset()
dataset.fit((x['userid'] for x in records),
            (x['postid'] for x in records))
dataset.fit_partial(items=(x['postid'] for x in records),
                    item_features=(x["author_id"] for x in records))
dataset.fit_partial(items=(x['postid'] for x in records),
                    item_features=(x["cat_id"] for x in records))
num_users, num_items = dataset.interactions_shape()

(interactions, weights) = dataset.build_interactions(
    ((x['userid'], x['postid']) for x in records))

from lightfm import LightFM

print("Training the model...")
model = LightFM(loss='warp')
model.fit(interactions)

from lightfm.evaluation import precision_at_k
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

# This call will assign an internal numerical id to every user and item id we
# pass in. These will be contiguous (from 0 to however many users and items we
# have), and will also determine the dimensions of the resulting LightFM model.
# We can check that the mappings have been created by querying the dataset on
# how many users and books it knows about:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

# Note that if we don't have all user and item ids at once, we can repeatedly
# call `fit_partial` to supply additional ids. In this case, we will use this
# capability to add some item feature mappings:
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))
# This will create a feature for every unique author name in the dataset.
# (Note that we fit some more item ids: this is to make sure our mappings are
# complete even if there are items in the features dataset that are not in the
# interactions set.)

## Building the interactions matrix
# Having created the mapping, we build the interaction matrix:
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))
print(repr(interactions))
# This is the main input into a LightFM model: it encodes the interactions
# between users and items.
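# A minimal continuation sketch (an addition, not in the original): build the
# author features from the same generator and fit a WARP model on the
# interactions above.
from lightfm import LightFM

item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))

model = LightFM(loss='warp')
model.fit(interactions, item_features=item_features, epochs=10)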
testrankings = rankings[101008:]

# Dividing the features into train/cv/test.
# Unused currently, but usable later, so it is being kept in.
trainfeats = winefeatures[0:90980]
cvfeats = winefeatures[90980:110476]
testfeats = winefeatures[110476:]

# LightFM Dataset object: it needs to be fit by providing iterators for users
# and the corresponding items.
dataset = Dataset()
dataset.fit((x['taster'] for x in trainrankings),
            (y['title'] for y in winefeatures))

# Manually add all features to the dataset.
dataset.fit_partial(item_features=(x['country'] for x in winefeatures))
dataset.fit_partial(item_features=(x['province'] for x in winefeatures))
dataset.fit_partial(item_features=(x['region_1'] for x in winefeatures))
dataset.fit_partial(item_features=(x['variety'] for x in winefeatures))
dataset.fit_partial(item_features=(x['winery'] for x in winefeatures))
dataset.fit_partial(item_features=(x['points'] for x in winefeatures))
dataset.fit_partial(item_features=(x['price'] for x in winefeatures))

# Then add the word-vector features iteratively.
for i in range(9, 209):
    dataset.fit_partial(item_features=(x[str(fields[i])] for x in winefeatures))

num_users, num_items = dataset.interactions_shape()

# Building the interaction matrix for training ratings.
(interactions, weights) = dataset.build_interactions(
    ((x['taster'], x['title']) for x in trainrankings))
# print(json.dumps(line, indent=4))

# Create a dataset and build the ID mappings.
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

# Query the dataset to check how many users and items (i.e. books) it knows.
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

# Add some item feature mappings, creating a unique feature for each author.
# NOTE: more item ids are fitted than usual, to make sure the mappings are
# complete even if there are items in the features dataset that are not in
# the interaction set.
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

# Build the interaction matrix, which is the main input to the LightFM model:
# it encodes the interactions between the users and the items.
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))

# An item_features matrix can also be created.
item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))

# Split the current dataset into a training and test dataset.
train, test = random_train_test_split(interactions,
                                      test_percentage=0.01,
                                      random_state=None)
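# A short evaluation sketch (an addition, not in the original): measure
# ranking quality on the held-out split with LightFM's built-in metrics.
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k

model = LightFM(loss='warp')
model.fit(train, item_features=item_features, epochs=10)

print('Precision@10: %.4f' % precision_at_k(
    model, test, train_interactions=train, k=10,
    item_features=item_features).mean())
print('AUC: %.4f' % auc_score(
    model, test, train_interactions=train,
    item_features=item_features).mean())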
                 average_stay='average_stay',
                 lat='lat',
                 long='long',
                 image_url='image_url',
                 category='Category',
                 voyager_id='voyager_id',
                 airport_code='airport_code')

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x[0] for i, x in users.iterrows()),
            (x[1] for i, x in users.iterrows()))

dataset.fit_partial(items=(x['destinationid'] for i, x in destinations.iterrows()),
                    item_features=(x['Destination-tf-idf'] for i, x in destinations.iterrows()))
# Fit user ids and ages on the user side.
dataset.fit_partial(users=(x['userid'] for i, x in users.iterrows()),
                    user_features=(x['age'] for i, x in users.iterrows()))

# Build interactions after all fit_partial calls so the matrix dimensions
# match the feature matrices; runMF below needs `interactions` defined.
(interactions, weights) = dataset.build_interactions(
    (x[0], x[1]) for i, x in users.iterrows())

item_features = dataset.build_item_features(
    ((x['destinationid'], [x['Destination-tf-idf']]) for i, x in destinations.iterrows()))
user_features = dataset.build_user_features(
    ((x['userid'], [x['age']]) for i, x in users.iterrows()))

mf_model = runMF(interactions=interactions,
                 item_features=item_features,
                 user_features=user_features,
                 n_components=30,
                 loss='warp',
#################################
#                               #
#      Building the Model       #
#                               #
#################################

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

dataset.fit_partial(users=(x['User-ID'] for x in get_user_features()),
                    items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()),
                    user_features=(x['Age'] for x in get_user_features()))

(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))
# print(repr(interactions))

item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))
# print(repr(item_features))

user_features = dataset.build_user_features(
    ((x['User-ID'], [x['Age']]) for x in get_user_features()))
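# A sketch (an addition) of training and scoring with both feature matrices;
# note they must be passed to predict() as well as to fit().
import numpy as np
from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(interactions, user_features=user_features,
          item_features=item_features, epochs=10)

_, n_items = dataset.interactions_shape()
scores = model.predict(0, np.arange(n_items),
                       user_features=user_features,
                       item_features=item_features)
top_items = np.argsort(-scores)[:10]  # internal item indices, best first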
qd = VacancyData()
matchings, vacancies, profiles, profilestest = qd.getData()

# Creating a dataset.
dataset = Dataset(user_identity_features=False, item_identity_features=False)
dataset.fit((x['ProfielId'] for x in qd.getMatchings()),
            (x['VacatureId'] for x in qd.getMatchings()))

# Check on items and users.
num_users, num_items = dataset.interactions_shape()
print('--- Interaction set : Num users: {}, num_items {}. ---'.format(
    num_users, num_items))

# Adding the features into the mix.
dataset.fit_partial(
    items=(x['VacatureId'] for x in qd.getVacancies()),
    item_features=(x['Naam'] for x in qd.getVacancies()),
)
'''dataset.fit_partial(items=(x['VacatureId'] for x in qd.getVacancies()),
                    item_features=(x['Taal'] for x in qd.getVacancies()),
                    )
dataset.fit_partial(items=(x['VacatureId'] for x in qd.getVacancies()),
                    item_features=(x['Functie'] for x in qd.getVacancies()),
                    )
dataset.fit_partial(users=(x['Id'] for x in qd.getProfiles()),
                    user_features=(x['Motivatie'] for x in qd.getProfiles())
                    )
'''
num_users, num_items = dataset.interactions_shape()
print('--- Total set : Num users: {}, num_items {}. ---'.format(
    num_users, num_items))
def init_movielens(path,
                   min_rating=0.0,
                   k=3,
                   item_features=None,
                   cluster_n=18,
                   model='vgg19',
                   test_percentage=0.2):
    valid_item_features = {'genres': 'genres', 'clusters': 'clusters'}
    if item_features is not None:
        assert all(item in valid_item_features.values() for item in item_features), \
            'Your specified item features are invalid. You have to use one or more of: ' \
            + ', '.join(valid_item_features)

    train_dataset = Dataset()
    test_dataset = Dataset()
    data = dict()
    min_interactions = dict()

    with open(path + '/ratings.csv', 'r') as ratings_file:
        reader = csv.reader(ratings_file, delimiter=',')
        next(reader)  # skip header

        ratings = []
        users = set()
        items = set()
        for row in reader:
            user_id = int(row[0])
            item_id = int(row[1])
            users.add(user_id)
            items.add(item_id)
            rating = float(row[2])
            if rating >= min_rating:
                ratings.append((user_id, item_id, rating))
                __add_interaction(min_interactions, user_id)

        __info_no_of_min_interactions(
            k, 'No of interactions per user overall ==> ', min_interactions)

        users = list(users)
        items = list(items)

        users_column, items_column, ratings_column = zip(*ratings)
        ratings = sparse.coo_matrix(
            (ratings_column, (users_column, items_column)))
        ratings_train, ratings_test = random_train_test_split(
            ratings,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(7))

        ratings_train_to_count = zip(ratings_train.row, ratings_train.col,
                                     ratings_train.data)
        ratings_train = zip(ratings_train.row, ratings_train.col,
                            ratings_train.data)
        ratings_test_to_count = zip(ratings_test.row, ratings_test.col,
                                    ratings_test.data)
        ratings_test = zip(ratings_test.row, ratings_test.col,
                           ratings_test.data)

        min_interactions = __count_train_test_min_interactions(
            ratings_train_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on train ==> ', min_interactions)
        min_interactions = __count_train_test_min_interactions(
            ratings_test_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on test ==> ', min_interactions)

        train_dataset.fit(users=users, items=items)
        test_dataset.fit(users=users, items=items)

        (train_interactions,
         train_weights) = train_dataset.build_interactions(ratings_train)
        (test_interactions,
         test_weights) = test_dataset.build_interactions(ratings_test)

        data.update({'train': train_interactions})
        data.update({'test': test_interactions})
        data.update({'train-mapping': train_dataset.mapping()})

        # Add item features.
        if item_features is not None:
            aggregated_features = []
            if valid_item_features.get('genres') in item_features:
                movie_genres, genres = __init_movies_genres(path)
                aggregated_features.append(movie_genres)
                train_dataset.fit_partial(item_features=genres)
                test_dataset.fit_partial(item_features=genres)
                train_dataset.fit_partial(items=list(movie_genres.keys()))
                test_dataset.fit_partial(items=list(movie_genres.keys()))
            if valid_item_features.get('clusters') in item_features:
                movies_posters_clusters, clusters = __init_movies_posters_clusters(
                    path, cluster_n, model=model)
                aggregated_features.append(movies_posters_clusters)
                train_dataset.fit_partial(item_features=clusters)
                test_dataset.fit_partial(item_features=clusters)
                train_dataset.fit_partial(
                    items=list(movies_posters_clusters.keys()))
                test_dataset.fit_partial(
                    items=list(movies_posters_clusters.keys()))

            aggregated_features = __aggregate_features(aggregated_features)
            item_features = train_dataset.build_item_features(
                ((movie_id, aggregated_features.get(movie_id))
                 for movie_id in aggregated_features.keys()))
            _ = test_dataset.build_item_features(
                ((movie_id, aggregated_features.get(movie_id))
                 for movie_id in aggregated_features.keys()))
            data.update({'item_features': item_features})
        else:
            data.update({'item_features': None})

    return data
def preprocess():
    import pandas as pd
    import math
    import numpy as np

    data_users = pd.read_csv('users_tag.csv', index_col=0)
    data_business = pd.read_csv('business_Nora.csv', index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv', index_col=0)

    # Reduce skewness in the user counts with a log transform.
    data_users.review_count = pd.Series([math.log(x + 1) for x in data_users.review_count])
    data_users.useful = pd.Series([math.log(x + 1) for x in data_users.useful])
    # Clean business skewness.
    data_business.review_count = pd.Series([math.log(x + 1) for x in data_business.review_count])

    from lightfm.data import Dataset

    # Model establishment.
    dataset = Dataset()
    dataset.fit(data_review.user_id, data_review.business_id)
    num_users, num_items = dataset.interactions_shape()

    # Fit item and user features.
    dataset.fit_partial(items=data_business.business_id, item_features=['stars'])
    dataset.fit_partial(items=data_business.business_id, item_features=['review_count'])
    tar_cols = [x for x in data_business.columns[24:]]
    dataset.fit_partial(items=data_business.business_id, item_features=tar_cols)

    user_cols = [x for x in data_users[[
        'review_count', 'useful', 'Ice Cream & Frozen Yogurt', 'Korean',
        'Tapas/Small Plates', 'Vietnamese', 'Vegan', 'Caribbean',
        'Food Delivery Services', 'Lounges', 'Pubs', 'Greek', 'Cocktail Bars',
        'Mexican', 'Wine Bars', 'Tea Rooms', 'Delis', 'Vegetarian',
        'Ethnic Food', 'Salad', 'Seafood', 'Beer', 'American (New)',
        'Juice Bars & Smoothies', 'Shopping', 'Barbeque', 'Sports Bars',
        'French', 'Chicken Wings', 'Gastropubs', 'Diners', 'Gluten-Free',
        'Thai', 'Comfort Food', 'Health Markets', 'Halal', 'Caterers',
        'Arts & Entertainment']]]
    dataset.fit_partial(users=data_users.user_id, user_features=user_cols)

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        [(x['user_id'], x['business_id'], x['stars'])
         for index, x in data_review.iterrows()])
    print("Interactions Built")

    # Build user and item features.
    def build_dict(df, tar_cols, val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values()))  # sum of all the tf-idf values
        if sum_val == 0:
            return rst
        # Weight each tag so that all weights sum to 1.
        w = (2 - sum(val_list)) / sum_val
        for key, value in rst.items():
            rst[key] = value * w
        return rst

    def user_build_dict(df, tar_cols, val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values()))  # sum of all the tf-idf values
        if sum_val == 0:
            return rst
        # Weight each tag so that all weights sum to 1.
        w = (2 - sum(val_list)) / sum_val
        for key, value in rst.items():
            rst[key] = value * w
        return rst

    # Get the max of each column to normalize values to [0, 1].
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)

    # Give CF info weight 0.5, all other features 0.5. Then within the others,
    # give (star, review count) 0.25 and tags 0.25.
    item_features = dataset.build_item_features(
        ((x['business_id'],
          {'stars': 0.5 * x['stars'] / max_star,
           'review_count': 0.5 * x['review_count'] / max_b_rc,
           **build_dict(x, tar_cols, [0.5 * x['stars'] / max_star,
                                      0.5 * x['review_count'] / max_b_rc])})
         for index, x in data_business.iterrows()))

    # user_features = dataset.build_user_features(((x['user_id'],
    #                                               [x['is_elite'], x['year']])
    #                                              for index, x in data_users.iterrows()))
    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(
        ((x['user_id'],
          {'review_count': 0.35 * x['review_count'] / max_u_rc,
           'useful': 0.35 * x['useful'] / max_useful,
           **user_build_dict(x, user_cols, [0.35 * x['review_count'] / max_u_rc,
                                            0.35 * x['useful'] / max_useful])})
         for index, x in data_users.iterrows()))

    # Train-test split.
    # seed = 12345  # multiple seeds are set up to account for split biases
    # seed = 101
    # seed = 186
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train, test = random_train_test_split(
        interactions, test_percentage=0.2,
        random_state=np.random.RandomState(seed))
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    assert train.multiply(test).nnz == 0  # make sure train and test are truly disjoint

    return train, test, data_business, dataset, user_features, item_features
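# A small illustration (an addition, with made-up numbers) of the
# {feature: weight} form used above: build_item_features accepts per-feature
# weights directly, and with normalize=True (the default) each row is scaled
# to sum to 1, so only the relative weights matter.
from lightfm.data import Dataset

demo = Dataset()
demo.fit(users=['u1'], items=['b1'], item_features=['stars', 'review_count'])
demo_features = demo.build_item_features(
    [('b1', {'stars': 0.5, 'review_count': 0.5})])
print(demo_features.toarray())  # identity column plus the two weighted features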
def run_lightfm(ratings, train, test, k_items, dataset):
    def create_interaction_matrix(df, user_col, item_col, rating_col,
                                  norm=False, threshold=None):
        '''
        Create an interaction matrix dataframe from transactional-type
        interactions.

        Required input:
        - df = pandas DataFrame containing user-item interactions
        - user_col = column name containing the user's identifier
        - item_col = column name containing the item's identifier
        - rating_col = column name containing the user's feedback on an
          interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating
          is favorable

        Expected output:
        - pandas DataFrame with user-item interactions ready to be fed into a
          recommendation algorithm
        '''
        interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index() \
            .fillna(0).set_index(user_col)
        if norm:
            interactions = interactions.applymap(
                lambda x: 1 if x > threshold else 0)
        return interactions

    test_interactions = create_interaction_matrix(df=test,
                                                  user_col='userId',
                                                  item_col='movieId',
                                                  rating_col='rating')

    budget_l = dataset.budget.unique().tolist()
    gross_l = dataset.gross.unique().tolist()
    awards_l = dataset.awards.unique().tolist()
    nom_l = dataset.nominations.unique().tolist()
    votes_l = dataset.votes.unique().tolist()

    item_ids = np.unique(train.movieId.astype(int))
    print(f'length dataset: {len(dataset)}')
    dataset = dataset[dataset.movieId.isin(item_ids)]
    print(f'length dataset: {len(dataset)}')

    item_features_list = [f'rating_{f}' for f in range(11)]
    gen = [
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
    ]  # add 'unknown' for movielens100k
    item_features_list += gen
    item_features_list += budget_l
    item_features_list += gross_l
    item_features_list += awards_l
    item_features_list += nom_l
    item_features_list += votes_l

    item_features = []
    for y, x in dataset.iterrows():
        genres = x['genres']
        tmp_row = (int(x['movieId']), [
            x['rating'], x['budget'], x['gross'], x['awards'],
            x['nominations'], x['votes']
        ])
        for g in genres:
            tmp_row[1].append(g)
        item_features.append(tmp_row)
    # item_features = [(int(x['movieId']), [x['rating'], z, x['budget'], x['gross'], x['awards'], x['votes']]) for y, x in dataset.iterrows() for z in x['genres']]  # x['nominations']

    user_ids = np.unique(train.userId)

    built_dif = Dataset()
    built_dif.fit_partial(users=user_ids)
    built_dif.fit_partial(items=item_ids)
    built_dif.fit_partial(item_features=item_features_list)
    dataset_item_features = built_dif.build_item_features(item_features)

    (interactions, weights) = built_dif.build_interactions(
        ((int(x['userId']), int(x['movieId'])) for y, x in train.iterrows()))

    modelx = LightFM(no_components=30, loss='bpr', k=15, random_state=1)
    modelx.fit(interactions, epochs=30, num_threads=4,
               item_features=dataset_item_features)

    test = sparse.csr_matrix(test_interactions.values)
    test = test.tocoo()

    num_users, num_items = built_dif.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    prec_list = dict()
    rec_list = dict()
    for num_k in k_items:
        trainprecision = precision_at_k(
            modelx, test, k=num_k,
            item_features=dataset_item_features).mean()
        print('Hybrid training set precision: %s' % trainprecision)
        trainrecall = recall_at_k(
            modelx, test, k=num_k,
            item_features=dataset_item_features).mean()
        print('Hybrid training set recall: %s' % trainrecall)
        if num_k in prec_list:
            prec_list[num_k].append(trainprecision)
        else:
            prec_list[num_k] = trainprecision
        if num_k in rec_list:
            rec_list[num_k].append(trainrecall)
        else:
            rec_list[num_k] = trainrecall

    return prec_list, rec_list
def main():
    # n = len(sys.argv)
    # if n > 0:
    #     f = sys.argv[0]
    # else:
    #     f = 'new_sample.csv'

    # Start imports from S3.
    bucket_name = 'forumrecbucket'
    samplecsv_key = 'new_sample.csv'
    pickle_key = 'savefile.pickle'
    item_features_key = 'item_features.npz'
    post_mappings_key = 'post_mappings.csv'

    client = boto3.client('s3')  # , aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
    csv_obj = client.get_object(
        Bucket=bucket_name, Key=samplecsv_key)['Body'].read().decode('utf-8')
    new = pd.read_csv(StringIO(csv_obj))

    s3 = S3FileSystem()
    user_indicies_key = 'user_indicies.npy'
    post_indicies_key = 'post_indicies.npy'
    user_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, user_indicies_key)))
    post_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, post_indicies_key)))

    post_mappings_obj = client.get_object(
        Bucket=bucket_name,
        Key=post_mappings_key)['Body'].read().decode('utf-8')
    post_mappings = pd.read_csv(StringIO(post_mappings_obj))
    post_mappings.columns = ['ParentId', 'post_indicies']
    post_mappings.index = post_mappings['ParentId']
    post_mappings = post_mappings['post_indicies']
    post_ind = lambda x: post_mappings.loc[x]

    model_client = client.get_object(Bucket=bucket_name,
                                     Key=pickle_key)['Body'].read()
    model = pickle.loads(model_client)

    print('user_indicies length: ', len(user_indicies))
    print('post_indicies length: ', len(post_indicies))

    # item_features_npz = client.get_object(Bucket=bucket_name, Key=item_features_key)['Body'].read()
    # item_features_npz = csr_matrix(item_features_npz)
    # user_indicies = np.load('user_indicies.npy')
    # post_indicies = np.load('post_indicies.npy')
    # model = pickle.load(open("savefile.pickle", "rb"))

    dataset = Dataset()
    dataset.fit((x for x in user_indicies), (x for x in post_indicies))
    dummies = range(max(user_indicies) + 1, 876)
    dataset.fit_partial((x for x in dummies))
    print(dataset.interactions_shape())

    # new = pd.read_csv(f)
    new['post_indicies'] = new['ParentId'].apply(post_ind)

    # Assign each unseen user a dummy index.
    new_user_indicies = dict()
    for i in range(len(new.OwnerUserId.unique())):
        new_user_indicies[new.OwnerUserId.unique()[i]] = dummies[i]
    new['user_indicies'] = new.OwnerUserId.apply(
        lambda x: new_user_indicies[x])
    print(new['user_indicies'].values)

    # user_indicies = np.append(user_indicies, new.user_indicies.unique())
    # np.save('user_indicies.npy', user_indicies)

    new = new[['user_indicies', 'post_indicies', 'Score', 'OwnerUserId', 'ParentId']]
    dataset.fit_partial((x for x in new.user_indicies.values),
                        (x for x in new.post_indicies.values))
    (new_interactions, new_weights) = dataset.build_interactions(
        ((x[0], x[1], x[2]) for x in new.values))
    print(new_interactions.shape)

    # interactions = sparse.load_npz("interactions.npz")
    item_features = sparse.load_npz("item_features.npz")
    print(item_features.shape)
    # item_features = sparse.load_npz(item_features_npz)

    for i in new.user_indicies.unique():
        print(i, 'mean user embedding before refitting:',
              np.mean(model.user_embeddings[i]))
    print(new_interactions.shape)

    model = model.fit_partial(new_interactions,
                              item_features=item_features,
                              sample_weight=new_weights,
                              epochs=10,
                              verbose=True)

    for i in new.user_indicies.unique():
        print(i, 'mean user embedding after refitting:',
              np.mean(model.user_embeddings[i]))

    nq = pd.read_csv('new_questions.csv')
    csv_buffer = StringIO()
    s3_resource = boto3.resource('s3')
    for i in new.user_indicies.unique():
        scores = pd.Series(
            model.predict(int(i),
                          nq.post_indicies.values,
                          item_features=item_features))
        temp = nq.copy()
        temp['reccomendation'] = scores.values
        # Note: the buffer accumulates across users; each put overwrites
        # new_recs.csv with everything written so far.
        temp.to_csv(csv_buffer, index=False)
        s3_resource.Object(bucket_name,
                           'new_recs.csv').put(Body=csv_buffer.getvalue())

    # with open('savefile.pickle', 'wb') as fle:
    #     pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)
    s3_resource.Object(bucket_name, pickle_key).put(
        Body=pickle.dumps(model))  # , protocol=pickle.HIGHEST_PROTOCOL))
def lightfm_node(X1_train, X2_train, X1_test, X2_test):
    X2 = pd.concat([X2_train, X2_test])
    X1 = pd.concat([X1_train, X1_test]).set_index('id')
    X1.columns = ['X1_' + i for i in X1.columns]

    # Bin the continuous columns into quantile buckets, then tag each value
    # with its column name so the feature strings are unique.
    X1['X1_5'] = pd.qcut(X1['X1_5'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_8'] = pd.qcut(X1['X1_8'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_6'] = pd.qcut(X1['X1_6'], np.arange(0, 1, 0.1), duplicates='drop')
    for col in ['X1_6', 'X1_8', 'X1_5', 'X1_1', 'X1_13']:
        X1[col] = X1[col].map(lambda x: '{' + col + '}_{' + str(x) + '}')
    X1 = X1.reset_index()

    from lightfm.data import Dataset
    dataset = Dataset()
    dataset.fit(users=(x for x in X2['id']), items=(x for x in X2['A']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_1']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_13']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_5']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_8']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_6']))

    user_features = dataset.build_user_features(
        [(x[1]['id'],
          x[1][['X1_1', 'X1_13', 'X1_5', 'X1_8', 'X1_6']].values.tolist())
         for x in X1.iterrows()],
        normalize=True)

    (interactions, weights) = dataset.build_interactions(
        zip(*X2[['id', 'A']].values.T))

    model = LightFM(no_components=32,
                    learning_rate=0.04,
                    loss='bpr',
                    max_sampled=55,
                    random_state=0)
    num_epochs = 20
    for i in range(num_epochs):
        model.fit_partial(interactions, user_features=user_features)

    users_mapping, user_features_mapping, assets_mapping, asset_features_mapping = dataset.mapping()
    user_features_mapping_inv = {j: i for i, j in user_features_mapping.items()}

    # L2-normalize the user embeddings before exporting them.
    tag_embeddings = (model.user_embeddings.T /
                      np.linalg.norm(model.user_embeddings, axis=1)).T
    lightfm_embed = pd.DataFrame(tag_embeddings[:len(users_mapping)],
                                 index=X1['id'])
    return lightfm_embed
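# A follow-on sketch (an addition): with the unit-norm embeddings returned
# above, nearest neighbours in cosine space give "similar user" lookups.
# most_similar_users is an illustrative helper, not part of the original node.
import numpy as np

def most_similar_users(lightfm_embed, user_id, top_n=5):
    vectors = lightfm_embed.values
    query = lightfm_embed.loc[user_id].values
    sims = vectors @ query  # rows are unit-norm, so dot product == cosine
    order = np.argsort(-sims)
    # Skip the first hit, which is the query user itself.
    return lightfm_embed.index[order[1:top_n + 1]].tolist()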
def lambda_handler(event, context):
    # Fetch data from RDS.
    try:
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)
        print("Connection successful")
    except:
        print("Connection error")

    # Get the food dataframe.
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)
    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ], axis=1, inplace=True)
    print('Food Dataframe imported')

    # TODO: Perform binning.
    # food_30_bins = ['Alcohol', 'Calories', 'Saturated_Fats']
    # for each_column in food_30_bins:
    #     bins = np.linspace(food_df[each_column].min(), food_df[each_column].max(), 30)
    #     food_df[each_column + 'bin'] = pd.cut(food_df[each_column], bins, labels=np.arange(0, len(bins) - 1))

    # Get the user dataframe.
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)
    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ], axis=1, inplace=True)
    print('User Dataframe imported')

    # Get the user-item dataframe.
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)
    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    print('UserItem Dataframe imported')

    # Make all the feature values unique by prefixing the column name.
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(column_name) + ":" + food_df[column_name].astype(str)

    # This dict will be useful while creating tuples.
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')

    food_feature_values = []
    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())

    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(column_name) + ":" + user_df[column_name].astype(str)
    user_features_df = user_df.drop(['userID'], axis=1).copy()
    user_features_dict = user_features_df.to_dict('split')

    user_feature_values = []
    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())

    user_tuples = []
    food_tuples = []
    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))
    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))

    print("Creating LightFM dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))
    print("Dataset Created")

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    print("fitting item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value for each_value in food_feature_values))

    print("fitting user partial features")
    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value for each_value in user_feature_values))

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating'])
         for y, x in userItem_df.iterrows()))

    print("Building item features")
    item_features = dataset.build_item_features(each_tuple for each_tuple in food_tuples)
    user_features = dataset.build_user_features(each_tuple for each_tuple in user_tuples)

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions, item_features=item_features, user_features=user_features)
    print("Model trained!!")

    print("Pickle started!!")
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)

    bucketName = "fitbook-lambda-packages"
    Key = "/tmp/model.pkl"
    outPutname = "model.pkl"

    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(Key, bucketName, outPutname)
    print("Upload done")

    os.remove("/tmp/model.pkl")
    print("Pickle file deleted")
    print("Successssss!!!!!")
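# A sketch (an addition): the Dataset's id mappings are needed at serving time
# to translate external user/food ids into the model's internal indices, so it
# is usually worth persisting them next to the model. A helper like this could
# be called at the end of lambda_handler; the paths and key names are
# illustrative, not from the original deployment.
def upload_mappings(dataset, s3_client, bucket_name):
    # (user map, user-feature map, item map, item-feature map)
    mappings = dataset.mapping()
    with open("/tmp/mappings.pkl", "wb") as f:
        pickle.dump(mappings, f, protocol=2)
    s3_client.upload_file("/tmp/mappings.pkl", bucket_name, "mappings.pkl")
    os.remove("/tmp/mappings.pkl")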
for x in get_ratings():
    if k == 5000:
        break
    id_isbn._user_id.append(x['user_id'])
    id_isbn._isbn.append(x['book_id'])
    k += 1
# print(id_isbn._user_id)

dataset = Dataset()
dataset.fit(id_isbn._user_id, id_isbn._isbn)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

dataset.fit_partial(items=(x['book_id'] for x in get_book_features()),
                    item_features=(x['authors'] for x in get_book_features()))

# Iterate over however many ratings were actually collected (at most 5000).
(interactions, weights) = dataset.build_interactions(
    (id_isbn._user_id[i], id_isbn._isbn[i])
    for i in range(len(id_isbn._user_id)))

item_features = dataset.build_item_features(
    ((x['book_id'], [x['authors']]) for x in get_book_features()))
print(item_features.shape)
print(interactions.shape)
# print(weights)

#################################
#                               #
#      Training the Model       #
#                               #
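# The snippet cuts off at the banner above; a plausible continuation (a
# sketch, not the original code) would train and evaluate a WARP model on the
# matrices just built:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

model = LightFM(loss='warp')
model.fit(interactions, item_features=item_features, epochs=10, num_threads=2)
print('Train precision@10: %.4f' % precision_at_k(
    model, interactions, k=10, item_features=item_features).mean())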