def test_build_features():
    """Features built from lists and from dicts populate the expected matrices."""
    n_users, n_items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    user_tags = ["user:{}".format(x) for x in range(n_users)]
    item_tags = ["item:{}".format(x) for x in range(n_items)]
    dataset.fit(range(n_users), range(n_items), user_tags, item_tags)

    # Build from lists: every user/item carries every tag, so matrices are dense.
    user_features = dataset.build_user_features(
        [(uid, list(user_tags)) for uid in range(n_users)]
    )
    assert user_features.getnnz() == n_users ** 2

    item_features = dataset.build_item_features(
        [(iid, list(item_tags)) for iid in range(n_items)]
    )
    assert item_features.getnnz() == n_items ** 2

    # Build from dicts: tag "user:x"/"item:x" gets weight x.
    user_features = dataset.build_user_features(
        [
            (uid, {"user:{}".format(x): float(x) for x in range(n_users)})
            for uid in range(n_users)
        ],
        normalize=False,
    )
    assert np.all(user_features.todense() == np.array([list(range(n_users))] * n_users))

    item_features = dataset.build_item_features(
        [
            (iid, {"item:{}".format(x): float(x) for x in range(n_items)})
            for iid in range(n_items)
        ],
        normalize=False,
    )
    assert np.all(item_features.todense() == np.array([list(range(n_items))] * n_items))

    # With normalization (the default) every row sums to one.
    item_features = dataset.build_item_features(
        [
            (iid, {"item:{}".format(x): float(x) for x in range(n_items)})
            for iid in range(n_items)
        ]
    )
    assert np.all(item_features.sum(1) == 1.0)
def test_build_features():
    """Exercise Dataset feature construction without identity features."""
    users, items = 10, 100
    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(
        range(users),
        range(items),
        ["user:{}".format(x) for x in range(users)],
        ["item:{}".format(x) for x in range(items)],
    )
    # Build from lists
    user_features = dataset.build_user_features(
        [(user_id, ["user:{}".format(x) for x in range(users)]) for user_id in range(users)]
    )
    # Every user carries every user tag, so the feature matrix is fully dense.
    assert user_features.getnnz() == users ** 2
    item_features = dataset.build_item_features(
        [(item_id, ["item:{}".format(x) for x in range(items)]) for item_id in range(items)]
    )
    assert item_features.getnnz() == items ** 2
    # Build from dicts: tag "user:x"/"item:x" maps to weight x.
    user_features = dataset.build_user_features(
        [
            (user_id, {"user:{}".format(x): float(x) for x in range(users)})
            for user_id in range(users)
        ],
        normalize=False,
    )
    assert np.all(user_features.todense() == np.array([list(range(users))] * users))
    item_features = dataset.build_item_features(
        [
            (item_id, {"item:{}".format(x): float(x) for x in range(items)})
            for item_id in range(items)
        ],
        normalize=False,
    )
    assert np.all(item_features.todense() == np.array([list(range(items))] * items))
    # Test normalization: with the default normalize=True each row sums to one.
    item_features = dataset.build_item_features(
        [
            (item_id, {"item:{}".format(x): float(x) for x in range(items)})
            for item_id in range(items)
        ]
    )
    assert np.all(item_features.sum(1) == 1.0)
def interactions(df):
    """Build train/test interaction matrices plus item/user feature matrices."""
    genres_per_movie = [g.split("|") for g in df["genre"]]
    genre_vocab = sorted(set(itertools.chain.from_iterable(genres_per_movie)))
    occupation_vocab = sorted(set(df["occupation"]))

    dataset = Dataset()
    dataset.fit(
        df["userID"],
        df["itemID"],
        item_features=genre_vocab,
        user_features=occupation_vocab,
    )

    item_features = dataset.build_item_features(zip(df.itemID, genres_per_movie))
    user_features = dataset.build_user_features(
        (uid, [occ]) for uid, occ in zip(df.userID, df["occupation"])
    )

    # The first three columns are taken as (user, item, weight) triples.
    interactions, _ = dataset.build_interactions(df.iloc[:, 0:3].values)
    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions,
        test_percentage=TEST_PERCENTAGE,
        random_state=np.random.RandomState(SEEDNO),
    )
    return train_interactions, test_interactions, item_features, user_features
def prepareData(df, tags):
    """Build LightFM interactions and feature matrices from click events.

    Args:
        df: Raw action log; only rows with actionCategory == "WebNei clicked"
            are used.
        tags: Item tag data consumed by prepareJson().

    Returns:
        Tuple of (interactions, item_features, user_features).
    """
    df = df[df.actionCategory == "WebNei clicked"]
    # Click count per (user, action); the index levels double as the user and
    # item id vocabularies below.
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(0).unique().values
    )].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()
    dataset = Dataset()
    dataset.fit((list(actionByUsers.index.get_level_values(0))),
                (list(actionByUsers.index.get_level_values(1))))
    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)
    dataset.fit_partial(items=rowM, item_features=colM,
                        users=rowU, user_features=colU)
    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    # NOTE(review): zip(rowM, [colM]) pairs only the FIRST item id with the
    # entire feature list (the second iterable has a single element). This
    # looks like it should be zip(rowM, colM) — confirm against the shapes
    # returned by prepareJson()/prepareUserFeatures().
    item_features = dataset.build_item_features(zip(rowM, [colM]))
    user_features = dataset.build_user_features(zip(rowU, [colU]))
    return interactions, item_features, user_features
def obtener_matrices(self):
    """Build the LightFM matrices needed to create the models.

    Only used by the text interface. Reads the ratings/users/items
    dataframes from Entrada, builds the interaction and feature matrices,
    pickles the feature matrices and the train/test split, and stores the
    results in module-level globals.
    """
    global train, test, modelo, item_features, user_features
    # Load the source dataframes.
    Entrada.obtener_datos()
    ratings_df = Entrada.ratings_df
    users_df = Entrada.users_df
    items_df = Entrada.items_df
    # Turn the dataframes into matrices the LightFM models can consume.
    # Column 0 holds the id and column 1 a single feature value per row.
    dataset = Dataset()
    dataset.fit(users_df[users_df.columns.values[0]],
                items_df[items_df.columns.values[0]],
                user_features=users_df[users_df.columns.values[1]],
                item_features=items_df[items_df.columns.values[1]])
    # Collaborative/hybrid models (options 1 and 2) weight interactions by
    # the rating column; otherwise interactions are unweighted pairs.
    if self.opcion_modelo == 1 or self.opcion_modelo == 2:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]],
             row[ratings_df.columns.values[2]])
            for index, row in ratings_df.iterrows())
    else:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]])
            for index, row in ratings_df.iterrows())
    # Build the feature matrices and persist them.
    item_features = dataset.build_item_features(
        (row[items_df.columns.values[0]], [row[items_df.columns.values[1]]])
        for index, row in items_df.iterrows())
    user_features = dataset.build_user_features(
        (row[users_df.columns.values[0]], [row[users_df.columns.values[1]]])
        for index, row in users_df.iterrows())
    print("Guarda la matriz de item features")
    guardar_datos_pickle(item_features, 'la matriz de item features')
    print("Guarda la matriz de user features")
    # NOTE(review): 'feautures' typo is preserved in the stored description.
    guardar_datos_pickle(user_features, 'la matriz de user feautures')
    # Split the interactions into train/test sets and persist them.
    train, test = random_train_test_split(interacciones, test_percentage=0.2)
    print("Guarda la matriz de entrenamiento")
    guardar_datos_pickle(train, 'la matriz de entrenamiento')
    print("Guarda la matriz de test")
    guardar_datos_pickle(test, 'la matriz de test')
def test_fitting_no_identity():
    """Without identity features the feature matrices have zero columns."""
    n_users, n_items = 10, 100
    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(n_users), range(n_items))

    assert dataset.interactions_shape() == (n_users, n_items)
    assert dataset.user_features_shape() == (n_users, 0)
    assert dataset.item_features_shape() == (n_items, 0)

    empty_interactions, _ = dataset.build_interactions([])
    assert empty_interactions.shape == (n_users, n_items)
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
def test_fitting():
    """With identity features (the default) each id contributes one feature."""
    n_users, n_items = 10, 100
    dataset = Dataset()
    dataset.fit(range(n_users), range(n_items))

    assert dataset.interactions_shape() == (n_users, n_items)
    assert dataset.user_features_shape() == (n_users, n_users)
    assert dataset.item_features_shape() == (n_items, n_items)

    empty_interactions, _ = dataset.build_interactions([])
    assert empty_interactions.shape == (n_users, n_items)
    assert dataset.build_user_features([]).getnnz() == n_users
    assert dataset.build_item_features([]).getnnz() == n_items
def fit_data(self, matrix, user_features=None, item_features=None):
    """Create datasets for .fit() method.

    Args:
        matrix: User-item interactions matrix (weighted)
        user_features: User-features pandas dataframe which index contains user_ids (crd_no)
        item_features: Item-features pandas dataframe which index contains good_ids (plu_id)

    Returns:
        Model with fitted (mapped) datasets
    """
    # Sort rows (users) and columns (items) for a stable id ordering.
    matrix.sort_index(inplace=True)
    matrix.sort_index(inplace=True, axis=1)
    dataset = Dataset()
    dataset.fit((x for x in matrix.index), (x for x in matrix.columns))
    # Melt the wide matrix into (crd_no, plu_id, rating) triples; zeros are
    # treated as missing and dropped.
    # NOTE(review): value_vars=matrix.columns[1:] skips the first item column
    # of the melt — confirm this is intentional and not an off-by-one.
    interactions = pd.melt(
        matrix.replace(0, np.nan).reset_index(),
        id_vars='index',
        value_vars=list(matrix.columns[1:]),
        var_name='plu_id',
        value_name='rating').dropna().sort_values('index')
    interactions.columns = ['crd_no', 'plu_id', 'rating']
    self.interactions, self.weights = dataset.build_interactions(
        [tuple(x) for x in interactions.values])
    if user_features is not None:
        user_features.sort_index(inplace=True)
        # Iterating a DataFrame yields its column labels, so passing the
        # frame as user_features registers the column names as features.
        dataset.fit_partial(users=user_features.index,
                            user_features=user_features)
        self.user_features = dataset.build_user_features(
            ((index, dict(row)) for index, row in user_features.iterrows()))
    else:
        self.user_features = None
    if item_features is not None:
        item_features.sort_index(inplace=True)
        dataset.fit_partial(items=item_features.index,
                            item_features=item_features)
        self.item_features = dataset.build_item_features(
            ((index, dict(row)) for index, row in item_features.iterrows()))
    else:
        self.item_features = None
def create_datasets(cluster_id):
    """Build a LightFM dataset plus interaction/feature matrices for one cluster.

    Args:
        cluster_id: Identifier of the cluster whose events are read from ES.

    Returns:
        Tuple of (dataset, interactions, weights, final_item_features,
        final_user_features).
    """
    events_list = get_events_from_es(cluster_id)
    dataframe_interactions, dataframe_users_features, dataframe_item_features, user_tuple, item_tuple = create_interactions_and_features(events_list, cluster_id)
    # Debug dumps go to stderr so they appear in the service logs.
    print(dataframe_interactions, cluster_id, file=sys.stderr)
    print(dataframe_users_features, cluster_id, file=sys.stderr)
    print(dataframe_item_features, cluster_id, file=sys.stderr)
    #print(user_tuple)
    # print(item_tuple)
    user_features = format_users_features(dataframe_users_features)
    #print(user_features)
    item_features = format_items_features(dataframe_item_features)
    #print(item_features)
    dataset = Dataset()
    dataset.fit(
        dataframe_interactions['user'].unique(),  # all the users
        dataframe_interactions['item'].unique(),  # all the items
        user_features = user_features,
        item_features = item_features
    )
    # Rows are assumed to be (user, item, weight) triples — TODO confirm
    # against create_interactions_and_features().
    (interactions, weights) = dataset.build_interactions(
        [(x[0], x[1], x[2]) for x in dataframe_interactions.values])
    # print(interactions)
    # print(weights)
    # normalize=False keeps raw feature weights instead of row-normalizing.
    final_user_features = dataset.build_user_features(user_tuple, normalize= False)
    final_item_features = dataset.build_item_features(item_tuple, normalize= False)
    return dataset, interactions, weights, final_item_features, final_user_features
# Ratings file: semicolon-separated, no header (columns addressed by position).
ratings = pd.read_csv('ratings.txt', sep=';', header=None)

from lightfm.data import Dataset

# Identity features are kept in addition to the explicit feature columns.
dataset = Dataset(user_identity_features=True, item_identity_features=True)
# Feature ids are raw column positions: 2..9 for items, 2..49 for users.
dataset.fit(users=(users[50].unique()),
            items=(items[0]),
            item_features=list(range(2, 10)),
            user_features=list(range(2, 50)))
# For each item, collect the column index whose one-hot flag equals 1.
# NOTE(review): np.argwhere(...)[0] keeps only the first matching position —
# confirm each item carries a single active flag.
items_features_raw = list(
    (item[1], (np.argwhere(np.array(item[3:]) == 1)[0] + 2).tolist())
    for item in items.itertuples())
items_features = dataset.build_item_features(items_features_raw)
users_features_raw = build_user_dict(users)
users_features = dataset.build_user_features(users_features_raw)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
# Keep only positive ratings and drop duplicate rows on columns 1, 2, 3.
ratings2 = ratings[ratings[2] > 0]
ratings2 = ratings2.drop_duplicates(subset=[1, 2, 3])
train, test = train_test_split(ratings2, test_size=0.1)
print(train.shape)
print(test.shape)
# Interactions are (column 3, column 1) pairs from the ratings frame.
(train_interactions, train_weights) = dataset.build_interactions(train[[3, 1]].values)
(test_interactions, test_weights) = dataset.build_interactions(test[[3, 1]].values)
print('[ %04ds ] Files loaded' % (time.time() - start_time))

# Users get a single placeholder feature; businesses get real features.
all_user_features = ['NO_FEAT']
all_business_features = Business.collect_business_features(business_stats)
all_user_ids = User.extract_user_ids(user_stats)
all_business_ids = Business.extract_business_ids(business_stats)

dataset = Dataset()
dataset.fit(all_user_ids,
            all_business_ids,
            user_features=all_user_features,
            item_features=all_business_features)
# The second positional argument is `normalize=True`: feature rows are
# scaled so each sums to one.
user_features = dataset.build_user_features(
    User.build_user_features(user_stats, all_user_ids), True)
business_features = dataset.build_item_features(
    Business.build_business_features(business_stats, all_business_ids), True)
print('[ %04ds ] Dataset initialized' % (time.time() - start_time))

# Normalize ratings per user (via Review.normalize_by_user) before
# building the sparse interaction matrix.
user_avg, user_std = Review.extract_user_average_and_std(training_set)
normalized_training_reviews = Review.normalize_by_user(training_set, user_avg)
training_interactions = Review.extract_sparse_interaction_matrix(normalized_training_reviews)
training_user_ids = Review.extract_user_ids(normalized_training_reviews)
training_business_ids = Review.extract_business_ids(normalized_training_reviews)
interactions.idaviso.values, interactions.rating.values ], dtype=np.object).T res_interactions, res_weights = lfm_dataset.build_interactions(data=interactions) print(50 * '-') print('Building User Features...') print(50 * '-') users_features = np.array([users.idpostulante.values, users[['edad', 'sexo', 'educacion']].values.tolist()], dtype=np.object).T print(users_features) print(50*'-') u_feat = lfm_dataset.build_user_features(data=users_features, normalize=False) print(50 * '-') print('Matrix factorization model...') mf_model = rs.runMF(interactions=res_interactions, n_components=30, loss='warp', k=15, epoch=50, n_jobs=24) print(50 * "-") print('Building Recommendations...') users = pd.Series(data=list(u_map.keys()), name='idpostulante')
# fit dataset dataset.fit(users=user_iterable, items=iteam_iterable, user_features=user_feature_names, item_features=item_feature_names ) # check shape num_users, num_items = dataset.interactions_shape() print('Num users: {}, num_items: {}.'.format(num_users, num_items)) _, num_users_feature = dataset.user_features_shape() _, num_items_feature = dataset.item_features_shape() print('Num users feature: {}, num_items feature: {}.'.format(num_users_feature, num_items_feature)) # build user feature matrix user_feature_matrix = dataset.build_user_features(user_feature_iterable, normalize=True) # build item feature matrix item_feature_matrix = dataset.build_item_features(item_feature_iterable, normalize=True) # build interaction (train_interactions, weights) = dataset.build_interactions(data=((row['userCode'], row['project_id'], row[interaction_col_name])for index, row in train.iterrows() if row['project_id'] not in ignore_project)) from lightfm import LightFM model = LightFM(loss='warp', random_state=44, learning_schedule='adagrad') model.fit(train_interactions, item_features=item_feature_matrix, user_features=user_feature_matrix, )
def train_model(
        df,
        user_id_col='user_id',
        item_id_col='business_id',
        item_name_col='name_business',
        evaluate=True):
    """ Train the model using collaborative filtering.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.
    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and interaction_index as value.
        item_dict: item dictionary containing item_id as key and item_name as value.
        user_feature_map: the feature map of users
        business_feature_map: the feature map of items
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')
    print('Training model...')
    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    # Category columns are identified by an uppercase first letter.
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']
    # Columns from position 10 onward are treated as extra item features —
    # TODO confirm this offset matches the dataframe layout.
    for i in df.columns[10:]:
        item_cols.append(str(i))
    user_features = user_cols[1:]
    item_features = item_cols[2:]
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
    )
    df_users = df.drop_duplicates(user_id_col)
    # df_users = df[df.duplicated(user_id_col) == False]
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(
        users_features, normalize=False)
    items = df.drop_duplicates(item_id_col)
    # items = df[df.duplicated(item_id_col) == False]
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(
            items.values[i], categories))
    items_features = ds_full.build_item_features(
        items_features, normalize=False)
    # First three columns are assumed to be (user, item, weight) triples.
    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])
    # model
    model_full = LightFM(
        no_components=100,
        learning_rate=0.05,
        loss='warp',
        max_sampled=50)
    model_full.fit(
        interactions,
        user_features=users_features,
        item_features=items_features,
        sample_weight=weights,
        epochs=10,
        num_threads=10)
    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()
    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
def lambda_handler(event, context):
    """AWS Lambda entry point: load data from RDS, train a LightFM model,
    pickle it and upload it to S3."""
    try:
        ## Fetch data from RDS code
        # NOTE(review): credentials are hard-coded — move them to environment
        # variables or Secrets Manager.
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)
        print("Connection successful")
    except:
        # NOTE(review): bare except swallows the real error and execution
        # continues with `connection` undefined — narrow and re-raise.
        print("Connection error")
    # In[3]:
    #Get Food DataFrame
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)
    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    # Drop portion/display columns that are not used as features.
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ], axis=1, inplace=True)
    # food_df.head()
    print('Food Dataframe imported')
    # In[4]:
    # # TODO: Perform Binning
    # food_30_bins = ['Alcohol', 'Calories', 'Saturated_Fats']
    # for each_column in food_30_bins:
    #     bins = np.linspace(food_df[each_column].min(), food_df[each_column].max(), 30)
    #     food_df[each_column+'bin'] = pd.cut(food_df[each_column], bins, labels=np.arange(0,len(bins)-1))
    # food_df
    # In[5]:
    # for each_column in food_30_bins:
    #     print(food_df[each_column].min())
    # In[6]:
    #Get User Dataframe
    # user_df = pd.read_csv('user_db_try.csv')
    # user_df.head()
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)
    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    # Tokens and ids are dropped; only demographic/profile columns remain.
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ], axis=1, inplace=True)
    # user_df.head()
    print('User Dataframe imported')
    # In[7]:
    #Get userItem DataFrame
    # userItem_df = pd.read_csv('userItem_db_try_new.csv')
    # userItem_df.head()
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)
    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    # userItem_df.head()
    print('UserItem Dataframe imported')
    # In[8]:
    #Make all the feature values unique by prefixing them with the column name
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(
                column_name) + ":" + food_df[column_name].astype(str)
    # food_df.head()
    # In[9]:
    #This Dict will be useful while creating tupples
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')
    # food_features_dict
    # In[10]:
    food_feature_values = []
    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())
    # food_feature_values
    # In[11]:
    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(
                column_name) + ":" + user_df[column_name].astype(str)
    user_features_df = user_df.drop(['userID'], axis=1).copy()
    user_features_dict = user_features_df.to_dict('split')
    # user_features_dict
    # In[12]:
    user_feature_values = []
    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())
    # user_feature_values
    # In[13]:
    # (id, feature-list) tuples consumed by build_*_features below.
    user_tuples = []
    food_tuples = []
    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))
    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))
    # food_tuples
    # In[14]:
    print("Creating LightFm dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))
    print("Dataset Created")
    # In[15]:
    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    # In[16]:
    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                     item_features=((each_feature for each_feature in food_features)for food_features in food_features_dict['data']))
    # In[17]:
    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                     item_features=((row['Milk'], row['Meats'], row['Alcohol'], row['Calories'])for index,row in food_df.iterrows()))
    # In[18]:
    print("fittng item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value for each_value in food_feature_values))
    # In[19]:
    # dataset.fit_partial(users=(user_id for user_id in user_df['Id']),
    #                     user_features=((each_feature for each_feature in user_features)for user_features in user_features_dict['data']))
    # In[20]:
    print("fittng user partial features")
    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value for each_value in user_feature_values))
    # In[21]:
    # dataset.item_features_shape()
    # dataset.user_features_shape()
    # In[22]:
    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating']) for y, x in userItem_df.iterrows()))
    # print(repr(interactions))
    # print(weights)
    # In[23]:
    # interactions.shape
    # In[24]:
    print("Building item features")
    item_features = dataset.build_item_features(each_tuple for each_tuple in food_tuples)
    # print(item_features)
    # In[25]:
    user_features = dataset.build_user_features(each_tuple for each_tuple in user_tuples)
    # print(user_features)
    # In[26]:
    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions, item_features=item_features, user_features=user_features)
    print("Model trained!!")
    print("Pickle started!!")
    # protocol=2 keeps the pickle readable by Python 2 consumers.
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)
    bucketName = "fitbook-lambda-packages"
    Key = "/tmp/model.pkl"
    outPutname = "model.pkl"
    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(Key, bucketName, outPutname)
    print("Upload done")
    os.remove("/tmp/model.pkl")
    print("Pickle file deleted")
    print("Successssss!!!!!")
'writer_name']].drop_duplicates().reset_index(drop=True) users = uid[['uid', 'popular_section', 'popular_platform', 'popular_sources']].drop_duplicates() dataset = Dataset() features_list = create_feature_list(items, cols=['section_primary', 'writer_name']) user_features_list = create_feature_list( users, cols=['popular_section', 'popular_platform', 'popular_sources']) #features_list = list(set(items.writer_name.to_list())) dataset.fit(users=uid.uid.unique(), items=uid.article_id.unique(), item_features=features_list, user_features=user_features_list) (interactions, weights) = dataset.build_interactions( (x.uid, x.article_id) for x in uid.itertuples()) n_users, n_items = interactions.shape 1 - (interactions.getnnz() / (interactions.shape[0] * interactions.shape[1])) item_features = dataset.build_item_features([ (i.article_id, [i.section_primary, i.writer_name]) for i in items.itertuples() ]) user_features = dataset.build_user_features([(u.uid, [u.popular_section]) for u in users.itertuples()]) item_features = dataset.build_item_features(build_features(items)) user_features = dataset.build_user_features(build_features(users)) user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping( )
def model(df, params, u=None, i=None):
    """Train and evaluate a LightFM model on (userID, poiID) interactions.

    Args:
        df: DataFrame with 'userID' and 'poiID' columns.
        params: dict with keys 'seed', 'test', 'f', 'lr', 'loss', 'alpha',
            'epochs' and 'k'.
        u: optional user-feature pair; from usage, u[1] is the feature-name
            list passed to fit() and u[0] is a Series of per-user values —
            confirm against callers.
        i: optional item-feature pair with the same structure as `u`.

    Returns:
        dict of precision@k and MRR on train and test, as percentages.
    """
    state = np.random.RandomState(params['seed'])
    data = Dataset()
    data.fit(df['userID'].unique(),
             df['poiID'].unique(),
             user_features=u[1] if u is not None else None,
             item_features=i[1] if i is not None else None)
    if u is not None:
        # (user_id, feature) pairs straight from the Series.
        user_features_iterable = map(lambda l: (l[0], l[1]), u[0].iteritems())
        user_features = data.build_user_features(user_features_iterable,
                                                 normalize=False)
    else:
        user_features = None
    if i is not None:
        # Item feature values are wrapped in a one-element list.
        item_features_iterable = map(lambda l: (l[0], [l[1]]), i[0].iteritems())
        item_features = data.build_item_features(item_features_iterable,
                                                 normalize=False)
    else:
        item_features = None
    ratings, weights = data.build_interactions(
        df[['userID', 'poiID']].itertuples(index=False, name=None))
    train, test = random_train_test_split(ratings,
                                          test_percentage=params['test'],
                                          random_state=state)
    lfm = LightFM(no_components=params['f'],
                  learning_rate=params['lr'],
                  loss=params['loss'],
                  user_alpha=params['alpha'],
                  random_state=state)
    lfm.fit(train,
            epochs=params['epochs'],
            user_features=user_features,
            item_features=item_features)
    # All metrics are reported as percentages.
    return {
        'pr-train': 100.0 * precision_at_k(lfm, train, k=params['k'],
                                           user_features=user_features,
                                           item_features=item_features).mean(),
        'mrr-train': 100.0 * reciprocal_rank(lfm, train,
                                             user_features=user_features,
                                             item_features=item_features).mean(),
        'pr-test': 100.0 * precision_at_k(lfm, test, k=params['k'],
                                          user_features=user_features,
                                          item_features=item_features).mean(),
        'mrr-test': 100.0 * reciprocal_rank(lfm, test,
                                            user_features=user_features,
                                            item_features=item_features).mean()
    }
dataset = Dataset() dataset.fit((x[0] for i, x in users.iterrows()), (x[1] for i, x in users.iterrows())) # (interactions, weights) = dataset.build_interactions((x[0],x[1]) for i,x in users.iterrows()) dataset.fit_partial(items=(x['destinationid'] for i, x in destinations.iterrows()), item_features=(x['Destination-tf-idf'] for i, x in destinations.iterrows())) dataset.fit_partial(items=(x['userid'] for i, x in users.iterrows()), user_features=(x['age'] for i, x in users.iterrows())) item_features = dataset.build_item_features( ((x['destinationid'], [x['Destination-tf-idf']]) for i, x in destinations.iterrows())) user_features = dataset.build_user_features( ((x['userid'], [x['age']]) for i, x in users.iterrows())) mf_model = runMF(interactions=interactions, item_features=item_features, user_features=user_features, n_components=30, loss='warp', epoch=30, n_jobs=4) def get_all_users(): return user_all_records def sample_recommendation_user_1(user_id):
# fit the dataset to create mappings for users, items and respective features dataset.fit((x['_id'] for x in full_users), (x['_id'] for x in locations_data), user_features=interestList, item_features=subCatList) # build interactions from ratings data, 'liked' places only at the moment (~350 only) interactions = dataset.build_interactions( ((x['userId'], x['place'], x['weight']) for x in full_ratings)) print(repr(interactions[0])) # print number of users and items num_users, num_items = dataset.interactions_shape() print('Num users: {}, num_items {}.'.format(num_users, num_items)) # buil user features from users interests user_features = dataset.build_user_features( ((x['_id'], x['interests']) for x in full_users), normalize=False) # buil item features from users iterests item_features = dataset.build_item_features( ((x['_id'], x['subCategory']) for x in locations_data), normalize=False) # print(repr(item_features)) with open('data.json', 'w') as outfile: json.dump(dataset.mapping(), outfile) model = LightFM(loss='warp', no_components=20) model.fit(interactions[0], user_features=user_features, item_features=item_features) train_auc = auc_score(model,
# Register users, items and their feature vocabularies with the dataset.
dataset.fit_partial(users=(x['User-ID'] for x in get_user_features()),
                    items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()),
                    user_features=(x['Age'] for x in get_user_features()))
# Unweighted (user, book) interaction pairs from the ratings stream.
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))
#print(repr(interactions))
# Each book gets its author as a single feature; each user their age.
item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))
#print(repr(item_features))
user_features = dataset.build_user_features(
    ((x['User-ID'], [x['Age']]) for x in get_user_features()))
labels = np.array([x['ISBN'] for x in get_ratings()])

#################################
#                               #
#      Training the Model       #
#                               #
#################################

model = LightFM(loss='warp')
(train, test) = random_train_test_split(interactions=interactions,
                                        test_percentage=0.2)
model.fit(train, item_features=item_features, user_features=user_features,
          epochs=2)
def main():
    """Single Flask view driving the group movie-recommendation flow.

    POST requests step a small state machine kept in the session:
    choose group size -> enter ages/genders -> vote on 15 movies ->
    run either the matrix-factorisation model or the LightFM model.
    GET resets the session and samples 15 fresh trending movies to rate.

    Relies on module-level state defined elsewhere in the file:
    df_movies, friends, ratings, new_friend_id, movies,
    soup_movie_features — TODO confirm all are initialised before first request.
    """
    if request.method == 'POST':
        global df_movies
        # global top_trending_ids
        # print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
        print(request.form)
        # --- Run the matrix-factorisation recommender --------------------
        if 'run-mf-model' in request.form:
            # Drop the last two entries, then truncate every user's vote
            # list and the movie-id list to a common length.
            for i, user_rating in enumerate(session['arr']):
                session['arr'][i] = user_rating[:-2]
            session['movieIds'] = session['movieIds'][:-2]
            rated_movies = min(len(session['arr'][0]), len(session['movieIds']))
            for i, user_rating in enumerate(session['arr']):
                session['arr'][i] = user_rating[:rated_movies]
            session['movieIds'] = session['movieIds'][:rated_movies]
            pu = recommendation_mf(session['arr'], session['members'], session['movieIds'])
            # Reset session state for the next group and pre-sample 15 new
            # trending movies (from the 200 best trending_score rows).
            session.clear()
            top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
            session['counter'] = 0
            session['members'] = 0
            session['userAges'] = []
            session['userGenders'] = []
            session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml)
            session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title)
            session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
            session['arr'] = None
            return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
        # --- Run the LightFM ("siamese") recommender ---------------------
        if 'run-siamese-model' in request.form:
            # global df
            global friends
            global ratings
            global new_friend_id
            # Collapse the group's votes into one synthetic "friend":
            # per-movie rating is the group's median vote; age/gender are
            # the group means.
            new_ratings = []
            for mid, movie_real_id in enumerate(session['movieIds']):
                avg_mv_rating = np.median(np.array([user_ratings[mid] for user_ratings in session['arr']]))
                new_ratings.append({'movie_id_ml':movie_real_id, 'rating': avg_mv_rating, 'friend_id': new_friend_id})
            new_friend = {'friend_id': new_friend_id, 'friends_age': np.mean(np.array(session['userAges'])), 'friends_gender': np.mean(np.array(session['userGenders']))}
            friends.append(new_friend)
            ratings.extend(new_ratings)
            dataset = LightFMDataset()
            # NOTE(review): eval() builds the per-row feature tuples from
            # these module-defined template strings (not user input), but
            # plain tuple-building functions would be safer and faster.
            item_str_for_eval = "x['title'],x['release'], x['unknown'], x['action'], x['adventure'],x['animation'], x['childrens'], x['comedy'], x['crime'], x['documentary'], x['drama'], x['fantasy'], x['noir'], x['horror'], x['musical'],x['mystery'], x['romance'], x['scifi'], x['thriller'], x['war'], x['western'], *soup_movie_features[x['soup_id']]"
            friend_str_for_eval = "x['friends_age'], x['friends_gender']"
            dataset.fit(users=(int(x['friend_id']) for x in friends),
                        items=(int(x['movie_id_ml']) for x in movies),
                        item_features=(eval("("+item_str_for_eval+")") for x in movies),
                        user_features=((eval(friend_str_for_eval)) for x in friends))
            num_friends, num_items = dataset.interactions_shape()
            print(f'Num friends: {num_friends}, num_items {num_items}. {datetime.datetime.now()}')
            (interactions, weights) = dataset.build_interactions(((int(x['friend_id']), int(x['movie_id_ml'])) for x in ratings))
            item_features = dataset.build_item_features(((x['movie_id_ml'], [eval("("+item_str_for_eval+")")]) for x in movies) )
            user_features = dataset.build_user_features(((x['friend_id'], [eval(friend_str_for_eval)]) for x in friends) )
            print(f"Item and User features created {datetime.datetime.now()}")
            # Model hyper-parameters.
            epochs = 50 #150
            lr = 0.015
            max_sampled = 11
            loss_type = "warp" # "bpr"
            model = LightFM(learning_rate=lr, loss=loss_type, max_sampled=max_sampled)
            model.fit_partial(interactions, epochs=epochs, user_features=user_features, item_features=item_features)
            # In-sample metrics only — no held-out split here.
            train_precision = precision_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()
            train_auc = auc_score(model, interactions, user_features=user_features, item_features=item_features).mean()
            print(f'Precision: {train_precision}, AUC: {train_auc}, {datetime.datetime.now()}')
            k = 18
            top_movie_ids, scores = predict_top_k_movies(model, new_friend_id, k, num_items, user_features=user_features, item_features=item_features, use_features = False)
            top_movies = df_movies[df_movies.movie_id_ml.isin(top_movie_ids)]
            pu = recommendation_siamese(top_movies, scores)
            return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
        # Collect friends info
        elif 'person-select-gender-0' in request.form:
            for i in range(session['members']):
                session['userAges'].append(int(request.form.get(f'age-{i}')))
                session['userGenders'].append(int(request.form.get(f'person-select-gender-{i}')))
            return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': session['members'], 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
        # Choose number of people in the group
        elif 'people-select' in request.form:
            count = int(request.form.get('people-select'))
            session['members'] = count
            # One 15-slot vote list per group member.
            session['arr'] = [[0 for x in range(15)] for y in range(count)]
            return(render_template('main.html', settings = {'friendsInfo':True, 'showVote': False, 'people': count, 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
        # All people voting
        elif 'person-select-0' in request.form:
            # Record everyone's vote for the movie at position 'counter'.
            for i in range(session['members']):
                session['arr'][i][session['counter']] = int(request.form.get(f'person-select-{i}'))
            session['counter'] += 1
            if session['counter'] < 15:
                return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
            else:
                return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':True, 'recommendation': None}))
    elif request.method == 'GET':
        # Fresh visit: reset state and sample 15 trending movies to rate.
        session.clear()
        top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
        print(top_trending_ids)
        print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
        session['counter'] = 0
        session['members'] = 0
        session['userAges'] = []
        session['userGenders'] = []
        session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml)
        session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title)
        session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
        session['arr'] = None
        return(render_template('main.html', settings = {'showVote': False, 'people': 0, 'buttonDisable': False, 'recommendation': None}))
def lightfm_node(X1_train, X2_train, X1_test, X2_test):
    """Learn LightFM user embeddings from interactions (X2) and user
    attributes (X1), returning them as a DataFrame indexed by user id.

    The train/test splits are concatenated again here: X1 holds one row of
    user attributes per 'id'; X2 holds (id, A) interaction pairs.
    """
    X2 = pd.concat([X2_train, X2_test])
    X1 = pd.concat([X1_train, X1_test]).set_index('id')
    X1.columns = ['X1_' + i for i in X1.columns]
    # Discretise the continuous columns into decile bins.
    # NOTE(review): np.arange(0, 1, 0.1) stops at 0.9, so values above the
    # 90th percentile fall outside the bins and become NaN — confirm this
    # is intended (np.arange(0, 1.01, 0.1) would cover the full range).
    X1['X1_5'] = pd.qcut(X1['X1_5'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_8'] = pd.qcut(X1['X1_8'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_6'] = pd.qcut(X1['X1_6'], np.arange(0, 1, 0.1), duplicates='drop')
    # Turn each categorical value into a distinct feature token "{col}_{value}".
    for col in ['X1_6', 'X1_8', 'X1_5', 'X1_1', 'X1_13']:
        X1[col] = X1[col].map(lambda x: '{' + col + '}_{' + str(x) + '}')
    X1 = X1.reset_index()
    from lightfm.data import Dataset
    dataset = Dataset()
    # Register ids from the interaction table, then grow the user-feature
    # vocabulary column by column (order determines feature indices).
    dataset.fit(users=(x for x in X2['id']), items=(x for x in X2['A']))
    dataset.fit_partial(users=(x for x in X1['id']), user_features=(x for x in X1['X1_1']))
    dataset.fit_partial(users=(x for x in X1['id']), user_features=(x for x in X1['X1_13']))
    dataset.fit_partial(users=(x for x in X1['id']), user_features=(x for x in X1['X1_5']))
    dataset.fit_partial(users=(x for x in X1['id']), user_features=(x for x in X1['X1_8']))
    dataset.fit_partial(users=(x for x in X1['id']), user_features=(x for x in X1['X1_6']))
    # Per-user feature lists, row-normalised so each user's features sum to 1.
    user_features = dataset.build_user_features(
        [(x[1]['id'], x[1][['X1_1', 'X1_13', 'X1_5', 'X1_8', 'X1_6' ]].values.tolist()) for x in X1.iterrows()],
        normalize=True)
    (interactions, weights) = dataset.build_interactions(zip(*X2[['id', 'A']].values.T))
    model = LightFM(no_components=32, learning_rate=0.04, loss='bpr', max_sampled=55, random_state=0)
    # 20 single-epoch partial fits.
    num_epochs = 20
    for i in range(num_epochs):
        model.fit_partial(interactions, user_features=user_features)
    users_mapping, user_features_mapping, assets_mapping, asset_features_mapping = dataset.mapping( )
    user_features_mapping_inv = { j: i for i, j in user_features_mapping.items() }  # (currently unused)
    # L2-normalise every embedding row, then keep only the first
    # len(users_mapping) rows — presumably the per-user identity-feature
    # embeddings; verify row order against dataset.mapping().
    tag_embeddings = (model.user_embeddings.T / np.linalg.norm(model.user_embeddings, axis=1)).T
    lightfm_embed = pd.DataFrame(tag_embeddings[:len(users_mapping)], index=X1['id'])
    return lightfm_embed
def obtener_matrices_gui(self, ruta_ratings, sep_ratings, encoding_ratings,
                         ruta_users, sep_users, encoding_users, ruta_items,
                         sep_items, encoding_items):
    """Build the matrices required to create the LightFM models.

    Only used by the web interface. Results are published through the
    module-level globals ``train``, ``test``, ``item_features`` and
    ``user_features`` and are also pickled to disk.

    Parameters
    ----------
    ruta_ratings: str
        path of the file containing the ratings.
    sep_ratings: str
        separator used in the ratings file.
    encoding_ratings: str
        encoding used in the ratings file.
    ruta_users: str
        path of the file containing the user data.
    sep_users: str
        separator used in the users file.
    encoding_users: str
        encoding used in the users file.
    ruta_items: str
        path of the file containing the item data.
    sep_items: str
        separator used in the items file.
    encoding_items: str
        encoding used in the items file.
    """
    global train, test, item_features, user_features
    # Load the three dataframes, sorted by their id column(s) so the
    # generated mappings are deterministic.
    ratings_df = Entrada.leer_csv(ruta_ratings, sep_ratings, encoding_ratings)
    ratings_df.sort_values(
        [ratings_df.columns.values[0], ratings_df.columns.values[1]],
        inplace=True)
    users_df = Entrada.leer_csv(ruta_users, sep_users, encoding_users)
    users_df.sort_values([users_df.columns.values[0]], inplace=True)
    items_df = Entrada.leer_csv(ruta_items, sep_items, encoding_items)
    items_df.sort_values([items_df.columns.values[0]], inplace=True)
    # Transform the dataframes into matrices usable by the models.
    # Column 0 is assumed to be the id and column 1 the single feature
    # column — TODO confirm against the input-file format.
    dataset = Dataset()
    dataset.fit(users_df[users_df.columns.values[0]],
                items_df[items_df.columns.values[0]],
                user_features=users_df[users_df.columns.values[1]],
                item_features=items_df[items_df.columns.values[1]])
    # Collaborative (1) and hybrid (2) models include the rating value as
    # interaction weight; otherwise interactions are binary.
    if self.opcion_modelo == 1 or self.opcion_modelo == 2:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]],
             row[ratings_df.columns.values[2]])
            for index, row in ratings_df.iterrows())
    else:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]])
            for index, row in ratings_df.iterrows())
    # Build the feature matrices and persist them.
    item_features = dataset.build_item_features(
        (row[items_df.columns.values[0]], [row[items_df.columns.values[1]]])
        for index, row in items_df.iterrows())
    user_features = dataset.build_user_features(
        (row[users_df.columns.values[0]], [row[users_df.columns.values[1]]])
        for index, row in users_df.iterrows())
    print("Guarda la matriz de item features")
    guardar_datos_pickle(item_features, 'la matriz de item features')
    print("Guarda la matriz de user features")
    # NOTE(review): "feautures" typo below is inside a runtime string; left as-is.
    guardar_datos_pickle(user_features, 'la matriz de user feautures')
    # Split the interactions into train/test sets and persist them.
    train, test = random_train_test_split(interacciones, test_percentage=0.2)
    print("Guarda la matriz de entrenamiento")
    guardar_datos_pickle(train, 'la matriz de entrenamiento')
    print("Guarda la matriz de test")
    guardar_datos_pickle(test, 'la matriz de test')
def evaluate_model(
        df, user_id_col='user_id', item_id_col='business_id',
        stratify=None):
    """Train a LightFM hybrid model on an 80/20 split of ``df`` and print
    train/test AUC-ROC.

    Args:
        df: input dataframe; its first three columns must be
            (user id, item id, rating) — they feed build_interactions.
        user_id_col: user id column name.
        item_id_col: item id column name.
        stratify: passed through to sklearn's train_test_split.

    No return value (prints the AUC scores).
    """
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)
    ds = Dataset()

    # Feature-name vocabularies: one numeric user feature, plus state and
    # the category columns (capitalised column names) for items.
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state'] + [str(c) for c in df.columns[10:]]
    user_features = user_cols[1:]
    item_features = item_cols[2:]
    # we call fit to supply user ids, item ids and user/item features
    ds.fit(
        df[user_id_col].unique(),   # all the users
        df[item_id_col].unique(),   # all the items
        user_features=user_features,
        item_features=item_features)

    def _user_feature_matrix(split):
        # One feature row per distinct user in the split.
        # (was a manual range(len(...))/append loop with hard-coded 'user_id')
        users = split.drop_duplicates(user_id_col)
        return ds.build_user_features(
            [get_users_features_tuple(row) for row in users.values],
            normalize=False)

    def _item_feature_matrix(split):
        # One feature row per distinct item in the split.
        items = split.drop_duplicates(item_id_col)
        return ds.build_item_features(
            [get_items_features_tuple(row, categories) for row in items.values],
            normalize=False)

    train_user_features = _user_feature_matrix(train)
    test_user_features = _user_feature_matrix(test)
    train_item_features = _item_feature_matrix(train)
    test_item_features = _item_feature_matrix(test)

    # plugging in the interactions and their weights
    (train_interactions, train_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in train.values])
    (test_interactions, test_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in test.values])

    # model
    model = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model.fit(
        train_interactions,
        user_features=train_user_features,
        item_features=train_item_features,
        sample_weight=train_weights,
        epochs=10,
        num_threads=10)

    # auc-roc on both splits
    train_auc = auc_score(
        model,
        train_interactions,
        user_features=train_user_features,
        item_features=train_item_features,
        num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(
        model,
        test_interactions,
        user_features=test_user_features,
        item_features=test_item_features,
        num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
def predict_hard_users(
    train: pd.DataFrame,
    test: pd.DataFrame,
    genre: pd.DataFrame,
    education: pd.DataFrame,
    notices: pd.DataFrame,
    available_notices: set,
    applicant_notice: dict,
    header=None,
):
    """Train a LightFM WARP model on the train interactions and write the
    top-10 notice recommendations for every user in ``test``.

    Args:
        train: interaction rows; column 1 is the applicant id, column 0
            the notice id and column 3 the interaction weight.
        test: users to produce predictions for (``idpostulante`` column).
        genre, education: applicant attribute tables merged into user features.
        notices: notice attribute table (``idaviso`` plus feature columns).
        available_notices: set of notices that may still be recommended.
        applicant_notice: applicant id -> set of notices already applied to.
        header: forwarded to ``write_dict`` for the output file.

    Returns:
        ``["lightfm"]``, the tag of the prediction file written.
    """
    # User features: genre + education, dropping the raw birth-date column.
    user_feature = genre.merge(education, on="idpostulante", how="left")
    user_feature.drop(columns=["fechanacimiento"], inplace=True)
    user_feature_hard_user = user_feature[user_feature.idpostulante.isin(
        train.idpostulante)]
    uf = generate_features(user_feature[["sexo", "nombre", "estado"]])
    itf = generate_features(notices[[
        "nombre_zona", "tipo_de_trabajo", "nivel_laboral", "nombre_area"
    ]])
    dataset1 = Dataset()
    dataset1.fit(
        train.idpostulante.unique(),  # all the users
        notices.idaviso.unique(),
        user_features=uf,  # additional user features
        item_features=itf,  # additional item features
    )
    # plugging in the interactions and their weights
    (interactions, weights) = dataset1.build_interactions([
        (x[1], x[0], x[3]) for x in train.values
    ])
    user_feature_list = generate_in_use_features(
        user_feature_hard_user[["sexo", "nombre", "estado"]].values,
        ["sexo", "nombre", "estado"],
    )
    user_tuple = list(
        zip(user_feature_hard_user.idpostulante, user_feature_list))
    user_features = dataset1.build_user_features(user_tuple, normalize=False)
    (
        user_id_map,
        user_feature_map,
        item_id_map,
        item_feature_map,
    ) = dataset1.mapping()
    inv_item_id_map = {v: k for k, v in item_id_map.items()}

    # no_components candidates once swept: [10, 35, 50, 80, 100, 200]
    component = 35
    model = lfm.LightFM(no_components=component, loss="warp", random_state=42)
    model.fit(
        interactions,
        # user_features=user_features,  # intentionally disabled
        # sample_weight=weights,
        epochs=150,
        num_threads=8,
        verbose=True,
    )
    test_precision = precision_at_k(
        model,
        interactions,
        # user_features=user_features,
        k=10,
        num_threads=8,
    ).mean()
    logger.info(
        f"Evaluation for LightFM is: {test_precision} with {component} number of component"
    )

    final_predictions = {}
    # Loop invariants hoisted out of the per-user loop (were recomputed
    # on every iteration).
    n_users, n_items = interactions.shape
    all_item_positions = np.arange(n_items)
    for a_user in tqdm(test.idpostulante.unique()):
        # Were bare `except:` blocks — a missing key is the only expected
        # failure, so use dict lookups with defaults instead.
        notices_by_user = applicant_notice.get(a_user, set())
        # Cold-start users fall back to internal user index 0.
        user_x = user_id_map.get(a_user, 0)
        # Item indices sorted by descending predicted score.
        prediction = np.argsort(
            model.predict(
                user_x,
                all_item_positions,
                # user_features=user_features,
            ))[::-1]
        # Keep the 10 best-scored notices that are still available and
        # that the user has not already applied to.
        prediction_for_user = []
        for pred in prediction:
            notice = inv_item_id_map[pred]
            if notice in available_notices and notice not in notices_by_user:
                prediction_for_user.append(notice)
                if len(prediction_for_user) == 10:
                    break
        final_predictions[a_user] = prediction_for_user
    write_dict(final_predictions, "lightfm", header)
    return ["lightfm"]
# Load the precomputed per-business statistics.
business_stats: Dict[str, Business] = Business.load_from_file(
    business_stats_file)
print('[ %04ds ] Files loaded' % (time.time() - start_time))
# Users get a single dummy feature; businesses get their collected features.
all_user_features = ['NO_FEAT']
all_business_features = Business.collect_business_features(business_stats)
dataset = Dataset()
dataset.fit(User.extract_user_ids(user_stats),
            Business.extract_business_ids(business_stats),
            user_features=all_user_features,
            item_features=all_business_features)
# The second positional argument is LightFM's `normalize` flag (row-sums
# scaled to 1).
user_features = dataset.build_user_features(
    User.build_user_features(user_stats, User.extract_user_ids(user_stats)),
    True)
business_features = dataset.build_item_features(
    Business.build_business_features(
        business_stats, Business.extract_business_ids(business_stats)),
    True)
print('[ %04ds ] Dataset initialized' % (time.time() - start_time))
# Center each user's reviews by that user's mean rating before building
# the sparse interaction matrix.
user_avg, user_std = Review.extract_user_average_and_std(training_set)
normalized_training_reviews = Review.normalize_by_user(
    training_set, user_avg)
training_interactions = Review.extract_sparse_interaction_matrix(
    normalized_training_reviews)
def preprocess():
    """Load the Yelp CSVs, build a LightFM Dataset with weighted user and
    item features, and split the interactions into train/test sets.

    Returns:
        (train, test, data_business, dataset, user_features, item_features)
    """
    import pandas as pd
    import math
    import numpy as np

    data_users = pd.read_csv('users_tag.csv', index_col=0)
    data_business = pd.read_csv('business_Nora.csv', index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv', index_col=0)

    # log(1+x) to reduce the skewness of the count-like columns.
    data_users.review_count = pd.Series([math.log(x + 1) for x in data_users.review_count])
    data_users.useful = pd.Series([math.log(x + 1) for x in data_users.useful])
    # clean business skewness
    data_business.review_count = pd.Series([math.log(x + 1) for x in data_business.review_count])

    from lightfm.data import Dataset

    # model establishment: id mappings come from the review table.
    dataset = Dataset()
    dataset.fit(data_review.user_id, data_review.business_id)
    num_users, num_items = dataset.interactions_shape()

    # Register the item and user feature vocabularies.
    dataset.fit_partial(items=data_business.business_id, item_features=['stars'])
    dataset.fit_partial(items=data_business.business_id, item_features=['review_count'])
    tar_cols = list(data_business.columns[24:])
    dataset.fit_partial(items=data_business.business_id, item_features=tar_cols)
    # (was `[x for x in data_users[[...]]]` — iterating a DataFrame yields
    # its column names, so this is simply that list of columns.)
    user_cols = ['review_count', 'useful', 'Ice Cream & Frozen Yogurt', 'Korean',
                 'Tapas/Small Plates', 'Vietnamese', 'Vegan', 'Caribbean',
                 'Food Delivery Services', 'Lounges', 'Pubs', 'Greek',
                 'Cocktail Bars', 'Mexican', 'Wine Bars', 'Tea Rooms', 'Delis',
                 'Vegetarian', 'Ethnic Food', 'Salad', 'Seafood', 'Beer',
                 'American (New)', 'Juice Bars & Smoothies', 'Shopping',
                 'Barbeque', 'Sports Bars', 'French', 'Chicken Wings',
                 'Gastropubs', 'Diners', 'Gluten-Free', 'Thai', 'Comfort Food',
                 'Health Markets', 'Halal', 'Caterers', 'Arts & Entertainment']
    dataset.fit_partial(users=data_users.user_id, user_features=user_cols)

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        [(x['user_id'], x['business_id'], x['stars'])
         for index, x in data_review.iterrows()])
    print("Interactions Build")

    # build user and item features
    # (previously duplicated verbatim as build_dict/user_build_dict — merged)
    def build_dict(df, tar_cols, val_list):
        # Spread the remaining weight (2 - sum(val_list)) over the tag
        # columns, proportional to their tf-idf values.
        rst = {col: df[col] for col in tar_cols}
        sum_val = sum(rst.values())  # sum of all the tfidf values
        if sum_val == 0:
            return rst
        w = (2 - sum(val_list)) / sum_val  # weight for each tag
        return {key: value * w for key, value in rst.items()}

    # get max of each column to regularize values to [0,1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)
    # give CF info weight 0.5, all other 0.5. Then in others, give
    # (star, review count) 0.25 and tags 0.25
    item_features = dataset.build_item_features(
        ((x['business_id'],
          {'stars': 0.5 * x['stars'] / max_star,
           'review_count': 0.5 * x['review_count'] / max_b_rc,
           **build_dict(x, tar_cols,
                        [0.5 * x['stars'] / max_star,
                         0.5 * x['review_count'] / max_b_rc])})
         for index, x in data_business.iterrows()))

    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(
        ((x['user_id'],
          {'review_count': 0.35 * x['review_count'] / max_u_rc,
           'useful': 0.35 * x['useful'] / max_useful,
           **build_dict(x, user_cols,
                        [0.35 * x['review_count'] / max_u_rc,
                         0.35 * x['useful'] / max_useful])})
         for index, x in data_users.iterrows()))

    # train-test split; other seeds used to check split bias: 12345, 101, 186
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train, test = random_train_test_split(
        interactions, test_percentage=0.2,
        random_state=np.random.RandomState(seed))
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    # make sure train and test are truly disjoint
    # (was a bare expression whose result was discarded — now enforced)
    assert train.multiply(test).nnz == 0, "train/test interactions overlap"
    return train, test, data_business, dataset, user_features, item_features