def prepare(dataset_path):
    """Load the dataset, scale the pollution series to [0, 1] and split it.

    Returns the fitted scaler together with a dict holding the train/test
    partitions and their input/output parts.
    """
    # Fetch the raw data plus its metadata and echo both for inspection.
    frame = load_data(dataset_path)
    meta = get_metadata(frame)
    print_data(frame, meta)

    # Pull the pollution column out as a single-feature column vector.
    column = reshape(frame['pollution'].values, (-1, 1))

    # Normalize the column into the [0, 1] range.
    scaler = MinMaxScaler(feature_range=(0, 1))
    normalized = scaler.fit_transform(column)

    # First split: train vs. test partitions.
    data = split_data(normalized)
    print_splitted_data(data)

    # Second split: inputs vs. outputs within each partition.
    data['train_in'], data['train_out'] = split_data(data['train'], False)
    data['test_in'], data['test_out'] = split_data(data['test'], False)
    print_splitted_data(data, True)

    # Preview the head of the training inputs and outputs.
    print('Data Train Input')
    print(DataFrame(data['train_in']).head())
    print('Data Train Output')
    print(DataFrame(data['train_out']).head())

    return scaler, data
pred = pred.groupby('user_ix').apply(lambda x: list(x['article'])) pred = pd.DataFrame(pred, columns=['predictions']) return pred, pred_raw if __name__ == "__main__": from preprocessing import load_data, get_metadata, load_data_vertical N = 50 # number of predictions limit = 5000 # number of samples to look at first_time = True folder = os.getenv('DATA_FOLDER', 'processed') user_item_train, user_item_test, user_item_validation = load_data( folder=folder, cut=40) metadata = get_metadata(folder=folder, usecols=['resource_id', 'text']) # slow user_item_train = user_item_train.head(limit) user_item_test = user_item_test.head(limit) print(f"Data loaded") model_params = { 'lr': 0.0001, 'batch_size': 400, 'epochs': 4, 'alpha': 1, 'layers': [128, 32, 8], 'dropout': 0.5, } if first_time: preprocessing(user_item_train, metadata,
def run(cut, high_cut, seed, epsilon, lr):
    """
    Trains a idea1 model in a federated way. We take the same data for client a and client b.
    We output the metrics for three usergroups: all users, low click users, high click users.

    :param cut: lower user-activity cut handed to load_data_cv.
    :param high_cut: upper cut; also forwarded to nn_train as last_x_articles.
    :param seed: seed for the CV split and for sampling the evaluation sets.
    :param epsilon: forwarded to training via model_params["epsilon"].
    :param lr: only used in the file-name suffix `params`; the effective
        learning rate is the hard-coded model_params['lr'].
    """
    N = 300  # number of predictions
    limit = 50000000  # number of samples to look at
    first_time = True  # True -> (re)run the slow embedding preprocessing
    folder_a = os.getenv('DATA_FOLDER', 'processed')
    folder_b = os.getenv('DATA_FOLDER', 'processed')
    client_num = 0  # which client to evaluate on (index into `clients` below)
    # hyper-parameters shared by the global and all local training runs
    model_params = {
        'lr': 0.0001,
        'batch_size': 400,
        'epochs': 1,  # one local epoch per communication round
        'alpha': 1,
        'layers': [128, 16],
        "stop_on_metric": True,
        'dropout': 0.5,
        'reg': 0.0001,
        'interval': 1,
        'checkpoint_interval': 1,
        'eval_batch': False,
        "normalize": 0,
        "take_target_out": False,
        "early_stopping": 4,
        "loss": "BPR",
        "optimizer": "ADAM",
        "workers": 1,
        "epsilon": epsilon,
        'rounds': 50  # maximum number of federated communication rounds
    }

    #load data
    # NOTE(review): client a loads from folder_b and client b from folder_a.
    # Both resolve to the same DATA_FOLDER here, but confirm the swap is
    # intentional before pointing the two folders at different datasets.
    client_a_user_item_train, client_a_user_item_test, client_a_user_item_validation = load_data_cv(
        folder=folder_b, cut=cut, high_cut=high_cut, seed=seed)
    client_a_user_item_train = client_a_user_item_train.head(limit)
    # keep only test users that also appear in the training set
    client_a_user_item_test = client_a_user_item_test[
        client_a_user_item_test.index.isin(client_a_user_item_train.index)]
    client_a_group_metadata = get_metadata(folder=folder_b)

    client_b_user_item_train, client_b_user_item_test, client_b_user_item_validation = load_data_cv(
        folder=folder_a, cut=cut, high_cut=high_cut, seed=seed)
    client_b_user_item_train = client_b_user_item_train.head(limit)
    client_b_metadata = get_metadata(folder=folder_a)
    client_b_user_item_test = client_b_user_item_test[
        client_b_user_item_test.index.isin(client_b_user_item_train.index)]
    print(f"Data loaded")

    # embedd data
    if first_time:
        preprocessing(client_a_user_item_train,
                      client_a_group_metadata,
                      folder=folder_b,
                      model_params=model_params)
        print("embedded")
        preprocessing(client_b_user_item_train,
                      client_b_metadata,
                      folder=folder_a,
                      model_params=model_params)

    client_b_article_embedding, client_b_user_embedding = load_embedding(
        folder=folder_a)
    client_a_article_embedding, client_a_user_embedding = load_embedding(
        folder=folder_b)

    #dict for federated learning
    clients = [{
        "name": "b",
        "user_item_train": client_b_user_item_train,
        "user_item_test": client_b_user_item_test,
        "user_embedding": client_b_user_embedding,
        "article_embedding": client_b_article_embedding,
    }, {
        "name": "a",
        "user_item_train": client_a_user_item_train,
        "user_item_test": client_a_user_item_test,
        "user_embedding": client_a_user_embedding,
        "article_embedding": client_a_article_embedding,
    }]

    #prepare inital global model
    model_params['train'] = False  # build/initialise only; no training yet
    # file-name suffix identifying this experiment configuration
    params = f"{clients[client_num]['name']}_{cut}_{high_cut}_{seed}_{epsilon}_{lr}"
    global_model, history = nn_train(
        clients[0]['user_item_train'],
        clients[0]['user_item_test'].sample(frac=0.3, random_state=seed + 1),
        user_embedding=clients[0]['user_embedding'],
        article_embedding=[
            clients[0]['article_embedding'], clients[0]['article_embedding']
        ],
        model_params=model_params.copy(),
        new=True,
        model_path=
        f'idea1_models/fl/fl_client_b_client_a_global_model_{params}',
        last_x_articles=high_cut)
    model_params['train'] = True

    # create an empty metrics file per client so later reads/appends work
    for client in clients:
        if not os.path.exists(
                f'results/idea1_models/fl/fl_{client["name"]}_local_model_{params}'
        ):
            os.makedirs(
                f'results/idea1_models/fl/fl_{client["name"]}_local_model_{params}'
            )
        open(
            f'results/idea1_models/fl/fl_{client["name"]}_local_model_{params}/metrics',
            "w").close()

    # commence global training loop
    for comm_round in range(model_params['rounds']):
        model_params['round'] = comm_round
        # get the global model's weights - will serve as the initial weights for all local models
        global_weights = global_model.get_weights()
        print(comm_round)
        # initial list to collect local model weights
        local_weight_updates_list = list()
        # loop through each client and create and train new local model
        for client in clients:
            local_model, history = nn_train(
                client['user_item_train'],
                client['user_item_test'].sample(frac=0.3,
                                                random_state=seed + 1),
                user_embedding=client['user_embedding'],
                article_embedding=[
                    client['article_embedding'], client['article_embedding']
                ],
                model_params=model_params.copy(),
                new=False,
                retrain=True,
                new_model_path=
                f'idea1_models/fl/fl_{client["name"]}_local_model_{params}',
                model_path=
                f'idea1_models/fl/fl_client_b_client_a_global_model_{params}',
                last_x_articles=high_cut)
            # local update = local weights relative to the global starting point
            updates = get_weight_updates(local_model.get_weights(),
                                         global_weights)
            local_weight_updates_list.append(updates)
            K.clear_session()  # free Keras graph memory between local trainings

        # stop if metrics do not increase for the target client
        df = pd.read_csv(
            f'results/idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}/metrics'
        )
        # assumes the metrics file has exactly four columns - TODO confirm
        df.columns = ['name', f'metric', f'value', f'std']
        dfgroup = df.groupby('metric').tail(10)
        # presumably 7 metric rows are written per round - verify against the
        # code that writes the metrics file
        dfgroup['epoch'] = (dfgroup.index / 7).astype(int)
        # early-stopping signal 1: Recall@10 (note: the `ndcg100` name is
        # reused for both metrics)
        ndcg100 = dfgroup[dfgroup['metric'].str.contains('Recall@10')][[
            'value', 'epoch'
        ]]
        top = ndcg100.sort_values('value').tail(1)
        # stop2: no value in the last `early_stopping` rounds beat the best
        stop2 = len(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0, 0]]) == 0
        print(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0, 0]])
        # early-stopping signal 2: NDCG@100
        ndcg100 = dfgroup[dfgroup['metric'].str.contains('NDCG@100')][[
            'value', 'epoch'
        ]]
        top = ndcg100.sort_values('value').tail(1)
        print(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0, 0]])
        stop1 = len(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0, 0]]) == 0
        if stop1 and stop2 and comm_round != 0:
            # both metrics stagnated: restore the checkpoint of the best
            # NDCG@100 epoch and persist it as the final local model
            epoch = top['epoch'].iloc[0]
            print(epoch)
            model = tf.keras.models.load_model(
                f'idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}_epochs/{epoch}.h5',
            )
            model.save(
                f'idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}.h5',
            )
            break

        # update global model
        rate_updates = 1  # scaling factor applied to the averaged updates
        average_weights = get_average_updates(local_weight_updates_list,
                                              global_weights, rate_updates)
        global_model.set_weights(average_weights)
        if not os.path.exists(
                f'idea1_models/fl/fl_client_b_client_a_global_model_{params}/epochs/'
        ):
            os.makedirs(
                f'idea1_models/fl/fl_client_b_client_a_global_model_{params}/epochs/'
            )
        # checkpoint the global model: latest copy plus one file per round
        global_model.save(
            f'idea1_models/fl/fl_client_b_client_a_global_model_{params}.h5')
        global_model.save(
            f'idea1_models/fl/fl_client_b_client_a_global_model_{params}/epochs/{comm_round}.h5'
        )

    # evaluate
    model_params['train'] = False  # reload the trained local model, no training
    model, history = nn_train(
        clients[client_num]['user_item_train'],
        clients[client_num]['user_item_test'].sample(frac=0.3,
                                                     random_state=seed + 1),
        user_embedding=clients[client_num]['user_embedding'],
        article_embedding=[
            clients[client_num]['article_embedding'],
            clients[client_num]['article_embedding']
        ],
        model_params=model_params,
        new=False,
        model_path=
        f'idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}',
        last_x_articles=high_cut)
    user_item_test = clients[client_num]['user_item_test']
    user_embedding = clients[client_num]['user_embedding']
    article_embedding = clients[client_num]['article_embedding']
    user_item_train = clients[client_num]['user_item_train']

    # metrics over all users
    user_item_test_sample = user_item_test
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample.index)]
    idea1 = evaluate(pred.sort_index(),
                     user_item_test_sample.loc[pred.index].sort_index(),
                     limit=limit,
                     experiment_name=f'result_{params}_all_users.results')

    # metrics over the quarter of users with the fewest test interactions
    user_item_test_sample_low = user_item_test.loc[
        user_item_test.str.len().sort_values().head(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_low.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_low.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_low.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_low.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=f'result_{params}_low_click_users.results')

    # metrics over the quarter of users with the most test interactions
    user_item_test_sample_high = user_item_test.loc[
        user_item_test.str.len().sort_values().tail(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_high.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_high.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_high.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_high.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=f'result_{params}_high_click_users.results')
def run(cut, high_cut, seed, name):
    """
    Merges the data and train on individual or merged data. We take the same data for client a and client b.
    We output the metrics for three usergroups: all users, low click users, high click users.

    :param cut: lower user-activity cut handed to load_data_cv.
    :param high_cut: upper cut; also forwarded to nn_train as last_x_articles.
    :param seed: seed for the CV split and for sampling the evaluation sets.
    :param name: which setup to train/evaluate - one of 'client_a',
        'client_b', 'client_a_both', 'client_b_both'.
    """
    N = 300  # number of predictions
    limit = 10000000  # number of samples to look at
    first_time = True  # True -> (re)run the slow embedding preprocessing
    folder_a = os.getenv('DATA_FOLDER', 'processed')
    folder_b = os.getenv('DATA_FOLDER', 'processed')

    # load data
    client_a_user_item_train, client_a_user_item_test, client_a_user_item_validation = load_data_cv(
        folder=folder_a, cut=cut, high_cut=high_cut, seed=seed)
    client_a_metadata = get_metadata(folder=folder_a)
    # keep only test users that also appear in the training set
    client_a_user_item_test = client_a_user_item_test[
        client_a_user_item_test.index.isin(client_a_user_item_train.index)]

    client_b_user_item_train, client_b_user_item_test, client_b_user_item_validation = load_data_cv(
        folder=folder_b, cut=cut, high_cut=high_cut, seed=seed)
    client_b_metadata = get_metadata(folder=folder_b)
    client_b_user_item_test = client_b_user_item_test[
        client_b_user_item_test.index.isin(client_b_user_item_train.index)]

    # add suffix to article/user_id. Only needed if ids overlap
    client_a_user_item_train = client_a_user_item_train.apply(
        lambda x: [str(article_id) + "_a" for article_id in x])
    client_a_user_item_test = client_a_user_item_test.apply(
        lambda x: [str(article_id) + "_a" for article_id in x])
    client_a_metadata['resource_id'] = client_a_metadata['resource_id'].astype(
        str) + "_a"
    client_a_user_item_train.index = client_a_user_item_train.index.astype(
        str) + "_a"
    client_a_user_item_test.index = client_a_user_item_test.index.astype(
        str) + "_a"
    client_b_user_item_train = client_b_user_item_train.apply(
        lambda x: [str(article_id) + "_b" for article_id in x])
    client_b_user_item_test = client_b_user_item_test.apply(
        lambda x: [str(article_id) + "_b" for article_id in x])
    client_b_metadata['resource_id'] = client_b_metadata['resource_id'].astype(
        str) + "_b"
    client_b_user_item_train.index = client_b_user_item_train.index.astype(
        str) + "_b"
    client_b_user_item_test.index = client_b_user_item_test.index.astype(
        str) + "_b"

    # hyper-parameters shared by preprocessing and training
    model_params = {
        'lr': 0.0001,
        'batch_size': 400,
        'epochs': 30,
        'alpha': 1,
        'layers': [1024, 1024, 16],
        'dropout': 0.5,
        'reg': 0.0001,
        'interval': 1,
        'checkpoint_interval': 1,
        'eval_batch': False,
        "normalize": 0,
        "take_target_out": False,
        "early_stopping": 4,
        "stop_on_metric": True,
        "loss": "BPR",
        "optimizer": "ADAM",
        "workers": 3,
    }
    print(f"Data loaded")

    # per-client sub-folders for the embeddings
    folder_a = f"{folder_a}/a"
    folder_b = f"{folder_b}/b"

    # embedd data
    if first_time:
        preprocessing(client_a_user_item_train,
                      client_a_metadata,
                      folder=folder_a,
                      model_params=model_params)
        print("embedded")
        preprocessing(client_b_user_item_train,
                      client_b_metadata,
                      folder=folder_b,
                      model_params=model_params)

    # get embedding
    client_b_article_embedding, client_b_user_embedding = load_embedding(
        folder=folder_b)
    client_a_article_embedding, client_a_user_embedding = load_embedding(
        folder=folder_a)

    #define data to train on
    both_user_item_train = pd.concat(
        [client_b_user_item_train, client_a_user_item_train])
    both_user_item_test = pd.concat(
        [client_b_user_item_test, client_a_user_item_test])
    both_user_embedding = pd.concat(
        [client_b_user_embedding, client_a_user_embedding])
    both_article_embedding = pd.concat(
        [client_b_article_embedding, client_a_article_embedding])

    # select the train/test/embedding combination for the requested setup;
    # the *_both setups train on the merged data but evaluate on one client
    if name == 'client_a':
        user_item_train, user_item_test, user_embedding, article_embedding_train, new = client_a_user_item_train, client_a_user_item_test, client_a_user_embedding, client_a_article_embedding, True
        article_embedding = article_embedding_train
    if name == 'client_b':
        user_item_train, user_item_test, user_embedding, article_embedding_train, new = client_b_user_item_train, client_b_user_item_test, client_b_user_embedding, client_b_article_embedding, True
        article_embedding = article_embedding_train
    if name == 'client_a_both':
        user_item_train, user_item_test, user_embedding, article_embedding_train, new = both_user_item_train, client_a_user_item_test, both_user_embedding, both_article_embedding, True
        article_embedding = client_a_article_embedding
    if name == 'client_b_both':
        user_item_train, user_item_test, user_embedding, article_embedding_train, new = both_user_item_train, client_b_user_item_test, both_user_embedding, both_article_embedding, True
        article_embedding = client_b_article_embedding
    # NOTE(review): an unrecognised `name` falls through all branches and
    # raises NameError at the nn_train call below

    #train
    model, history = nn_train(
        user_item_train,
        user_item_test.sample(frac=0.3, random_state=seed + 1),
        user_embedding=user_embedding,
        article_embedding=[article_embedding_train, article_embedding],
        new=new,
        model_params=model_params,
        model_path=f'idea1_models/{name}_{cut}_{high_cut}_{seed}',
        last_x_articles=high_cut)

    # metrics over all users
    user_item_test_sample = user_item_test
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=f'result_{name}_{new}_{cut}_{high_cut}_{seed}.results')

    # metrics over the quarter of users with the fewest test interactions
    user_item_test_sample_low = user_item_test.loc[
        user_item_test.str.len().sort_values().head(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_low.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_low.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_low.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_low.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=
        f'result_{name}_{new}_{cut}_{high_cut}_{seed}_low_click_users.results')

    # metrics over the quarter of users with the most test interactions
    user_item_test_sample_high = user_item_test.loc[
        user_item_test.str.len().sort_values().tail(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_high.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_high.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_high.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_high.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=
        f'result_{name}_{new}_{cut}_{high_cut}_{seed}_high_click_users.results'
    )