Example #1
# imports assumed by this snippet (load_data, get_metadata, print_data,
# split_data, and print_splitted_data are project-local helpers)
from numpy import reshape
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler


def prepare(dataset_path):
    # get initial data and metadata
    df = load_data(dataset_path)
    metadata = get_metadata(df)
    print_data(df, metadata)
    # extract the pollution values as a NumPy array
    val = df['pollution'].values
    # reshape to a 2D array with a single feature column
    val = reshape(val, (-1, 1))
    # normalize data
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_val = scaler.fit_transform(val)
    # split data into train and test
    data = split_data(scaled_val)
    print_splitted_data(data)
    # split data into input and output
    data['train_in'], data['train_out'] = split_data(data['train'], False)
    data['test_in'], data['test_out'] = split_data(data['test'], False)
    print_splitted_data(data, True)
    print('Data Train Input')
    print(DataFrame(data['train_in']).head())
    print('Data Train Output')
    print(DataFrame(data['train_out']).head())
    return scaler, data 
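
The function returns the fitted MinMaxScaler alongside the data so that later predictions can be mapped back to the original scale. A minimal round-trip sketch of that pattern (the readings below are made up for illustration):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# toy pollution readings, reshaped to the (n_samples, 1) shape the scaler expects
val = np.array([120.0, 80.0, 200.0, 50.0]).reshape(-1, 1)

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_val = scaler.fit_transform(val)  # every value now lies in [0, 1]

# after training/prediction in the scaled space, invert back to raw units
restored = scaler.inverse_transform(scaled_val)
assert np.allclose(restored, val)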
Example #2
    pred = pred.groupby('user_ix').apply(lambda x: list(x['article']))
    pred = pd.DataFrame(pred, columns=['predictions'])
    return pred, pred_raw


if __name__ == "__main__":
    from preprocessing import load_data, get_metadata, load_data_vertical

    N = 50  # number of predictions
    limit = 5000  # number of samples to look at
    first_time = True
    folder = os.getenv('DATA_FOLDER', 'processed')

    user_item_train, user_item_test, user_item_validation = load_data(
        folder=folder, cut=40)
    metadata = get_metadata(folder=folder, usecols=['resource_id',
                                                    'text'])  # slow
    user_item_train = user_item_train.head(limit)
    user_item_test = user_item_test.head(limit)

    print(f"Data loaded")
    model_params = {
        'lr': 0.0001,
        'batch_size': 400,
        'epochs': 4,
        'alpha': 1,
        'layers': [128, 32, 8],
        'dropout': 0.5,
    }
    if first_time:
        preprocessing(user_item_train,
                      metadata,
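
The fragment above collapses a row-per-(user, article) prediction table into one list of articles per user with a groupby/apply. A self-contained sketch of that idiom (the user and article ids are made up for illustration):

import pandas as pd

# hypothetical raw predictions: one row per (user, article) pair, best-ranked first
pred_raw = pd.DataFrame({
    'user_ix': [1, 1, 2, 2, 2],
    'article': ['a', 'b', 'c', 'a', 'd'],
})

# collect each user's articles into a single list, as in the fragment above
pred = pred_raw.groupby('user_ix').apply(lambda x: list(x['article']))
pred = pd.DataFrame(pred, columns=['predictions'])
# pred now has one row per user_ix holding that user's list of predicted articles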
Example #3
# relies on module-level imports not shown in this listing: os, pandas as pd,
# tensorflow as tf, tensorflow.keras.backend as K, and the project helpers
# (load_data_cv, get_metadata, preprocessing, load_embedding, nn_train,
# get_weight_updates, get_average_updates, prediction, evaluate)
def run(cut, high_cut, seed, epsilon, lr):
    """
    Trains an idea1 model in a federated way. We take the same data for client a and
    client b and output the metrics for three user groups: all users, low-click users,
    and high-click users.
    """
    N = 300  # number of predictions
    limit = 50000000  # number of samples to look at
    first_time = True
    folder_a = os.getenv('DATA_FOLDER', 'processed')
    folder_b = os.getenv('DATA_FOLDER', 'processed')
    client_num = 0  # which client to evaluate on
    model_params = {
        'lr': 0.0001,
        'batch_size': 400,
        'epochs': 1,
        'alpha': 1,
        'layers': [128, 16],
        "stop_on_metric": True,
        'dropout': 0.5,
        'reg': 0.0001,
        'interval': 1,
        'checkpoint_interval': 1,
        'eval_batch': False,
        "normalize": 0,
        "take_target_out": False,
        "early_stopping": 4,
        "loss": "BPR",
        "optimizer": "ADAM",
        "workers": 1,
        "epsilon": epsilon,
        'rounds': 50
    }

    # load data
    client_a_user_item_train, client_a_user_item_test, client_a_user_item_validation = load_data_cv(
        folder=folder_b, cut=cut, high_cut=high_cut, seed=seed)
    client_a_user_item_train = client_a_user_item_train.head(limit)
    client_a_user_item_test = client_a_user_item_test[
        client_a_user_item_test.index.isin(client_a_user_item_train.index)]
    client_a_group_metadata = get_metadata(folder=folder_b)

    client_b_user_item_train, client_b_user_item_test, client_b_user_item_validation = load_data_cv(
        folder=folder_a, cut=cut, high_cut=high_cut, seed=seed)
    client_b_user_item_train = client_b_user_item_train.head(limit)
    client_b_metadata = get_metadata(folder=folder_a)
    client_b_user_item_test = client_b_user_item_test[
        client_b_user_item_test.index.isin(client_b_user_item_train.index)]

    print(f"Data loaded")
    # embedd data
    if first_time:
        preprocessing(client_a_user_item_train,
                      client_a_group_metadata,
                      folder=folder_b,
                      model_params=model_params)
        print("embedded")
        preprocessing(client_b_user_item_train,
                      client_b_metadata,
                      folder=folder_a,
                      model_params=model_params)
    client_b_article_embedding, client_b_user_embedding = load_embedding(
        folder=folder_a)
    client_a_article_embedding, client_a_user_embedding = load_embedding(
        folder=folder_b)

    # per-client dicts for federated learning
    clients = [{
        "name": "b",
        "user_item_train": client_b_user_item_train,
        "user_item_test": client_b_user_item_test,
        "user_embedding": client_b_user_embedding,
        "article_embedding": client_b_article_embedding,
    }, {
        "name": "a",
        "user_item_train": client_a_user_item_train,
        "user_item_test": client_a_user_item_test,
        "user_embedding": client_a_user_embedding,
        "article_embedding": client_a_article_embedding,
    }]

    # prepare the initial global model
    model_params['train'] = False
    params = f"{clients[client_num]['name']}_{cut}_{high_cut}_{seed}_{epsilon}_{lr}"

    global_model, history = nn_train(
        clients[0]['user_item_train'],
        clients[0]['user_item_test'].sample(frac=0.3, random_state=seed + 1),
        user_embedding=clients[0]['user_embedding'],
        article_embedding=[
            clients[0]['article_embedding'], clients[0]['article_embedding']
        ],
        model_params=model_params.copy(),
        new=True,
        model_path=
        f'idea1_models/fl/fl_client_b_client_a_global_model_{params}',
        last_x_articles=high_cut)

    model_params['train'] = True
    for client in clients:
        if not os.path.exists(
                f'results/idea1_models/fl/fl_{client["name"]}_local_model_{params}'
        ):
            os.makedirs(
                f'results/idea1_models/fl/fl_{client["name"]}_local_model_{params}'
            )
        open(
            f'results/idea1_models/fl/fl_{client["name"]}_local_model_{params}/metrics',
            "w").close()

    # commence global training loop
    for comm_round in range(model_params['rounds']):
        model_params['round'] = comm_round
        # get the global model's weights - will serve as the initial weights for all local models
        global_weights = global_model.get_weights()
        print(comm_round)
        # list to collect each client's local weight updates
        local_weight_updates_list = []

        # loop through each client and create and train new local model
        for client in clients:
            local_model, history = nn_train(
                client['user_item_train'],
                client['user_item_test'].sample(frac=0.3,
                                                random_state=seed + 1),
                user_embedding=client['user_embedding'],
                article_embedding=[
                    client['article_embedding'], client['article_embedding']
                ],
                model_params=model_params.copy(),
                new=False,
                retrain=True,
                new_model_path=
                f'idea1_models/fl/fl_{client["name"]}_local_model_{params}',
                model_path=
                f'idea1_models/fl/fl_client_b_client_a_global_model_{params}',
                last_x_articles=high_cut)

            updates = get_weight_updates(local_model.get_weights(),
                                         global_weights)
            local_weight_updates_list.append(updates)

            K.clear_session()

        # stop if metrics do not increase for the target client
        df = pd.read_csv(
            f'results/idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}/metrics'
        )
        df.columns = ['name', 'metric', 'value', 'std']
        dfgroup = df.groupby('metric').tail(10)
        # map the row index back to an epoch (seven metric rows per epoch)
        dfgroup['epoch'] = (dfgroup.index / 7).astype(int)

        # early-stopping check on Recall@10 (the ndcg100 name is reused from the
        # NDCG@100 check below)
        ndcg100 = dfgroup[dfgroup['metric'].str.contains('Recall@10')][[
            'value', 'epoch'
        ]]
        top = ndcg100.sort_values('value').tail(1)
        stop2 = len(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0,
                                                                    0]]) == 0
        print(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0, 0]])

        # same early-stopping check on NDCG@100
        ndcg100 = dfgroup[dfgroup['metric'].str.contains('NDCG@100')][[
            'value', 'epoch'
        ]]
        top = ndcg100.sort_values('value').tail(1)
        print(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0, 0]])
        stop1 = len(
            ndcg100.tail(model_params["early_stopping"])[ndcg100.tail(
                model_params["early_stopping"])['value'] > top.iloc[0,
                                                                    0]]) == 0

        if stop1 and stop2 and comm_round != 0:
            epoch = top['epoch'].iloc[0]
            print(epoch)
            model = tf.keras.models.load_model(
                f'idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}_epochs/{epoch}.h5',
            )
            model.save(
                f'idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}.h5',
            )
            break

        # update the global model with the averaged client updates
        rate_updates = 1  # scaling factor passed to get_average_updates
        average_weights = get_average_updates(local_weight_updates_list,
                                              global_weights, rate_updates)
        global_model.set_weights(average_weights)
        if not os.path.exists(
                f'idea1_models/fl/fl_client_b_client_a_global_model_{params}/epochs/'
        ):
            os.makedirs(
                f'idea1_models/fl/fl_client_b_client_a_global_model_{params}/epochs/'
            )
        global_model.save(
            f'idea1_models/fl/fl_client_b_client_a_global_model_{params}.h5')
        global_model.save(
            f'idea1_models/fl/fl_client_b_client_a_global_model_{params}/epochs/{comm_round}.h5'
        )

    # evaluate
    model_params['train'] = False
    model, history = nn_train(
        clients[client_num]['user_item_train'],
        clients[client_num]['user_item_test'].sample(frac=0.3,
                                                     random_state=seed + 1),
        user_embedding=clients[client_num]['user_embedding'],
        article_embedding=[
            clients[client_num]['article_embedding'],
            clients[client_num]['article_embedding']
        ],
        model_params=model_params,
        new=False,
        model_path=
        f'idea1_models/fl/fl_{clients[client_num]["name"]}_local_model_{params}',
        last_x_articles=high_cut)
    user_item_test = clients[client_num]['user_item_test']
    user_embedding = clients[client_num]['user_embedding']
    article_embedding = clients[client_num]['article_embedding']
    user_item_train = clients[client_num]['user_item_train']
    user_item_test_sample = user_item_test

    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample.index)]
    idea1 = evaluate(pred.sort_index(),
                     user_item_test_sample.loc[pred.index].sort_index(),
                     limit=limit,
                     experiment_name=f'result_{params}_all_users.results')

    user_item_test_sample_low = user_item_test.loc[
        user_item_test.str.len().sort_values().head(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_low.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_low.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_low.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_low.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=f'result_{params}_low_click_users.results')

    user_item_test_sample_high = user_item_test.loc[
        user_item_test.str.len().sort_values().tail(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_high.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_high.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_high.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_high.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=f'result_{params}_high_click_users.results')
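
The communication loop above follows a FedAvg-style scheme: every round, each client fine-tunes a copy of the global model, the per-layer weight deltas are collected via get_weight_updates, averaged with get_average_updates, and applied back to the global model. Both helpers are project-local and their implementations are not shown here; a minimal sketch of what they might look like, assuming plain delta averaging over Keras weight lists:

import numpy as np

def get_weight_updates(local_weights, global_weights):
    # per-layer delta between a client's weights and the global weights
    return [local_w - global_w
            for local_w, global_w in zip(local_weights, global_weights)]

def get_average_updates(updates_list, global_weights, rate_updates=1.0):
    # average the clients' deltas layer by layer and apply them,
    # scaled by rate_updates, on top of the current global weights
    new_weights = []
    for layer_ix, global_w in enumerate(global_weights):
        avg_update = np.mean(
            [updates[layer_ix] for updates in updates_list], axis=0)
        new_weights.append(global_w + rate_updates * avg_update)
    return new_weights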
Example #4
# relies on module-level imports not shown in this listing: os, pandas as pd,
# and the project helpers (load_data_cv, get_metadata, preprocessing,
# load_embedding, nn_train, prediction, evaluate)
def run(cut, high_cut, seed, name):
    """
    Merges the data and trains on individual or merged data. We take the same data for
    client a and client b and output the metrics for three user groups: all users,
    low-click users, and high-click users.
    """
    N = 300  # number of predictions
    limit = 10000000  # number of samples to look at
    first_time = True
    folder_a = os.getenv('DATA_FOLDER', 'processed')
    folder_b = os.getenv('DATA_FOLDER', 'processed')
    # load data
    client_a_user_item_train, client_a_user_item_test, client_a_user_item_validation = load_data_cv(
        folder=folder_a, cut=cut, high_cut=high_cut, seed=seed)
    client_a_metadata = get_metadata(folder=folder_a)
    client_a_user_item_test = client_a_user_item_test[
        client_a_user_item_test.index.isin(client_a_user_item_train.index)]

    client_b_user_item_train, client_b_user_item_test, client_b_user_item_validation = load_data_cv(
        folder=folder_b, cut=cut, high_cut=high_cut, seed=seed)
    client_b_metadata = get_metadata(folder=folder_b)
    client_b_user_item_test = client_b_user_item_test[
        client_b_user_item_test.index.isin(client_b_user_item_train.index)]

    # add a suffix to the article/user ids (only needed if the ids overlap)
    client_a_user_item_train = client_a_user_item_train.apply(
        lambda x: [str(article_id) + "_a" for article_id in x])
    client_a_user_item_test = client_a_user_item_test.apply(
        lambda x: [str(article_id) + "_a" for article_id in x])
    client_a_metadata['resource_id'] = client_a_metadata['resource_id'].astype(
        str) + "_a"
    client_a_user_item_train.index = client_a_user_item_train.index.astype(
        str) + "_a"
    client_a_user_item_test.index = client_a_user_item_test.index.astype(
        str) + "_a"

    client_b_user_item_train = client_b_user_item_train.apply(
        lambda x: [str(article_id) + "_b" for article_id in x])
    client_b_user_item_test = client_b_user_item_test.apply(
        lambda x: [str(article_id) + "_b" for article_id in x])
    client_b_metadata['resource_id'] = client_b_metadata['resource_id'].astype(
        str) + "_b"
    client_b_user_item_train.index = client_b_user_item_train.index.astype(
        str) + "_b"
    client_b_user_item_test.index = client_b_user_item_test.index.astype(
        str) + "_b"

    model_params = {
        'lr': 0.0001,
        'batch_size': 400,
        'epochs': 30,
        'alpha': 1,
        'layers': [1024, 1024, 16],
        'dropout': 0.5,
        'reg': 0.0001,
        'interval': 1,
        'checkpoint_interval': 1,
        'eval_batch': False,
        "normalize": 0,
        "take_target_out": False,
        "early_stopping": 4,
        "stop_on_metric": True,
        "loss": "BPR",
        "optimizer": "ADAM",
        "workers": 3,
    }
    print(f"Data loaded")

    folder_a = f"{folder_a}/a"
    folder_b = f"{folder_b}/b"
    # embed the data
    if first_time:
        preprocessing(client_a_user_item_train,
                      client_a_metadata,
                      folder=folder_a,
                      model_params=model_params)
        print("embedded")
        preprocessing(client_b_user_item_train,
                      client_b_metadata,
                      folder=folder_b,
                      model_params=model_params)

    # get embedding
    client_b_article_embedding, client_b_user_embedding = load_embedding(
        folder=folder_b)
    client_a_article_embedding, client_a_user_embedding = load_embedding(
        folder=folder_a)

    # define the data to train on
    both_user_item_train = pd.concat(
        [client_b_user_item_train, client_a_user_item_train])
    both_user_item_test = pd.concat(
        [client_b_user_item_test, client_a_user_item_test])
    both_user_embedding = pd.concat(
        [client_b_user_embedding, client_a_user_embedding])
    both_article_embedding = pd.concat(
        [client_b_article_embedding, client_a_article_embedding])

    if name == 'client_a':
        user_item_train, user_item_test = client_a_user_item_train, client_a_user_item_test
        user_embedding, article_embedding_train, new = client_a_user_embedding, client_a_article_embedding, True
        article_embedding = article_embedding_train
    elif name == 'client_b':
        user_item_train, user_item_test = client_b_user_item_train, client_b_user_item_test
        user_embedding, article_embedding_train, new = client_b_user_embedding, client_b_article_embedding, True
        article_embedding = article_embedding_train
    elif name == 'client_a_both':
        user_item_train, user_item_test = both_user_item_train, client_a_user_item_test
        user_embedding, article_embedding_train, new = both_user_embedding, both_article_embedding, True
        article_embedding = client_a_article_embedding
    elif name == 'client_b_both':
        user_item_train, user_item_test = both_user_item_train, client_b_user_item_test
        user_embedding, article_embedding_train, new = both_user_embedding, both_article_embedding, True
        article_embedding = client_b_article_embedding

    # train

    model, history = nn_train(
        user_item_train,
        user_item_test.sample(frac=0.3, random_state=seed + 1),
        user_embedding=user_embedding,
        article_embedding=[article_embedding_train, article_embedding],
        new=new,
        model_params=model_params,
        model_path=f'idea1_models/{name}_{cut}_{high_cut}_{seed}',
        last_x_articles=high_cut)

    user_item_test_sample = user_item_test
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=f'result_{name}_{new}_{cut}_{high_cut}_{seed}.results')

    user_item_test_sample_low = user_item_test.loc[
        user_item_test.str.len().sort_values().head(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_low.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_low.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_low.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_low.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=
        f'result_{name}_{new}_{cut}_{high_cut}_{seed}_low_click_users.results')

    user_item_test_sample_high = user_item_test.loc[
        user_item_test.str.len().sort_values().tail(
            int(len(user_item_test) / 4)).index]
    pred, pred_raw = prediction(
        model,
        user_embedding.loc[user_item_test_sample_high.index],
        article_embedding,
        user_item_train.loc[user_item_test_sample_high.index],
        N,
        model_params=model_params)
    pred = pred[pred.index.isin(user_item_test_sample_high.index)]
    idea1 = evaluate(
        pred.sort_index(),
        user_item_test_sample_high.loc[pred.index].sort_index(),
        limit=limit,
        experiment_name=
        f'result_{name}_{new}_{cut}_{high_cut}_{seed}_high_click_users.results'
    )
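
Both run functions build the low- and high-click user groups by sorting the test users by how many articles they interacted with and taking the bottom and top quartiles. A self-contained sketch of that selection (the toy Series below stands in for user_item_test):

import pandas as pd

# hypothetical user -> clicked-articles mapping, shaped like user_item_test
user_item_test = pd.Series({
    10: ['a'],
    11: ['a', 'b', 'c', 'd'],
    12: ['a', 'b'],
    13: ['a', 'b', 'c'],
})

quartile = int(len(user_item_test) / 4)
by_length = user_item_test.str.len().sort_values()  # .str.len() also works on lists

low_click_users = user_item_test.loc[by_length.head(quartile).index]
high_click_users = user_item_test.loc[by_length.tail(quartile).index]
print(low_click_users)   # the quartile of users with the fewest clicks
print(high_click_users)  # the quartile of users with the most clicks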