def save_adult_party_data(nb_dp_per_party, should_stratify, party_folder):
    """
    Saves Adult party data

    :param nb_dp_per_party: the number of data points each party should have
    :type nb_dp_per_party: `list[int]`
    :param should_stratify: True if data should be assigned proportional to source class distributions
    :type should_stratify: `bool`
    :param party_folder: folder to save party data
    :type party_folder: `str`
    """
    dataset_path = os.path.join("examples", "datasets")
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    x_train = load_adult(download_dir=dataset_path)
    num_train = len(x_train.index)
    y_train = x_train['class'].values.tolist()
    # `labels` is used for the uniform (non-stratified) branch below.
    labels, counts = np.unique(y_train, return_counts=True)

    if should_stratify:
        # Weight each sample by its class frequency so party subsets mirror
        # the source class distribution.
        strat_col = y_train
        groups, counts = np.unique(strat_col, return_counts=True)
        # to use custom proportions, replace probs with a dictionary where key:value pairs are label:proportion
        probs = {
            group: counts[np.where(groups == group)[0][0]] / float(num_train)
            for group in groups
        }
        p_list = np.array([probs[strat_col[idx]] for idx in range(num_train)])
        p_list /= np.sum(p_list)
    else:
        # Uniform sampling: every data point gets the same probability.
        probs = {label: 1.0 / num_train for label in labels}
        p_list = np.array([probs[y_train[idx]] for idx in range(num_train)])
        p_list /= np.sum(p_list)

    for i, dp in enumerate(nb_dp_per_party):
        # Sample (with replacement) the indices for this party's subset.
        indices = np.random.choice(num_train, dp, p=p_list)
        indices = indices.tolist()
        # Use indices for data/classification subset
        x_train_pi = x_train.iloc[indices]

        name_file = 'data_party' + str(i) + '.csv'
        name_file = os.path.join(party_folder, name_file)
        # BUGFIX: the original wrote the file twice — first with
        # csv.writer.writerows(x_train_pi), which iterates the DataFrame's
        # column *names* and writes garbage, then overwrote it with to_csv.
        # A single to_csv call produces the intended output.
        x_train_pi.to_csv(path_or_buf=name_file, index=False)

    print('Finished! :) Data saved in', party_folder)
# Example #2
def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
    """
    Builds and saves the decision-tree model spec for the given dataset,
    returning the model configuration dictionary.

    :param folder_configs: folder in which to write `dt_model_spec.json`
    :type folder_configs: `str`
    :param dataset: dataset name; one of 'adult' or 'nursery'
    :type dataset: `str`
    :param is_agg: unused; kept for interface compatibility with callers
    :type is_agg: `bool`
    :param party_id: unused; kept for interface compatibility with callers
    :type party_id: `int`
    :return: model configuration referencing the saved spec file
    :rtype: `dict`
    :raises ValueError: if `dataset` is not a supported dataset name
    """
    if dataset == 'adult':
        loaded_data = load_adult()
        # preprocess the dataset first before generating the model spec
        dh = AdultDTDataHandler()
        loaded_data = dh.preprocess(loaded_data)
    elif dataset == 'nursery':
        loaded_data = load_nursery()
    else:
        # BUGFIX: previously an unsupported dataset left `loaded_data`
        # unbound and produced an opaque NameError below.
        raise ValueError(
            "Unsupported dataset: " + str(dataset) +
            "; expected 'adult' or 'nursery'.")

    spec = dict()
    # All columns except the trailing 'class' column are features.
    spec['list_of_features'] = list(range(loaded_data.shape[1] - 1))

    # Collect the categorical levels for each feature column
    # (assumes columns are pandas Categorical dtype).
    feature_values = list()
    for feature in range(loaded_data.shape[1]):
        if loaded_data.columns[feature] != 'class':
            new_feature = loaded_data[loaded_data.columns[feature]
                                      ].cat.categories
            feature_values.append(new_feature.tolist())
    spec['feature_values'] = feature_values

    list_of_labels = loaded_data['class'].cat.categories
    spec['list_of_labels'] = list_of_labels.tolist()

    # Persist the spec so the model definition can reference it by path.
    f_spec = os.path.join(folder_configs, 'dt_model_spec.json')
    with open(f_spec, 'w') as f:
        json.dump(spec, f)

    spec = {
        'model_name': 'decision-tree',
        'model_definition': os.path.join(folder_configs, 'dt_model_spec.json')
    }

    model = {
        'name': 'DTFLModel',
        'path': 'ibmfl.model.dt_fl_model',
        'spec': spec
    }

    return model