def save_adult_party_data(nb_dp_per_party, should_stratify, party_folder):
    """
    Saves Adult party data

    :param nb_dp_per_party: the number of data points each party should have
    :type nb_dp_per_party: `list[int]`
    :param should_stratify: True if data should be assigned proportional to source class distributions
    :type should_stratify: `bool`
    :param party_folder: folder to save party data
    :type party_folder: `str`
    """
    dataset_path = os.path.join("examples", "datasets")
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(dataset_path, exist_ok=True)

    x_train = load_adult(download_dir=dataset_path)
    num_train = len(x_train.index)
    y_train = x_train['class'].values.tolist()
    labels, counts = np.unique(y_train, return_counts=True)

    if should_stratify:
        # Sample each row with probability proportional to its class frequency.
        strat_col = y_train
        groups, counts = np.unique(strat_col, return_counts=True)
        # to use custom proportions, replace probs with a dictionary where
        # key:value pairs are label:proportion
        probs = {
            group: counts[np.where(groups == group)[0][0]] / float(num_train)
            for group in groups
        }
        p_list = np.array([probs[strat_col[idx]] for idx in range(num_train)])
    else:
        # Uniform sampling: every row gets equal probability.
        probs = {label: 1.0 / num_train for label in labels}
        p_list = np.array([probs[y_train[idx]] for idx in range(num_train)])
    # Renormalize so the probabilities sum exactly to 1 for np.random.choice.
    p_list /= np.sum(p_list)

    for i, dp in enumerate(nb_dp_per_party):
        # Sample (with replacement) dp row indices for party i
        indices = np.random.choice(num_train, dp, p=p_list).tolist()
        # Use indices for data/classification subset
        x_train_pi = x_train.iloc[indices]

        name_file = os.path.join(party_folder, 'data_party' + str(i) + '.csv')
        # BUG FIX: the original first wrote the file with csv.writer via
        # writer.writerows(x_train_pi) — iterating a DataFrame yields its
        # COLUMN NAMES, so that produced garbage rows — and then immediately
        # overwrote the same file with to_csv. Only the to_csv call is needed.
        x_train_pi.to_csv(path_or_buf=name_file, index=None)

    print('Finished! :) Data saved in', party_folder)
def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
    """
    Builds the decision-tree model spec for the given dataset, saves it as
    JSON under folder_configs, and returns the FL model config dict.

    :param folder_configs: folder in which to write the model spec JSON
    :type folder_configs: `str`
    :param dataset: dataset name; 'adult' and 'nursery' are supported
    :type dataset: `str`
    :param is_agg: unused; kept for interface compatibility
    :type is_agg: `bool`
    :param party_id: unused; kept for interface compatibility
    :type party_id: `int`
    :return: model config with 'name', 'path' and 'spec' keys
    :rtype: `dict`
    :raises ValueError: if dataset is not a supported dataset name
    """
    if dataset == 'adult':
        loaded_data = load_adult()
        # preprocess the dataset first before generating the model spec
        dh = AdultDTDataHandler()
        loaded_data = dh.preprocess(loaded_data)
    elif dataset == 'nursery':
        loaded_data = load_nursery()
    else:
        # BUG FIX: the original fell through with loaded_data unbound for any
        # other dataset name, crashing below with an opaque NameError.
        raise ValueError('Unsupported dataset: ' + str(dataset))

    spec = dict()
    # All columns except the trailing 'class' column are features.
    spec['list_of_features'] = list(range(loaded_data.shape[1] - 1))

    # Collect the categorical levels of every non-class column.
    # NOTE(review): assumes all feature columns have pandas 'category' dtype
    # (.cat accessor) — holds after the data handlers' preprocessing.
    feature_values = list()
    for feature in range(loaded_data.shape[1]):
        if loaded_data.columns[feature] != 'class':
            new_feature = loaded_data[loaded_data.columns[feature]].cat.categories
            feature_values.append(new_feature.tolist())
    spec['feature_values'] = feature_values

    list_of_labels = loaded_data['class'].cat.categories
    spec['list_of_labels'] = list_of_labels.tolist()

    # Persist the spec so parties/aggregator can load it from disk.
    f_spec = os.path.join(folder_configs, 'dt_model_spec.json')
    with open(f_spec, 'w') as f:
        json.dump(spec, f)

    spec = {
        'model_name': 'decision-tree',
        'model_definition': os.path.join(folder_configs, 'dt_model_spec.json')
    }
    model = {
        'name': 'DTFLModel',
        'path': 'ibmfl.model.dt_fl_model',
        'spec': spec
    }
    return model