def train(self, model: nn.Module, train_data: DataContainer, context: FederatedLearning.Context,
          config: TrainerParams) -> Tuple[any, int]:
    model.to(self.device)
    model.train()
    optimizer = config.get_optimizer()(model)
    criterion = config.get_criterion()
    epoch_loss = []
    # local training loop over config.epochs epochs
    for epoch in range(config.epochs):
        batch_loss = []
        for batch_idx, (x, labels) in enumerate(train_data.batch(config.batch_size)):
            x = x.to(self.device)
            labels = labels.to(self.device)
            optimizer.zero_grad()
            log_probs = model(x)
            loss = criterion(log_probs, labels)
            loss.backward()
            optimizer.step()
            batch_loss.append(loss.item())
        if len(batch_loss) > 0:
            epoch_loss.append(sum(batch_loss) / len(batch_loss))
    # return the updated weights together with the local dataset size
    weights = model.cpu().state_dict()
    return weights, len(train_data)
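
# A minimal sketch of the config interface the trainer above relies on. MinimalTrainerConfig is
# a hypothetical stand-in for illustration only, not the library's TrainerParams: train() only
# calls config.get_optimizer()(model) and config.get_criterion(), and reads config.epochs and
# config.batch_size.
import torch.nn as nn
from torch.optim import SGD


class MinimalTrainerConfig:
    def __init__(self, epochs=5, batch_size=32, lr=0.01):
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr

    def get_optimizer(self):
        # factory: given a model, build its optimizer
        return lambda model: SGD(model.parameters(), lr=self.lr)

    def get_criterion(self):
        return nn.CrossEntropyLoss()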
def distribute_labels(self):
    group = self.Grouper(self.data.x, self.data.y)
    clients_data = Dict()
    for index, label in enumerate(group.groups()):
        x, y = group.get(label)
        clients_data[index] = DataContainer(x, y).as_tensor()
    return clients_data
def distribute_percentage(self, num_clients, percentage=0.8, min_size=10,
                          max_size=100) -> Dict[int, DataContainer]:
    self.data = self.data.as_list()
    clients_data = {}
    xs = self.data.x
    ys = self.data.y
    unique_labels = np.unique(ys)
    for i in range(num_clients):
        client_data_size = random.randint(min_size, max_size)
        selected_label = unique_labels[random.randint(0, len(unique_labels) - 1)]
        client_x = []
        client_y = []
        # fill `percentage` of the client's data with samples of the selected label...
        while len(client_y) / client_data_size < percentage:
            for index, item in enumerate(ys):
                if item == selected_label:
                    client_x.append(xs.pop(index))
                    client_y.append(ys.pop(index))
                    break
        # ...and the remainder with samples of any other label
        while len(client_y) < client_data_size:
            for index, item in enumerate(ys):
                if item != selected_label:
                    client_x.append(xs.pop(index))
                    client_y.append(ys.pop(index))
                    break
        clients_data[i] = DataContainer(client_x, client_y).as_tensor()
    return Dict(clients_data)
def distribute_shards(self, num_clients, shards_per_client, min_size,
                      max_size) -> Dict[int, DataContainer]:
    self.data = self.data.as_numpy()
    clients_data = defaultdict(list)
    grouper = self.Grouper(self.data.x, self.data.y)
    for client_id in range(num_clients):
        client_data_size = random.randint(min_size, max_size)
        selected_shards = grouper.groups(shards_per_client)
        self.log(f'generating data for {client_id}-{selected_shards}')
        client_x = []
        client_y = []
        for shard in selected_shards:
            # take an equal share of the client's budget from each selected shard
            rx, ry = grouper.get(shard, int(client_data_size / len(selected_shards)))
            if len(rx) == 0:
                self.log(f'shard {round(shard)} has no more available data to distribute, skipping...')
            else:
                client_x = rx if len(client_x) == 0 else np.concatenate((client_x, rx))
                client_y = ry if len(client_y) == 0 else np.concatenate((client_y, ry))
        clients_data[client_id] = DataContainer(client_x, client_y).as_tensor()
    return Dict(clients_data)
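
# Usage sketch (hypothetical): the distribute_* methods above are assumed to be methods of a
# distributor object that owns `self.data`; `distributor` below is such an instance and is not
# defined in this file. Only the signatures shown above are used.
#
#   clients = distributor.distribute_shards(num_clients=10, shards_per_client=2,
#                                           min_size=500, max_size=500)
#   for client_id, dc in clients.items():
#       print(client_id, len(dc))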
def collect(self) -> DataContainer:
    collected = super().collect()
    new_x = []
    new_y = []
    for index in range(len(collected.x)):
        new_x.append(lt.word_to_indices(collected.x[index]))
        new_y.append(lt.letter_to_index(collected.y[index]))
    return DataContainer(new_x, new_y)
def collect(self) -> DataContainer:
    cursor = self.db.cursor()
    cursor.execute(self.query)
    xs = []
    ys = []
    for row in cursor.fetchall():
        x, y = self.fetcher(row)
        xs.append(x)
        ys.append(y)
    return DataContainer(xs, ys)
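
# Usage sketch (hypothetical values): `self.query` selects raw rows and `self.fetcher` maps one
# row tuple to an (x, y) pair before it is appended, e.g.:
#
#   query = "SELECT features, label FROM samples"
#   fetcher = lambda row: (json.loads(row[0]), int(row[1]))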
def distribute_size(self, num_clients, min_size, max_size) -> Dict[int, DataContainer]:
    self.data = self.data.as_list()
    clients_data = Dict()
    xs = self.data.x
    ys = self.data.y
    data_pos = 0
    for i in range(num_clients):
        client_data_size = random.randint(min_size, max_size)
        client_x = xs[data_pos:data_pos + client_data_size]
        client_y = ys[data_pos:data_pos + client_data_size]
        data_pos += len(client_x)
        clients_data[i] = DataContainer(client_x, client_y).as_tensor()
    return Dict(clients_data)
def distribute_continuous(self, num_clients, min_size, max_size) -> Dict:
    self.data = self.data.as_list()
    clients_data = Dict()
    xs = self.data.x
    ys = self.data.y
    # group samples by label; client i receives data of label i only
    group = {}
    for index in range(len(xs)):
        if ys[index] not in group:
            group[ys[index]] = []
        group[ys[index]].append(xs[index])
    for i in range(num_clients):
        client_data_size = random.randint(min_size, max_size)
        client_x = group[i][0:client_data_size]
        client_y = [i for _ in range(len(client_x))]
        clients_data[i] = DataContainer(client_x, client_y).as_tensor()
    return clients_data
def distribute_dirichlet(self, num_clients, num_labels, skewness=0.5) -> Dict[int, DataContainer]:
    self.data = self.data.as_list()
    client_rows = non_iid_partition_with_dirichlet_distribution(
        self.data.y, num_clients, num_labels, skewness)
    clients_data = {}
    for client in client_rows:
        client_x = []
        client_y = []
        for pos in client_rows[client]:
            client_x.append(self.data.x[pos])
            client_y.append(self.data.y[pos])
        clients_data[client] = DataContainer(client_x, client_y).as_tensor()
    return Dict(clients_data)
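
# Independent illustration of the idea behind non_iid_partition_with_dirichlet_distribution
# (not that function's implementation): for every label, draw per-client proportions from
# Dirichlet(skewness) and split that label's row indices accordingly. A smaller skewness
# concentrates each label on fewer clients, i.e. a more non-IID split.
import numpy as np


def dirichlet_label_split(labels, num_clients, skewness=0.5, seed=0):
    rng = np.random.default_rng(seed)
    client_rows = {c: [] for c in range(num_clients)}
    for label in np.unique(labels):
        idx = np.where(np.asarray(labels) == label)[0]
        rng.shuffle(idx)
        proportions = rng.dirichlet([skewness] * num_clients)
        # cumulative cut points over this label's samples
        cuts = (np.cumsum(proportions) * len(idx)).astype(int)[:-1]
        for c, part in enumerate(np.split(idx, cuts)):
            client_rows[c].extend(part.tolist())
    return client_rows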
def train(self, model: nn.Module, train_data: DataContainer, context: FederatedLearning.Context,
          config: TrainerParams) -> Tuple[any, int]:
    round_id = context.round_id
    num_rounds = context.num_rounds
    total_size = len(train_data)
    # train on a different 1/num_rounds slice of the local data in each round
    round_data_size = total_size / num_rounds
    x = train_data.x[int(round_id * round_data_size):int((round_id * round_data_size) + round_data_size)]
    y = train_data.y[int(round_id * round_data_size):int((round_id * round_data_size) + round_data_size)]
    chunk = DataContainer(x, y)
    return super(TorchChunkTrainer, self).train(model, chunk, context, config)
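
# Worked example of the slicing above: 103 samples over 10 rounds gives round_data_size = 10.3,
# so round 4 trains on rows int(41.2)..int(51.5), i.e. indices 41 through 50.
total_size, num_rounds, round_id = 103, 10, 4
round_data_size = total_size / num_rounds
start = int(round_id * round_data_size)                  # 41
end = int(round_id * round_data_size + round_data_size)  # 51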
def infer(self, model: nn.Module, test_data: DataContainer):
    model.to(self.device)
    model.eval()
    test_loss = test_acc = test_total = 0.
    criterion = self.criterion
    with torch.no_grad():
        for batch_idx, (x, target) in enumerate(test_data.batch(self.batch_size)):
            x = x.to(self.device)
            target = target.to(self.device)
            pred = model(x)
            loss = criterion(pred, target)
            _, predicted = torch.max(pred, -1)
            correct = predicted.eq(target).sum()
            test_acc += correct.item()
            test_loss += loss.item() * target.size(0)
            test_total += target.size(0)
    # returns (accuracy, average loss) over the whole test set
    return test_acc / test_total, test_loss / test_total
def dict2dc(dc: DataContainer, key: int, val: DataContainer) -> DataContainer:
    dc = DataContainer([], []) if dc is None else dc
    return dc.concat(val)
def as_numpy(_, val: DataContainer):
    return val.as_numpy()
def clients_features(nb_features) -> typing.Callable:
    return lambda cid, data: DataContainer(data.x[:, 0:nb_features], data.y)
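
# Usage sketch: clients_features(2) builds a per-client mapper that keeps only the first two
# feature columns; the client id argument is ignored. The DataContainer construction below is an
# assumption based on how the class is used elsewhere in this repo (numpy x exposed as .x).
import numpy as np
from src.data.data_container import DataContainer

mapper = clients_features(2)
reduced = mapper(0, DataContainer(np.arange(12).reshape(4, 3), np.array([0, 1, 0, 1])))
# reduced.x now holds only the first 2 of the 3 original feature columns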
import logging

import h5py

from src.data.data_container import DataContainer
from src.data.data_provider import PickleDataProvider

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

path_train = "../../raw/femnist/fed_emnist_train.h5"
path_test = "../../raw/femnist/fed_emnist_test.h5"

f = h5py.File(path_train, 'r')
x = []
y = []
for name in f:
    for user in f[name]:
        h5_x = f[name][user]['pixels']
        h5_y = f[name][user]['label']
        logger.info(f"processing user {user} - num raw {len(h5_x)}")
        for i in range(len(h5_x)):
            x.append(h5_x[i].flatten().tolist())
            y.append(h5_y[i])

dc = DataContainer(x, y)
print("saving...")
PickleDataProvider.save(dc, '../../pickles/femnist.pkl')
import json

from libs import language_tools
from src.data.data_container import DataContainer
from src.data.data_provider import PickleDataProvider

file = open('../../raw/shakespeare/shakespeare_all_data.json', 'r')
shakespear = json.load(file)
user_data = shakespear['user_data']

finished = 0
all_x = []
all_y = []
for user_id, data in user_data.items():
    print(f"start with {user_id}")
    for x, y in zip(data['x'], data['y']):
        all_x.append(language_tools.word_to_indices(x))
        all_y.append(language_tools.letter_to_index(y))
    finished += 1
    print(f"finished with {user_id}")
    print(f"finished: {finished / len(user_data) * 100}%")

dc = DataContainer(all_x, all_y)
PickleDataProvider.save(dc, '../../pickles/shakespeare.pkl')
def as_tensor(_, val: DataContainer):
    return val.as_tensor()
    }
    image = Image.open(line['path']).crop((line['x1'], line['y1'], line['x2'], line['y2'])) \
        .resize((224, 224), Image.ANTIALIAS)
    image_x = asarray(image, dtype=int)
    try:
        # 224 * 224 * 3 channels = 150528 values per image; anything else is treated as malformed
        x = image_x.transpose((2, 1, 0)).flatten().tolist()
        if len(x) != 150528:
            raise Exception('invalid size')
        y = line['label']
        all_x.append(x)
        all_y.append(y)
    except Exception:
        print(line)

sx = asarray(all_x, dtype=int)
sy = asarray(all_y, dtype=int)
dc = DataContainer(sx, sy)
print("saving...")
PickleDataProvider.save(dc, '../../pickles/cars.pkl')

#
# images = "../../raw/cars"
# print("loading...")
# all_x = []
# all_y = []
# errors = 0
# show_count = 100
# for location, folders, files in os.walk(images):
#     for index, file in enumerate(files):
#         if file.startswith("a"):
#             file_path = location + "/" + file
#             try: