Example #1
    def train(self, model: nn.Module, train_data: DataContainer,
              context: FederatedLearning.Context,
              config: TrainerParams) -> Tuple[Any, int]:
        model.to(self.device)
        model.train()
        # config supplies factories: get_optimizer() returns a callable
        # that builds the optimizer for the given model
        optimizer = config.get_optimizer()(model)
        criterion = config.get_criterion()

        epoch_loss = []
        for epoch in range(config.epochs):
            batch_loss = []
            for batch_idx, (x, labels) in enumerate(
                    train_data.batch(config.batch_size)):
                x = x.to(self.device)
                labels = labels.to(self.device)
                optimizer.zero_grad()
                log_probs = model(x)
                loss = criterion(log_probs, labels)
                loss.backward()
                optimizer.step()
                batch_loss.append(loss.item())
            if len(batch_loss) > 0:
                epoch_loss.append(sum(batch_loss) / len(batch_loss))

        # return weights on CPU so they serialize cleanly for aggregation
        weights = model.cpu().state_dict()
        return weights, len(train_data)
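
The (weights, sample_count) pair returned here is exactly what a FedAvg-style aggregator consumes. A minimal aggregation sketch (hypothetical helper, not part of the library):

def fedavg(results):
    # results: list of (state_dict, sample_count) pairs as returned by train()
    total = sum(n for _, n in results)
    merged = {}
    for key in results[0][0]:
        # weight each client's tensor by its share of the total samples
        merged[key] = sum(w[key] * (n / total) for w, n in results)
    return merged
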
Example #2
 def distribute_labels(self):
     group = self.Grouper(self.data.x, self.data.y)
     clients_data = Dict()
     for index, label in enumerate(group.groups()):
         x, y = group.get(label)
         clients_data[index] = DataContainer(x, y).as_tensor()
     return clients_data
Example #3
 def distribute_percentage(self,
                           num_clients,
                           percentage=0.8,
                           min_size=10,
                           max_size=100) -> Dict[int, DataContainer]:
     self.data = self.data.as_list()
     clients_data = {}
     xs = self.data.x
     ys = self.data.y
     unique_labels = np.unique(ys)
     for i in range(num_clients):
         client_data_size = random.randint(min_size, max_size)
         # one label dominates each client's share
         selected_label = random.choice(unique_labels)
         client_x = []
         client_y = []
         # fill `percentage` of the client with the selected label;
         # note: this loop never terminates if that label runs out, so
         # keep max_size well below the smallest per-label sample count
         while len(client_y) / client_data_size < percentage:
             for index, item in enumerate(ys):
                 if item == selected_label:
                     client_x.append(xs.pop(index))
                     client_y.append(ys.pop(index))
                     break
         # top up the remainder with samples from any other label
         while len(client_y) < client_data_size:
             for index, item in enumerate(ys):
                 if item != selected_label:
                     client_x.append(xs.pop(index))
                     client_y.append(ys.pop(index))
                     break
         clients_data[i] = DataContainer(client_x, client_y).as_tensor()
     return Dict(clients_data)
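
A standalone sketch of the same percentage skew on plain numpy arrays (hypothetical helper, names assumed), which samples indices instead of popping from the lists and so avoids the exhaustion hazard noted above:

import numpy as np

def percentage_skew(xs, ys, num_clients, percentage=0.8, size=100, seed=0):
    rng = np.random.default_rng(seed)
    xs, ys = np.asarray(xs), np.asarray(ys)
    clients = {}
    for i in range(num_clients):
        label = rng.choice(np.unique(ys))
        # `percentage` of the client comes from one label, the rest from others
        majority = rng.choice(np.where(ys == label)[0],
                              int(size * percentage), replace=False)
        minority = rng.choice(np.where(ys != label)[0],
                              size - len(majority), replace=False)
        rows = np.concatenate((majority, minority))
        clients[i] = (xs[rows], ys[rows])
    return clients
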
Example #4
 def distribute_shards(self, num_clients, shards_per_client, min_size,
                       max_size) -> Dict[int, DataContainer]:
     self.data = self.data.as_numpy()
     clients_data = {}
     grouper = self.Grouper(self.data.x, self.data.y)
     for client_id in range(num_clients):
         client_data_size = random.randint(min_size, max_size)
         selected_shards = grouper.groups(shards_per_client)
         self.log(f'generating data for {client_id}-{selected_shards}')
         client_x = []
         client_y = []
         for shard in selected_shards:
             rx, ry = grouper.get(
                 shard, int(client_data_size / len(selected_shards)))
             if len(rx) == 0:
                 self.log(f'shard {round(shard)} has no more available '
                          f'data to distribute, skipping...')
             else:
                 client_x = rx if len(client_x) == 0 else np.concatenate(
                     (client_x, rx))
                 client_y = ry if len(client_y) == 0 else np.concatenate(
                     (client_y, ry))
         clients_data[client_id] = DataContainer(client_x,
                                                 client_y).as_tensor()
     return Dict(clients_data)
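
The shard strategy follows the classic FedAvg non-IID recipe: order samples by label, cut them into equal shards, and deal a few shards to each client. A self-contained sketch of that idea, independent of Grouper (hypothetical helper):

import numpy as np

def shard_partition(y, num_clients, shards_per_client, seed=0):
    rng = np.random.default_rng(seed)
    order = np.argsort(y)  # sample indices grouped by label
    shards = np.array_split(order, num_clients * shards_per_client)
    dealt = rng.permutation(len(shards))
    return {c: np.concatenate(
                [shards[s] for s in dealt[c * shards_per_client:
                                          (c + 1) * shards_per_client]])
            for c in range(num_clients)}
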
Example #5
 def collect(self) -> DataContainer:
     collected = super().collect()
     new_x = []
     new_y = []
     # lt: the language_tools helpers used in Example #16
     # (word_to_indices / letter_to_index)
     for index in range(len(collected.x)):
         new_x.append(lt.word_to_indices(collected.x[index]))
         new_y.append(lt.letter_to_index(collected.y[index]))
     return DataContainer(new_x, new_y)
Example #6
 def collect(self) -> DataContainer:
     cursor = self.db.cursor()
     cursor.execute(self.query)
     xs = []
     ys = []
     for row in cursor.fetchall():
         x, y = self.fetcher(row)
         xs.append(x)
         ys.append(y)
     return DataContainer(xs, ys)
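
self.fetcher maps one raw row to an (x, y) pair. A plausible fetcher for a table whose last column is the label (assumed layout, not part of the library):

def row_fetcher(row):
    # every column but the last is a feature; the last column is the label
    return list(row[:-1]), row[-1]
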
Example #7
 def distribute_size(self, num_clients, min_size,
                     max_size) -> Dict[int, DataContainer]:
     self.data = self.data.as_list()
     clients_data = Dict()
     xs = self.data.x
     ys = self.data.y
     data_pos = 0
     for i in range(num_clients):
         client_data_size = random.randint(min_size, max_size)
         client_x = xs[data_pos:data_pos + client_data_size]
         client_y = ys[data_pos:data_pos + client_data_size]
         data_pos += len(client_x)
         clients_data[i] = DataContainer(client_x, client_y).as_tensor()
     return Dict(clients_data)
Example #8
 def distribute_continuous(self, num_clients, min_size, max_size) -> Dict:
     self.data = self.data.as_list()
     clients_data = Dict()
     xs = self.data.x
     ys = self.data.y
     # bucket samples by label
     group = {}
     for index in range(len(xs)):
         if ys[index] not in group:
             group[ys[index]] = []
         group[ys[index]].append(xs[index])
     # assumes labels are the integers 0..num_clients-1:
     # client i receives samples of label i only
     for i in range(num_clients):
         client_data_size = random.randint(min_size, max_size)
         client_x = group[i][0:client_data_size]
         client_y = [i for _ in range(len(client_x))]
         clients_data[i] = DataContainer(client_x, client_y).as_tensor()
     return clients_data
Example #9
 def distribute_dirichlet(self,
                          num_clients,
                          num_labels,
                          skewness=0.5) -> Dict[int, DataContainer]:
     self.data = self.data.as_list()
     client_rows = non_iid_partition_with_dirichlet_distribution(
         self.data.y, num_clients, num_labels, skewness)
     clients_data = {}
     for client in client_rows:
         client_x = []
         client_y = []
         for pos in client_rows[client]:
             client_x.append(self.data.x[pos])
             client_y.append(self.data.y[pos])
         clients_data[client] = DataContainer(client_x,
                                              client_y).as_tensor()
     return Dict(clients_data)
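
Here skewness presumably plays the role of the Dirichlet concentration α: smaller values yield more label-skewed clients. A standalone sketch of the core partitioning idea (hypothetical helper, simplified relative to non_iid_partition_with_dirichlet_distribution):

import numpy as np

def dirichlet_rows(y, num_clients, alpha=0.5, seed=0):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    rows = {c: [] for c in range(num_clients)}
    for label in np.unique(y):
        idx = rng.permutation(np.where(y == label)[0])
        # split this label's indices among clients in Dirichlet proportions
        props = rng.dirichlet([alpha] * num_clients)
        cuts = (np.cumsum(props)[:-1] * len(idx)).astype(int)
        for c, part in enumerate(np.split(idx, cuts)):
            rows[c].extend(part.tolist())
    return rows
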
Example #10
 def train(self, model: nn.Module, train_data: DataContainer,
           context: FederatedLearning.Context,
           config: TrainerParams) -> Tuple[Any, int]:
      round_id = context.round_id
      num_rounds = context.num_rounds
      total_size = len(train_data)
      round_data_size = total_size / num_rounds
      # train on this round's contiguous chunk of the data
      start = int(round_id * round_data_size)
      end = int((round_id + 1) * round_data_size)
      chunk = DataContainer(train_data.x[start:end], train_data.y[start:end])
      # the parent trainer expects the full context, not just the round id
      return super(TorchChunkTrainer, self).train(model, chunk, context, config)
Example #11
    def infer(self, model: nn.Module, test_data: DataContainer):
        model.to(self.device)
        model.eval()
        test_loss = test_acc = test_total = 0.
        criterion = self.criterion
        with torch.no_grad():
            for batch_idx, (x, target) in enumerate(
                    test_data.batch(self.batch_size)):
                x = x.to(self.device)
                target = target.to(self.device)
                pred = model(x)
                loss = criterion(pred, target)
                _, predicted = torch.max(pred, -1)
                correct = predicted.eq(target).sum()

                # accumulate sample-weighted sums so the final figures
                # are per-sample averages regardless of batch size
                test_acc += correct.item()
                test_loss += loss.item() * target.size(0)
                test_total += target.size(0)

        return test_acc / test_total, test_loss / test_total
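
Because infer() averages over a single client's samples, reporting across clients should weight each client's result by its sample count. A minimal sketch (hypothetical helper):

def weighted_metrics(results):
    # results: list of (accuracy, loss, sample_count) triples, one per client
    total = sum(n for _, _, n in results)
    acc = sum(a * n for a, _, n in results) / total
    loss = sum(l * n for _, l, n in results) / total
    return acc, loss
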
Example #12
def dict2dc(dc: DataContainer, key: int, val: DataContainer) -> DataContainer:
    dc = DataContainer([], []) if dc is None else dc
    return dc.concat(val)
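
The (dc, key, val) signature marks this as a dict reducer: folded over a {client_id: DataContainer} mapping, it concatenates every client into one container. Hypothetical usage, assuming clients_data as produced by the distributors above:

from functools import reduce

merged = reduce(lambda acc, kv: dict2dc(acc, kv[0], kv[1]),
                clients_data.items(), None)
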
Example #13
def as_numpy(_, val: DataContainer):
    return val.as_numpy()
Example #14
def clients_features(nb_features) -> typing.Callable:
    return lambda cid, data: DataContainer(data.x[:, 0:nb_features], data.y)
Example #15
import logging

import h5py

from src.data.data_container import DataContainer
from src.data.data_provider import PickleDataProvider

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

path_train = "../../raw/femnist/fed_emnist_train.h5"
path_test = "../../raw/femnist/fed_emnist_test.h5"

f = h5py.File(path_train, 'r')
x = []
y = []
for name in f:
    for user in f[name]:
        h5_x = f[name][user]['pixels']
        h5_y = f[name][user]['label']
        logging.info(f"processing user {user} - num raw {len(h5_x)}")
        for i in range(len(h5_x)):
            x.append(f[name][user]['pixels'][i].flatten().tolist())
            y.append(f[name][user]['label'][i])

dc = DataContainer(x, y)
print("saving...")
PickleDataProvider.save(dc, '../../pickles/femnist.pkl')
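
To read the file back without the provider, the snippet below assumes save() is a plain pickle dump of the DataContainer; verify against PickleDataProvider before relying on it:

import pickle

with open('../../pickles/femnist.pkl', 'rb') as pkl:
    dc = pickle.load(pkl)
print(len(dc.x), len(dc.y))
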
Example #16
import json

from libs import language_tools
from src.data.data_container import DataContainer
from src.data.data_provider import PickleDataProvider

with open('../../raw/shakespeare/shakespeare_all_data.json', 'r') as file:
    shakespeare = json.load(file)

user_data = shakespeare['user_data']

finished = 0
all_x = []
all_y = []
for user_id, data in user_data.items():
    print(f"start with {user_id}")
    for x, y in zip(data['x'], data['y']):
        all_x.append(language_tools.word_to_indices(x))
        all_y.append(language_tools.letter_to_index(y))
    finished += 1
    print(f"finished with {user_id}")
    print(f"finished: {finished / len(user_data) * 100}%")
dc = DataContainer(all_x, all_y)
PickleDataProvider.save(dc, '../../pickles/shakespeare.pkl')
Example #17
def as_tensor(_, val: DataContainer):
    return val.as_tensor()
Example #18
# NOTE: this snippet is truncated; the imports and the loop over `lines`
# (dicts carrying 'path', crop coordinates 'x1'..'y2', and 'label') are
# reconstructed from context.
from numpy import asarray
from PIL import Image

from src.data.data_container import DataContainer
from src.data.data_provider import PickleDataProvider

all_x = []
all_y = []
for line in lines:
    # crop each image to its bounding box and resize to 224x224
    # (Image.LANCZOS replaces Image.ANTIALIAS, removed in Pillow 10)
    image = Image.open(line['path']).crop((line['x1'], line['y1'], line['x2'], line['y2'])) \
        .resize((224, 224), Image.LANCZOS)
    image_x = asarray(image, dtype=int)
    try:
        x = image_x.transpose((2, 1, 0)).flatten().tolist()
        if len(x) != 150528:  # 3 * 224 * 224
            raise Exception('invalid size')
        y = line['label']
        all_x.append(x)
        all_y.append(y)
    except Exception:
        print(line)
sx = asarray(all_x, dtype=int)
sy = asarray(all_y, dtype=int)
dc = DataContainer(sx, sy)
print("saving...")
PickleDataProvider.save(dc, '../../pickles/cars.pkl')
