Example #1
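These snippets assume the standard imports below, plus project-specific helpers (ANN_model, get_nb_matrix, set_iid, img_augument) that are defined elsewhere in the repository; fed_train_step is a method of the central-server class and is shown here without its surrounding class definition.

import os
import random

import numpy as np
import tensorflow as tf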
    def fed_train_step(self):
        '''
        A single communication round (Upload weights for non-delayed nodes)
        '''
        self.weight_list = []
        self.nb_list = []

        if self.bad_node:
            print('bad node !!!!! number{}, size{}!!!!!!'.format(
                self.bad_node_nb, self.bad_node_size))
            # Simulate bad nodes: submit freshly initialised (untrained) weights
            # with a claimed sample size of self.bad_node_size.
            for i in range(self.bad_node_nb):
                bad_model = ANN_model()
                bad_weight = bad_model.get_weights()
                bad_model_w = get_nb_matrix(bad_weight, self.bad_node_size)
                self.weight_list.append(bad_weight)
                self.nb_list.append(bad_model_w)

        for index_path in self.nodes_p:
            # Run each node's local training and collect its weights and sample count
            model_weights, nb = node_training_process(
                index_path, self.shared_index, self.central_p,
                self.local_epoch, self.batch_size, self.augument,
                self.local_iid, self.node_evl)
            model_w = get_nb_matrix(model_weights, nb)
            self.weight_list.append(model_weights)
            self.nb_list.append(model_w)
            tf.keras.backend.clear_session()  # mitigate the Keras memory leak (2/2)

        self.epo = self.epo + 1

        # memory(self.epo)  ## Testing memory usage

        return True
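fed_train_step only collects weight_list and nb_list; the aggregation step itself is not shown here, and the exact role of get_nb_matrix is not visible in this excerpt. Purely as an illustration, a sample-size-weighted average (FedAvg-style) over the collected weights could look like the sketch below, where weight_list holds each node's model.get_weights() output and sample_counts (a hypothetical name) holds the matching sample counts:

def fedavg_aggregate(weight_list, sample_counts):
    """Illustrative FedAvg-style sketch: sample-size-weighted average of per-node weights."""
    total = float(sum(sample_counts))
    averaged = []
    for layer_idx in range(len(weight_list[0])):
        # weight each node's layer array by its share of the total sample count
        layer_avg = sum(
            node_weights[layer_idx] * (n / total)
            for node_weights, n in zip(weight_list, sample_counts))
        averaged.append(layer_avg)
    return averaged

# usage sketch (names are assumptions, not the repository's API):
#   new_global = fedavg_aggregate(server.weight_list, per_node_sample_counts)
#   global_model.set_weights(new_global)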
Example #2
def node_training_process(index_path,
                          shared_index,
                          central_weight_path,
                          local_epoch,
                          batch_size=50,
                          augment=False,
                          local_iid=False,
                          node_evl=False):
    '''
    1. Get the data index and initial weights from the central server,
    2. Load & prepare the dataset accordingly,
    3. Train locally,
    4. Return the weights to the central server.
    * In practice a node would not need an index from the central server; it could simply
      read all of its local data, e.g. via glob.glob('data_dir').
    * Saving node weights locally would be safer when a node holds a lot of data, but that
      is omitted here.
    '''
    g = tf.Graph()
    with g.as_default():  # use a dedicated tf.Graph to mitigate the memory leak (1/2)

        # load & processing data
        (x_train, y_train), (x_test,
                             y_test) = tf.keras.datasets.cifar10.load_data()

        autotune = tf.data.experimental.AUTOTUNE

        # load index
        index1 = np.load(index_path)
        ori_traning = index1.shape[0]  # node's own data size

        # assign node_evl_set (1/2)
        if node_evl:
            # index_path ends in 'index.npy' (9 chars); stripping it gives the node's directory
            evl_p = index_path[:-9] + 'evl_index.npy'
            evl_index = np.load(evl_p)
            x_test_n = x_test[evl_index]
            y_test_n = y_test[evl_index]
            node_evl_list = []
            total_node_evl_list = []
            # build one evaluation dataset per CIFAR-10 class
            for i in range(10):
                index0 = np.where(y_test_n == i)
                index = index0[0]
                x_evl = tf.data.Dataset.from_tensor_slices(x_test_n[index])
                y_evl = tf.data.Dataset.from_tensor_slices(y_test_n[index])
                node_evl_set = tf.data.Dataset.zip((x_evl, y_evl))
                node_evl_set = node_evl_set.repeat().batch(1).prefetch(
                    buffer_size=autotune)
                total_node_evl = len(index)
                node_evl_list.append(node_evl_set)
                total_node_evl_list.append(total_node_evl)

        # if shared_index!=[]:
        #     shared_test_index = np.array([0])
        #     for x in shared_index:
        #         b=np.load(x)
        #         index1 = np.concatenate((index1, b))
        #         shared_test_index = np.concatenate((shared_test_index, b))
        #     shared_test_index = shared_test_index[1:]
        #     x_test_shared=x_train[shared_test_index]
        #     y_test_shared=y_train[shared_test_index]
        #     x_shared_evl=tf.data.Dataset.from_tensor_slices(x_test_shared)
        #     y_shared_evl=tf.data.Dataset.from_tensor_slices(y_test_shared)
        #     shared_evl_set = tf.data.Dataset.zip((x_shared_evl, y_shared_evl))
        #     shared_evl_set = shared_evl_set.repeat().batch(batch_size).prefetch(buffer_size=autotune)
        #     total_shared_evl = shared_test_index.shape[0] ###################################

        x_train_i = x_train[index1]
        y_train_i = y_train[index1]

        print(np.unique(y_train_i, return_counts=True))  # debug: class distribution of this node's data

        if -1 in index1:
            # a -1 in the index acts as a flag: draw random images and random labels instead.
            # Note that x and y use independently drawn indices, so labels no longer match the images.
            iii = [random.randint(0, 40000) for i in range(len(index1))]
            x_train_i = x_train[iii]
            iii = [random.randint(0, 40000) for i in range(len(index1))]
            y_train_i = y_train[iii]
            print(np.unique(y_train_i, return_counts=True))  # debug: distribution of the random labels

        buffer_size = x_train_i.shape[0]
        # total_traning=index1.shape[0]

        x_tr = tf.data.Dataset.from_tensor_slices(x_train_i)
        y_tr = tf.data.Dataset.from_tensor_slices(y_train_i)
        total_traning = len(x_train_i)

        if local_iid:
            # rebalance the node's local data to a uniform class distribution via set_iid
            y_train_i2, x_train_i2 = set_iid(y_train_i, x_train_i)
            print(np.unique(y_train_i2, return_counts=True))  # debug: class distribution after set_iid
            total_traning = len(x_train_i2)
            x_tr = tf.data.Dataset.from_tensor_slices(x_train_i2)
            y_tr = tf.data.Dataset.from_tensor_slices(y_train_i2)

        print(np.unique(y_train_i, return_counts=True))  # debug: class distribution of the original data
        train_set = tf.data.Dataset.zip((x_tr, y_tr))
        if augment:
            train_set = train_set.map(img_augument).shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)
        else:
            train_set = train_set.shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)

        # Training & save
        # NOTE: keep this line first: save_dir is the node's directory
        # (index_path ends in 'index.npy') and is used for all outputs below.
        save_dir = index_path[:-9]

        model = ANN_model()
        model.load_weights(central_weight_path)

        # node_evl before training (2/2)
        if node_evl:
            filename = os.path.join(save_dir, 'node_EVAL_before_training.txt')
            with open(filename, 'a') as file_handle:
                for i in range(10):
                    if total_node_evl_list[i] == 0:
                        # no evaluation samples for this class: write a sentinel value
                        file_handle.write('200')
                        file_handle.write(' ')
                    else:
                        loss, acc = model.evaluate(
                            node_evl_list[i],
                            steps=total_node_evl_list[i],  # batch size is 1
                            verbose=0)
                        file_handle.write(str(acc))
                        file_handle.write(' ')
                file_handle.write('\n')

        # # see if overtrained over the shared index
        # if shared_index!=[]:
        #     [loss, acc]=model.evaluate(shared_evl_set,steps=total_shared_evl//batch_size,verbose=0)
        #     filename = os.path.join(save_dir,'shared_EVAL.txt')
        #     with open(filename,'a') as file_handle:
        #             file_handle.write(str(loss))
        #             file_handle.write(' ')
        #             file_handle.write(str(acc))
        #             file_handle.write('\n')

        # Evaluate the freshly loaded global model on this node's training data before local
        # training (checks for over-fitting; this is effectively the previous round's accuracy).
        self_loss, self_acc = model.evaluate(train_set,
                                             steps=total_traning // batch_size,
                                             verbose=0)
        filename = os.path.join(save_dir, 'self_EVAL.txt')
        with open(filename, 'a') as file_handle:
            file_handle.write(str(self_loss))
            file_handle.write(' ')
            file_handle.write(str(self_acc))
            file_handle.write('\n')

        history = model.fit(train_set,
                            epochs=local_epoch,
                            steps_per_epoch=total_traning // batch_size,
                            verbose=0)

        # collect the trained weights to return to the central server
        model_weights = model.get_weights()

    del model

    # TODO: Change/add validation based on the split of data on the worker node -----------> save locally in worker nodes.
    #       And compare this weighted average to current one (a centralized testing set)

    return model_weights, total_traning
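node_training_process expects each node's directory to contain an index.npy file; the slice index_path[:-9] strips exactly those nine characters to recover the directory where evl_index.npy and the evaluation logs live. The data-preparation code is not part of this excerpt, so the sketch below only illustrates how such per-node index files could be generated; the directory names and the uniform split are assumptions, not the repository's actual layout.

def make_node_indices(out_root='nodes', n_nodes=10, samples_per_node=2000,
                      n_train=50000, seed=0):
    """Write '<out_root>/node_<k>/index.npy' files in the layout node_training_process expects."""
    rng = np.random.default_rng(seed)
    all_idx = rng.permutation(n_train)
    index_paths = []
    for k in range(n_nodes):
        node_dir = os.path.join(out_root, 'node_{}'.format(k))
        os.makedirs(node_dir, exist_ok=True)
        node_idx = all_idx[k * samples_per_node:(k + 1) * samples_per_node]
        path = os.path.join(node_dir, 'index.npy')  # index_path[:-9] then yields node_dir + '/'
        np.save(path, node_idx)
        index_paths.append(path)
    return index_paths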
def node_training_process(index_path,
                          shared_index,
                          central_weight_path,
                          local_epoch,
                          batch_size=50,
                          augment=False,
                          local_iid=False):
    '''
    1. Get the data index and initial weights from the central server,
    2. Load & prepare the dataset accordingly,
    3. Train locally,
    4. Return the weights to the central server.
    * In practice a node would not need an index from the central server; it could simply
      read all of its local data, e.g. via glob.glob('data_dir').
    * Saving node weights locally would be safer when a node holds a lot of data, but that
      is omitted here.
    '''
    g = tf.Graph()
    with g.as_default():  # use a dedicated tf.Graph to mitigate the memory leak

        # load index
        index1 = np.load(index_path)

        if shared_index != []:
            # append the shared (common) indices to this node's own index
            for x in shared_index:
                b = np.load(x)
                index1 = np.concatenate((index1, b))

        # load & processing data
        (x_train, y_train), (_, _) = tf.keras.datasets.cifar10.load_data()

        x_train_i = x_train[index1]
        y_train_i = y_train[index1]

        autotune = tf.data.experimental.AUTOTUNE
        buffer_size = x_train_i.shape[0]
        total_traning = index1.shape[0]

        x_tr = tf.data.Dataset.from_tensor_slices(x_train_i)
        y_tr = tf.data.Dataset.from_tensor_slices(y_train_i)

        if local_iid:
            # rebalance the node's local data via set_iid (defined elsewhere in the repository);
            # note that, unlike the variant above, buffer_size and total_traning are not recomputed here
            y_train_i2, x_train_i2 = set_iid(y_train_i, x_train_i)
            x_tr = tf.data.Dataset.from_tensor_slices(x_train_i2)
            y_tr = tf.data.Dataset.from_tensor_slices(y_train_i2)

        train_set = tf.data.Dataset.zip((x_tr, y_tr))
        if augment:
            train_set = train_set.map(img_augument).shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)
        else:
            train_set = train_set.shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)

        # Training & save
        save_dir = index_path[:-9]  # the node's directory (index_path ends in 'index.npy')

        model = ANN_model()
        model.load_weights(central_weight_path)

        # Evaluate the freshly loaded global model on this node's training data before local
        # training (checks for over-fitting; this is effectively the previous round's accuracy).
        self_loss, self_acc = model.evaluate(train_set,
                                             steps=total_traning // batch_size,
                                             verbose=0)
        filename = os.path.join(save_dir, 'self_EVAL.txt')
        with open(filename, 'a') as file_handle:
            file_handle.write(str(self_loss))
            file_handle.write(' ')
            file_handle.write(str(self_acc))
            file_handle.write('\n')

        history = model.fit(train_set,
                            epochs=local_epoch,
                            steps_per_epoch=total_traning // batch_size,
                            verbose=0)

        # collect the trained weights to return to the central server
        model_weights = model.get_weights()

    del model

    # TODO: Change/add validation based on the split of data on the worker node -----------> save locally in worker nodes.
    #       And compare this weighted average to current one (a centralized testing set)

    return model_weights, total_traning


# index_path
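Both variants map an img_augument function over the (image, label) pairs when augment=True, but that function is not shown in this excerpt. A plausible minimal sketch for CIFAR-10 (random horizontal flip plus pad-and-random-crop; purely an assumption about what the repository's img_augument does) is:

def img_augument(image, label):
    """Illustrative CIFAR-10 augmentation: random horizontal flip, then pad and random-crop back to 32x32."""
    image = tf.image.random_flip_left_right(image)
    image = tf.image.resize_with_crop_or_pad(image, 36, 36)  # zero-pad to 36x36
    image = tf.image.random_crop(image, size=[32, 32, 3])
    return image, label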