def __init__(self, args):
    super(MATCHAManager, self).__init__(args)

    # paths used to persist the colored network and the per-round matching history
    path_to_save_network = os.path.join("loggs", args_to_string(args), "matcha", "colored_network.gml")
    path_to_matching_history_file = os.path.join("loggs", args_to_string(args), "matcha", "matching_history.csv")

    self.topology_generator = RandomTopologyGenerator(self.network,
                                                      args.communication_budget,
                                                      network_save_path=path_to_save_network,
                                                      path_to_history_file=path_to_matching_history_file)
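
# --- Illustration (not from the original source) ---
# MATCHA decomposes the communication graph into matchings and, at every
# round, activates each matching independently with a probability chosen
# so that the expected communication cost stays within the budget. A
# minimal sketch of that per-round sampling step, assuming `matchings` is
# a list of edge lists and `activation_probs` their per-matching
# activation probabilities (both names are hypothetical):
import random

def sample_topology(matchings, activation_probs):
    """Return the list of edges activated for one communication round."""
    active_edges = []
    for matching, p in zip(matchings, activation_probs):
        if random.random() < p:  # activate this matching with probability p
            active_edges.extend(matching)
    return active_edges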
    def __init__(self, args):
        self.device = args.device
        self.batch_size = args.bz
        self.network = get_network(args.network_name, args.architecture)
        self.world_size = self.network.number_of_nodes() + 1  # plus one node representing the network manager
        self.log_freq = args.log_freq

        # create logger
        logger_path = os.path.join("loggs", args_to_string(args), args.architecture)
        self.logger = SummaryWriter(logger_path)

        self.round_idx = 0  # index of the current communication round

        self.train_dir = os.path.join("data", args.experiment, "train")
        self.test_dir = os.path.join("data", args.experiment, "test")

        self.train_path = os.path.join(self.train_dir, "train" + EXTENSIONS[args.experiment])
        self.test_path = os.path.join(self.test_dir, "test" + EXTENSIONS[args.experiment])

        self.train_iterator = get_iterator(args.experiment, self.train_path, self.device, self.batch_size)
        self.test_iterator = get_iterator(args.experiment, self.test_path, self.device, self.batch_size)

        # per-process buffers used when collecting the workers' models
        # (gather) and when sending the averaged model back (scatter)
        self.gather_list = [get_model(args.experiment, self.device, self.train_iterator)
                            for _ in range(self.world_size)]

        self.scatter_list = [get_model(args.experiment, self.device, self.train_iterator)
                             for _ in range(self.world_size)]

        # print initial logs
        self.write_logs()
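
# --- Illustration (not from the original source) ---
# The gather/scatter buffers above suggest a centralized round: the
# manager collects every worker's parameters, averages them, and sends
# the average back. A minimal sketch with torch.distributed, assuming
# each model has already been flattened into a single 1-D tensor:
import torch
import torch.distributed as dist

def averaging_round(local_params, world_size, manager_rank=0):
    rank = dist.get_rank()
    if rank == manager_rank:
        gathered = [torch.zeros_like(local_params) for _ in range(world_size)]
        dist.gather(local_params, gather_list=gathered, dst=manager_rank)
        mean = torch.stack(gathered).mean(dim=0)
        dist.scatter(local_params,
                     scatter_list=[mean.clone() for _ in range(world_size)],
                     src=manager_rank)
    else:
        dist.gather(local_params, dst=manager_rank)   # send parameters
        dist.scatter(local_params, src=manager_rank)  # receive the average
    return local_params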
           "amazon_us": "1e-3",
           "geantdistance": "1e-3",
           "exodus": "1e-1",
           "ebone": "1e-1"}

if __name__ == "__main__":
    for network_name in ["gaia", "amazon_us", "geantdistance", "exodus", "ebone"]:
        print("{}:".format(network_name))
        args = parse_args(["inaturalist",
                           "--network", network_name,
                           "--bz", "16",
                           "--lr", lr_dict[network_name],
                           "--decay", "sqrt",
                           "--local_steps", "1"])

        loggs_dir = os.path.join("loggs", args_to_string(args))
        loggs_to_json(loggs_dir)

        path_to_json = os.path.join("results", "json",
                                    "{}.json".format(os.path.split(loggs_dir)[1]))
        with open(path_to_json, "r") as f:
            data = json.load(f)

        for architecture in ["centralized", "ring", "matcha"]:
            values = data['Train/Acc'][architecture]
            rounds = data["Round"][architecture]

            # report the first round at which the train accuracy exceeds the
            # per-network threshold; the loop body below is an assumed
            # completion (it is truncated in the source)
            for ii, value in enumerate(values):
                if value > trsh_dict[network_name]:
                    print("  {}: round {}".format(architecture, rounds[ii]))
                    break
Example #4
        # launch one process per rank (the opening of this parallel branch
        # is truncated in the source)
        for rank_ in range(world_size):
            p = Process(target=init_process,
                        args=(rank_, world_size, args, run))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

    else:
        print("Run experiment in sequential setting..")

        if args.architecture == "centralized":
            network = CentralizedNetwork(args)
        elif args.architecture == "matcha" or args.architecture == "matcha+" or\
                args.architecture == "matcha+mst" or args.architecture == "matcha+ring" or\
                args.architecture == "matcha+delta_mbst":
            network = MATCHANetwork(args)
        elif args.architecture == "dynamic_ring":
            network = RingNetwork(args)
        else:
            network = Peer2PeerNetwork(args)

        for _ in range(args.n_rounds):  # one mixing step per communication round
            network.mix()

        network.write_logs()

    loggs_dir = os.path.join("loggs", args_to_string(args))
    loggs_to_json(loggs_dir)
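
# --- Illustration (not from the original source) ---
# `init_process`, used with multiprocessing above, typically follows the
# standard PyTorch distributed pattern: set a rendezvous address,
# initialize the process group, then call the per-rank function. A
# minimal sketch assuming the gloo backend and a local rendezvous
# (address, port and the `fn` signature are assumptions):
import os
import torch.distributed as dist

def init_process(rank, world_size, args, fn, backend="gloo"):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    fn(rank, world_size, args)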
def make_plots(args, mode=0):
    os.makedirs(os.path.join("results", "plots", args.experiment),
                exist_ok=True)

    loggs_dir_path = os.path.join("loggs", args_to_string(args))
    path_to_json = os.path.join(
        "results", "json", "{}.json".format(os.path.split(loggs_dir_path)[1]))
    with open(path_to_json, "r") as f:
        data = json.load(f)

    x_lim = np.inf
    for idx, tag in enumerate(TAGS):
        fig = plt.figure(figsize=(12, 10))
        for architecture in [
                "centralized", "matcha", "mst", "mct_congest", "ring"
        ]:
            try:
                values = data[tag][architecture]
                rounds = data["Round"][architecture]
            except KeyError:  # this architecture was not logged for this tag
                continue

            if mode == 0:
                min_len = min(len(values), len(rounds))
                cycle_time = cycle_time_dict[args.network_name][architecture]

                if rounds[:min_len][-1] * cycle_time < x_lim:
                    x_lim = rounds[:min_len][-1] * cycle_time

                plt.plot(cycle_time * np.array(rounds[:min_len]) / 1000,
                         values[:min_len],
                         label=labels_dict[architecture],
                         linewidth=5.0)
                plt.grid(True, linewidth=2)
                plt.xlim(0, x_lim / 1000)
                plt.ylabel(tag_dict[tag], fontsize=50)
                plt.xlabel("time (s)", fontsize=50)
                plt.tick_params(axis='both', labelsize=40)
                plt.legend(fontsize=35)

            else:
                min_len = min(len(values), len(rounds))

                if rounds[:min_len][-1] < x_lim:
                    x_lim = rounds[:min_len][-1]

                plt.plot(rounds[:min_len],
                         values[:min_len],
                         label=labels_dict[architecture],
                         linewidth=5.0)
                plt.ylabel("{}".format(tag_dict[tag]), fontsize=50)
                plt.xlabel("Rounds", fontsize=50)
                plt.tick_params(axis='both', labelsize=40)
                plt.legend(fontsize=35)
                plt.grid(True, linewidth=2)
                plt.xlim(0, x_lim)

        if mode == 0:
            fig_path = os.path.join(
                "results", "plots", args.experiment,
                "{}_{}_vs_time.png".format(args.network_name, path_dict[tag]))
        else:
            fig_path = os.path.join(
                "results", "plots", args.experiment,
                "{}_{}_vs_iteration.png".format(args.network_name,
                                                path_dict[tag]))
        plt.savefig(fig_path, bbox_inches='tight')
        plt.close(fig)  # free the figure before plotting the next tag
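
# Example usage (hypothetical argument values):
#   args = parse_args(["inaturalist", "--network", "gaia", "--bz", "16"])
#   make_plots(args, mode=0)  # metrics vs. wall-clock time
#   make_plots(args, mode=1)  # metrics vs. communication rounds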
    def __init__(self, args):
        """
        Abstract class representing a network of worker collaborating to train a machine learning model,
        each worker has a local model and a local data iterator.
         Should implement `mix` to precise how the communication is done
        :param args: parameters defining the network
        """
        self.args = args
        self.device = args.device
        self.batch_size = args.bz
        self.network = get_network(args.network_name, args.architecture)
        self.n_workers = self.network.number_of_nodes()
        self.local_steps = args.local_steps
        self.log_freq = args.log_freq
        self.fit_by_epoch = args.fit_by_epoch
        self.initial_lr = args.lr
        self.optimizer_name = args.optimizer
        self.lr_scheduler_name = args.decay

        # create logger
        logger_path = os.path.join("loggs", args_to_string(args),
                                   args.architecture)
        os.makedirs(logger_path, exist_ok=True)
        self.logger = SummaryWriter(logger_path)

        self.round_idx = 0  # index of the current communication round

        # get data loaders
        if args.experiment == "inaturalist":
            self.train_dir = os.path.join("data", args.experiment,
                                          "train_{}".format(args.network_name))
            self.test_dir = os.path.join("data", args.experiment,
                                         "test_{}".format(args.network_name))
        else:
            self.train_dir = os.path.join("data", args.experiment, "train")
            self.test_dir = os.path.join("data", args.experiment, "test")

        self.train_path = os.path.join(self.train_dir,
                                       "train" + EXTENSIONS[args.experiment])
        self.test_path = os.path.join(self.test_dir,
                                      "test" + EXTENSIONS[args.experiment])

        self.train_iterator = get_iterator(args.experiment, self.train_path,
                                           self.device, self.batch_size)
        self.test_iterator = get_iterator(args.experiment, self.test_path,
                                          self.device, self.batch_size)

        # one data iterator per worker; record each worker's dataset size to
        # weight the local objectives
        self.workers_iterators = []
        self.local_function_weights = np.zeros(self.n_workers)
        train_data_size = 0
        for worker_id in range(self.n_workers):
            data_path = os.path.join(
                self.train_dir,
                str(worker_id) + EXTENSIONS[args.experiment])
            self.workers_iterators.append(
                get_iterator(args.experiment, data_path, self.device,
                             self.batch_size))
            train_data_size += len(self.workers_iterators[-1])
            self.local_function_weights[worker_id] = len(
                self.workers_iterators[-1].dataset)

        self.epoch_size = int(train_data_size / self.n_workers)
        self.local_function_weights /= self.local_function_weights.sum()

        # create the workers' models; with weighted averaging, each local
        # objective is weighted by its share of the training data
        self.workers_models = []
        for w_i in range(self.n_workers):
            model_kwargs = dict(optimizer_name=self.optimizer_name,
                                lr_scheduler=self.lr_scheduler_name,
                                initial_lr=self.initial_lr,
                                epoch_size=self.epoch_size)
            if args.use_weighted_average:
                model_kwargs["coeff"] = self.local_function_weights[w_i]
            self.workers_models.append(
                get_model(args.experiment, self.device,
                          self.workers_iterators[w_i], **model_kwargs))

        # average model of all workers
        self.global_model = get_model(args.experiment,
                                      self.device,
                                      self.train_iterator,
                                      epoch_size=self.epoch_size)

        # write initial performance
        self.write_logs()
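
# --- Illustration (not from the original source) ---
# A subclass implements `mix`: run local update steps on every worker,
# then average each worker's parameters with those of its neighbors in
# the communication graph. A minimal sketch of uniform neighborhood
# averaging on flattened parameter tensors, where `network` is a
# networkx graph and `params` maps node id to a 1-D tensor (the function
# name is hypothetical):
import torch

def gossip_average(network, params):
    """Return new parameters after one uniform neighborhood-averaging step."""
    new_params = {}
    for node in network.nodes:
        group = [node] + list(network.neighbors(node))
        new_params[node] = torch.stack([params[v] for v in group]).mean(dim=0)
    return new_params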