def __init__(self, args):
    """
    Manager for the MATCHA architecture: initializes the base manager, then
    attaches a random topology generator that samples matchings under the
    given communication budget.

    :param args: experiment parameters (must provide `communication_budget`)
    """
    super(MATCHAManager, self).__init__(args)

    # artifacts produced by the topology generator live under loggs/<args>/matcha/
    gml_path = os.path.join("loggs", args_to_string(args),
                            "matcha", "colored_network.gml")
    history_path = os.path.join("loggs", args_to_string(args),
                                "matcha", "matching_history.csv")

    self.topology_generator = RandomTopologyGenerator(
        self.network,
        args.communication_budget,
        network_save_path=gml_path,
        path_to_history_file=history_path,
    )
def __init__(self, args):
    """
    Set up a centralized training manager: builds the network graph, the
    global train/test iterators, a TensorBoard logger and the per-node model
    buffers, then writes the round-0 logs.

    :param args: experiment parameters (device, bz, network_name,
        architecture, log_freq, experiment, ...)
    """
    self.device = args.device
    self.batch_size = args.bz
    self.network = get_network(args.network_name, args.architecture)
    # one rank per graph node, plus one extra rank representing the network manager
    self.world_size = self.network.number_of_nodes() + 1
    self.log_freq = args.log_freq

    # create logger; create the directory explicitly first, for consistency
    # with the abstract network class which does the same before SummaryWriter
    logger_path = os.path.join("loggs", args_to_string(args), args.architecture)
    os.makedirs(logger_path, exist_ok=True)
    self.logger = SummaryWriter(logger_path)

    self.round_idx = 0  # index of the current communication round

    self.train_dir = os.path.join("data", args.experiment, "train")
    self.test_dir = os.path.join("data", args.experiment, "test")
    self.train_path = os.path.join(self.train_dir,
                                   "train" + EXTENSIONS[args.experiment])
    self.test_path = os.path.join(self.test_dir,
                                  "test" + EXTENSIONS[args.experiment])

    self.train_iterator = get_iterator(args.experiment, self.train_path,
                                       self.device, self.batch_size)
    self.test_iterator = get_iterator(args.experiment, self.test_path,
                                      self.device, self.batch_size)

    # one model buffer per rank; presumably used as gather/scatter lists for
    # collective communication of model parameters — TODO confirm against `mix`
    self.gather_list = [get_model(args.experiment, self.device, self.train_iterator)
                        for _ in range(self.world_size)]
    self.scatter_list = [get_model(args.experiment, self.device, self.train_iterator)
                         for _ in range(self.world_size)]

    # print initial logs
    self.write_logs()
"amazon_us": "1e-3", "geantdistance": "1e-3", "exodus": "1e-1", "ebone": "1e-1"} if __name__ == "__main__": for network_name in ["gaia", "amazon_us", "geantdistance", "exodus", "ebone"]: print("{}:".format(network_name)) args = parse_args(["inaturalist", "--network", network_name, "--bz", "16", "--lr", lr_dict[network_name], "--decay", "sqrt", "--local_steps", "1"]) args_string = args_to_string(args) loggs_dir = os.path.join("loggs", args_to_string(args)) loggs_to_json(loggs_dir) loggs_dir_path = os.path.join("loggs", args_to_string(args)) path_to_json = os.path.join("results", "json", "{}.json".format(os.path.split(loggs_dir_path)[1])) with open(path_to_json, "r") as f: data = json.load(f) for architecture in ["centralized", "ring", "matcha"]: values = data['Train/Acc'][architecture] rounds = data["Round"][architecture] for ii, value in enumerate(values): if value > trsh_dict[network_name]:
for rank_ in range(world_size): p = Process(target=init_process, args=(rank_, world_size, args, run)) p.start() processes.append(p) for p in processes: p.join() else: print("Run experiment in sequential setting..") if args.architecture == "centralized": network = CentralizedNetwork(args) elif args.architecture == "matcha" or args.architecture == "matcha+" or\ args.architecture == "matcha+mst" or args.architecture == "matcha+ring" or\ args.architecture == "matcha+delta_mbst": network = MATCHANetwork(args) elif args.architecture == "dynamic_ring": network = RingNetwork(args) else: network = Peer2PeerNetwork(args) for k in range(args.n_rounds): network.mix() network.write_logs() loggs_dir = os.path.join("loggs", args_to_string(args)) loggs_to_json(loggs_dir)
def make_plots(args, mode=0):
    """
    Plot every logged metric for all architectures of one experiment.

    Reads the aggregated JSON produced by `loggs_to_json` and, for each tag in
    `TAGS`, draws one figure comparing the architectures. With ``mode == 0``
    the x-axis is wall-clock time (rounds scaled by the per-architecture cycle
    time); otherwise the x-axis is the round index.

    :param args: experiment parameters (experiment, network_name, ...)
    :param mode: 0 -> metric vs. time, anything else -> metric vs. rounds
    """
    os.makedirs(os.path.join("results", "plots", args.experiment), exist_ok=True)

    loggs_dir_path = os.path.join("loggs", args_to_string(args))
    path_to_json = os.path.join(
        "results", "json",
        "{}.json".format(os.path.split(loggs_dir_path)[1]))

    with open(path_to_json, "r") as f:
        data = json.load(f)

    # fig, axs = plt.subplots(2, 5, figsize=(20, 8))
    x_lim = np.inf
    for idx, tag in enumerate(TAGS):
        fig = plt.figure(figsize=(12, 10))
        for architecture in [
                "centralized", "matcha", "mst", "mct_congest", "ring"
        ]:
            try:
                values = data[tag][architecture]
                rounds = data["Round"][architecture]
            except KeyError:
                # this architecture was not run for this experiment; skip it
                # (was a bare `except:`, which also hid real errors)
                continue
            if mode == 0:
                min_len = min(len(values), len(rounds))
                # NOTE(review): `network_name` is read from module scope here,
                # not from `args` — confirm it is set before calling.
                if rounds[-1] * cycle_time_dict[network_name][
                        architecture] < x_lim:
                    x_lim = rounds[-1] * cycle_time_dict[network_name][
                        architecture]
                # truncate x to min_len as well, otherwise x and y lengths
                # can differ and plt.plot raises
                plt.plot(cycle_time_dict[network_name][architecture] *
                         np.array(rounds[:min_len]) / 1000,
                         values[:min_len],
                         label=labels_dict[architecture],
                         linewidth=5.0)
                plt.grid(True, linewidth=2)
                plt.xlim(0, x_lim / 1000)
                plt.ylabel("{}".format(tag_dict[tag]), fontsize=50)
                plt.xlabel("time (s)", fontsize=50)
                plt.tick_params(axis='both', labelsize=40)
                plt.tick_params(axis='x')
                plt.legend(fontsize=35)
            else:
                min_len = min(len(values), len(rounds))
                if rounds[:min_len][-1] < x_lim:
                    x_lim = rounds[:min_len][-1]
                plt.plot(rounds[:min_len],
                         values[:min_len],
                         label=labels_dict[architecture],
                         linewidth=5.0)
                plt.ylabel("{}".format(tag_dict[tag]), fontsize=50)
                plt.xlabel("Rounds", fontsize=50)
                plt.tick_params(axis='both', labelsize=40)
                plt.legend(fontsize=35)
                plt.grid(True, linewidth=2)
                plt.xlim(0, x_lim)
        if mode == 0:
            fig_path = os.path.join(
                "results", "plots", args.experiment,
                "{}_{}_vs_time.png".format(args.network_name, path_dict[tag]))
            plt.savefig(fig_path, bbox_inches='tight')
        else:
            fig_path = os.path.join(
                "results", "plots", args.experiment,
                "{}_{}_vs_iteration.png".format(args.network_name,
                                                path_dict[tag]))
            plt.savefig(fig_path, bbox_inches='tight')
        # release the figure: one figure per tag was created and never closed,
        # which accumulates memory across the loop
        plt.close(fig)
def __init__(self, args):
    """
    Abstract class representing a network of workers collaborating to train a
    machine learning model; each worker has a local model and a local data
    iterator. Subclasses should implement `mix` to specify how the
    communication is done.

    :param args: parameters defining the network (device, bz, network_name,
        architecture, local_steps, log_freq, fit_by_epoch, lr, optimizer,
        decay, experiment, use_weighted_average, ...)
    """
    self.args = args
    self.device = args.device
    self.batch_size = args.bz
    # graph whose nodes are the workers and whose edges are communication links
    self.network = get_network(args.network_name, args.architecture)
    self.n_workers = self.network.number_of_nodes()
    self.local_steps = args.local_steps
    self.log_freq = args.log_freq
    self.fit_by_epoch = args.fit_by_epoch
    self.initial_lr = args.lr
    self.optimizer_name = args.optimizer
    self.lr_scheduler_name = args.decay

    # create logger (directory must exist before SummaryWriter uses it)
    logger_path = os.path.join("loggs", args_to_string(args), args.architecture)
    os.makedirs(logger_path, exist_ok=True)
    self.logger = SummaryWriter(logger_path)

    self.round_idx = 0  # index of the current communication round

    # get data loaders; iNaturalist splits are stored per network topology,
    # every other experiment uses a single shared train/test split
    if args.experiment == "inaturalist":
        self.train_dir = os.path.join("data", args.experiment,
                                      "train_{}".format(args.network_name))
        self.test_dir = os.path.join("data", args.experiment,
                                     "test_{}".format(args.network_name))
    else:
        self.train_dir = os.path.join("data", args.experiment, "train")
        self.test_dir = os.path.join("data", args.experiment, "test")

    self.train_path = os.path.join(self.train_dir,
                                   "train" + EXTENSIONS[args.experiment])
    self.test_path = os.path.join(self.test_dir,
                                  "test" + EXTENSIONS[args.experiment])

    # global iterators used for evaluation/logging of the averaged model
    self.train_iterator = get_iterator(args.experiment, self.train_path,
                                       self.device, self.batch_size)
    self.test_iterator = get_iterator(args.experiment, self.test_path,
                                      self.device, self.batch_size)

    # one local iterator per worker; worker k reads "<train_dir>/k<ext>"
    self.workers_iterators = []
    self.local_function_weights = np.zeros(self.n_workers)
    train_data_size = 0
    for worker_id in range(self.n_workers):
        data_path = os.path.join(
            self.train_dir,
            str(worker_id) + EXTENSIONS[args.experiment])
        self.workers_iterators.append(
            get_iterator(args.experiment, data_path, self.device,
                         self.batch_size))
        # NOTE: len(iterator) counts batches, while the weight below uses
        # len(iterator.dataset), i.e. the number of samples
        train_data_size += len(self.workers_iterators[-1])
        self.local_function_weights[worker_id] = len(
            self.workers_iterators[-1].dataset)

    # average number of batches per worker (used by the lr scheduler)
    self.epoch_size = int(train_data_size / self.n_workers)
    # normalize to a probability vector over workers (weights sum to 1)
    self.local_function_weights = self.local_function_weights / self.local_function_weights.sum(
    )

    # create workers models; with use_weighted_average each local objective
    # is scaled by its data-proportional coefficient
    if args.use_weighted_average:
        self.workers_models = [
            get_model(args.experiment,
                      self.device,
                      self.workers_iterators[w_i],
                      optimizer_name=self.optimizer_name,
                      lr_scheduler=self.lr_scheduler_name,
                      initial_lr=self.initial_lr,
                      epoch_size=self.epoch_size,
                      coeff=self.local_function_weights[w_i])
            for w_i in range(self.n_workers)
        ]
    else:
        self.workers_models = [
            get_model(args.experiment,
                      self.device,
                      self.workers_iterators[w_i],
                      optimizer_name=self.optimizer_name,
                      lr_scheduler=self.lr_scheduler_name,
                      initial_lr=self.initial_lr,
                      epoch_size=self.epoch_size)
            for w_i in range(self.n_workers)
        ]

    # average model of all workers
    self.global_model = get_model(args.experiment,
                                  self.device,
                                  self.train_iterator,
                                  epoch_size=self.epoch_size)

    # write initial performance
    self.write_logs()