def setUpClass(cls):
    # Parse the test configuration once for the whole
    # test class (cls.CURR_DIR is set elsewhere on the class):
    cfg_path = os.path.join(cls.CURR_DIR, 'bird_trainer_tst.cfg')
    cls.config = DottableConfigParser(cfg_path)

    # The root of the train/test data is given in the config
    # relative to the parent of the test directory:
    parent_dir = os.path.join(cls.CURR_DIR, '..')
    cls.data_path = cls.config.getpath('Paths',
                                       'root_train_test_data',
                                       relative_to=parent_dir)
def read_configuration(self, conf_file):
    '''
    Parses config file that describes training parameters,
    various file paths, and how many GPUs different machines
    have. Syntax follows Python's configfile package, which
    includes sections, and attr/val pairs in each section.

    Expected sections:

       o Paths: various file paths for the application
       o Training: holds batch sizes, number of epochs, etc.
       o Parallelism: holds number of GPUs on different machines

    For Parallelism, expect entries like:

       foo.bar.com  = 4
       127.0.0.1    = 5
       localhost    = 3
       172.12.145.1 = 6

    Method identifies which of the entries is 'localhost'
    by comparing against local hostname. Though 'localhost'
    or '127.0.0.1' may be provided.

    Returns a dict of dicts:
        config[section-names][attr-names-within-section]

    Types of standard entries, such as epochs, batch_size,
    etc. are coerced, so that, e.g. config['Training']['epochs']
    will be an int. Clients may add non-standard entries.
    For those the client must convert values from string
    (the type in which values are stored by default) to
    the required type. This can be done the usual way:
    int(...), or using one of the configparser's retrieval
    methods getboolean(), getint(), and getfloat():

        config['Training'].getfloat('learning_rate')

    :param conf_file: path to configuration file, or None
        to fall back on the application defaults
    :type conf_file: str
    :return: a dict of dicts mirroring the config file
        sections/entries
    :rtype: dict[dict]
    :raises ValueError: if a standard entry cannot be coerced
        to its expected type
    '''

    if conf_file is None:
        return self.init_defaults()

    config = DottableConfigParser(conf_file)

    if len(config.sections()) == 0:
        # Config file exists, but empty:
        return self.init_defaults(config)

    # Do type conversion also in other entries that
    # are standard:
    types = {
        'epochs': int,
        'batch_size': int,
        'kernel_size': int,
        'sample_width': int,
        'sample_height': int,
        'seed': int,
        'pytorch_comm_port': int,
        'num_pretrained_layers': int,
        'root_train_test_data': str,
        'net_name': str,
    }
    for section in config.sections():
        for attr_name in config[section].keys():
            try:
                str_val = config[section][attr_name]
                required_type = types[attr_name]
                config[section][attr_name] = required_type(str_val)
            except KeyError:
                # Current attribute is not standard;
                # users of the corresponding value need
                # to do their own type conversion when
                # accessing this configuration entry:
                continue
            except (TypeError, ValueError) as e:
                # BUGFIX: a failing coercion such as int('abc')
                # raises ValueError, not TypeError; the original
                # only caught TypeError, so bad values escaped
                # with an unhelpful raw traceback instead of
                # this message:
                raise ValueError(
                    f"Config file error: {section}.{attr_name} should be convertible to {required_type}"
                ) from e
    return config
def setUp(self):
    """Build a fresh config parser before each test."""
    # Directory that holds this test file; the test
    # configuration lives next to it:
    test_dir = os.path.dirname(__file__)
    cfg_file = os.path.join(test_dir, 'bird_trainer_tst.cfg')
    self.curr_dir = test_dir
    self.config_path = cfg_file
    self.config = DottableConfigParser(cfg_file)
class TestLauncher(unittest.TestCase):
    '''
    Tests for TrainScriptLauncher: world-map parsing,
    compute-landscape construction, and start-command
    generation.
    '''

    @classmethod
    def setUpClass(cls):
        # Fully qualified name of the machine running the
        # tests; used to patch the hard-coded host names
        # in the test world map:
        cls.full_host_name = socket.getfqdn()

    def setUp(self):
        self.curr_dir = os.path.dirname(__file__)
        self.config_path = os.path.join(self.curr_dir, 'bird_trainer_tst.cfg')
        self.config = DottableConfigParser(self.config_path)

    def tearDown(self):
        pass

    #------------------------------------
    # test_read_world_map
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_read_world_map(self):

        launcher = TrainScriptLauncher(unittesting=True)
        tst_world_map_path = os.path.join(self.curr_dir,
                                          'world_map_for_testing.json')
        world_map = launcher.read_world_map(tst_world_map_path)
        expected = {
            'quintus.stanford.edu': {
                "master": 'Yes',
                "gpus": 2
            },
            'quatro.stanford.edu': {
                "gpus": 2,
                "devices": [1, 2]
            }
        }
        self.assertDictEqual(world_map, expected)

    #------------------------------------
    # test_training_script_start_cmd
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_training_script_start_cmd(self):

        launcher = TrainScriptLauncher(unittesting=True)
        tst_world_map_path = os.path.join(self.curr_dir,
                                          'world_map_for_testing.json')
        world_map = launcher.read_world_map(tst_world_map_path)
        world_map = self.substitute_local_hostname(world_map)

        launcher.config = self.config
        launcher.build_compute_landscape(world_map)

        rank = 0
        local_rank = 0
        min_rank_this_machine = 0
        gpus_to_use = 2  # The entry of this machine in the world map

        # Set some instance vars that would
        # normally be set in the the launcher's
        # constructor:
        launcher.WORLD_SIZE = 4
        launcher.MASTER_PORT = self.config.getint('Parallelism', 'master_port')
        launcher.log = LoggingService()

        script_args = {
            'MASTER_ADDR': '127.0.0.1',
            'MASTER_PORT': 5678,
            'RANK': rank,
            'LOCAL_RANK': local_rank,
            'WORLD_SIZE': 4,
            'MIN_RANK_THIS_MACHINE': min_rank_this_machine,
            'config': self.config_path
        }
        script_path = os.path.join(self.curr_dir, '../birds_train_parallel')
        launch_args = {'training_script': script_path}

        # BUGFIX: the signature of training_script_start_cmd is
        # (rank, gpus_used_this_machine, local_rank, ...); the
        # original test passed local_rank and gpus_to_use in
        # swapped positions, contradicting both the method
        # signature and the production call in launch_scripts():
        start_cmd = launcher.training_script_start_cmd(rank,
                                                       gpus_to_use,
                                                       local_rank,
                                                       min_rank_this_machine,
                                                       launch_args,
                                                       script_args)
        print(start_cmd)

    #------------------------------------
    # test_build_gpu_landscape
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_build_gpu_landscape(self):

        launcher = TrainScriptLauncher(unittesting=True)

        # Pretend we are on one of the hostnames
        # that are mentioned in the testing world map:
        launcher.hostname = 'quintus.stanford.edu'

        tst_world_map_path = os.path.join(self.curr_dir,
                                          'world_map_for_testing.json')
        world_map = launcher.read_world_map(tst_world_map_path)
        gpu_landscape = launcher.build_compute_landscape(world_map)

        expected = OrderedDict([('quatro.stanford.edu', {
            'num_gpus': 2,
            'gpu_device_ids': [1, 2],
            'rank_range': [2, 3],
            'start_rank': 2
        }),
                                ('quintus.stanford.edu', {
                                    'num_gpus': 2,
                                    'gpu_device_ids': [0, 1],
                                    'rank_range': [0, 1],
                                    'start_rank': 0
                                })])
        self.assertDictEqual(gpu_landscape, expected)

    # ------------------ Utils ---------------

    def substitute_local_hostname(self, world_map):
        '''
        Replace the hard-coded name 'quintus.stanford.edu' in
        the test world map with the name of the current machine.
        Else some of the methods being tested will complain that
        the local machine is not represented in the map.

        :param world_map: a world_map that contains the
            hard coded key 'quintus.stanford.edu'
        :type world_map: {str : {str : Any}}
        :return: modified world map
        :rtype: {str : {str : Any}}
        '''

        if self.full_host_name == 'quintus.stanford.edu':
            # We happen to be on the quintus machine...
            return world_map
        world_map[self.full_host_name] = world_map['quintus.stanford.edu']
        del world_map['quintus.stanford.edu']
        return world_map
def gather_world_layout(self, launch_args):
    '''
    Read the configuration and the world map, then compute
    a unique rank for each GPU within the group of nodes
    (machines), starting with the master node's first GPU
    as rank 0 (if the master node has a GPU). The resulting
    layout is assigned to self.gpu_landscape.

    Side effects: sets self.config, self.world_map_path,
    self.world_map, self.gpu_landscape, self.MASTER_PORT,
    self.WORLD_SIZE, self.am_master_node.

    :param launch_args: command-line args destined for this
        launch script; must contain key 'config'
    :type launch_args: {str : Any}
    :raises ConfigError: if config file missing, this machine
        absent from the world map, or no master machine defined
    :raises RuntimeError: if no config file or world map entry
    '''

    try:
        config_file = launch_args['config']
        if not os.path.exists(config_file):
            raise ConfigError(
                f"Configuration file {config_file} that was provided as command line arg does not exist."
            )
    except KeyError:
        raise RuntimeError(
            "Error: launch args must include a config file. See config.cfg.Example in project root"
        )

    self.config = DottableConfigParser(config_file)

    # Ensure that the launch_args contains
    # the path to the training script. It
    # will be there if provided on the cmd line.
    # But it may instead be under Path:train_script
    # in the configuration:
    try:
        self.launch_args['training_script']
    except KeyError:
        # The training script was not specified
        # on the command line. Is it in the config
        # file:
        try:
            # BUGFIX: was relative_to=self.curr_dict, an
            # attribute that does not exist (AttributeError);
            # the world_map lookup below uses self.curr_dir:
            self.launch_args['training_script'] = self.config.getpath(
                'Paths', 'train_script', relative_to=self.curr_dir)
        except KeyError:
            raise ValueError(
                "No training script specified on command line or in config file"
            )

    try:
        self.world_map_path = self.config.getpath(
            'Paths', 'world_map', relative_to=self.curr_dir)
    except KeyError:
        raise RuntimeError(
            f"Could not find entry for 'world_map' in config file {config_file}"
        )

    self.world_map = self.read_world_map(self.world_map_path)

    # Ensure that this machine has an
    # entry in the world_map:
    try:
        # Get this machine's info (sub)dict:
        _my_world_info = self.world_map[self.hostname]
    except KeyError:
        raise ConfigError(
            f"World map file does not contain entry for this machine ({self.hostname})"
        )

    self.compute_landscape = {}

    # Whether or not machine running this
    # code is the master node:
    self.am_master_node = False

    # Build gpu_landscape, which maps machine names to
    # the rank range that they occupy via the number of
    # their GPUs, e.g.:
    #
    #    {machine_name1 : [1],
    #     machine_name2 : [0],
    #     machine_name3 : [1,2,3],
    #    }
    self.gpu_landscape = self.build_compute_landscape(self.world_map)

    if self.master_hostname is None:
        raise ConfigError(
            f'No master machine in {self.world_map_path}; one entry needs to be "master" : 1'
        )

    # Common pytorch port is either in the config file,
    # or we use the pytorch default.
    # NOTE(review): stdlib configparser.getint takes the
    # fallback as keyword-only; assumes DottableConfigParser
    # accepts it positionally — verify.
    self.MASTER_PORT = self.config.getint('Parallelism', 'master_port',
                                          self.COMM_PORT)

    # Handle special case: no GPUs anywere, and
    # we are on node 0: in that case start a single
    # copy of the training script. If it is written
    # properly, it will detect the absence of a GPU,
    # and use the CPU. This happens during debugging
    # on a laptop:
    if self.WORLD_SIZE == 0 and self.am_master_node:
        self.WORLD_SIZE += 1

    # If trying to launch on a node without GPUs,
    # when GPUs are available elsewhere, refuse to
    # start the script (is this needed?):
    if not TESTING:
        if self.my_gpus == 0 and self.WORLD_SIZE > 0:
            raise RuntimeError(
                "This machine does not have any GPU, but others do; training script not started."
            )
class TrainScriptLauncher:
    '''
    Spawns one copy of the training script per GPU that this
    machine is to use, according to a JSON5 "world map" file
    that describes all participating machines and their GPUs.
    '''

    #------------------------------------
    # Constructor
    #-------------------

    # Use distributed torch default port:
    COMM_PORT = '5678'

    def __init__(self, unittesting=False):
        '''
        :param unittesting: if True, only set the hostname and
            return, so unittests can exercise individual methods
        :type unittesting: bool
        '''

        self.hostname = socket.getfqdn()
        if unittesting:
            # Let unittests create an instance
            # and call individual methods:
            return

        # Logging to console during launch:
        self.log = LoggingService()

        # Convenience: directory of this
        # script, and project root directory
        curr_dir = Path(__file__).parent
        proj_root = curr_dir.joinpath('../..').resolve()
        self.curr_dir = str(curr_dir)
        self.proj_root = str(proj_root)

        args_parser = BirdsTrainingArgumentsParser(
            formatter_class=BirdsTrainingArgumentsParser.
            BlankLinesHelpFormatter,
            description="PyTorch distributed training launch "
            "helper to spawn multiple distributed "
            "birds_train_parallel.py processes")

        all_args = args_parser.parse_args()
        # Separate the args for this launch script
        # from the args destined for the copies of
        # the train script:
        self.launch_args = all_args['launch_args']
        self.script_args = all_args['script_args']

        # Build the gpu_landscape dict:
        self.gather_world_layout(self.launch_args)

        self.GPUS_USED_THIS_MACHINE = self.gpu_landscape[
            self.hostname]['num_gpus']

    #------------------------------------
    # gather_world_layout
    #-------------------

    def gather_world_layout(self, launch_args):
        '''
        Read the configuration and the world map, then compute
        a unique rank for each GPU within the group of nodes
        (machines), starting with the master node's first GPU
        as rank 0 (if the master node has a GPU). The resulting
        layout is assigned to self.gpu_landscape.

        :param launch_args: command-line args destined for this
            launch script; must contain key 'config'
        :type launch_args: {str : Any}
        :raises ConfigError: config file missing, machine absent
            from world map, or no master machine defined
        :raises RuntimeError: no config file or world map entry
        '''

        try:
            config_file = launch_args['config']
            if not os.path.exists(config_file):
                raise ConfigError(
                    f"Configuration file {config_file} that was provided as command line arg does not exist."
                )
        except KeyError:
            raise RuntimeError(
                "Error: launch args must include a config file. See config.cfg.Example in project root"
            )

        self.config = DottableConfigParser(config_file)

        # Ensure that the launch_args contains
        # the path to the training script. It
        # will be there if provided on the cmd line.
        # But it may instead be under Path:train_script
        # in the configuration:
        try:
            self.launch_args['training_script']
        except KeyError:
            # The training script was not specified
            # on the command line. Is it in the config
            # file:
            try:
                # BUGFIX: was relative_to=self.curr_dict, an
                # attribute that does not exist; self.curr_dir
                # is what the constructor sets:
                self.launch_args['training_script'] = self.config.getpath(
                    'Paths', 'train_script', relative_to=self.curr_dir)
            except KeyError:
                raise ValueError(
                    "No training script specified on command line or in config file"
                )

        try:
            self.world_map_path = self.config.getpath(
                'Paths', 'world_map', relative_to=self.curr_dir)
        except KeyError:
            raise RuntimeError(
                f"Could not find entry for 'world_map' in config file {config_file}"
            )

        self.world_map = self.read_world_map(self.world_map_path)

        # Ensure that this machine has an
        # entry in the world_map:
        try:
            # Get this machine's info (sub)dict:
            _my_world_info = self.world_map[self.hostname]
        except KeyError:
            raise ConfigError(
                f"World map file does not contain entry for this machine ({self.hostname})"
            )

        self.compute_landscape = {}

        # Whether or not machine running this
        # code is the master node:
        self.am_master_node = False

        # Build gpu_landscape, which maps machine names to
        # the rank range that they occupy via the number of
        # their GPUs, e.g.:
        #
        #    {machine_name1 : [1],
        #     machine_name2 : [0],
        #     machine_name3 : [1,2,3],
        #    }
        self.gpu_landscape = self.build_compute_landscape(self.world_map)

        if self.master_hostname is None:
            raise ConfigError(
                f'No master machine in {self.world_map_path}; one entry needs to be "master" : 1'
            )

        # Common pytorch port is either in the config file,
        # or we use the pytorch default.
        # NOTE(review): stdlib configparser.getint takes the
        # fallback as keyword-only; assumes DottableConfigParser
        # accepts it positionally — verify.
        self.MASTER_PORT = self.config.getint('Parallelism', 'master_port',
                                              self.COMM_PORT)

        # Handle special case: no GPUs anywere, and
        # we are on node 0: in that case start a single
        # copy of the training script. If it is written
        # properly, it will detect the absence of a GPU,
        # and use the CPU. This happens during debugging
        # on a laptop:
        if self.WORLD_SIZE == 0 and self.am_master_node:
            self.WORLD_SIZE += 1

        # If trying to launch on a node without GPUs,
        # when GPUs are available elsewhere, refuse to
        # start the script (is this needed?):
        if not TESTING:
            if self.my_gpus == 0 and self.WORLD_SIZE > 0:
                raise RuntimeError(
                    "This machine does not have any GPU, but others do; training script not started."
                )

    #------------------------------------
    # launch_scripts
    #-------------------

    def launch_scripts(self):
        '''
        Launch (possibly) multiple copies of
        the training script. Use world_map.json
        to know how many, and which GPUs this
        machine is to use.

        Each copy is told:

        o MASTER_ADDR  # Where to reach the coordinating process
        o MASTER_PORT  # Corresponding port
        o RANK         # The copy's sequence number, which is
                       # Unique across all participating machines
        o LOCAL_RANK   # Which of this machine's GPU to use (0-origin)
        o WORLD_SIZE   # How many GPUs are used on all machines together
        o GPUS_USED_THIS_MACHINE # Number of GPUs *used* on this
                       # machine, according to the world_map.
        '''

        # This machine's range of ranks:
        rank_range = self.gpu_landscape[self.hostname]['rank_range']
        this_machine_gpu_ids = self.gpu_landscape[
            self.hostname]['gpu_device_ids']
        min_rank_this_machine = self.gpu_landscape[self.hostname]['start_rank']

        local_rank = 0
        # Map from process object to rank (for debug msgs):
        self.who_is_who = OrderedDict()
        for rank in rank_range:

            cmd = self.training_script_start_cmd(
                rank, len(this_machine_gpu_ids), local_rank,
                min_rank_this_machine, self.launch_args, self.script_args)

            # Copy stdin, and give the copy to the subprocess.
            # This enables the subprocess to ask user whether
            # to save training state in case of a cnt-C:
            newstdin = os.fdopen(os.dup(sys.stdin.fileno()))

            # Spawn one training script.
            process = subprocess.Popen(
                cmd,
                stdin=newstdin,
                stdout=None,  # Script inherits this launch
                stderr=None   # ... script's stdout/stderr
            )
            self.who_is_who[process] = rank
            local_rank += 1

        if not self.launch_args['quiet']:
            print(
                f"Node {self.hostname} {os.path.basename(sys.argv[0])}: Num processes launched: {len(self.who_is_who)}"
            )
            if self.am_master_node:
                print(f"Awaiting {self.WORLD_SIZE} process(es) to finish...")
            else:
                print(f"Awaiting {self.my_gpus} process(es) to finish...")

        failed_processes = []
        try:
            for process in self.who_is_who.keys():
                process.wait()
                if process.returncode != 0:
                    failed_processes.append(process)
                continue
        except KeyboardInterrupt:
            # Gently kill the training scripts:
            self.handle_cnt_c()
            pass  # See which processes get the interrupt

        num_failed = len(failed_processes)
        if num_failed > 0:
            print(f"Number of failed training scripts: {num_failed}")
            for failed_process in failed_processes:
                train_script = self.launch_args['training_script']
                script_rank = self.who_is_who[failed_process]
                msg = (
                    f"Training script {train_script} (rank {script_rank}) encountered error(s); see logfile"
                )
                print(msg)

    #------------------------------------
    # training_script_start_cmd
    #-------------------

    def training_script_start_cmd(self, rank, gpus_used_this_machine,
                                  local_rank, min_rank_this_machine,
                                  launch_args, script_args):
        '''
        From provided information, creates a legal
        command string for starting the training script.

        :param rank: rank of the script; i.e. it's process'
            place in the sequence of all train script
            processes across all machines
        :type rank: int
        :param gpus_used_this_machine: number of GPU devices to
            be used, according to the world_map; may be less than
            number of available GPUs
        :type gpus_used_this_machine: int
        :param local_rank: index into the local sequence of GPUs
            for for the GPU that the script is to use
        :type local_rank: int
        :param min_rank_this_machine: the lowest of the ranks among
            the training scripts on this machine
        :type min_rank_this_machine: int
        :param launch_args: command line arguments intended for the
            launch script, as opposed to being destined
            for the train script
        :type launch_args: {str : Any}
        :param script_args: additional args for the train script
        :type script_args: {str : Any}
        '''

        # Build the shell command line,
        # starting with 'python -u':
        cmd = [sys.executable, "-u"]

        cmd.append(launch_args['training_script'])

        # Add the args for the script that were
        # in the command line:
        for arg_name in script_args.keys():
            script_arg_val = script_args[arg_name]
            if script_arg_val is None or arg_name == 'config':
                # Skip over non-specified CLI args:
                continue
            cmd.append(f"--{arg_name}={script_args[arg_name]}")

        # Add the 'secret' args that tell the training
        # script all the communication parameters:
        cmd.extend([
            f"--MASTER_ADDR={self.MASTER_ADDR}",
            f"--MASTER_PORT={self.MASTER_PORT}",
            f"--RANK={rank}",
            f"--LOCAL_RANK={local_rank}",
            f"--MIN_RANK_THIS_MACHINE={min_rank_this_machine}",
            f"--WORLD_SIZE={self.WORLD_SIZE}",
            f"--GPUS_USED_THIS_MACHINE={gpus_used_this_machine}"
        ])

        # Finally, the obligatory non-option arg
        # to the training script: the configuration
        # file:
        config_file_name = script_args['config']
        cmd.append(config_file_name)

        self.log.debug(f"****** Launch: the cmd is {cmd}")
        return cmd

    #------------------------------------
    # read_world_map
    #-------------------

    def read_world_map(self, path):
        '''
        Read the JSON5 world map file, and return
        a corresponding dict. JSON5 allows something like:

        /*
            This is a block comment.
            Notice the lacking quote chars around the keys below.
            The are optional in JSON5
        */
        {quintus.stanford.edu : {
            "master" : Yes
            "gpus" : 2
         },
         quatro.stanford.edu  : {
             "gpus" : 2,
             "devices" : [1,2]
         }
        }

        BUT: JSON5 gets angry at dots in the keys.
        So we first read the file, and try to find
        the machine names. We temporarily replace
        them with an acceptable marker, and then
        convert back.

        :param path: path to world map file
        :type path: string
        '''
        dot_substitute = '___'

        try:
            # Read all the world map file lines:
            with open(path, 'r') as world_map_fd:
                tmp_world_map = world_map_fd.readlines()
        except IOError as e:
            raise IOError(f"World map file at {path} not found") from e

        # Replace occurrences of '.' with dot_substitute:
        new_text = []
        for line in tmp_world_map:
            new_text.append(line.replace('.', dot_substitute))

        # ... and make one string from all the lines:
        json_str = '\n'.join(new_text)

        try:
            # Hopefully, JSON5 will eat it now:
            world_map_almost = json5.loads(json_str)
        except JSONError as e:
            raise JSONError(
                f"World map file at {path} contains bad JSON") from e

        # Need to fix all the dot substitutions.
        # At this point the data structure is
        #    { <machine_name> : {spec_attr1 : val1,
        #                        spec_attr2 : val2,
        #                       }
        #    }

        # Fix the machine names first:
        mach_names_fixed = [
            machine_name.replace(dot_substitute, '.')
            for machine_name in world_map_almost.keys()
        ]

        machine_specs_fixed = []

        # Now dig into each of the nested machine spec
        # dicts, and fix attrs and values there:
        for spec in world_map_almost.values():
            # Spec is a dict nested inside the outer one:
            spec_fixed = {
                key.replace(dot_substitute, '.'): val.replace(
                    dot_substitute, '.') if isinstance(val, str) else val
                for key, val in spec.items()
            }
            machine_specs_fixed.append(spec_fixed)

        # Put it all together:
        world_map = {
            machine_name: spec_dict
            for machine_name, spec_dict in zip(mach_names_fixed,
                                               machine_specs_fixed)
        }

        return world_map

    #------------------------------------
    # build_compute_landscape
    #-------------------

    def build_compute_landscape(self, world_map):
        '''
        Using the world_map.json config file, build
        a dict self.gpu_landscape like this:

            {'machine_name1' : {'start_rank'    : <int>,
                                'num_gpus'      : <int>,
                                'gpu_device_ids': [<int>,<int>,...]
            {'machine_name2' : {'start_rank'    : <int>,
                                'num_gpus'      : <int>,
                                'gpu_device_ids': [<int>,<int>,...]
            }

        Also sets
         o self.master_hostname, the hostname
           running the one process that coordinates all others.
         o self.WORLD_SIZE, number of GPUs used across all machines
         o self.my_gpus, the number of GPUs on this machine

        :param world_map: dict mapping machine names to their specs
        :type world_map: {str : {str : Any}}
        :return: information about how many GPUs are on each node
        :rtype: OrderedDict
        :raises ConfigError: if this machine is missing from the
            world map, or a machine entry lacks a 'gpus' key
        '''

        if self.hostname not in world_map.keys():
            raise ConfigError(
                f"World map does not contain an entry for this machine {self.hostname}"
            )

        # World size is the number of training script processes,
        # which is equal to number of GPUs used on all participating
        # machines combined:

        # Number of GPUs across all machines:
        self.WORLD_SIZE = 0

        self.master_hostname = None

        # Go through the world map, machine (a.k.a. node)
        # one at a time, in alpha order of the machine
        # names to ensure all copies of this script
        # come to the same conclusions about ranks

        # The structure is an OrderedDict(), containing
        # machines alphabetically by name. This discipline
        # is required so that all copies of this launch script
        # (one copy per machine) arrive at the same ordering of
        # GPUs:
        gpu_landscape = OrderedDict({})

        for machine_name in sorted(world_map.keys()):

            # Get dict of info about the machine:
            machine_info = world_map[machine_name]

            try:
                machine_gpus = machine_info['gpus']
            except KeyError:
                # BUGFIX: the original merely printed a complaint
                # here and then used the unbound machine_gpus,
                # crashing with NameError; raise a clear error:
                raise ConfigError(
                    f"World map entry {machine_name} must include a 'gpus' entry; the value may be 0"
                )

            gpu_landscape[machine_name] = {}
            gpu_landscape[machine_name]['num_gpus'] = machine_gpus

            # List of GPU numbers to use is optional
            # in world_maps:
            machine_gpus_to_use = machine_info.get('devices', None)
            if machine_gpus_to_use is None:
                # Use all GPUs on that machine:
                machine_gpus_to_use = list(range(machine_gpus))

            gpu_landscape[machine_name]['gpu_device_ids'] = machine_gpus_to_use

            # Accept all kinds of affirmatives as values:
            # for identification of the master node entry:
            is_master_node = machine_info.get('master', False) \
                in [1, 'True', 'true', 'Yes', 'yes']

            if is_master_node:
                self.master_hostname = machine_name
                if machine_name == self.hostname:
                    self.am_master_node = True
                try:
                    self.MASTER_ADDR = socket.gethostbyname(machine_name)
                except socket.gaierror:
                    # For machines that have no
                    # findable IP address:
                    self.MASTER_ADDR = '127.0.0.1'

            self.WORLD_SIZE += machine_gpus

        # Go through the machine enries in gpu_landscape, and
        # assign rank ranges to each. Must start with
        # the master node, b/c it must start with rank 0.
        # For the master node, it is possible that it has
        # no GPUs:
        master_info = gpu_landscape[self.master_hostname]
        master_info['rank_range'] = list(range(master_info['num_gpus']))
        master_info['start_rank'] = 0
        if len(master_info['rank_range']) == 0:
            # Master has no GPU; it still runs one
            # (CPU-bound) training process at rank 0:
            master_info['rank_range'] = [0]

        # Start assigning more ranks after
        # the GPUs of the master:
        running_rank = master_info['rank_range'][-1] + 1

        for machine_name in gpu_landscape.keys():
            if machine_name == self.master_hostname:
                # We already did the master node
                continue
            mach_info = gpu_landscape[machine_name]
            mach_info['start_rank'] = running_rank
            num_gpus = mach_info['num_gpus']
            # Machines without GPUs still get one rank
            # (a CPU-bound process):
            range_bound = running_rank + (num_gpus if num_gpus > 0 else 1)
            mach_info['rank_range'] = list(range(running_rank, range_bound))
            running_rank += (num_gpus if num_gpus > 0 else 1)

        self.my_gpus = gpu_landscape[self.hostname]['num_gpus']
        self.gpu_landscape = gpu_landscape
        return gpu_landscape

    #------------------------------------
    # handle_cnt_c
    #-------------------

    def handle_cnt_c(self):
        '''
        Terminate all still-running training script
        processes by sending SIGTERM to each, highest
        rank first, master process last.
        '''

        # Line processes up, highest rank first,
        # master process last:
        procs_terminate = sorted([proc for proc in self.who_is_who.keys()],
                                 key=lambda obj: self.who_is_who[obj],
                                 reverse=True)
        for process in procs_terminate:
            # If process is no longer running,
            # forget about it.
            # BUGFIX: the original tested `process.poll is not None`;
            # poll is a bound method, so that expression is always
            # True and every process was skipped as "dead". It must
            # be called:
            if process.poll() is not None:
                # Process dead:
                continue
            process.send_signal(signal.SIGTERM)
            process.wait()