def get_cluster_from_args(args, gpus):
    """Build the cluster description from command line arguments.

    Args:
        args: parsed launch arguments; only ``args.ips`` (a comma separated
            list of node ips) is read here.
        gpus: the selected gpus; one port is reserved per entry.

    Returns:
        Whatever ``get_cluster`` returns for the resolved node ips, the local
        node ip, the chosen ports and ``gpus``.

    Raises:
        AssertionError: if the local ip is not one of ``args.ips``.
        ValueError: if ``FLAGS_START_PORT`` is set but is not an integer.
    """
    node_ips = [x.strip() for x in args.ips.split(',')]
    if len(node_ips) == 1:
        node_ip = node_ips[0]
    else:
        _, node_ip = get_host_name_ip()

    assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
        % (node_ip, node_ips)
    node_rank = node_ips.index(node_ip)

    logger.debug(
        "parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
            node_ips, node_ip, node_rank))

    free_ports = None
    if not cloud_utils.use_paddlecloud() and len(
            node_ips) <= 1 and os.environ.get('FLAGS_START_PORT') is None:
        # Single local node without an explicit start port: probe for ports.
        free_ports = find_free_ports(len(gpus))
        if free_ports is not None:
            free_ports = list(free_ports)
    else:
        start_port = 6070
        if os.environ.get('FLAGS_START_PORT') is not None:
            # Environment values are strings; range() below needs an int.
            start_port = int(os.environ.get('FLAGS_START_PORT'))

        free_ports = list(range(start_port, start_port + len(gpus)))

    return get_cluster(node_ips, node_ip, free_ports, gpus)
def get_cluster_from_args(args, device_mode, devices_per_proc):
    """Build the cluster description from command line arguments.

    Args:
        args: parsed launch arguments; only ``args.ips`` (a comma separated
            list of node ips) is read here.
        device_mode: forwarded unchanged to ``get_cluster``.
        devices_per_proc: one port is reserved per entry; also forwarded to
            ``get_cluster``.

    Returns:
        Whatever ``get_cluster`` returns for the resolved node ips, the local
        node ip and the per-node ``"ip:port"`` endpoint lists.

    Raises:
        AssertionError: if the local ip is not one of ``args.ips``.
    """
    node_ips = [piece.strip() for piece in args.ips.split(',')]

    if len(node_ips) == 1:
        node_ip = node_ips[0]
    else:
        _hostname, node_ip = get_host_name_ip()

    assert node_ip in node_ips, \
        "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips)
    node_rank = node_ips.index(node_ip)

    logger.debug(
        "parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
            node_ips, node_ip, node_rank))

    single_local_node = (not cloud_utils.use_paddlecloud()
                         and len(node_ips) <= 1)
    env_start_port = os.environ.get('FLAGS_START_PORT')

    if single_local_node and env_start_port is None:
        # No explicit start port on a single local node: probe for ports.
        free_ports = find_free_ports(len(devices_per_proc))
        if free_ports is not None:
            free_ports = list(free_ports)
    else:
        # Consecutive ports from FLAGS_START_PORT, or 6070 when it is unset.
        first_port = 6070 if env_start_port is None else int(env_start_port)
        free_ports = list(
            range(first_port, first_port + len(devices_per_proc)))

    # Every node gets the same list of ports, prefixed with its own ip.
    trainer_endpoints = [["%s:%d" % (ip, port) for port in free_ports]
                         for ip in node_ips]

    return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
                       devices_per_proc)
def launch_collective(args):
    """Start the local trainers for a collective job and wait for them.

    The cluster/pod description comes from the cloud environment when running
    on paddlecloud with more than one trainer, otherwise from ``args``. A
    temporary directory is created and exported to the trainers through
    ``PADDLE_GLOO_FS_PATH``; it is removed when this function exits, whether
    normally or through an exception.

    Args:
        args: parsed launch arguments (``ips``, ``training_script``,
            ``training_script_args`` and ``log_dir`` are read here).

    Raises:
        ValueError: if ``FLAGS_START_PORT`` is set but is not an integer.
    """
    # parse arguments, used for cloud-single-machine and local
    (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format(
        trainers_num, device_mode, devices_per_proc))

    cluster = None
    pod = None

    start_port = 6170
    if os.environ.get('FLAGS_START_PORT') is not None:
        # Environment values are strings; keep the same type as the default.
        start_port = int(os.environ.get('FLAGS_START_PORT'))
    if cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(
            args.ips, device_mode, devices_per_proc, start_port)
        logger.debug("get cluster from cloud:{}".format(cluster))
    else:
        # trainers_num = 1 or not use paddlecloud ips="a,b"
        cluster, pod = get_cluster_from_args(args, device_mode,
                                             devices_per_proc)
        logger.debug("get cluster from args:{}".format(cluster))

    global_envs = os.environ.copy()
    gloo_rendezvous_dir = tempfile.mkdtemp()
    try:
        # add gloo env
        global_envs["PADDLE_WITH_GLOO"] = str(
            os.getenv("PADDLE_WITH_GLOO", "0"))
        global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
        global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir

        procs = start_local_trainers(
            cluster,
            pod,
            training_script=args.training_script,
            training_script_args=args.training_script_args,
            log_dir=args.log_dir,
            envs=global_envs)

        while True:
            alive = watch_local_trainers(procs, cluster.trainers_nranks())

            if not alive:
                logger.info("Local processes completed.")
                logger.debug("POD info:{}".format(pod))
                break

            time.sleep(3)
    finally:
        # mkdtemp() leaves deletion to the caller; clean up on errors too.
        if os.path.exists(gloo_rendezvous_dir):
            shutil.rmtree(gloo_rendezvous_dir)
def launch_ps(args, distribute_mode):
    """Launch a parameter-server job.

    On paddlecloud, plain PS mode is handed to ``direct_start`` and nothing
    else is done. For heterogeneous PS mode on paddlecloud the worker, server
    and heter-worker endpoints are first copied from the environment onto
    ``args``. In every remaining case a ``ParameterServerLauncher`` is
    created and started.

    Args:
        args: parsed launch arguments; may be mutated (``workers``,
            ``servers``, ``heter_workers``).
        distribute_mode: a ``DistributeMode`` value.
    """
    on_cloud = cloud_utils.use_paddlecloud()

    if on_cloud:
        # for ps-cpu on paddlecloud
        if distribute_mode == DistributeMode.PS:
            direct_start(args)
            return

        if distribute_mode == DistributeMode.PS_HETER:
            cloud_ps_heter_env_set(args)
            endpoint_sources = (
                ("workers", "PADDLE_TRAINER_ENDPOINTS"),
                ("servers", "PADDLE_PSERVERS_IP_PORT_LIST"),
                ("heter_workers", "PADDLE_HETER_TRAINER_IP_PORT_LIST"),
            )
            for attr_name, env_key in endpoint_sources:
                setattr(args, attr_name, os.getenv(env_key))

    launcher = ParameterServerLauncher(args, distribute_mode)
    launcher.start_ps()
def launch_collective(args):
    """Start the local trainers for a collective job and wait for them.

    The cluster/pod description comes from the cloud environment when running
    on paddlecloud with more than one trainer, otherwise from ``args``.

    Args:
        args: parsed launch arguments (``gpus``, ``ips``,
            ``training_script``, ``training_script_args`` and ``log_dir`` are
            read here).

    Raises:
        ValueError: if ``FLAGS_START_PORT`` is set but is not an integer.
    """
    # parse arguments, used for cloud-single-machine and local
    gpus = get_gpus(args.gpus)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainerss_num:{} gpus:{}".format(
        trainers_num, gpus))

    cluster = None
    pod = None

    start_port = 6170
    if os.environ.get('FLAGS_START_PORT') is not None:
        # Environment values are strings; keep the same type as the default.
        start_port = int(os.environ.get('FLAGS_START_PORT'))
    if cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus,
                                                     start_port)
        logger.debug("get cluster from cloud:{}".format(cluster))
    else:
        # trainers_num = 1 or not use paddlecloud ips="a,b"
        cluster, pod = get_cluster_from_args(args, gpus)
        logger.debug("get cluster from args:{}".format(cluster))

    procs = start_local_trainers(
        cluster,
        pod,
        training_script=args.training_script,
        training_script_args=args.training_script_args,
        log_dir=args.log_dir)

    # Poll until no local trainer is alive any more.
    while True:
        alive = watch_local_trainers(procs, cluster.trainers_nranks())

        if not alive:
            logger.info("Local processes completed.")
            logger.debug("POD info:{}".format(pod))
            break

        time.sleep(3)
def _get_subprocess_env_list(nprocs, options):
    """Build the list of per-process environments for spawned trainers.

    Args:
        nprocs: number of processes that will be spawned.
        options: dict of spawn options. Keys read here: ``ips`` /
            ``cluster_node_ips``, ``gpus`` / ``xpus`` / ``selected_devices``,
            ``node_ip``, ``started_port``, ``use_paddlecloud`` and
            ``print_config``.

    Returns:
        A list with one ``_prepare_trainer_env`` result per trainer of the
        local pod.

    Raises:
        RuntimeError: if no devices were selected explicitly and fewer
            devices are visible than ``nprocs``.
        ValueError: if the explicit selection does not contain exactly
            ``nprocs`` devices, or names a device that is not visible.
    """

    def _first_option(*keys):
        # Value of the first key that is present with a non-None value.
        for key in keys:
            value = options.get(key, None)
            if value is not None:
                return value
        return None

    def _visible_ids(env_name, count_devices):
        # Device ids from the visibility env var, or 0..count-1 when that
        # variable is unset or empty. The count is only queried when needed.
        raw = os.getenv(env_name, None)
        if raw is None or raw == "":
            return [str(idx) for idx in six.moves.range(count_devices())]
        return raw.split(',')

    # get args from kwargs
    args = ParallelEnvArgs()

    # deal with `ips`
    args.cluster_node_ips = _first_option('ips', 'cluster_node_ips')
    if args.cluster_node_ips is None:
        args.cluster_node_ips = "127.0.0.1"

    # deal with `gpus` or `xpus`
    # set default selected devices(gpus or xpus)
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because the FLAGS_selected_gpus or FLAGS_selected_xpus may be used in other place,
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu or xpu card id
    if core.is_compiled_with_cuda():
        args.selected_devices = _first_option('gpus', 'selected_devices')
        visible = _visible_ids("CUDA_VISIBLE_DEVICES",
                               lambda: core.get_cuda_device_count())

        if args.selected_devices is None:
            if len(visible) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`CUDA_VISIBLE_DEVICES` is correctly configured." %
                    (len(visible), nprocs))
            # Default to the first `nprocs` visible devices.
            args.selected_devices = ",".join(
                [str(visible[pos]) for pos in range(0, nprocs)])
        else:
            requested = args.selected_devices.split(',')
            if len(requested) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `gpus` arguments are passed." %
                    (len(requested), nprocs))
            for card in requested:
                if card not in visible:
                    raise ValueError("The selected gpu card %s cannot found in "
                                     "CUDA_VISIBLE_DEVICES (%s)." %
                                     (card, ",".join(visible)))
    elif core.is_compiled_with_xpu():
        args.selected_devices = _first_option('xpus', 'selected_devices')
        visible = _visible_ids("XPU_VISIBLE_DEVICES",
                               lambda: core.get_xpu_device_count())

        if args.selected_devices is None:
            if len(visible) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`XPU_VISIBLE_DEVICES` is correctly configured." %
                    (len(visible), nprocs))
            # Default to the first `nprocs` visible devices.
            args.selected_devices = ",".join(
                [str(visible[pos]) for pos in range(0, nprocs)])
        else:
            requested = args.selected_devices.split(',')
            if len(requested) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `xpus` arguments are passed." %
                    (len(requested), nprocs))
            for card in requested:
                if card not in visible:
                    raise ValueError("The selected xpu card %s cannot found in "
                                     "XPU_VISIBLE_DEVICES (%s)." %
                                     (card, ",".join(visible)))

    # set other inner args
    args.node_ip = options.get('node_ip', None)
    if args.node_ip is None:
        args.node_ip = _get_node_ip(args.cluster_node_ips)

    args.started_port = options.get('started_port', None)

    args.use_paddlecloud = options.get('use_paddlecloud', None)
    if args.use_paddlecloud is None:
        args.use_paddlecloud = use_paddlecloud()

    # get cluster and pod config
    cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    processes_env_list = [
        _prepare_trainer_env(cluster, trainer) for trainer in pod.trainers
    ]

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list
def get_cluster_info(args):
    """Resolve the ``(cluster, pod)`` description for this launch.

    The source of the description is chosen in this order:

    1. auto mapping (``args.enable_auto_mapping``), with or without an
       existing rank mapping file; several ``PADDLE_*`` environment
       variables are exported in this case;
    2. paddlecloud, when it is in use and there is more than one trainer;
    3. the ascend rank table, when the device mode is ``ASCEND_NPU``;
    4. the command line arguments otherwise.

    Args:
        args: parsed launch arguments.

    Returns:
        The ``(cluster, pod)`` pair produced by the selected source.

    Raises:
        AssertionError: if auto mapping is enabled without
            ``args.cluster_topo_path``.
        ValueError: if ``FLAGS_START_PORT`` is set but is not an integer.
    """
    # parse arguments, used for cloud-single-machine and local
    if args.backend == 'gloo':
        cpuonly_check(args)
    if args.enable_auto_mapping:
        (device_mode, devices_per_proc) = (DeviceMode.GPU, [])
    else:
        (device_mode,
         devices_per_proc) = launch_utils.get_device_proc_info(args)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format(
        trainers_num, device_mode, devices_per_proc))

    cluster = None
    pod = None

    start_port = 6170
    if os.environ.get('FLAGS_START_PORT') is not None:
        # Environment values are strings; keep the same type as the default.
        start_port = int(os.environ.get('FLAGS_START_PORT'))
    # auto mapping between processes and devices for auto-parallel
    # NOTE: the explicit comparison is kept so that branch selection is
    # exactly what it was before.
    if args.enable_auto_mapping == True:
        assert args.cluster_topo_path is not None, \
            "The cluster topology must be provied when enabling auto mapping."
        rank_mapping_path = args.rank_mapping_path or os.getenv(
            "PADDLE_RANK_MAPPING_PATH")
        if not rank_mapping_path:
            # No mapping file yet: ask for one in the current directory.
            os.environ["PADDLE_NEED_RANK_MAPPING"] = str(True)
            os.environ["PADDLE_ENABLE_ELASTIC"] = str(
                enable_elastic(args, device_mode))
            cwd = pathlib.Path().resolve()
            rank_mapping_path = os.path.join(cwd,
                                             "auto_parallel_rank_mapping.json")
            os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path)

            original_args = sys.argv[1:]
            os.environ["PADDLE_ORIGINAL_CMD_ARGS"] = " ".join(original_args)
            os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(
                args.cluster_topo_path)
            os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str(
                args.enable_auto_mapping)
            cluster, pod = launch_utils.get_mapped_cluster_from_args_without_rank_mapping(
                args, device_mode)
        else:
            os.environ["PADDLE_NEED_RANK_MAPPING"] = str(False)
            os.environ["PADDLE_ENABLE_ELASTIC"] = str(
                enable_elastic(args, device_mode))

            os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(
                args.cluster_topo_path)
            os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path)
            os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str(
                args.enable_auto_mapping)
            cluster, pod = launch_utils.get_mapped_cluster_from_args_with_rank_mapping(
                args, device_mode)
    elif cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(
            args.ips, device_mode, devices_per_proc, start_port)
        logger.debug("get cluster from cloud:{}".format(cluster))
    elif device_mode == DeviceMode.ASCEND_NPU:
        # for ascend
        cluster, pod = ascend_utils.get_cloud_cluster(
            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
            device_mode=device_mode,
            start_port=start_port)
    else:
        # trainers_num = 1 or not use paddlecloud ips="a,b"
        cluster, pod = get_cluster_from_args(args, device_mode,
                                             devices_per_proc)
        logger.debug("get cluster from args:{}".format(cluster))
    return cluster, pod
def __init__(self, args, etcd_client):
    """Read the elastic configuration and, if enabled, register with etcd.

    Configuration comes from ``args`` first and from environment variables
    as a fallback. When no usable server address (one containing ``:``),
    job name or node count is available, ``self.enable`` is set to False
    and the method returns before ``etcd_client`` is used at all.

    Otherwise the method stores the client, writes the job prefix, registers
    this host under a lease, starts a daemon heartbeat thread and installs
    watch callbacks for the node list and the endpoints key.

    Args:
        args: parsed launch arguments (``elastic_server``, ``job_id``,
            ``np``, ``host``, ``scale``, ``force`` and ``ips`` are read
            here; the object is also passed to
            ``launch_utils.get_device_proc_info``).
        etcd_client: client object used for every etcd operation below.
    """
    self.args = args
    server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
    name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID')
    self.min_np, self.max_np = self._parse_np(args.np)
    host = args.host or os.getenv('POD_IP')
    # NOTE(review): `scale` and `force` are not used again in this method.
    scale = args.scale or int(os.getenv('PADDLE_ELASTIC_SCALE', 0))
    force = args.force or os.getenv('PADDLE_ELASTIC_FORCE')

    self.host = host if host else self._get_host()

    (self.device_mode,
     self.devices_per_proc) = launch_utils.get_device_proc_info(args)

    self.elastic_timeout = int(
        os.getenv('PADDLE_ELASTIC_TIMEOUT', ELASTIC_TIMEOUT))
    elastic_ttl = int(os.getenv('PADDLE_ELASTIC_TTL', ELASTIC_TTL))

    self.start_port = None
    if cloud_utils.use_paddlecloud():
        # On paddlecloud everything is taken from the environment.
        self.trainers = os.getenv('PADDLE_TRAINERS', '')
        # NOTE(review): str.split(",") returns at least one element, so
        # self.np is 1 (not 0) even when the trainers string is empty.
        self.np = len(self.trainers.split(","))
        self.start_port = int(os.getenv("PADDLE_PORT", "6170"))
        self.dist_endpoints = os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS', '')
        trainer_endpoints = os.getenv('PADDLE_TRAINER_ENDPOINTS', '')
        self.trainer_endpoints_list = trainer_endpoints.split(",")
    else:
        # Otherwise endpoints are derived from the ip list and start port.
        self.trainers = args.ips or os.getenv('PADDLE_TRAINERS', '')
        node_ips = self.trainers.split(",")
        self.np = len(node_ips)
        self.start_port = int(os.getenv("FLAGS_START_PORT", "6170"))
        self.dist_endpoints = self._host_to_endpoints(
            node_ips, self.devices_per_proc, self.start_port)
        self.trainer_endpoints_list = [
            "%s:%d" % (ip, self.start_port) for ip in node_ips
        ]

    # Identity of this node as it is stored in etcd: "host:start_port".
    self.curr_host = "%s:%d" % (self.host, self.start_port)
    logger.info(f'start job with np={self.np}')
    logger.info(
        f"trainers={self.trainers}, trainer_endpoints_list={self.trainer_endpoints_list}"
    )

    # auto correct the value of elastic_level
    # 1: Fault tolerant, 2: Elastic
    self.elastic_level = int(
        os.getenv('PADDLE_ELASTIC_FAULT_TOLERANC_LEVEL',
                  ElasticLevel.FAULT_TOLERANCE))
    # The np range overrides the level read from the environment.
    if self.min_np == self.max_np or \
            (self.min_np > 0 and self.max_np == 0):
        self.elastic_level = ElasticLevel.FAULT_TOLERANCE
        logger.info(f'start job with ElasticLevel.FAULT_TOLERANCE')
    if self.min_np > 0 and self.max_np > self.min_np:
        self.elastic_level = ElasticLevel.ELASTIC
        logger.info(f'start job with ElasticLevel.ELASTIC')

    # compatible with kubernetes service discovery
    if not server and os.getenv(
            'PADDLE_ELASTIC_ETCD_SERVICE_HOST') and os.getenv(
                'PADDLE_ELASTIC_ETCD_SERVICE_PORT'):
        server = '{}:{}'.format(
            os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_HOST'),
            os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_PORT'))

    logger.debug('init with server {} host {}'.format(server, host))

    self.hosts = []
    self.stopped = False

    self.sigint = 0
    self.need_sync = False

    self.elastic_startup_time = None

    # Without a "host:port" server, a job name and a node count, elastic
    # mode stays off and nothing below (etcd access, threads) is executed.
    if not server or ':' not in server or not name or not self.np:
        logger.info(
            'Elastic is not enabled with server {} name {} and np {}'.
            format(server, name, self.np))
        self.enable = False
        return
    else:
        self.enable = True

    self.etcd = etcd_client

    # etcd data
    self.prefix = "/paddle/" + name
    self.node_prefix = self.prefix + '/nodes'
    self.np_path = self.prefix + '/np'
    self.endpoints_path = self.prefix + '/endpoints'

    # Key of this node: six random lowercase letters plus the current time.
    node_tag = ''.join(
        random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6))
    self.host_path = '{}/{}{}'.format(self.node_prefix, node_tag,
                                      time.time())
    ''' 0 group mode, be aware of healthy status of other workers 1 decouple mode, check own status only '''
    self.etcd.put(self.prefix, b'0')

    # register callback
    def host_call_back(event):
        # Re-read the node list (de-duplicated) and flag that a sync is
        # needed; the `event` argument itself is not inspected.
        self.hosts = [
            six.ensure_str(i[0])
            for i in self.etcd.get_prefix(self.node_prefix)
        ]
        self.hosts = list(set(self.hosts)) if self.hosts else self.hosts
        logger.info(
            f"host_call_back curr_host={self.curr_host}, hosts:{self.hosts}"
        )
        self.need_sync = True
        self.elastic_startup_time = None

    host_watch = self.etcd.add_watch_prefix_callback(
        self.node_prefix, host_call_back)
    host_lease = self.etcd.lease(elastic_ttl)

    # register etcd lease heartbeat
    def lease_heartbeat():
        # Refresh the lease every elastic_ttl / 3 seconds and re-register
        # this host if it is missing from the node list. Any exception is
        # logged and ends the loop.
        while True:
            try:
                host_lease.refresh()

                hosts = [
                    six.ensure_str(i[0])
                    for i in self.etcd.get_prefix(self.node_prefix)
                ]
                hosts = list(set(hosts)) if hosts else hosts
                logger.info(
                    f"[lease_heartbeat] curr_host={self.curr_host}, hosts={hosts}"
                )
                if self.curr_host not in hosts:
                    logger.info(
                        f"[lease_heartbeat] register host={self.curr_host}"
                    )
                    self.etcd.put(self.host_path,
                                  six.b(self.curr_host),
                                  lease=host_lease)
            except Exception as e:
                logger.error(
                    "[lease_heartbeat] internal error:{} {}".format(
                        e, traceback.format_exc()))
                break
            time.sleep(elastic_ttl / 3)

    keepalived_thread = threading.Thread(name='lease_heartbeat',
                                         target=lease_heartbeat,
                                         daemon=True)
    keepalived_thread.start()

    # Initial registration of this host under the lease.
    self.etcd.put(self.host_path, six.b(self.curr_host), lease=host_lease)

    # endpoints handle DISTRIBUTED_TRAINER_ENDPOINTS and PADDLE_TRAINERS
    # Both values are stored in one key, separated by "|".
    self.etcd.put(
        self.endpoints_path,
        six.b('{}|{}'.format(self.dist_endpoints, self.trainers)))

    def endpoints_call_back(event):
        # Ignored while no endpoints are set locally; otherwise the stored
        # "endpoints|trainers" value replaces both attributes.
        if not self.dist_endpoints:
            return
        edps = six.ensure_str(self.etcd.get(self.endpoints_path)[0] or '')
        self.dist_endpoints, self.trainers = edps.split('|')
        logger.info("set DISTRIBUTED_TRAINER_ENDPOINTS {} ".format(
            self.dist_endpoints))
        logger.info("set PADDLE_TRAINERS {} ".format(self.trainers))

    endpoints_watch = self.etcd.add_watch_callback(self.endpoints_path,
                                                   endpoints_call_back)

    self.watches = [host_watch, endpoints_watch]
    self.launcher = None
def _resolve_spawn_devices(nprocs, options, option_key, env_name,
                           count_devices, label):
    """Return the comma separated device ids to use for ``nprocs`` processes.

    Args:
        nprocs: number of processes that will be spawned.
        options: dict of spawn options; ``option_key`` is looked up first,
            then ``selected_devices``.
        option_key: device-specific option name (``gpus``/``xpus``/``mlus``).
        env_name: name of the ``*_VISIBLE_DEVICES`` environment variable.
        count_devices: zero-argument callable returning the device count;
            only called when ``env_name`` is unset or empty.
        label: device name used in error messages (``gpu``/``xpu``/``mlu``).

    Raises:
        RuntimeError: no explicit selection and fewer visible devices than
            ``nprocs``.
        ValueError: explicit selection whose size differs from ``nprocs`` or
            that names a device which is not visible.
    """
    selected = options.get(option_key, None)
    if selected is None:
        selected = options.get('selected_devices', None)

    env_devices = os.getenv(env_name, None)
    if env_devices is None or env_devices == "":
        env_devices_list = [
            str(x) for x in six.moves.range(count_devices())
        ]
    else:
        env_devices_list = env_devices.split(',')

    if selected is None:
        if len(env_devices_list) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`%s` is correctly configured." %
                (len(env_devices_list), nprocs, env_name))
        # Default to the first `nprocs` visible devices.
        return ",".join(
            [str(env_devices_list[x]) for x in range(0, nprocs)])

    selected_device_list = selected.split(',')
    if len(selected_device_list) != nprocs:
        raise ValueError(
            "The number of selected devices(%s) is not equal to "
            "the number of spawn processes(%d), please ensure that the "
            "correct `nprocs` and `%s` arguments are passed." %
            (len(selected_device_list), nprocs, option_key))
    for card_id in selected_device_list:
        if card_id not in env_devices_list:
            raise ValueError(
                "The selected %s card %s cannot found in %s (%s)." %
                (label, card_id, env_name, ",".join(env_devices_list)))
    return selected


def _get_subprocess_env_list(nprocs, options):
    """Build the list of per-process environments for spawned trainers.

    Args:
        nprocs: number of processes that will be spawned.
        options: dict of spawn options. ``options['backend']`` is filled in
            (mutating the dict) when it is missing or ``'auto'``.

    Returns:
        A list with one ``_prepare_trainer_env`` result per trainer of the
        local pod.

    Raises:
        RuntimeError, ValueError: for an invalid device selection (see
            ``_resolve_spawn_devices``).
        AssertionError: for the ``gloo`` backend when paddlecloud is
            requested, more than one ip is given, or more than one trainer
            is configured.
    """
    # NOTE (xiongkun03) Why is the backend deduced here?
    # Because _get_subprocess_env_list is used by many test cases, so for
    # compatibility the backend deduction is done here.

    # logic for handle backend option
    if 'backend' not in options or options['backend'] == 'auto':
        options['backend'] = _get_default_backend()
    check_backend(options['backend'])
    block_windows_and_macos(options['backend'])
    backend = options['backend']

    # get args from kwargs
    args = ParallelEnvArgs()

    # deal with `ips`
    args.cluster_node_ips = options.get('ips', None)
    if args.cluster_node_ips is None:
        args.cluster_node_ips = options.get('cluster_node_ips', None)
        if args.cluster_node_ips is None:
            args.cluster_node_ips = "127.0.0.1"

    # deal with `gpus`, `xpus` or `mlus`
    # set default selected devices
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because the FLAGS_selected_gpus or FLAGS_selected_xpus may be used in other place,
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu or xpu card id
    #
    # backend -> (option key, visibility env var, device counter, label).
    # The counters are wrapped in lambdas so that `core` is only queried for
    # the backend actually in use, and only when the env var is empty.
    accelerators = {
        'nccl': ('gpus', "CUDA_VISIBLE_DEVICES",
                 lambda: core.get_cuda_device_count(), 'gpu'),
        'bkcl': ('xpus', "XPU_VISIBLE_DEVICES",
                 lambda: core.get_xpu_device_count(), 'xpu'),
        'cncl': ('mlus', "MLU_VISIBLE_DEVICES",
                 lambda: core.get_mlu_device_count(), 'mlu'),
    }
    if backend in accelerators:
        option_key, env_name, count_devices, label = accelerators[backend]
        args.selected_devices = _resolve_spawn_devices(
            nprocs, options, option_key, env_name, count_devices, label)
    elif backend == 'gloo':
        # TODO check gpu / xpu flag must not exist
        warnings.warn(
            "Your model will be trained under CPUONLY mode by using GLOO,"
            "because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device."
        )
        args.paddle_cpuonly = True
        args.selected_devices = None
        args.ips = args.cluster_node_ips
        assert options.get(
            'use_paddlecloud',
            None) is None, "CPUONLY spawn doesn't support use paddle cloud"
        # The message used to contain a raw, never-filled "%s".
        assert len(
            args.cluster_node_ips.split(',')
        ) <= 1, "CPUONLY spawn only support single trainer, that is len(ips)=1, but got %s." % args.cluster_node_ips
        assert _get_trainers_num(
        ) == 1, "CPUONLY spawn doesn't support multi-trainer"

    # set other inner args
    args.node_ip = options.get('node_ip', None)
    if args.node_ip is None:
        args.node_ip = _get_node_ip(args.cluster_node_ips)

    args.started_port = options.get('started_port', None)

    args.use_paddlecloud = options.get('use_paddlecloud', None)
    if args.use_paddlecloud is None:
        args.use_paddlecloud = use_paddlecloud()

    # get cluster and pod config
    if backend == 'gloo':
        devices_per_proc = list(range(0, nprocs))
        cluster, pod = get_cluster_from_args(args, DeviceMode.CPU,
                                             devices_per_proc)
    else:
        cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    processes_env_list = [
        _prepare_trainer_env(cluster, trainer, backend)
        for trainer in pod.trainers
    ]

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list