def _get_vars_to_partition(self):
    """
    Analyze the strategy and return mappings for the vars to partition and those to leave unpartitioned.

    Returns:
        vars_to_partition (Dict): Mapping from variable name to a tuple of its
            partitioner string and its per-partition reduction destinations.
        unpartitioned_vars (Dict): Mapping from variable name to the gradient
            of each unpartitioned var.
    """
    vars_to_partition = {}
    unpartitioned_vars = {}
    for node in self.node_config:
        partitioner = getattr(node, 'partitioner')
        if partitioner:
            reduction_destinations = []
            for part in node.part_config:
                synchronizer = getattr(part, part.WhichOneof('synchronizer'))
                if hasattr(synchronizer, 'reduction_destination'):
                    reduction_destinations.append(synchronizer.reduction_destination)
                else:
                    reduction_destinations.append('')
            vars_to_partition[node.var_name] = (partitioner, reduction_destinations)
            logging.info("Partitioning variable {} with configuration {}".format(node.var_name, partitioner))
        else:
            grad, _, _ = self.graph_item.var_op_name_to_grad_info[get_op_name(node.var_name)]
            unpartitioned_vars[node.var_name] = grad
    return vars_to_partition, unpartitioned_vars
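
# A hedged illustration of the two mappings returned above (the variable names
# and the partitioner string format are hypothetical): each partitioned variable
# maps to its partitioner plus one reduction destination per partition, while
# each unpartitioned variable maps straight to its gradient.
#
#   vars_to_partition == {'w:0': ('1,2', ['/device:CPU:0', ''])}
#   unpartitioned_vars == {'b:0': <gradient Tensor of b:0>}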
def _parse_node(self, node, num_nodes):
    host_address = node['address']
    if is_loopback_address(host_address) and num_nodes > 1:
        raise ValueError("Can't (currently) use a loopback address when there are multiple nodes.")
    if node.get('chief') or num_nodes == 1:
        # Two cases mark this node as chief:
        # 1) the node was explicitly marked as chief, or
        # 2) there is only one node, which is chief by default.
        logging.info("Chief: %s" % host_address)
        self.__chief_address = host_address
    host_cpu = DeviceSpec(host_address, device_index=0)
    self._add_device(host_cpu)
    # Handle any other CPUs, but only when no GPU is available.
    if len(node.get('gpus', [])) == 0:
        # Note: sorted(set(...)) rather than set(sorted(...)), so the
        # deduplicated indices are actually iterated in order.
        for cpu_index in sorted(set(node.get('cpus', [])) - {0}):
            cpu = DeviceSpec(host_address, host_cpu, DeviceType.CPU, cpu_index)
            self._add_device(cpu)
    # Handle GPUs.
    for gpu_index in sorted(set(node.get('gpus', []))):
        gpu = DeviceSpec(host_address, host_cpu, DeviceType.GPU, gpu_index)
        self._add_device(gpu)
    self.__ssh_group[host_address] = node.get('ssh_config')
    if self.__ssh_group[host_address] is None and self.__chief_address != host_address:
        raise ValueError("Need to define SSH groups for all non-chief nodes.")
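
# A minimal sketch of the node mapping `_parse_node` consumes, inferred from
# the keys it reads above; all concrete values are hypothetical:
#
#   {
#       'address': '192.168.0.1',
#       'chief': True,           # optional; a single node is chief by default
#       'cpus': [0, 1],          # extra CPUs are only registered when no GPUs exist
#       'gpus': [0, 1, 2, 3],
#       'ssh_config': 'conf1',   # required for every non-chief node
#   }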
def _compile_strategy(self, strategy):
    """Resolve the strategy's device annotations against the current cluster."""
    logging.debug('Raw strategy: %s' % strategy)
    device_resolver = DeviceResolver(self._cluster)
    compiled_strategy = base.StrategyCompiler(self._original_graph_item) \
        .set_device_resolver(device_resolver.resolve_to_device_str) \
        .compile(strategy)
    logging.info('Compiled strategy: %s' % compiled_strategy)
    return compiled_strategy
def _apply(self, *args, **kwargs):  # pylint: disable-msg=too-many-locals
    """Partition the variables, returning a new GraphItem and a new corresponding Strategy."""
    # Get ops to partition
    vars_to_partition, unpartitioned_vars = self._get_vars_to_partition()
    if not vars_to_partition:
        return self.graph_item, self.node_config

    # Get everything we want to delete
    to_delete, top_update_op_scopes = self._get_ops_to_delete(vars_to_partition)

    # In GraphDef, move everything in `to_delete` under a separate name scope.
    # This allows us to create new ops with the to-be-deleted ops' original names.
    new_graph_item = self._batch_prepend_name_scope(to_delete, AUTODIST_TO_DELETE_SCOPE)

    # Create new variables and ops in the new graph
    new_graph_item.copy_gradient_info_from(self.graph_item)
    new_vars, partition_config = self._create_new_vars(new_graph_item, vars_to_partition, unpartitioned_vars)

    # Remove the ops that are marked for deletion
    output_graph_item = self._delete_marked_ops(new_graph_item, AUTODIST_TO_DELETE_SCOPE)

    # Update graph item with proper variable information.
    # The new list contains:
    # 1) The new vars we created (`new_vars`)
    # 2) The new vars the optimizer created (`new_globals`)
    # 3) The old untrainable vars that weren't deleted during partitioning (`untrainable_vars`)
    new_vars = set(new_vars)
    new_globals = set(new_graph_item.graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
    deleted_tensor_names = {o.outputs[0].name for o in to_delete if o.outputs}
    untrainable_vars = [
        v for v in self.graph_item.info.untrainable_variables
        if v.variable_name not in deleted_tensor_names
    ]
    new_var_list = list(new_globals | new_vars) + untrainable_vars
    self.info.update_variables(new_var_list, replace=True)
    output_graph_item.info = self.info.copy()
    output_graph_item.copy_gradient_info_from(new_graph_item)

    with self.graph_item.graph.as_default():
        # These can be used to get the shapes for partitioned vars.
        ori_vars = self.graph_item.get_all_variables()
    with output_graph_item.graph.as_default():
        self._update_save_ops(graph_item=output_graph_item,
                              ori_vars=ori_vars,
                              update_op_scopes=top_update_op_scopes,
                              partition_config=partition_config)

    logging.info('Successfully partitioned variables')
    return output_graph_item, self.node_config
def _log_timeline(run_metadata, name='timeline', step=0):
    """Dump a Chrome trace of the given step's RunMetadata into the traces directory."""
    fetched_timeline = timeline.Timeline(run_metadata.step_stats)
    chrome_trace = fetched_timeline.generate_chrome_trace_format()
    directory = os.path.join(autodist.const.DEFAULT_WORKING_DIR, "traces")
    os.makedirs(directory, exist_ok=True)
    # TODO(Hao): add a runner step count and use it here.
    p = os.path.join(directory, "{}_{}.json".format(name, step))
    with open(p, "w") as f:
        f.write(chrome_trace)
    logging.info('Traced timeline written to: %s' % p)
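
# A minimal usage sketch (the surrounding session code is assumed, not part of
# this module): collect RunMetadata with full tracing in a TF1-style session,
# then write the Chrome trace, which can be inspected at chrome://tracing.
#
#   import tensorflow as tf
#
#   options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
#   run_metadata = tf.compat.v1.RunMetadata()
#   sess.run(fetches, options=options, run_metadata=run_metadata)
#   _log_timeline(run_metadata, name='train_step', step=0)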
def _setup(self, strategy):
    """Prepare for the execution."""
    if IS_AUTODIST_CHIEF:
        # We should only have one single coordinator for one single AutoDist()
        # instance scope, even though we could have multiple strategies.
        self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster)
        self._cluster.start()
        self._coordinator.launch_clients()
    logging.info('Current PID {} belongs to address {}'.format(os.getpid(), self._cluster.get_local_address()))
def transform(self):
    """Call the graph transformer to transform a graph item based on the strategy and cluster."""
    logging.info('Transforming the original graph to a distributed graph...')
    # Ensure the transformation happens under graph mode,
    # whether the outer context is eager or graph.
    with context.graph_mode():
        graph_item = self.graph_item
        visualization_util.log_graph(graph=graph_item.graph, name='0-original')

        graph_item, self._strategy.node_config = VariablePartitioner.apply(self._strategy.node_config, graph_item)
        visualization_util.log_graph(graph=graph_item.graph, name='1-after-partition')

        # Create Synchronizers for each node in the strategy.
        self._initialize_synchronizers()

        # Replicate the graph (both in-graph and between-graph).
        new_graph_item = Replicator.apply(
            config=self._strategy.graph_config.replicas,
            cluster=self._cluster,
            graph_item=graph_item)

        # Apply synchronizers.
        if self._num_local_replicas >= 1:
            new_graph_item = self._in_graph_apply(new_graph_item)
            logging.debug('Successfully applied local in-graph replication')
            visualization_util.log_graph(new_graph_item.graph, '2-after-in-graph')
        if self._num_workers >= 1:
            new_graph_item = self._between_graph_apply(new_graph_item)
            logging.debug('Successfully applied between-graph replication')

        final_item = new_graph_item
        logging.info('Successfully built the distributed graph.')
        visualization_util.log_graph(graph=final_item.graph, name='3-transformed')
        return final_item
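
# The four log_graph calls above checkpoint each stage of the transformation;
# assuming visualization_util writes standard summary files, the pipeline can
# be inspected stage by stage:
#
#   0-original -> 1-after-partition -> 2-after-in-graph -> 3-transformed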
def __init__(self, resource_spec: ResourceSpec):
    self.cluster_spec = self._get_default_cluster_spec(resource_spec)
    self._cpu_devices = self._get_node_cpu_devices(resource_spec)
    self._gpu_devices = self._get_node_gpu_devices(resource_spec)
    self._chief = resource_spec.chief
    self._full_addresses = [
        full_address
        for tasks in self.cluster_spec.values()
        for full_address in tasks
    ]
    # noinspection PyTypeChecker
    self._address_to_port = dict(a.split(':') for a in self._full_addresses)
    self._task_to_address = {
        (job_name, task_index): a.split(':')[0]
        for job_name, tasks in self.cluster_spec.items()
        for task_index, a in enumerate(tasks)
    }
    self.subprocesses = []
    logging.info('ClusterSpec: {}'.format(self.cluster_spec))
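
# For reference, `cluster_spec` uses the standard TensorFlow ClusterSpec dict
# layout of job name -> list of "host:port" task addresses (values below are
# hypothetical):
#
#   {'worker': ['192.168.0.1:2222', '192.168.0.2:2222']}
#
# from which `_address_to_port` becomes {'192.168.0.1': '2222', ...} and
# `_task_to_address` becomes {('worker', 0): '192.168.0.1', ...}.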
def remote_pre_start_tf_server(self, hostname, tf_server_starter_filepath, working_dir=DEFAULT_WORKING_DIR):
    """
    Prepare to start a TensorFlow server remotely.

    Args:
        hostname (str): host name or address
        tf_server_starter_filepath (str): local starter file path
        working_dir (str): remote working directory
    """
    logging.info("Copying necessary files to %s" % hostname)
    self.remote_copy(local_path=tf_server_starter_filepath, remote_path=working_dir, hostname=hostname)
    self.remote_file_write(
        remote_path=os.path.join(working_dir, 'cluster_spec.json'),
        data=json.dumps(self.cluster_spec),
        hostname=hostname,
    )
def start(self):
    """
    Start tf.servers on all nodes.

    Note that this only runs (and only should run) on the chief node.
    """
    # pylint: disable=import-outside-toplevel
    from autodist.utils import server_starter

    # The atexit registration should be placed
    # - before the start begins (to ensure clean termination
    #   if the start fails halfway through); and
    # - in the same module as the start (to follow the Python assumption that
    #   lower-level modules are normally imported before higher-level modules
    #   and thus must be cleaned up later).
    atexit.register(self.terminate)
    envs = {ENV.AUTODIST_MIN_LOG_LEVEL.name: 'ERROR'}
    envs = ['{}={}'.format(k, v) for k, v in envs.items()]
    module_name = server_starter.__name__
    module_file = server_starter.__file__

    for job_name, tasks in self.cluster_spec.items():
        for task_index, full_address in enumerate(tasks):
            address = full_address.split(':')[0]
            args = ['--job_name=%s' % job_name,
                    '--task_index=%d' % task_index,
                    '--cpu_device_num=%d' % len(self._cpu_devices[address])]
            if address in self._gpu_devices:
                envs_cuda = []
            else:
                envs_cuda = ['CUDA_VISIBLE_DEVICES=""']
            if self.is_chief(address):
                with open(os.path.join(DEFAULT_WORKING_DIR, 'cluster_spec.json'), 'w') as f:
                    json.dump(self.cluster_spec, f)
                cmd = envs + envs_cuda + [sys.executable, '-m', module_name] + args
                # pylint: disable=subprocess-popen-preexec-fn
                proc = subprocess.Popen(' '.join(cmd), shell=True, preexec_fn=os.setsid)
                self.subprocesses.append(proc)
                # The append immediately follows the Popen so there is no window
                # in which terminate() could run against an empty process list.
                logging.debug('$ local tf.server started at {}: job_name={} task_index={}'.format(
                    full_address, job_name, task_index))
            else:  # remote
                self.remote_pre_start_tf_server(address, tf_server_starter_filepath=module_file)
                file = os.path.join(DEFAULT_WORKING_DIR, os.path.basename(module_file))
                bash = envs + envs_cuda + ['python', '-u', file] + args
                logging.info("Launching tf.server on %s" % address)
                proc = self.remote_exec(bash, hostname=address)
                self.subprocesses.append(proc)
                # As above, the append immediately follows remote_exec so that
                # terminate() can never miss a live process.
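
# For a non-chief, GPU-less worker, the assembled remote command comes out
# roughly as follows (the working directory, job name, and indices here are
# hypothetical):
#
#   AUTODIST_MIN_LOG_LEVEL=ERROR CUDA_VISIBLE_DEVICES="" \
#       python -u /tmp/autodist/server_starter.py \
#       --job_name=worker --task_index=1 --cpu_device_num=1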