Code Example #1
File: partitioner.py  Project: zeta1999/autodist
    def _get_vars_to_partition(self):
        """
        Analyzes the strategy and returns mappings for the vars to partition and the vars to leave unpartitioned.

        Returns:
            vars_to_partition (Dict): Mapping of variable names to the tuple of partition_str and reduction devices.
            unpartitioned_vars (Dict): Mapping from variable name to gradient name of unpartitioned vars.
        """
        vars_to_partition = {}
        unpartitioned_vars = {}
        for node in self.node_config:
            partitioner = getattr(node, 'partitioner')
            if partitioner:
                reduction_destinations = []
                for part in node.part_config:
                    synchronizer = getattr(part, part.WhichOneof('synchronizer'))
                    if hasattr(synchronizer, 'reduction_destination'):
                        reduction_destinations.append(synchronizer.reduction_destination)
                    else:
                        reduction_destinations.append('')
                vars_to_partition[node.var_name] = (partitioner, reduction_destinations)
                logging.info("Partitioning variable {} with configuration {}".format(node.var_name, partitioner))
            else:
                grad, _, _ = self.graph_item.var_op_name_to_grad_info[get_op_name(node.var_name)]
                unpartitioned_vars[node.var_name] = grad
        return vars_to_partition, unpartitioned_vars
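
As a rough illustration of the two return values, the mappings might look like the following; every name and value below is made up for illustration and not taken from the project:

# Hypothetical return values of _get_vars_to_partition() (values are illustrative):
#   vars_to_partition: variable name -> (partition config string, reduction destinations)
#   unpartitioned_vars: variable name -> name of its gradient tensor
vars_to_partition = {
    'dense/kernel:0': ('1,2,1', ['/job:worker/task:0', '/job:worker/task:1']),
}
unpartitioned_vars = {
    'dense/bias:0': 'gradients/dense/BiasAdd_grad/BiasAddGrad:0',
}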
Code Example #2
    def _parse_node(self, node, num_nodes):
        host_address = node['address']
        if is_loopback_address(host_address) and num_nodes > 1:
            raise ValueError(
                "Can't (currently) use a loopback address when there are multiple nodes."
            )
        if node.get('chief') or num_nodes == 1:
            # 2 cases for marking this node as chief:
            # 1) The node was marked as chief
            # 2) If there is only one node, it is chief by default
            logging.info("Chief: %s" % host_address)
            self.__chief_address = host_address
        host_cpu = DeviceSpec(host_address, device_index=0)
        self._add_device(host_cpu)
        # handle any other CPUs when GPU is unavailable
        if len(node.get('gpus', [])) == 0:
            for cpu_index in set(sorted(node.get('cpus', []))) - {0}:
                cpu = DeviceSpec(host_address, host_cpu, DeviceType.CPU,
                                 cpu_index)
                self._add_device(cpu)
        # handle GPUs
        for gpu_index in set(sorted(node.get('gpus', []))):
            gpu = DeviceSpec(host_address, host_cpu, DeviceType.GPU, gpu_index)
            self._add_device(gpu)
        self.__ssh_group[host_address] = node.get('ssh_config')
        if (self.__ssh_group[host_address] is None
                and self.__chief_address != host_address):
            raise ValueError(
                "Need to define SSH groups for all non-chief nodes.")
Code Example #3
    def _compile_strategy(self, strategy):
        """Compile the raw strategy by resolving its devices against the cluster."""
        logging.debug('Raw strategy: %s' % strategy)
        device_resolver = DeviceResolver(self._cluster)
        compiled_strategy = base.StrategyCompiler(self._original_graph_item) \
            .set_device_resolver(device_resolver.resolve_to_device_str) \
            .compile(strategy)
        logging.info('Compiled strategy: %s' % compiled_strategy)
        return compiled_strategy
Code Example #4
    def _apply(self, *args, **kwargs):  # pylint: disable-msg=too-many-locals
        """Partition the variables, returning a new GraphItem and a new corresponding Strategy."""
        # Get ops to partition
        vars_to_partition, unpartitioned_vars = self._get_vars_to_partition()

        if not vars_to_partition:
            return self.graph_item, self.node_config

        # Get everything we want to delete
        to_delete, top_update_op_scopes = self._get_ops_to_delete(
            vars_to_partition)

        # In GraphDef, move everything in to_delete under a separate name scope
        # This allows us to create new ops with the to-be-deleted ops' original names
        new_graph_item = self._batch_prepend_name_scope(
            to_delete, AUTODIST_TO_DELETE_SCOPE)

        # Create new variables and ops in the new graph
        new_graph_item.copy_gradient_info_from(self.graph_item)
        new_vars, partition_config = self._create_new_vars(
            new_graph_item, vars_to_partition, unpartitioned_vars)
        # Remove the ops that are marked for deletion
        output_graph_item = self._delete_marked_ops(new_graph_item,
                                                    AUTODIST_TO_DELETE_SCOPE)

        # Update graph item with proper variable information
        # The new list contains:
        # 1) The new vars we created (`new_vars`)
        # 2) The new vars the optimizer created (`new_globals`)
        # 3) The old untrainable vars that weren't deleted during partitioning (`untrainable_vars`)
        new_vars = set(new_vars)
        new_globals = set(
            new_graph_item.graph.get_collection(
                ops.GraphKeys.GLOBAL_VARIABLES))
        deleted_tensor_names = {
            o.outputs[0].name
            for o in to_delete if o.outputs
        }
        untrainable_vars = [
            v for v in self.graph_item.info.untrainable_variables
            if v.variable_name not in deleted_tensor_names
        ]
        new_var_list = list(new_globals | new_vars) + untrainable_vars

        self.info.update_variables(new_var_list, replace=True)
        output_graph_item.info = self.info.copy()
        output_graph_item.copy_gradient_info_from(new_graph_item)

        with self.graph_item.graph.as_default():
            # this can be used to get the shape for partitioned vars
            ori_vars = self.graph_item.get_all_variables()
        with output_graph_item.graph.as_default():
            self._update_save_ops(graph_item=output_graph_item,
                                  ori_vars=ori_vars,
                                  update_op_scopes=top_update_op_scopes,
                                  partition_config=partition_config)
        logging.info('Successfully partitioned variables')
        return output_graph_item, self.node_config
Code Example #5
def _log_timeline(run_metadata, name='timeline', step=0):
    fetched_timeline = timeline.Timeline(run_metadata.step_stats)
    chrome_trace = fetched_timeline.generate_chrome_trace_format()
    directory = os.path.join(autodist.const.DEFAULT_WORKING_DIR, "traces")
    os.makedirs(directory, exist_ok=True)
    # TODO(Hao): add a runner step count and use it here.
    p = os.path.join(directory, "{}_{}.json".format(name, step))
    with open(p, "w") as f:
        f.write(chrome_trace)
        logging.info('Traced timeline written to: %s' % p)
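
A minimal usage sketch, assuming a TF1-style session run with full tracing enabled; `sess`, `fetches`, and `step` are placeholders and not part of the snippet above:

import tensorflow as tf

# Collect step statistics for one run and hand them to _log_timeline().
run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
run_metadata = tf.compat.v1.RunMetadata()
sess.run(fetches, options=run_options, run_metadata=run_metadata)
_log_timeline(run_metadata, name='train_step', step=step)
# The written JSON file can be opened in chrome://tracing.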
Code Example #6
    def _setup(self, strategy):
        """Prepare for the execution."""
        if IS_AUTODIST_CHIEF:
            # we should only have one single coordinator for one single AutoDist() instance scope,
            # even though we could have multiple strategies.
            self._coordinator = Coordinator(strategy=strategy,
                                            cluster=self._cluster)
            self._cluster.start()
            self._coordinator.launch_clients()
        logging.info('Current PID {} belongs to address {}'.format(
            os.getpid(), self._cluster.get_local_address()))
Code Example #7
    def transform(self):
        """Call graph transformer to transform a graph item based on strategy and cluster."""
        logging.info(
            'Transforming the original graph to a distributed graph...')
        with context.graph_mode():
            # Ensure the transformation happens under graph mode,
            # regardless of whether the outer context is eager or graph.
            graph_item = self.graph_item

            visualization_util.log_graph(graph=graph_item.graph,
                                         name='0-original')

            graph_item, self._strategy.node_config = VariablePartitioner.apply(
                self._strategy.node_config, graph_item)

            visualization_util.log_graph(graph=graph_item.graph,
                                         name='1-after-partition')

            # Create Synchronizers for each node in the strategy
            self._initialize_synchronizers()

            # Replicate the graph (both in-graph and between-graph)
            new_graph_item = Replicator.apply(
                config=self._strategy.graph_config.replicas,
                cluster=self._cluster,
                graph_item=graph_item)

            # Apply synchronizers
            if self._num_local_replicas >= 1:
                new_graph_item = self._in_graph_apply(new_graph_item)
                logging.debug(
                    'Successfully applied local in-graph replication')
                visualization_util.log_graph(new_graph_item.graph,
                                             '2-after-in-graph')

            if self._num_workers >= 1:
                new_graph_item = self._between_graph_apply(new_graph_item)
                logging.debug('Successfully applied between-graph replication')

            final_item = new_graph_item
            logging.info('Successfully built the distributed graph.')
            visualization_util.log_graph(graph=final_item.graph,
                                         name='3-transformed')

        return final_item
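
In outline, transform() composes the pieces shown in the other examples; the call order below simply paraphrases the method body above:

# 1. VariablePartitioner.apply(node_config, graph_item)  -> partition variables per strategy
# 2. _initialize_synchronizers()                         -> one synchronizer per strategy node
# 3. Replicator.apply(replicas, cluster, graph_item)     -> in-graph / between-graph replication
# 4. _in_graph_apply(...) then _between_graph_apply(...) -> wire synchronizers into the replicas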
Code Example #8
    def __init__(self, resource_spec: ResourceSpec):
        """Build the cluster spec and device/address mappings from a ResourceSpec."""
        self.cluster_spec = self._get_default_cluster_spec(resource_spec)
        self._cpu_devices = self._get_node_cpu_devices(resource_spec)
        self._gpu_devices = self._get_node_gpu_devices(resource_spec)
        self._chief = resource_spec.chief
        self._full_addresses = [
            full_address for tasks in self.cluster_spec.values()
            for full_address in tasks
        ]
        # noinspection PyTypeChecker
        self._address_to_port = dict(
            a.split(':') for a in self._full_addresses)
        self._task_to_address = {
            (job_name, task_index): a.split(':')[0]
            for job_name, tasks in self.cluster_spec.items()
            for task_index, a in enumerate(tasks)
        }
        self.subprocesses = []
        logging.info('ClusterSpec: {}'.format(self.cluster_spec))
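
To make the derived lookups concrete, here is a sketch with a made-up two-worker cluster_spec:

# Hypothetical cluster_spec and the mappings __init__ derives from it:
cluster_spec = {'worker': ['10.0.0.1:5000', '10.0.0.2:5000']}
# _full_addresses  -> ['10.0.0.1:5000', '10.0.0.2:5000']
# _address_to_port -> {'10.0.0.1': '5000', '10.0.0.2': '5000'}
# _task_to_address -> {('worker', 0): '10.0.0.1', ('worker', 1): '10.0.0.2'}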
Code Example #9
    def remote_pre_start_tf_server(self,
                                   hostname,
                                   tf_server_starter_filepath,
                                   working_dir=DEFAULT_WORKING_DIR):
        """
        Prepare to start a TensorFlow server remotely.

        Args:
            hostname (str): host name or address
            tf_server_starter_filepath (str): local starter file path
            working_dir (str): remote working directory
        """
        logging.info("Copying necessary files to %s" % hostname)
        self.remote_copy(local_path=tf_server_starter_filepath,
                         remote_path=working_dir,
                         hostname=hostname)
        self.remote_file_write(
            remote_path=os.path.join(working_dir, 'cluster_spec.json'),
            data=json.dumps(self.cluster_spec),
            hostname=hostname,
        )
Code Example #10
    def start(self):
        """
        Start tf.servers on all nodes.

        Note that this only runs (and only should run) on the chief node.
        """
        # pylint: disable=import-outside-toplevel
        from autodist.utils import server_starter

        # The atexit registration should be placed:
        #   - before the start logic below
        #     (to ensure clean termination if the start fails halfway); and
        #   - in the same module as the start logic
        #     (following the Python assumption that lower-level modules are
        #     imported before higher-level modules and thus must be cleaned
        #     up later).
        atexit.register(self.terminate)
        envs = {ENV.AUTODIST_MIN_LOG_LEVEL.name: 'ERROR'}
        envs = ['{}={}'.format(k, v) for k, v in envs.items()]
        module_name = server_starter.__name__
        module_file = server_starter.__file__

        for job_name, tasks in self.cluster_spec.items():
            for task_index, full_address in enumerate(tasks):
                address = full_address.split(':')[0]
                args = [
                    '--job_name=%s' % job_name,
                    '--task_index=%d' % task_index,
                    '--cpu_device_num=%d' % len(self._cpu_devices[address])
                ]
                if address in self._gpu_devices:
                    envs_cuda = []
                else:
                    envs_cuda = ['CUDA_VISIBLE_DEVICES=""']
                if self.is_chief(address):
                    # Write the cluster spec locally for the chief's tf.server.
                    with open(os.path.join(DEFAULT_WORKING_DIR,
                                           'cluster_spec.json'), 'w+') as f:
                        json.dump(self.cluster_spec, f)
                    cmd = envs + envs_cuda + [
                        sys.executable, '-m', module_name
                    ] + args
                    # pylint: disable=subprocess-popen-preexec-fn
                    proc = subprocess.Popen(' '.join(cmd),
                                            shell=True,
                                            preexec_fn=os.setsid)
                    self.subprocesses.append(proc)
                    # The append immediately follows the Popen so there is no window
                    # in which termination could fail because the subprocess list is still empty.
                    logging.debug(
                        '$ local tf.server started at {}: job_name={} task_index={}'
                        .format(full_address, job_name, task_index))
                else:  # remote
                    self.remote_pre_start_tf_server(
                        address, tf_server_starter_filepath=module_file)
                    file = os.path.join(DEFAULT_WORKING_DIR,
                                        os.path.basename(module_file))
                    bash = envs + envs_cuda + ['python', '-u', file] + args
                    logging.info("Launching tf.server on %s" % address)
                    proc = self.remote_exec(bash, hostname=address)
                    # Append immediately after starting the remote process so there is no window
                    # in which termination could fail because the subprocess list is still empty.
                    self.subprocesses.append(proc)
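
Putting the pieces together, for a remote, CPU-only worker the assembled command passed to remote_exec would look roughly like this; the job name, indices, and working-directory path are illustrative:

# Hypothetical value of `bash` for a non-chief node without GPUs:
# ['AUTODIST_MIN_LOG_LEVEL=ERROR', 'CUDA_VISIBLE_DEVICES=""',
#  'python', '-u', '<DEFAULT_WORKING_DIR>/server_starter.py',
#  '--job_name=worker', '--task_index=1', '--cpu_device_num=1']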