Example #1
    def build(self, graph_item, resource_spec):
        """Generate the strategy."""
        expr = Strategy()

        # Replicate the computation graph onto every GPU device (data parallelism)
        expr.graph_config.replicas.extend(
            [k for k, v in resource_spec.gpu_devices])
        reduction_device_names = [k for k, _ in resource_spec.cpu_devices]
        # Per-reduction-device load tracker used when placing PS-synchronized variables
        self.loads = {ps: 0.0 for ps in reduction_device_names}

        # Generate node config
        node_config = []
        for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()):
            var_op_name = get_op_name(var.name)
            grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name]
            if isinstance(grad, ops.Tensor):  # this is a dense variable
                group_id = idx // self.chunk_size
                config = self._gen_all_reduce_node_config(var.name,
                                                          group=group_id)
            else:  # sparse updates
                # In the Parallax strategy, every PS-synchronized variable is sparse,
                # so no local proxy copy is kept: sparse variables tend to be large,
                # and each replica usually reads only a small slice of them.
                config = self._gen_ps_node_config(
                    var,
                    False,  # no local proxy for sparse variables
                    self._sync,
                    self._staleness)
            node_config.append(config)
        expr.node_config.extend(node_config)

        return expr
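
In the dense branch above, variables are bucketed into allreduce groups by integer-dividing their index by chunk_size, so consecutive variables end up fused into the same collective. A minimal, standalone sketch of that grouping rule (the variable names and chunk_size value are made up for illustration; no AutoDist API is used):

    from collections import defaultdict

    def assign_allreduce_groups(var_names, chunk_size):
        """Bucket variables into allreduce groups of at most chunk_size members."""
        groups = defaultdict(list)
        for idx, name in enumerate(var_names):
            groups[idx // chunk_size].append(name)
        return dict(groups)

    # chunk_size=2 fuses consecutive variables pairwise:
    # {0: ['w0', 'w1'], 1: ['w2', 'w3'], 2: ['w4']}
    print(assign_allreduce_groups(['w0', 'w1', 'w2', 'w3', 'w4'], chunk_size=2))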
Example #2
    def build(self, graph_item, resource_spec):
        """Generate the Strategy."""
        expr = Strategy()

        # Replicate the computation graph onto every GPU device;
        # nodes with no GPU contribute their CPU devices as replicas
        expr.graph_config.replicas.extend(
            [k for k, v in resource_spec.gpu_devices])
        for k, v in resource_spec.node_cpu_devices.items():
            if k not in resource_spec.node_gpu_devices:
                expr.graph_config.replicas.extend(v)

        # find all variables
        variables = graph_item.trainable_var_op_to_var.values()
        reduction_device_names = [k for k, _ in resource_spec.cpu_devices]
        # Per-reduction-device load tracker used when placing PS-synchronized variables
        self.loads = {ps: 0.0 for ps in reduction_device_names}

        # Mark each variable to be synchronized with a Parameter Server
        node_config = [
            self._gen_ps_node_config(var, self._local_proxy_variable,
                                     self._sync, self._staleness)
            for var in variables
        ]
        expr.node_config.extend(node_config)

        return expr
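
The self.loads dictionary above starts every reduction device at load 0.0, which suggests a greedy, load-balanced assignment of variables to parameter servers. The helper below is only a plausible sketch of that idea; the device names and byte sizes are hypothetical, and the real _gen_ps_node_config is not shown in this example:

    def pick_least_loaded(loads, var_size_bytes):
        """Greedily pick the reduction device with the smallest accumulated load."""
        device = min(loads, key=loads.get)
        loads[device] += var_size_bytes
        return device

    loads = {'node0:CPU:0': 0.0, 'node1:CPU:0': 0.0}  # hypothetical device names
    for size in (4e6, 1e6, 2e6):  # hypothetical variable sizes in bytes
        print(pick_least_loaded(loads, size), loads)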
Example #3
    def build(self, graph_item, resource_spec):
        """Generate the strategy."""
        expr = Strategy()

        # Replicate the computation graph onto every GPU device;
        # nodes with no GPU contribute their CPU devices as replicas
        expr.graph_config.replicas.extend(
            [k for k, v in resource_spec.gpu_devices])
        for k, v in resource_spec.node_cpu_devices.items():
            if k not in resource_spec.node_gpu_devices:
                expr.graph_config.replicas.extend(v)

        # find all variables
        variables = graph_item.get_trainable_variables()

        # Mark each variable to be synchronized with allreduce
        for i, var in enumerate(variables):
            group_id = i // self.chunk_size
            node_config = self._gen_all_reduce_node_config(
                var.name,
                group=group_id,
                all_reduce_spec=self.all_reduce_spec,
                compressor=self.compressor)
            expr.node_config.append(node_config)

        return expr
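
Compared with Example #1, this builder also threads an all_reduce_spec and a compressor into each node config. The snippet below is only an illustrative stand-in for what such a per-variable config might carry; the field names and default values are assumptions, not the actual Strategy proto:

    def sketch_all_reduce_config(var_name, group, all_reduce_spec='AUTO',
                                 compressor='NoneCompressor'):
        """Plain-dict stand-in for the knobs the builder above passes through."""
        return {
            'var_name': var_name,                # which variable this config applies to
            'group': group,                      # allreduce fusion group (idx // chunk_size)
            'all_reduce_spec': all_reduce_spec,  # collective backend / algorithm choice
            'compressor': compressor,            # gradient compression scheme
        }

    print(sketch_all_reduce_config('dense/kernel:0', group=0))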
Example #4
    def build(self, graph_item, resource_spec):
        """Generate the strategy."""
        expr = Strategy()

        # Keep only the GPUs on host 162.105.146.118 as graph replicas;
        # resource_spec.gpu_devices yields (name, DeviceSpec) pairs such as
        # ('162.105.146.118:GPU:0', <DeviceSpec: 162.105.146.118:GPU:0>), ...,
        # ('162.105.146.119:GPU:7', <DeviceSpec: 162.105.146.119:GPU:7>)
        gpu_devices = dict()
        for k, v in resource_spec.gpu_devices:
            if '119' not in k:
                gpu_devices[k] = v
        expr.graph_config.replicas.extend([k for k in gpu_devices])
        for k, v in resource_spec.node_cpu_devices.items():
            if k not in resource_spec.node_gpu_devices:
                expr.graph_config.replicas.extend(v)
        # CPUs on host 162.105.146.119 serve as the reduction (PS) devices
        reduction_device_names = [
            k for k, _ in resource_spec.cpu_devices if '119' in k
        ]
        # Per-reduction-device load tracker used when placing PS-synchronized variables
        self.loads = {ps: 0.0 for ps in reduction_device_names}

        # Generate node config
        node_config = []
        for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()):
            var_op_name = get_op_name(var.name)
            grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name]
            if isinstance(grad, ops.Tensor):  # this is a dense variable
                group_id = idx // self.chunk_size
                config = self._gen_all_reduce_node_config(var.name,
                                                          group=group_id)
            else:  # sparse updates
                # In the Parallax strategy, every PS-synchronized variable is sparse,
                # so no local proxy copy is kept: sparse variables tend to be large,
                # and each replica usually reads only a small slice of them.
                config = self._gen_ps_node_config(
                    var,
                    False,  # no local proxy for sparse variables
                    self._sync,
                    self._staleness)
            node_config.append(config)
        expr.node_config.extend(node_config)

        return expr
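
This variant hard-codes a split of the cluster by host: GPUs whose name does not contain '119' act as replicas, while the CPUs on host 162.105.146.119 serve as reduction devices. A small standalone sketch of that split (the device names are hypothetical, and matching on a bare substring like '119' is brittle; a real setup would match the full host address):

    def split_devices_by_host(device_names, ps_host_tag):
        """Separate worker devices from reduction devices by a host tag."""
        workers = [d for d in device_names if ps_host_tag not in d]
        reducers = [d for d in device_names if ps_host_tag in d]
        return workers, reducers

    devices = ['162.105.146.118:GPU:0', '162.105.146.118:GPU:1',
               '162.105.146.119:CPU:0']
    print(split_devices_by_host(devices, ps_host_tag='119'))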
Example #5
    def build(self, graph_item, resource_spec):
        """Generate the Strategy."""
        expr = Strategy()

        # data-parallel graph replication first
        expr.graph_config.replicas.extend(
            [k for k, v in resource_spec.gpu_devices])
        for k, v in resource_spec.node_cpu_devices.items():
            if k not in resource_spec.node_gpu_devices:
                expr.graph_config.replicas.extend(v)

        # find all variables
        variables = graph_item.trainable_var_op_to_var.values()

        # Mark each variable to be synchronized with allreduce
        var_counter = 0  # running total of shards assigned so far
        for var in variables:
            node_config, num_shards = self._gen_node_config(var, var_counter)
            var_counter += num_shards
            expr.node_config.append(node_config)

        return expr
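
Here var_counter carries a running shard count from one variable to the next, presumably so each variable's shards can be placed at an offset from the ones already assigned (for example, round-robin over devices). A toy sketch of that bookkeeping, with a made-up shards_per_var table standing in for whatever _gen_node_config actually returns:

    def shard_offsets(var_names, shards_per_var):
        """Hand each variable a starting shard offset, accumulating shard counts."""
        offsets = {}
        counter = 0
        for name in var_names:
            offsets[name] = counter          # offset this variable's placement starts at
            counter += shards_per_var[name]  # advance by this variable's shard count
        return offsets

    # {'a': 0, 'b': 2, 'c': 3}
    print(shard_offsets(['a', 'b', 'c'], {'a': 2, 'b': 1, 'c': 3}))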