def build(self, graph_item, resource_spec): """Generate the strategy.""" expr = Strategy() # For each variable, generate variable synchronizer config expr.graph_config.replicas.extend( [k for k, v in resource_spec.gpu_devices]) reduction_device_names = [k for k, _ in resource_spec.cpu_devices] self.loads = {ps: 0.0 for ps in reduction_device_names} # Generate node config node_config = [] for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): var_op_name = get_op_name(var.name) grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] if isinstance(grad, ops.Tensor): # this is a dense variable group_id = idx // self.chunk_size config = self._gen_all_reduce_node_config(var.name, group=group_id) else: # sparse updates # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. # Sparse variables are likely larger, so keeping copies would be costlier, # and usually each device only requires a small part of the overall variable. config = self._gen_ps_node_config( var, False, # For Parallax Strategy, all PS vars are sparse which does not need proxy. self._sync, self._staleness) node_config.append(config) expr.node_config.extend(node_config) return expr
def build(self, graph_item, resource_spec): """Generate the Strategy.""" expr = Strategy() # get each variable, generate variable synchronizer config expr.graph_config.replicas.extend( [k for k, v in resource_spec.gpu_devices]) for k, v in resource_spec.node_cpu_devices.items(): if k not in resource_spec.node_gpu_devices: expr.graph_config.replicas.extend(v) # find all variables variables = graph_item.trainable_var_op_to_var.values() reduction_device_names = [k for k, _ in resource_spec.cpu_devices] self.loads = {ps: 0.0 for ps in reduction_device_names} # Mark each variable to be synchronized with a Parameter Server node_config = [ self._gen_ps_node_config(var, self._local_proxy_variable, self._sync, self._staleness) for var in variables ] expr.node_config.extend(node_config) return expr
def build(self, graph_item, resource_spec): """Generate the strategy.""" expr = Strategy() # get each variable, generate variable synchronizer config expr.graph_config.replicas.extend( [k for k, v in resource_spec.gpu_devices]) for k, v in resource_spec.node_cpu_devices.items(): if k not in resource_spec.node_gpu_devices: expr.graph_config.replicas.extend(v) # find all variables variables = graph_item.get_trainable_variables() # Mark each variable to be synchronized with allreduce for i, var in enumerate(variables): group_id = i // self.chunk_size node_config = self._gen_all_reduce_node_config( var.name, group=group_id, all_reduce_spec=self.all_reduce_spec, compressor=self.compressor) expr.node_config.append(node_config) return expr
def build(self, graph_item, resource_spec): """Generate the strategy.""" expr = Strategy() # For each variable, generate variable synchronizer config # resouce_spec.gpu_devices = dict_items([('162.105.146.118:GPU:0', <DeviceSpec: 162.105.146.118:GPU:0>), ('162.105.146.118:GPU:1', <DeviceSpec: 162.105.146.118:GPU:1>), ('162.105.146.118:GPU:2', <DeviceSpec: 162.105.146.118:GPU:2>), ('162.105.146.118:GPU:3', <DeviceSpec: 162.105.146.118:GPU:3>), ('162.105.146.118:GPU:4', <DeviceSpec: 162.105.146.118:GPU:4>), ('162.105.146.118:GPU:5', <DeviceSpec: 162.105.146.118:GPU:5>), ('162.105.146.118:GPU:6', <DeviceSpec: 162.105.146.118:GPU:6>), ('162.105.146.118:GPU:7', <DeviceSpec: 162.105.146.118:GPU:7>)]), ('162.105.146.119:GPU:0', <DeviceSpec: 162.105.146.119:GPU:0>), ('162.105.146.119:GPU:1', <DeviceSpec: 162.105.146.119:GPU:1>), ('162.105.146.119:GPU:2', <DeviceSpec: 162.105.146.119:GPU:2>), ('162.105.146.119:GPU:3', <DeviceSpec: 162.105.146.119:GPU:3>), ('162.105.146.119:GPU:4', <DeviceSpec: 162.105.146.119:GPU:4>), ('162.105.146.119:GPU:5', <DeviceSpec: 162.105.146.119:GPU:5>), ('162.105.146.119:GPU:6', <DeviceSpec: 162.105.146.119:GPU:6>), ('162.105.146.119:GPU:7', <DeviceSpec: 162.105.146.119:GPU:7>)]) gpu_devices = dict() for k, v in resource_spec.gpu_devices: if '119' not in k: gpu_devices[k] = v print(resource_spec.gpu_devices) #expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) expr.graph_config.replicas.extend([k for k, v in gpu_devices.items()]) for k, v in resource_spec.node_cpu_devices.items(): if k not in resource_spec.node_gpu_devices: expr.graph_config.replicas.extend(v) reduction_device_names = [ k for k, _ in resource_spec.cpu_devices if '119' in k ] self.loads = {ps: 0.0 for ps in reduction_device_names} # Generate node config node_config = [] for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): var_op_name = get_op_name(var.name) grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] if isinstance(grad, ops.Tensor): # this is a dense variable group_id = idx // self.chunk_size config = self._gen_all_reduce_node_config(var.name, group=group_id) else: # sparse updates # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. # Sparse variables are likely larger, so keeping copies would be costlier, # and usually each device only requires a small part of the overall variable. config = self._gen_ps_node_config( var, False, # For Parallax Strategy, all PS vars are sparse which does not need proxy. self._sync, self._staleness) node_config.append(config) expr.node_config.extend(node_config) return expr
def build(self, graph_item, resource_spec): """Generate the Strategy.""" expr = Strategy() # data-parallel graph replication first expr.graph_config.replicas.extend( [k for k, v in resource_spec.gpu_devices]) for k, v in resource_spec.node_cpu_devices.items(): if k not in resource_spec.node_gpu_devices: expr.graph_config.replicas.extend(v) # find all variables variables = graph_item.trainable_var_op_to_var.values() # Mark each variable to be synchronized with allreduce var_counter = 0 for var in variables: node_config, num_shards = self._gen_node_config(var, var_counter) var_counter += num_shards expr.node_config.append(node_config) return expr