def __init__(self, epochs,
             layers=[], weights=[], objective_function=None,
             metrics=[], callbacks=[],
             summary_dir=None):
    """Construct a model from a layer graph and training settings.

    Args:
        epochs: Stored unchanged as `self.epochs`.
        layers (Layer or Iterable of Layer, optional): Node(s) of the
            layer graph. They are passed to
            `lbann.core.layer.traverse_layer_graph` and every layer it
            yields is stored in `self.layers`.
        weights (Weights or Iterable of Weights, optional): Weights
            to include in addition to those attached to the layers.
        objective_function (optional): An `ObjectiveFunction` is
            stored as-is, `None` produces a default-constructed
            `ObjectiveFunction`, and any other value is passed to the
            `ObjectiveFunction` constructor.
        metrics (optional): Metric or iterable of metrics.
        callbacks (optional): Callback or iterable of callbacks.
        summary_dir (optional): Stored unchanged as
            `self.summary_dir`.

    """

    # Scalar fields
    self.epochs = epochs
    self.summary_dir = summary_dir

    # Get connected layers
    self.layers = list(lbann.core.layer.traverse_layer_graph(layers))

    # Get weights associated with layers
    self.weights = set(make_iterable(weights))
    for l in self.layers:
        self.weights.update(l.weights)

    # Construct objective function if needed
    obj_type = lbann.core.objective_function.ObjectiveFunction
    if isinstance(objective_function, obj_type):
        self.objective_function = objective_function
    elif objective_function is None:
        self.objective_function = obj_type()
    else:
        self.objective_function = obj_type(objective_function)

    # Metrics and callbacks
    # Note: Each model gets its own list. Storing the argument itself
    # could share the mutable `[]` default (or a caller's list)
    # between models, so appending to one model's metrics or
    # callbacks would silently affect the others.
    self.metrics = list(make_iterable(metrics))
    self.callbacks = list(make_iterable(callbacks))
def __init__(self, mini_batch_size, epochs,
             layers=[], weights=[], objective_function=None,
             metrics=[], callbacks=[]):
    """Construct a model from a layer graph and training settings.

    Args:
        mini_batch_size: Stored unchanged as `self.mini_batch_size`.
        epochs: Stored unchanged as `self.epochs`.
        layers (Layer or Iterable of Layer, optional): Node(s) of the
            layer graph. They are passed to
            `lbann.layer.traverse_layer_graph` and every layer it
            yields is stored in `self.layers`.
        weights (Weights or Iterable of Weights, optional): Weights
            to include in addition to those attached to the layers.
        objective_function (optional): An `ObjectiveFunction` is
            stored as-is, `None` produces a default-constructed
            `ObjectiveFunction`, and any other value is passed to the
            `ObjectiveFunction` constructor.
        metrics (optional): Metric or iterable of metrics.
        callbacks (optional): Callback or iterable of callbacks.

    """

    # Scalar fields
    self.mini_batch_size = mini_batch_size
    self.epochs = epochs
    self.block_size = 256           # TODO: Make configurable
    self.num_parallel_readers = 0   # TODO: Make configurable
    self.procs_per_trainer = 0      # TODO: Make configurable

    # Get connected layers
    self.layers = list(lbann.layer.traverse_layer_graph(layers))

    # Get weights associated with layers
    self.weights = set(make_iterable(weights))
    for l in self.layers:
        self.weights.update(l.weights)

    # Construct objective function if needed
    obj_type = lbann.objective_function.ObjectiveFunction
    if isinstance(objective_function, obj_type):
        self.objective_function = objective_function
    elif objective_function is None:
        self.objective_function = obj_type()
    else:
        self.objective_function = obj_type(objective_function)

    # Metrics and callbacks
    # Note: Each model gets its own list. Storing the argument itself
    # could share the mutable `[]` default (or a caller's list)
    # between models, so appending to one model's metrics or
    # callbacks would silently affect the others.
    self.metrics = list(make_iterable(metrics))
    self.callbacks = list(make_iterable(callbacks))
def add_parallel_command(self,
                         command,
                         work_dir=None,
                         nodes=None,
                         procs_per_node=None,
                         reservation=None,
                         launcher=None,
                         launcher_args=None):
    """Append a jsrun-launched command to the script.

    Any argument left as `None` falls back to the attribute of the
    same name on this object. One resource set is requested per
    host, with `procs_per_node` tasks in each.

    Args:
        command (`str` or `Iterable` of `str`s): Command to be
            executed in parallel.
        work_dir (str, optional): Working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of parallel processes
            per compute node.
        reservation (str, optional): Scheduler advance reservation.
            It is resolved like the other options but does not
            appear in the generated jsrun command line.
        launcher (str, optional): jsrun executable.
        launcher_args (`Iterable` of `str`s, optional):
            Command-line arguments to jsrun.

    """

    # Fall back to the values stored on the script object
    work_dir = self.work_dir if work_dir is None else work_dir
    nodes = self.nodes if nodes is None else nodes
    procs_per_node = (self.procs_per_node
                      if procs_per_node is None else procs_per_node)
    reservation = self.reservation if reservation is None else reservation
    launcher = self.launcher if launcher is None else launcher
    launcher_args = (self.launcher_args
                     if launcher_args is None else launcher_args)

    # Assemble the jsrun command line: launcher, user flags, resource
    # flags, then the command itself
    jsrun_cmd = [launcher, *make_iterable(launcher_args)]
    jsrun_cmd += [
        f'--chdir {work_dir}',
        f'--nrs {nodes}',
        '--rs_per_host 1',
        f'--tasks_per_rs {procs_per_node}',
        '--launch_distribution packed',
        '--cpu_per_rs ALL_CPUS',
        '--gpu_per_rs ALL_GPUS',
    ]
    jsrun_cmd.extend(make_iterable(command))
    self.add_command(jsrun_cmd)
def export_proto(self):
    """Construct and return a protobuf message.

    The names `base_has_export_proto`, `base_class`,
    `base_field_name`, `message_name`, `field_names` and
    `field_descriptors` are resolved from the enclosing scope, which
    is not part of this block.

    Returns:
        The protobuf message. If the base class provides
        `export_proto`, this is the base-class message with its
        `base_field_name` sub-message populated. Otherwise it is a
        new message of the type called `message_name`.

    Raises:
        TypeError: If a field value cannot be written into the
            message. The underlying error is attached as the cause.
        IndexError: If none of the listed protobuf modules defines
            `message_name` (the module list is exhausted).

    """

    # Construct Protobuf message
    if base_has_export_proto:
        proto = base_class.export_proto(self)
        message = getattr(proto, base_field_name)
        message.SetInParent()
    else:
        # TODO (trb 08/01/2019): This list would have to be
        # updated any time another _pb2 file is created. It might
        # be better to have this as a global `frozenset`
        # (ndryden's suggestion) that gets maintained
        # elsewhere. But this code either works or doesn't get
        # executed now, so I vote delaying this fix until a need
        # arises.
        proto_modules = [
            callbacks_pb2, layers_pb2, metrics_pb2,
            model_pb2, objective_functions_pb2,
            operators_pb2, optimizers_pb2,
            training_algorithm_pb2, weights_pb2
        ]
        # Modules are searched from the end of the list
        proto_type = None
        while proto_type is None:
            proto_type = getattr(proto_modules.pop(), message_name, None)
        proto = proto_type()
        message = proto

    # Set message
    for field_name in field_names:
        val = getattr(self, field_name)
        if val is None:
            continue
        try:
            field = getattr(message, field_name)
            field_descriptor = field_descriptors[field_name]
            if field_descriptor.message_type in _protobuf_type_wrappers:
                # Wrapper message: set its `value` member
                field.SetInParent()
                field.value = val
            elif field_descriptor.label == google.protobuf.descriptor.FieldDescriptor.LABEL_REPEATED:
                iterable_val = make_iterable(val)
                if field_descriptor.type == field_descriptor.TYPE_MESSAGE:
                    field.extend([x.export_proto() for x in iterable_val])
                else:
                    field.extend(iterable_val)
            elif isinstance(val, google.protobuf.message.Message):
                getattr(message, field_name).MergeFrom(val)
            elif callable(getattr(val, "export_proto", None)):
                # 'val' is (hopefully) an LBANN class
                # representation of a protobuf message.
                getattr(message, field_name).MergeFrom(val.export_proto())
            else:
                setattr(message, field_name, val)
        except Exception as e:
            # Note: Only ordinary errors are translated. A bare
            # `except` would also turn KeyboardInterrupt and
            # SystemExit into a TypeError.
            raise TypeError('{} is invalid type for {}.{}'.format(
                type(val).__name__, self.__class__.__name__, field_name)) from e

    # Return Protobuf message
    return proto
def add_parallel_command(self,
                         command,
                         work_dir=None,
                         nodes=None,
                         procs_per_node=None,
                         launcher=None,
                         launcher_args=None):
    """Append an mpiexec-launched command to the script.

    Any argument left as `None` falls back to the attribute of the
    same name on this object. The total process count is
    `nodes * procs_per_node`, mapped `procs_per_node` per node.

    Args:
        command (`str` or `Iterable` of `str`s): Command to be
            executed in parallel.
        work_dir (str, optional): Working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of parallel processes
            per compute node.
        launcher (str, optional): mpiexec executable.
        launcher_args (`Iterable` of `str`s, optional):
            Command-line arguments to mpiexec.

    """

    # Fall back to the values stored on the script object
    work_dir = self.work_dir if work_dir is None else work_dir
    nodes = self.nodes if nodes is None else nodes
    procs_per_node = (self.procs_per_node
                      if procs_per_node is None else procs_per_node)
    launcher = self.launcher if launcher is None else launcher
    launcher_args = (self.launcher_args
                     if launcher_args is None else launcher_args)

    # Assemble the mpiexec command line
    mpiexec_cmd = [launcher, *make_iterable(launcher_args)]
    total_procs = nodes * procs_per_node
    mpiexec_cmd += [
        f'-n {total_procs}',
        f'--map-by ppr:{procs_per_node}:node',
        f'-wdir {work_dir}',
    ]
    mpiexec_cmd.extend(make_iterable(command))
    self.add_command(mpiexec_cmd)
def add_command(self, command):
    """Append one executable command line to the script body.

    The pieces of `command` are joined with single spaces and the
    resulting line is passed to `add_body_line`.

    Args:
        command (`str` or `Iterable` of `str`s): Program invocation
            or sequence of program arguments.

    """
    tokens = make_iterable(command)
    line = ' '.join(tokens)
    self.add_body_line(line)
def __init__(self, size, bias=True,
             weights=[], name=None, data_layout='data_parallel'):
    """Initialize LSTM cell.

    Args:
        size (int): Size of output tensor.
        bias (bool): Whether to apply biases after linearity.
        weights (`Weights` or iterator of `Weights`): Weights in
            fully-connected layer. There are at most two - a
            matrix ((4*size) x (input_size+size) dimensions) and a
            bias (4*size entries). If weights are not provided,
            the matrix and bias will be initialized in a similar
            manner as PyTorch (uniform random values from
            [-1/sqrt(size), 1/sqrt(size)]).
        name (str): Default name is in the form 'lstmcell<index>'.
        data_layout (str): Data layout.

    Raises:
        ValueError: If more than two weights are provided.

    """
    super().__init__()
    LSTMCell.global_count += 1
    self.step = 0
    self.size = size
    self.name = (name
                 if name
                 else 'lstmcell{0}'.format(LSTMCell.global_count))
    self.data_layout = data_layout

    # Initial state
    self.last_output = lbann.Constant(value=0.0,
                                      num_neurons=str(size),
                                      name=self.name + '_init_output',
                                      data_layout=self.data_layout)
    self.last_cell = lbann.Constant(value=0.0,
                                    num_neurons=str(size),
                                    name=self.name + '_init_cell',
                                    data_layout=self.data_layout)

    # Weights
    self.weights = list(make_iterable(weights))
    if len(self.weights) > 2:
        raise ValueError('`LSTMCell` has at most two weights, '
                         'but got {0}'.format(len(self.weights)))
    if len(self.weights) < 2:
        # Half-width of the symmetric uniform range. The upper bound
        # must be positive: with min == max the "uniform" initializer
        # would produce a single constant value.
        scale = 1 / sqrt(self.size)
    if len(self.weights) == 0:
        self.weights.append(
            lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                               max=scale),
                          name=self.name+'_matrix'))
    if len(self.weights) == 1:
        self.weights.append(
            lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                               max=scale),
                          name=self.name+'_bias'))

    # Linearity
    self.fc = FullyConnectedModule(4*size, bias=bias,
                                   weights=self.weights,
                                   name=self.name + '_fc',
                                   data_layout=self.data_layout)
def __init__(self,
             size,
             bias=False,
             weights=[],
             activation=None,
             transpose=False,
             name=None,
             parallel_strategy={}):
    """Initialize channelwise fully connected module.

    Args:
        size (int or list): Dimension of the output tensor
        bias (bool): Whether to apply bias after linearity.
        transpose (bool): Whether to apply transpose of weights
            matrix.
        weights (`Weights` or iterator of `Weights`): Weights in
            fully-connected layer. There are at most two: the
            matrix and the bias. If weights are not provided, the
            matrix will be initialized with He normal
            initialization and the bias with zeros.
        activation (type): Layer class for activation function. An
            instance may also be given, in which case its type is
            used.
        name (str): Default name is in the form
            'channelwisefc<index>'.
        parallel_strategy (dict): Data partitioning scheme. The
            object is stored by reference, not copied.

    Raises:
        ValueError: If more than two weights are provided or if
            `activation` is not a subclass of `lbann.Layer`.

    """
    super().__init__()
    ChannelwiseFullyConnectedModule.global_count += 1
    self.instance = 0
    self.size = size
    self.bias = bias
    self.transpose = transpose
    self.parallel_strategy = parallel_strategy
    self.name = (name
                 if name
                 else 'channelwisefc{0}'.format(
                     ChannelwiseFullyConnectedModule.global_count))
    self.data_layout = 'data_parallel'

    # Initialize weights
    # Note: A default bias is only created when `bias` is enabled.
    self.weights = list(make_iterable(weights))
    if len(self.weights) > 2:
        raise ValueError('`ChannelwiseFullyConnectedModule` has '
                         'at most two weights, '
                         'but got {0}'.format(len(self.weights)))
    if len(self.weights) == 0:
        self.weights.append(
            lbann.Weights(initializer=lbann.HeNormalInitializer(),
                          name=self.name + '_matrix'))
    if self.bias and len(self.weights) == 1:
        self.weights.append(
            lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                          name=self.name + '_bias'))

    # Initialize activation layer
    self.activation = None
    if activation:
        if isinstance(activation, type):
            self.activation = activation
        else:
            self.activation = type(activation)
        if not issubclass(self.activation, lbann.Layer):
            raise ValueError('activation must be a layer')
def __init__(self, terms=[]):
    """Build an objective function from an initial set of terms.

    Starts with an empty term list and registers each entry of
    `terms` through `add_term`, in order. `terms` should be a
    sequence of `ObjectiveFunctionTerm`s and `Layer`s.

    """
    self.terms = []
    initial_terms = make_iterable(terms)
    for term in initial_terms:
        self.add_term(term)
def __init__(self,
             size,
             bias=True,
             weights=[],
             activation=None,
             name=None,
             data_layout='data_parallel'):
    """Set up a fully-connected module.

    Args:
        size (int): Size of output tensor.
        activation (type): Layer class for the activation function.
            If an instance is passed instead, its type is used.
        bias (bool): Whether to apply bias after linearity.
        weights (`Weights` or iterator of `Weights`): At most two
            weights, the matrix followed by the bias. Missing ones
            are created here: He normal initialization for the
            matrix and zeros for the bias. A bias weights object is
            created whenever fewer than two weights are given,
            whatever the value of `bias`.
        name (str): Default name is in the form 'fcmodule<index>'.
        data_layout (str): Data layout.

    Raises:
        ValueError: If more than two weights are given or if
            `activation` is not a subclass of `lbann.Layer`.

    """
    super().__init__()
    FullyConnectedModule.global_count += 1

    # Plain attributes
    self.instance = 0
    self.size = size
    self.bias = bias
    self.name = name or 'fcmodule{0}'.format(
        FullyConnectedModule.global_count)
    self.data_layout = data_layout

    # Weights: keep what the caller supplied, then fill the gaps
    self.weights = list(make_iterable(weights))
    num_given = len(self.weights)
    if num_given > 2:
        raise ValueError(
            '`FullyConnectedModule` has at most two weights, but got {0}'
            .format(num_given))
    if not self.weights:
        matrix = lbann.Weights(initializer=lbann.HeNormalInitializer(),
                               name=self.name+'_matrix')
        self.weights.append(matrix)
    if len(self.weights) == 1:
        zero_bias = lbann.Weights(
            initializer=lbann.ConstantInitializer(value=0.0),
            name=self.name+'_bias')
        self.weights.append(zero_bias)

    # Activation: always stored as a class (or None)
    self.activation = None
    if activation:
        self.activation = (activation
                           if isinstance(activation, type)
                           else type(activation))
        if not issubclass(self.activation, lbann.Layer):
            raise ValueError('activation must be a layer')
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    *args,
    **kwargs,
):
    """Run LBANN with system-specific optimizations.

    This is intended to match the behavior of `lbann.run`, with
    defaults and optimizations for the current system. See that
    function for a full list of options. Extra positional and keyword
    arguments are forwarded to `make_batch_script`.

    Returns:
        The result of `script.submit` or `script.run`, or 0 when
        `setup_only` is set and the script is only written.

    """

    # Batch script manager for the current system
    script = make_batch_script(*args, **kwargs)

    # Log the start time
    script.add_command('echo "Started at $(date)"')

    # LBANN invocation: executable, user arguments, prototext path
    lbann_command = [lbann_exe, *make_iterable(lbann_args)]
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(prototext_file,
                               trainer=trainer,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_command.append(f'--prototext={prototext_file}')
    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Log the finish time and propagate LBANN's exit status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, submit, or run the batch script
    if setup_only:
        script.write(overwrite=overwrite_script)
        return 0
    if batch_job:
        return script.submit(overwrite=overwrite_script)
    return script.run(overwrite=overwrite_script)
def __init__(self, parents=[], children=[], weights=[],
             name=None, data_layout='data_parallel', hint_layer=None):
    """Constructor.

    Args:
        parents (Iterable of Layer, optional): Sources of input
            tensors.
        children (Iterable of Layer, optional): Destinations of
            output tensors.
        weights (Iterable of Weights, optional): Trainable
            parameters.
        name (str, optional): Unique identifier (default is
            'layer<index>').
        data_layout (str, optional): Data distribution scheme.
        hint_layer (Layer, optional): Hint for output dimensions.

    """
    Layer.global_count += 1
    self.parents = []
    self.children = []
    self.weights = []
    self.name = name if name else 'layer{0}'.format(Layer.global_count)
    self.data_layout = data_layout
    self.hint_layer = hint_layer

    # Initialize parents, children, and weights
    # Note: Connections go through `add_parent`, `add_child` and
    # `add_weights` instead of storing the arguments directly.
    for parent in make_iterable(parents):
        self.add_parent(parent)
    for child in make_iterable(children):
        self.add_child(child)
    for w in make_iterable(weights):
        self.add_weights(w)
def __init__(self, num_channels, size, bias=True, weights=[], name=None):
    """Initialize GRU cell.

    Args:
        num_channels (int): The number of rows in the matrix to
            perform GRU
        size (int): Size of output tensor.
        bias (bool): Whether to apply biases after linearity.
        weights (`Weights` or iterator of `Weights`): Weights in
            fully-connected layer. There are at most four, in the
            order input-hidden matrix, input-hidden bias,
            hidden-hidden matrix, hidden-hidden bias. The matrices
            have (3*size) x (input_size) and (3*size) x (size)
            dimensions and each bias has 3*size entries. Weights
            that are not provided are initialized in a similar
            manner as PyTorch (uniform random values from
            [-1/sqrt(size), 1/sqrt(size)]).
        name (str): Default name is in the form 'gru<index>'.

    Raises:
        ValueError: If more than four weights are provided.

    """
    super().__init__()
    ChannelwiseGRU.global_count += 1
    self.step = 0
    self.size = size
    self.num_channels = num_channels
    self.name = (name if name else f'gru{ChannelwiseGRU.global_count}')
    self.data_layout = 'data_parallel'
    scale = 1 / math.sqrt(self.size)

    # Weights
    # Note: Surplus weights would end up in the hidden-hidden module,
    # which rejects them with an error that names a different class.
    # Fail here with an accurate message instead.
    self.weights = list(make_iterable(weights))
    if len(self.weights) > 4:
        raise ValueError('`ChannelwiseGRU` has at most four weights, '
                         'but got {0}'.format(len(self.weights)))

    # Create whichever trailing weights the caller left out
    weight_name = ['_ih_matrix', '_ih_bias', '_hh_matrix', '_hh_bias']
    for suffix in weight_name[len(self.weights):]:
        self.weights.append(
            lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                               max=scale),
                          name=self.name + suffix))

    # Linearities: first two weights feed the input-hidden module,
    # the remaining two the hidden-hidden module
    self.ih_fc = ChannelwiseFullyConnectedModule(3 * size,
                                                 bias=bias,
                                                 weights=self.weights[:2],
                                                 name=self.name + '_ih_fc')
    self.hh_fc = ChannelwiseFullyConnectedModule(3 * size,
                                                 bias=bias,
                                                 weights=self.weights[2:],
                                                 name=self.name + '_hh_fc')
    self.ones = lbann.Constant(value=1.0,
                               num_neurons=str_list([num_channels, size]),
                               name=self.name + '_ones')
def __init__(self,
             mini_batch_size,
             name=None,
             procs_per_trainer=None,
             num_parallel_readers=None,
             random_seed=None,
             callbacks=[]):
    """Store trainer settings.

    Every argument is stored under an attribute of the same name.
    `hydrogen_block_size` is initialized to `None`.

    Args:
        mini_batch_size: Mini-batch size.
        name (optional): Trainer name.
        procs_per_trainer (optional): Number of processes per
            trainer.
        num_parallel_readers (optional): Number of parallel readers.
        random_seed (optional): Random seed.
        callbacks (optional): Callback or iterable of callbacks.

    """
    self.name = name
    self.procs_per_trainer = procs_per_trainer
    self.num_parallel_readers = num_parallel_readers
    self.random_seed = random_seed
    self.mini_batch_size = mini_batch_size
    self.hydrogen_block_size = None

    # Callbacks
    # Note: Each trainer gets its own list. Storing the argument
    # itself could share the mutable `[]` default (or a caller's
    # list) between trainers.
    self.callbacks = list(make_iterable(callbacks))
def __init__(self,
             mini_batch_size,
             name=None,
             num_parallel_readers=None,
             random_seed=None,
             serialize_io=None,
             training_algo=None,
             callbacks=[]):
    """Store trainer settings.

    Every argument is stored under an attribute of the same name.
    `hydrogen_block_size` is initialized to `None`.

    Args:
        mini_batch_size: Mini-batch size.
        name (optional): Trainer name.
        num_parallel_readers (optional): Number of parallel readers.
        random_seed (optional): Random seed.
        serialize_io (optional): Stored unchanged.
        training_algo (optional): Stored unchanged.
        callbacks (optional): Callback or iterable of callbacks.

    """
    self.name = name
    self.num_parallel_readers = num_parallel_readers
    self.random_seed = random_seed
    self.serialize_io = serialize_io
    self.mini_batch_size = mini_batch_size
    self.hydrogen_block_size = None
    self.training_algo = training_algo

    # Callbacks
    # Note: Each trainer gets its own list. Storing the argument
    # itself could share the mutable `[]` default (or a caller's
    # list) between trainers.
    self.callbacks = list(make_iterable(callbacks))
def __init__(self,
             strategy: str = "checkpoint_binary",
             weights_names: list[str] = [],
             exchange_hyperparameters: bool = False,
             checkpoint_dir: str = None):
    """Construct a new exchange strategy.

    Args:
        strategy: Which strategy to use (default: "checkpoint_binary").
        weights_names: A list of weights names that should be exchanged.
        exchange_hyperparameters: If True, exchange all optimizer state.
                                  Only applies to the "sendrecv_weights"
                                  strategy.
        checkpoint_dir: A path to a directory for storing the checkpoint
                        files. Only applies to "checkpoint_file".
    """
    self.strategy = strategy
    self.exchange_hyperparameters = exchange_hyperparameters
    # Keep a private list so that instances never share the mutable
    # `[]` default (or the caller's list) with one another.
    self.weights_names = list(make_iterable(weights_names))
    self.checkpoint_dir = checkpoint_dir
def traverse_layer_graph(layers):
    """Generate the layers of a graph in a topological order.

    Every layer reachable from `layers` through parent or child links
    is visited. The graph is assumed to be acyclic; no cycle check is
    performed, so behavior on a cyclic graph is undefined.

    Args:
        layers (Layer or Iterator of Layer): Node(s) in layer graph.

    Yields:
        Layer: Node in layer graph, in a topological order.

    """

    # Pass 1: walk the graph in both directions and collect the
    # layers that have no parents
    seen = set()
    roots = []
    pending = list(make_iterable(layers))
    while pending:
        node = pending.pop()
        if node in seen:
            continue
        seen.add(node)
        pending.extend(node.parents)
        pending.extend(node.children)
        if not node.parents:
            roots.append(node)

    # Pass 2: starting from the roots, emit a layer only once all of
    # its parents have been emitted. A layer popped too early is
    # dropped here and pushed again by its remaining parents.
    emitted = set()
    pending = roots
    while pending:
        node = pending.pop()
        if node in emitted:
            continue
        if not all(parent in emitted for parent in node.parents):
            continue
        emitted.add(node)
        pending.extend(node.children)
        yield node
def __init__(self, weights=[], scale=1.0):
    """Store a scale factor and a private list of weights.

    Args:
        weights (optional): Weights object or iterable of weights.
            They are copied into a new list.
        scale (optional): Stored unchanged as `self.scale`.

    """
    self.scale = scale
    self.weights = [*make_iterable(weights)]
def __init__(self, num_dims,
             out_channels, kernel_size,
             stride=1, padding=0, dilation=1, groups=1, bias=True,
             weights=[], activation=None, name=None, transpose=False,
             parallel_strategy={}):
    """Initialize convolution module.

    Args:
        num_dims (int): Number of dimensions.
        out_channels (int): Number of output channels, i.e. number
            of filters.
        kernel_size (int): Size of convolution kernel.
        stride (int): Convolution stride.
        padding (int): Convolution padding.
        dilation (int): Convolution dilation.
        groups (int): Number of convolution groups.
        bias (bool): Whether to apply channel-wise bias after
            convolution.
        weights (`Weights` or iterator of `Weights`): Weights in
            convolution layer. There are at most two: the
            kernel and the bias. If weights are not provided, the
            kernel will be initialized with He normal
            initialization and the bias with zeros. A bias weights
            object is created whenever fewer than two weights are
            given, whatever the value of `bias`.
        activation (type): Layer class for activation function. An
            instance may also be given, in which case its type is
            used.
        name (str): Default name is in the form 'convmodule<index>'.
        transpose (bool): If true call deconvolution (or convolution
            transpose)
        parallel_strategy (dict): Data partitioning scheme. The
            object is stored by reference, not copied.

    Raises:
        ValueError: If more than two weights are provided or if
            `activation` is not a subclass of `lbann.Layer`.

    """
    super().__init__()
    ConvolutionModule.global_count += 1
    self.instance = 0
    self.num_dims = num_dims
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding = padding
    self.dilation = dilation
    self.groups = groups
    self.bias = bias
    self.name = (name
                 if name
                 else 'convmodule{0}'.format(ConvolutionModule.global_count))
    self.transpose = transpose
    self.parallel_strategy = parallel_strategy

    # Initialize weights
    # Note: If weights are not provided, kernel weights are
    # initialized with He normal scheme and bias weights are
    # initialized with zeros.
    # Note: `weights` is converted exactly once. Converting it a
    # second time would find a one-shot iterator already exhausted
    # and the caller's weights would be replaced by defaults.
    self.weights = list(make_iterable(weights))
    if len(self.weights) > 2:
        raise ValueError('`ConvolutionModule` has '
                         'at most two weights, '
                         'but got {0}'.format(len(self.weights)))
    if len(self.weights) == 0:
        self.weights.append(
            lbann.Weights(initializer=lbann.HeNormalInitializer(),
                          name=self.name + '_kernel'))
    if len(self.weights) == 1:
        self.weights.append(
            lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                          name=self.name + '_bias'))

    # Initialize activation layer
    self.activation = None
    if activation:
        if isinstance(activation, type):
            self.activation = activation
        else:
            self.activation = type(activation)
        if not issubclass(self.activation, lbann.Layer):
            raise ValueError('activation must be a layer')
def add_weights(self, w):
    """Append `w` to this layer's weights.

    Args:
        w: Weights object or iterable of weights objects.

    """
    new_weights = make_iterable(w)
    self.weights.extend(new_weights)
def add_child(self, child):
    """Connect this layer's output to `child`.

    Each given layer is appended to `self.children`, and this layer
    is appended to that layer's `parents`.

    Args:
        child: Layer or iterable of layers.

    """
    targets = make_iterable(child)
    for target in targets:
        self.children.append(target)
        target.parents.append(self)
def add_parent(self, parent):
    """Connect `parent`'s output to this layer.

    Each given layer is appended to `self.parents`, and this layer
    is appended to that layer's `children`.

    Args:
        parent: Layer or iterable of layers.

    """
    sources = make_iterable(parent)
    for source in sources:
        self.parents.append(source)
        source.children.append(self)
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    work_dir=None,
    proto_file_name='experiment.prototext',
    nodes=1,
    procs_per_node=1,
    time_limit=None,
    scheduler=None,
    job_name='lbann',
    partition=None,
    account=None,
    reservation=None,
    launcher_args=[],
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    procs_per_trainer=None,
    environment={},
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    nvprof=False,
    nvprof_output_name=None,
    experiment_dir=None,
):
    """Run LBANN.

    Builds a batch script that prints the start time, launches LBANN
    in parallel on a prototext file written into the script's working
    directory, prints the finish time and exits with LBANN's status.
    Depending on the flags, the script is then only written
    (`setup_only`), submitted to the scheduler (`batch_job`), or run
    directly. Behavior may vary across schedulers.

    Args:
        trainer (lbann.Trainer): LBANN trainer.
        model (lbann.Model): Neural network model.
        data_reader (lbann.reader_pb2.DataReader): Data reader.
        optimizer (lbann.model.Optimizer): Default optimizer for
            model.
        work_dir (str, optional): Working directory. If it is not
            given, `experiment_dir` is used instead.
        proto_file_name (str, optional): Name of the prototext file
            created in the script's working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per
            compute node.
        time_limit (int, optional): Job time limit, in minutes.
        scheduler (str, optional): Job scheduler.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (str, optional): Command-line arguments to
            launcher.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (str, optional): Command-line arguments to LBANN
            executable.
        procs_per_trainer (int, optional): Number of processes per
            LBANN trainer. If given, it is passed to LBANN as
            `--procs_per_trainer`.
        environment (dict of {str: str}, optional): Environment
            variables.
        overwrite_script (bool, optional): Whether to overwrite
            script file if it already exists.
        setup_only (bool, optional): If true, the batch script is
            written but neither submitted nor run.
        batch_job (bool, optional): If true, the experiment is
            submitted to the scheduler as a batch job.
        nvprof (bool, optional): If true, an nvprof command is
            added to the beginning of LBANN executable.
        nvprof_output_name (str, optional): nvprof output filename.
            Filename should be unique to each process by using
            %q{ENV} (see
            https://docs.nvidia.com/cuda/profiler-users-guide/).
        experiment_dir (str, optional, deprecated): See `work_dir`.

    Returns:
        int: Exit status.

    """

    # Batch script manager
    work_dir = work_dir or experiment_dir
    script_options = dict(
        work_dir=work_dir,
        nodes=nodes,
        procs_per_node=procs_per_node,
        time_limit=time_limit,
        scheduler=scheduler,
        job_name=job_name,
        partition=partition,
        account=account,
        reservation=reservation,
        launcher_args=launcher_args,
        environment=environment,
    )
    script = make_batch_script(**script_options)

    # Log the start time
    script.add_command('echo "Started at $(date)"')

    # LBANN invocation, optionally prefixed with the nvprof command
    lbann_command = [lbann_exe]
    if nvprof:
        profiler_prefix = nvprof_command(work_dir=work_dir,
                                         output_name=nvprof_output_name)
        lbann_command = profiler_prefix + lbann_command
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, proto_file_name)
    lbann.proto.save_prototext(prototext_file,
                               trainer=trainer,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_command.append(f'--prototext={prototext_file}')
    if procs_per_trainer is not None:
        lbann_command.append(f'--procs_per_trainer={procs_per_trainer}')
    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Log the finish time and propagate LBANN's exit status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, submit, or run the batch script
    if setup_only:
        script.write(overwrite=overwrite_script)
        return 0
    if batch_job:
        return script.submit(overwrite=overwrite_script)
    return script.run(overwrite=overwrite_script)
def str_list(l):
    """Render the items of `l` as one space-separated string.

    Each item is converted with `str` before joining.

    """
    return ' '.join(map(str, make_iterable(l)))
def make_batch_script(
    system=system(),
    procs_per_node=procs_per_node(),
    scheduler=scheduler(),
    launcher_args=[],
    environment={},
    *args,
    **kwargs,
):
    """Construct batch script manager with NERSC-specific optimizations.

    This is a wrapper around `lbann.launcher.make_batch_script`, with
    defaults and optimizations for NERSC systems. See that function
    for a full list of options.

    The defaults for `system`, `procs_per_node` and `scheduler` are
    computed once, when this function is defined. `launcher_args` and
    `environment` are copied before being modified, so the caller's
    objects (and the shared defaults) are left untouched.

    Args:
        system (str, optional): System name. The value 'cgpu' enables
            the optimizations below.
        procs_per_node (int, optional): Number of processes per
            compute node.
        scheduler (str, optional): Job scheduler. The value 'slurm'
            adds CPU binding and resource flags on 'cgpu'.
        launcher_args (`str` or `Iterable` of `str`s, optional):
            Command-line arguments to the launcher.
        environment (dict, optional): Environment variables.
        *args, **kwargs: Forwarded to
            `lbann.launcher.make_batch_script`.

    Returns:
        The object returned by `lbann.launcher.make_batch_script`.

    """

    # Create shallow copies of input arguments
    launcher_args = list(make_iterable(launcher_args))
    environment = environment.copy()

    # Helper function to configure environment variables
    # Note: User-provided values take precedence, followed by values
    # in the environment, followed by default values.
    def set_environment(key, default):
        if key not in environment:
            environment[key] = os.getenv(key, default)

    # Optimizations for Cori GPU nodes
    if system == 'cgpu':
        cores_per_proc = cores_per_node(system) // procs_per_node
        set_environment(
            'AL_PROGRESS_RANKS_PER_NUMA_NODE',
            math.ceil(procs_per_node / numa_nodes_per_node(system)))
        set_environment('OMP_NUM_THREADS', cores_per_proc - 1)
        if scheduler == 'slurm':
            # One CPU mask per process: the first covers the lowest
            # `cores_per_proc` cores and each following mask is the
            # previous one shifted up by `cores_per_proc` bits.
            masks = [2**cores_per_proc - 1]
            while len(masks) < procs_per_node:
                masks.append(masks[-1] << cores_per_proc)
            mask_str = ','.join([hex(mask) for mask in masks])
            launcher_args.append('--cpu_bind=mask_cpu:{}'.format(mask_str))
            launcher_args.extend([
                '--qos=regular',
                f'--cpus-per-task={cores_per_proc}',
                '--gpus-per-task=1',
                '--constraint=gpu'
            ])

        # Hack to enable process forking
        # Note: InfiniBand is known to experience hangs if an MPI
        # process is forked (see
        # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork).
        # Setting IBV_FORK_SAFE seems to fix this issue, but it may
        # hurt performance (see
        # https://linux.die.net/man/3/ibv_fork_init).
        set_environment('IBV_FORK_SAFE', 1)
        set_environment('MV2_ENABLE_AFFINITY', 0)
        set_environment('MV2_USE_CUDA', 1)
        set_environment('MKL_THREADING_LAYER', 'GNU')

    # Note: `system` itself is not forwarded to the generic launcher.
    return lbann.launcher.make_batch_script(
        procs_per_node=procs_per_node,
        scheduler=scheduler,
        launcher_args=launcher_args,
        environment=environment,
        *args,
        **kwargs,
    )
def run(command,
        experiment_dir=os.getcwd(),
        nodes=1,
        procs_per_node=1,
        time_limit=-1,
        job_name=None,
        partition=None,
        account=None,
        reservation=None,
        jsrun_args='',
        environment={},
        setup_only=False):
    """Run executable with LSF.

    Writes an executable batch script (`batch.sh`) into the
    experiment directory. Unless `setup_only` is set, the script is
    then run with `sh` when the `LSB_JOBID` environment variable is
    present, and submitted with `bsub` otherwise. Standard output and
    error of that process are piped through `tee` into `out.log` and
    `err.log`.

    Args:
        command (str): Program to run under LSF, i.e. an
            executable and its command-line arguments. An iterable
            of such strings yields one jsrun line per entry.
        experiment_dir (str, optional): Experiment directory. The
            default is the working directory at the time this
            function was defined.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per
            compute node.
        time_limit (int, optional): Job time limit, in minutes. A
            negative value implies the system-default time limit.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        jsrun_args (str, optional): Command-line arguments to
            jsrun.
        environment (dict of {str: str}, optional): Environment
            variables.
        setup_only (bool, optional): If true, the experiment is not
            run after the batch script is created.

    Raises:
        ValueError: If no account is given.

    """

    # An existing allocation overrides the job name, the partition
    # and the time limit
    # Note: LSF does not provide a way to get the account via env
    # vars.
    in_allocation = 'LSB_JOBID' in os.environ
    if in_allocation:
        job_name = os.environ['LSB_JOBNAME']
        partition = os.environ['LSB_QUEUE']
        time_limit = -1

    # Files inside the experiment directory
    experiment_dir = os.path.abspath(experiment_dir)
    os.makedirs(experiment_dir, exist_ok=True)
    batch_file = os.path.join(experiment_dir, 'batch.sh')
    out_file = os.path.join(experiment_dir, 'out.log')
    err_file = os.path.join(experiment_dir, 'err.log')
    nodes_file = os.path.join(experiment_dir, 'nodes.txt')

    # Batch script, collected line by line
    lines = ['#!/bin/sh']
    if job_name:
        lines.append(f'#BSUB -J {job_name}')
    lines.append(f'#BSUB -nnodes {nodes}')
    if partition:
        lines.append(f'#BSUB -q {partition}')
    if not account:
        raise ValueError('LSF requires an account')
    lines.append(f'#BSUB -G {account}')
    if reservation:
        lines.append(f'#BSUB -U {reservation}')
    lines.append(f'#BSUB -cwd {experiment_dir}')
    lines.append(f'#BSUB -o {out_file}')
    lines.append(f'#BSUB -e {err_file}')
    if time_limit >= 0:
        lines.append(f'#BSUB -W {time_limit}')

    # Environment variables
    if environment:
        lines.append('')
        lines.append('# ==== Environment ====')
        for variable, value in environment.items():
            lines.append(f'export {variable}={value}')

    # Time and node list
    lines.append('')
    lines.append('# ==== Useful info ====')
    lines.append('date')
    lines.append(f'jsrun -n {nodes} -a 1 hostname > {nodes_file}')
    lines.append('sort --unique --output={0} {0}'.format(nodes_file))

    # Experiment commands
    lines.append('')
    lines.append('# ==== Experiment ====')
    for cmd in make_iterable(command):
        lines.append(f'jsrun -n {nodes} -a {procs_per_node} {jsrun_args} {cmd}')

    # Write the script and make it executable
    with open(batch_file, 'w') as f:
        f.write('\n'.join(lines) + '\n')
    os.chmod(batch_file, 0o755)

    # Launch if needed
    if setup_only:
        return
    if in_allocation:
        run_proc = subprocess.Popen(['sh', batch_file],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=experiment_dir)
    else:
        # bsub requires the batch script be read from its stdin.
        run_proc = subprocess.Popen('bsub < {}'.format(batch_file),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=experiment_dir,
                                    shell=True)

    # Mirror both output streams to the terminal and the log files
    out_proc = subprocess.Popen(['tee', out_file],
                                stdin=run_proc.stdout,
                                cwd=experiment_dir)
    err_proc = subprocess.Popen(['tee', err_file],
                                stdin=run_proc.stderr,
                                cwd=experiment_dir)
    run_proc.stdout.close()
    run_proc.stderr.close()
    for proc in (run_proc, out_proc, err_proc):
        proc.wait()
def add_parallel_command(self,
                         command,
                         work_dir=None,
                         nodes=None,
                         procs_per_node=None,
                         time_limit=None,
                         job_name=None,
                         partition=None,
                         account=None,
                         launcher=None,
                         launcher_args=None):
    """Append an srun-launched command to the script.

    Any argument left as `None` falls back to the attribute of the
    same name on this object. The total task count is
    `nodes * procs_per_node`, with `procs_per_node` tasks per node.

    Args:
        command (`str` or `Iterable` of `str`s): Command to be
            executed in parallel.
        work_dir (str, optional): Working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of parallel processes
            per compute node.
        time_limit (int, optional): Job time limit, in minutes.
        job_name (str, optional): Job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        launcher (str, optional): srun executable.
        launcher_args (`Iterable` of `str`s, optional):
            Command-line arguments to srun.

    """

    # Fall back to the values stored on the script object
    work_dir = self.work_dir if work_dir is None else work_dir
    nodes = self.nodes if nodes is None else nodes
    procs_per_node = (self.procs_per_node
                      if procs_per_node is None else procs_per_node)
    time_limit = self.time_limit if time_limit is None else time_limit
    job_name = self.job_name if job_name is None else job_name
    partition = self.partition if partition is None else partition
    account = self.account if account is None else account
    launcher = self.launcher if launcher is None else launcher
    launcher_args = (self.launcher_args
                     if launcher_args is None else launcher_args)

    # Mandatory part of the srun command line
    srun_cmd = [launcher, *make_iterable(launcher_args)]
    srun_cmd += [
        f'--chdir={work_dir}',
        f'--nodes={nodes}',
        f'--ntasks={nodes * procs_per_node}',
        f'--ntasks-per-node={procs_per_node}',
    ]

    # Optional flags, emitted only when a value is available
    if time_limit is not None:
        srun_cmd.append(f'--time={_time_string(time_limit)}')
    optional_flags = (
        ('job-name', job_name),
        ('partition', partition),
        ('account', account),
    )
    for flag, value in optional_flags:
        if value:
            srun_cmd.append(f'--{flag}={value}')

    srun_cmd.extend(make_iterable(command))
    self.add_command(srun_cmd)
def run(command,
        experiment_dir=os.getcwd(),
        nodes=1,
        procs_per_node=1,
        time_limit=-1,
        job_name=None,
        partition=None,
        account=None,
        reservation=None,
        srun_args='',
        environment={},
        setup_only=False):
    """Run executable with Slurm.

    Creates a Slurm batch script in the experiment directory. If a
    Slurm job allocation is detected, the script is run directly.
    Otherwise, the script is submitted to sbatch.

    Args:
        command (`str` or `Iterable` of `str`s): Program(s) to run
            under Slurm, i.e. an executable and its command-line
            arguments. Each command gets its own srun invocation.
        experiment_dir (str, optional): Experiment directory. It is
            created if it does not exist. Note that the default is
            evaluated once, when this function is defined, so it is
            the working directory at that time.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per
            compute node.
        time_limit (int, optional): Job time limit, in minutes. A
            negative value implies the system-default time limit.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
            Not supported; must be empty or `None`.
        srun_args (str, optional): Command-line arguments to srun.
        environment (dict of {str: str}, optional): Environment
            variables exported at the top of the batch script.
        setup_only (bool, optional): If true, the experiment is not
            run after the batch script is created.

    Raises:
        ValueError: If a reservation is requested.

    """

    # Reject unsupported options before touching the file system, so
    # that a bad call does not leave a half-written batch script behind
    if reservation:
        raise ValueError('Slurm reservations not supported')

    # Check for an existing job allocation from Slurm
    # Note: Settings for current job allocation take precedence. If
    # Slurm did not export one of the variables, the value provided by
    # the caller is kept instead of failing with a KeyError.
    has_allocation = 'SLURM_JOB_ID' in os.environ
    if has_allocation:
        job_name = os.environ.get('SLURM_JOB_NAME', job_name)
        partition = os.environ.get('SLURM_JOB_PARTITION', partition)
        account = os.environ.get('SLURM_JOB_ACCOUNT', account)
        time_limit = -1

    # Experiment directory
    experiment_dir = os.path.abspath(experiment_dir)
    os.makedirs(experiment_dir, exist_ok=True)
    batch_file = os.path.join(experiment_dir, 'batch.sh')
    out_file = os.path.join(experiment_dir, 'out.log')
    err_file = os.path.join(experiment_dir, 'err.log')
    nodes_file = os.path.join(experiment_dir, 'nodes.txt')

    # Write batch script
    with open(batch_file, 'w') as f:
        f.write('#!/bin/sh\n')

        # Slurm job settings
        if job_name:
            f.write('#SBATCH --job-name={}\n'.format(job_name))
        f.write('#SBATCH --nodes={}\n'.format(nodes))
        if partition:
            f.write('#SBATCH --partition={}\n'.format(partition))
        if account:
            f.write('#SBATCH --account={}\n'.format(account))
        f.write('#SBATCH --workdir={}\n'.format(experiment_dir))
        f.write('#SBATCH --output={}\n'.format(out_file))
        f.write('#SBATCH --error={}\n'.format(err_file))
        if time_limit >= 0:
            # Convert (possibly fractional) minutes to D-HH:MM:SS
            seconds = int((time_limit % 1) * 60)
            hours, minutes = divmod(int(time_limit), 60)
            days, hours = divmod(hours, 24)
            f.write('#SBATCH --time={}-{:02d}:{:02d}:{:02d}\n'.format(
                days, hours, minutes, seconds))

        # Set environment
        if environment:
            f.write('\n')
            f.write('# ==== Environment ====\n')
            for variable, value in environment.items():
                f.write('export {}={}\n'.format(variable, value))

        # Display time and node list
        # Note: One task per node prints its hostname; the list is
        # then sorted and de-duplicated in place.
        f.write('\n')
        f.write('# ==== Useful info ====\n')
        f.write('date\n')
        f.write('srun --nodes={0} --ntasks={0} hostname > {1}\n'.format(
            nodes, nodes_file))
        f.write('sort --unique --output={0} {0}\n'.format(nodes_file))

        # Run experiment
        f.write('\n')
        f.write('# ==== Experiment ====\n')
        for cmd in make_iterable(command):
            f.write('srun {} --nodes={} --ntasks={} {}\n'.format(
                srun_args, nodes, nodes * procs_per_node, cmd))

    # Make batch script executable
    os.chmod(batch_file, 0o755)

    # Launch job if needed
    # Note: Pipes output to log files
    if not setup_only:
        run_exe = 'sh' if has_allocation else 'sbatch'
        run_proc = subprocess.Popen([run_exe, batch_file],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=experiment_dir)
        out_proc = subprocess.Popen(['tee', out_file],
                                    stdin=run_proc.stdout,
                                    cwd=experiment_dir)
        err_proc = subprocess.Popen(['tee', err_file],
                                    stdin=run_proc.stderr,
                                    cwd=experiment_dir)
        # Close our copies of the pipe ends; the tee processes hold
        # their own
        run_proc.stdout.close()
        run_proc.stderr.close()
        run_proc.wait()
        out_proc.wait()
        err_proc.wait()
def make_batch_script(
    system=system(),
    procs_per_node=procs_per_node(),
    scheduler=scheduler(),
    launcher_args=[],
    environment={},
    *args,
    **kwargs,
):
    """Construct batch script manager with OLCF-specific optimizations.

    This is a wrapper around `lbann.launcher.make_batch_script`, with
    defaults and optimizations for OLCF systems. See that function for
    a full list of options.

    Args:
        system (str, optional): Name of the system. It is only used
            to select the optimizations applied here and is not
            forwarded to `lbann.launcher.make_batch_script`.
        procs_per_node (int, optional): Number of processes per
            compute node.
        scheduler (str, optional): Name of the job scheduler, e.g.
            'slurm' or 'lsf'.
        launcher_args (`str` or `Iterable` of `str`s, optional):
            Extra command-line arguments for the parallel launcher.
            The input is copied, not modified.
        environment (dict, optional): Environment variables. The
            input is copied, not modified, and its entries take
            precedence over the values chosen here.
        *args: Forwarded to `lbann.launcher.make_batch_script`.
        **kwargs: Forwarded to `lbann.launcher.make_batch_script`.

    Returns:
        The object returned by `lbann.launcher.make_batch_script`.

    """

    # Create shallow copies of input arguments
    launcher_args = list(make_iterable(launcher_args))
    environment = environment.copy()

    # Helper function to configure environment variables
    # Note: User-provided values take precedence, followed by values
    # in the environment, followed by default values.
    def set_environment(key, default):
        if key not in environment:
            environment[key] = os.getenv(key, default)

    # Setup GPU bindings
    # Note: Each Hydrogen process is assigned to the GPU index that
    # matches its node communicator rank. This is not compatible with
    # mpibind, which assigns a GPU with index 0 to each process. We
    # can't use an exclusive GPU compute mode since processes may
    # touch the wrong GPU while figuring out ownership.
    if scheduler == 'slurm' and has_gpu(system):
        launcher_args.extend(
            ['--mpibind=off', '--nvidia_compute_mode=default'])

    # Optimizations for Summit-like systems
    # Note: The trailing comma matters. `('summit')` is just the
    # string 'summit', and `in` would then be a substring test that
    # also matches e.g. '' or 'sum'.
    if system in ('summit',):

        # Set thread affinity
        # Note: Aluminum's default thread affinity is incorrect since
        # hwloc treats GPUs as NUMA domains.
        # Note: There are actually 22 cores/socket, but it seems that
        # powers of 2 are better for performance.
        cores_per_socket = 16
        procs_per_socket = (procs_per_node + 1) // 2
        # Never hand out zero cores/threads when there are more
        # processes per socket than cores
        cores_per_proc = max(1, cores_per_socket // procs_per_socket)
        set_environment('AL_PROGRESS_RANKS_PER_NUMA_NODE', procs_per_socket)
        set_environment('OMP_NUM_THREADS', cores_per_proc)
        if scheduler == 'lsf':
            launcher_args.append('--bind packed:{}'.format(cores_per_proc))

        # Hack to enable process forking
        # Note: InfiniBand is known to experience hangs if an MPI
        # process is forked (see
        # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork).
        # Setting IBV_FORK_SAFE seems to fix this issue, but it may
        # hurt performance (see
        # https://linux.die.net/man/3/ibv_fork_init).
        set_environment('IBV_FORK_SAFE', 1)

        # Hacked bugfix for hcoll (1/23/19)
        # Note: Fixes hangs in MPI_Bcast.
        set_environment('HCOLL_ENABLE_SHARP', 0)
        set_environment('OMPI_MCA_coll_hcoll_enable', 0)

        # Hacked bugfix for Spectrum MPI PAMI (9/17/19)
        set_environment('PAMI_MAX_NUM_CACHED_PAGES', 0)

        # Configure NVSHMEM to load Spectrum MPI
        set_environment('NVSHMEM_MPI_LIB_NAME', 'libmpi_ibm.so')

    return lbann.launcher.make_batch_script(
        procs_per_node=procs_per_node,
        scheduler=scheduler,
        launcher_args=launcher_args,
        environment=environment,
        *args,
        **kwargs,
    )
def __init__(self,
             size,
             bias = True,
             weights=[],
             name=None,
             data_layout='data_parallel'):
    """Set up a GRU cell.

    Args:
        size (int): Size of output tensor.
        bias (bool): Whether the fully-connected modules apply
            biases after the linearity.
        weights (`Weights` or iterator of `Weights`): Up to four
            weights, in the order input-hidden matrix,
            input-hidden bias, hidden-hidden matrix,
            hidden-hidden bias. The matrices have dimensions
            (3*size) x (input_size) and (3*size) x (size), and
            each bias has 3*size entries. Weights that are not
            provided are created with a uniform initializer on
            [-1/sqrt(size), 1/sqrt(size)], similar to PyTorch.
        name (str): Default name is in the form 'gru<index>'.
        data_layout (str): Data layout.

    Raises:
        ValueError: If more than four weights are provided.

    """
    super().__init__()

    # Bookkeeping
    GRU.global_count += 1
    self.step = 0
    self.size = size
    self.name = name or 'gru{0}'.format(GRU.global_count)
    self.data_layout = data_layout

    # User-provided weights
    self.weights = list(make_iterable(weights))
    num_provided = len(self.weights)
    if num_provided > 4:
        raise ValueError('`GRU` has at most 4 weights, '
                         'but got {0}'.format(num_provided))

    # Create whichever trailing weights were not provided
    scale = 1 / math.sqrt(self.size)
    weight_suffixes = ('_ih_matrix', '_ih_bias', '_hh_matrix', '_hh_bias')
    for suffix in weight_suffixes[num_provided:]:
        self.weights.append(
            lbann.Weights(
                initializer=lbann.UniformInitializer(min=-scale,
                                                     max=scale),
                name=self.name + suffix,
            )
        )

    # Linearities
    # Note: The first two weights belong to the input-hidden module,
    # the last two to the hidden-hidden module.
    ih_weights = self.weights[:2]
    self.ih_fc = FullyConnectedModule(
        3*size,
        bias=bias,
        weights=ih_weights,
        name=self.name + '_ih_fc',
        data_layout=self.data_layout,
    )
    hh_weights = self.weights[2:]
    self.hh_fc = FullyConnectedModule(
        3*size,
        bias=bias,
        weights=hh_weights,
        name=self.name + '_hh_fc',
        data_layout=self.data_layout,
    )

    # Constant layer filled with ones
    self.ones = lbann.Constant(
        value=1.0,
        num_neurons=str(size),
        data_layout=self.data_layout,
        name=self.name + '_ones',
    )