def set_popdist_args(args):
    """Fill in the popdist-related fields of ``args`` for the current process.

    Outside a popdist launch the function only records a single-instance
    setup (``use_popdist=False``, ``popdist_size=1``, ``popdist_rank=0``).

    Under popdist it initialises the PopART Horovod extension, replaces
    ``args.replication_factor`` with the local replica count reported by
    popdist, derives the instance count and this instance's rank, appends a
    ``_rank_<n>`` suffix to ``args.checkpoint_dir`` and hands the MPI world
    communicator to ``setup_comm``.

    Args:
        args: Option namespace. ``inference``, ``replication_factor`` and
            ``checkpoint_dir`` are read; the namespace is mutated in place.

    Raises:
        RuntimeError: If popdist is active and ``args.inference`` is set.
        ImportError: If ``horovod.popart`` cannot be imported.
    """
    if not popdist.isPopdistEnvSet():
        args.use_popdist = False
        args.popdist_size = 1
        args.popdist_rank = 0
        return

    if args.inference:
        raise RuntimeError("Distributed execution is only supported for training")

    try:
        import horovod.popart as hvd
        hvd.init()
    except ImportError:
        raise ImportError("Could not find the PopART horovod extension. "
                          "Please install the horovod .whl provided in the Poplar SDK.")

    args.use_popdist = True

    # The value reported by popdist always wins; only warn when the user
    # explicitly asked for a different replication factor (> 1).
    local_replicas = popdist.getNumLocalReplicas()
    requested = args.replication_factor
    if requested != local_replicas and requested > 1:
        logger.warning(f"Overwriting the local replication factor {requested} to {local_replicas}")
    args.replication_factor = local_replicas

    # Instance count and instance rank, both expressed in units of local replicas.
    args.popdist_size = popdist.getNumTotalReplicas() // local_replicas
    args.popdist_rank = popdist.getReplicaIndexOffset() // local_replicas

    # Give every instance its own checkpoint directory.
    args.checkpoint_dir = args.checkpoint_dir + "_rank_" + str(args.popdist_rank)

    from mpi4py import MPI
    setup_comm(MPI.COMM_WORLD)
def get_basic_logger(name):
    """Return a stdout logger named ``name``, configured from ``RNNT_LOG_LEVEL``.

    The environment variable must be one of CRITICAL, ERROR, WARNING, INFO,
    DEBUG or NOTSET; anything else (including an unset variable) selects INFO.
    The root logging configuration is set up with a single stdout handler.
    When running under popdist, the returned logger is disabled on every
    instance whose index is greater than zero.

    Args:
        name: Name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger``.
    """
    level_by_name = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }
    level = level_by_name.get(os.getenv("RNNT_LOG_LEVEL"), logging.INFO)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(level)
    logging.basicConfig(
        format='%(asctime)s.%(msecs)03d %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[stdout_handler],
    )

    named_logger = logging.getLogger(name)
    named_logger.setLevel(level)

    instance_idx = (popdist.popdist_core.getInstanceIndex()
                    if popdist.isPopdistEnvSet() else 0)
    if instance_idx > 0:
        # to avoid excess logging, disabling logging for instance_idxs > 0
        named_logger.disabled = True

    return named_logger
def sync_metrics(outputs, factor=1, average=True):
    """Reduce metric values to plain Python numbers, all-reducing under popdist.

    Args:
        outputs: Either a single ``float`` or an iterable of objects exposing
            ``div``/``mean``/``item`` (tensor-like).
        factor: Divisor applied to every element of an iterable ``outputs``
            before taking its mean. Not applied to a single float.
        average: Forwarded to ``hvd.allreduce`` when popdist is active.

    Returns:
        A float when ``outputs`` is a float, otherwise a list with one scalar
        per element of ``outputs``.
    """
    distributed = popdist.isPopdistEnvSet()

    if isinstance(outputs, float):
        if not distributed:
            return outputs
        reduced = hvd.allreduce(torch.Tensor([outputs]), average=average)
        return float(reduced.item())

    if distributed:
        return [
            hvd.allreduce(metric.div(factor), average=average).mean().item()
            for metric in outputs
        ]
    return [metric.div(factor).mean().item() for metric in outputs]
def benchmark_throughput(dataloader, iteration=2):
    """Iterate over ``dataloader`` and print the achieved samples per second.

    Each pass walks the whole dataloader (with a tqdm progress bar), counts
    the first dimension of every input batch and divides by the wall-clock
    time of the pass. Under popdist the time and sample count are first passed
    through ``utils.synchronize_throughput_values``.

    Args:
        dataloader: Sized iterable yielding ``(input, label)`` pairs whose
            input exposes ``size()``.
        iteration: Number of full passes to time.
    """
    for _ in range(iteration):
        tic = time.perf_counter()
        samples_seen = sum(
            batch.size()[0]
            for batch, _ in tqdm(dataloader, total=len(dataloader))
        )
        seconds = time.perf_counter() - tic

        if popdist.isPopdistEnvSet():
            seconds, samples_seen = utils.synchronize_throughput_values(
                seconds,
                samples_seen,
            )

        throughput = samples_seen / seconds
        print(f"Throughput of the iteration:{throughput:0.1f} img/sec")
def __iter__(self):
    """Prepare this dataset for iteration in the current DataLoader worker.

    Sets ``self.shard`` to ``(shard_index, shard_count)`` when running inside
    a DataLoader worker, or to ``None`` in the main process. Under popdist the
    index and count are made global across instances by offsetting with the
    instance index and scaling by the number of instances. ``self.worker_id``
    is only assigned when worker info is available. The dataset is then reset
    and, if ``self.shuffle`` is set, ``self.files`` is shuffled in place.

    Returns:
        ``self``.
    """
    info = torch.utils.data.get_worker_info()

    if info is None:
        self.shard = None
    elif popdist.isPopdistEnvSet():
        # Number workers consecutively across all popdist instances.
        global_worker_id = info.id + info.num_workers * popdist.getInstanceIndex()
        self.worker_id = global_worker_id
        self.shard = (global_worker_id,
                      info.num_workers * popdist.getNumInstances())
    else:
        self.worker_id = info.id
        self.shard = (info.id, info.num_workers)

    self.reset()
    if self.shuffle:
        np.random.shuffle(self.files)
    return self
def set_popdist_args(args):
    """Record the PopRun topology on ``args`` and validate it against the config.

    Without a popdist environment the function only logs that single-instance
    training is used and leaves ``args`` untouched. Otherwise it sets
    ``use_popdist``, ``local_replication_factor``, ``num_instances`` and
    ``instance_idx`` from popdist.

    Args:
        args: Option namespace; ``replication_factor`` and
            ``samples_per_step`` are read, the fields above are written.

    Raises:
        RuntimeError: If ``args.replication_factor`` differs from the total
            replica count reported by popdist, or if ``args.samples_per_step``
            is not divisible by the number of instances.
    """
    if not popdist.isPopdistEnvSet():
        logger.info("No PopRun detected. Using single instance training")
        return

    logger.info("PopRun is detected")
    args.use_popdist = True

    core = popdist.popdist_core
    total_replicas = core.getNumTotalReplicas()
    args.local_replication_factor = popdist.getNumLocalReplicas()
    args.num_instances = core.getNumInstances()
    # Sanity check: the replicas are expected to be spread evenly over the instances.
    assert total_replicas == args.local_replication_factor * args.num_instances
    args.instance_idx = core.getInstanceIndex()

    if args.replication_factor != total_replicas:
        raise RuntimeError(f"Replication factor({args.replication_factor}) "
                           f"should match popdist replication factor ({total_replicas})")

    if args.samples_per_step % args.num_instances != 0:
        raise RuntimeError(f"The number of samples per step({args.samples_per_step}) "
                           f"has to be a integer multiple of the number of instances({args.num_instances})")
def parse_bert_args(args=None):
    """Build the BERT option namespace from a YAML config plus CLI overrides.

    Parsing happens in two passes. A minimal parser first extracts
    ``--config`` (default ``demo_tiny_128``). The entry of that name in the
    YAML file at ``config_file`` is then installed as the defaults of the full
    parser, so flags given explicitly on the command line override the config.

    After parsing, the namespace is post-processed:

    * popdist is initialised when its environment is set, otherwise
      ``use_popdist`` is set to False;
    * ``layers_per_ipu`` is expanded to a list and must sum to
      ``num_hidden_layers``;
    * ``matmul_proportion`` is expanded to one entry per IPU of a replica;
    * ``global_batch_size`` and ``samples_per_step`` are derived;
    * ``intermediate_size`` is always overwritten with ``hidden_size * 4``.

    Args:
        args: Optional list of argument strings, forwarded to argparse.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: Via ``parser.error`` for inconsistent or missing
            ``layers_per_ipu`` / ``matmul_proportion`` / ``checkpoint_steps``
            values, or via ``sys.exit(1)`` when the YAML file cannot be parsed.
    """
    # First pass: only pick out the configuration name, leave the rest alone.
    pparser = argparse.ArgumentParser("BERT Configuration name", add_help=False)
    pparser.add_argument("--config",
                         type=str,
                         help="Configuration Name",
                         default='demo_tiny_128')
    pargs, remaining_args = pparser.parse_known_args(args=args)
    config_name = pargs.config

    parser = argparse.ArgumentParser(
        "Poptorch BERT",
        add_help=True,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Execution
    parser.add_argument(
        "--micro-batch-size",
        type=int,
        help="Set the micro-batch-size. This is the single forward-backward path batch-size on one replica")
    parser.add_argument("--training-steps", type=int, help="Number of training steps")
    parser.add_argument("--batches-per-step", type=int, help="Number of batches per training step")
    parser.add_argument("--replication-factor", type=int, help="Number of replicas")
    parser.add_argument(
        "--gradient-accumulation",
        type=int,
        help="Number of gradients to accumulate before updating the weights")
    parser.add_argument(
        "--embedding-serialization-factor",
        type=int,
        help="Matmul serialization factor the embedding layers")
    parser.add_argument(
        "--recompute-checkpoint-every-layer",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="This controls how recomputation is handled in pipelining. "
        "If True the output of each encoder layer will be stashed keeping the max liveness "
        "of activations to be at most one layer. "
        "However, the stash size scales with the number of pipeline stages so this may not always be beneficial. "
        "The added stash + code could be greater than the reduction in temporary memory.",
    )
    parser.add_argument(
        "--enable-half-partials",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Enable half partials for matmuls and convolutions globally")
    parser.add_argument(
        "--optimizer-state-offchip",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=True,
        help="Set the tensor storage location for optimizer state to be offchip.")
    parser.add_argument(
        "--replicated-tensor-sharding",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Enable replicated tensor sharding of optimizer state")
    parser.add_argument("--ipus-per-replica",
                        type=int,
                        help="Number of IPUs required by each replica")
    parser.add_argument(
        "--layers-per-ipu",
        type=int,
        nargs="+",
        help="Number of encoders placed on each IPU. Can be a single number, for an equal number encoder layers per IPU. "
        "Or it can be a list of numbers, specifying number of encoder layers for each individual IPU.")
    parser.add_argument(
        "--matmul-proportion",
        type=float,
        nargs="+",
        help="Relative IPU memory proportion size allocated for matmul")
    parser.add_argument("--async-dataloader",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Enable asynchronous mode in the DataLoader")
    parser.add_argument("--random-seed", type=int, help="Seed for RNG")
    parser.add_argument("--num-epochs",
                        type=int,
                        help="SQuAD only - number of epochs to train for")

    # Optimizer
    parser.add_argument("--optimizer",
                        type=str,
                        choices=['AdamW', 'LAMB', 'LAMBNoBiasCorrection'],
                        help="optimizer to use for the training")
    parser.add_argument(
        "--learning-rate",
        type=float,
        help="Learning rate value for constant schedule, maximum for linear schedule.")
    parser.add_argument(
        "--lr-schedule",
        type=str,
        choices=["constant", "linear"],
        help="Type of learning rate schedule. --learning-rate will be used as the max value")
    parser.add_argument(
        "--lr-warmup",
        type=float,
        help="Proportion of lr-schedule spent in warm-up. Number in range [0.0, 1.0]")
    parser.add_argument(
        "--loss-scaling",
        type=float,
        help="Loss scaling factor (recommend using powers of 2). "
        "If using automatic loss scaling, this value will be the initial value.")
    parser.add_argument("--weight-decay", type=float, help="Set the weight decay")
    parser.add_argument(
        "--enable-half-first-order-momentum",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Use float16 for the first order momentum in the optimizer.")
    parser.add_argument("--squad-do-training",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Do SQuAD training (run_squad only)")
    parser.add_argument("--squad-do-validation",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Do SQuAD validation (run_squad only)")

    # Model
    parser.add_argument("--sequence-length", type=int, help="The max sequence length")
    parser.add_argument(
        "--mask-tokens",
        type=int,
        help="Set the max number of MLM tokens in the input dataset.")
    parser.add_argument("--vocab-size", type=int, help="Set the size of the vocabulary")
    parser.add_argument(
        "--hidden-size",
        type=int,
        help="The size of the hidden state of the transformer layers")
    parser.add_argument("--intermediate-size", type=int, help="hidden-size*4")
    parser.add_argument("--num-hidden-layers",
                        type=int,
                        help="The number of transformer layers")
    parser.add_argument("--num-attention-heads",
                        type=int,
                        help="Set the number of heads in self attention")
    parser.add_argument("--layer-norm-eps",
                        type=float,
                        help="The eps value for the layer norms")

    # Hugging Face specific
    # NOTE(review): const=True on a float option means a bare flag stores
    # True rather than a probability - confirm this is intended.
    parser.add_argument("--attention-probs-dropout-prob",
                        type=float,
                        nargs="?",
                        const=True,
                        help="Attention dropout probability")

    # Dataset
    parser.add_argument("--input-files", type=str, nargs="+", help="Input data files")
    parser.add_argument("--dataset",
                        type=str,
                        choices=['generated', 'pretraining'],
                        help="dataset to use for the training")
    parser.add_argument("--synthetic-data",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=False,
                        help="No Host/IPU I/O, random data created on device")

    # Misc
    parser.add_argument("--dataloader-workers",
                        type=int,
                        help="The number of dataloader workers")
    parser.add_argument(
        "--profile-dir",
        type=str,
        help="Enable profiling and store results in this directory")
    parser.add_argument("--custom-ops",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Enable custom ops")
    parser.add_argument("--wandb",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=False,
                        help="Enabling logging to Weights and Biases")
    parser.add_argument(
        "--wandb-param-steps",
        type=int,
        default=None,
        help="Log the model parameter statistics to Weights and Biases after every n training steps")
    parser.add_argument(
        "--disable-progress-bar",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Disable the training progress bar. This is useful if you want to parse the stdout of a run")
    parser.add_argument(
        "--compile-only",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Create an offline IPU target that can only be used for offline compilation.")
    parser.add_argument(
        "--executable-cache-dir",
        type=str,
        default="",
        help="Directory where Poplar executables are cached. If set, recompilation of identical graphs can be avoided. "
        "Required for both saving and loading executables.")

    # Checkpointing
    parser.add_argument("--checkpoint-output-dir",
                        type=str,
                        default="",
                        help="Directory where checkpoints will be saved to. "
                        "This can be either an absolute or relative path.")
    parser.add_argument(
        "--checkpoint-steps",
        type=int,
        default=None,
        help="Option to checkpoint model after every n training steps.")
    parser.add_argument(
        "--resume-training-from-checkpoint",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Restore both the model checkpoint and training state in order to resume a training run.")
    parser.add_argument(
        "--pretrained-checkpoint",
        type=str,
        default="",
        help="Checkpoint to be retrieved for further training. This can "
        "be either an absolute or relative path to the checkpoint directory "
        "or the name of a model on HuggingFace model hub.")

    # This is here only for the help message
    parser.add_argument("--config", type=str, help="Configuration name")

    # Load the yaml
    yaml_args = dict()
    if config_name is not None:
        with open(config_file, "r") as f:
            try:
                yaml_args.update(**yaml.safe_load(f)[config_name])
            except yaml.YAMLError as exc:
                print(exc)
                sys.exit(1)

    # Check the yaml args are valid
    known_args = set(vars(parser.parse_args("")))
    unknown_args = set(yaml_args) - known_args

    if unknown_args:
        logger(f" Warning: Unknown arg(s) in config file: {unknown_args}")

    # Config values become defaults, so explicit CLI flags still win.
    parser.set_defaults(**yaml_args)
    args = parser.parse_args(remaining_args)

    # Initialise PopDist
    if popdist.isPopdistEnvSet():
        init_popdist(args)
        # NOTE(review): the value returned by hvd.broadcast is discarded, so
        # args.random_seed itself is not modified here - confirm whether the
        # seed was meant to be replaced by rank 0's value.
        hvd.broadcast(torch.Tensor([args.random_seed]), root_rank=0)
    else:
        args.use_popdist = False

    # Expand layers_per_ipu input into list representation
    if args.layers_per_ipu is None:
        # Previously this fell through to len(None) and raised TypeError.
        parser.error("layers_per_ipu must be set on the command line or in the config")
    if isinstance(args.layers_per_ipu, int):
        args.layers_per_ipu = [args.layers_per_ipu]

    if len(args.layers_per_ipu) == 1:
        layers_per_ipu_ = args.layers_per_ipu[0]
        if layers_per_ipu_ < 1:
            # A value of 0 used to raise ZeroDivisionError in the expansion below.
            parser.error(f"layers_per_ipu must be >=1, got {layers_per_ipu_}")
        args.layers_per_ipu = [layers_per_ipu_] * (args.num_hidden_layers // layers_per_ipu_)

    if sum(args.layers_per_ipu) != args.num_hidden_layers:
        parser.error(
            f"layers_per_ipu not compatible with number of hidden layers: {args.layers_per_ipu} and {args.num_hidden_layers}"
        )

    # Expand matmul_proportion input into list representation
    if args.matmul_proportion is None:
        # Previously this fell through to len(None) and raised TypeError.
        parser.error("matmul_proportion must be set on the command line or in the config")
    if isinstance(args.matmul_proportion, float):
        args.matmul_proportion = [args.matmul_proportion] * args.ipus_per_replica

    if len(args.matmul_proportion) != args.ipus_per_replica:
        if len(args.matmul_proportion) == 1:
            args.matmul_proportion = args.matmul_proportion * args.ipus_per_replica
        else:
            parser.error(
                f"Length of matmul_proportion doesn't match ipus_per_replica: {args.matmul_proportion} vs {args.ipus_per_replica}"
            )

    if args.checkpoint_steps is not None and args.checkpoint_steps < 1:
        parser.error("checkpoint-steps must be >=1")

    # Under popdist the replication factor is multiplied by the number of instances.
    if args.use_popdist:
        args.global_batch_size = args.replication_factor * args.gradient_accumulation * args.micro_batch_size * args.popdist_size
    else:
        args.global_batch_size = args.replication_factor * args.gradient_accumulation * args.micro_batch_size

    args.samples_per_step = args.replication_factor * args.gradient_accumulation * args.micro_batch_size * args.batches_per_step
    # Always derived from hidden_size; any --intermediate-size value is overwritten.
    args.intermediate_size = args.hidden_size * 4

    return args
def add_pretraining_options(parser: argparse.ArgumentParser): group = parser.add_argument_group("Pretraining options") # Add pretraining-specific command line options here. return parser if __name__ == '__main__': tf.logging.set_verbosity(tf.logging.ERROR) opts = make_global_options([add_pretraining_options]) opts['shards'] = ipu_utils.next_power_of_two( max(opts["device_mapping"]) + 1) if popdist.isPopdistEnvSet(): opts['use_popdist'] = True opts['replicas'] = popdist.getNumLocalReplicas() opts['total_replicas'] = popdist.getNumTotalReplicas() if opts['compile_only']: opts['select_ipu'] = None else: opts['select_ipu'] = popdist.getDeviceId() else: opts['use_popdist'] = False opts['total_replicas'] = opts['replicas'] opts['select_ipu'] = None set_defaults(opts) set_poplar_engine_options(execution_profile=opts['execution_profile'],
def handle_distributed_settings(args):
    """Initialise popdist for ``args`` when launched under it.

    When the popdist environment is not set, the only effect is
    ``args.use_popdist = False``; otherwise ``init_popdist(args)`` is called.

    Args:
        args: Option namespace, mutated in place.
    """
    if not popdist.isPopdistEnvSet():
        args.use_popdist = False
        return

    # Initialise popdist
    init_popdist(args)
def logger(msg):
    """Log ``msg`` at INFO level from a single process only.

    Without popdist the message is always logged. Under popdist it is logged
    only by the instance whose index is 0; every other instance stays silent.

    Args:
        msg: Message forwarded to ``logging.info``.
    """
    is_secondary_instance = (popdist.isPopdistEnvSet()
                             and popdist.getInstanceIndex() != 0)
    if is_secondary_instance:
        return
    logging.info(msg)
bn_momentum = args.bn_momentum checkpoints = args.checkpoints clean_dir = args.clean_dir checkpoint_dir = args.checkpoint_dir label_smoothing = args.label_smoothing optimizer_name = args.optimizer optimizer_params = args.optimizer_params seed = args.seed internal_exchange_optimization_target = args.internal_exchange_optimization_target max_cross_replica_buffer_size = args.max_cross_replica_buffer_size max_reduce_many_buffer_size = args.max_reduce_many_buffer_size gather_conv_output = args.gather_conv_output pipeline_num_parallel = args.pipeline_num_parallel # check if the script has been called by poprun distributed_training = popdist.isPopdistEnvSet() if distributed_training: if num_replicas != popdist.getNumTotalReplicas(): logging.warning( f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) ' f'does not match the config (=={num_replicas}). Poprun will override the config.' ) num_replicas = popdist.getNumTotalReplicas() max_threads_per_instance = os.cpu_count() // popdist.getNumInstances() if pipeline_num_parallel > max_threads_per_instance: logging.warning( f'The number of chosen threads {pipeline_num_parallel} is bigger than the total number of physical threads ' f'divided by the number of instances, Poprun will override the config. ' )
def parse_arguments():
    """Parse and validate the options of the PopTorch CNN training script.

    The parser extends ``utils.get_common_parser()`` with training-specific
    options and the weight-averaging options, and is evaluated through
    ``utils.parse_with_config(parser, "configs.yml")``. Afterwards:

    * ``initial_loss_scaling`` defaults to ``loss_scaling`` when not given;
    * popdist is initialised if its environment is set, otherwise
      ``use_popdist`` is set to False;
    * a seed is generated when none was supplied;
    * the logging folder is set up and the option combination is validated.

    Returns:
        The validated option namespace.

    Raises:
        SystemExit: With exit status 1 when ``--available-memory-proportion``
            does not fit the pipeline layout, or when weight averaging is
            requested without ``--checkpoint-path``.
    """
    common_parser = utils.get_common_parser()
    parser = argparse.ArgumentParser(description='CNN training in PopTorch',
                                     parents=[common_parser])
    parser.add_argument(
        '--data',
        choices=['cifar10', 'imagenet', 'synthetic', 'generated'],
        default='cifar10',
        help="Choose data")
    parser.add_argument(
        '--precision',
        choices=['16.16', '16.32', '32.32'],
        default='16.16',
        help="Precision of Ops(weights/activations/gradients) and Master data types: 16.16, 16.32, 32.32")
    parser.add_argument('--imagenet-data-path',
                        type=str,
                        default="/localdata/datasets/imagenet-raw-data",
                        help="Path of the raw imagenet data")
    parser.add_argument(
        '--gradient-accumulation',
        type=int,
        default=1,
        help="Number of batches to accumulate before a gradient update")
    parser.add_argument('--lr', type=float, default=0.01, help="Initial learning rate")
    parser.add_argument('--weight-decay',
                        type=float,
                        default=0.0001,
                        help="L2 parameter penalty")
    parser.add_argument('--momentum', type=float, default=0.0, help="Momentum factor")
    parser.add_argument('--rmsprop-decay',
                        type=float,
                        default=0.99,
                        help="RMSprop smoothing constant")
    parser.add_argument('--epoch', type=int, default=10, help="Number of training epochs")
    parser.add_argument(
        '--checkpoint-path',
        type=str,
        default="",
        help="Checkpoint path(if it is not defined, no checkpoint is created")
    parser.add_argument(
        '--validation-mode',
        choices=['none', 'during', 'after'],
        default="after",
        help='The model validation mode. none=no validation; during=validate after every epoch; after=validate after the training')
    parser.add_argument(
        '--disable-metrics',
        action='store_true',
        help='Do not calculate metrics during training, useful to measure peak throughput')
    parser.add_argument('--wandb',
                        action='store_true',
                        help="Add Weights & Biases logging")
    parser.add_argument('--seed', type=int, help="Set the random seed")
    parser.add_argument(
        '--enable-recompute',
        action='store_true',
        help='Enable the recomputation of network activations during backward pass instead '
        'of caching them during forward pass. This option turns on the recomputation for single-stage models. If the model is multi '
        'stage (pipelined) the recomputation is always enabled.')
    parser.add_argument(
        '--recompute-checkpoints',
        type=str,
        nargs='+',
        default=[],
        help='List of recomputation checkpoint rules: [conv:store convolution activations|norm: store normlayer activations]')
    parser.add_argument('--offload-optimizer',
                        action='store_true',
                        help='Offload the optimizer from the IPU memory')
    parser.add_argument(
        '--available-memory-proportion',
        type=float,
        default=[],
        nargs='+',
        help='Proportion of memory which is available for convolutions. Use a value of less than 0.6')
    parser.add_argument(
        '--logs-per-epoch',
        type=int,
        default=1,
        help="The number of times the resuls are logged and a checkpoint is saved in each epoch")
    parser.add_argument(
        '--validation-frequency',
        type=int,
        default=4,
        help="How many training epochs to run between validation steps")
    parser.add_argument(
        '--label-smoothing',
        type=float,
        default=0.0,
        help='Label smoothing factor (Default=0 => no smoothing)')

    # LR schedule related params
    parser.add_argument('--lr-schedule',
                        choices=["step", "cosine", "exponential"],
                        default="step",
                        help="Learning rate schedule")
    parser.add_argument('--lr-decay',
                        type=float,
                        default=0.5,
                        help="Learning rate decay")
    parser.add_argument('--lr-epoch-decay',
                        type=int,
                        nargs='+',
                        default=[],
                        help="List of epoch, when lr drops")
    parser.add_argument('--warmup-epoch',
                        type=int,
                        default=0,
                        help="Number of learning rate warmup epochs")
    parser.add_argument(
        '--lr-scheduler-freq',
        type=float,
        default=0,
        help="Number of lr scheduler updates per epoch (0 to disable and update every iteration)")
    parser.add_argument('--optimizer',
                        choices=['sgd', 'adamw', 'rmsprop'],
                        default='sgd',
                        help="Define the optimizer")

    # half precision training params
    parser.add_argument(
        '--loss-scaling',
        type=float,
        default=1.0,
        help="Loss scaling factor. This value is reached by the end of the training.")
    parser.add_argument(
        '--loss-velocity-scaling-ratio',
        type=float,
        default=1.0,
        help="Only for SGD optimizer: Loss Velocity / Velocity scaling ratio. In case of large number of replicas >1.0 can increase numerical stability")
    parser.add_argument(
        '--initial-loss-scaling',
        type=float,
        help="Initial loss scaling factor. The loss scaling interpolates between this and loss-scaling value."
        "Example: 100 epoch, initial loss scaling 16, loss scaling 128: Epoch 1-25 ls=16;Epoch 26-50 ls=32;Epoch 51-75 ls=64;Epoch 76-100 ls=128")
    parser.add_argument('--enable-stochastic-rounding',
                        action="store_true",
                        help="Enable Stochastic Rounding")
    parser.add_argument('--enable-fp-exceptions',
                        action="store_true",
                        help="Enable Floating Point Exceptions")

    # weight averaging params
    weight_avg.add_parser_arguments(parser)

    opts = utils.parse_with_config(parser, "configs.yml")
    if opts.initial_loss_scaling is None:
        opts.initial_loss_scaling = opts.loss_scaling

    # Initialise popdist
    if popdist.isPopdistEnvSet():
        init_popdist(opts)
    else:
        opts.use_popdist = False

    if opts.seed is None:
        opts.seed = generate_random_seed(opts.use_popdist)

    # setup logging
    utils.Logger.setup_logging_folder(opts)

    # One more stage than there are split points.
    num_stages = len(opts.pipeline_splits) + 1
    num_amps = len(opts.available_memory_proportion)

    # Configuration errors exit with a non-zero status so that calling
    # scripts can detect the failure (a bare sys.exit() reports success).
    if num_stages == 1 and num_amps > 0:
        logging.error(
            '--available-memory-proportion should only be set when pipelining')
        sys.exit(1)
    elif num_stages > 1 and num_amps > 0 and num_amps != num_stages and num_amps != 1:
        logging.error(
            f'--available-memory-proportion number of elements should be either 1 or equal to the number of pipeline stages: {num_stages}'
        )
        sys.exit(1)

    if opts.weight_avg_strategy != 'none' and opts.checkpoint_path == '':
        logging.error(
            'Please provide a --checkpoint-path folder to apply weight averaging to.'
        )
        sys.exit(1)

    if opts.batch_size == 1 and opts.norm_type == "batch":
        logging.warning(
            "BatchNorm with batch size of 1 may cause instability during inference."
        )

    if num_stages > 1:
        logging.info("Recomputation is always enabled when using pipelining.")

    if not opts.enable_recompute and len(opts.recompute_checkpoints) > 0:
        logging.warning(
            "Recomputation is not enabled, whlile recomputation checkpoints are provided."
        )

    return opts
def ipu_prog(num_replicas, gradient_accumulation):
    """Train a one-weight Keras model on the IPU and return the trained weight.

    The model is a single bias-free ``Dense(1)`` layer whose kernel starts at
    1, trained with SGD (learning rate 1.0) and mean-squared-error loss on 16
    samples with inputs 1..16 and all-zero targets. The function works both
    stand-alone and under popdist; under popdist the replica and instance
    counts come from popdist instead of ``num_replicas``.

    Args:
        num_replicas: Replica count used when popdist is not active.
        gradient_accumulation: Gradient accumulation steps per replica.

    Returns:
        The scalar ``model.get_weights()[0][0][0]`` after fitting.
    """
    import logging
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    popdist_on = popdist.isPopdistEnvSet()

    # Under popdist the launcher dictates the topology; otherwise the caller does.
    num_global_replicas = popdist.getNumTotalReplicas() if popdist_on else num_replicas
    num_instances = popdist.getNumInstances() if popdist_on else 1

    # The dataset holds exactly one global batch; it is split evenly over
    # replicas and accumulation steps to obtain the micro batch size.
    dataset_size = global_batch_size = 16
    micro_batch_size = int(global_batch_size / num_global_replicas / gradient_accumulation)

    # Inputs 1..16 with all-zero targets.
    X = np.arange(1, dataset_size + 1, 1, dtype=float)
    Y = [0] * dataset_size
    ds = tf.data.Dataset.from_tensor_slices((X, Y))
    if popdist_on:
        # Each instance only reads its own shard of the data.
        ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
    ds = ds.batch(micro_batch_size, drop_remainder=True)
    ds = ds.repeat()

    cfg = ipu.config.IPUConfig()
    if popdist_on:
        cfg = popdist.tensorflow.set_ipu_config(
            cfg,
            ipus_per_replica=popdist.getNumIpusPerReplica(),
            configure_device=True)
        hvd.init()
    else:
        cfg.auto_select_ipus = num_global_replicas
        cfg.configure_ipu_system()

    strategy = popdist_strategy.PopDistStrategy() if popdist_on else ipu.ipu_strategy.IPUStrategy()

    with strategy.scope():

        def get_model():
            # Single bias-free dense unit whose kernel is initialised to 1.
            input_layer = tf.keras.Input(shape=1)
            kernel_initializer = tf.keras.initializers.Constant(1)
            x = tf.keras.layers.Dense(
                1, use_bias=False,
                kernel_initializer=kernel_initializer)(input_layer)
            return tf.keras.Model(input_layer, x)

        model = get_model()
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation)
        model.build(input_shape=(micro_batch_size, 1))

        # Gradients are always divided by the accumulation count. Only the
        # non-popdist path also divides by the replica count.
        # NOTE(review): presumably the PopDist strategy already averages
        # across replicas - confirm.
        if popdist_on:

            def gradient_normalizer(grads_and_vars):
                return [(grad / gradient_accumulation, var)
                        for grad, var in grads_and_vars]
        else:

            def gradient_normalizer(grads_and_vars):
                return [
                    (grad / num_global_replicas / gradient_accumulation, var)
                    for grad, var in grads_and_vars
                ]

        optimizer = tf.keras.optimizers.SGD(
            learning_rate=1.0, gradient_transformers=[gradient_normalizer])

        # Wrap the loss class so that loss values are also sent to the outfeed queue.
        loss_class = tf.keras.losses.MeanSquaredError
        loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
        loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
        loss = loss_class()

        # Round the number of steps down to a whole number of weight updates.
        micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
        steps_per_execution = dataset_size // (
            micro_batch_size * micro_batches_per_weight_update
        ) * micro_batches_per_weight_update

        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=[tf.keras.losses.MSE],
                      steps_per_execution=steps_per_execution)

        callbacks = [
            OutFeedQueueCallback(queue=loss_outfeed_queue, name='average_loss')
        ]
        # The all-reduce callback is only added when there is more than one instance.
        if num_instances > 1:
            callbacks += [AllReduceMetricsCallback()]
        callbacks += [LoggingCallback(1)]

        model.fit(ds, steps_per_epoch=steps_per_execution, callbacks=callbacks)

        # The model has a single weight tensor; return its only scalar.
        return model.get_weights()[0][0][0]