Example 1
def set_popdist_args(args):
    if not popdist.isPopdistEnvSet():
        args.use_popdist = False
        args.popdist_size = 1
        args.popdist_rank = 0
        return

    if args.inference:
        raise RuntimeError("Distributed execution is only supported for training")

    try:
        import horovod.popart as hvd
        hvd.init()
    except ImportError:
        raise ImportError("Could not find the PopART horovod extension. "
                          "Please install the horovod .whl provided in the Poplar SDK.")

    args.use_popdist = True
    popdist_local_factor = popdist.getNumLocalReplicas()
    if args.replication_factor > 1 and args.replication_factor != popdist_local_factor:
        logger.warning(f"Overwriting the local replication factor {args.replication_factor} to {popdist_local_factor}")
    args.replication_factor = popdist_local_factor

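    # Number of instances and this instance's index, derived from the total vs. local replica counts.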
    args.popdist_size = popdist.getNumTotalReplicas() // popdist.getNumLocalReplicas()
    args.popdist_rank = popdist.getReplicaIndexOffset() // popdist.getNumLocalReplicas()
    args.checkpoint_dir = args.checkpoint_dir + "_rank_" + str(args.popdist_rank)

    from mpi4py import MPI
    setup_comm(MPI.COMM_WORLD)
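A minimal sketch of how this helper might be driven (the argparse.Namespace below stands in for the real parsed arguments; only the fields the function touches are shown, and the values are illustrative):

import argparse

args = argparse.Namespace(inference=False, replication_factor=2, checkpoint_dir="checkpoints")
set_popdist_args(args)
# Afterwards args.use_popdist, args.popdist_size and args.popdist_rank are set; under
# poprun, args.replication_factor and args.checkpoint_dir are rewritten as well.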
Example 2
def get_basic_logger(name):
    log_levels_map = dict(CRITICAL=logging.CRITICAL,
                          ERROR=logging.ERROR,
                          WARNING=logging.WARNING,
                          INFO=logging.INFO,
                          DEBUG=logging.DEBUG,
                          NOTSET=logging.NOTSET)
    log_level_env = os.getenv("RNNT_LOG_LEVEL")
    log_level = log_levels_map.get(log_level_env, logging.INFO)

    lh = logging.StreamHandler(sys.stdout)
    lh.setLevel(log_level)
    logging.basicConfig(
        format='%(asctime)s.%(msecs)03d %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[lh])
    logger = logging.getLogger(name)
    logger.setLevel(log_level)

    if popdist.isPopdistEnvSet():
        instance_idx = popdist.popdist_core.getInstanceIndex()
    else:
        instance_idx = 0
    if instance_idx > 0:
        # To avoid excess logging, disable the logger on all instances other than instance 0.
        logger.disabled = True

    return logger
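A brief usage sketch (RNNT_LOG_LEVEL is optional; a missing or unrecognised value falls back to INFO):

os.environ["RNNT_LOG_LEVEL"] = "DEBUG"
logger = get_basic_logger(__name__)
logger.debug("building the dataloader")  # automatically silenced on instances other than instance 0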
Example 3
def sync_metrics(outputs, factor=1, average=True):
    if popdist.isPopdistEnvSet():
        if isinstance(outputs, float):
            return float(
                hvd.allreduce(torch.Tensor([outputs]), average=average).item())
        else:
            return [
                hvd.allreduce(output.div(factor),
                              average=average).mean().item()
                for output in outputs
            ]
    else:
        if isinstance(outputs, float):
            return outputs
        else:
            return [output.div(factor).mean().item() for output in outputs]
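A minimal usage sketch, assuming hvd.init() has already been called when running under poprun; batch_losses is an illustrative name:

batch_losses = [torch.tensor([0.9, 1.1]), torch.tensor([0.8, 1.0])]
mean_losses = sync_metrics(batch_losses, factor=1)  # per-tensor means; all-reduced across instances under poprun
epoch_loss = sync_metrics(0.95)                     # plain floats are returned as-is, or all-reduced under poprun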
Example 4
def benchmark_throughput(dataloader, iteration=2):
    for _ in range(iteration):
        total_sample_size = 0
        start_time = time.perf_counter()
        for input_data, _ in tqdm(dataloader, total=len(dataloader)):
            total_sample_size += input_data.size()[0]
        elapsed_time = time.perf_counter() - start_time

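        # When launched with poprun, merge per-instance elapsed times and sample counts so the reported throughput is global.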
        if popdist.isPopdistEnvSet():
            elapsed_time, total_sample_size = utils.synchronize_throughput_values(
                elapsed_time,
                total_sample_size,
            )

        iteration_throughput = total_sample_size / elapsed_time
        print(f"Throughput of the iteration:{iteration_throughput:0.1f} img/sec")
Example 5
    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            if popdist.isPopdistEnvSet():
                # Offset the local worker id by the instance index so ids are unique across
                # instances, and shard over the workers of all instances.
                self.worker_id = worker_info.id + worker_info.num_workers * popdist.getInstanceIndex()
                self.shard = (worker_info.id + worker_info.num_workers * popdist.getInstanceIndex(),
                              worker_info.num_workers * popdist.getNumInstances())
            else:
                self.worker_id = worker_info.id
                self.shard = worker_info.id, worker_info.num_workers
        else:
            self.shard = None
        self.reset()
        if self.shuffle:
            np.random.shuffle(self.files)
        return self
Example 6
def set_popdist_args(args):
    if not popdist.isPopdistEnvSet():
        logger.info("No PopRun detected. Using single instance training")
    else:
        logger.info("PopRun is detected")

        args.use_popdist = True
        num_total_replicas = popdist.popdist_core.getNumTotalReplicas()
        args.local_replication_factor = popdist.getNumLocalReplicas()
        args.num_instances = popdist.popdist_core.getNumInstances()
        assert num_total_replicas == args.local_replication_factor * args.num_instances
        args.instance_idx = popdist.popdist_core.getInstanceIndex()

        if args.replication_factor != num_total_replicas:
            raise RuntimeError(f"Replication factor({args.replication_factor}) "
                               f"should match popdist replication factor ({num_total_replicas})")

        if args.samples_per_step % args.num_instances != 0:
            raise RuntimeError(f"The number of samples per step({args.samples_per_step}) "
                               f"has to be a integer multiple of the number of instances({args.num_instances})")
Example 7
def parse_bert_args(args=None):
    pparser = argparse.ArgumentParser("BERT Configuration name",
                                      add_help=False)
    pparser.add_argument("--config",
                         type=str,
                         help="Configuration Name",
                         default='demo_tiny_128')
    pargs, remaining_args = pparser.parse_known_args(args=args)
    config_name = pargs.config

    parser = argparse.ArgumentParser(
        "Poptorch BERT",
        add_help=True,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Execution
    parser.add_argument(
        "--micro-batch-size",
        type=int,
        help=
        "Set the micro-batch-size. This is the single forward-backward path batch-size on one replica"
    )
    parser.add_argument("--training-steps",
                        type=int,
                        help="Number of training steps")
    parser.add_argument("--batches-per-step",
                        type=int,
                        help="Number of batches per training step")
    parser.add_argument("--replication-factor",
                        type=int,
                        help="Number of replicas")
    parser.add_argument(
        "--gradient-accumulation",
        type=int,
        help="Number of gradients to accumulate before updating the weights")
    parser.add_argument(
        "--embedding-serialization-factor",
        type=int,
        help="Matmul serialization factor the embedding layers")
    parser.add_argument(
        "--recompute-checkpoint-every-layer",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="This controls how recomputation is handled in pipelining. "
        "If True the output of each encoder layer will be stashed keeping the max liveness "
        "of activations to be at most one layer. "
        "However, the stash size scales with the number of pipeline stages so this may not always be beneficial. "
        "The added stash + code could be greater than the reduction in temporary memory.",
    )
    parser.add_argument(
        "--enable-half-partials",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Enable half partials for matmuls and convolutions globally")
    parser.add_argument(
        "--optimizer-state-offchip",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=True,
        help=
        "Set the tensor storage location for optimizer state to be offchip.")
    parser.add_argument(
        "--replicated-tensor-sharding",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Enable replicated tensor sharding of optimizer state")
    parser.add_argument("--ipus-per-replica",
                        type=int,
                        help="Number of IPUs required by each replica")
    parser.add_argument(
        "--layers-per-ipu",
        type=int,
        nargs="+",
        help=
        "Number of encoders placed on each IPU. Can be a single number, for an equal number encoder layers per IPU.\
                              Or it can be a list of numbers, specifying number of encoder layers for each individual IPU."
    )
    parser.add_argument(
        "--matmul-proportion",
        type=float,
        nargs="+",
        help="Relative IPU memory proportion size allocated for matmul")
    parser.add_argument("--async-dataloader",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Enable asynchronous mode in the DataLoader")
    parser.add_argument("--random-seed", type=int, help="Seed for RNG")
    parser.add_argument("--num-epochs",
                        type=int,
                        help="SQuAD only - number of epochs to train for")

    # Optimizer
    parser.add_argument("--optimizer",
                        type=str,
                        choices=['AdamW', 'LAMB', 'LAMBNoBiasCorrection'],
                        help="optimizer to use for the training")
    parser.add_argument(
        "--learning-rate",
        type=float,
        help=
        "Learning rate value for constant schedule, maximum for linear schedule."
    )
    parser.add_argument(
        "--lr-schedule",
        type=str,
        choices=["constant", "linear"],
        help=
        "Type of learning rate schedule. --learning-rate will be used as the max value"
    )
    parser.add_argument(
        "--lr-warmup",
        type=float,
        help=
        "Proportion of lr-schedule spent in warm-up. Number in range [0.0, 1.0]"
    )
    parser.add_argument(
        "--loss-scaling",
        type=float,
        help="Loss scaling factor (recommend using powers of 2).\
                             If using automatic loss scaling, this value will be the initial value."
    )
    parser.add_argument("--weight-decay",
                        type=float,
                        help="Set the weight decay")
    parser.add_argument(
        "--enable-half-first-order-momentum",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Use float16 for the first order momentum in the optimizer.")
    parser.add_argument("--squad-do-training",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Do SQuAD training (run_squad only)")
    parser.add_argument("--squad-do-validation",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Do SQuAD validation (run_squad only)")

    # Model
    parser.add_argument("--sequence-length",
                        type=int,
                        help="The max sequence length")
    parser.add_argument(
        "--mask-tokens",
        type=int,
        help="Set the max number of MLM tokens in the input dataset.")
    parser.add_argument("--vocab-size",
                        type=int,
                        help="Set the size of the vocabulary")
    parser.add_argument(
        "--hidden-size",
        type=int,
        help="The size of the hidden state of the transformer layers")
    parser.add_argument("--intermediate-size", type=int, help="hidden-size*4")
    parser.add_argument("--num-hidden-layers",
                        type=int,
                        help="The number of transformer layers")
    parser.add_argument("--num-attention-heads",
                        type=int,
                        help="Set the number of heads in self attention")
    parser.add_argument("--layer-norm-eps",
                        type=float,
                        help="The eps value for the layer norms")

    # Hugging Face specific
    parser.add_argument("--attention-probs-dropout-prob",
                        type=float,
                        nargs="?",
                        const=True,
                        help="Attention dropout probability")

    # Dataset
    parser.add_argument("--input-files",
                        type=str,
                        nargs="+",
                        help="Input data files")
    parser.add_argument("--dataset",
                        type=str,
                        choices=['generated', 'pretraining'],
                        help="dataset to use for the training")
    parser.add_argument("--synthetic-data",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=False,
                        help="No Host/IPU I/O, random data created on device")

    # Misc
    parser.add_argument("--dataloader-workers",
                        type=int,
                        help="The number of dataloader workers")
    parser.add_argument(
        "--profile-dir",
        type=str,
        help="Enable profiling and store results in this directory")
    parser.add_argument("--custom-ops",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=True,
                        help="Enable custom ops")
    parser.add_argument("--wandb",
                        type=str_to_bool,
                        nargs="?",
                        const=True,
                        default=False,
                        help="Enabling logging to Weights and Biases")
    parser.add_argument(
        "--wandb-param-steps",
        type=int,
        default=None,
        help=
        "Log the model parameter statistics to Weights and Biases after every n training steps"
    )
    parser.add_argument(
        "--disable-progress-bar",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help=
        "Disable the training progress bar. This is useful if you want to parse the stdout of a run"
    )
    parser.add_argument(
        "--compile-only",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help=
        "Create an offline IPU target that can only be used for offline compilation."
    )
    parser.add_argument(
        "--executable-cache-dir",
        type=str,
        default="",
        help=
        "Directory where Poplar executables are cached. If set, recompilation of identical graphs can be avoided. "
        "Required for both saving and loading executables.")

    # Checkpointing
    parser.add_argument("--checkpoint-output-dir",
                        type=str,
                        default="",
                        help="Directory where checkpoints will be saved to.\
                             This can be either an absolute or relative path.")
    parser.add_argument(
        "--checkpoint-steps",
        type=int,
        default=None,
        help="Option to checkpoint model after every n training steps.")
    parser.add_argument(
        "--resume-training-from-checkpoint",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help=
        "Restore both the model checkpoint and training state in order to resume a training run."
    )
    parser.add_argument(
        "--pretrained-checkpoint",
        type=str,
        default="",
        help="Checkpoint to be retrieved for further training. This can\
                              be either an absolute or relative path to the checkpoint directory or the name of a model on HuggingFace model hub."
    )

    # This is here only for the help message
    parser.add_argument("--config", type=str, help="Configuration name")

    # Load the yaml
    yaml_args = dict()
    if config_name is not None:
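        # config_file is assumed to be defined elsewhere in the module (the path to the YAML file that holds the named configurations).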
        with open(config_file, "r") as f:
            try:
                yaml_args.update(**yaml.safe_load(f)[config_name])
            except yaml.YAMLError as exc:
                print(exc)
                sys.exit(1)

    # Check the yaml args are valid
    known_args = set(vars(parser.parse_args("")))
    unknown_args = set(yaml_args) - known_args

    if unknown_args:
        logger(f" Warning: Unknown arg(s) in config file: {unknown_args}")

    parser.set_defaults(**yaml_args)
    args = parser.parse_args(remaining_args)

    # Initialise PopDist
    if popdist.isPopdistEnvSet():
        init_popdist(args)
        hvd.broadcast(torch.Tensor([args.random_seed]), root_rank=0)
    else:
        args.use_popdist = False

    # Expand layers_per_ipu input into list representation
    if isinstance(args.layers_per_ipu, int):
        args.layers_per_ipu = [args.layers_per_ipu]

    if len(args.layers_per_ipu) == 1:
        layers_per_ipu_ = args.layers_per_ipu[0]
        args.layers_per_ipu = [layers_per_ipu_
                               ] * (args.num_hidden_layers // layers_per_ipu_)

    if sum(args.layers_per_ipu) != args.num_hidden_layers:
        parser.error(
            f"layers_per_ipu not compatible with number of hidden layers: {args.layers_per_ipu} and {args.num_hidden_layers}"
        )

    # Expand matmul_proportion input into list representation
    if isinstance(args.matmul_proportion, float):
        args.matmul_proportion = [args.matmul_proportion
                                  ] * args.ipus_per_replica

    if len(args.matmul_proportion) != args.ipus_per_replica:
        if len(args.matmul_proportion) == 1:
            args.matmul_proportion = args.matmul_proportion * args.ipus_per_replica
        else:
            parser.error(
                f"Length of matmul_proportion doesn't match ipus_per_replica: {args.matmul_proportion} vs {args.ipus_per_replica}"
            )

    if args.checkpoint_steps is not None and args.checkpoint_steps < 1:
        parser.error("checkpoint-steps must be >=1")

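    # Global batch size = micro-batch size * gradient accumulation * replication factor (* number of instances under popdist).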
    if args.use_popdist:
        args.global_batch_size = args.replication_factor * args.gradient_accumulation * args.micro_batch_size * args.popdist_size
    else:
        args.global_batch_size = args.replication_factor * args.gradient_accumulation * args.micro_batch_size

    args.samples_per_step = args.replication_factor * args.gradient_accumulation * args.micro_batch_size * args.batches_per_step
    args.intermediate_size = args.hidden_size * 4

    return args
Example 8
def add_pretraining_options(parser: argparse.ArgumentParser):
    group = parser.add_argument_group("Pretraining options")
    # Add pretraining-specific command line options here.
    return parser


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.ERROR)

    opts = make_global_options([add_pretraining_options])

    opts['shards'] = ipu_utils.next_power_of_two(
        max(opts["device_mapping"]) + 1)

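    # Under poprun, each instance drives only its local replicas and attaches to the IPU device that poprun assigned to it.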
    if popdist.isPopdistEnvSet():
        opts['use_popdist'] = True
        opts['replicas'] = popdist.getNumLocalReplicas()
        opts['total_replicas'] = popdist.getNumTotalReplicas()
        if opts['compile_only']:
            opts['select_ipu'] = None
        else:
            opts['select_ipu'] = popdist.getDeviceId()
    else:
        opts['use_popdist'] = False
        opts['total_replicas'] = opts['replicas']
        opts['select_ipu'] = None

    set_defaults(opts)

    set_poplar_engine_options(execution_profile=opts['execution_profile'],
Example 9
def handle_distributed_settings(args):
    # Initialise popdist
    if popdist.isPopdistEnvSet():
        init_popdist(args)
    else:
        args.use_popdist = False
Example 10
def logger(msg):
    if not popdist.isPopdistEnvSet() or popdist.getInstanceIndex() == 0:
        logging.info(msg)
Example 11
    bn_momentum = args.bn_momentum
    checkpoints = args.checkpoints
    clean_dir = args.clean_dir
    checkpoint_dir = args.checkpoint_dir
    label_smoothing = args.label_smoothing
    optimizer_name = args.optimizer
    optimizer_params = args.optimizer_params
    seed = args.seed
    internal_exchange_optimization_target = args.internal_exchange_optimization_target
    max_cross_replica_buffer_size = args.max_cross_replica_buffer_size
    max_reduce_many_buffer_size = args.max_reduce_many_buffer_size
    gather_conv_output = args.gather_conv_output
    pipeline_num_parallel = args.pipeline_num_parallel

    # check if the script has been called by poprun
    distributed_training = popdist.isPopdistEnvSet()

    if distributed_training:
        if num_replicas != popdist.getNumTotalReplicas():
            logging.warning(
                f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) '
                f'does not match the config (=={num_replicas}). Poprun will override the config.'
            )
            num_replicas = popdist.getNumTotalReplicas()

        max_threads_per_instance = os.cpu_count() // popdist.getNumInstances()
        if pipeline_num_parallel > max_threads_per_instance:
            logging.warning(
                f'The number of chosen threads ({pipeline_num_parallel}) is larger than the total number of physical threads '
                'divided by the number of instances. Poprun will override the config.'
            )
Example 12
def parse_arguments():
    common_parser = utils.get_common_parser()
    parser = argparse.ArgumentParser(description='CNN training in PopTorch',
                                     parents=[common_parser])
    parser.add_argument(
        '--data',
        choices=['cifar10', 'imagenet', 'synthetic', 'generated'],
        default='cifar10',
        help="Choose data")
    parser.add_argument(
        '--precision',
        choices=['16.16', '16.32', '32.32'],
        default='16.16',
        help=
        "Precision of Ops(weights/activations/gradients) and Master data types: 16.16, 16.32, 32.32"
    )
    parser.add_argument('--imagenet-data-path',
                        type=str,
                        default="/localdata/datasets/imagenet-raw-data",
                        help="Path of the raw imagenet data")
    parser.add_argument(
        '--gradient-accumulation',
        type=int,
        default=1,
        help="Number of batches to accumulate before a gradient update")
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help="Initial learning rate")
    parser.add_argument('--weight-decay',
                        type=float,
                        default=0.0001,
                        help="L2 parameter penalty")
    parser.add_argument('--momentum',
                        type=float,
                        default=0.0,
                        help="Momentum factor")
    parser.add_argument('--rmsprop-decay',
                        type=float,
                        default=0.99,
                        help="RMSprop smoothing constant")
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help="Number of training epochs")
    parser.add_argument(
        '--checkpoint-path',
        type=str,
        default="",
        help="Checkpoint path(if it is not defined, no checkpoint is created")
    parser.add_argument(
        '--validation-mode',
        choices=['none', 'during', 'after'],
        default="after",
        help=
        'The model validation mode. none=no validation; during=validate after every epoch; after=validate after the training'
    )
    parser.add_argument(
        '--disable-metrics',
        action='store_true',
        help=
        'Do not calculate metrics during training, useful to measure peak throughput'
    )
    parser.add_argument('--wandb',
                        action='store_true',
                        help="Add Weights & Biases logging")
    parser.add_argument('--seed', type=int, help="Set the random seed")
    parser.add_argument(
        '--enable-recompute',
        action='store_true',
        help=
        'Enable the recomputation of network activations during backward pass instead '
        'of caching them during forward pass. This option turns on the recomputation for single-stage models. If the model is multi '
        'stage (pipelined) the recomputation is always enabled.')
    parser.add_argument(
        '--recompute-checkpoints',
        type=str,
        nargs='+',
        default=[],
        help=
        'List of recomputation checkpoint rules: [conv: store convolution activations | norm: store norm-layer activations]'
    )
    parser.add_argument('--offload-optimizer',
                        action='store_true',
                        help='Offload the optimizer from the IPU memory')
    parser.add_argument(
        '--available-memory-proportion',
        type=float,
        default=[],
        nargs='+',
        help=
        'Proportion of memory which is available for convolutions. Use a value of less than 0.6'
    )
    parser.add_argument(
        '--logs-per-epoch',
        type=int,
        default=1,
        help=
        "The number of times the resuls are logged and a checkpoint is saved in each epoch"
    )
    parser.add_argument(
        '--validation-frequency',
        type=int,
        default=4,
        help="How many training epochs to run between validation steps")
    parser.add_argument(
        '--label-smoothing',
        type=float,
        default=0.0,
        help='Label smoothing factor (Default=0 => no smoothing)')
    # LR schedule related params
    parser.add_argument('--lr-schedule',
                        choices=["step", "cosine", "exponential"],
                        default="step",
                        help="Learning rate schedule")
    parser.add_argument('--lr-decay',
                        type=float,
                        default=0.5,
                        help="Learning rate decay")
    parser.add_argument('--lr-epoch-decay',
                        type=int,
                        nargs='+',
                        default=[],
                        help="List of epoch, when lr drops")
    parser.add_argument('--warmup-epoch',
                        type=int,
                        default=0,
                        help="Number of learning rate warmup epochs")
    parser.add_argument(
        '--lr-scheduler-freq',
        type=float,
        default=0,
        help=
        "Number of lr scheduler updates per epoch (0 to disable and update every iteration)"
    )
    parser.add_argument('--optimizer',
                        choices=['sgd', 'adamw', 'rmsprop'],
                        default='sgd',
                        help="Define the optimizer")
    # half precision training params
    parser.add_argument(
        '--loss-scaling',
        type=float,
        default=1.0,
        help=
        "Loss scaling factor. This value is reached by the end of the training."
    )
    parser.add_argument(
        '--loss-velocity-scaling-ratio',
        type=float,
        default=1.0,
        help=
        "Only for SGD optimizer: Loss Velocity / Velocity scaling ratio. In case of large number of replicas >1.0 can increase numerical stability"
    )
    parser.add_argument(
        '--initial-loss-scaling',
        type=float,
        help=
        "Initial loss scaling factor. The loss scaling interpolates between this and loss-scaling value."
        "Example: 100 epoch, initial loss scaling 16, loss scaling 128: Epoch 1-25 ls=16;Epoch 26-50 ls=32;Epoch 51-75 ls=64;Epoch 76-100 ls=128"
    )
    parser.add_argument('--enable-stochastic-rounding',
                        action="store_true",
                        help="Enable Stochastic Rounding")
    parser.add_argument('--enable-fp-exceptions',
                        action="store_true",
                        help="Enable Floating Point Exceptions")
    # weight averaging params
    weight_avg.add_parser_arguments(parser)

    opts = utils.parse_with_config(parser, "configs.yml")
    if opts.initial_loss_scaling is None:
        opts.initial_loss_scaling = opts.loss_scaling

    # Initialise popdist
    if popdist.isPopdistEnvSet():
        init_popdist(opts)
    else:
        opts.use_popdist = False

    if opts.seed is None:
        opts.seed = generate_random_seed(opts.use_popdist)

    # setup logging
    utils.Logger.setup_logging_folder(opts)

    num_stages = len(opts.pipeline_splits) + 1
    num_amps = len(opts.available_memory_proportion)

    if num_stages == 1 and num_amps > 0:
        logging.error(
            '--available-memory-proportion should only be set when pipelining')
        sys.exit()
    elif num_stages > 1 and num_amps > 0 and num_amps != num_stages and num_amps != 1:
        logging.error(
            f'--available-memory-proportion number of elements should be either 1 or equal to the number of pipeline stages: {num_stages}'
        )
        sys.exit()

    if opts.weight_avg_strategy != 'none' and opts.checkpoint_path == '':
        logging.error(
            'Please provide a --checkpoint-path folder to apply weight averaging to.'
        )
        sys.exit()

    if opts.batch_size == 1 and opts.norm_type == "batch":
        logging.warning(
            "BatchNorm with batch size of 1 may cause instability during inference."
        )

    if num_stages > 1:
        logging.info("Recomputation is always enabled when using pipelining.")

    if not opts.enable_recompute and len(opts.recompute_checkpoints) > 0:
        logging.warning(
            "Recomputation is not enabled, whlile recomputation checkpoints are provided."
        )

    return opts
Example 13
    def ipu_prog(num_replicas, gradient_accumulation):
        import logging
        import sys
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        popdist_on = popdist.isPopdistEnvSet()

        num_global_replicas = popdist.getNumTotalReplicas() if popdist_on else num_replicas
        num_instances = popdist.getNumInstances() if popdist_on else 1

        dataset_size = global_batch_size = 16
        micro_batch_size = int(global_batch_size / num_global_replicas /
                               gradient_accumulation)

        X = np.arange(1, dataset_size + 1, 1, dtype=float)
        Y = [0] * dataset_size
        ds = tf.data.Dataset.from_tensor_slices((X, Y))
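        # Shard the dataset so each instance consumes a disjoint subset of the samples.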
        if popdist_on:
            ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
        ds = ds.batch(micro_batch_size, drop_remainder=True)
        ds = ds.repeat()

        cfg = ipu.config.IPUConfig()
        if popdist_on:
            cfg = popdist.tensorflow.set_ipu_config(
                cfg,
                ipus_per_replica=popdist.getNumIpusPerReplica(),
                configure_device=True)
            hvd.init()
        else:
            cfg.auto_select_ipus = num_global_replicas
        cfg.configure_ipu_system()

        strategy = popdist_strategy.PopDistStrategy() if popdist_on else ipu.ipu_strategy.IPUStrategy()

        with strategy.scope():

            def get_model():
                input_layer = tf.keras.Input(shape=1)
                kernel_initializer = tf.keras.initializers.Constant(1)
                x = tf.keras.layers.Dense(
                    1, use_bias=False,
                    kernel_initializer=kernel_initializer)(input_layer)
                return tf.keras.Model(input_layer, x)

            model = get_model()
            model.set_gradient_accumulation_options(
                gradient_accumulation_steps_per_replica=gradient_accumulation)
            model.build(input_shape=(micro_batch_size, 1))

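            # PopDistStrategy's cross-replica reduction already averages gradients over all
            # replicas, so only the gradient-accumulation factor needs normalising here;
            # the single-instance path divides by both.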
            if popdist_on:

                def gradient_normalizer(grads_and_vars):
                    return [(grad / gradient_accumulation, var)
                            for grad, var in grads_and_vars]
            else:

                def gradient_normalizer(grads_and_vars):
                    return [
                        (grad / num_global_replicas / gradient_accumulation,
                         var) for grad, var in grads_and_vars
                    ]

            optimizer = tf.keras.optimizers.SGD(
                learning_rate=1.0, gradient_transformers=[gradient_normalizer])

            loss_class = tf.keras.losses.MeanSquaredError
            loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
            loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
            loss = loss_class()

            micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
            steps_per_execution = dataset_size // (
                micro_batch_size * micro_batches_per_weight_update
            ) * micro_batches_per_weight_update

            model.compile(optimizer=optimizer,
                          loss=loss,
                          metrics=[tf.keras.losses.MSE],
                          steps_per_execution=steps_per_execution)

            callbacks = [
                OutFeedQueueCallback(queue=loss_outfeed_queue,
                                     name='average_loss')
            ]
            if num_instances > 1:
                callbacks += [AllReduceMetricsCallback()]
            callbacks += [LoggingCallback(1)]

            model.fit(ds,
                      steps_per_epoch=steps_per_execution,
                      callbacks=callbacks)

            return model.get_weights()[0][0][0]