def set_distribution_defaults(opts):
    """Fill in the distributed-worker entries of ``opts`` in place.

    The mode is chosen from ``opts['distributed']`` and
    ``opts['use_popdist']``:

    * ``distributed``: the worker count, worker index and cluster dictionary
      are read from a ``TFConfigClusterResolver``.
    * ``use_popdist``: the worker count and index are derived from the
      popdist replica counts; the cluster entry is ``None``.
    * neither: a single worker with index 0 and no cluster.

    In the first two modes summary lines are appended to
    ``opts['summary_str']``. Those lines contain literal ``{name}``
    placeholders which are not substituted here.

    Raises:
        ValueError: if both ``distributed`` and ``use_popdist`` are set.
    """
    tf_cluster_mode = opts['distributed']
    if tf_cluster_mode and opts['use_popdist']:
        raise ValueError("Cannot use popdist with --distributed")

    if tf_cluster_mode:
        # Read the cluster config from the `TF_CONFIG` environment variable
        resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
        # Allow `mpirun` to override the task index
        resolver.task_id = os.getenv("OMPI_COMM_WORLD_RANK")
        resolver.task_type = "worker"

        opts['distributed_worker_count'] = resolver.cluster_spec().num_tasks(
            "worker")
        opts['distributed_worker_index'] = resolver.task_id
        opts['distributed_cluster'] = resolver.cluster_spec().as_dict()
        opts['summary_str'] += (
            'Distribution\n'
            ' Worker count: {distributed_worker_count}\n'
            ' Worker index: {distributed_worker_index}\n'
            ' Cluster: {distributed_cluster}\n'
        )
        return

    if not opts['use_popdist']:
        # Plain single-process run.
        opts.update(
            distributed_worker_count=1,
            distributed_worker_index=0,
            distributed_cluster=None,
        )
        return

    # popdist: one "worker" per group of local replicas.
    total_replicas = popdist.getNumTotalReplicas()
    opts['distributed_worker_count'] = int(
        total_replicas / popdist.getNumLocalReplicas())
    replica_offset = popdist.getReplicaIndexOffset()
    opts['distributed_worker_index'] = int(
        replica_offset / popdist.getNumLocalReplicas())
    opts['distributed_cluster'] = None
    opts['summary_str'] += (
        'Popdist\n'
        ' Process count: {distributed_worker_count}\n'
        ' Process index: {distributed_worker_index}\n'
    )
def set_popdist_args(args):
    """Populate the popdist-related fields of ``args`` in place.

    When no popdist environment is detected, ``args`` is marked as a single
    non-distributed process (``use_popdist=False``, size 1, rank 0) and the
    function returns. Otherwise Horovod for PopART is initialised, the
    replication factor is replaced by the local replica count reported by
    popdist, the instance size and rank are derived from the replica counts,
    the checkpoint directory is suffixed with the rank, and
    ``MPI.COMM_WORLD`` is handed to ``setup_comm``.

    Raises:
        RuntimeError: if a popdist environment is set and ``args.inference``
            is truthy.
        ImportError: if ``horovod.popart`` cannot be imported (or an
            ``ImportError`` surfaces while initialising it); the underlying
            error is attached as the cause.
    """
    if not popdist.isPopdistEnvSet():
        args.use_popdist = False
        args.popdist_size = 1
        args.popdist_rank = 0
        return

    if args.inference:
        raise RuntimeError("Distributed execution is only supported for training")

    try:
        import horovod.popart as hvd
        hvd.init()
    except ImportError as err:
        # Chain explicitly so the original import failure stays visible as
        # the direct cause of the friendlier message.
        raise ImportError("Could not find the PopART horovod extension. "
                          "Please install the horovod .whl provided in the Poplar SDK.") from err

    args.use_popdist = True

    popdist_local_factor = popdist.getNumLocalReplicas()
    if args.replication_factor > 1 and args.replication_factor != popdist_local_factor:
        logger.warning(f"Overwriting the local replication factor {args.replication_factor} to {popdist_local_factor}")
    # The value reported by popdist always wins, whether or not a warning
    # was emitted above.
    args.replication_factor = popdist_local_factor

    args.popdist_size = popdist.getNumTotalReplicas() // popdist.getNumLocalReplicas()
    args.popdist_rank = popdist.getReplicaIndexOffset() // popdist.getNumLocalReplicas()
    # Give every rank its own checkpoint directory.
    args.checkpoint_dir = args.checkpoint_dir + "_rank_" + str(args.popdist_rank)

    from mpi4py import MPI
    setup_comm(MPI.COMM_WORLD)
def init_popdist(args):
    """Initialise Horovod and copy the popdist layout onto ``args``.

    Sets ``args.use_popdist`` to ``True``, prints a notice when the total
    replica count reported by popdist differs from
    ``args.replication_factor``, then stores the local replica count as
    ``args.replication_factor`` and the instance index / instance count as
    ``args.popdist_rank`` / ``args.popdist_size``.
    """
    hvd.init()
    args.use_popdist = True

    total_replicas = popdist.getNumTotalReplicas()
    if total_replicas != args.replication_factor:
        notice = ("The number of replicas is overridden by PopRun. "
                  f"The new value is {total_replicas}.")
        print(notice)

    # The per-instance value is the local replica count, regardless of
    # whether the notice above was printed.
    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
def init_popdist(args):
    """Initialise Horovod and copy the popdist layout onto ``args``.

    Sets ``args.use_popdist`` to ``True``, logs a warning when the total
    replica count reported by popdist differs from ``args.replicas``, then
    stores the local replica count as ``args.replicas``, the instance index
    and instance count as ``args.popdist_rank`` / ``args.popdist_size``, and
    Horovod's local rank as ``args.popdist_local_rank``.
    """
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        # `logging.warn` is a deprecated alias; `logging.warning` emits the
        # same record without a DeprecationWarning.
        logging.warning(f"The number of replicas is overridden by poprun. The new value is {popdist.getNumTotalReplicas()}.")
    # The per-instance value is the local replica count, regardless of
    # whether the warning above was logged.
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
# Add pretraining-specific command line options here. return parser if __name__ == '__main__': tf.logging.set_verbosity(tf.logging.ERROR) opts = make_global_options([add_pretraining_options]) opts['shards'] = ipu_utils.next_power_of_two( max(opts["device_mapping"]) + 1) if popdist.isPopdistEnvSet(): opts['use_popdist'] = True opts['replicas'] = popdist.getNumLocalReplicas() opts['total_replicas'] = popdist.getNumTotalReplicas() if opts['compile_only']: opts['select_ipu'] = None else: opts['select_ipu'] = popdist.getDeviceId() else: opts['use_popdist'] = False opts['total_replicas'] = opts['replicas'] opts['select_ipu'] = None set_defaults(opts) set_poplar_engine_options(execution_profile=opts['execution_profile'], memory_profile=opts['memory_profile'], profile_dir=str(opts['profile_dir']), sync_replicas_independently=opts['replicas'] > 1
parser.add_argument(
    "--init_weight",
    type=str,
    default="./ckpt_init/yolov3_coco_converted.fp16.ckpt",
    help="ckpt init weight",
)
arguments = parser.parse_args()

# Load the JSON configuration named on the command line.
with open(arguments.config) as f:
    opts = json.load(f)

# Paths given on the command line replace the ones stored in the config.
opts['train'].update(
    annot_path=arguments.train_path,
    initial_weight=arguments.init_weight,
)
opts['test']['annot_path'] = arguments.test_path

if popdist.isPopdistEnvSet():
    # Replica counts, device and worker layout all come from popdist.
    opts["use_popdist"] = True
    opts["train"].update(
        replicas=popdist.getNumLocalReplicas(),
        total_replicas=popdist.getNumTotalReplicas(),
    )
    opts["select_ipu"] = popdist.getDeviceId(
        len(opts["train"]["device_mapping"]))
    opts.update(
        distributed_worker_count=int(
            popdist.getNumTotalReplicas() / popdist.getNumLocalReplicas()),
        distributed_worker_index=int(
            popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas()),
    )
else:
    # Single-process run: the configured replica count is the total.
    opts["use_popdist"] = False
    opts["train"]["total_replicas"] = opts["train"]["replicas"]
    opts.update(
        select_ipu=-1,
        distributed_worker_count=1,
        distributed_worker_index=0,
    )
def bert_session_options(args, model):
    """Build the ``popart.SessionOptions`` for a run from parsed arguments.

    Args:
        args: namespace of run settings; every attribute read is referenced
            explicitly in the body below.
        model: not used in this function.

    Returns:
        The configured ``popart.SessionOptions``; its ``engineOptions`` is
        the dictionary of engine option strings assembled here.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # line breaks/indentation were lost; confirm against the original file.
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enablePrefetchDatastreams = not args.minimum_latency_inference

    # These options are necessary to allow poplar to overlap processing of
    # multiple iterations in the host side
    options.defaultPrefetchBufferingDepth = 3
    options.rearrangeAnchorsOnHost = False
    engine_options["exchange.streamBufferOverlap"] = "hostRearrangeOnly"

    options.enableOutlining = not args.no_outlining
    options.subgraphCopyingStrategy = popart.SubgraphCopyingStrategy.JustInTime
    # The same partials type is applied to matmuls and convolutions.
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}

    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_options["target.syncReplicasIndependently"] = "true"
    if args.use_popdist:
        popdist.popart.configureSessionOptions(options)

    # Increasing the outlineThreshold prevents creating subgraphs of cheap Ops
    # such as add or reshapeInplace.
    # Instead only reusing ops with a highSubgraphValue such as matmul or normalisation.
    options.outlineThreshold = 10.0

    if args.pipeline:
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
        if args.recompute_checkpoint_every_layer and any(
                map(lambda l: l > 1, args.layers_per_ipu)):
            options.scheduleNonWeightUpdateGradientConsumersEarly = True

    options.optimizerStateTensorLocationSettings = bert_optimizer_location_settings(
        args)

    # RTS to shard optimizer states with multiple IPU Pods
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    if num_total_replicas > num_local_replicas and args.replicated_tensor_sharding:
        # Fewer elements would not make sense to shard
        options.optimizerStateTensorLocationSettings.minElementsForReplicatedTensorSharding = num_local_replicas
        sharding_domain = popart.CommGroup(popart.CommGroupType.Consecutive, num_local_replicas)

        # Ensure all related tensors have the same sharding domain set
        options.weightTensorLocationSettings.location.shardingDomain = sharding_domain
        options.optimizerStateTensorLocationSettings.location.shardingDomain = sharding_domain
        options.accumulatorTensorLocationSettings.location.shardingDomain = sharding_domain

    # Any reduction type containing "Mean" selects mean reduction; the
    # strategy defaults to Post and is switched to Running for "RunningMean".
    if "Mean" in args.gradient_reduction_type:
        options.accumulationAndReplicationReductionType = popart.ReductionType.Mean
        options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Post
        if args.gradient_reduction_type == "RunningMean":
            options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Running

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor

        # When not replicated SyncPattern.SinglePipeline will provide better overlap
        # than this option.
        if device_is_replicated(args):
            if args.optimizer_state_offchip:
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized, [0])
            elif args.replicated_tensor_sharding:
                # With OnChip + RTS this will cluster optimizer steps into
                # schedule bins. Improving outlining and scheduling time.
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized)

    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    # Profiling turns engine caching back off even when a cache path was given.
    if args.profile:
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = not args.save_initializers_externally

    # -1 means "no limit": the engine option is only set for other values.
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    if args.inference and args.engine_cache is not None and not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    # All engine option strings collected above are attached in one go.
    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")
    return options
# Unpack the remaining run settings from the parsed arguments.
checkpoint_dir = args.checkpoint_dir
label_smoothing = args.label_smoothing
optimizer_name = args.optimizer
optimizer_params = args.optimizer_params
seed = args.seed
internal_exchange_optimization_target = args.internal_exchange_optimization_target
max_cross_replica_buffer_size = args.max_cross_replica_buffer_size
max_reduce_many_buffer_size = args.max_reduce_many_buffer_size
gather_conv_output = args.gather_conv_output
pipeline_num_parallel = args.pipeline_num_parallel

# check if the script has been called by poprun
distributed_training = popdist.isPopdistEnvSet()

if distributed_training:
    if num_replicas != popdist.getNumTotalReplicas():
        logging.warning(
            f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) '
            f'does not match the config (=={num_replicas}). Poprun will override the config.'
        )
        num_replicas = popdist.getNumTotalReplicas()

    # os.cpu_count() may return None when the count cannot be determined;
    # treat that as a single CPU. The cap is also kept at a minimum of one
    # thread so that running more instances than CPUs cannot yield zero.
    max_threads_per_instance = max(
        1, (os.cpu_count() or 1) // popdist.getNumInstances())
    if pipeline_num_parallel > max_threads_per_instance:
        logging.warning(
            f'The number of chosen threads {pipeline_num_parallel} is bigger than the total number of physical threads '
            f'divided by the number of instances, Poprun will override the config. '
        )
        # Limit the maximal number of threads to the total of physical threads divided by the number of instances
        pipeline_num_parallel = max_threads_per_instance
def gradient_normalizer(grads_and_vars):
    """Return the (gradient, variable) pairs with each gradient rescaled.

    Every gradient is divided first by the total replica count reported by
    popdist and then by ``batch_config.gradient_accumulation_count``; the
    variables are passed through untouched.
    """
    normalized = []
    for gradient, variable in grads_and_vars:
        per_replica = gradient / popdist.getNumTotalReplicas()
        normalized.append(
            (per_replica / batch_config.gradient_accumulation_count, variable))
    return normalized


# Use `lr_scheduler` as the optimizer's 'learning_rate' parameter.
optimizer_params['learning_rate'] = lr_scheduler
def replicated_tensor_sharding_core():
    """Run one training step of a small matmul/softmax/nllloss model under
    popdist and write the resulting model to ``<tmpdir>/<filename>``.

    Launch parameters (``--tensors``, ``--optim``, ``--tmpdir``,
    ``--filename``, ``--compute_batch``) are parsed from ``sys.argv[2:]``.
    ``--tensors`` selects which of "weight", "optimizerState" and
    "accumulator" get replicated tensor sharding switched on; ``--optim``
    selects "Adam" or "SGD".
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # line breaks/indentation were lost; confirm against the original file.
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    # The first two argv entries are skipped before parsing.
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1

    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    # This builder is replaced by a fresh one further down before it is used.
    builder = popart.Builder()

    # Fixed seed for the initial weights.
    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []

    for i in range(
            0, batches_per_step * num_local_replicas * accumulation_factor *
            compute_batch):
        # Re-seed per sample from the instance index and sample number, so
        # the generated data depends on which instance is running.
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]

    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    builder = popart.Builder()

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(popart.TensorInfo("UINT32", (compute_batch, )),
                                "l0")

    # Host data keyed by input tensor id.
    data = {}

    data[d0] = input_data.reshape((batches_per_step, num_local_replicas,
                                   accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])

    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape((batches_per_step, num_local_replicas,
                                   accumulation_factor, compute_batch, -1))\
        .astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL")
         for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(
            f"Setting RTS: {userOption}, num_total_replicas: {num_total_replicas} num_local_replicas: {num_local_replicas}"
        )
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        # NOTE(review): it is not recoverable from the collapsed source
        # whether this check was nested under the `if` above — confirm.
        if num_total_replicas > num_local_replicas:
            locationSetting.location.shardingDomain = popart.CommGroup(
                popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    # `optimizer` is only bound for "Adam" or "SGD"; any other --optim value
    # leaves it undefined when the session is created below.
    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     deviceInfo=deviceInfo,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer)

    session.prepareDevice()

    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    # Create the output directory if needed, then write the model there.
    tmp_path = Path(args.tmpdir)
    tmp_path.mkdir(parents=True, exist_ok=True)
    file_path = str(tmp_path / args.filename)
    session.modelToHost(file_path)
    # Loaded back from disk; not used further in the code visible here.
    post_proto = onnx.load(file_path)
def ipu_prog(num_replicas, gradient_accumulation):
    """Fit a one-weight Keras model on a 16-element dataset and return the
    trained weight.

    Args:
        num_replicas: replica count to use when no popdist environment is
            set; ignored otherwise (popdist's total replica count is used).
        gradient_accumulation: gradient accumulation steps per replica.

    Returns:
        ``model.get_weights()[0][0][0]`` after fitting.
    """
    # NOTE(review): the nesting below (extent of the `with` block and of the
    # `else` branch before `configure_ipu_system`) was reconstructed from a
    # source whose indentation was lost; confirm against the original file.
    import logging
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    popdist_on = popdist.isPopdistEnvSet()

    num_global_replicas = popdist.getNumTotalReplicas(
    ) if popdist_on else num_replicas
    num_instances = popdist.getNumInstances() if popdist_on else 1

    dataset_size = global_batch_size = 16
    # The global batch is split across replicas and accumulation steps.
    micro_batch_size = int(global_batch_size / num_global_replicas /
                           gradient_accumulation)

    # Inputs are 1..dataset_size; all targets are 0.
    X = np.arange(1, dataset_size + 1, 1, dtype=float)
    Y = [0] * dataset_size
    ds = tf.data.Dataset.from_tensor_slices((X, Y))
    if popdist_on:
        # Each instance takes its own shard of the dataset.
        ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
    ds = ds.batch(micro_batch_size, drop_remainder=True)
    ds = ds.repeat()

    cfg = ipu.config.IPUConfig()
    if popdist_on:
        cfg = popdist.tensorflow.set_ipu_config(
            cfg,
            ipus_per_replica=popdist.getNumIpusPerReplica(),
            configure_device=True)
        hvd.init()
    else:
        cfg.auto_select_ipus = num_global_replicas
    cfg.configure_ipu_system()

    strategy = popdist_strategy.PopDistStrategy(
    ) if popdist_on else ipu.ipu_strategy.IPUStrategy()

    with strategy.scope():

        def get_model():
            # Single Dense unit without bias, kernel initialised to 1.
            input_layer = tf.keras.Input(shape=1)
            kernel_initializer = tf.keras.initializers.Constant(1)
            x = tf.keras.layers.Dense(
                1, use_bias=False,
                kernel_initializer=kernel_initializer)(input_layer)
            return tf.keras.Model(input_layer, x)

        model = get_model()
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation)
        model.build(input_shape=(micro_batch_size, 1))

        # Under popdist the gradients are divided only by the accumulation
        # count; otherwise they are also divided by the replica count.
        if popdist_on:

            def gradient_normalizer(grads_and_vars):
                return [(grad / gradient_accumulation, var)
                        for grad, var in grads_and_vars]
        else:

            def gradient_normalizer(grads_and_vars):
                return [
                    (grad / num_global_replicas / gradient_accumulation, var)
                    for grad, var in grads_and_vars
                ]

        optimizer = tf.keras.optimizers.SGD(
            learning_rate=1.0, gradient_transformers=[gradient_normalizer])

        loss_class = tf.keras.losses.MeanSquaredError
        loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
        loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
        loss = loss_class()

        micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
        # Largest multiple of micro_batches_per_weight_update whose micro
        # batches fit within the dataset.
        steps_per_execution = dataset_size // (
            micro_batch_size * micro_batches_per_weight_update
        ) * micro_batches_per_weight_update

        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=[tf.keras.losses.MSE],
                      steps_per_execution=steps_per_execution)

        callbacks = [
            OutFeedQueueCallback(queue=loss_outfeed_queue, name='average_loss')
        ]
        # The all-reduce callback is only added for multi-instance runs.
        if num_instances > 1:
            callbacks += [AllReduceMetricsCallback()]
        callbacks += [LoggingCallback(1)]

        model.fit(ds, steps_per_epoch=steps_per_execution, callbacks=callbacks)

        return model.get_weights()[0][0][0]