Code Example #1
File: train.py  Project: muzzynine/examples-1
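# Chooses between TF_CONFIG-based multi-worker distribution and PopDist, and
# records the worker count, worker index and cluster spec in the run options.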
def set_distribution_defaults(opts):
    if opts['distributed'] and opts['use_popdist']:
        raise ValueError("Cannot use popdist with --distributed")

    if opts['distributed']:
        # Read the cluster config from the `TF_CONFIG` environment variable
        cluster = tf.distribute.cluster_resolver.TFConfigClusterResolver()

        # Allow `mpirun` to override the task index
        cluster.task_id = os.getenv("OMPI_COMM_WORLD_RANK")
        cluster.task_type = "worker"

        opts['distributed_worker_count'] = cluster.cluster_spec().num_tasks(
            "worker")
        opts['distributed_worker_index'] = cluster.task_id
        opts['distributed_cluster'] = cluster.cluster_spec().as_dict()

        opts['summary_str'] += 'Distribution\n'
        opts['summary_str'] += ' Worker count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Worker index: {distributed_worker_index}\n'
        opts['summary_str'] += ' Cluster: {distributed_cluster}\n'
    elif opts['use_popdist']:
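        # Under PopDist the "worker" count is the number of poprun instances:
        # the total replica count divided by the replicas run by this instance.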
        opts['distributed_worker_count'] = int(popdist.getNumTotalReplicas() /
                                               popdist.getNumLocalReplicas())
        opts['distributed_worker_index'] = int(
            popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas())
        opts['distributed_cluster'] = None

        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
        opts['distributed_cluster'] = None
Code Example #2
def set_popdist_args(args):
    if not popdist.isPopdistEnvSet():
        args.use_popdist = False
        args.popdist_size = 1
        args.popdist_rank = 0
        return

    if args.inference:
        raise RuntimeError("Distributed execution is only supported for training")

    try:
        import horovod.popart as hvd
        hvd.init()
    except ImportError:
        raise ImportError("Could not find the PopART horovod extension. "
                          "Please install the horovod .whl provided in the Poplar SDK.")

    args.use_popdist = True
    popdist_local_factor = popdist.getNumLocalReplicas()
    if args.replication_factor > 1 and args.replication_factor != popdist_local_factor:
        logger.warning(f"Overwriting the local replication factor {args.replication_factor} to {popdist_local_factor}")
    args.replication_factor = popdist_local_factor

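    # Instance count and rank, derived from the total vs. per-instance replica counts.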
    args.popdist_size = popdist.getNumTotalReplicas() // popdist.getNumLocalReplicas()
    args.popdist_rank = popdist.getReplicaIndexOffset() // popdist.getNumLocalReplicas()
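    # Give each instance its own checkpoint directory so concurrent instances
    # do not overwrite each other's checkpoints.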
    args.checkpoint_dir = args.checkpoint_dir + "_rank_" + str(args.popdist_rank)

    from mpi4py import MPI
    setup_comm(MPI.COMM_WORLD)
Code Example #3
File: args.py  Project: graphcore/examples
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replication_factor:
        print(f"The number of replicas is overridden by PopRun. "
              f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
Code Example #4
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        logging.warning(f"The number of replicas is overridden by poprun. The new value is {popdist.getNumTotalReplicas()}.")
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
Code Example #5
def set_popdist_args(args):
    if not popdist.isPopdistEnvSet():
        logger.info("No PopRun detected. Using single instance training")
    else:
        logger.info("PopRun is detected")

        args.use_popdist = True
        num_total_replicas = popdist.popdist_core.getNumTotalReplicas()
        args.local_replication_factor = popdist.getNumLocalReplicas()
        args.num_instances = popdist.popdist_core.getNumInstances()
        assert(num_total_replicas == args.local_replication_factor * args.num_instances)
        args.instance_idx = popdist.popdist_core.getInstanceIndex()

        if args.replication_factor != num_total_replicas:
            raise RuntimeError(f"Replication factor ({args.replication_factor}) "
                               f"should match the popdist replication factor ({num_total_replicas})")

        if args.samples_per_step % args.num_instances != 0:
            raise RuntimeError(f"The number of samples per step ({args.samples_per_step}) "
                               f"has to be an integer multiple of the number of instances ({args.num_instances})")
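Examples #1, #2 and #7 derive the instance count and rank by dividing the total replica count by the per-instance replica count, while Examples #3 to #5 read them directly from popdist; the assert in Example #5 shows the two views agree. A minimal sketch of that equivalence (assuming every instance runs the same number of local replicas):

import popdist

def instance_count_and_rank():
    # Single-instance defaults when the program was not launched through poprun.
    if not popdist.isPopdistEnvSet():
        return 1, 0

    # Direct helpers, as used in Examples #3 to #5.
    size = popdist.getNumInstances()
    rank = popdist.getInstanceIndex()

    # Equivalent derivation from replica counts, as in Examples #1, #2 and #7
    # (holds when every instance manages the same number of local replicas).
    assert size == popdist.getNumTotalReplicas() // popdist.getNumLocalReplicas()
    assert rank == popdist.getReplicaIndexOffset() // popdist.getNumLocalReplicas()
    return size, rank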
Code Example #6
File: run_pretraining.py  Project: graphcore/examples
    group = parser.add_argument_group("Pretraining options")
    # Add pretraining-specific command line options here.
    return parser


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.ERROR)

    opts = make_global_options([add_pretraining_options])

    opts['shards'] = ipu_utils.next_power_of_two(
        max(opts["device_mapping"]) + 1)

    if popdist.isPopdistEnvSet():
        opts['use_popdist'] = True
        opts['replicas'] = popdist.getNumLocalReplicas()
        opts['total_replicas'] = popdist.getNumTotalReplicas()
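        # In compile-only mode no specific IPU is selected; otherwise use the
        # device assigned to this instance by poprun.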
        if opts['compile_only']:
            opts['select_ipu'] = None
        else:
            opts['select_ipu'] = popdist.getDeviceId()
    else:
        opts['use_popdist'] = False
        opts['total_replicas'] = opts['replicas']
        opts['select_ipu'] = None

    set_defaults(opts)

    set_poplar_engine_options(execution_profile=opts['execution_profile'],
                              memory_profile=opts['memory_profile'],
                              profile_dir=str(opts['profile_dir']),
Code Example #7
File: train.py  Project: graphcore/examples
                        help="data path for test")
    parser.add_argument("--init_weight",
                        type=str,
                        default="./ckpt_init/yolov3_coco_converted.fp16.ckpt",
                        help="ckpt init weight")

    arguments = parser.parse_args()
    with open(arguments.config) as f:
        opts = json.load(f)

    opts['train']['annot_path'] = arguments.train_path
    opts['train']['initial_weight'] = arguments.init_weight
    opts['test']['annot_path'] = arguments.test_path
    if popdist.isPopdistEnvSet():
        opts["use_popdist"] = True
        opts["train"]["replicas"] = popdist.getNumLocalReplicas()
        opts["train"]["total_replicas"] = popdist.getNumTotalReplicas()
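        # Select this instance's device, sized by the number of IPUs per replica.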
        opts["select_ipu"] = popdist.getDeviceId(
            len(opts["train"]["device_mapping"]))
        opts["distributed_worker_count"] = int(popdist.getNumTotalReplicas() /
                                               popdist.getNumLocalReplicas())
        opts["distributed_worker_index"] = int(
            popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas())

    else:
        opts["use_popdist"] = False
        opts["train"]["total_replicas"] = opts["train"]["replicas"]
        opts["select_ipu"] = -1
        opts["distributed_worker_count"] = 1
        opts["distributed_worker_index"] = 0
Code Example #8
def bert_session_options(args, model):
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enablePrefetchDatastreams = not args.minimum_latency_inference

    # These options are necessary to allow Poplar to overlap processing of
    # multiple iterations on the host side
    options.defaultPrefetchBufferingDepth = 3
    options.rearrangeAnchorsOnHost = False
    engine_options["exchange.streamBufferOverlap"] = "hostRearrangeOnly"

    options.enableOutlining = not args.no_outlining
    options.subgraphCopyingStrategy = popart.SubgraphCopyingStrategy.JustInTime
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_options["target.syncReplicasIndependently"] = "true"
    if args.use_popdist:
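        # configureSessionOptions fills in the distributed replicated-graph
        # settings (e.g. the global replication factor and replica offset) from poprun.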
        popdist.popart.configureSessionOptions(options)
    # Increasing the outlineThreshold prevents creating subgraphs of cheap ops
    # such as add or reshapeInplace; instead, only ops with a high subgraph value,
    # such as matmul or normalisation, are outlined and reused.
    options.outlineThreshold = 10.0
    if args.pipeline:
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
        if args.recompute_checkpoint_every_layer and any(
                map(lambda l: l > 1, args.layers_per_ipu)):
            options.scheduleNonWeightUpdateGradientConsumersEarly = True

    options.optimizerStateTensorLocationSettings = bert_optimizer_location_settings(
        args)

    # Use replicated tensor sharding (RTS) to shard optimizer state across multiple IPU-PODs
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    if num_total_replicas > num_local_replicas and args.replicated_tensor_sharding:
        # Tensors with fewer elements than the number of local replicas are not worth sharding
        options.optimizerStateTensorLocationSettings.minElementsForReplicatedTensorSharding = num_local_replicas
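        # Restrict sharding to the replicas local to this instance: a
        # "Consecutive" communication group of num_local_replicas replicas.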
        sharding_domain = popart.CommGroup(popart.CommGroupType.Consecutive,
                                           num_local_replicas)

        # Ensure all related tensors have the same sharding domain set
        options.weightTensorLocationSettings.location.shardingDomain = sharding_domain
        options.optimizerStateTensorLocationSettings.location.shardingDomain = sharding_domain
        options.accumulatorTensorLocationSettings.location.shardingDomain = sharding_domain

    if "Mean" in args.gradient_reduction_type:
        options.accumulationAndReplicationReductionType = popart.ReductionType.Mean
        options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Post
        if args.gradient_reduction_type == "RunningMean":
            options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Running

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor

        # When not replicated, SyncPattern.SinglePipeline will provide better
        # overlap than this option.
        if device_is_replicated(args):
            if args.optimizer_state_offchip:
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.
                    OverlapMemoryOptimized, [0])
            elif args.replicated_tensor_sharding:
                # With OnChip + RTS this clusters optimizer steps into schedule
                # bins, improving outlining and scheduling time.
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.
                    OverlapMemoryOptimized)

    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.profile:
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = not args.save_initializers_externally
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    if args.inference and args.engine_cache is not None and not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")
    return options
Code Example #9
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1

    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    builder = popart.Builder()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []

    for i in range(
            0, batches_per_step * num_local_replicas * accumulation_factor *
            compute_batch):
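        # Unique, deterministic seed per (instance, sample) so each instance
        # generates its own slice of the data.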
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]

    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(popart.TensorInfo("UINT32", (compute_batch, )),
                                "l0")

    data = {}

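    # Host data is laid out as (batches_per_step, local replicas, accumulation
    # factor, compute batch, ...), the shape PyStepIO expects per step.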
    data[d0] = input_data.reshape((batches_per_step, num_local_replicas,
                                   accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])

    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL")
         for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(
            f"Setting RTS: {userOption}, num_total_replicas: {num_total_replicas} num_local_replicas: {num_local_replicas}"
        )
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        if num_total_replicas > num_local_replicas:
            locationSetting.location.shardingDomain = popart.CommGroup(
                popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    elif args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)
    else:
        raise ValueError(f"Unsupported optimizer: {args.optim}")

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     deviceInfo=deviceInfo,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer)

    session.prepareDevice()

    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    tmp_path = Path(args.tmpdir)
    tmp_path.mkdir(parents=True, exist_ok=True)
    file_path = str(tmp_path / args.filename)
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)