Ejemplo n.º 1
0
    def run(transposed):
        """Build and run a three-layer matmul graph with batch serialization.

        Every layer multiplies its input by a fresh all-ones square weight,
        and each layer output is added onto an accumulator tensor created
        with ``aiGraphcore.init``. The accumulator finally feeds an L1 loss,
        which is the only anchored output.

        Args:
            transposed: When true, the input is transposed first, the weight
                becomes the left-hand matmul operand, and the accumulator is
                created with the batch dimension last (axis 2) instead of
                first (axis 0).

        The graph is executed once as an inference session with outlining
        enabled and a batch serialization factor of 4. Nothing is returned.
        """
        batch_dim = 8
        feature_dim = 10
        builder = popart.Builder()
        ip = builder.addInputTensor(
            popart.TensorInfo("FLOAT", [batch_dim, feature_dim, feature_dim]))

        # Explicitly specify the batch dimension for init: it is the last
        # axis in the transposed layout and the first axis otherwise.
        if transposed:
            acc_shape, batch_axis = [feature_dim, feature_dim, batch_dim], 2
        else:
            acc_shape, batch_axis = [batch_dim, feature_dim, feature_dim], 0
        acc = builder.aiGraphcore.init(acc_shape, popart.DataType.FLOAT,
                                       popart.InitType.Zero, batch_axis)

        def add_layer(in_id):
            # One layer: a matmul against a new all-ones weight tensor.
            weight = builder.addInitializedInputTensor(
                np.ones([feature_dim, feature_dim], np.float32))
            operands = [weight, in_id] if transposed else [in_id, weight]
            return builder.aiOnnx.matmul(operands)

        layer_out = builder.aiOnnx.transpose([ip]) if transposed else ip

        # Three stacked layers, each one accumulated as soon as it is built.
        for _ in range(3):
            layer_out = add_layer(layer_out)
            acc = builder.aiOnnx.add([acc, layer_out])

        out = builder.aiGraphcore.l1loss([acc], 0.1)
        builder.addOutputTensor(out)

        device = tu.create_test_device(1)

        opts = popart.SessionOptions()
        opts.enableOutlining = True
        opts.batchSerializationSettings.factor = 4

        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(1,
                                     {out: popart.AnchorReturnType("All")}),
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        # Feed an all-ones input for the single step.
        host_input = np.ones((batch_dim, feature_dim, feature_dim),
                             dtype=np.float32)
        session.run(popart.PyStepIO({ip: host_input}, anchors))
Ejemplo n.º 2
0
def _run_impl(torchWriter, patterns, outputdir, cifarInIndices, device,
              device_hw_id, mode, syntheticData, transformations, epochs,
              printAnchorArrays):
    """Run a torch model and its PopArt counterpart on CIFAR-10 and compare.

    The ONNX model written by ``torchWriter`` is loaded into a PopArt
    session. In "train" mode both frameworks take one step per epoch and the
    resulting weights are compared with ``popart.NumericsReport``; in "infer"
    mode the torch outputs are compared with the PopArt anchor arrays.

    Args:
        torchWriter: Object supplying ``dataFlow``, ``inputShapeInfo``,
            ``samplesPerBatch``, ``outNames``, ``optimizer`` and the
            ``saveModel`` / ``train`` / ``infer`` methods used below.
        patterns: Patterns passed through to the PopArt session.
        outputdir: Directory for logs and the written ONNX models. Existing
            entries containing "runId" determine the id of this run.
        cifarInIndices: Mapping from input tensor id to the index of the
            corresponding element in a step of DataLoader data.
        device: "cpu", "ipu_model", "sim" or "hw"; any other value is handed
            to the session unchanged.
        device_hw_id: Hardware device id used when ``device == "hw"``.
        mode: "infer" or "train".
        syntheticData: If ``True``, use random-normal synthetic data.
        transformations: Iterable of graph transformation names to apply
            ("removeUnusedInputs", "prepareNodesForTraining"), or falsy.
        epochs: Number of steps to run.
        printAnchorArrays: If true, print anchor summaries in train mode.

    Returns:
        The anchor arrays of the PopArt session.

    Raises:
        Exception: If ``mode`` is not one of the valid modes.
        RuntimeError: On an unrecognised transformation, or in train mode
            when the writer does not have exactly one output.
        TestFailureError: If a relative error is NaN or exceeds the margin.
    """

    # Use one more than the largest run id already present in outputdir.
    runIds = [-1] + [
        int(x.split("runId")[1].split("_")[0])
        for x in os.listdir(outputdir) if "runId" in x
    ]
    baseId = 1 + max(runIds)

    def getFnModel(framework, epoch):
        return os.path.join(
            outputdir,
            "runId%d_%sModel_epoch%s.onnx" % (baseId, framework, epoch))

    def getFnPopArt(epoch):
        return getFnModel("PopArt", epoch)

    def getFnTorch(epoch):
        return getFnModel("Torch", epoch)

    def getFnModel0():
        return os.path.join(outputdir, "runId%d_model0.onnx" % (baseId, ))

    dataFlow = torchWriter.dataFlow
    inputShapeInfo = torchWriter.inputShapeInfo
    validModes = ["infer", "train"]
    if mode not in validModes:
        raise Exception("mode must be one of " + str(validModes))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # determine what the data directory is: a c10datadir.py next to this
    # file takes precedence, otherwise fall back to the system temp dir.
    datadir = "unset"

    dir_path = os.path.dirname(os.path.realpath(__file__))
    path_c10datadir = os.path.join(dir_path, "c10datadir.py")
    if os.path.exists(path_c10datadir):
        import c10datadir
        datadir = c10datadir.c10datadir
    else:
        tmpdir = tempfile.gettempdir()
        datadir = os.path.abspath(os.path.join(tmpdir, 'cifar10data'))
    print("Using datadir=%s" % (datadir))

    if (not os.path.exists(datadir)):
        print(
            "Specified datadir %s does not exist. Consider making it here with os.mkdir(datadir)"
            % (datadir, ))

    print("c10driver: getting data from", datadir)
    trainset = datasets.CIFAR10(root=datadir,
                                train=True,
                                download=False,
                                transform=transform)

    fnModel0 = getFnModel0()

    # write ONNX Model to file
    torchWriter.saveModel(fnModel=fnModel0)

    stepLoader = torch.utils.data.DataLoader(
        trainset,
        # the amount of data loaded for each step.
        # note this is not the batch size, it's the "step" size
        # (samples per step)
        batch_size=torchWriter.samplesPerBatch * dataFlow.batchesPerStep(),
        #non-random data loading
        shuffle=False,
        num_workers=0)

    deviceManager = popart.DeviceManager()

    # Create a CPU device
    if device == "cpu":
        device = deviceManager.createCpuDevice()

    # Create an IPU Model device
    elif device == "ipu_model":

        options = {"compileIPUCode": True, 'numIPUs': 1, 'tilesPerIPU': 4}
        device = deviceManager.createIpuModelDevice(options)

    # Create an Simulator
    elif device == "sim":
        options = {"numIpus": 1, "tilesPerIPU": 4}
        device = deviceManager.createSimDevice(options)

    # Get a Hardware Device
    elif device == "hw":
        # Get a hardware device that meets the requirements,
        # may throw if none are available.
        # Will attach to the device
        # NOTE(review): a device_hw_id of 0 is falsy and therefore falls
        # through to tu.acquire_ipu() -- confirm that this is intended.
        if device_hw_id:
            device = deviceManager.acquireDeviceById(device_hw_id)
        else:
            device = tu.acquire_ipu()

    # Enumerate available devices
    print("Enumerating devices")
    print("-------------------------------------")
    for idx, d in enumerate(deviceManager.enumerateDevices()):
        print('{0}. {1}'.format(idx, d))
    print("")

    opts = popart.SessionOptions()
    opts.logDir = outputdir
    if syntheticData == True:
        opts.syntheticDataMode = popart.SyntheticDataMode.RandomNormal

    # Either the path of the written model, or the transformed model proto.
    modelProtoX = fnModel0
    if transformations:
        gc = popart.GraphTransformer(fnModel0)
        for transformation in transformations:
            print("Running %s transformation pass" % (transformation, ))
            if transformation == "removeUnusedInputs":
                gc.removeUnusedInputs()

            elif transformation == "prepareNodesForTraining":
                gc.prepareNodesForTraining()

            else:
                raise RuntimeError("Unrecognised transformation %s" %
                                   (transformation, ))

        modelProtoX = gc.getModelProto()

    # Reads ONNX model from file and creates backwards graph,
    # performs Ir optimisations

    if mode == 'infer':
        session = popart.InferenceSession(fnModel=modelProtoX,
                                          inputShapeInfo=inputShapeInfo,
                                          dataFlow=dataFlow,
                                          patterns=patterns,
                                          userOptions=opts,
                                          deviceInfo=device)
    else:
        if len(torchWriter.outNames) != 1:
            raise RuntimeError("Expecting single scalar loss tensor")

        # Append output with an identity loss, to reduce to scalar if
        # necessary
        bder = popart.Builder(modelProtoX)
        loss = bder.aiGraphcore.identityloss(
            [torchWriter.outNames[0]], reduction=popart.ReductionType.Sum)
        session = popart.TrainingSession(fnModel=bder.getModelProto(),
                                         inputShapeInfo=inputShapeInfo,
                                         dataFlow=dataFlow,
                                         loss=loss,
                                         optimizer=torchWriter.optimizer,
                                         patterns=patterns,
                                         userOptions=opts,
                                         deviceInfo=device)

    # get the tensor info for the anchors
    anchorArrays = session.initAnchorArrays()

    allDotPrefixes = [x[0:-4] for x in os.listdir(outputdir) if ".dot" in x]
    print("Will generate graph pdfs for all of:")
    print(allDotPrefixes)
    import subprocess
    # set generateFromDots to True to
    # generate pdf figures of the Ir. It
    # requires the 'dot' program
    generateFromDots = False
    if generateFromDots:
        for name in allDotPrefixes:
            dotfile = os.path.join(outputdir, "%s.dot" % (name, ))
            outputfile = os.path.join(outputdir, "%s.pdf" % (name, ))
            exitStatus = subprocess.call(
                ["dot", "-T", "pdf", "-o", outputfile, dotfile])
            print("Exit status on `%s' was: %s" % (name, exitStatus))

    print("Setting device to IPU, and preparing it")
    session.prepareDevice()

    if mode == "train":
        print("Writing weights to device")
        session.weightsFromHost()

        print("Writing Optimizer tensors to device, if there are any")

    def addStepDimension(data, batchesPerStep):
        # Reshape [step * batch, ...] into [step, batch, ...].
        if batchesPerStep == 1:
            return data
        else:
            dataShape = np.array(np.shape(data))
            dataShape[0] //= batchesPerStep
            dataShape = np.insert(dataShape, 0, batchesPerStep)
            return np.reshape(data, dataShape)

    def reportTensorError(tensorInd, result):
        reportStr = str(tensorInd) + " :\n"
        reportStr += "  |pA - tA|^2 / (|pA||tA| + 1e-8)  = " + str(
            result) + "\n"
        return reportStr

    def subsampleBatches(array, refShape):
        arrayShape = np.shape(array)

        # Every Nth batch
        if len(arrayShape) == len(refShape):
            n = arrayShape[0] // refShape[0]
            return array[n - 1::n]

        # Last batch only
        else:
            return array[-1]

    def getTensorError(tA, pA):
        # pA, tA are corresponding tensors from two models
        pA_shape = np.shape(pA)
        tA_shape = np.shape(tA)
        assert (pA_shape == tA_shape), "Arrays must be same shape"

        ss_err = np.sum((np.array(pA) - np.array(tA))**2)
        ss_pA = np.sum(np.array(pA)**2)
        ss_tA = np.sum(np.array(tA)**2)
        return ss_err / (math.sqrt(ss_pA * ss_tA) + 1.0e-8)

    def checkResult(result, margin):
        if np.isnan(result):
            raise TestFailureError(str(result) + " is NaN")
        elif (result > margin):
            raise TestFailureError(
                str(result) + " is greater than " + str(margin))

    margin = 5.0e-7

    for epoch in range(epochs):  # loop over the dataset multiple times
        print("Epoch is %d" % (epoch, ))
        # A fresh iterator is created on every epoch, so with shuffle=False
        # this takes the first step's worth of data from the loader.
        stepData = next(iter(stepLoader))

        # Form the input map for one step's worth of data.
        # Note: data from the torch DataLoader has shape:
        #   [stepSize * batchSize, sampleShape]
        # whereas Popart expects input data of the shape:
        #   [stepSize, batchSize, sampleShape]
        # so we reshape the input array before passing to the stepio
        inputs = {}
        for tenId in cifarInIndices.keys():
            inputs[tenId] = \
                addStepDimension(stepData[cifarInIndices[tenId]].numpy(),
                                 session.dataFlow.batchesPerStep())

        if mode == "train":
            # take batchesPerStep passes (1 step), Torch
            torchWriter.train(inputs)

            # take batchesPerStep passes (1 step), PopArt
            pystepio = popart.PyStepIO(inputs, anchorArrays)
            session.run(pystepio)

            if printAnchorArrays:
                print(
                    "\nAnchor arrays (being printed as printAnchorArrays==True):"
                )
                for name in anchorArrays.keys():
                    arr = anchorArrays[name]
                    print("\nAnchored Array Name=", name, " and Size=",
                          arr.size)

                    if (arr.size < 10):
                        print("\nArray (of size < 10) values are")
                        print(arr)

                    if len(arr.shape) > 1:
                        for i, slice0 in enumerate(arr):
                            print("Sum along axis %d is Sum=%.15f" %
                                  (i, slice0.sum()))

                    print("Total Sum is %.15f" % (arr.sum()))

            # write models to file
            fnTorchModel = getFnTorch(epoch)
            fnPopArtModel = getFnPopArt(epoch)
            torchWriter.saveModel(fnTorchModel)
            session.modelToHost(fnPopArtModel)
            print("Writing models to " + fnTorchModel + " and " +
                  fnPopArtModel)

            # Compare parameters from updated Onnx models
            print("Obtaining popart NumericsReport, A: Torch, B: Popart.")
            # Value comparison: `epoch is 0` tested object identity, which
            # is implementation-dependent for ints and warns on Python 3.8+.
            if epoch == 0:
                nr = popart.NumericsReport(fnModel0, fnTorchModel, fnModel0,
                                           fnPopArtModel)
            else:
                nr = popart.NumericsReport(getFnTorch(epoch - 1), fnTorchModel,
                                           getFnPopArt(epoch - 1),
                                           fnPopArtModel)

            print(nr.fullReport())
            # One relative error calculated per weight tensor
            for tId, relerror in nr.getRelativeErrors().items():
                checkResult(relerror, margin)

        elif mode == "infer":
            # take batchesPerStep passes (1 step), Torch
            # returns map of outputs for each sample
            # Note: already are of dimension matching the
            # anchors
            torchOutputs = torchWriter.infer(inputs)

            # take batchesPerStep passes (1 step), PopArt
            pystepio = popart.PyStepIO(inputs, anchorArrays)
            session.run(pystepio)

            # Compare torch outputs tensors with popart output from
            # anchor tensor maps
            for nInd, outName in enumerate(torchWriter.outNames):
                # Torch outputs returned for all samples, whereas
                # anchors are returned as specified by the user.
                # Subsample torch outputs to match dimensions
                torchOutput = subsampleBatches(torchOutputs[outName],
                                               np.shape(anchorArrays[outName]))
                result = getTensorError(torchOutput, anchorArrays[outName])
                print(reportTensorError(nInd, result))
                checkResult(result, margin)

    return anchorArrays
Ejemplo n.º 3
0
def get_model(input_shape: List[int], weight_array: np.array,
              batches_per_step: int, replication_factor: int, batch_size: int,
              channels: int, data_len: int, synthetic_data: bool,
              buffer_streams: bool) -> Tuple:
    """Get a simple model for comparison with buffer streams on and off.
    Adapted from prefetch_test.py as we require to test the validity of streams
    here as well.

    Args:
        input_shape (List[int]): Shape of the data input tensor for a single
            micro-batch. The list passed in is not modified.
        weight_array (np.array): Initial value of the weight that the input
            is multiplied with.
        batches_per_step (int): Batches to run per step
        replication_factor (int): Replicas to run
        batch_size (int): Number of samples per model run
        channels (int): Number of channels e.g. RGB = 3
        data_len (int): Data size
        synthetic_data (bool): Use synthetic data (zeros in this case)
        buffer_streams (bool): The test option: whether to create ops
            before the stream in order to schedule data loading as part of
            graph scheduling. See T29603.

    Returns:
        Tuple: session, anchors, input_shape, label_shape required to run the
        model. The two shapes are the host-side shapes: a leading replica
        dimension is added when ``replication_factor > 1`` and a leading
        step dimension when ``batches_per_step > 1``.
    """

    micro_batch_size = batch_size // (replication_factor)

    builder = popart.Builder()

    data_shape = popart.TensorInfo("FLOAT", input_shape)
    lbl_shape = popart.TensorInfo("INT32", [micro_batch_size])
    w = builder.addInitializedInputTensor(weight_array)

    ip = builder.addInputTensor(data_shape, "main_input_123")
    lb = builder.addInputTensor(lbl_shape, "label_input_456")

    a = builder.aiOnnx.matmul([ip, w])
    o = builder.reshape_const(
        builder.aiOnnx, [a],
        [micro_batch_size, channels * data_len * data_len])
    relu = builder.aiOnnx.relu([o])
    sm = builder.aiOnnx.softmax([relu], axis=0, debugContext="output")
    builder.addOutputTensor(sm)
    # From here on `o` is the loss tensor rather than the reshape output.
    o = builder.aiGraphcore.nllloss([sm, lb],
                                    reduction=popart.ReductionType.Mean)

    # Anchor the inputs as well as the intermediate and final tensors.
    art = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(batches_per_step, {
        ip: art,
        lb: art,
        o: art,
        sm: art,
        a: art,
        relu: art
    })

    opts = popart.SessionOptions()
    opts.useHostCopyOps = buffer_streams
    # TODO: Fix outlining
    opts.enableOutlining = False

    ipus = 1

    if replication_factor > 1:
        opts.replicatedGraphCount = replication_factor
        opts.enableReplicatedGraphs = True
        ipus *= replication_factor
    device = tu.create_test_device(ipus)

    assert device

    patterns = popart.Patterns(popart.PatternsLevel.Minimal).enablePattern(
        "MatMulLhsGradOp", True).enablePattern("MatMulRhsGradOp", True)
    patterns.InPlace = False
    if synthetic_data:
        opts.syntheticDataMode = popart.SyntheticDataMode.Zeros

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=o,
                                     optimizer=popart.ConstSGD(LR),
                                     userOptions=opts,
                                     deviceInfo=device,
                                     patterns=patterns)

    session.setRandomSeed(0)
    session.prepareDevice()

    label_shape = [micro_batch_size]

    # Host-side shapes: prepend the replica and step dimensions. List
    # concatenation builds new lists, so the caller's list is untouched.
    if replication_factor > 1:
        input_shape = [replication_factor] + input_shape
        label_shape = [replication_factor] + label_shape
    if batches_per_step > 1:
        input_shape = [batches_per_step] + input_shape
        label_shape = [batches_per_step] + label_shape

    anchors = session.initAnchorArrays()

    # Return the extended input shape too, as documented; it was previously
    # computed above and then dropped from the result.
    return session, anchors, input_shape, label_shape
    def run_test(index, options):
        """Train a stack of attention layers for ten steps and return anchors.

        Each of the ``options['numLayers']`` layers multiplies the running
        activation by its own ``qkv_<i>`` weight and passes the result
        through ``attention_onnx``. The gradient of every ``qkv`` weight and
        the final activation are anchored. An L1 loss on the final
        activation is trained with ``ConstSGD(0.1)``, and the trained model
        is written to ``pingpong_attention_<index>.onnx`` under ``tmpdir``.

        Args:
            index: Identifier used in the name of the saved ONNX file.
            options: Mapping read for the keys ``numLayers``, ``pingPong``,
                ``outlining``, ``explicitRecomputation``, ``aliasZeroCopy``
                and ``batchSerialize``.

        Returns:
            The anchor arrays of the session after the final step.
        """
        builder = popart.Builder(opsets={
            "ai.onnx": 9,
            "ai.onnx.ml": 1,
            "ai.graphcore": 1
        })

        mask = builder.addInputTensor(popart.TensorInfo("FLOAT", mask_shape),
                                      "mask")
        x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                      "x_in")

        num_layers = options['numLayers']
        ping_pong = options['pingPong']

        def virtual_graph_for(layer):
            # With ping-pong the layers alternate between two virtual
            # graphs; otherwise every layer gets its own.
            if ping_pong:
                return layer % 2
            return layer

        anchor_types = {}
        x = x_in
        for layer in range(num_layers):
            qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{layer}")
            grad_id = popart.reservedGradientPrefix() + qkv
            anchor_types[grad_id] = popart.AnchorReturnType("All")

            with builder.virtualGraph(virtual_graph_for(layer)), \
                    builder.pingPongPhase(layer):
                x = builder.aiOnnx.matmul([x, qkv])
                x = attention_onnx(builder, x, mask, batch_size,
                                   sequence_length, hidden_size,
                                   attention_heads, qkv_length)

        # The loss lives on the same virtual graph as the last layer.
        with builder.virtualGraph(virtual_graph_for(num_layers - 1)):
            l1 = builder.aiGraphcore.l1loss([x], 0.1)

        proto = builder.getModelProto()

        anchor_types[x] = popart.AnchorReturnType("All")

        dataFlow = popart.DataFlow(batches_per_step, anchor_types)

        opts = popart.SessionOptions()
        if options["pingPong"]:
            opts.pingPongPhases = options['numLayers']
        else:
            opts.pingPongPhases = 0
        opts.enableOutlining = options["outlining"]

        # PingPong currently does its own recompute annotations
        if options["explicitRecomputation"]:
            opts.autoRecomputation = popart.RecomputationType.Standard
        else:
            opts.autoRecomputation = popart.RecomputationType.NoRecompute

        opts.outlineThreshold = -np.inf
        opts.enableOutliningCopyCostPruning = False
        if options["pingPong"]:
            opts.virtualGraphMode = popart.VirtualGraphMode.PingPong
        else:
            opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        opts.explicitRecomputation = options["explicitRecomputation"]
        opts.aliasZeroCopy = options["aliasZeroCopy"]
        opts.batchSerializationFactor = options["batchSerialize"]

        pat = popart.Patterns(popart.PatternsLevel.Default)

        # Ping-pong needs two IPUs, the manual layout uses four.
        num_ipus = 2 if options["pingPong"] else 4
        device = tu.create_test_device(num_ipus,
                                       pattern=popart.SyncPattern.Full)

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.ConstSGD(0.1),
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        anchor_arrays = session.initAnchorArrays()

        stepio = popart.PyStepIO({x_in: input_data, mask: mask_data},
                                 anchor_arrays)

        for _ in range(10):
            session.run(stepio)

        session.modelToHost(str(tmpdir / f"pingpong_attention_{index}.onnx"))

        return anchor_arrays
Ejemplo n.º 5
0
def bert_session_options(args, model):
    """Build the popart.SessionOptions for a BERT run from parsed arguments.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``floating_point_exceptions``, ``stochastic_rounding``,
            ``no_outlining``, ``execution_mode``,
            ``gradient_accumulation_factor``, ``replication_factor``,
            ``engine_cache``, ``gc_profile``, ``report_hw_cycle_count``,
            ``max_copy_merge_size``, ``disable_fully_connected_pass``,
            ``task``, ``sequence_length``, ``inference`` and
            ``variable_weights_inference``.
        model: Not used by this function; kept for the callers' signature.

    Returns:
        The configured ``popart.SessionOptions`` instance.
    """
    options = popart.SessionOptions()
    options.enableVirtualGraphs = True
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enableGroupedMatmuls = False
    options.enableOutlining = not args.no_outlining
    # Increasing the outlineThreshold prevents creating subgraphs of cheap Ops
    # such as add or reshapeInplace.
    # Instead only reusing ops with a highSubgraphValue such as matmul or normalisation.
    options.outlineThreshold = 10.0
    if args.execution_mode == "PIPELINE":
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.gc_profile:
        options.reportOptions = {
            "showVarStorage": "true",
            "showPerIpuMemoryUsage": "true",
            "showExecutionSteps": "true"
        }
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    # Addition of momentum tensors causes merged copies to exceed max
    # host translation table entries during the weightsFromHost program.
    # With the addition of disableGradAccumulationTensorStreams no copy merging
    # is needed but may be needed later when gradAccumulationTensorStreams are re-enabled
    # FIXME when T11642 is resolved.
    options.disableGradAccumulationTensorStreams = True
    # A max_copy_merge_size of -1 means "no limit".
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Workaround for T11642: copy merge size limit set to {args.max_copy_merge_size}"
        )
        options.engineOptions = {
            "opt.maxCopyMergeSize": str(args.max_copy_merge_size),
        }
    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    # WARNING: This causes SQuAD 384 12-layer to go OOM
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    if args.inference and args.engine_cache is not None and not args.variable_weights_inference:
        # logger.warning, not the deprecated logger.warn alias, to match the
        # other warnings emitted by this function.
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    return options
Ejemplo n.º 6
0
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[Union[popart.Loss, Iterable[popart.Loss]]] = None,
           optimizer: Optional[popart.Optimizer] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None):
    """Compile and run an ONNX model for one step on an acquired IPU device.

    A training session is created when ``optimizer`` is given, otherwise an
    inference session. After the step the model is written back to host and
    reloaded, and the device is detached, including when an error occurs.

    Args:
        proto: The ONNX model to run.
        data: Mapping from input tensor name to its value. When
            ``batches_per_step > 1`` each value is repeated along a new
            leading dimension.
        outputs: Name(s) of the tensors to anchor and return.
        loss: Loss(es) passed to the training session.
        optimizer: Optimizer; its presence selects a training session.
        return_stats: Enable instrumentation and also return memory and
            cycle statistics.
        log_dir: If set together with ``return_stats``, reports are saved
            there through ``gcprofile``.
        ipus: Number of IPUs needed; the request is rounded up to a power of
            two. ``None`` or a value below 2 means a single IPU.
        batches_per_step: Number of batches per step.
        user_options: Extra attributes to set on the session options.

    Returns:
        ``(anchor_values, post_proto)`` where ``anchor_values`` is a
        generator over the anchors in ``outputs`` order and ``post_proto``
        is the model read back after the run. With ``return_stats`` the
        tuple is extended by ``total_memory, max_tile_memory, cycles``.

    Raises:
        Exception: If no device could be acquired.
        popart.session.PrepareDeviceException: If compilation fails.
    """
    outputs = make_tuple(outputs)
    if loss is not None:
        loss = make_tuple(loss)
    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.enableGroupedMatmuls = False
    options.enableStochasticRounding = False
    options.reportOptions = {"showVarStorage": "true"}
    if ipus is not None and ipus > 1:
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        ipus = 1
    for key, value in user_options.items():
        setattr(options, key, value)

    # `ipus` is always an int at this point, so this assignment has always
    # been unconditional.
    # NOTE(review): it runs after virtualGraphMode and the user options have
    # been applied -- confirm that disabling virtual graphs here is intended.
    options.enableVirtualGraphs = False
    # Applied after the user options, so the instrumentation settings take
    # precedence over any user-supplied engineOptions when stats are wanted.
    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true"
        }

    # Round the request up to the next power of two.
    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    device = popart.DeviceManager().acquireAvailableDevice(request_ipus)
    # The cycle estimates of MSR_OPS codelets have not been validated
    # so it is incorrect to use the IPU_MODEL.
    if device is None:
        raise Exception("Failed to acquire IPU.")

    try:
        print("Compiling graph")
        if optimizer is not None:
            session = popart.TrainingSession(fnModel=proto,
                                             deviceInfo=device,
                                             dataFeed=data_flow,
                                             userOptions=options,
                                             losses=loss,
                                             optimizer=optimizer)
        else:
            session = popart.InferenceSession(fnModel=proto,
                                              deviceInfo=device,
                                              dataFeed=data_flow,
                                              userOptions=options)

        # Compile the Poplar Graph. If it fails, save the report when a log
        # directory was requested, then propagate the original error.
        try:
            session.prepareDevice()
        except popart.session.PrepareDeviceException as e:
            if return_stats and log_dir:
                import gcprofile
                os.makedirs(log_dir, exist_ok=True)
                gcprofile.save_popart_report(session,
                                             log_dir=log_dir,
                                             exception=e)
            raise
        print("Compilation complete")

        session.weightsFromHost()
        if optimizer is not None:
            session.optimizerFromHost()
        session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a batches_per_step dimension if needed
        if batches_per_step > 1:
            data = {
                k: np.repeat(v[np.newaxis], batches_per_step, 0)
                for k, v in data.items()
            }

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release device, also when compilation or the run failed.
        device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
Ejemplo n.º 7
0
def train(opts, model_file, ckpt_file) -> None:
    """
    Train and validate the MNIST model using command line args.

    One training session and one inference session are compiled from the
    same converted model. Every epoch runs the training loader, writes the
    trained weights to ``ckpt_file``, loads them into the inference session,
    runs the test loader and prints the epoch's loss and accuracy.

    Args:
        opts: The command line options
        model_file: Temporary file for holding the model
        ckpt_file: Temporary file for holding the weights

    """
    if not opts.test_mode:
        step_limit = NUM_TEST_SAMPLES // opts.batch_size
        if opts.batches_per_step > step_limit:
            print(
                "(batches-per-step * batch-size) is larger than test set!\n"
                " Reduced batches-per-step to: {}\n".format(step_limit)
            )
            opts.batches_per_step = step_limit

    # MNIST data loaders for the training and test splits
    train_loader = get_data_loader(opts, is_train=True)
    test_loader = get_data_loader(opts, is_train=False)

    print("Creating ONNX model.")
    data_in, output = create_model(opts.batch_size, model_file)
    print("Converting model.")
    proto, label_in, loss = convert_model(
        opts.batch_size, model_file.name, output
    )

    # Anchor both the model output and the loss with the "ALL" return type
    anchor_desc = {
        tensor_id: popart.AnchorReturnType("ALL")
        for tensor_id in (output, loss)
    }
    dataFlow = popart.DataFlow(opts.batches_per_step, anchor_desc)
    optimizer = popart.ConstSGD(0.01)

    userOpts = popart.SessionOptions()

    # Ensure weight tensors in the validation model are not modified by the IR
    userOpts.constantWeights = False

    # Optional synthetic data instead of the real loaders' contents
    syn_type = opts.syn_data_type
    if syn_type in ["random_normal", "zeros"]:
        print("Running with Synthetic Data Type '{}'".format(syn_type))
        if syn_type == "random_normal":
            userOpts.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        else:
            userOpts.syntheticDataMode = popart.SyntheticDataMode.Zeros

    # Device selection: IPU model when simulating, otherwise real hardware
    deviceManager = popart.DeviceManager()
    if not opts.simulation:
        print("Running using Hardware")
        device = deviceManager.acquireAvailableDevice()
        if device is None:
            print("Failed to acquire IPU. Exiting.")
            return
        if opts.test_mode:
            print(" IPU IDs: {}".format(device.driverIds))
    else:
        print("Running using IPU MODEL")
        device = deviceManager.createIpuModelDevice(
            {
                "compileIPUCode": True,
                "numIPUs": 1,
                "tilesPerIPU": TILES_PER_IPU,
            }
        )

    Session = namedtuple("Session", ["session", "anchors"])

    def build_session(is_training):
        # Compile a training or inference session and allocate its anchors.
        if opts.test_mode:
            userOpts.instrumentWithHardwareCycleCounter = True

        shared_args = {
            "fnModel": proto,
            "dataFlow": dataFlow,
            "userOptions": userOpts,
            "deviceInfo": device,
        }
        if is_training:
            session = popart.TrainingSession(
                loss=loss, optimizer=optimizer, **shared_args
            )
        else:
            session = popart.InferenceSession(**shared_args)

        graph_kind = "training" if is_training else "validation"
        print("Compiling the {} graph.".format(graph_kind))
        session.prepareDevice()

        # Buffers that receive the anchored results of each run
        return Session(session, session.initAnchorArrays())

    def run_step(target, stepio, timed_mode):
        # Run one step; time and report it only when opts.test_mode
        # equals `timed_mode`.
        timed = opts.test_mode == timed_mode
        if timed:
            start = time()
        target.session.run(stepio)
        if timed:
            duration = time() - start
            report_string = "{:<8.3} sec/itr.".format(duration)
            report_string += "   " + iteration_report(opts, duration)
            print(report_string)
            print(
                "Hardware cycle count per 'run':",
                target.session.getCycleCount(),
            )
            print("Total time: {}".format(duration))

    training = build_session(True)
    validation = build_session(False)

    inputs_per_step = opts.batch_size * opts.batches_per_step
    for epoch in range(opts.epochs):
        # Training: from the second epoch on, reload the checkpoint first
        if epoch > 0:
            training.session.resetHostWeights(ckpt_file.name)
        training.session.weightsFromHost()
        for data, label in train_loader:
            # Skip batches that do not fill a whole step
            if len(label) != inputs_per_step:
                continue
            data, label = preprocess_data(data, label)
            stepio = popart.PyStepIO(
                {data_in: data, label_in: label}, training.anchors
            )
            run_step(training, stepio, "training")

        # Evaluation: move the trained weights into the validation session
        aggregated_loss = 0
        num_correct = 0

        training.session.modelToHost(ckpt_file.name)
        validation.session.resetHostWeights(ckpt_file.name)
        validation.session.weightsFromHost()

        for data, label in test_loader:
            if len(label) != inputs_per_step:
                continue

            data, label = preprocess_data(data, label)
            stepio = popart.PyStepIO(
                {data_in: data, label_in: label}, validation.anchors
            )
            run_step(validation, stepio, "inference")

            aggregated_loss += np.mean(validation.anchors[loss])
            logits = validation.anchors[output].reshape(
                [inputs_per_step, NUM_CLASSES]
            )
            predictions = np.argmax(logits, 1)
            num_correct += np.sum(
                predictions == label.reshape([inputs_per_step])
            )
        aggregated_loss /= len(test_loader)
        accuracy = num_correct / len(test_loader.dataset)

        # Log statistics
        print("Epoch #{}".format(epoch))
        print("   Loss={0:.4f}".format(aggregated_loss))
        print("   Accuracy={0:.2f}%".format(accuracy * 100))
    def run_test(mode=None, verify=None):
        """Train a two-stage pipelined model with checkpointed activations.

        Args:
            mode: Value assigned to ``opts.autoRecomputation``; left at the
                session default when None.
            verify: Optional callable invoked with the session after the
                ten training runs.

        Returns:
            The anchor arrays filled by the session.
        """
        builder = popart.Builder()

        x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                      "x_in")
        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")

        def add_gelu_checkpoint(act):
            # Gelu is a unary operation that takes the fwd input
            # activation, so its gradient needs an implicit recompute input.
            doubled = builder.aiOnnx.add([act, act])
            activated = builder.aiGraphcore.gelu([doubled])
            return builder.checkpointOutput([activated])[0]

        # We want a bwd pass that looks like:
        #
        # restore, op1, restore, op2, restore, op3
        #
        # Where op1, op2 & op3 are gradient operations that
        # have implicit recompute inputs.
        with builder.virtualGraph(0), builder.pipelineStage(0):
            act = builder.aiOnnx.matmul([x_in, weight_1])
            act = builder.checkpointOutput([act])[0]
            act = add_gelu_checkpoint(act)
            o = add_gelu_checkpoint(act)

        with builder.virtualGraph(1), builder.pipelineStage(1):
            l1 = builder.aiGraphcore.l1loss([o], 0.1)

        proto = builder.getModelProto()

        anchor_ids = [o, popart.reservedGradientPrefix() + weight_1]
        dataFlow = popart.DataFlow(1, anchor_ids)

        opts = popart.SessionOptions()
        opts.enableOutlining = False
        opts.enablePipelining = True
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = gradient_accumulation
        opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
        if mode is not None:
            opts.autoRecomputation = mode
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual

        device = tu.create_test_device(numIpus=2,
                                       opts={"compileIPUCode": False})
        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.Adam({}),
                                         deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()

        anchors = session.initAnchorArrays()
        stepio = popart.PyStepIO({x_in: input_data}, anchors)

        for _ in range(10):
            session.run(stepio)

        if verify is not None:
            verify(session)

        return anchors
def test_final_stage_recompute_0():
    """Check the recompute attribute of two MatMuls in the last stage.

    Builds a two-stage pipelined model with
    ``RecomputationType.Pipeline``. In the final stage the MatMul whose
    output is passed through ``checkpointOutput`` is expected to be marked
    ``recompute == "YES"`` in the serialized IR, and the MatMul that feeds
    the loss is expected to be marked ``"NO"``. The session is only
    constructed; the device is never prepared or run.
    """
    np.random.seed(0)

    gradient_accumulation = 5
    batch_size = 1
    hidden_size = 16

    input_shape = [batch_size, hidden_size]
    weight_data = np.random.normal(0, 0.02, [hidden_size, hidden_size]).astype(
        np.float32)

    builder = popart.Builder()

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    with builder.virtualGraph(0), builder.pipelineStage(0):
        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
        x = builder.aiOnnx.matmul([x_in, weight_1])

    with builder.virtualGraph(1), builder.pipelineStage(1):
        weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
        x_recomp = builder.aiOnnx.matmul([x, weight_2])
        # This MatMul should be recomputed
        x = builder.checkpointOutput([x_recomp])[0]

        weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")
        # This MatMul should not be recomputed
        x_no_recomp = builder.aiOnnx.matmul([x, weight_3])
        l1 = builder.aiGraphcore.l1loss([x_no_recomp], 0.1)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, [l1])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    opts.autoRecomputation = popart.RecomputationType.Pipeline
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=tu.create_test_device(
                                         numIpus=2,
                                         opts={"compileIPUCode": False}))

    # Verify that the recompute attribute of each MatMul in the main graph
    # is correct.
    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    for op in ir["maingraph"]:
        # Collect the output names once per op instead of once per branch
        output_names = [out["name"] for out in op["outputs"]]
        if x_recomp in output_names:
            assert op["attributes"]["recompute"] == "YES"
        elif x_no_recomp in output_names:
            assert op["attributes"]["recompute"] == "NO"
Ejemplo n.º 10
0
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      replicated_graph_count=1,
                      doProfiling=False,
                      doDropout=False,
                      doGradientAccl=False,
                      acclSteps=1,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a small conv/softmax model, run one step and return its anchors.

    The graph is sin -> exp -> conv -> reshape -> (optional dropout) ->
    softmax -> nllloss. The loss, the conv weight and the exp output are
    always anchored; training additionally anchors the gradient of the data
    input.

    Several flags are compared by identity (``is True`` / ``is False``), so
    only real ``bool`` values select those branches.

    Args:
        doSharding: When not ``False``, use manual virtual graphs across two
            IPUs per replica.
        doPipelining: Assigned to ``opts.enablePipelining``.
        batchesPerStep: Batches per step passed to ``popart.DataFlow``.
        doTraining: ``True`` builds a ``TrainingSession``, anything else an
            ``InferenceSession``.
        replicated_graph_count: Number of replicas; values above 1 enable
            replicated graphs.
        doProfiling: When ``True``, call ``gcprofile.save_popart_report``
            after the run.
        doDropout: When truthy, insert a dropout (ratio 0.2) before softmax.
        doGradientAccl: Assigned to ``opts.enableGradientAccumulation``.
        acclSteps: Accumulation factor; also divides the batch size of 16
            into micro-batches.
        doDevicex: When ``False``, return ``None`` right after the session
            is constructed.
        anchorRestoredTensors: With training and pipelining both ``True``,
            also anchor the data input and the restored copies of the data
            input and exp output.
        returnRawInput: When ``True``, store the generated input data in the
            result under the key ``"input_raw"``.

    Returns:
        The anchor arrays after one ``session.run``, or ``None`` when
        ``doDevicex`` is ``False``.
    """
    # Fixed seed so the generated labels and data are reproducible
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 16
    microBatchSize = batchSize // acclSteps

    shape_d0 = [microBatchSize, 2, 4, 4]
    shape_l0 = [microBatchSize]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugContext="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [microBatchSize, 32])
    if doDropout:
        do0 = builder.aiOnnx.dropout([r0], num_outputs=1, ratio=0.2)[0]
        out = builder.aiOnnx.softmax([do0], axis=1, debugContext="sfm")
    else:
        out = builder.aiOnnx.softmax([r0], axis=1, debugContext="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0],
                                      reduction=popart.ReductionType.Sum)

    art = popart.AnchorReturnType("All")

    # Anchors common to training and inference
    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradientAccl
    opts.accumulationFactor = acclSteps
    opts.enableStochasticRounding = False

    if doSharding is False:
        numIpus = 1 * replicated_graph_count
    else:
        # Manual placement: sin/exp/conv on virtual graph 0, the rest on 1
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIpus = 2 * replicated_graph_count
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 0)
        builder.virtualGraph(c0, 0)
        builder.virtualGraph(r0, 1)
        if doDropout:
            builder.virtualGraph(do0, 1)
        builder.virtualGraph(out, 1)
        builder.virtualGraph(nll, 1)

    if replicated_graph_count > 1:
        opts.replicatedGraphCount = replicated_graph_count
        opts.enableReplicatedGraphs = True

    device = tu.create_test_device(numIpus=numIpus)

    if doTraining is True:
        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=popart.DataFlow(
                                             batchesPerStep, anchor_map),
                                         loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         userOptions=opts,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                          dataFlow=popart.DataFlow(
                                              batchesPerStep, anchor_map),
                                          userOptions=opts,
                                          deviceInfo=device)

    # Callers that only need the session to be constructed stop here
    if doDevicex is False:
        return None

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    # Exclusive upper bound for the randomly drawn labels below
    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)

    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    # With all options enabled return anchors are of the shape:
    # [batches_per_step, accl_factor, repl_factor, micro_batch, *data_shape]
    if acclSteps > 1:
        shape_d0.insert(0, acclSteps)
        label = label.reshape([acclSteps, -1])
    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)

    # Drawn after the labels; the draw order determines the generated values
    data = np.random.random_sample(shape_d0).astype(np.float32)

    # This is a slightly odd case - we want the same data to be input for both
    # replicated graphs, but the dimension we need to repeat on is either the
    # first or second (the replication dimension) depending on whether we
    # have gradient accumulation enabled.
    # If we are not testing, this is a lot simpler as we can split samples however
    # we want.
    if replicated_graph_count > 1:
        if acclSteps > 1:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 2)
            label = label.reshape([replicated_graph_count, -1])
        else:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 1)
            label = label.reshape([replicated_graph_count, -1])

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)
    stepio.enableRuntimeAsserts(False)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        # Imported lazily so gcprofile is only needed when profiling
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
Ejemplo n.º 11
0
# Let's create a known tensor, but with an undefined shape
# NOTE(review): the tensor this comment refers to is not created in the
# visible code; `y` below is the convolution of `leaky_relu` with `w`.
y = builder.aiOnnx.conv([leaky_relu, w],
                        dilations=[1, 1],
                        pads=[padding] * 4,
                        strides=[1, 1])

# L1 loss over the convolution output; used as the training loss below
l1 = builder.aiGraphcore.l1loss([y], 1.0)

proto = builder.getModelProto()

# One shared "All" anchor return type for every anchored tensor
art = popart.AnchorReturnType("All")
# Describe how to run the model
dataflow = popart.DataFlow(1, {y: art, leaky_relu: art, w: art, l1: art})

# Create a session to compile and execute the graph
options = popart.SessionOptions()
# IPU model device created with an empty options dict
device = popart.DeviceManager().createIpuModelDevice({})
session = popart.TrainingSession(fnModel=proto,
                                 dataFlow=dataflow,
                                 loss=l1,
                                 optimizer=popart.ConstSGD(0.001),
                                 userOptions=options,
                                 deviceInfo=device)

# Compile graph
session.prepareDevice()
# Create buffers to receive results from the execution
anchors = session.initAnchorArrays()
# Copy weights onto the IPU
session.weightsFromHost()
# Generate some random input data.
Ejemplo n.º 12
0
    def run_test(enablePipelining):
        """Run two training steps, replacing the optimizer between them.

        Builds a chain of three MatMuls followed by an L1 loss. When
        ``enablePipelining`` is truthy each MatMul is placed on its own
        virtual graph (0, 1, 2) and three IPUs are requested; otherwise a
        single IPU is used.

        Returns:
            A two-element list holding a copy of the anchored tensor after
            each of the two runs.
        """
        popart.getLogger().setLevel("TRACE")

        builder = popart.Builder()

        def place(tensor_id, ipu):
            # Virtual graphs are only assigned when pipelining is enabled
            if enablePipelining:
                builder.virtualGraph(tensor_id, ipu)

        i1 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", input_data.shape[1::]))
        w0, w1, w2 = (builder.addInitializedInputTensor(weights)
                      for weights in (weight_data_0, weight_data_1,
                                      weight_data_2))

        o0 = builder.aiOnnx.matmul([i1, w0])
        place(o0, 0)

        o1 = builder.aiOnnx.matmul([o0, w1])
        place(o1, 1)

        o2 = builder.aiOnnx.matmul([o1, w2])
        place(o2, 2)

        o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)
        place(o2l1, 2)

        proto = builder.getModelProto()

        # The only anchor is the tensor named by the default scaled
        # learning rate prefix; its value is captured after each run.
        anchorId = popart.reservedDefaultScaledLearningRate0Prefix() + "FLOAT"
        dataFlow = popart.DataFlow(bps, [anchorId])

        optimizer = popart.SGD({"defaultLearningRate": (1.0, False)})

        opts = popart.SessionOptions()
        if enablePipelining:
            opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        opts.enablePipelining = enablePipelining

        numIPUs = 3 if enablePipelining else 1

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=dataFlow,
            loss=o2l1,
            optimizer=optimizer,
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

        session.prepareDevice()

        anchors = session.initAnchorArrays()
        stepio = popart.PyStepIO({i1: input_data}, anchors)

        session.weightsFromHost()

        # run 2 steps, changing the optimizer halfway through
        session.run(stepio)
        before_update = np.copy(anchors[anchorId])

        session.updateOptimizerFromHost(
            popart.SGD({"defaultLearningRate": (0.5, False)}))

        session.run(stepio)
        after_update = np.copy(anchors[anchorId])

        return [before_update, after_update]
Ejemplo n.º 13
0
def test_virtual_graph4():
    """Compile and run a training graph split across virtual graphs 3 and 2.

    Two additions and their L1 losses are placed on virtual graph 3, a
    multiplication and its L1 loss on virtual graph 2, and the sum of the
    three losses on virtual graph 3 again. The test passes if the session
    compiles and one step runs on a 4-IPU test device; there are no
    assertions on the anchored values.
    """
    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o1l1 = builder.aiGraphcore.l1loss([o1], 0.1)
        o2 = builder.aiOnnx.add([i3, o1])
        o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.mul([i1, i3])
        o3l1 = builder.aiGraphcore.l1loss([o3], 0.1)

    with builder.virtualGraph(3):
        loss = builder.aiOnnx.sum([o1l1, o2l1, o3l1])

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o1: popart.AnchorReturnType("All"),
            o2: popart.AnchorReturnType("All"),
            o3: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3:
            popart.AnchorReturnType("All")
        })

    optimizer = popart.ConstSGD(0.01)

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=loss,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))

    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3}
    stepio = popart.PyStepIO(inputs, anchors)

    # Host-to-device weight transfer belongs before the first run, matching
    # the other session flows in this file (it previously came after run()).
    s.weightsFromHost()
    s.run(stepio)
Ejemplo n.º 14
0
def test_virtual_graph3():
    """Compile and run a training graph split across virtual graphs 3 and 2.

    Two additions of the four inputs are placed on virtual graph 3; their
    sum, a further addition with the first input and the L1 loss are placed
    on virtual graph 2. The test passes if the session compiles and one
    step runs on a 4-IPU test device; there are no assertions on the
    anchored values.
    """
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i4 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o2 = builder.aiOnnx.add([i3, i4])

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.add([o1, o2])
        o = builder.aiOnnx.add([i1, o3])
        # From here on `o` names the L1 loss, which is also anchored below
        o = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i4:
            popart.AnchorReturnType("All")
        })

    optimizer = popart.SGD({"defaultLearningRate": (0.01, True)})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=o,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))

    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)
    data4 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3, i4: data4}
    stepio = popart.PyStepIO(inputs, anchors)

    # Host-to-device weight transfer belongs before the first run, matching
    # the other session flows in this file (it previously came after run()).
    s.weightsFromHost()
    s.run(stepio)
Ejemplo n.º 15
0
def run_model(tmpdir, batches_per_step, accum_factor, replicas, tile_set,
              exchange_strategy):
    """Train the model from ``get_model`` and measure compute/IO overlap.

    Args:
        tmpdir: Directory passed to ``tu.set_autoreport_options`` for the
            generated report.
        batches_per_step: Batches per step, forwarded to ``get_model`` and
            used to size the generated inputs.
        accum_factor: Gradient accumulation factor; accumulation is enabled
            only when it is greater than 1.
        replicas: Number of replicas and of IPUs requested; replicated
            graphs are enabled only when it is greater than 1.
        tile_set: ``popart.TileSet.IO`` reserves 128 IO tiles, any other
            value reserves none.
        exchange_strategy: Forwarded unchanged to ``get_model``.

    Returns:
        A tuple ``(overlapPercentage, weights_data)``: the value returned by
        ``get_compute_io_overlap_percentage`` and a dict of the weights read
        back from the session.

    Raises:
        AssertionError: If any weight read back contains NaN.
    """
    size = 64

    # The last value returned by get_model is not needed here; it was
    # previously bound to a local that shadowed the builtin `sum`.
    proto, inputs, weights, labels, dataFlow, loss, _ = get_model(
        size, batches_per_step, 4, 1, tile_set, exchange_strategy)

    opts = popart.SessionOptions()
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    opts.instrumentWithHardwareCycleCounter = False
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Both true & false should work - testing with false to avoid
    # host-cycle-overhead
    opts.rearrangeAnchorsOnHost = False
    opts.rearrangeStreamsOnHost = False

    # Set session options to generate the report
    tu.set_autoreport_options(opts, tmpdir, output_execution_profile=True)

    if accum_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accum_factor

    if tile_set == popart.TileSet.IO:
        opts.numIOTiles = 128
    else:
        opts.numIOTiles = 0

    if replicas > 1:
        opts.enableReplicatedGraphs = True
        opts.replicatedGraphCount = replicas

    pat = popart.Patterns(popart.PatternsLevel.Default)

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        loss=loss,
        optimizer=popart.ConstSGD(1e-6),
        patterns=pat,
        # Trying to use less than all the tiles throw an error like
        #   popart_core.poplar_exception: Trying to access tile 72 on IPU
        #   0 but the virtual graph only covers the following tiles on
        #   that IPU: 0-63
        # The error happens in a call to poplar made by gcl::perIPUTiles.
        deviceInfo=tu.create_test_device(numIpus=replicas,
                                         tilesPerIPU=tu.USE_ALL_TILES))

    anchors = session.initAnchorArrays()

    session.prepareDevice()

    np.random.seed(224488)

    session.weightsFromHost()

    warmup_iterations = 1
    calc_iterations = 1

    # Leading dimension shared by every generated input and by the labels
    samples_per_step = replicas * batches_per_step * accum_factor

    for _ in range(warmup_iterations + calc_iterations):
        datainputs = {
            input_id: (np.random.normal(
                0, 0.05, (samples_per_step, 1, size,
                          size)).astype(np.float32))
            for input_id in inputs
        }
        datainputs[labels] = np.random.randint(
            0, size, (samples_per_step, 1, size))
        stepio = popart.PyStepIO(datainputs, anchors)
        session.run(stepio)

    session.weightsToHost()
    weights_data = {
        w: np.zeros((1, size, size), dtype=np.float32)
        for w in weights
    }
    weights_read = popart.PyWeightsIO(weights_data)
    session.readWeights(weights_read)

    # Training must not have produced NaNs in any weight
    for w in weights_data:
        assert np.count_nonzero(np.isnan(weights_data[w])) == 0

    report = session.getReport()

    overlapPercentage = get_compute_io_overlap_percentage(
        report, warmup_iterations)

    return overlapPercentage, weights_data
    def run_test(mode=None, verify=None):
        """Train a two-stage pipelined model with group-norm checkpoints.

        Args:
            mode: Value assigned to ``opts.autoRecomputation``; left at the
                session default when None.
            verify: Optional callable invoked after the ten training runs
                with the session and the id of the first checkpointed
                tensor.

        Returns:
            The anchor arrays filled by the session.
        """
        builder = popart.Builder()

        def norm(input_x):
            # Each call adds its own "Gamma" (ones) and "Beta" (zeros)
            # tensors; only the first groupnormalization output is returned.
            scale = builder.addInitializedInputTensor(
                np.ones(hidden_size, np.float32), "Gamma")
            shift = builder.addInitializedInputTensor(
                np.zeros(hidden_size, np.float32), "Beta")
            normed = builder.aiGraphcore.groupnormalization(
                [input_x, scale, shift], 1)
            return normed[0]

        x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                      "x_in")

        weight_1, weight_2, weight_3 = (
            builder.addInitializedInputTensor(weight_data, tensor_name)
            for tensor_name in ("weight_1", "weight_2", "weight_3"))

        with builder.virtualGraph(0), builder.pipelineStage(0):
            first = norm(builder.aiOnnx.matmul([x_in, weight_1]))

            # If recomputeOutputs was used directly on the norm output all 3
            # outputs of groupnormalization would be stashed.
            # By using a checkpointOutput only 1 output will be stashed and
            # the rest will be recomputed.
            x_0 = builder.checkpointOutput([first])[0]

            second = norm(builder.aiOnnx.matmul([x_0, weight_2]))
            residual = builder.aiOnnx.add([x_0, second])

            # This checkpoint should be redundant as x_1 will be stashed
            # at the start of stage1 on ipu1.
            x_1 = builder.checkpointOutput([residual])[0]

        with builder.virtualGraph(1), builder.pipelineStage(1):
            o = builder.aiOnnx.matmul([x_1, weight_3])
            l1 = builder.aiGraphcore.l1loss([o], 0.1)

        proto = builder.getModelProto()

        # Anchor the output and the gradient of every weight
        anchor_ids = [o] + [
            popart.reservedGradientPrefix() + weight
            for weight in (weight_1, weight_2, weight_3)
        ]
        dataFlow = popart.DataFlow(1, anchor_ids)

        opts = popart.SessionOptions()
        opts.enableOutlining = False
        opts.enablePipelining = True
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = gradient_accumulation
        opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
        if mode is not None:
            opts.autoRecomputation = mode
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual

        device = tu.create_test_device(numIpus=2,
                                       opts={"compileIPUCode": False})
        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.Adam({}),
                                         deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()

        anchors = session.initAnchorArrays()
        stepio = popart.PyStepIO({x_in: input_data}, anchors)

        for _ in range(10):
            session.run(stepio)

        if verify is not None:
            verify(session, x_0)

        return anchors
Ejemplo n.º 17
0
def matmul_avail_memory(capfd, apply_to_conv=True, avail_mem_prop=0.9):
    """Train a small matmul+relu model for one step and return captured stderr.

    ``POPLIBS_LOG_LEVEL`` is set to ``DEBUG`` while the model is built,
    compiled and run, and is set to ``NONE`` again before returning. The reset
    also happens when building, compiling or running raises, so a failing call
    does not leave debug logging switched on for whatever runs next in the
    same process.

    Args:
        capfd: pytest ``capfd`` fixture; ``readouterr()`` is called on it once
            the training step has run.
        apply_to_conv: if true, ``setAvailableMemoryProportion`` is applied to
            the matmul output; otherwise it is applied to the relu output,
            which the calling tests expect to be rejected with an error.
        avail_mem_prop: value passed to ``setAvailableMemoryProportion``.

    Returns:
        The text captured on stderr.
    """
    os.environ["POPLIBS_LOG_LEVEL"] = "DEBUG"
    try:
        builder = popart.Builder()

        input_shape = popart.TensorInfo("FLOAT", [2, 4])
        weight_shape = popart.TensorInfo("FLOAT", [4, 8])

        weight_data = np.ones(weight_shape.shape(), np.float32)
        input_ = builder.addInputTensor(input_shape)
        weights = builder.addInitializedInputTensor(weight_data)
        act = builder.aiOnnx.matmul([input_, weights])
        o = builder.aiOnnx.relu([act])
        loss = builder.aiGraphcore.identityloss([o])

        if apply_to_conv:
            # Apply the setAvailableMemoryProportion to the matmul.
            builder.setAvailableMemoryProportion(act, avail_mem_prop)
        else:
            # Error-path variant: apply the setting to the relu output
            # instead of the matmul; the caller expects this to fail.
            builder.setAvailableMemoryProportion(o, avail_mem_prop)

        anchor_names = [
            o,
            popart.reservedGradientPrefix() + input_,
            popart.reservedGradientPrefix() + weights
        ]
        training_dataFlow = popart.DataFlow(
            1, {
                anchor_names[0]: popart.AnchorReturnType("All"),
                anchor_names[1]: popart.AnchorReturnType("All"),
                anchor_names[2]: popart.AnchorReturnType("All")
            })

        opts = popart.SessionOptions()
        opts.constantWeights = False  # Allow the weights to be updated

        # Create the device
        device = tu.create_test_device(1, opts={"compileIPUCode": True})
        device.attach()

        # Prepare the input data
        input_data = np.random.random_sample(input_shape.shape()).astype(
            np.float32)

        # Prepare the Training session
        training_session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=training_dataFlow,
            loss=loss,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=device)

        # Compile the training graph
        training_session.prepareDevice()

        # Run the training session
        training_session.weightsFromHost()

        training_anchors = training_session.initAnchorArrays()
        training_inputs = {input_: input_data}

        training_session.run(
            popart.PyStepIO(training_inputs, training_anchors))

        captured = capfd.readouterr()
    finally:
        # Always switch the poplibs debug logging back off, even on failure.
        os.environ["POPLIBS_LOG_LEVEL"] = "NONE"

    return captured.err
def test_pipeline_stage_merging():
    """Check the stashes produced when two pipeline stages share one IPU.

    Three matmuls are placed on pipeline stages 0, 1 (both on virtual graph 0)
    and 2 (virtual graph 1). The serialized IR of the training session is then
    inspected and the only stashed tensor must be ``x_in``.
    """
    np.random.seed(0)

    # With 3 stages the minimum pipeline cycles is 5
    # With 2 stages the minimum pipeline cycles is 3
    # So if the consecutive stages aren't fused an error will be thrown.
    accl_factor = 3
    hidden = 16
    x_shape = [1, hidden]

    # Draw order (weights first, then inputs) is kept so the RNG stream is
    # consumed exactly as before.
    w_init = np.random.normal(0, 0.02,
                              [hidden, hidden]).astype(np.float32)
    input_data = np.random.normal(0, 0.02,
                                  [accl_factor] + x_shape).astype(np.float32)

    builder = popart.Builder()

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", x_shape), "x_in")

    weight_1 = builder.addInitializedInputTensor(w_init, "weight_1")
    weight_2 = builder.addInitializedInputTensor(w_init, "weight_2")
    weight_3 = builder.addInitializedInputTensor(w_init, "weight_3")

    # Pipelining should combine stage 0 and 1.
    with builder.virtualGraph(0), builder.pipelineStage(0):
        x_0 = builder.aiOnnx.matmul([x_in, weight_1])

    with builder.virtualGraph(0), builder.pipelineStage(1):
        x_1 = builder.aiOnnx.matmul([x_0, weight_2])

    with builder.virtualGraph(1), builder.pipelineStage(2):
        o = builder.aiOnnx.matmul([x_1, weight_3])
        l1 = builder.aiGraphcore.l1loss([o], 0.1)

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = accl_factor
    opts.autoRecomputation = popart.RecomputationType.Pipeline
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    device = tu.create_test_device(numIpus=2,
                                   opts={"compileIPUCode": False})
    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=popart.DataFlow(1, [o]),
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.ConstSGD(1e-9),
                                     deviceInfo=device)

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    # Names of the first input of every Stash op in the main graph.
    stashed = {
        op["inputs"][0]["name"]
        for op in ir["maingraph"] if op["type"] == "Stash"
    }

    assert {'x_in'} == stashed
Ejemplo n.º 19
0
def test_reset_host_weights_with_extra_tensor_in_onnx_model():
    """
    1. Create a training session, and a corresponding validation session
    2. The training session must contain some feature that means when writing
       the ONNX model back to the host, it contains extra initializers compared
       with the original (builder-generated) model. In this case we achieve this
       by using an SGD optimizer with momentum.
    3. Try resetting the weights of the validation session using the ONNX model
       with the additional momentum tensor (call resetHostWeights)
    4. Observe that a PopART exception is thrown
    5. Try again, but with ignoreWeightsInModelWithoutCorrespondingHostWeight.
    6. Observe that it succeeds

    The ONNX file is written into a temporary directory that is removed again
    when the test finishes, whether it passes or fails.
    """
    def getModelWithRandomWeights():
        # Builds input -> matmul(random 2x2 weight) -> l1loss and registers
        # the loss as the only output tensor.
        builder = popart.Builder()
        dShape = [2, 2]
        i0 = builder.addInputTensor(popart.TensorInfo("FLOAT", dShape))
        wData = np.random.rand(*dShape).astype(np.float32)
        w0 = builder.addInitializedInputTensor(wData)
        o = builder.aiOnnx.matmul([i0, w0])
        loss = builder.aiGraphcore.l1loss([o], 0.1)
        builder.addOutputTensor(loss)
        return builder

    device = tu.create_test_device()
    tr_builder = getModelWithRandomWeights()
    o = tr_builder.getOutputTensorIds()[0]

    # 1. & 2.
    # Training
    tr_opt = popart.SGD({"defaultMomentum": (0.01, True)})
    tr_sess = popart.TrainingSession(fnModel=tr_builder.getModelProto(),
                                     dataFlow=popart.DataFlow(1, []),
                                     loss=o,
                                     optimizer=tr_opt,
                                     deviceInfo=device)
    tr_sess.prepareDevice()

    # TemporaryDirectory (rather than mkdtemp) so the directory and the model
    # file inside it do not outlive the test.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfile = os.path.join(tmpdir, "tr_model.onnx")
        tr_sess.modelToHost(tmpfile)

        # Validation (with different model proto weights)
        va_builder = getModelWithRandomWeights()
        va_opts = popart.SessionOptions()
        va_opts.constantWeights = False
        va_sess = popart.InferenceSession(fnModel=va_builder.getModelProto(),
                                          dataFlow=popart.DataFlow(1, [o]),
                                          deviceInfo=device,
                                          userOptions=va_opts)
        va_sess.prepareDevice()

        # 3. Try reset validation weights with training weights
        wId = [
            w for w in va_builder.getInputTensorIds()
            if va_builder.isInitializer(w)
        ][0]
        missing_tensor_name = popart.reservedAcclToAccumulatorPrefix(
        ) + popart.reservedGradientPrefix() + wId
        with pytest.raises(popart.popart_exception) as e_info:
            va_sess.resetHostWeights(tmpfile)
        # 4.
        assert e_info.value.args[
            0] == "resetWeights, no tensor '" + missing_tensor_name + "' in tensors"

        # 5. & 6. Try again, but this time ignore the missing tensor
        va_sess.resetHostWeights(
            tmpfile, ignoreWeightsInModelWithoutCorrespondingHostWeight=True)
Ejemplo n.º 20
0
def test_manual_serialization():
    """Compare a hand-serialised PopART matmul against a PyTorch baseline.

    The matmul of X (N, C0) with W (C0, C1) is built as ``f`` smaller matmuls
    over slices of the C0 dimension whose results are summed, followed by
    relu and an l1 loss. One SGD step is run in PopART and in PyTorch and the
    updated weights are required to agree to a relative error below 1e-6.
    """

    # Basic model:
    #
    #  X: data input of shape (N, C0)
    #  W: weight input of shape (C0, C1)
    #
    #  Y    = matmul(X, W)
    #  Z    = relu(Y)
    #  loss = l1Loss(Z)
    #
    # With array dimensions
    N = 12
    C0 = 244
    C1 = 286

    # In this test, we manually serialise the matmul, converting
    # matmul ((N,C0) , (C0,C1))
    #
    # into a sequence of factor-f smaller matmuls
    # matmul ((N,C0/f),(C0/f,C1))
    #
    # repeated and accumulated f times, where f is
    f = 4
    assert C0 % f == 0

    # Constructing the model
    builder = popart.Builder()
    # NOTE: T22702 For some seeds this test fails.
    np.random.seed(0)
    wVals = np.array(npr.randn(C0, C1), dtype=np.float32)
    W = builder.addInitializedInputTensor(wVals)
    X = builder.addInputTensor(popart.TensorInfo("FLOAT", [N, C0]))
    axes = builder.addInitializedInputTensor(
        np.array([0, 1]).astype(np.int32))

    def int32_initializer(values):
        # Register a small int32 index vector as an initialized input.
        return builder.addInitializedInputTensor(
            np.array(values).astype(np.int32))

    Y = None
    for part in range(f):
        # Bounds [lo, hi) of this part along the C0 dimension. C0 is a
        # multiple of f, so the integer division is exact.
        lo = (part * C0) // f
        hi = ((part + 1) * C0) // f

        # Take a slice of size (N,C0/f) out of X
        x_begin = int32_initializer([0, lo])
        x_end = int32_initializer([N, hi])
        X_slice = builder.aiOnnx.slice([X, x_begin, x_end, axes])

        # Take a slice of size (C0/f,C1) out of W
        w_begin = int32_initializer([lo, 0])
        w_end = int32_initializer([hi, C1])
        W_slice = builder.aiOnnx.slice([W, w_begin, w_end, axes])

        # Multiply the slices together, and accumulate as necessary
        partial = builder.aiOnnx.matmul([X_slice, W_slice])
        Y = partial if Y is None else builder.aiOnnx.add([partial, Y])

    # Finally, the non-linearity
    Z = builder.aiOnnx.relu([Y])

    # This boiler-plate is currently necessary with opset-10 slice
    transformer = popart.GraphTransformer(builder.getModelProto())
    transformer.convertAllFixedPointInitializersToConstants()
    builder = popart.Builder(transformer.getModelProto())

    l1 = builder.aiGraphcore.l1loss([Z], 0.2)
    dataFlow = popart.DataFlow(1, {})
    device = tu.create_test_device()
    userOptions = popart.SessionOptions()

    # To obtain the final dot graph, uncomment this:
    # userOptions.dotChecks = {"Final"};

    patterns = popart.Patterns()

    sgd = popart.SGD({"defaultLearningRate": (0.1, True)})
    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=dataFlow,
                                     optimizer=sgd,
                                     loss=l1,
                                     patterns=patterns,
                                     userOptions=userOptions,
                                     deviceInfo=device)
    session.prepareDevice()
    session.weightsFromHost()

    inputVals = np.array(npr.randn(1 * N * C0), dtype=np.float32)
    session.run(popart.PyStepIO({X: inputVals}, {}))
    session.weightsToHost()

    # Buffer pre-filled with a sentinel value; readWeights overwrites it.
    w0R = np.full(C0 * C1, -777.0, dtype=np.float32)
    session.readWeights(popart.PyWeightsIO({W: w0R}))

    # A pytorch version to confirm numerical correctness:
    class Net(nn.Module):
        def __init__(self):
            super().__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(wVals.copy()))

        def forward(self, x):
            return torch.relu(torch.matmul(x, self.w0))

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.1)

    out = net(torch.from_numpy(inputVals.reshape([N, C0])))
    loss = 0.2 * torch.mean(torch.abs(out))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    w_initial = wVals.flatten()
    w_torch = net.w0.detach().numpy().flatten()

    # Size of each framework's update, used to normalise the disagreement.
    baseline0 = np.sum(np.abs(w_torch - w_initial))
    baseline1 = np.sum(np.abs(w0R - w_initial))
    error = np.sum(np.abs(w_torch - w0R))
    assert error / (baseline0 + baseline1) < 1e-6
Ejemplo n.º 21
0
def main(argv):
    """Compile the graph from ``graph_builder`` for inference and time its runs.

    Settings are read from ``flags.FLAGS``. One device is acquired, the
    session is compiled (a report is saved and the process exits with status 1
    if compilation raises ``popart.PrepareDeviceException``), and then the
    session is run once per feed, printing a timing line for each run.

    Args:
        argv: not used in the body.
    """
    FLAGS = flags.FLAGS
    print(f"micro batch size is {FLAGS.micro_batch_size}")
    print(f"batch size is {FLAGS.batch_size}")
    print(f"batches_per_step is {FLAGS.batches_per_step}")
    proto, data, outputs, output_id = graph_builder()
    print(f"Model: {FLAGS.model_name}")
    if FLAGS.synthetic:
        print("Using synthetic data")
    else:
        print(f"Data_dir: {FLAGS.data_dir}")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")
    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = FLAGS.report_hw_cycle_count

    # Configure precision of convolutions and MatMuls
    if FLAGS.half_partials:
        options.convolutionOptions = {'partialsType': 'half'}
        options.partialsTypeMatMuls = "half"

    # Select a device
    device = popart.DeviceManager().acquireAvailableDevice(1)
    print(f"{device}\n")
    if device is None:
        raise Exception("Not enough IPUs available.")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFlow=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    compile_began = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - compile_began
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        # Print one timing line; the optional parts are skipped when their
        # duration is None or zero.
        pieces = ["Total {:<8.3} sec.".format(duration)]
        if data_duration:
            pieces.append("   Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration)))
        if compute_duration:
            pieces.append("   Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration)))
        pieces.append("   {:5f} images/sec.".format(
            int(FLAGS.micro_batch_size * FLAGS.batches_per_step / duration)))
        print("".join(pieces))
        if FLAGS.report_hw_cycle_count:
            print("Hardware cycle count per 'run':", session.getCycleCount())

    print("Executing...")
    average_batches_per_sec = 0

    # Run
    start = time.time()
    durations = []
    if FLAGS.synthetic:
        # Synthetic mode feeds the same `data` object FLAGS.iterations times.
        feeds = (data for _ in range(FLAGS.iterations))
    else:
        # Otherwise `data` is iterated and each item is one feed.
        feeds = data
    for feed in feeds:
        stepio = popart.PyStepIO(feed, anchors)
        # Calc data duration
        data_time = time.time()
        data_d = data_time - start
        # Run compute
        session.run(stepio)
        # Calc compute duration
        results = anchors[output_id]
        comp_d = time.time() - data_time
        # Calc total duration
        t = time.time() - start
        report_time(t, data_d, comp_d)
        durations.append(t)
        start = time.time()
    duration = np.mean(durations)
Ejemplo n.º 22
0
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False):
    """Build a PopART session for ``proto``, run it once and collect results.

    A ``TrainingSession`` is created when ``optimizer`` is given, otherwise an
    ``InferenceSession``. The test device is detached before this function
    returns or raises, whichever way it exits.

    Args:
        proto: model passed as ``fnModel`` to the session.
        data: input arrays keyed by tensor id. A leading dimension is added by
            repetition when ``replicatedGraphCount`` or ``accumulationFactor``
            in ``user_options`` is greater than 1.
        outputs: anchor id(s); normalised through ``make_tuple`` and anchored
            with return type "ALL".
        loss: loss tensor id, used only for the training session.
        optimizer: selects training (not None) or inference (None).
        patterns: forwarded to the session.
        user_options: each entry is set as an attribute on ``SessionOptions``,
            except the keys "batchSerializationFactor" and "executionPhases".
        skip_execution: if true, return the un-prepared session right after
            construction.

    Returns:
        ``session`` when ``skip_execution`` is true. Otherwise a 3-tuple of
        (generator over the anchor arrays in ``outputs`` order, the ONNX model
        loaded back after ``modelToHost``, the ``outputs`` tuple).
    """
    batches_per_step = 1

    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    options.enableStochasticRounding = False
    options.constantWeights = True
    options.outlineThreshold = 10.0

    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    # Round the replica count up to the next power of two IPUs.
    replicas = user_options.get("replicatedGraphCount", 1)
    request_ipus = pow(2, math.ceil(math.log2(replicas)))
    device = tu.create_test_device(numIpus=request_ipus)

    # Everything that can fail after the device exists runs under one
    # try/finally so the device is released on every exit path, not only on
    # success or on an out-of-memory error.
    try:
        print("Compiling graph")
        if optimizer is not None:
            session = popart.TrainingSession(fnModel=proto,
                                             deviceInfo=device,
                                             dataFlow=data_flow,
                                             userOptions=options,
                                             loss=loss,
                                             optimizer=optimizer,
                                             patterns=patterns)
        else:
            session = popart.InferenceSession(fnModel=proto,
                                              deviceInfo=device,
                                              dataFlow=data_flow,
                                              userOptions=options,
                                              patterns=patterns)

        if skip_execution:
            return session

        # Compile the Poplar Graph. Any failure propagates to the caller.
        session.prepareDevice()
        print("Compilation complete")

        session.weightsFromHost()
        # NOTE: If we ever use a model with random ops, we would need to call
        # this here, using the same seed given to numpy.
        # session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a replication dimension if needed
        rf = user_options.get("replicatedGraphCount")
        if rf is not None and rf > 1:
            data = {
                k: np.repeat(v[np.newaxis], rf, 0)
                for k, v in data.items()
            }

        # Add a gradient accumulation factor dimension if needed
        af = user_options.get("accumulationFactor")
        if af is not None and af > 1:
            data = {
                k: np.repeat(v[np.newaxis], af, 0)
                for k, v in data.items()
            }

        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release device
        device.detach()

    return (anchors[output] for output in outputs), post_proto, outputs
Ejemplo n.º 23
0
def train(opts):
    """Train and validate the model, passing weights through a temp ONNX file.

    A training and a validation session are created on one shared device.
    After every epoch the training weights are written to a temporary ONNX
    file; the validation session (and the training session from the second
    epoch on) reloads its host weights from that file. Loss and accuracy are
    printed for each epoch that is validated.

    The temporary file's descriptor is closed straight after creation and the
    file is removed when the loop ends, including when it ends by raising.

    Args:
        opts: options object; the attributes read here are ``syn_data_type``,
            ``test_mode``, ``batch_size``, ``batches_per_step``,
            ``samples_per_device``, ``num_ipus``, ``replication_factor``,
            ``pipeline``, ``simulation``, ``epochs`` and
            ``validation_final_epoch``. ``batches_per_step`` may be lowered
            in place when the test set is too small.
    """
    # Do not require the mnist data to be present if running with synthetic data
    train_data, train_labels, test_data, test_labels = load_dummy(opts) \
        if opts.syn_data_type in ["random_normal", "zeros"] else load_mnist()

    if not opts.test_mode:
        max_value = len(test_data) // opts.batch_size
        if max_value < opts.batches_per_step:
            print("(batches-per-step * batch-size) is larger than test set!\n"
                  " Reduced batches-per-step to: {}\n".format(max_value))
            opts.batches_per_step = max_value
    training_set = DataSet(opts.batch_size, opts.batches_per_step, train_data,
                           train_labels)
    test_set = DataSet(opts.batch_size, opts.batches_per_step, test_data,
                       test_labels)

    print("Creating ONNX model.")
    proto, data_in, labels_in, output, loss = create_model(
        opts.samples_per_device)

    # Describe how to run the model
    anchor_desc = {
        output: popart.AnchorReturnType("ALL"),
        loss: popart.AnchorReturnType("ALL")
    }
    dataFlow = popart.DataFlow(opts.batches_per_step, anchor_desc)

    # Options
    userOpts = popart.SessionOptions()

    # The validation graph by default will be optimized to change all variables to constants
    # This prevents that, which allows for checkpoints to be loaded into the model without recompiling
    userOpts.constantWeights = False

    # If requested, setup synthetic data
    if opts.syn_data_type in ["random_normal", "zeros"]:
        print("Running with Synthetic Data Type '{}'".format(
            opts.syn_data_type))
        if opts.syn_data_type == "random_normal":
            userOpts.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        elif opts.syn_data_type == "zeros":
            userOpts.syntheticDataMode = popart.SyntheticDataMode.Zeros

    # Enable auto-sharding
    if opts.num_ipus > opts.replication_factor:
        userOpts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Enable pipelining
    if opts.pipeline:
        userOpts.enablePipelining = True

    # Enable replication
    if opts.replication_factor > 1:
        userOpts.enableReplicatedGraphs = True
        userOpts.replicatedGraphCount = opts.replication_factor

    # A single device is shared between training and validation sessions
    device = get_device(opts.num_ipus, opts.simulation)

    training = init_session(proto,
                            loss,
                            dataFlow,
                            userOpts,
                            device,
                            training=True)
    validation = init_session(proto,
                              loss,
                              dataFlow,
                              userOpts,
                              device,
                              training=False)

    # Make weight transfer file. mkstemp returns an already-open descriptor;
    # only the path is needed here, so close the descriptor immediately
    # instead of leaking it.
    fd, onnx_file_name = tempfile.mkstemp()
    os.close(fd)

    try:
        print("Running training loop.")
        for i in range(opts.epochs):
            # Training
            if i > 0:
                training.session.resetHostWeights(onnx_file_name)
            training.session.weightsFromHost()
            for step, (data, labels) in enumerate(training_set):
                stepio = popart.PyStepIO({
                    data_in: data,
                    labels_in: labels
                }, training.anchors)

                start = time()
                training.session.run(
                    stepio, 'Epoch ' + str(i) + ' training step' + str(step))
                if opts.test_mode == "training":
                    log_run_info(training, start, opts)

            training.session.modelToHost(onnx_file_name)

            # Validate every epoch, or only the last one when
            # validation_final_epoch is set.
            if not opts.validation_final_epoch or i == opts.epochs - 1:
                aggregated_loss = 0
                aggregated_accuracy = 0
                validation.session.resetHostWeights(onnx_file_name)
                validation.session.weightsFromHost()

                # Evaluation
                for step, (data, labels) in enumerate(test_set):
                    stepio = popart.PyStepIO({
                        data_in: data,
                        labels_in: labels
                    }, validation.anchors)
                    start = time()
                    validation.session.run(
                        stepio,
                        'Epoch ' + str(i) + ' evaluation step ' + str(step))
                    if opts.test_mode == "inference":
                        log_run_info(validation, start, opts)

                    # Loss
                    aggregated_loss += np.mean(validation.anchors[loss])
                    # Accuracy
                    results = np.argmax(
                        validation.anchors[output].reshape(
                            [test_set.inputs_per_step, 10]), 1)
                    num_correct = np.sum(
                        results == labels.reshape([test_set.inputs_per_step]))
                    aggregated_accuracy += num_correct / test_set.inputs_per_step

                # Log statistics
                aggregated_loss /= len(test_set)
                aggregated_accuracy /= len(test_set)
                print("Epoch #{}".format(i + 1))
                print("   Loss={0:.4f}".format(aggregated_loss))
                print("   Accuracy={0:.2f}%".format(aggregated_accuracy * 100))
    finally:
        # Remove weight transfer file, also when training was interrupted.
        os.remove(onnx_file_name)
Ejemplo n.º 24
0
def run(benchmark, opts):
    """Build, compile and execute a benchmark graph, reporting timings.

    Args:
        benchmark: object providing ``graph_builder(opts)`` and
            ``iteration_report(opts, duration)``.
        opts: options object controlling device selection, session mode,
            reporting and the number of steps.

    Returns:
        ``(compilation_duration, average_batches_per_sec)``, or the result of
        ``save_reports(opts, session)`` after the first step when
        ``opts.report`` is set.
    """
    proto, data, outputs, losses, optimizer = benchmark.graph_builder(opts)

    if opts.save_graph:
        with open('model.onnx', "wb") as model_file:
            model_file.write(proto)
            print("Written to file: model.onnx")

    dataFlow = popart.DataFlow(opts.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if not opts.use_generated_data:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = opts.report_hw_cycle_count
    instrument = "true" if opts.report else "false"
    options.engineOptions = {"debug.instrumentCompute": instrument}
    if opts.convolution_options:
        options.convolutionOptions = json.loads(opts.convolution_options)

    # The virtual graph mode is only set when there is more than one shard.
    if opts.shards > 1:
        options.virtualGraphMode = (popart.VirtualGraphMode.Auto
                                    if opts.auto_sharding else
                                    popart.VirtualGraphMode.Manual)

    options.enablePipelining = opts.pipeline

    # Select a device
    deviceManager = popart.DeviceManager()
    if not opts.simulation:
        device = deviceManager.acquireAvailableDevice(opts.shards)
        if device is None:
            raise OSError("Failed to acquire IPU.")
    else:
        device = deviceManager.createIpuModelDevice({
            "compileIPUCode": True,
            'numIPUs': opts.shards,
            "tilesPerIPU": 1216
        })

    if opts.mode == 'train':
        session = popart.TrainingSession(fnModel=proto,
                                         loss=losses,
                                         deviceInfo=device,
                                         optimizer=optimizer,
                                         dataFlow=dataFlow,
                                         userOptions=options)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=dataFlow,
                                          userOptions=options)

    print("Compiling...")
    compile_began = time.time()
    session.prepareDevice()
    compilation_duration = time.time() - compile_began
    print("Duration: {:.3f} seconds\n".format(compilation_duration))

    if opts.tensor_tile_mapping:
        with open("tile_mapping.json", 'w') as mapping_file:
            json.dump(session.getTensorTileMap(), mapping_file)
            print("Written to file: tile_mapping.json")

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    # Copy weights and optimization parameters onto the device
    session.weightsFromHost()

    # Add a batches_per_step dimension if needed
    if opts.batches_per_step > 1:
        data = {
            name: np.repeat(array[np.newaxis], opts.batches_per_step, 0)
            for name, array in data.items()
        }

    stepio = popart.PyStepIO(data, anchors)

    print("Executing...")
    average_batches_per_sec = 0
    # Steps
    for _ in range(opts.steps):
        # Run
        step_began = time.time()
        session.run(stepio)
        duration = time.time() - step_began

        # In report mode, stop after the first step and hand back the
        # reports instead of timings.
        if opts.report:
            return save_reports(opts, session)

        average_batches_per_sec += (opts.batches_per_step /
                                    duration) / opts.steps
        print("{:<8.3} sec/itr.".format(duration) + "   " +
              benchmark.iteration_report(opts, duration))

    if opts.report_hw_cycle_count:
        print("Hardware cycle count per 'run':", session.getCycleCount())

    return compilation_duration, average_batches_per_sec
Ejemplo n.º 25
0
# Build the graph: gemm(ip, w, b) -> relu -> softmax -> nllloss against the
# labels `lb`. `builder`, `data_shape` and `lbl_shape` are defined earlier in
# the script.
ip = builder.addInputTensor(data_shape)
lb = builder.addInputTensor(lbl_shape)

# Weight and bias are float16 tensors initialised to ones.
w = builder.addInitializedInputTensor(np.ones([2, 2], np.float16))
b = builder.addInitializedInputTensor(np.ones([2], np.float16))
o = builder.aiOnnx.gemm([ip, w, b], 1., 1., False, False)
o = builder.aiOnnx.relu([o])
o = builder.aiOnnx.softmax([o])
o = builder.aiGraphcore.nllloss([o, lb])

# Anchor the loss tensor so its value is returned for every batch.
dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

# Create a session to compile the graph for inference
#------------------------------------------------------------------------------
inferenceOptions = popart.SessionOptions()
# Need to compile the inference graph with variable weights so they can be
# updated before execution
inferenceOptions.constantWeights = False

inferenceSession = popart.InferenceSession(
    fnModel=builder.getModelProto(),
    dataFlow=dataFlow,
    userOptions=inferenceOptions,
    deviceInfo=popart.DeviceManager().createIpuModelDevice({}))

# Compile graph
inferenceSession.prepareDevice()

# Create buffers to receive results from the execution
inferenceAnchors = inferenceSession.initAnchorArrays()
Ejemplo n.º 26
0
    def test(config, iteration, true_scaling, test_case):
        """Train three chained scalar weights and check them after every step.

        ``test_case[k][0]`` is the initial value of weight ``k`` and
        ``test_case[k][step + 1]`` the value expected after ``step``. The
        optimizer is built by ``ScheduledOptimizerFactory`` and replaced on
        the device whenever the factory reports that it should be updated.
        """
        builder = popart.Builder()

        input0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", [1, 1, 1]), "input0")

        # One initialized weight plus one host read-back buffer per stage.
        weight_ids = []
        read_buffers = []
        for idx, name in enumerate(("weight_0", "weight_1", "weight_2")):
            initial = np.array([test_case[idx][0]], dtype=np.float32)
            read_buffers.append(np.empty([1, ], dtype=np.float32))
            weight_ids.append(
                builder.addInitializedInputTensor(initial, name))

        # add2 = w2 + (w1 + (w0 + input0))
        chained = input0
        for weight_id in weight_ids:
            chained = builder.aiOnnx.add([weight_id, chained])
        loss = builder.aiGraphcore.l1loss([chained],
                                          1.0,
                                          debugPrefix="l1LossVal")
        builder.addOutputTensor(chained)

        proto = builder.getModelProto()
        dataFlow = popart.DataFlow(1, {})
        opts = popart.SessionOptions()
        opts.reportOptions = {"showExecutionSteps": "true"}
        opts.enableGroupedMatmuls = False
        pat = popart.Patterns(popart.PatternsLevel.Default)
        device = popart.DeviceManager().acquireAvailableDevice(1)
        if device is None:
            raise OSError("Failed to acquire IPU.")

        # The stage->tensor map would come from the Bert model in reality
        # (see model.tensors)
        mock_tensor_map = {
            stage: [weight_id]
            for stage, weight_id in enumerate(weight_ids)
        }

        factory = ScheduledOptimizerFactory(
            config, iteration, tensors=mock_tensor_map)
        assert_scaled_lr(factory, true_scaling)

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=dataFlow,
            userOptions=opts,
            loss=loss,
            optimizer=factory.create(),
            patterns=pat,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        input_data = np.array([3.1415], dtype=np.float32)
        stepio = popart.PyStepIO({input0: input_data}, anchors)

        for step in range(iteration.total_steps):
            session.run(stepio)
            session.weightsToHost()
            session.readWeights(
                popart.PyWeightsIO(dict(zip(weight_ids, read_buffers))))

            # Each weight must match the value expected after this step.
            for idx, buffer in enumerate(read_buffers):
                assert np.isclose(test_case[idx][step + 1], buffer)

            iteration.count += 1

            if factory.should_update(iteration):
                next_optimizer = factory.update_and_create(iteration)
                assert_scaled_lr(factory, true_scaling)

                session.updateOptimizerFromHost(next_optimizer)
Ejemplo n.º 27
0
def main(args):
    """Build the model, run one training step on synthetic data, optionally profile.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``recomputing`` (compared against ``'ON'`` and ``'AUTO'``),
            ``export`` (file path the model proto is written to when truthy),
            ``report`` (save a profiling report when truthy) and ``test``
            (return the session when truthy).

    Returns:
        The ``popart.TrainingSession`` when ``args.test`` is truthy,
        otherwise ``None``.

    Raises:
        OSError: If no IPU device could be acquired.
    """

    # Model parameters
    np.random.seed(1971)
    input_rows = 28
    input_columns = 28
    num_classes = 10
    batch_size = 2048
    input_shape = [batch_size, input_rows * input_columns]
    labels_shape = [batch_size]

    # Create model
    x0, labels, model_proto, anchor_map, loss = create_model(
        num_features=input_columns * input_rows,
        num_classes=num_classes,
        batch_size=batch_size,
        force_recompute=args.recomputing == 'ON')

    # Save model (optional)
    if args.export:
        with open(args.export, 'wb') as model_file:
            model_file.write(model_proto)

    # Session options
    num_ipus = 1
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.engineOptions = {"debug.instrument": "true"}

    if args.recomputing == 'AUTO':
        opts.autoRecomputation = popart.RecomputationType.Standard

    # Acquire the device before building the session so that a failed
    # acquisition is reported clearly instead of surfacing as an obscure
    # error from the session constructor.
    device = popart.DeviceManager().acquireAvailableDevice(num_ipus)
    if device is None:
        raise OSError("Failed to acquire IPU.")

    # Create session
    session = popart.TrainingSession(
        fnModel=model_proto,
        dataFeed=popart.DataFlow(1, anchor_map),
        losses=[loss],
        optimizer=popart.ConstSGD(0.01),
        userOptions=opts,
        deviceInfo=device)

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Synthetic data input
    data_in = np.random.uniform(low=0.0, high=1.0,
                                size=input_shape).astype(np.float32)

    labels_in = np.random.randint(low=0, high=num_classes,
                                  size=labels_shape).astype(np.int32)

    # Run session
    inputs = {x0: data_in, labels: labels_in}
    stepio = popart.PyStepIO(inputs, anchors)
    session.weightsFromHost()
    session.optimizerFromHost()
    session.run(stepio)

    # Save report and return session object (optional)
    if args.report:
        # Imported lazily: only needed when a report is requested.
        from gcprofile import save_popart_report
        save_popart_report(session)
    if args.test:
        return session
    return None
def get_ir(model_file_name='model.onnx',
           enable_executionphases=True,
           enable_matmul_serialization=False,
           enable_outlining=False,
           activation_tensor_location_settings=None,
           weight_tensor_location_settings=None,
           optimizer_state_tensor_location_settings=None,
           accumulator_tensor_location_settings=None,
           tensor_location_setting_override=None,
           num_layers=3,
           dsize=48,
           batch_size=1,
           num_iterations=1,
           num_replicas=1,
           accumulation_factor=2,
           optimizer=popart.SGD({"defaultLearningRate": (0.5, False)})):
    """Train a chain of matmul layers and return the session's IR as parsed JSON.

    Args:
        model_file_name: Not referenced by the function body.
        enable_executionphases: When true, configure the session for
            execution phases (one phase per layer, no auto recomputation,
            ``VirtualGraphMode.ExecutionPhases``) and request twice as many
            IPUs per replica.
        enable_matmul_serialization: When true, serialize every matmul with
            mode ``'output_channels'`` and factor 2.
        enable_outlining: Assigned to ``SessionOptions.enableOutlining``.
        activation_tensor_location_settings: If not None, assigned to
            ``SessionOptions.activationTensorLocationSettings``.
        weight_tensor_location_settings: If not None, assigned to
            ``SessionOptions.weightTensorLocationSettings``.
        optimizer_state_tensor_location_settings: If not None, assigned to
            ``SessionOptions.optimizerStateTensorLocationSettings``.
        accumulator_tensor_location_settings: If not None, assigned to
            ``SessionOptions.accumulatorTensorLocationSettings``.
        tensor_location_setting_override: Mapping assigned to
            ``SessionOptions.tensorLocationSettingsOverride``. ``None``
            (the default) is treated as an empty mapping.
        num_layers: Number of matmul layers; layer ``i`` is built inside
            ``builder.executionPhase(i)``.
        dsize: Side length of the square weights and of the two trailing
            input dimensions.
        batch_size: Leading dimension of the input tensor.
        num_iterations: Number of ``session.run`` calls.
        num_replicas: Replicated graph count; replication is enabled only
            when this is greater than 1.
        accumulation_factor: Gradient accumulation factor; accumulation is
            enabled only when this is greater than 1.
        optimizer: Optimizer handed to the training session. Note that the
            default instance is created once, when the function is defined.

    Returns:
        The object produced by ``json.loads`` on the session's IR serialized
        with ``popart.IrSerializationFormat.JSON``.
    """
    # A None sentinel replaces the former mutable ``{}`` default so that no
    # dict instance is shared between calls.
    if tensor_location_setting_override is None:
        tensor_location_setting_override = {}

    np.random.seed(10911)
    matmul_serialization_mode = 'output_channels'
    matmul_serialization_factor = 2

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        # One randomly initialised square weight per layer, named W<index>.
        w = builder.addInitializedInputTensor(
            np.random.rand(dsize, dsize).astype(np.float32), f"W{index}")
        matmul_id = builder.aiOnnx.matmul([in_id, w])
        if enable_matmul_serialization:
            builder.setSerializeMatMul({matmul_id}, matmul_serialization_mode,
                                       matmul_serialization_factor)
        return matmul_id

    out = ip
    for i in range(num_layers):
        with builder.executionPhase(i):
            out = add_layer(i, out)

    l1 = builder.aiGraphcore.l1loss([out], 0.1)

    # No tensors are anchored; the list is kept as the single place to add
    # anchor ids if any are needed.
    anchorIds = []

    builder.addOutputTensor(out)

    device = tu.create_test_device(num_replicas *
                                   (2 if enable_executionphases else 1),
                                   pattern=popart.SyncPattern.Full)

    dfAnchors = {
        anchorId: popart.AnchorReturnType("All")
        for anchorId in anchorIds
    }

    opts = popart.SessionOptions()
    opts.enableOutlining = enable_outlining
    opts.enableReplicatedGraphs = num_replicas > 1
    opts.replicatedGraphCount = num_replicas
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor

    if activation_tensor_location_settings is not None:
        opts.activationTensorLocationSettings = activation_tensor_location_settings
    if weight_tensor_location_settings is not None:
        opts.weightTensorLocationSettings = weight_tensor_location_settings
    if optimizer_state_tensor_location_settings is not None:
        opts.optimizerStateTensorLocationSettings = optimizer_state_tensor_location_settings
    if accumulator_tensor_location_settings is not None:
        opts.accumulatorTensorLocationSettings = accumulator_tensor_location_settings

    opts.tensorLocationSettingsOverride = tensor_location_setting_override

    if enable_executionphases:
        opts.executionPhaseSettings.phases = num_layers
        opts.autoRecomputation = popart.RecomputationType.NoRecompute
        opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        opts.explicitRecomputation = False

    proto = builder.getModelProto()

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=popart.DataFlow(1, dfAnchors),
                                     optimizer=optimizer,
                                     loss=l1,
                                     patterns=popart.Patterns(
                                         popart.PatternsLevel.All),
                                     userOptions=opts,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    for _ in range(num_iterations):
        # Fresh random input for every iteration, with leading replica and
        # accumulation dimensions.
        ip_data = np.random.rand(num_replicas, accumulation_factor, batch_size,
                                 dsize, dsize).astype(np.float32)
        stepio = popart.PyStepIO({ip: ip_data}, anchors)
        session.run(stepio)

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))
    return ir
Ejemplo n.º 29
0
def create_model_pipelined(bufferStreams: bool = False,
                           pipelining: bool = False) -> Dict:
    """Build and prepare a small gemm/relu/softmax/nllloss training session.

    Args:
        bufferStreams (bool, optional): Value assigned to the session option
            ``useHostCopyOps`` and forwarded to ``check_ops``.
            Defaults to False.
        pipelining (bool, optional): When True, enable pipelining with manual
            virtual graphs, placing gemm and relu on stage/virtual graph 0
            and softmax and nllloss on stage/virtual graph 1, and request
            2 IPUs instead of 1. Defaults to False.

    Returns:
        Dict: A dict with the keys "session", "stepio", "anchors" and "out"
            (the id of the softmax output tensor).
    """
    bld = popart.Builder()

    # Graph inputs: half-precision data plus integer labels.
    ip = bld.addInputTensor(popart.TensorInfo("FLOAT16", [8, 2]),
                            "input_data")
    lb = bld.addInputTensor(popart.TensorInfo("INT32", [8]), "label")

    # Weights and bias come from module-level initialisers.
    weight_id = bld.addInitializedInputTensor(w_init)
    bias_id = bld.addInitializedInputTensor(bias_init)

    gemm = bld.aiOnnx.gemm([ip, weight_id, bias_id], 1., 1., False, False)
    relu = bld.aiOnnx.relu([gemm])
    sm = bld.aiOnnx.softmax([relu])
    nll = bld.aiGraphcore.nllloss([sm, lb])

    bld.addOutputTensor(sm)

    return_all = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(BPS, {sm: return_all, nll: return_all})

    session_opts = popart.SessionOptions()
    session_opts.enableOutlining = True
    session_opts.useHostCopyOps = bufferStreams

    if pipelining:
        session_opts.enablePipelining = True
        session_opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        # Each op's pipeline stage and virtual graph share the same index.
        for tensor_id, placement in ((gemm, 0), (relu, 0), (sm, 1), (nll, 1)):
            bld.pipelineStage(tensor_id, placement)
            bld.virtualGraph(tensor_id, placement)

    ipu_count = 2 if pipelining else 1
    device = tu.create_test_device(ipu_count)

    session = popart.TrainingSession(fnModel=bld.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=nll,
                                     optimizer=popart.ConstSGD(0.1),
                                     userOptions=session_opts,
                                     deviceInfo=device)

    session.prepareDevice()

    # 2 host load ops for input
    check_ops(session, bufferStreams, 2)

    anchors = session.initAnchorArrays()
    feeds = {ip: trainingData, lb: trainingDataLables}
    stepio = popart.PyStepIO(feeds, anchors)

    return {
        "session": session,
        "stepio": stepio,
        "anchors": anchors,
        "out": sm
    }
Ejemplo n.º 30
0
def test_groupHostSync():
    """Check that ``groupHostSync`` moves the stream copies after the compute.

    Runs a tiny add + l1loss inference graph with ``groupHostSync`` enabled,
    then scans the summary report and asserts that, counting from the Add,
    the recorded events are Absolute, ReduceExpression and then exactly two
    StreamCopy entries.
    """
    builder = popart.Builder()

    a = builder.addInputTensor(popart.TensorInfo("FLOAT16", [1]))
    w = builder.addInitializedInputTensor(np.ones([1], np.float16))
    o = builder.aiOnnx.add([w, a])
    l1 = builder.aiGraphcore.l1loss([o], 0.1)

    return_all = popart.AnchorReturnType
    dataFlow = popart.DataFlow(1, {o: return_all("All"), l1: return_all("All")})

    options = popart.SessionOptions()
    options.engineOptions = {
        "debug.instrumentCompute": "true",
        "debug.instrumentExternalExchange": "true"
    }
    options.groupHostSync = True  # The option we are testing
    options.reportOptions = {
        "showVarStorage": "true",
        "showPerIpuMemoryUsage": "true",
        "showExecutionSteps": "true"
    }

    session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                      dataFlow=dataFlow,
                                      deviceInfo=tu.create_test_device(),
                                      userOptions=options)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({a: np.array([1.4], dtype=np.float16)}, anchors)
    session.run(stepio)
    report_lines = session.getSummaryReport().split('\n')

    # Analyse a sequence:
    # default order :
    #     StreamCopy (FromHost) x2
    #     Add
    #     StreamCopy(ToHost) x2
    #     Absolute
    #     Reduce
    #     StreamCopy(ToHost) x2

    # with the option:
    #     StreamCopy (FromHost) x2
    #     Add
    #     Absolute
    #     Reduce
    #     StreamCopy(ToHost)   x2

    # Event codes appended to `order`: 1 = Add, 2 = Absolute,
    # 3 = ReduceExpression, 4 = StreamCopy seen after the Add.
    order = []
    add_seen = False
    sequences_seen = 0

    for report_line in report_lines:
        if "Sequence" in report_line:
            sequences_seen += 1
            # Stop scanning once the seventh "Sequence" line is reached.
            if sequences_seen >= 7:
                break
        if "OnTileExecute: 104/Op/Add" in report_line:
            order.append(1)
            add_seen = True
        if "OnTileExecute: 101/abs/Op/Absolute" in report_line:
            order.append(2)
        if "101/add/ReduceExpression" in report_line:
            order.append(3)
        if "StreamCopy" in report_line and add_seen:
            order.append(4)

    # Code 4 is only ever appended for a post-Add StreamCopy, so counting it
    # gives the number of stream copies seen from the Add onwards.
    stream_copies = order.count(4)

    # The streamcopy to host should only happen at the end (after ReduceExpression)
    # Expected list with the option enabled: [1,2,3,4,4]
    # Expected list without the option: [1,4,4,2,3,4,4]
    assert order[1] == 2
    assert order[2] == 3
    assert order[3] == 4
    # The number of Streamcopies happening in total
    # (start counting from Add) should be 2.
    assert stream_copies == 2