Example #1
import re

import popart


# Build a popart.NumericsReport comparing two pairs of model checkpoints
# (A0 -> A1 against B0 -> B1) and sum the values parsed from its report.
def compare_models(model_A0, model_A1, model_B0, model_B1):
    report = popart.NumericsReport(model_A0, model_A1, model_B0, model_B1)
    report = report.fullReport()
    print(report)

    difference = 0.0

    for line in report.splitlines():
        match = re.search(r' =\s+(\d+)', line)
        if match:
            difference += float(match.group(1))

    return difference
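A minimal usage sketch, assuming four ONNX checkpoints already exist on disk
(the file names are hypothetical, not from the original):

# Compare how the weights of model A (e.g. Torch) and model B (e.g. PopArt)
# each changed between two checkpoints, and sum the reported differences.
diff = compare_models("torch_epoch0.onnx", "torch_epoch1.onnx",
                      "popart_epoch0.onnx", "popart_epoch1.onnx")
print("accumulated difference:", diff)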
Example #2
def _run_impl(torchWriter, patterns, outputdir, cifarInIndices, device,
              device_hw_id, mode, syntheticData, transformations, epochs,
              printAnchorArrays):

    runIds = [-1] + [
        int(x.split("runId")[1].split("_")[0])
        for x in os.listdir(outputdir) if "runId" in x
    ]
    baseId = 1 + max(runIds)

    def getFnModel(framework, epoch):
        return os.path.join(
            outputdir,
            "runId%d_%sModel_epoch%s.onnx" % (baseId, framework, epoch))

    def getFnPopArt(epoch):
        return getFnModel("PopArt", epoch)

    def getFnTorch(epoch):
        return getFnModel("Torch", epoch)

    def getFnModel0():
        return os.path.join(outputdir, "runId%d_model0.onnx" % (baseId, ))

    dataFlow = torchWriter.dataFlow
    inputShapeInfo = torchWriter.inputShapeInfo
    validModes = ["infer", "train"]
    if mode not in validModes:
        raise Exception("mode must be one of " + str(validModes))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # determine what the data directory is
    datadir = "unset"

    dir_path = os.path.dirname(os.path.realpath(__file__))
    path_c10datadir = os.path.join(dir_path, "c10datadir.py")
    if os.path.exists(path_c10datadir):
        import c10datadir
        datadir = c10datadir.c10datadir
    else:
        tmpdir = tempfile.gettempdir()
        datadir = os.path.abspath(os.path.join(tmpdir, 'cifar10data'))
    print("Using datadir=%s" % (datadir))

    if not os.path.exists(datadir):
        print("Specified datadir %s does not exist. "
              "Consider creating it with os.mkdir(datadir)" % (datadir, ))

    print("c10driver: getting data from", datadir)
    trainset = datasets.CIFAR10(root=datadir,
                                train=True,
                                download=False,
                                transform=transform)

    fnModel0 = getFnModel0()

    # write ONNX Model to file
    torchWriter.saveModel(fnModel=fnModel0)

    stepLoader = torch.utils.data.DataLoader(
        trainset,
        # the amount of data loaded for each step.
        # note this is not the batch size, it's the "step" size
        # (samples per step)
        batch_size=torchWriter.samplesPerBatch * dataFlow.batchesPerStep(),
        # non-random data loading
        shuffle=False,
        num_workers=0)

    deviceManager = popart.DeviceManager()

    # Create a CPU device
    if device == "cpu":
        device = deviceManager.createCpuDevice()

    # Create an IPU Model device
    elif device == "ipu_model":

        options = {"compileIPUCode": True, 'numIPUs': 1, 'tilesPerIPU': 4}
        device = deviceManager.createIpuModelDevice(options)

    # Create a simulator device
    elif device == "sim":
        options = {"numIpus": 1, "tilesPerIPU": 4}
        device = deviceManager.createSimDevice(options)

    # Get a Hardware Device
    elif device == "hw":
        # Get a hardware device that meets the requirements; this may
        # throw if none are available. It will attach to the device.
        if device_hw_id:
            device = deviceManager.acquireDeviceById(device_hw_id)
        else:
            device = tu.acquire_ipu()

    # Enumerate available devices
    print("Enumerating devices")
    print("-------------------------------------")
    for idx, d in enumerate(deviceManager.enumerateDevices()):
        print('{0}. {1}'.format(idx, d))
    print("")

    opts = popart.SessionOptions()
    opts.logDir = outputdir
    if syntheticData:
        opts.syntheticDataMode = popart.SyntheticDataMode.RandomNormal

    modelProtoX = fnModel0
    if transformations:
        gc = popart.GraphTransformer(fnModel0)
        for transformation in transformations:
            print("Running %s transformation pass" % (transformation, ))
            if transformation == "removeUnusedInputs":
                gc.removeUnusedInputs()

            elif transformation == "prepareNodesForTraining":
                gc.prepareNodesForTraining()

            else:
                raise RuntimeError("Unrecognised transformation %s" %
                                   (transformation, ))

        modelProtoX = gc.getModelProto()

    # Read the ONNX model from file, create the backwards graph and
    # perform Ir optimisations

    if mode == 'infer':
        session = popart.InferenceSession(fnModel=modelProtoX,
                                          inputShapeInfo=inputShapeInfo,
                                          dataFlow=dataFlow,
                                          patterns=patterns,
                                          userOptions=opts,
                                          deviceInfo=device)
    else:
        if len(torchWriter.outNames) != 1:
            raise RuntimeError("Expecting single scalar loss tensor")

        # Append an identity loss to the output, reducing it to a scalar
        # if necessary
        bder = popart.Builder(modelProtoX)
        loss = bder.aiGraphcore.identityloss(
            [torchWriter.outNames[0]], reduction=popart.ReductionType.Sum)
        session = popart.TrainingSession(fnModel=bder.getModelProto(),
                                         inputShapeInfo=inputShapeInfo,
                                         dataFlow=dataFlow,
                                         loss=loss,
                                         optimizer=torchWriter.optimizer,
                                         patterns=patterns,
                                         userOptions=opts,
                                         deviceInfo=device)

    # allocate host buffers for the anchor (output) tensors
    anchorArrays = session.initAnchorArrays()

    allDotPrefixes = [x[0:-4] for x in os.listdir(outputdir) if ".dot" in x]
    print("Will generate graph pdfs for all of:")
    print(allDotPrefixes)
    import subprocess
    # Set generateFromDots to True to generate pdf figures of the Ir.
    # This requires the 'dot' program.
    generateFromDots = False
    if generateFromDots:
        for name in allDotPrefixes:
            dotfile = os.path.join(outputdir, "%s.dot" % (name, ))
            outputfile = os.path.join(outputdir, "%s.pdf" % (name, ))
            log = subprocess.call(
                ["dot", "-T", "pdf", "-o", outputfile, dotfile])
            print("Exit status on `%s' was: %s" % (name, log))

    print("Setting device to IPU, and preparing it")
    session.prepareDevice()

    if mode == "train":
        print("Writing weights to device")
        session.weightsFromHost()

        print("Writing Optimizer tensors to device, if there are any")

    def addStepDimension(data, batchesPerStep):
        if batchesPerStep == 1:
            return data
        else:
            dataShape = np.array(np.shape(data))
            dataShape[0] //= batchesPerStep
            dataShape = np.insert(dataShape, 0, batchesPerStep)
            return np.reshape(data, dataShape)

    def reportTensorError(tensorInd, result):
        reportStr = str(tensorInd) + " :\n"
        reportStr += "  |pA - tA|^2 / (|pA||tA| + 1e-8)  = " + str(
            result) + "\n"
        return reportStr

    def getAnchorTensor(tId, anchorArrays):
        assertStr = "Tensor" + tId + " must be specified as an anchor"
        assert (tId in anchorArrays.keys()), assertStr
        return anchorArrays[tId]

    def subsampleBatches(array, refShape):
        arrayShape = np.shape(array)

        # Every Nth batch
        if len(arrayShape) == len(refShape):
            n = arrayShape[0] // refShape[0]
            return array[n - 1::n]

        # Last batch only
        else:
            return array[-1]

    def getTensorError(tA, pA):
        # pA, tA are corresponding tensors from two models
        pA_shape = np.shape(pA)
        tA_shape = np.shape(tA)
        assert (pA_shape == tA_shape), "Arrays must be same shape"

        ss_err = np.sum((np.array(pA) - np.array(tA))**2)
        ss_pA = np.sum(np.array(pA)**2)
        ss_tA = np.sum(np.array(tA)**2)
        return ss_err / (math.sqrt(ss_pA * ss_tA) + 1.0e-8)

    def checkResult(result, margin):
        if np.isnan(result):
            raise TestFailureError(str(result) + " is NaN")
        elif result > margin:
            raise TestFailureError(
                str(result) + " is greater than " + str(margin))

    margin = 5.0e-7
    numReports = []

    for epoch in range(epochs):  # loop over the dataset multiple times
        print("Epoch is %d" % (epoch, ))
        stepData = next(iter(stepLoader))

        # Form the input map for one step's worth of data.
        # Note: data from the torch DataLoader has shape:
        #   [stepSize * batchSize, sampleShape]
        # whereas Popart expects input data of the shape:
        #   [stepSize, batchSize, sampleShape]
        # so we reshape the input array before passing to the stepio
        inputs = {}
        for tenId in cifarInIndices.keys():
            inputs[tenId] = \
                addStepDimension(stepData[cifarInIndices[tenId]].numpy(),
                                 session.dataFlow.batchesPerStep())

        if mode == "train":
            # take batchesPerStep passes (1 step), Torch
            torchWriter.train(inputs)

            # take batchesPerStep passes (1 step), PopArt
            pystepio = popart.PyStepIO(inputs, anchorArrays)
            session.run(pystepio)

            if printAnchorArrays:
                print(
                    "\nAnchor arrays (being printed as printAnchorArrays==True):"
                )
                for name in anchorArrays.keys():
                    arr = anchorArrays[name]
                    print("\nAnchored Array Name=", name, " and Size=",
                          arr.size)

                    if arr.size < 10:
                        print("\nArray (of size < 10) values are")
                        print(arr)

                    if len(arr.shape) > 1:
                        for i, slice0 in enumerate(arr):
                            print("Sum along axis %d is Sum=%.15f" %
                                  (i, slice0.sum()))

                    print("Total Sum is %.15f" % (arr.sum()))

            # write models to file
            fnTorchModel = getFnTorch(epoch)
            fnPopArtModel = getFnPopArt(epoch)
            torchWriter.saveModel(fnTorchModel)
            session.modelToHost(fnPopArtModel)
            print("Writing models to " + fnTorchModel + " and " +
                  fnPopArtModel)

            # Compare parameters from updated Onnx models
            print("Obtaining popart NumericsReport, A: Torch, B: Popart.")
            if epoch == 0:
                nr = popart.NumericsReport(fnModel0, fnTorchModel, fnModel0,
                                           fnPopArtModel)
            else:
                nr = popart.NumericsReport(getFnTorch(epoch - 1), fnTorchModel,
                                           getFnPopArt(epoch - 1),
                                           fnPopArtModel)

            print(nr.fullReport())
            # One relative error calculated per weight tensor
            for tId, relerror in nr.getRelativeErrors().items():
                checkResult(relerror, margin)

        elif mode == "infer":
            # take batchesPerStep passes (1 step) with Torch; returns a
            # map of outputs for each sample.
            # Note: these already have dimensions matching the anchors
            torchOutputs = torchWriter.infer(inputs)

            # take batchesPerStep passes (1 step), PopArt
            pystepio = popart.PyStepIO(inputs, anchorArrays)
            session.run(pystepio)

            # Compare torch outputs tensors with popart output from
            # anchor tensor maps
            for nInd, outName in enumerate(torchWriter.outNames):
                # Torch outputs returned for all samples, whereas
                # anchors are returned as specified by the user.
                # Subsample torch outputs to match dimensions
                torchOutput = subsampleBatches(torchOutputs[outName],
                                               np.shape(anchorArrays[outName]))
                result = getTensorError(torchOutput, anchorArrays[outName])
                print(reportTensorError(nInd, result))
                checkResult(result, margin)

    return anchorArrays
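The check above relies on the relative error computed by getTensorError,
|pA - tA|^2 / (|pA||tA| + 1e-8). A standalone sketch of the same metric with
made-up values (not from the original), showing a result that passes the
5.0e-7 margin:

import math

import numpy as np

tA = np.array([1.0, 2.0, 3.0])          # reference values, e.g. from Torch
pA = np.array([1.0, 2.0, 3.0 + 1e-4])   # perturbed values, e.g. from PopArt

# |pA - tA|^2 / (|pA||tA| + 1e-8), as in getTensorError above
err = np.sum((pA - tA)**2) / (math.sqrt(np.sum(pA**2) * np.sum(tA**2)) + 1.0e-8)
print(err)  # ~7.1e-10, comfortably below the margin of 5.0e-7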
Example #3
def test_import_torch_lstm_train(tmpdir):
    torch.manual_seed(0)
    np.random.seed(0)

    seq_length = 5
    batch_size = 2
    layers = 1

    # create an lstm module with defined input and hidden sizes
    def torch_create_lstm(input_size, hidden_size):
        class Module0(torch.nn.Module):
            def __init__(self):
                torch.nn.Module.__init__(self)
                self.lstm = torch.nn.LSTM(input_size, hidden_size, layers)

            def forward(self, inputs):
                x = self.lstm(inputs[0], inputs[1])
                return x[0] + x[1][0] + x[1][1]

        return Module0()

    # export model created by `torch_create_lstm`
    def torch_export_lstm(onnx_file_name, model, inputs):
        print('pytorch exporting lstm')
        tt = [torch.tensor(i) for i in inputs]
        dummy_input = [tt[0], (tt[1], tt[2])]

        torch.onnx.export(model,
                          dummy_input,
                          onnx_file_name,
                          input_names=['X', 'initial_h', 'initial_c'],
                          output_names=['out'],
                          do_constant_folding=False)

    # create a random np array of shape `*shape` and type np.float32
    def np_rand(*shape):
        return np.random.rand(*shape).astype(np.float32)

    # run the torch lstm
    def run_lstm_torch(torch_lstm, inputs, d__out):
        # run the torch session
        x = torch.tensor(inputs[0], requires_grad=True)
        h0 = torch.tensor(inputs[1], requires_grad=True)
        c0 = torch.tensor(inputs[2], requires_grad=True)

        torch_lstm.lstm.weight_ih_l0.requires_grad_(True)
        torch_lstm.lstm.weight_hh_l0.requires_grad_(True)

        out = torch_lstm.forward((x, (h0, c0)))

        d__out = torch.tensor(d__out)
        out.backward(d__out)

        # manually update parameters
        for name, param in torch_lstm.named_parameters():
            print('Updating lstm param {}'.format(name))
            param.data.sub_(0.1 * param.grad.data)

        outputs = {
            'out': out,
            popart.reservedGradientPrefix() + 'X': x.grad,
            popart.reservedGradientPrefix() + 'initial_h': h0.grad,
            popart.reservedGradientPrefix() + 'initial_c': c0.grad,
            popart.reservedGradientPrefix() + 'W': torch_lstm.lstm.weight_ih_l0.grad,
            popart.reservedGradientPrefix() + 'R': torch_lstm.lstm.weight_hh_l0.grad,
            popart.reservedGradientPrefix() + 'WB': torch_lstm.lstm.bias_ih_l0.grad,
            popart.reservedGradientPrefix() + 'RB': torch_lstm.lstm.bias_hh_l0.grad,
        }
        return {key: value.data.numpy() for key, value in outputs.items()}

    def get_popart_fname(fname):
        path = Path(fname)
        path = path.parent / ('popart_' + path.name)
        return str(path)

    def get_torch_fname(fname):
        path = Path(fname)
        path = path.parent / ('torch_' + path.name)
        return str(path)

    def run_lstm_popart(onnx_file_name, inputs):
        # generate a popart session
        builder = popart.Builder(onnx_file_name)
        loss = builder.aiGraphcore.identityloss(['out'])
        outputs = builder.getOutputTensorIds()
        anchors = outputs + [
            popart.reservedGradientPrefix() + 'out',
            popart.reservedGradientPrefix() + 'X',
            popart.reservedGradientPrefix() + 'initial_h',
            popart.reservedGradientPrefix() + 'initial_c',
            popart.reservedGradientPrefix() + 'lstm.weight_ih_l0',
            popart.reservedGradientPrefix() + 'lstm.weight_hh_l0',
            popart.reservedGradientPrefix() + 'lstm.bias_ih_l0',
            popart.reservedGradientPrefix() + 'lstm.bias_hh_l0'
        ]
        dataFlow = popart.DataFlow(1, anchors)
        optimizer = popart.ConstSGD(0.1)
        device = tu.create_test_device(1)
        print('Creating session')
        s = popart.TrainingSession(fnModel=builder.getModelProto(),
                                   dataFlow=dataFlow,
                                   optimizer=optimizer,
                                   loss=loss,
                                   patterns=popart.Patterns(
                                       ['PreUniRepl', 'OpToReshape']),
                                   deviceInfo=device)
        print('setting device')

        anchor_map = s.initAnchorArrays()
        s.prepareDevice()

        # run the popart session
        input_map = {
            'X': inputs[0],
            'initial_h': inputs[1],
            'initial_c': inputs[2]
        }
        stepio = popart.PyStepIO(input_map, anchor_map)
        s.weightsFromHost()
        s.run(stepio)
        s.modelToHost(get_popart_fname(onnx_file_name))

        # Rename gradients of the torch parameters to the corresponding
        # ONNX LSTM operand names (W: input weights, R: recurrent weights,
        # WB/RB: the matching biases).
        prefix = popart.reservedGradientPrefix()
        anchor_map[prefix + 'W'] = anchor_map.pop(prefix + 'lstm.weight_ih_l0')
        anchor_map[prefix + 'R'] = anchor_map.pop(prefix + 'lstm.weight_hh_l0')
        anchor_map[prefix + 'WB'] = anchor_map.pop(prefix + 'lstm.bias_ih_l0')
        anchor_map[prefix + 'RB'] = anchor_map.pop(prefix + 'lstm.bias_hh_l0')
        return anchor_map

    input_size = 2
    hidden_size = 7
    fname = str(tmpdir / 'bar.onnx')

    # create inputs
    x = np_rand(seq_length, batch_size, input_size)
    h0 = np_rand(layers, batch_size, hidden_size)
    c0 = np_rand(layers, batch_size, hidden_size)

    torch_lstm = torch_create_lstm(input_size, hidden_size)
    torch_export_lstm(fname, torch_lstm, (x, h0, c0))
    popart_out = run_lstm_popart(fname, (x, h0, c0))
    torch_out = run_lstm_torch(
        torch_lstm, (x, h0, c0),
        popart_out.pop(popart.reservedGradientPrefix() + 'out'))
    torch_export_lstm(get_torch_fname(fname), torch_lstm, (x, h0, c0))

    nr = popart.NumericsReport(fname, get_torch_fname(fname), fname,
                               get_popart_fname(fname))
    print(nr.fullReport())

    assert len(popart_out.keys()) == 8
    assert len(popart_out.keys()) == len(torch_out.keys())

    errors = 0
    for key in popart_out.keys():
        po = popart_out[key]
        to = torch_out[key]
        print('Checking {}'.format(key))
        if po.shape != to.shape:
            errors += 1
            print('tensor {} shapes do not match'.format(key))
            print()
        elif not np.allclose(po, to, atol=1e-07):
            errors += 1
            print('tensors {} are not close'.format(key))
            print('  popart')
            print('    {}'.format(po))
            print('  torch')
            print('    {}'.format(to))
            print()
    assert errors == 0
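A hedged sketch of invoking this test outside of pytest (which normally
supplies tmpdir); a temporary directory stands in, and popart, torch and the
test utilities module tu must still be importable. pathlib.Path supports the
/ operator used to build fname:

import tempfile
from pathlib import Path

# Stand-in for pytest's tmpdir fixture; any writable directory works.
with tempfile.TemporaryDirectory() as d:
    test_import_torch_lstm_train(Path(d))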