Example #1
    def test_all_reduce(self):
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        strategy = ipu_strategy.IPUStrategy()

        def make_all_reduce_function(reduce_op):
            @def_function.function(experimental_compile=True)
            def all_reduce_function():
                replica_ctx = distribution_strategy_context.get_replica_context()
                x = math_ops.cast(replication_ops.replication_index(),
                                  np.float32)
                return replica_ctx.all_reduce(reduce_op, x)

            return all_reduce_function

        report = tu.ReportJSON(self, eager_mode=True, replicated=True)
        report.reset()

        with strategy.scope():
            summed = strategy.experimental_run_v2(
                make_all_reduce_function(reduce_util.ReduceOp.SUM))
            self.assertEqual(1.0, summed.numpy())

            mean = strategy.experimental_run_v2(
                make_all_reduce_function(reduce_util.ReduceOp.MEAN))
            self.assertEqual(0.5, mean.numpy())
Example #2
    def testTrainingMomentumInLoop(self):
        with self.session() as sess:
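            # Note: `datatype` and `inference` are assumed to be module-level
            # helpers defined elsewhere in the original test file.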

            x = array_ops.placeholder(datatype, shape=[1, 224, 224, 4])
            y_ = array_ops.placeholder(datatype, shape=[1, 1000])

            def model(x, l):
                def body(x, label):
                    logits = inference(x)
                    loss = math_ops.reduce_mean(
                        nn_ops.softmax_cross_entropy_with_logits_v2(
                            logits=logits,
                            labels=array_ops.stop_gradient(label)))
                    return x, label, momentum.MomentumOptimizer(
                        0.01, 0.9).minimize(loss)

                return ipu.loops.repeat(10, body, (x, l))

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                train = ipu.ipu_compiler.compile(model, inputs=[x, y_])

            report = tu.ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())
            report.reset()

            data = np.zeros([1, 224, 224, 4])
            labels = np.zeros([1, 1000])

            sess.run(train, feed_dict={x: data, y_: labels})
            report.parse_log()

            report.assert_total_tile_memory(40885054)
Example #3
    def test_building_model_explicitly(self):
        strategy = ipu_strategy.IPUStrategy()

        with strategy.scope():

            report = tu.ReportJSON(self, eager_mode=True)
            report.reset()

            model = keras.Sequential([
                keras.layers.Dense(5),
                keras.layers.Dense(10),
                keras.layers.Softmax(),
            ])

            self.assertFalse(model.built)

            model.build(input_shape=(None, 2))

            # The model is now built, meaning shapes are known and weights are
            # allocated, but no engines should have been compiled or executed yet.
            self.assertTrue(model.built)
            self.assertEqual(4, len(model.variables))

            event_counts, trace_events = report.get_ipu_events()
            self.assertEqual([], _get_compiled_modules(trace_events))
            self.assertEqual(0, event_counts[IpuTraceEvent.EXECUTE])
Example #4
    def testCheckMaxTileSizePadding2(self):
        with self.session() as sess:

            def my_graph(a, b):
                with variable_scope.variable_scope("vs", use_resource=True):
                    weights = variable_scope.get_variable(
                        "x",
                        dtype=np.float16,
                        shape=[64, 64],
                        initializer=init_ops.constant_initializer(1.0))
                a = math_ops.matmul(a, weights, name="mm1")
                a = array_ops.pad(a, [[0, 0], [4935, 1]], constant_values=64)
                return a + b

            pa = array_ops.placeholder(np.float16, [64, 64], name="a")
            pb = array_ops.placeholder(np.float16, [64, 5000], name="b")

            with ops.device("/device:IPU:0"):
                out = ipu_compiler.compile(my_graph, [pa, pb])

            report = tu.ReportJSON(self, sess)
            report.reset()

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())
            report.reset()

            out = sess.run(out, {pa: np.ones(pa.shape), pb: np.ones(pb.shape)})
            self.assertAllClose(np.full(pb.shape, 65.0), out[0])

            report.parse_log()
            report.assert_max_tile_memory(2998)
Example #5
    def test_model_with_autograph_loop(self):
        strategy = ipu_strategy.IPUStrategy()

        with strategy.scope():

            model = keras.Sequential([
                keras.layers.Dense(1, activation='relu'),
            ])

            @def_function.function
            def step_fn(x):
                while x[0] < 0.0:
                    x = model(x)
                return x

            report = tu.ReportJSON(self, eager_mode=True)
            report.reset()

            inputs = -1.0 * np.ones((1, 1), dtype=np.float32)
            out = strategy.experimental_run_v2(step_fn, args=[inputs])
            self.assertGreaterEqual(out, 0.0)

            # There should be a single engine, executed once. If auto-clustering
            # were enabled, it would usually produce multiple engines for the loop.
            event_counts, _ = report.get_ipu_events()
            self.assertEqual(1, event_counts[IpuTraceEvent.LOAD_ENGINE])
            self.assertEqual(1, event_counts[IpuTraceEvent.EXECUTE])
Example #6
  def testNoLookup(self):
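    # Note: `sl` is assumed to be an import alias for the TensorFlow session
    # library in the original file.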
    shape = [100000, 200]
    lookup_count = 4096

    host_embedding = embedding_ops.create_host_embedding(
        "my_host_embedding",
        shape,
        np.float32,
        optimizer_spec=embedding_ops.HostEmbeddingOptimizerSpec(0.5))

    def my_net(i):
      return i

    with ops.device('cpu'):
      i = array_ops.placeholder(np.int32, [lookup_count])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      r = ipu.ipu_compiler.compile(my_net, inputs=[i])

    cfg = ipu.utils.create_ipu_config(profiling=True,
                                      always_rearrange_copies_on_the_host=True)
    cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
    ipu.utils.configure_ipu_system(cfg)
    with sl.Session() as sess:
      i_h = np.arange(0, lookup_count).reshape([lookup_count])

      report = tu.ReportJSON(self, sess, configure_device=False)
      sess.run(variables.global_variables_initializer())
      report.reset()

      with host_embedding.register(sess):
        result = sess.run([r], {i: i_h})

      # Check the indices are correct, but the real test is that there is
      # no timeout.
      self.assertAllClose(result[0][0], i_h)
Example #7
    def testSingleFunctionElided(self):
        with tu.ipu_session() as sess:

            @ipu.function
            def func(a):
                return nn.relu(a)

            def body(a):
                return func(a)

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float16, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a])

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, sess)
            result = sess.run(res, {a: np.ones(a.shape)})
            self.assertAllClose(result[0], np.broadcast_to(1.0, [64, 64]))

            report.parse_log()

            ok = [
                'Relu/relu*/Nonlinearity',
                '__seed',
            ]
            report.assert_all_compute_sets_and_list(ok)

            # Function inlined into the entry computation.
            self.assertEqual(len(report.tensor_map.computation_names()), 1)
Example #8
    def testCheckMaxTileSize(self):
        with self.session() as sess:
            dtype = np.float32
            shape = (1024, 2048)
            with ops.device("/device:IPU:0"):
                with variable_scope.variable_scope("", use_resource=True):
                    a = variable_scope.get_variable(
                        "a",
                        shape=shape,
                        initializer=init_ops.constant_initializer(2),
                        dtype=dtype)
                pb = array_ops.placeholder(shape=shape, dtype=dtype, name="b")
                c = constant_op.constant(4, shape=shape, dtype=dtype, name="c")
                output = a + pb + c

            report = tu.ReportJSON(self, sess)
            report.reset()

            sess.run(variables.global_variables_initializer())

            report.parse_log()
            report.assert_max_tile_memory(7480)

            out = sess.run(output, {pb: np.ones(shape=shape, dtype=dtype)})
            self.assertAllClose(np.full(shape, 7, dtype=dtype), out)

            report.parse_log()
            report.assert_max_tile_memory(28294)
Example #9
    def _run_on_ipu():
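        # Note: this helper closes over `test_wrapper`, `inputs_fn`,
        # `init_values`, `model_fn`, `compute_sets`, `partial_compute_sets`
        # and `conv_classifications`, which are assumed to be defined in the
        # enclosing scope of the original source.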
        g = ops.Graph()
        with g.as_default(), test_wrapper.test_session(graph=g) as session:
            g.add_to_collection("run_type", "ipu")
            inputs = inputs_fn()
            fd = dict(zip(inputs, init_values))
            with variable_scope.variable_scope("ipu",
                                               use_resource=True,
                                               reuse=False):
                with ipu.scopes.ipu_scope("/device:IPU:0"):
                    res = ipu.ipu_compiler.compile(model_fn, inputs=inputs)

            report = tu.ReportJSON(test_wrapper, session)
            tu.move_variable_initialization_to_cpu()
            session.run(variables.global_variables_initializer())
            report.reset()
            r = session.run(res, fd)[0]

            report.parse_log()
            if compute_sets:
                report.assert_all_compute_sets_and_list(compute_sets)
            if partial_compute_sets:
                report.assert_compute_sets_contain_list(partial_compute_sets)

            test_wrapper.assertAllEqual(report.get_ml_type_counts(),
                                        conv_classifications)
            tvars = session.run(variables.trainable_variables())
            return r, tvars
Example #10
    def testTrainingMomentum(self):
        with self.session() as sess:
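            # Note: `datatype` and `inference` are assumed to be module-level
            # helpers defined elsewhere in the original test file.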

            x = array_ops.placeholder(datatype, shape=[1, 224, 224, 4])
            y_ = array_ops.placeholder(datatype, shape=[1, 1000])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                logits = inference(x)

                loss = math_ops.reduce_mean(
                    nn_ops.softmax_cross_entropy_with_logits_v2(
                        logits=logits, labels=array_ops.stop_gradient(y_)))

                train = momentum.MomentumOptimizer(0.01, 0.9).minimize(loss)

            report = tu.ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())
            report.reset()

            data = np.zeros([1, 224, 224, 4])
            labels = np.zeros([1, 1000])

            sess.run(train, feed_dict={x: data, y_: labels})
            report.parse_log()

            report.assert_total_tile_memory(38642237)
Example #11
  def testReplicationNormaliseNotInplace(self):
    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      a = gen_poputil_ops.ipu_replication_normalise(x)
      b = a + x

    with tu.ipu_session() as sess:
      report = tu.ReportJSON(self, sess, replicated=True)
      sess.run(variables.global_variables_initializer())

      report.reset()

      res = sess.run(b, {x: np.ones([1, 4, 4, 2])})
      self.assertAllClose(res, np.full([1, 4, 4, 2], 1.5))
      report.parse_log()

      ok = [
          '__seed*',
          'IpuReplicationNormalise/replication-normalise*/replication_normalise/Op/Divide',
          'switchControlBroadcast*/GlobalPre/Copy/OnTileCopy',
          '/OnTileCopy',
          'Copy_XLA_Args*OnTileCopy',
          'add/add*/AddTo',
      ]
      report.assert_all_compute_sets_and_list(ok)
Example #12
    def test_inference_step_fn_keras_model(self):
        strategy = ipu_strategy.IPUStrategy()

        with strategy.scope():

            model = keras.Sequential([
                keras.layers.Dense(5),
                keras.layers.Dense(10),
                keras.layers.Softmax(),
            ])

            @def_function.function
            def step_fn(x):
                return model(x)

            report = tu.ReportJSON(self, eager_mode=True)
            report.reset()

            inputs = np.ones((1, 2), dtype=np.float32)
            out = strategy.experimental_run_v2(step_fn, args=[inputs])
            self.assertEqual("/job:localhost/replica:0/task:0/device:IPU:0",
                             out.device)
            self.assertAllClose(1.0, np.sum(out.numpy()))

            # There should be a single engine, executed once.
            event_counts, _ = report.get_ipu_events()
            self.assertEqual(1, event_counts[IpuTraceEvent.EXECUTE])
Example #13
    def testModel(self):
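        # Note: `ga`, `gd`, `loops` and `sl` are assumed to be import aliases
        # in the original file (gradient accumulation optimizer, gradient
        # descent optimizer, IPU loops and the session library, respectively).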
        shape = [1000, 256]
        lookup_count = 128
        lr = 1 / 2
        acc_factor = 2
        num_iterations = 6

        host_embedding = embedding_ops.create_host_embedding(
            "my_host_embedding",
            shape,
            np.float32,
            optimizer_spec=embedding_ops.HostEmbeddingSGDGAOptimizerSpec(
                lr, acc_factor))

        optimizer = ga.GradientAccumulationOptimizerV2(
            gd.GradientDescentOptimizer(lr), acc_factor)

        # A dummy model that has an embedding lookup and a matmul
        def model(i, w):
            a = host_embedding.lookup(i)
            return math_ops.matmul(a * a, w)

        def training(loss, i, w):
            loss_ = model(i, w)
            # mean_loss = math_ops.reduce_mean(loss)
            abs_mean_loss = math_ops.abs(loss_)
            train = optimizer.minimize(abs_mean_loss)
            return loss, i, w, train

        def my_net(i, w):
            loss = array_ops.constant(0.0, shape=[])
            r = loops.repeat(num_iterations, training, [loss, i, w])
            return r

        with ops.device('cpu'):
            i = array_ops.placeholder(np.int32, [lookup_count])
            w = array_ops.placeholder(np.float32, [256, 128])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            r = ipu.ipu_compiler.compile(my_net, inputs=[i, w])

        cfg = ipu.utils.create_ipu_config(
            profiling=True, always_rearrange_copies_on_the_host=True)
        cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
        ipu.utils.configure_ipu_system(cfg)
        with sl.Session() as sess:
            i_h = np.arange(0, lookup_count).reshape([lookup_count])
            w_h = np.random.rand(256, 128).astype(np.float32)

            report = tu.ReportJSON(self, sess, configure_device=False)
            sess.run(variables.global_variables_initializer())
            report.reset()

            with host_embedding.register(sess):
                result = sess.run([r], {i: i_h, w: w_h})

            # Given the dummy model, and that the LR is the inverse of the
            # accumulation factor, we expect the "mean loss" to be zero.
            self.assertAllClose(result[0][0], 0.0)
Example #14
  def testResnetLike(self):
    # Check that we get all classifications correct for a small resnet
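    # Note: `conv`, `max_pool`, `block`, `fc` and `next_feed_id` are assumed
    # to be helper functions defined elsewhere in the original test module.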

    def stage1(img, label):
      with variable_scope.variable_scope("stage1", use_resource=True):
        x = conv(img, 7, 2, 16)
        x = nn.relu(x)
        x = max_pool(x, ksize=3, stride=2)
        return x, label

    def stage2(x, label):
      with variable_scope.variable_scope("stage2", use_resource=True):
        x = block("b", 2, 64, 1, x)
        return x, label

    def stage3(x, label):
      with variable_scope.variable_scope("stage3", use_resource=True):
        x = math_ops.reduce_mean(x, axis=[1, 2])
        x = fc(x, 100)
        loss = math_ops.reduce_mean(
            nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                        labels=label))
        return loss

    def optimizer_function(loss):
      opt = gradient_descent.GradientDescentOptimizer(0.01)
      return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # Run the pipeline twice.
    def model_pipeline(x, lr):
      return pipelining_ops.pipeline([stage1, stage2, stage3],
                                     12,
                                     inputs=[x, lr],
                                     outfeed_queue=outfeed_queue,
                                     optimizer_function=optimizer_function)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      l = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:

      with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(
            model_pipeline, inputs=[x, l])

      tu.move_variable_initialization_to_cpu()
      outfeed_queue.dequeue()

      report = tu.ReportJSON(self, sess, pipelining=True)
      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(compiled_model_pipeline, {x: np.ones(x.shape), l: [1]})
      report.parse_log()

      # 1 conv in stage1, 2 convs in stage2, 1 matmul in stage3 = 4
      self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
Example #15
    def testFunctionInferenceWithVariableScope(self):
        with tu.ipu_session() as sess:

            def func(a, b, name):
                @ipu.function
                def outlined_func(a, b):
                    with variable_scope.variable_scope(name,
                                                       use_resource=True):
                        w = variable_scope.get_variable(
                            "w",
                            shape=[64, 64],
                            dtype=np.float32,
                            initializer=init_ops.ones_initializer())
                    x = math_ops.matmul(a, w)
                    x = x + b
                    return math_ops.sigmoid(x)

                return outlined_func(a, b)

            def body(a, b, c):
                a = func(a, b, name="one")
                a = a - func(a, c, name="two")
                return a

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [64, 64])
                b = array_ops.placeholder(np.float32, [64, 64])
                c = array_ops.placeholder(np.float32, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, sess)
            result = sess.run(res, {x: np.ones(x.shape) for x in [a, b, c]})
            self.assertAllClose(result[0], np.broadcast_to(0., [64, 64]))

            report.parse_log()
            # There would be multiple non-linearities if the function were
            # not cached.
            ok = [
                'MatMul/dot*/Conv_1',
                'add/add*/Op/Add',
                'Sigmoid/sigmoid/Nonlinearity',
                'sub/subtract*/Op/Subtract',
                '__seed',
                'Copy_',
            ]
            report.assert_all_compute_sets_and_list(ok)
            report.assert_total_tile_memory(954492)
            report.assert_max_tile_memory(1690)

            # Entry computation and outlined one.
            self.assertEqual(len(report.tensor_map.computation_names()), 2)
Example #16
    def test_keras_mnist_model_compile_fit(self):
        num_examples = 100
        batch_size = 10
        num_classes = 10
        num_epochs = 3

        def mnist_model():
            model = keras.models.Sequential()
            model.add(keras.layers.Conv2D(32, (3, 3)))
            model.add(keras.layers.Conv2D(64, (3, 3)))
            model.add(keras.layers.Dropout(0.25))
            model.add(keras.layers.Flatten())
            model.add(keras.layers.Dense(num_classes, activation='softmax'))
            return model

        (x_train, y_train), _ = keras.datasets.mnist.load_data()
        x_train = x_train[:num_examples]
        y_train = y_train[:num_examples]

        x_train = x_train.reshape(*x_train.shape, 1)
        x_train = x_train.astype('float32')
        x_train /= 255
        y_train = keras.utils.np_utils.to_categorical(y_train, num_classes)

        report = tu.ReportJSON(self, eager_mode=True)
        report.reset()

        strategy = ipu_strategy.IPUStrategy()

        with strategy.scope():

            model = mnist_model()
            model.compile(
                loss=keras.losses.categorical_crossentropy,
                optimizer=keras.optimizer_v2.gradient_descent.SGD(0.05))
            history = model.fit(
                x_train,
                y_train,
                batch_size=batch_size,
                shuffle=False,  # Try to make it deterministic.
                epochs=num_epochs,
                verbose=1)

            # Check that the loss decreased.
            losses = history.history["loss"]
            self.assertEqual(num_epochs, len(losses))
            self.assertLess(losses[1], losses[0])
            self.assertLess(losses[2], losses[1])

            num_batches = num_epochs * num_examples // batch_size

            # There should be a single engine, loaded once, and executed one
            # time for each batch.
            event_counts, _ = report.get_ipu_events()
            self.assertEqual(1, event_counts[IpuTraceEvent.LOAD_ENGINE])
            self.assertEqual(num_batches, event_counts[IpuTraceEvent.EXECUTE])
Example #17
  def testDIENShape(self):
    shape = [10000000, 20]  # ~800MB at float32
    lookup_count = 4096

    def my_net(i):

      # lookup
      out = gen_pop_datastream_ops.ipu_device_embedding_lookup(
          i,
          embedding_id="host_embedding",
          embedding_shape=shape,
          dtype=np.float32)

      # update
      gen_pop_datastream_ops.ipu_device_embedding_update_add(
          out, out, i, embedding_id="host_embedding", embedding_shape=shape)

      self.assertEqual(out.shape, (lookup_count, shape[1]))
      return out

    with ops.device('cpu'):
      i = array_ops.placeholder(np.int32, [lookup_count])
      w = variable_scope.get_variable("foo",
                                      dtype=np.float32,
                                      shape=shape,
                                      use_resource=False)

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      r = ipu.ipu_compiler.compile(my_net, inputs=[i])

    cfg = ipu.utils.create_ipu_config(profiling=True)
    cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
    ipu.utils.configure_ipu_system(cfg)
    with sl.Session() as sess:
      i_h = np.arange(0, lookup_count).reshape([lookup_count])

      report = tu.ReportJSON(self, sess, configure_device=False)

      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(
          gen_pop_datastream_ops.ipu_host_embedding_register(
              w, "host_embedding"))
      result = sess.run([r], {i: i_h})
      v = sess.run(
          gen_pop_datastream_ops.ipu_host_embedding_deregister(
              w, "host_embedding"))

      # Since we updated with the same activations, we expect the embedding
      # values to have doubled (2x).
      self.assertAllClose(result[0][0] * 2, np.take(v, i_h, axis=0))
      self.assertEqual(result[0][0].shape, (lookup_count, shape[1]))
      report.parse_log()
      report.assert_max_tile_memory(772, tolerance=0.3)
Example #18
    def testUserOpWithAllocate(self):
        with tu.ipu_session() as sess:
            cwd = os.getcwd()
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([128])],
            }

            lib_path = os.path.join(
                cwd,
                "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
            )

            def my_net(x, y):
                x = ipu.custom_ops.precompiled_user_op([x, y],
                                                       lib_path,
                                                       op_name="AllocTest",
                                                       outs=outputs)
                return x

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[128])
                y = array_ops.placeholder(np.float32, shape=[128])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

            report = tu.ReportJSON(self, sess)
            report.reset()

            sess.run(variables.global_variables_initializer())
            res = sess.run(model, {
                x: np.ones([128]),
                y: np.ones([128]),
            })

            report.parse_log()

            found = 0
            for t in report.get_tensor_map().all_tensors():
                if t.inst == "arg0.1":
                    # Allocator maps all of input 0 to tile 0
                    self.assertAllEqual(t.tile_ids(), [0])
                    found = found + 1
                if t.inst == "arg1.2":
                    # Allocator leaves input 1 to be linearly mapped
                    self.assertAllEqual(t.tile_ids(), [0, 1, 2, 3])
                    found = found + 1

            self.assertAllEqual(found, 2)
            self.assertAllEqual(np.full([128], 2.0), res[0])
Example #19
    def testWideConstantWithAllocationTarget(self):
        with self.session() as sess:
            # This test will fail if the dynamic slice is not mapped correctly.
            dtype = np.float32
            shape = (512, 2, 2048)

            def my_net(y):
                def cond(i, x, y):
                    del x
                    del y
                    return i < 2

                def body(i, x, y):
                    s = array_ops.slice(x, [i, i, i], [1, 1, 2048])
                    y = y + math_ops.reduce_mean(s)
                    x = x + constant_op.constant(1, shape=shape, dtype=dtype)
                    i = i + 1
                    return (i, x, y)

                i = 0
                c = constant_op.constant(4, shape=shape, dtype=dtype, name="c")
                return control_flow_ops.while_loop(cond,
                                                   body, (i, c, y),
                                                   name='')[2]

            with ops.device('cpu'):
                y = array_ops.placeholder(dtype, [1])

            with ops.device("/device:IPU:0"):
                r = xla.compile(my_net, inputs=[y])

            report = tu.ReportJSON(self, sess)
            report.reset()

            y = sess.run(r, {y: [10]})
            self.assertAllClose(y[0], [19])

            report.parse_log(assert_len=4)

            ok = [
                '__seed*', 'Copy_*_to_*', 'Slice/dynamic-slice*/dynamicSlice',
                'Mean/reduce', 'Mean/multiply', 'add*/add*/Add',
                'add_*/fusion/Op/Add'
            ]
            report.assert_all_compute_sets_and_list(ok)

            report.assert_max_tile_memory(9008)
            report.assert_always_live_memory(323748)
Example #20
    def testOptions(self):
        with self.test_session() as session:
            np.random.seed(1234)
            h_w1 = np.random.random_sample([1, 1, 4, 2])
            h_w2 = np.random.random_sample([1, 1, 1, 4])

            @ipu.nn_ops.multi_conv(options={"invalidFlag": "yes"})
            def convs(a, b, w1, w2):
                a = nn.conv2d(a, w1, 1, padding='VALID')
                b = nn.conv2d_transpose(b, w2, [2, 32, 32, 4], 1)
                return a, b

            def body(a, b):
                w1 = variable_scope.get_variable(
                    "w1",
                    dtype=np.float32,
                    shape=[1, 1, 4, 2],
                    initializer=init_ops.constant_initializer(h_w1))
                w2 = variable_scope.get_variable(
                    "w2",
                    dtype=np.float32,
                    shape=[1, 1, 4, 2],
                    initializer=init_ops.constant_initializer(h_w2))
                a, b = convs(a, b, w1, w2)
                option_flags = a.op.get_attr("option_flags")
                option_flags_proto = json_format.Parse(
                    option_flags, option_flag_pb2.PoplarOptionFlags())
                self.assertEqual(len(option_flags_proto.flags), 1)
                self.assertEqual(option_flags_proto.flags[0].option,
                                 "invalidFlag")
                self.assertEqual(option_flags_proto.flags[0].value, "yes")
                return a, b

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [2, 32, 32, 4])
                b = array_ops.placeholder(np.float32, [2, 32, 32, 2])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, b])

            tu.ReportJSON(self, session)
            tu.move_variable_initialization_to_cpu()
            session.run(variables.global_variables_initializer())
            with self.assertRaisesRegex(
                    Exception,
                    r"\[Error\]\[Build graph\] Unrecognised option \'invalidFlag\'"
            ):
                session.run(res, {x: np.ones(x.shape) for x in [a, b]})
Example #21
    def testRecomputeSuggestion(self):
        def my_model(a):
            b = array_ops.constant(np.random.rand(5, 5),
                                   dtype=np.float32,
                                   name="W_ih")
            c = array_ops.constant(np.random.rand(5, 5),
                                   dtype=np.float32,
                                   name="W_ho")
            d = a + b
            ipu.internal_ops.print_tensor(d)  # block some optimisation
            e = d + c
            ipu.internal_ops.print_tensor(e)  # block some optimisation
            f = ipu.internal_ops.recompute(e)
            g = f + f
            ipu.internal_ops.print_tensor(g)  # block some optimisation
            output = g + f

            return [output]

        with ops.device("cpu"):
            inp = array_ops.placeholder(np.float32, [5, 5], name="a")

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            out = ipu.ipu_compiler.compile(my_model, inputs=[inp])

        with tu.ipu_session() as sess:
            report = tu.ReportJSON(self,
                                   sess,
                                   replicated=False,
                                   allow_recompute=True)
            sess.run(variables.global_variables_initializer())

            report.reset()
            sess.run(out, {inp: np.ones([5, 5])})
            report.parse_log()

            # 6 adds in a graph that only defined 4 (add_1 is cloned twice
            # by the recompute suggestion).
            ok = [
                '__seed*',
                'add_1/add.1/Op/Add',
                'add_2/add.10/Op/Add',
                'add_1/add.1.clone.1/Op/Add',
                'add/add.4/Op/Add',
                'add_1/add.1.clone/Op/Add',
                'add_3/add.12/Op/Add',
            ]
            report.assert_all_compute_sets_and_list(ok)
Example #22
    def test_train_step_fn_keras_model_known_input_size(self):
        strategy = ipu_strategy.IPUStrategy()

        with strategy.scope():

            model = keras.Sequential([
                keras.layers.Dense(1, input_shape=[10]),
            ])

            optimizer = keras.optimizer_v2.gradient_descent.SGD(0.01)

            @def_function.function
            def step_fn(features, labels):
                with GradientTape() as tape:
                    predictions = model(features, training=True)
                    prediction_loss = keras.losses.mean_squared_error(
                        labels, predictions)
                    loss = math_ops.reduce_mean(prediction_loss)

                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))
                return loss

            report = tu.ReportJSON(self, eager_mode=True)
            report.reset()

            batch_size = 5
            x_train = np.ones((batch_size, 10), dtype=np.float32)
            y_train = np.ones((batch_size, 1), dtype=np.float32)

            first_loss = strategy.experimental_run_v2(step_fn,
                                                      args=[x_train, y_train])
            second_loss = strategy.experimental_run_v2(step_fn,
                                                       args=[x_train, y_train])

            # Check that loss is decreasing.
            self.assertLess(second_loss, first_loss)

            # There should be a single engine, loaded once, executed twice.
            event_counts, _ = report.get_ipu_events()
            self.assertEqual(1, event_counts[IpuTraceEvent.LOAD_ENGINE])
            self.assertEqual(2, event_counts[IpuTraceEvent.EXECUTE])
Example #23
    def test_optimizer(self):
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        strategy = ipu_strategy.IPUStrategy()

        report = tu.ReportJSON(self, eager_mode=True, replicated=True)
        report.reset()

        with strategy.scope():
            initial_variable = 2.0
            variable = variables.Variable(initial_variable)
            learning_rate = 0.5
            num_iterations = 3

            data = [1.0, 2.0]
            dataset = dataset_ops.Dataset.from_tensor_slices((data))
            dataset = dataset.repeat(num_iterations)
            infeed = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                     feed_name="feed",
                                                     replication_factor=2)

            optimizer = keras.optimizer_v2.gradient_descent.SGD(learning_rate)

            @def_function.function(experimental_compile=True)
            def apply_gradient():
                gradient = infeed._dequeue()  # pylint: disable=protected-access
                optimizer.apply_gradients([(gradient, variable)])

            # The optimizers in v2 will sum the gradients, and not average them.
            expected_gradient = np.sum(data)
            expected_variable = initial_variable

            infeed.initializer  # pylint: disable=pointless-statement

            for _ in range(num_iterations):
                strategy.experimental_run_v2(apply_gradient)
                expected_variable -= learning_rate * expected_gradient
                self.assertEqual(expected_variable, variable.numpy())
Example #24
  def testTwoParallelMatMuls(self):
    # Check that we get all classifications correct for two parallel matmuls
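    # Note: `fc` is assumed to be a fully-connected layer helper defined
    # elsewhere in the original test module.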

    def graph(x, label):
      a = fc(x, 48)
      a = nn.relu(a)
      b = fc(x, 48)
      b = nn.relu(b)
      x = a + b

      a = fc(x, 100)
      a = nn.relu(a)
      b = fc(x, 100)
      b = nn.relu(b)
      x = a + b

      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels=label))

      opt = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)
      return loss, opt

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 224])
      l = array_ops.placeholder(np.int32, shape=[1])

    with ops.device("/device:IPU:0"):
      output = ipu_compiler.compile(graph, inputs=[x, l])

    tu.move_variable_initialization_to_cpu()

    with tu.ipu_session() as sess:

      report = tu.ReportJSON(self, sess)
      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(output, {x: np.ones(x.shape), l: [1]})
      report.parse_log()

      # 4x updates, 2x grads
      self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 2, 4])
Example #25
    def testTrainNoExec(self):
        shape = [100000, 200]
        lookup_count = 4096

        host_embedding = embedding_ops.create_host_embedding(
            "my_host_embedding",
            shape,
            np.float32,
            optimizer_spec=embedding_ops.HostEmbeddingSGDGAOptimizerSpec(
                0.5, 2))

        def my_net(i):
            out = host_embedding.lookup(i)

            return out

        with ops.device('cpu'):
            i = array_ops.placeholder(np.int32, [lookup_count])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            r = ipu.ipu_compiler.compile(my_net, inputs=[i])

        cfg = ipu.utils.create_ipu_config(
            profiling=True, always_rearrange_copies_on_the_host=True)
        cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
        ipu.utils.configure_ipu_system(cfg)
        with sl.Session() as sess:
            i_h = np.arange(0, lookup_count).reshape([lookup_count])

            report = tu.ReportJSON(self, sess, configure_device=False)
            sess.run(variables.global_variables_initializer())
            report.reset()

            with host_embedding.register(sess):
                # training=False should ignore the number of expected updates.
                result = sess.run([r], {i: i_h})

            v = sess.run(host_embedding.get_embedding_tensor())
            # Check the lookup result, though what we really care about is
            # that it doesn't hang.
            self.assertAllClose(result[0][0], np.take(v, i_h, axis=0))
Example #26
    def testFunctionsNoMatch(self):
        with tu.ipu_session() as sess:

            @ipu.function
            def func(a):
                return nn.relu(a)

            def body(a, b, c):
                return func(a), func(b), func(c)

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float16, [64, 64])
                b = array_ops.placeholder(np.float16, [64, 64])
                c = array_ops.placeholder(np.float32, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, sess)
            result = sess.run(res, {x: np.ones(x.shape) for x in [a, b, c]})
            self.assertAllClose(result[0], np.broadcast_to(1.0, [64, 64]))
            self.assertAllClose(result[1], np.broadcast_to(1.0, [64, 64]))
            self.assertAllClose(result[2], np.broadcast_to(1.0, [64, 64]))

            report.parse_log()
            # Two non-linearities, as one of them has a different type.
            ok = [
                'Relu/relu/Nonlinearity',
                'Relu/relu.*/Nonlinearity',
                '__seed',
                'Copy_',
            ]
            report.assert_all_compute_sets_and_list(ok)

            # Main computation (with the fp32 call inlined) plus the outlined
            # fp16 function.
            self.assertEqual(len(report.tensor_map.computation_names()), 2)
Example #27
    def test_building_model_by_passing_input_shape_to_first_layer(self):
        strategy = ipu_strategy.IPUStrategy()

        with strategy.scope():

            report = tu.ReportJSON(self, eager_mode=True)
            report.reset()

            # Passing input_shape to first layer builds the model.
            model = keras.Sequential([
                keras.layers.Dense(5, input_shape=(2, )),
                keras.layers.Dense(10),
                keras.layers.Softmax(),
            ])

            # The model is built, meaning shapes are known and weights allocated,
            # but no engines should have been compiled or executed yet.
            self.assertTrue(model.built)
            self.assertEqual(4, len(model.variables))

            event_counts, trace_events = report.get_ipu_events()
            self.assertEqual([], _get_compiled_modules(trace_events))
            self.assertEqual(0, event_counts[IpuTraceEvent.EXECUTE])
Example #28
  def testResnetLike(self):
    # Check that we get all classifications correct for a small resnet
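    # Note: `conv`, `max_pool`, `block` and `fc` are assumed to be helper
    # functions defined elsewhere in the original test module.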

    def graph(img, label):
      x = conv(img, 7, 2, 16)
      x = nn.relu(x)
      x = max_pool(x, ksize=3, stride=2)

      x = block("b", 2, 64, 1, x)

      x = math_ops.reduce_mean(x, axis=[1, 2])
      x = fc(x, 100)
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels=label))

      opt = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)
      return loss, opt

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      l = array_ops.placeholder(np.int32, shape=[1])

    with ops.device("/device:IPU:0"):
      output = ipu_compiler.compile(graph, inputs=[x, l])

    tu.move_variable_initialization_to_cpu()

    with tu.ipu_session() as sess:

      report = tu.ReportJSON(self, sess)
      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(output, {x: np.ones(x.shape), l: [1]})
      report.parse_log()

      # 3 convs, 1 matmul = 4
      self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
Example #29
    def runCustomUserOpWithUnusedOutput(self, op_name, ok):
        with tu.ipu_session() as sess:
            cwd = os.getcwd()
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([128])],
            }

            lib_path = os.path.join(
                cwd,
                "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
            )

            def my_net(x, y):
                ipu.custom_ops.precompiled_user_op([x, y],
                                                   lib_path,
                                                   op_name=op_name,
                                                   outs=outputs)
                return [x + y]

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[128])
                y = array_ops.placeholder(np.float32, shape=[128])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

            report = tu.ReportJSON(self, sess)
            report.reset()

            sess.run(variables.global_variables_initializer())
            sess.run(model, {
                x: np.ones([128]),
                y: np.ones([128]),
            })

            report.parse_log()
            report.assert_all_compute_sets_and_list(ok)
Example #30
  def testOutlinedFunction(self):
    # Check that we get all classifications correct for a pipelined model
    # using matmuls
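    # Note: `fc` and `next_feed_id` are assumed to be helpers defined
    # elsewhere in the original test module.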

    def stage1(x, label):
      with variable_scope.variable_scope("stage1", use_resource=True):
        weight = variable_scope.get_variable(
            "w0",
            shape=[224, 48],
            dtype=np.float32,
            initializer=init_ops.ones_initializer())
        a = ipu_math_ops.serialized_matmul(
            x, weight, 2, serialization_dimension="a_rows_b_columns")
        a = nn.relu(a)
        b = fc(x, 48)
        b = nn.relu(b)
        return a + b, label

    def stage2(x, label):
      with variable_scope.variable_scope("stage2", use_resource=True):
        a = fc(x, 100)
        a = nn.relu(a)
        b = fc(x, 100)
        b = nn.relu(b)
        return a + b, label

    def stage3(x, label):
      with variable_scope.variable_scope("stage3", use_resource=True):
        loss = math_ops.reduce_mean(
            nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                        labels=label))
        return loss

    def optimizer_function(loss):
      opt = gradient_descent.GradientDescentOptimizer(0.01)
      return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # Run the pipeline twice.
    def model_pipeline(x, lr):
      return pipelining_ops.pipeline([stage1, stage2, stage3],
                                     12,
                                     inputs=[x, lr],
                                     outfeed_queue=outfeed_queue,
                                     optimizer_function=optimizer_function)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 224])
      l = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:

      with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(
            model_pipeline, inputs=[x, l])

      tu.move_variable_initialization_to_cpu()
      outfeed_queue.dequeue()

      report = tu.ReportJSON(self, sess, pipelining=True)
      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(compiled_model_pipeline, {x: np.ones(x.shape), l: [1]})
      report.parse_log()

      # 3 matmuls in stage 1, 2 matmuls in stage 2 = 5 (5x updates, 5x grads)
      self.assertAllEqual(report.get_ml_type_counts(), [0, 5, 2, 5])