コード例 #1
0
        def build_and_run_model():
            """Build an infeed/outfeed accumulation loop, run it once, return its outputs.

            The loop body adds each infeed element to the running value ``v`` and
            enqueues the updated value on the outfeed. It is repeated 10 times
            over a dataset built from ten float32 ones.

            ``self``, ``offline_compilation_needed`` and
            ``_use_offline_compilation_if_needed`` come from the enclosing scope,
            which is not visible in this snippet.

            Returns:
                A ``(res, deq, events)`` tuple: the fetched loop result, the
                dequeued outfeed values and the event trace obtained from the
                report. ``res`` and ``deq`` are empty lists when the run raised
                an ``InvalidArgumentError`` mentioning "compilation only" while
                ``offline_compilation_needed`` is truthy.
            """
            dataset = dataset_ops.Dataset.from_tensor_slices(
                np.ones(10, dtype=np.float32))
            infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(
                dataset, "infeed")
            outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

            def body(v, x):
                # Accumulate the infeed element and publish the running total.
                v = v + x
                outfed = outfeed_queue.enqueue(v)
                return v, outfed

            def my_net(v):
                return ipu.loops.repeat(10, body, v, infeed_queue)

            v = array_ops.placeholder(np.float32, shape=())
            with ipu.scopes.ipu_scope("/device:IPU:0"):
                [result] = ipu.ipu_compiler.compile(my_net, inputs=[v])
            # Create the dequeue op with a control dependency on the loop result.
            with ops.control_dependencies([result]):
                dequeued = outfeed_queue.dequeue()

            with session.Session() as sess:
                report = ReportJSON(
                    self, sess, set_opts_fn=_use_offline_compilation_if_needed)
                sess.run(infeed_queue.initializer)
                try:
                    res, deq = sess.run([result, dequeued], {v: 0.0})
                except errors.InvalidArgumentError as e:
                    # NOTE(review): presumably this error is raised when the
                    # device can only compile and not execute - confirm.
                    if offline_compilation_needed and "compilation only" in e.message:
                        res = []
                        deq = []
                    else:
                        # Any other invalid-argument failure is a real error.
                        raise
                events = report.get_event_trace(sess)
                return res, deq, events
コード例 #2
0
    def testGroupNormalizeInference(self):
        """Run conv -> group-norm-inference twice and check the reported compute sets.

        The graph is conv2d (1x1, 2 filters, no bias, ones kernel) followed by
        ``popnn_group_norm_inference``, repeated a second time with the same
        gamma/beta/mean/inv_std_dev constants. After running on zeros, the
        report's compute sets are checked against the ``ok`` pattern list.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer())
                    # The same normalisation constants are shared by both
                    # group-norm ops below.
                    gamma = constant_op.constant([0.5, 0.5], np.float32)
                    beta = constant_op.constant([0.5, 0.5], np.float32)
                    mean = constant_op.constant([0.5, 0.5], np.float32)
                    inv_std_dev = constant_op.constant([0.5, 0.5], np.float32)
                    y = gen_popnn_ops.popnn_group_norm_inference(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        mean=mean,
                        inv_std_dev=inv_std_dev,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer())
                    y = gen_popnn_ops.popnn_group_norm_inference(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        mean=mean,
                        inv_std_dev=inv_std_dev,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            # Reset after variable initialisation and before the run under test.
            report.reset()

            sess.run(y, {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # Only one conv pattern and one group-norm pattern are listed, so
            # this would presumably fail if the two group norms (the original
            # comment said "batch norms") produced separately named compute
            # sets - confirm against assert_all_compute_sets_and_list.
            ok = [
                '__seed*', 'Copy_',
                'vs/conv2d/Conv2D/convolution.*/Conv_1x1/Convolve',
                'vs/PopnnGroupNormInference/group-norm-inference*/'
            ]
            report.assert_all_compute_sets_and_list(ok)
コード例 #3
0
  def testCaseSimple(self):
    """Check a three-branch ``case`` op on the IPU for several branch indices.

    The branches compute ``x + y``, ``x - y`` and ``x * y``. The compiled graph
    is run with branch index 0, 1, 2 and 10, and the test then asserts that the
    report contains one compile event.
    """
    with self.session() as sess:

      def my_graph(pa, pb, pc):
        with ipu.scopes.ipu_scope("/device:IPU:0"):

          @eager_function.defun
          def b0(x, y):
            return x + y

          @eager_function.defun
          def b1(x, y):
            return x - y

          @eager_function.defun
          def b2(x, y):
            return x * y

          # Obtain concrete functions using zero tensors shaped like the
          # real inputs.
          branches = [
              f.get_concrete_function(array_ops.zeros_like(pb),
                                      array_ops.zeros_like(pc))
              for f in [b0, b1, b2]
          ]

          c_out = gen_functional_ops.case(pa,
                                          input=[pb, pc],
                                          Tout=[dtypes.float32],
                                          branches=branches)

          return [c_out[0]]

      with ops.device('cpu'):
        pa = array_ops.placeholder(np.int32, [], name="a")
        pb = array_ops.placeholder(np.float32, [2], name="b")
        pc = array_ops.placeholder(np.float32, [2], name="c")

      out = ipu.ipu_compiler.compile(my_graph, [pa, pb, pc])

      report = ReportJSON(self, sess)

      report.reset()

      # Index 0 selects the addition branch.
      result = sess.run(out, {pa: 0, pb: [0., 1.], pc: [1., 5.]})
      self.assertAllClose(result[0], [1., 6.])

      # Index 1 selects the subtraction branch.
      result = sess.run(out, {pa: 1, pb: [0., 1.], pc: [1., 5.]})
      self.assertAllClose(result[0], [-1., -4.])

      # Index 2 selects the multiplication branch.
      result = sess.run(out, {pa: 2, pb: [0., 1.], pc: [1., 5.]})
      self.assertAllClose(result[0], [0., 5.])

      # An out-of-range index is expected to give the same result as the
      # last branch.
      result = sess.run(out, {pa: 10, pb: [0., 1.], pc: [1., 5.]})
      self.assertAllClose(result[0], [0., 5.])

      report.parse_log()
      report.assert_contains_one_compile_event()
コード例 #4
0
    def testGather(self):
        """Gather rows on the IPU and inspect the resulting tensor map.

        The gather output is compared against ``np.take`` on the host. The
        test then fails if any tensor with more than 16 elements sits on a
        single tile and is flagged by the ``has_contant`` attribute.
        """
        with self.session() as sess:

            def my_net(w, i):
                return [array_ops.gather(w, i)]

            with ops.device('cpu'):
                index_ph = array_ops.placeholder(np.int32, [256])
                weight_ph = array_ops.placeholder(np.float32, [1024, 8])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                compiled = ipu.ipu_compiler.compile(
                    my_net, inputs=[weight_ph, index_ph])

            report = ReportJSON(self, sess)
            report.reset()

            # Every third row index, and a weight matrix whose values are
            # simply their flat position.
            indices = np.arange(0, 3 * 256, 3)
            weights = np.arange(8192).reshape(1024, 8)
            expected = np.take(weights, indices, axis=0)

            fetched = sess.run(compiled, {index_ph: indices, weight_ph: weights})
            self.assertAllClose(fetched[0], expected)

            report.parse_log()
            tensor_map = report.get_tensor_map()

            # NOTE(review): `has_contant` looks like a typo for `has_constant`;
            # kept unchanged here - confirm against the tensor map class.
            single_tile_offenders = [
                t.inst for t in tensor_map.all_tensors()
                if t.num_elements > 16 and len(t.tiles) == 1 and t.has_contant
            ]

            self.assertFalse(single_tile_offenders)
コード例 #5
0
    def testBatchNormalizeInferenceDontMatchDifferentTypes(self):
        """Check compute sets for two batch norms that operate on different dtypes.

        The graph is conv2d -> fused batch norm -> cast to float16 -> conv2d ->
        fused batch norm. The ``ok`` list names both convolutions and both
        batch norms individually.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer())
                    y = layers_norm.batch_normalization(y, fused=True)
                    # Switch to float16 so the second conv/batch-norm pair
                    # sees a different input dtype from the first.
                    y = math_ops.cast(y, np.float16)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer())
                    y = layers_norm.batch_normalization(y, fused=True)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            # Reset after variable initialisation and before the run under test.
            report.reset()

            sess.run(y, {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # Matches two convolutions (and, per the list below, two separately
            # named batch norms).
            ok = [
                '__seed*', 'Copy_', 'vs/conv2d/Conv2D/convolution.*/Conv_1x1',
                'vs/batch_normalization/FusedBatchNorm*/batch-norm-inference.*/',
                'vs/Cast/convert.*/Cast',
                'vs/conv2d_1/Conv2D/convolution.*/Conv_1x1',
                'vs/batch_normalization_1/FusedBatchNorm*/batch-norm-inference.*/'
            ]
            report.assert_all_compute_sets_and_list(ok)
コード例 #6
0
    def testDoNotCompileScalarConstGraph(self):
        """Adding two Python int constants must not produce a compile event.

        The graph adds ``2`` and ``3`` on the IPU device; the test asserts that
        the report contains no compile event and that the result equals ``[5]``.
        """
        with self.session() as sess:

            def my_graph(a, b):
                with ops.device("/device:IPU:0"):
                    x = math_ops.add(a, b)
                return x

            # Only plain Python ints are assigned inside this device scope;
            # no ops are created within it.
            with ops.device('cpu'):
                a = 2
                b = 3
            out = ipu.ipu_compiler.compile(my_graph, [a, b])
            report = ReportJSON(self, sess)
            report.reset()

            result = sess.run(out)

            report.parse_log()
            report.assert_contains_no_compile_event()

            self.assertEqual(result, [5])
コード例 #7
0
    def testDoNotCompileScalarElementWiseGraphWithParameter(self):
        """Adding two fed int32 placeholders must not produce a compile event.

        The graph adds placeholders ``a`` and ``b`` on the IPU device; they are
        fed 2 and 3. The test asserts that the report contains no compile event
        and that the result is close to ``[5]``.
        """
        with self.session() as sess:

            def my_graph(a, b):
                with ops.device("/device:IPU:0"):
                    x = math_ops.add(a, b)
                return x

            # Placeholders are created without an explicit shape.
            with ops.device('cpu'):
                a = array_ops.placeholder(np.int32, name="a")
                b = array_ops.placeholder(np.int32, name="b")

            out = ipu.ipu_compiler.compile(my_graph, [a, b])
            report = ReportJSON(self, sess)
            report.reset()

            fd = {a: np.int32(2), b: np.int32(3)}
            result = sess.run(out, fd)

            report.parse_log()
            report.assert_contains_no_compile_event()

            self.assertAllClose(result, [5])
コード例 #8
0
    def testInplaceReadWrite(self):
        """Compute ``a + x`` and ``x + y`` on the IPU and inspect tensor layouts.

        Both outputs are compared with their expected values. Every tensor in
        the report's tensor map must then have exactly 100 elements and be
        spread over more than one tile.
        """
        with self.session() as sess:

            def my_net(x, y, a):
                # Keep this order: `x + y` is built before `a + x`.
                z = x + y
                c = a + x
                return c, z

            with ops.device('cpu'):
                x_ph = array_ops.placeholder(np.int32, [100])
                y_ph = array_ops.placeholder(np.int32, [100])
                a_ph = array_ops.placeholder(np.int32, [100])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                compiled = ipu.ipu_compiler.compile(
                    my_net, inputs=[x_ph, y_ph, a_ph])

            report = ReportJSON(self, sess)
            report.reset()

            feeds = {
                x_ph: np.full(100, 1),
                y_ph: np.full(100, 2),
                a_ph: np.full(100, 10),
            }
            want_c = np.full(100, 11)
            want_z = np.full(100, 3)

            got_c, got_z = sess.run(compiled, feeds)
            self.assertAllClose(got_c, want_c)
            self.assertAllClose(got_z, want_z)

            report.parse_log()
            tensor_map = report.get_tensor_map()

            # A tensor is "bad" when it does not have 100 elements or is
            # mapped to at most one tile.
            offenders = [
                t.inst for t in tensor_map.all_tensors()
                if t.num_elements != 100 or len(t.tiles) <= 1
            ]

            self.assertFalse(offenders)
コード例 #9
0
    def testNormCacheConstants(self):
        """Check two inference batch norms that share scale, offset, mean and variance.

        ``model`` broadcasts ``z`` to 65536 elements and uses it as both scale
        and offset, computes moments of ``x`` over axes 0-2, applies
        ``fused_batch_norm`` (``is_training=False``) to ``x`` and to ``y``, and
        returns the sum of the two normalised outputs. The test checks the
        numeric result, the reported tile memory figures and the compute sets.
        """
        with self.session() as sess:

            def model(x, y, z):
                scale = gen_array_ops.broadcast_to(z, shape=[65536])
                # Offset deliberately reuses the same tensor as scale.
                offset = scale
                b_mean, b_var = nn.moments(x, [0, 1, 2], name='moments')
                a = nn.fused_batch_norm(x,
                                        scale,
                                        offset,
                                        b_mean,
                                        b_var,
                                        1e-3,
                                        is_training=False,
                                        name="a")
                b = nn.fused_batch_norm(y,
                                        scale,
                                        offset,
                                        b_mean,
                                        b_var,
                                        1e-3,
                                        is_training=False,
                                        name="b")

                # Index 0 of each fused_batch_norm result is used as the
                # normalised output.
                return a[0] + b[0]

            with ops.device('cpu'):
                x = array_ops.placeholder(np.float16, [1, 1, 1, 65536],
                                          name="x")
                y = array_ops.placeholder(np.float16, [1, 1, 1, 65536],
                                          name="y")
                z = array_ops.placeholder(np.float16, shape=[1])

            with ops.device("/device:IPU:0"):
                res = ipu_compiler.compile(model, inputs=[x, y, z])

            report = ReportJSON(self, sess)
            tu.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())

            # Reset after variable initialisation and before the run under test.
            report.reset()

            r = sess.run(res, {
                x: np.ones(x.shape),
                y: np.ones(y.shape),
                z: [1.0]
            })
            # Every element of the output is expected to equal 2.
            self.assertAllClose(r[0], np.full(r[0].shape, 2))

            report.parse_log()

            # NOTE(review): hard-coded memory figures; presumably in bytes and
            # tied to a specific compiler/device configuration - confirm.
            report.assert_total_tile_memory(1634674)
            report.assert_max_tile_memory(1551)

            # Would fail if there were two batch norms in the graph
            # (only 'a/batch-norm-inference' is listed, not a 'b/...' entry).
            ok = [
                '__seed*',
                'host-exchange-local-copy',
                'Copy_',
                'moments/SquaredDifference/multiply',
                'a/batch-norm-inference',
                'add/add*/Add',
            ]
            report.assert_all_compute_sets_and_list(ok)
コード例 #10
0
    def testGroupNormsMatchFwdBwd(self):
        """Train three conv + group-norm layers and check the reported compute sets.

        The graph stacks three (conv2d, ``popnn_group_norm_training``) pairs
        named ``conv1``..``conv3``, reduces the output to a scalar loss and
        minimises it with gradient descent (learning rate 0.1). One training
        step is run on zeros and the compute sets are checked against ``ok``.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')
                    # gamma and beta are shared by all three group norms.
                    gamma = constant_op.constant([0.5, 0.5], np.float32)
                    beta = constant_op.constant([0.5, 0.5], np.float32)
                    # Only the first of the three outputs is used; the other
                    # two are discarded.
                    y, _, _ = gen_popnn_ops.popnn_group_norm_training(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv2')
                    y, _, _ = gen_popnn_ops.popnn_group_norm_training(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv3')
                    y, _, _ = gen_popnn_ops.popnn_group_norm_training(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)

                loss = math_ops.reduce_sum(y)
                optimizer = gradient_descent.GradientDescentOptimizer(0.1)
                train = optimizer.minimize(loss)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            # Reset after variable initialisation and before the run under test.
            report.reset()

            sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # One GN for forwards and one GN for grad
            # pylint: disable=line-too-long
            ok = [
                '__seed*',
                'Copy_',
                'vs/conv1/Conv2D/convolution*/Conv_1x1/Convolve',
                'vs/PopnnGroupNormTraining/group-norm-training*/Norm',
                'vs/PopnnGroupNormTraining/group-norm-training*/iStdDev',
                'vs/PopnnGroupNormTraining/group-norm-training*/Whiten',
                'Sum/reduce.*/*/Reduce',
                'gradients/vs/PopnnGroupNormTraining_2_grad/PopnnGroupNormGrad/group-norm-grad*/',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
            ]
            # pylint: enable=line-too-long
            report.assert_all_compute_sets_and_list(ok)
コード例 #11
0
    def testBatchNormsMatchFwdBwd(self):
        """Train three conv + batch-norm layers and check the reported compute sets.

        The graph stacks three (conv2d, fused training batch norm) pairs named
        ``conv1``..``conv3``, reduces the output to a scalar loss and minimises
        it with gradient descent (learning rate 0.1). One training step is run
        on zeros and the compute sets are checked against ``ok``.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv2')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv3')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)

                loss = math_ops.reduce_sum(y)
                optimizer = gradient_descent.GradientDescentOptimizer(0.1)
                train = optimizer.minimize(loss)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            # Reset after variable initialisation and before the run under test.
            report.reset()

            sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # One BN for forwards and one BN for grad
            # (note that we don't cache gradient application)
            # pylint: disable=line-too-long
            ok = [
                '__seed*',
                'Copy*',
                'vs/conv1/Conv2D/convolution.*/Conv_1x1',
                'vs/batch_normalization/FusedBatchNorm*/batch-norm-training.*/',
                'Sum/reduce.*/ReduceOnTile/InToIntermediateNoExchange/Reduce',
                'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
                'gradients/vs/batch_normalization_2/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
                'GradientDescent/update_vs/batch_normalization/',
                'GradientDescent/update_vs/batch_normalization_1/',
                'GradientDescent/update_vs/batch_normalization_2/',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
            ]
            # pylint: enable=line-too-long
            report.assert_all_compute_sets_and_list(ok)
コード例 #12
0
    def testCombineStreamCopies(self):
        """Compare stream-copy counts with and without an outside-compilation scope.

        Two otherwise identical graphs are compiled, one of which performs its
        constant additions inside ``outside_compilation_scope``. The number of
        'StreamCopy' programs in each main program is counted, and the variant
        with the outside scope may have at most two more than the other.
        """
        with self.session() as sess:

            def with_outside_scope(x1, x2):
                with ipu_scope("/device:IPU:0"):
                    x1 *= 1.0
                    x2 *= 2.0
                    # The constant additions happen in the outside scope.
                    with outside_compilation_scope():
                        y1 = constant_op.constant(1.0, dtype=dtypes.float32)
                        y1 += x1
                        y2 = constant_op.constant(2.0, dtype=dtypes.float32)
                        y2 += x2
                    x1 += y1
                    x2 += y2
                    return x1, x2

            def without_outside_scope(x1, x2):
                with ipu_scope("/device:IPU:0"):
                    x1 *= 1.0
                    x2 *= 2.0
                    y1 = constant_op.constant(1.0, dtype=dtypes.float32)
                    y1 += x1
                    y2 = constant_op.constant(2.0, dtype=dtypes.float32)
                    y2 += x2
                    x1 += y1
                    x2 += y2
                    return x1, x2

            input1 = array_ops.placeholder(dtype=dtypes.float32, shape=(2, ))
            input2 = array_ops.placeholder(dtype=dtypes.float32, shape=(1, ))
            placeholders = [input1, input2]

            compiled_with_outside_scope = ipu_compiler.compile(
                with_outside_scope, inputs=placeholders)

            compiled_without_outside_scope = ipu_compiler.compile(
                without_outside_scope, inputs=placeholders)

            # The device is configured by hand here, so ReportJSON is told not
            # to configure it again.
            opts = utils.set_optimization_options(
                utils.create_ipu_config(profiling=True),
                max_send_recv_cluster_size=12)
            utils.configure_ipu_system(opts)

            report = ReportJSON(self, sess, configure_device=False)

            def count_stream_copies(compiled_func):
                """Run `compiled_func` once and count its StreamCopy programs."""
                report.reset()
                feeds = {input1: [1.0, 1.0], input2: [1.0]}
                out1, out2 = sess.run(compiled_func, feeds)
                self.assertAllEqual(out1, [3.0, 3.0])
                self.assertAllEqual(out2, [6.0])
                report.parse_log()

                # The main program is the second child of the first 'Switch'.
                switch_program = report.get_first_program_of_type('Switch')
                main_program = report.get_program(switch_program['children'][1])
                return sum(
                    1 for child_index in main_program['children']
                    if report.get_program(child_index)['type'] == 'StreamCopy')

            baseline_copies = count_stream_copies(
                compiled_without_outside_scope)
            outside_scope_copies = count_stream_copies(
                compiled_with_outside_scope)

            # There should be at most two new SendToHost/RecvFromHost stream copies.
            self.assertLessEqual(outside_scope_copies, baseline_copies + 2)
コード例 #13
0
    def testMappingJson(self):
        """Check the tile layouts reported for a broadcast + slice + pad + add graph.

        The graph broadcasts a scalar to 1024 elements, takes every 8th element
        of an 8192-element vector, zero-pads a 512-element vector by 256 on each
        side, and sums the three. The numeric result is checked first, then the
        tile mappings of the slice, the two fusions and the add.
        """
        with self.session() as sess:

            def my_net(a, b, c):
                a = array_ops.broadcast_to(a, shape=[1024])
                b = array_ops.strided_slice(b, [0], [8192], [8])
                c = array_ops.pad(c, paddings=[[256, 256]])
                return [a + b + c]

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [])
                b = array_ops.placeholder(np.float32, [8192])
                c = array_ops.placeholder(np.float32, [512])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                compiled = ipu.ipu_compiler.compile(my_net, inputs=[a, b, c])

            report = ReportJSON(self, sess)
            report.reset()

            feeds = {a: 1.0, b: np.ones([8192]), c: np.ones([512])}
            fetched = sess.run(compiled, feeds)

            # 1 + 1 outside the padded region, 1 + 1 + 1 inside it.
            expected = [2] * 256 + [3] * 512 + [2] * 256
            self.assertAllClose(fetched[0], expected)

            report.parse_log()
            tensor_map = report.get_tensor_map()

            # There are two fusions in the graph, zero pad and implicit
            # broadcast add. We work out which one's which by looking at
            # layouts. Prefixes are tried in this order, so 'fusion.' wins
            # over the shorter 'fusion'; the last matching tensor is kept.
            prefixes = ('fusion.', 'fusion', 'slice', 'add')
            layouts = {prefix: [] for prefix in prefixes}
            for tensor in tensor_map.all_tensors():
                matched = next(
                    (p for p in prefixes if tensor.inst.startswith(p)), None)
                if matched is not None:
                    layouts[matched] = tensor

            fusion_1_layout = layouts['fusion.']
            fusion_0_layout = layouts['fusion']
            slice_layout = layouts['slice']
            add_layout = layouts['add']

            # The slice contains 4 elements on each of 256 tiles, numbered
            # consecutively from 0.
            self.assertEqual(len(slice_layout.tiles), 256)
            for position, tile in enumerate(slice_layout.tiles):
                self.assertEqual(tile.tile, position)
                self.assertEqual(tile.num_elements, 4)

            # The broadcast add will have the same layout as the slice as it
            # should be done inplace; the other fusion is then the pad.
            if slice_layout.tiles == fusion_1_layout.tiles:
                pad_layout = fusion_0_layout
            else:
                self.assertEqual(slice_layout.tiles, fusion_0_layout.tiles)
                pad_layout = fusion_1_layout

            # The pad contains 512 elements on tile 0, followed by 128 entries
            # of 4 elements each on tiles 64 through 191.
            self.assertEqual(len(pad_layout.tiles), 129)
            for position, tile in enumerate(pad_layout.tiles):
                is_first = position == 0
                want_tile = position if is_first else 63 + position
                want_elements = 512 if is_first else 4
                self.assertEqual(tile.tile, want_tile)
                self.assertEqual(tile.num_elements, want_elements)

            # The add is done inplace
            self.assertEqual(slice_layout.tiles, add_layout.tiles)