Example #1
    def testDoNotCompileScalarElementWiseGraphWithParameter(self):
        with self.session() as sess:

            def my_graph(a, b):
                with ops.device("/device:IPU:0"):
                    x = math_ops.add(a, b)
                return x

            with ops.device('cpu'):
                a = array_ops.placeholder(np.int32, name="a")
                b = array_ops.placeholder(np.int32, name="b")

            out = ipu.ipu_compiler.compile(my_graph, [a, b])
            report = ReportJSON(self, sess)
            report.reset()

            fd = {a: np.int32(2), b: np.int32(3)}
            result = sess.run(out, fd)

            report.parse_log()
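            # A scalar elementwise graph should not trigger IPU compilation,
            # so no compile event is expected in the log.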
            report.assert_contains_no_compile_event()

            self.assertAllClose(result, [5])
Example #2
    def RunLayer(self, layer_func, x):
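        # NOTE: `dataType`, `batch_size` and `num_hidden` are assumed to be
        # defined elsewhere in the surrounding test file.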
        with self.session() as sess:
            with ops.device('cpu'):
                px = array_ops.placeholder(dataType, shape=x.shape)
                ph = array_ops.placeholder(dataType,
                                           shape=[batch_size, num_hidden])
                pc = array_ops.placeholder(dataType,
                                           shape=[batch_size, num_hidden])
            with ipu.scopes.ipu_scope("/device:IPU:0"):
                r = ipu.ipu_compiler.compile(layer_func, inputs=[px, ph, pc])

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())
            report.reset()
            result = sess.run(r, {
                px: x,
                ph: np.ones(ph.shape),
                pc: np.ones(pc.shape)
            })
            report.parse_log()
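            # Total tile memory serves as a proxy for the layer's on-chip
            # footprint.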
            size = report.get_total_tile_memory()
        return (size, result)
Example #3
  def testMergedWeightDownload(self):
    with self.session() as sess:
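      # NOTE: `datatype` and the `inference` model builder are assumed to be
      # defined elsewhere in the test file.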
      x = array_ops.placeholder(datatype, shape=[16, 4])
      y_ = array_ops.placeholder(datatype, shape=[16, 256])

      with ipu.scopes.ipu_scope("/device:IPU:0"):
        logits = inference(x)

        loss = math_ops.reduce_mean(
            nn_ops.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=array_ops.stop_gradient(y_)))

      report = ReportJSON(self,
                          sess,
                          compile_ipu_code=True,
                          device_count_override=1)

      sess.run(variables.global_variables_initializer())
      report.reset()

      data = np.zeros([16, 4])
      labels = np.zeros([16, 256])

      sess.run(loss, feed_dict={x: data, y_: labels})
      report.parse_log()

      # The first child of the 'Switch' program is the download-weights
      # sequence.
      download_weights_index = report.get_first_program_of_type(
          'Switch')['children'][0]

      self.assertLess(
          len(report.get_program(download_weights_index)['children']), 12,
          "The download weights sequence should not have lots of entries "
          "(because the copies will have been merged)")

      # Also check the overall size
      report.assert_total_tile_memory(8725954)
Example #4
    def testRemap(self):
        with self.session() as sess:

            def my_net(w, i):
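                # remap re-lays the tensor out across the IPU's tiles; the
                # test expects large tensors to be spread over all 1024 tiles.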
                w = ipu.ops.internal_ops.remap(w)
                i = ipu.ops.internal_ops.remap(i)
                out = array_ops.gather(w, i)
                return [out]

            with ops.device('cpu'):
                i = array_ops.placeholder(np.int32, [8])
                w = array_ops.placeholder(np.float32, [32 * 1024])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                r = ipu.ipu_compiler.compile(my_net, inputs=[w, i])

            report = ReportJSON(self, sess)
            report.reset()

            i_h = np.arange(0, 8)
            w_h = np.arange(32 * 1024)

            result = sess.run(r, {i: i_h, w: w_h})
            self.assertAllClose(result[0], np.take(w_h, i_h))

            report.parse_log()
            tm = report.get_tensor_map()

            bad_maps = []
            for tensor in tm.all_tensors():
                # Every tensor with more than 16 elements should be spread
                # across all 1024 tiles after the remap.
                if tensor.num_elements > 16 and len(tensor.tiles) != 1024:
                    bad_maps.append(tensor.inst)

            self.assertFalse(bad_maps)
Example #5
    def testNormCacheConstants(self):
        with self.session() as sess:

            def model(x, y, z):
                scale = gen_array_ops.broadcast_to(z, shape=[65536])
                offset = scale
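                # Both norms consume identical constant scale/offset and
                # statistics, so the backend can cache a single batch norm.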
                b_mean, b_var = nn.moments(x, [0, 1, 2], name='moments')
                a = nn.fused_batch_norm(x,
                                        scale,
                                        offset,
                                        b_mean,
                                        b_var,
                                        1e-3,
                                        is_training=False,
                                        name="a")
                b = nn.fused_batch_norm(y,
                                        scale,
                                        offset,
                                        b_mean,
                                        b_var,
                                        1e-3,
                                        is_training=False,
                                        name="b")

                return a[0] + b[0]

            with ops.device('cpu'):
                x = array_ops.placeholder(np.float16, [1, 1, 1, 65536],
                                          name="x")
                y = array_ops.placeholder(np.float16, [1, 1, 1, 65536],
                                          name="y")
                z = array_ops.placeholder(np.float16, shape=[1])

            with ops.device("/device:IPU:0"):
                res = ipu_compiler.compile(model, inputs=[x, y, z])

            report = ReportJSON(self, sess)
            tu.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())

            report.reset()

            r = sess.run(res, {
                x: np.ones(x.shape),
                y: np.ones(y.shape),
                z: [1.0]
            })
            self.assertAllClose(r[0], np.full(r[0].shape, 2))

            report.parse_log()

            report.assert_total_tile_memory(1634674)
            report.assert_max_tile_memory(1551)

            # Only norm 'a' should appear: the identical norm 'b' reuses its
            # compute set, so this would fail if there were two batch norms
            # in the graph.
            ok = [
                '__seed*',
                'host-exchange-local-copy',
                'Copy_',
                'moments/SquaredDifference/multiply',
                'a/batch-norm-inference',
                'add/add*/Add',
            ]
            report.assert_all_compute_sets_and_list(ok)
Example #6
    def testGroupNormsMatchFwdBwd(self):
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
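                    # Three identical conv + group-norm blocks: graph caching
                    # should emit each compute set only once.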
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')
                    gamma = constant_op.constant([0.5, 0.5], np.float32)
                    beta = constant_op.constant([0.5, 0.5], np.float32)
                    y, _, _ = gen_popnn_ops.popnn_group_norm_training(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv2')
                    y, _, _ = gen_popnn_ops.popnn_group_norm_training(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv3')
                    y, _, _ = gen_popnn_ops.popnn_group_norm_training(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)

                loss = math_ops.reduce_sum(y)
                optimizer = gradient_descent.GradientDescentOptimizer(0.1)
                train = optimizer.minimize(loss)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            report.reset()

            sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # Expect one GN compute set for the forward pass and one for the
            # grad: the three identical norms are cached and reused.
            # pylint: disable=line-too-long
            ok = [
                '__seed*',
                'Copy_',
                'vs/conv1/Conv2D/convolution*/Conv_1x1/Convolve',
                'vs/PopnnGroupNormTraining/group-norm-training*/Norm',
                'vs/PopnnGroupNormTraining/group-norm-training*/iStdDev',
                'vs/PopnnGroupNormTraining/group-norm-training*/Whiten',
                'Sum/reduce.*/*/Reduce',
                'gradients/vs/PopnnGroupNormTraining_2_grad/PopnnGroupNormGrad/group-norm-grad*/',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
            ]
            # pylint: enable=line-too-long
            report.assert_all_compute_sets_and_list(ok)
Example #7
    def testGroupNormalizeInferenceAndStatistics(self):
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer())
                    gamma = constant_op.constant([0.5, 0.5], np.float32)
                    beta = constant_op.constant([0.5, 0.5], np.float32)
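                    # The statistics op computes per-group mean and inverse
                    # std-dev, which feed the inference op below.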
                    mean, inv_std_dev = gen_popnn_ops.popnn_group_norm_statistics(
                        inputs=y,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = gen_popnn_ops.popnn_group_norm_inference(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        mean=mean,
                        inv_std_dev=inv_std_dev,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer())
                    mean, inv_std_dev = gen_popnn_ops.popnn_group_norm_statistics(
                        inputs=y,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)
                    y = gen_popnn_ops.popnn_group_norm_inference(
                        inputs=y,
                        gamma=gamma,
                        beta=beta,
                        mean=mean,
                        inv_std_dev=inv_std_dev,
                        data_format="NHWC",
                        epsilon=0.0015,
                        num_groups=2)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            report.reset()

            sess.run(y, {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # Would fail if there were two group norms in the graph (the
            # second statistics/inference pair reuses the first's compute
            # sets)
            ok = [
                '__seed*', 'Copy_',
                'vs/conv2d/Conv2D/convolution.*/Conv_1x1/Convolve',
                'vs/PopnnGroupNormStatistics/group-norm-statistics*/',
                'vs/PopnnGroupNormInference/group-norm-inference*/'
            ]
            report.assert_all_compute_sets_and_list(ok)
Example #8
    def testBatchNormsMatchFwdBwd(self):
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
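                    # Three identical conv + fused batch-norm blocks; the
                    # batch-norm forward and grad compute sets should each be
                    # emitted only once.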
                    y = convolutional.conv2d(
                        x,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv2')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)
                    y = convolutional.conv2d(
                        y,
                        2,
                        1,
                        use_bias=False,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv3')
                    y = layers_norm.batch_normalization(y,
                                                        fused=True,
                                                        training=True)

                loss = math_ops.reduce_sum(y)
                optimizer = gradient_descent.GradientDescentOptimizer(0.1)
                train = optimizer.minimize(loss)

            report = ReportJSON(self, sess)

            sess.run(variables.global_variables_initializer())

            report.reset()

            sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # One BN for forwards and one BN for grad
            # (note that we don't cache gradient application)
            # pylint: disable=line-too-long
            ok = [
                '__seed*',
                'Copy*',
                'vs/conv1/Conv2D/convolution.*/Conv_1x1',
                'vs/batch_normalization/FusedBatchNorm*/batch-norm-training.*/',
                'Sum/reduce.*/ReduceOnTile/InToIntermediateNoExchange/Reduce',
                'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
                'gradients/vs/batch_normalization_2/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
                'GradientDescent/update_vs/batch_normalization/',
                'GradientDescent/update_vs/batch_normalization_1/',
                'GradientDescent/update_vs/batch_normalization_2/',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
                'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
            ]
            # pylint: enable=line-too-long
            report.assert_all_compute_sets_and_list(ok)
Example #9
    def testCombineStreamCopies(self):
        with self.session() as sess:

            def with_outside_scope(x1, x2):
                with ipu_scope("/device:IPU:0"):
                    x1 *= 1.0
                    x2 *= 2.0
                    with outside_compilation_scope():
                        y1 = constant_op.constant(1.0, dtype=dtypes.float32)
                        y1 += x1
                        y2 = constant_op.constant(2.0, dtype=dtypes.float32)
                        y2 += x2
                    x1 += y1
                    x2 += y2
                    return x1, x2

            def without_outside_scope(x1, x2):
                with ipu_scope("/device:IPU:0"):
                    x1 *= 1.0
                    x2 *= 2.0
                    y1 = constant_op.constant(1.0, dtype=dtypes.float32)
                    y1 += x1
                    y2 = constant_op.constant(2.0, dtype=dtypes.float32)
                    y2 += x2
                    x1 += y1
                    x2 += y2
                    return x1, x2

            input1 = array_ops.placeholder(dtype=dtypes.float32, shape=(2, ))
            input2 = array_ops.placeholder(dtype=dtypes.float32, shape=(1, ))

            compiled_with_outside_scope = ipu_compiler.compile(
                with_outside_scope, inputs=[input1, input2])

            compiled_without_outside_scope = ipu_compiler.compile(
                without_outside_scope, inputs=[input1, input2])

            opts = utils.create_ipu_config(profiling=True)
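            # Assumption: max_send_recv_cluster_size caps how many bytes of
            # host send/recv copies may be merged into a single cluster.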
            opts = utils.set_optimization_options(
                opts, max_send_recv_cluster_size=12)
            utils.configure_ipu_system(opts)

            report = ReportJSON(self, sess, configure_device=False)

            def count_stream_copies(compiled_func):
                report.reset()
                out1, out2 = sess.run(compiled_func, {
                    input1: [1.0, 1.0],
                    input2: [1.0]
                })
                self.assertAllEqual(out1, [3.0, 3.0])
                self.assertAllEqual(out2, [6.0])
                report.parse_log()

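                # The main program is the second child of the top-level
                # 'Switch' program.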
                main_program_index = report.get_first_program_of_type(
                    'Switch')['children'][1]
                main_program_seq = map(
                    report.get_program,
                    report.get_program(main_program_index)['children'])
                stream_copies = [
                    p for p in main_program_seq if p['type'] == 'StreamCopy'
                ]
                return len(stream_copies)

            num_copies_without_outside_scope = count_stream_copies(
                compiled_without_outside_scope)
            num_copies_with_outside_scope = count_stream_copies(
                compiled_with_outside_scope)

            # There should be at most two new SendToHost/RecvFromHost stream copies.
            self.assertLessEqual(num_copies_with_outside_scope,
                                 num_copies_without_outside_scope + 2)
Example #10
  def testCaseVariables(self):
    with self.session() as sess:

      def my_graph(pa, pb):
        with ipu.scopes.ipu_scope("/device:IPU:0"):

          @eager_function.defun
          def b0(x, y):
            return x + y

          @eager_function.defun
          def b1(x, y):
            return x - y

          @eager_function.defun
          def b2(x, y):
            return x * y

          v = variable_scope.get_variable('b0',
                                          dtype=dtypes.float32,
                                          initializer=[1., 5.])

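          # Concrete functions give every branch the same signature, as the
          # case op requires.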
          branches = [
              f.get_concrete_function(array_ops.zeros_like(pb),
                                      array_ops.zeros_like(v))
              for f in [b0, b1, b2]
          ]

          c_out = gen_functional_ops.case(pa,
                                          input=[pb, v],
                                          Tout=[dtypes.float32],
                                          branches=branches)

          return [c_out[0]]

      with ops.device('cpu'):
        pa = array_ops.placeholder(np.int32, [], name="a")
        pb = array_ops.placeholder(np.float32, [2], name="b")

      out = ipu.ipu_compiler.compile(my_graph, [pa, pb])

      report = ReportJSON(self, sess)

      sess.run(variables_lib.global_variables_initializer())

      report.reset()

      result = sess.run(out, {pa: 0, pb: [0., 1.]})
      self.assertAllClose(result[0], [1., 6.])

      result = sess.run(out, {pa: 1, pb: [0., 1.]})
      self.assertAllClose(result[0], [-1., -4.])

      result = sess.run(out, {pa: 2, pb: [0., 1.]})
      self.assertAllClose(result[0], [0., 5.])

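      # An out-of-range branch index falls through to the last branch.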
      result = sess.run(out, {pa: 10, pb: [0., 1.]})
      self.assertAllClose(result[0], [0., 5.])

      report.parse_log()
      report.assert_contains_one_compile_event()
Example #11
    def testMappingJson(self):
        with self.session() as sess:

            def my_net(a, b, c):
                a = array_ops.broadcast_to(a, shape=[1024])
                b = array_ops.strided_slice(b, [0], [8192], [8])
                c = array_ops.pad(c, paddings=[[256, 256]])
                out = a + b + c
                return [out]

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [])
                b = array_ops.placeholder(np.float32, [8192])
                c = array_ops.placeholder(np.float32, [512])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                r = ipu.ipu_compiler.compile(my_net, inputs=[a, b, c])

            report = ReportJSON(self, sess)
            report.reset()

            fd = {a: 1.0, b: np.ones([8192]), c: np.ones([512])}
            result = sess.run(r, fd)

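            # a + b == 2 everywhere; c contributes 1 only in the unpadded
            # middle 512 elements.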
            expected = [2] * 256 + [3] * 512 + [2] * 256
            self.assertAllClose(result[0], expected)

            report.parse_log()
            tm = report.get_tensor_map()

            # There are two fusions in the graph: the zero pad and the
            # implicit broadcast add. We work out which is which by
            # comparing layouts.
            fusion_0_layout = []
            fusion_1_layout = []
            slice_layout = []
            add_layout = []
            for tensor in tm.all_tensors():
                if tensor.inst.startswith('fusion.'):
                    fusion_1_layout = tensor
                elif tensor.inst.startswith('fusion'):
                    fusion_0_layout = tensor
                elif tensor.inst.startswith('slice'):
                    slice_layout = tensor
                elif tensor.inst.startswith('add'):
                    add_layout = tensor

            # The slice contains 4 elements on 256 tiles
            self.assertEqual(len(slice_layout.tiles), 256)
            for tile_idx, tile in enumerate(slice_layout.tiles):
                self.assertEqual(tile.tile, tile_idx)
                self.assertEqual(tile.num_elements, 4)

            # The broadcast add should have the same layout as the slice,
            # since it is done in place.
            if slice_layout.tiles == fusion_1_layout.tiles:
                pad_layout = fusion_0_layout
            else:
                self.assertEqual(slice_layout.tiles, fusion_0_layout.tiles)
                pad_layout = fusion_1_layout

            # The pad places 512 elements on tile 0, and one region of
            # 4 elements on each of tiles 64-191.
            self.assertEqual(len(pad_layout.tiles), 129)
            for tile_idx, tile in enumerate(pad_layout.tiles):
                if tile_idx == 0:
                    self.assertEqual(tile.tile, tile_idx)
                    self.assertEqual(tile.num_elements, 512)
                else:
                    self.assertEqual(tile.tile, 63 + tile_idx)
                    self.assertEqual(tile.num_elements, 4)

            # The add is done inplace
            self.assertEqual(slice_layout.tiles, add_layout.tiles)