def build_and_run_model():
  """Build a 10-iteration infeed/outfeed accumulation loop and run it once.

  Uses `self`, `offline_compilation_needed` and
  `_use_offline_compilation_if_needed` from the enclosing scope.

  Returns:
    A `(res, deq, events)` tuple: the fetched loop result, the dequeued
    outfeed values, and the event trace read from the report. `res` and `deq`
    are empty lists when the run raises an `InvalidArgumentError` whose
    message contains "compilation only" while offline compilation is needed.
  """
  ones_dataset = dataset_ops.Dataset.from_tensor_slices(
      np.ones(10, dtype=np.float32))
  infeed = ipu.ipu_infeed_queue.IPUInfeedQueue(ones_dataset, "infeed")
  outfeed = ipu.ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

  def body(v, x):
    # Accumulate the infeed sample and push the running total to the outfeed.
    total = v + x
    return total, outfeed.enqueue(total)

  def my_net(v):
    return ipu.loops.repeat(10, body, v, infeed)

  start = array_ops.placeholder(np.float32, shape=())

  with ipu.scopes.ipu_scope("/device:IPU:0"):
    [loop_result] = ipu.ipu_compiler.compile(my_net, inputs=[start])

  # Only dequeue once the loop itself has run.
  with ops.control_dependencies([loop_result]):
    outfeed_values = outfeed.dequeue()

  with session.Session() as sess:
    report = ReportJSON(
        self, sess, set_opts_fn=_use_offline_compilation_if_needed)
    sess.run(infeed.initializer)

    try:
      res, deq = sess.run([loop_result, outfeed_values], {start: 0.0})
    except errors.InvalidArgumentError as e:
      compile_only = (offline_compilation_needed
                      and "compilation only" in e.message)
      if not compile_only:
        raise
      # Nothing was executed, so there are no values to report.
      res, deq = [], []

    events = report.get_event_trace(sess)
    return res, deq, events
def testGroupNormalizeInference(self):
  """Runs conv -> group norm -> conv -> group norm and checks compute sets."""
  with self.session() as sess:
    input_shape = [1, 4, 4, 2]

    def conv(inputs):
      return convolutional.conv2d(
          inputs,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer())

    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=input_shape)

      with variable_scope.variable_scope("vs", use_resource=True):
        y = conv(x)

        gamma = constant_op.constant([0.5, 0.5], np.float32)
        beta = constant_op.constant([0.5, 0.5], np.float32)
        mean = constant_op.constant([0.5, 0.5], np.float32)
        inv_std_dev = constant_op.constant([0.5, 0.5], np.float32)

        def group_norm(inputs):
          # Both norms share the same statistics and parameters.
          return gen_popnn_ops.popnn_group_norm_inference(
              inputs=inputs,
              gamma=gamma,
              beta=beta,
              mean=mean,
              inv_std_dev=inv_std_dev,
              data_format="NHWC",
              epsilon=0.0015,
              num_groups=2)

        y = group_norm(y)
        y = conv(y)
        y = group_norm(y)

    report = ReportJSON(self, sess)

    sess.run(variables.global_variables_initializer())

    report.reset()

    sess.run(y, {x: np.zeros(input_shape)})

    report.parse_log()

    # Only one convolution and one group norm pattern are listed although the
    # graph builds two of each; this would fail if there were two group norms
    # in the compiled graph.
    expected_compute_sets = [
        '__seed*',
        'Copy_',
        'vs/conv2d/Conv2D/convolution.*/Conv_1x1/Convolve',
        'vs/PopnnGroupNormInference/group-norm-inference*/',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testCaseSimple(self):
  """Runs a three-branch `case` op with several selectors, one compile."""
  with self.session() as sess:

    def my_graph(pa, pb, pc):
      with ipu.scopes.ipu_scope("/device:IPU:0"):

        @eager_function.defun
        def b0(x, y):
          return x + y

        @eager_function.defun
        def b1(x, y):
          return x - y

        @eager_function.defun
        def b2(x, y):
          return x * y

        branches = []
        for branch_fn in (b0, b1, b2):
          branches.append(
              branch_fn.get_concrete_function(array_ops.zeros_like(pb),
                                              array_ops.zeros_like(pc)))

        c_out = gen_functional_ops.case(pa,
                                        input=[pb, pc],
                                        Tout=[dtypes.float32],
                                        branches=branches)

        return [c_out[0]]

    with ops.device('cpu'):
      pa = array_ops.placeholder(np.int32, [], name="a")
      pb = array_ops.placeholder(np.float32, [2], name="b")
      pc = array_ops.placeholder(np.float32, [2], name="c")

    out = ipu.ipu_compiler.compile(my_graph, [pa, pb, pc])

    report = ReportJSON(self, sess)
    report.reset()

    # (branch selector, expected output) for b = [0, 1] and c = [1, 5].
    # Selector 10 is out of range and is expected to give the same values as
    # the last branch.
    selector_cases = [
        (0, [1., 6.]),
        (1, [-1., -4.]),
        (2, [0., 5.]),
        (10, [0., 5.]),
    ]
    for selector, expected in selector_cases:
      result = sess.run(out, {pa: selector, pb: [0., 1.], pc: [1., 5.]})
      self.assertAllClose(result[0], expected)

    report.parse_log()
    report.assert_contains_one_compile_event()
def testGather(self):
  """Gathers 256 rows of a [1024, 8] table and inspects the tensor map."""
  with self.session() as sess:

    def my_net(w, i):
      return [array_ops.gather(w, i)]

    with ops.device('cpu'):
      i = array_ops.placeholder(np.int32, [256])
      w = array_ops.placeholder(np.float32, [1024, 8])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(my_net, inputs=[w, i])

    report = ReportJSON(self, sess)
    report.reset()

    # Every third row index, and a table whose entries are their flat index.
    indices = np.arange(0, 3 * 256, 3)
    table = np.arange(8192).reshape(1024, 8)
    expected = np.take(table, indices, axis=0)

    result = sess.run(compiled, {i: indices, w: table})
    self.assertAllClose(result[0], expected)

    report.parse_log()
    tensor_map = report.get_tensor_map()

    # Collect tensors with more than 16 elements that live on a single tile
    # and are flagged by `has_contant`.
    # NOTE(review): `has_contant` looks like a misspelling of `has_constant`;
    # confirm against the tensor map class before renaming.
    bad_maps = [
        t.inst
        for t in tensor_map.all_tensors()
        if t.num_elements > 16 and len(t.tiles) == 1 and t.has_contant
    ]

    self.assertFalse(bad_maps)
def testBatchNormalizeInferenceDontMatchDifferentTypes(self):
  """Runs conv + batch norm in float32, then again after a float16 cast."""
  with self.session() as sess:
    input_shape = [1, 4, 4, 2]

    def conv(inputs):
      return convolutional.conv2d(
          inputs,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer())

    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=input_shape)

      with variable_scope.variable_scope("vs", use_resource=True):
        y = layers_norm.batch_normalization(conv(x), fused=True)
        # The second conv/norm pair works on float16 data.
        y = math_ops.cast(y, np.float16)
        y = layers_norm.batch_normalization(conv(y), fused=True)

    report = ReportJSON(self, sess)

    sess.run(variables.global_variables_initializer())

    report.reset()

    sess.run(y, {x: np.zeros(input_shape)})

    report.parse_log()

    # Matches two convolutions (and two batch norms, one per data type).
    expected_compute_sets = [
        '__seed*',
        'Copy_',
        'vs/conv2d/Conv2D/convolution.*/Conv_1x1',
        'vs/batch_normalization/FusedBatchNorm*/batch-norm-inference.*/',
        'vs/Cast/convert.*/Cast',
        'vs/conv2d_1/Conv2D/convolution.*/Conv_1x1',
        'vs/batch_normalization_1/FusedBatchNorm*/batch-norm-inference.*/',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testDoNotCompileScalarConstGraph(self):
  """Adds two Python scalars and asserts the report has no compile event."""
  with self.session() as sess:

    def my_graph(a, b):
      with ops.device("/device:IPU:0"):
        return math_ops.add(a, b)

    with ops.device('cpu'):
      lhs, rhs = 2, 3

    compiled = ipu.ipu_compiler.compile(my_graph, [lhs, rhs])
    report = ReportJSON(self, sess)
    report.reset()

    total = sess.run(compiled)

    report.parse_log()
    report.assert_contains_no_compile_event()

    self.assertEqual(total, [5])
def testDoNotCompileScalarElementWiseGraphWithParameter(self):
  """Adds two fed int32 scalars and asserts there is no compile event."""
  with self.session() as sess:

    def my_graph(a, b):
      with ops.device("/device:IPU:0"):
        return math_ops.add(a, b)

    with ops.device('cpu'):
      a = array_ops.placeholder(np.int32, name="a")
      b = array_ops.placeholder(np.int32, name="b")

    compiled = ipu.ipu_compiler.compile(my_graph, [a, b])
    report = ReportJSON(self, sess)
    report.reset()

    feeds = {a: np.int32(2), b: np.int32(3)}
    total = sess.run(compiled, feeds)

    report.parse_log()
    report.assert_contains_no_compile_event()

    self.assertAllClose(total, [5])
def testInplaceReadWrite(self):
  """Computes `a + x` and `x + y` on 100-element inputs, checks tile maps."""
  with self.session() as sess:

    def my_net(x, y, a):
      z = x + y
      c = a + x
      return c, z

    with ops.device('cpu'):
      x = array_ops.placeholder(np.int32, [100])
      y = array_ops.placeholder(np.int32, [100])
      a = array_ops.placeholder(np.int32, [100])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(my_net, inputs=[x, y, a])

    report = ReportJSON(self, sess)
    report.reset()

    x_val = np.full(100, 1)
    y_val = np.full(100, 2)
    a_val = np.full(100, 10)
    expect_c = np.full(100, 11)
    expect_z = np.full(100, 3)

    got_c, got_z = sess.run(compiled, {x: x_val, y: y_val, a: a_val})
    self.assertAllClose(got_c, expect_c)
    self.assertAllClose(got_z, expect_z)

    report.parse_log()
    tensor_map = report.get_tensor_map()

    # A tensor is acceptable only if it has exactly 100 elements and is
    # spread over more than one tile; anything else is recorded as bad.
    bad_maps = [
        t.inst
        for t in tensor_map.all_tensors()
        if not (t.num_elements == 100 and len(t.tiles) > 1)
    ]

    self.assertFalse(bad_maps)
def testNormCacheConstants(self):
  """Runs two inference batch norms that share scale, offset and moments."""
  with self.session() as sess:

    def model(x, y, z):
      # One broadcast tensor serves as both scale and offset.
      scale = gen_array_ops.broadcast_to(z, shape=[65536])
      b_mean, b_var = nn.moments(x, [0, 1, 2], name='moments')

      def inference_norm(inputs, name):
        return nn.fused_batch_norm(inputs,
                                   scale,
                                   scale,
                                   b_mean,
                                   b_var,
                                   1e-3,
                                   is_training=False,
                                   name=name)

      a = inference_norm(x, "a")
      b = inference_norm(y, "b")

      return a[0] + b[0]

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float16, [1, 1, 1, 65536], name="x")
      y = array_ops.placeholder(np.float16, [1, 1, 1, 65536], name="y")
      z = array_ops.placeholder(np.float16, shape=[1])

    with ops.device("/device:IPU:0"):
      compiled = ipu_compiler.compile(model, inputs=[x, y, z])

    report = ReportJSON(self, sess)
    tu.move_variable_initialization_to_cpu()

    sess.run(variables.global_variables_initializer())

    report.reset()

    feeds = {x: np.ones(x.shape), y: np.ones(y.shape), z: [1.0]}
    outputs = sess.run(compiled, feeds)
    first = outputs[0]
    self.assertAllClose(first, np.full(first.shape, 2))

    report.parse_log()

    report.assert_total_tile_memory(1634674)
    report.assert_max_tile_memory(1551)

    # Would fail if there were two batch norms in the graph: only the "a"
    # norm is listed.
    expected_compute_sets = [
        '__seed*',
        'host-exchange-local-copy',
        'Copy_',
        'moments/SquaredDifference/multiply',
        'a/batch-norm-inference',
        'add/add*/Add',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testGroupNormsMatchFwdBwd(self):
  """Trains three conv + group-norm stages and checks the compute sets."""
  with self.session() as sess:
    input_shape = [1, 4, 4, 2]

    def conv(inputs, name):
      return convolutional.conv2d(
          inputs,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer(),
          name=name)

    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=input_shape)

      with variable_scope.variable_scope("vs", use_resource=True):
        y = conv(x, 'conv1')

        gamma = constant_op.constant([0.5, 0.5], np.float32)
        beta = constant_op.constant([0.5, 0.5], np.float32)

        def group_norm(inputs):
          # Only the first of the op's three outputs is used downstream.
          normalized, _, _ = gen_popnn_ops.popnn_group_norm_training(
              inputs=inputs,
              gamma=gamma,
              beta=beta,
              data_format="NHWC",
              epsilon=0.0015,
              num_groups=2)
          return normalized

        y = group_norm(y)
        for conv_name in ('conv2', 'conv3'):
          y = group_norm(conv(y, conv_name))

      loss = math_ops.reduce_sum(y)
      train = gradient_descent.GradientDescentOptimizer(0.1).minimize(loss)

    report = ReportJSON(self, sess)

    sess.run(variables.global_variables_initializer())

    report.reset()

    sess.run([train, loss], {x: np.zeros(input_shape)})

    report.parse_log()

    # One GN for forwards and one GN for grad
    # pylint: disable=line-too-long
    expected_compute_sets = [
        '__seed*',
        'Copy_',
        'vs/conv1/Conv2D/convolution*/Conv_1x1/Convolve',
        'vs/PopnnGroupNormTraining/group-norm-training*/Norm',
        'vs/PopnnGroupNormTraining/group-norm-training*/iStdDev',
        'vs/PopnnGroupNormTraining/group-norm-training*/Whiten',
        'Sum/reduce.*/*/Reduce',
        'gradients/vs/PopnnGroupNormTraining_2_grad/PopnnGroupNormGrad/group-norm-grad*/',
        'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*',
        'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
    ]
    # pylint: enable=line-too-long

    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testBatchNormsMatchFwdBwd(self):
  """Trains three conv + batch-norm stages and checks the compute sets."""
  with self.session() as sess:
    input_shape = [1, 4, 4, 2]

    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=input_shape)

      with variable_scope.variable_scope("vs", use_resource=True):
        y = x
        for conv_name in ('conv1', 'conv2', 'conv3'):
          y = convolutional.conv2d(
              y,
              2,
              1,
              use_bias=False,
              kernel_initializer=init_ops.ones_initializer(),
              name=conv_name)
          y = layers_norm.batch_normalization(y, fused=True, training=True)

      loss = math_ops.reduce_sum(y)
      train = gradient_descent.GradientDescentOptimizer(0.1).minimize(loss)

    report = ReportJSON(self, sess)

    sess.run(variables.global_variables_initializer())

    report.reset()

    sess.run([train, loss], {x: np.zeros(input_shape)})

    report.parse_log()

    # One BN for forwards and one BN for grad
    # (note that we don't cache gradient application)
    # pylint: disable=line-too-long
    expected_compute_sets = [
        '__seed*',
        'Copy*',
        'vs/conv1/Conv2D/convolution.*/Conv_1x1',
        'vs/batch_normalization/FusedBatchNorm*/batch-norm-training.*/',
        'Sum/reduce.*/ReduceOnTile/InToIntermediateNoExchange/Reduce',
        'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
        'gradients/vs/batch_normalization_2/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
        'GradientDescent/update_vs/batch_normalization/',
        'GradientDescent/update_vs/batch_normalization_1/',
        'GradientDescent/update_vs/batch_normalization_2/',
        'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
        'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
        'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
        'gradients/vs/conv*/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
    ]
    # pylint: enable=line-too-long

    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testCombineStreamCopies(self):
  """Compares StreamCopy counts with and without an outside-compilation scope."""
  with self.session() as sess:

    def with_outside_scope(x1, x2):
      with ipu_scope("/device:IPU:0"):
        x1 = x1 * 1.0
        x2 = x2 * 2.0

        # The y1/y2 computations are built inside the outside-compilation scope.
        with outside_compilation_scope():
          y1 = constant_op.constant(1.0, dtype=dtypes.float32)
          y1 = y1 + x1
          y2 = constant_op.constant(2.0, dtype=dtypes.float32)
          y2 = y2 + x2

        x1 = x1 + y1
        x2 = x2 + y2

        return x1, x2

    def without_outside_scope(x1, x2):
      with ipu_scope("/device:IPU:0"):
        x1 = x1 * 1.0
        x2 = x2 * 2.0

        y1 = constant_op.constant(1.0, dtype=dtypes.float32)
        y1 = y1 + x1
        y2 = constant_op.constant(2.0, dtype=dtypes.float32)
        y2 = y2 + x2

        x1 = x1 + y1
        x2 = x2 + y2

        return x1, x2

    input1 = array_ops.placeholder(dtype=dtypes.float32, shape=(2,))
    input2 = array_ops.placeholder(dtype=dtypes.float32, shape=(1,))

    compiled_with_outside_scope = ipu_compiler.compile(
        with_outside_scope, inputs=[input1, input2])

    compiled_without_outside_scope = ipu_compiler.compile(
        without_outside_scope, inputs=[input1, input2])

    config = utils.create_ipu_config(profiling=True)
    config = utils.set_optimization_options(config,
                                            max_send_recv_cluster_size=12)
    utils.configure_ipu_system(config)

    # The device was configured above, so the report must not do it again.
    report = ReportJSON(self, sess, configure_device=False)

    def count_stream_copies(compiled_func):
      """Run `compiled_func` and count StreamCopy programs in its main sequence."""
      report.reset()
      feeds = {input1: [1.0, 1.0], input2: [1.0]}
      out1, out2 = sess.run(compiled_func, feeds)
      self.assertAllEqual(out1, [3.0, 3.0])
      self.assertAllEqual(out2, [6.0])
      report.parse_log()

      # The main program is taken to be the second child of the first Switch.
      switch_program = report.get_first_program_of_type('Switch')
      main_program_index = switch_program['children'][1]
      main_program = report.get_program(main_program_index)

      copies = 0
      for child_index in main_program['children']:
        if report.get_program(child_index)['type'] == 'StreamCopy':
          copies += 1
      return copies

    num_copies_without_outside_scope = count_stream_copies(
        compiled_without_outside_scope)

    num_copies_with_outside_scope = count_stream_copies(
        compiled_with_outside_scope)

    # There should be at most two new SendToHost/RecvFromHost stream copies.
    self.assertLessEqual(num_copies_with_outside_scope,
                         num_copies_without_outside_scope + 2)
def testMappingJson(self):
  """Checks the tile layouts reported for a broadcast + slice + pad sum.

  The network adds a broadcast scalar, a strided slice and a padded vector,
  all of length 1024, then inspects the tensor map from the report for the
  slice, the two fusions and the add.
  """
  with self.session() as sess:

    def my_net(a, b, c):
      a = array_ops.broadcast_to(a, shape=[1024])
      b = array_ops.strided_slice(b, [0], [8192], [8])
      c = array_ops.pad(c, paddings=[[256, 256]])
      out = a + b + c
      return [out]

    with ops.device('cpu'):
      a = array_ops.placeholder(np.float32, [])
      b = array_ops.placeholder(np.float32, [8192])
      c = array_ops.placeholder(np.float32, [512])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      r = ipu.ipu_compiler.compile(my_net, inputs=[a, b, c])

    report = ReportJSON(self, sess)
    report.reset()

    fd = {a: 1.0, b: np.ones([8192]), c: np.ones([512])}
    result = sess.run(r, fd)

    # 1 (broadcast) + 1 (slice) everywhere, plus 1 in the unpadded middle.
    expected = [2] * 256 + [3] * 512 + [2] * 256
    self.assertAllClose(result[0], expected)

    report.parse_log()
    tm = report.get_tensor_map()

    # There are two fusions in the graph, zero pad and implicit
    # broadcast add. We work out which one's which by looking at
    # layouts.
    fusion_0_layout = None
    fusion_1_layout = None
    slice_layout = None
    add_layout = None
    for tensor in tm.all_tensors():
      # 'fusion.' must be tested before the bare 'fusion' prefix.
      if tensor.inst.startswith('fusion.'):
        fusion_1_layout = tensor
      elif tensor.inst.startswith('fusion'):
        fusion_0_layout = tensor
      elif tensor.inst.startswith('slice'):
        slice_layout = tensor
      elif tensor.inst.startswith('add'):
        add_layout = tensor

    # Every layout is dereferenced below, so fail with a readable message
    # instead of an AttributeError when one of them is missing.
    found_layouts = (
        ('fusion', fusion_0_layout),
        ('fusion.', fusion_1_layout),
        ('slice', slice_layout),
        ('add', add_layout),
    )
    for prefix, layout in found_layouts:
      self.assertIsNotNone(
          layout,
          "No tensor whose instruction starts with '%s' in the tensor map" %
          prefix)

    # The slice contains 4 elements on 256 tiles
    self.assertEqual(len(slice_layout.tiles), 256)
    for tile_idx, tile in enumerate(slice_layout.tiles):
      self.assertEqual(tile.tile, tile_idx)
      self.assertEqual(tile.num_elements, 4)

    # The broadcast add will have the same layout as the slice as it
    # should be done inplace.
    if slice_layout.tiles == fusion_1_layout.tiles:
      pad_layout = fusion_0_layout
    else:
      self.assertEqual(slice_layout.tiles, fusion_0_layout.tiles)
      pad_layout = fusion_1_layout

    # The pad contains 512 elements on tile 0,
    # and one region with 4 elements on each of tiles 64 to 191
    self.assertEqual(len(pad_layout.tiles), 129)
    for tile_idx, tile in enumerate(pad_layout.tiles):
      if tile_idx == 0:
        self.assertEqual(tile.tile, tile_idx)
        self.assertEqual(tile.num_elements, 512)
      else:
        self.assertEqual(tile.tile, 63 + tile_idx)
        self.assertEqual(tile.num_elements, 4)

    # The add is done inplace
    self.assertEqual(slice_layout.tiles, add_layout.tiles)