def testCheckMaxTileSize(self):
  # Builds `a + pb + c` over (1024, 2048) float32 operands on the IPU device
  # and checks that the value returned by
  # tu.get_maximum_tile_size_from_events stays below a fixed bound, once
  # after variable initialisation and once after running the addition.
  dtype = np.float32
  shape = (1024, 2048)

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("", use_resource=True):
      a = variable_scope.get_variable(
          "a",
          shape=shape,
          initializer=init_ops.constant_initializer(2),
          dtype=dtype)
    pb = array_ops.placeholder(shape=shape, dtype=dtype, name="b")
    c = constant_op.constant(4, shape=shape, dtype=dtype, name="c")
    output = a + pb + c

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(execution_trace=False)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    result = sess.run(report)
    s = tu.extract_all_strings_from_event_trace(result)
    max_tile_size = tu.get_maximum_tile_size_from_events(s)
    # assertLess prints both operands on failure; assertTrue(a < b) would
    # only report "False is not true".
    self.assertLess(max_tile_size, 17000)

    # 2 (variable) + 1 (fed ones) + 4 (constant) == 7 everywhere.
    out = sess.run(output, {pb: np.ones(shape=shape, dtype=dtype)})
    self.assertAllClose(np.full(shape, 7, dtype=dtype), out)

    result = sess.run(report)
    s = tu.extract_all_strings_from_event_trace(result)
    max_tile_size = tu.get_maximum_tile_size_from_events(s)
    self.assertLess(max_tile_size, 40000)
def testDepthwiseConvBackpropFilter1x1WithRelu(self):
  # Depthwise-conv filter gradient followed by a ReLU, fed with zeros.
  # Checks the numeric result and passes the compute sets extracted from the
  # event trace to tu.check_all_compute_sets_and_list with the expected
  # name patterns.
  with ops.device("/device:IPU:0"):
    inputs = array_ops.placeholder(np.float32, [1, 6, 6, 3], name="a")
    # Filter sizes.
    filter_sizes = constant_op.constant([1, 1, 3, 2], dtype=np.int32)
    out_backprop = array_ops.placeholder(np.float32, [1, 6, 6, 6], name="c")
    filter_grad = nn.depthwise_conv2d_native_backprop_filter(
        inputs,
        filter_sizes,
        out_backprop,
        strides=[1, 1, 1, 1],
        padding="SAME")
    activated = nn.relu(filter_grad)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  expected_compute_sets = [
      '__seed*',
      'Copy_',
      'DepthwiseConv2dNativeBackpropFilter/fusion*/Conv_6x6',
      'Relu/custom-call*/Nonlinearity',
  ]

  with tu.ipu_session() as sess:
    # Fetch the trace once before the run under test.
    sess.run(report)

    feed = {
        inputs: np.zeros([1, 6, 6, 3]),
        out_backprop: np.zeros([1, 6, 6, 6]),
    }
    self.assertAllClose(sess.run(activated, feed), np.zeros([1, 1, 3, 2]))

    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testTopK(self):
  # Compares the indices returned by nn.top_k on the IPU device with the
  # first `topn` entries of a descending NumPy argsort of the same random
  # input, then checks that three trace entries were recorded.
  n_categories = 1200
  topn = 24

  def model(a):
    # Only the indices are compared; the values output is not needed.
    _, indices = nn.top_k(a, topn)
    return indices

  with ops.device('cpu'):
    pa = array_ops.placeholder(np.float32, [n_categories], name="a")
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    out = model(pa)

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    # Named `data` rather than `input` so the builtin is not shadowed.
    data = np.random.random(n_categories)
    expected = (-data).argsort()[:topn]

    result = sess.run(out, {pa: data})
    self.assertAllClose(result, expected)

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)
def testPrefixPathWithTranspose(self):
  # A 1x1 convolution with an all-ones kernel, transposed with permutation
  # [1, 2, 3, 0] and added to a second input; only the numeric result is
  # checked.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    z = array_ops.placeholder(np.float32, shape=[4, 4, 2, 1])
    with variable_scope.variable_scope("vs", use_resource=True):
      conv_layer = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.ones_initializer())
      y = conv_layer(x)
    res = array_ops.transpose(y, [1, 2, 3, 0]) + z

  tu.configure_ipu_system(True, True, True)

  expected = [
      [[[2.], [2.]], [[6.], [6.]], [[10.], [10.]], [[14.], [14.]]],
      [[[18.], [18.]], [[22.], [22.]], [[26.], [26.]], [[30.], [30.]]],
      [[[34.], [34.]], [[38.], [38.]], [[42.], [42.]], [[46.], [46.]]],
      [[[50.], [50.]], [[54.], [54.]], [[58.], [58.]], [[62.], [62.]]],
  ]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    feed = {
        x: np.reshape(np.arange(32), [1, 4, 4, 2]),
        z: np.ones([4, 4, 2, 1]),
    }
    self.assertAllClose(sess.run(res, feed), expected)
def testPrefixPathWithReshape(self):
  # A 1x1 convolution with an all-ones kernel, reshaped to a flat vector of
  # 32 elements and added to a second input; only the numeric result is
  # checked.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    z = array_ops.placeholder(np.float32, shape=[32])
    with variable_scope.variable_scope("vs", use_resource=True):
      conv_layer = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.ones_initializer())
      y = conv_layer(x)
    res = gen_array_ops.reshape(y, [32]) + z

  tu.configure_ipu_system(True, True, True)

  # Confirmed with values on the CPU.
  expected = [
      2., 2., 6., 6., 10., 10., 14., 14.,
      18., 18., 22., 22., 26., 26., 30., 30.,
      34., 34., 38., 38., 42., 42., 46., 46.,
      50., 50., 54., 54., 58., 58., 62., 62.,
  ]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    feed = {
        x: np.reshape(np.arange(32), [1, 4, 4, 2]),
        z: np.ones([32]),
    }
    self.assertAllClose(sess.run(res, feed), expected)
def testMaxPool(self):
  # Runs a 5x5, stride-2 NCHW max pool over an all-ones input, checks the
  # numeric result and the number of trace entries, and passes the reported
  # compute sets to tu.check_all_compute_sets_and_list.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [1, 1, 10, 10], name="a")
    c = nn.max_pool(
        pa,
        ksize=[1, 1, 5, 5],
        strides=[1, 1, 2, 2],
        data_format='NCHW',
        padding='SAME',
        name="max")

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fd = {
        pa: np.ones([1, 1, 10, 10]),
    }
    result = sess.run(c, fd)
    self.assertAllClose(result, np.ones([1, 1, 5, 5]))

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = ['__seed*', 'max/custom-call*/maxPool5x5']
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testPrefixPathWithElementwiseInPath(self):
  # A 1x1 convolution with an all-ones kernel whose output is added to a
  # second input multiplied by a scalar; only the numeric result is checked.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    z = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    scale = array_ops.placeholder(np.float32, shape=[])
    with variable_scope.variable_scope("vs", use_resource=True):
      conv_layer = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.ones_initializer())
      y = conv_layer(x)
    res = y + z * scale

  tu.configure_ipu_system(True, True, True)

  # Confirmed with values on the CPU.
  expected = [[
      [[1., 3.], [9., 11.], [17., 19.], [25., 27.]],
      [[33., 35.], [41., 43.], [49., 51.], [57., 59.]],
      [[65., 67.], [73., 75.], [81., 83.], [89., 91.]],
      [[97., 99.], [105., 107.], [113., 115.], [121., 123.]],
  ]]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    feed = {
        x: np.reshape(np.arange(32), [1, 4, 4, 2]),
        z: np.reshape(np.arange(32), [1, 4, 4, 2]),
        scale: 2.0,
    }
    self.assertAllClose(sess.run(res, feed), expected)
def testArgMax(self):
  # Compares math_ops.argmax along axis 1 on the IPU device with np.argmax
  # on the same random input, then checks that three trace entries were
  # recorded.
  batchsize = 4
  n_categories = 1200

  def model(a):
    return math_ops.argmax(a, axis=1, output_type=dtypes.int32)

  with ops.device('cpu'):
    pa = array_ops.placeholder(np.float32, [batchsize, n_categories])
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    out = model(pa)

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    # Named `data` rather than `input` so the builtin is not shadowed.
    data = np.random.rand(batchsize, n_categories)

    result = sess.run(out, {pa: data})
    self.assertAllClose(result, np.argmax(data, axis=1))

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)
def testConvBackpropFilter(self):
  # conv2d_backprop_filter fed with zeros: checks the numeric result and
  # passes the reported compute sets to tu.check_all_compute_sets_and_list
  # with the expected name patterns.
  with ops.device("/device:IPU:0"):
    inputs = array_ops.placeholder(np.float32, [2, 8, 8, 3])
    filter_sizes = constant_op.constant([2, 2, 3, 5], np.int32)
    out_backprop = array_ops.placeholder(np.float32, [2, 8, 8, 5], name="wei")
    filter_grad = nn_ops.conv2d_backprop_filter(
        inputs,
        filter_sizes,
        out_backprop,
        strides=[1, 1, 1, 1],
        padding="SAME")

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  expected_compute_sets = [
      '__seed*',
      'Copy_',
      'Conv2DBackpropFilter/convolution.*/Conv_8x8',
  ]

  with tu.ipu_session() as sess:
    # Fetch the trace once before the run under test.
    sess.run(report)

    feed = {
        inputs: np.zeros([2, 8, 8, 3]),
        out_backprop: np.zeros([2, 8, 8, 5]),
    }
    self.assertAllClose(sess.run(filter_grad, feed), np.zeros([2, 2, 3, 5]))

    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testNamedOperations(self):
  # Adds two placeholders with an explicitly named op inside a name scope
  # and expects that name ('my_ops/my_add_op') in the reported compute sets.
  with ops.device("/device:IPU:0"):
    lhs = array_ops.placeholder(np.float32, [2, 2], name="a")
    rhs = array_ops.placeholder(np.float32, [2, 2], name="b")
    with ops.name_scope('my_ops'):
      total = math_ops.add(lhs, rhs, 'my_add_op')

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  expected_compute_sets = ['__seed*', 'my_ops/my_add_op/add']

  with tu.ipu_session() as sess:
    feed = {
        lhs: [[1., 1.], [2., 3.]],
        rhs: [[0., 1.], [4., 5.]],
    }
    # Fetch the trace once before the run under test.
    sess.run(report, feed)

    self.assertAllClose(sess.run(total, feed), [[1., 2.], [6., 8.]])

    trace_strings = tu.extract_all_strings_from_event_trace(
        sess.run(report, feed))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testTuplesOfTuplesAreStreamed(self):
  # Runs a control_flow_ops.tuple of two int64 additions and expects
  # tu.extract_all_io_events to yield nothing for the run.
  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      pa = array_ops.placeholder(np.int64, [2, 2], name="a")
      pb = array_ops.placeholder(np.int64, [2, 2], name="b")
      pc = array_ops.placeholder(np.int64, [2, 2], name="c")
      sums = control_flow_ops.tuple((pa + pc, pb + pc))

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    # Fetch the trace once before the run under test.
    sess.run(report)

    feed = {
        pa: np.full((2, 2), 7),
        pb: np.full((2, 2), 6),
        pc: np.full((2, 2), 5),
    }
    out = sess.run(sums, feed)
    self.assertEqual(len(out), 2)
    expected = (np.full((2, 2), 12), np.full((2, 2), 11))
    self.assertAllClose(out, expected)

    io_events = list(tu.extract_all_io_events(sess.run(report)))
    # No io_events implies the data was streamed
    self.assertEqual(len(io_events), 0)
def testIpuEventsWithoutPoplarReporting(self):
  # With IPU events enabled but the compilation, IO and execution traces
  # disabled, three events are still expected, and the COMPILE_END and
  # EXECUTE events must carry empty reports.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [2, 2], name="a")
    pb = array_ops.placeholder(np.float32, [2, 2], name="b")
    out = math_ops.add(pa, pb)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(
      enable_ipu_events=True,
      compilation_trace=False,
      io_trace=False,
      execution_trace=False)

  with tu.ipu_session() as sess:
    fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, fd)

    sess.run(out, fd)

    rep = sess.run(report, fd)
    evts = tu.extract_all_events(rep)
    self.assertEqual(len(evts), 3)  # compile begin, compile end, execute

    for e in evts:
      # assertEqual reports the offending length when the check fails.
      if e.type == IpuTraceEvent.COMPILE_END:
        self.assertEqual(len(e.compile_end.compilation_report), 0)
      if e.type == IpuTraceEvent.EXECUTE:
        self.assertEqual(len(e.execute.execution_report), 0)

    # NOTE(review): presumably redundant with the `with` block above; kept
    # because the original test closes the session explicitly.
    sess.close()
def testDefaultTruncatedNormalInitalizer(self):
  # Initialises a [2, 4] variable with the default truncated-normal
  # initializer, loosely checks its value and looks for the expected
  # compute set names in the event trace.
  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("", use_resource=True):
      initializer = init_ops.truncated_normal_initializer()
      z = variable_scope.get_variable(
          "z1", shape=[2, 4], dtype=np.float32, initializer=initializer)

  tu.configure_ipu_system()

  expected_compute_sets = [
      '__seed*',
      'z1/Initializer/truncated_normal/TruncatedNormal/custom-call*/truncatedNormal',
  ]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    value = sess.run(z)
    # Tolerances of 2.0 around 1.0 accept any value in a wide band.
    self.assertAllClose(value, np.ones((2, 4)), 2.0, 2.0)

    # Find the names of the compute sets.
    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testUniformRandomNonScalarInitalizer(self):
  # Initialises a length-2 variable with a uniform initializer over
  # [-2.0, 2.0], loosely checks its value and looks for the expected
  # compute set names in the trace captured right after initialisation.
  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      initializer = init_ops.random_uniform_initializer(
          minval=-2.0, maxval=2.0)
      z = variable_scope.get_variable(
          "z1", shape=[2], dtype=np.float32, initializer=initializer)

  tu.configure_ipu_system()

  expected_compute_sets = [
      '__seed*',
      'vs/z1/Initializer/random_uniform/RandomUniform/fusion/uniform',
  ]

  with tu.ipu_session() as sess:
    # Clean existing reports
    sess.run(report)

    sess.run(variables.global_variables_initializer())
    # The trace is captured before the variable is read back.
    init_trace = sess.run(report)

    value = sess.run(z)
    self.assertAllClose(value, [0.0, 0.0], 2.0, 2.0)

    trace_strings = tu.extract_all_strings_from_event_trace(init_trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testScaledSubtractFrom(self):
  # Computes `pa - const * pb` in float16, checks the numeric result and the
  # number of trace entries, and expects a 'sub/fusion/AddTo' compute set.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float16, [3])
    pb = array_ops.placeholder(np.float16, [3])
    const = array_ops.constant(2.0, np.float16)
    # note how const operand index varies compared to testScaledAddTo
    # still should match as it will be reordered
    c = pa - const * pb

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fd = {pa: [2.0, 0.5, 1.0], pb: [1.0, 2.0, 3.0]}
    result = sess.run(c, fd)
    self.assertAllClose(result, [0.0, -3.5, -5.0])

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = ['__seed*', 'host-exchange-local-copy-', 'sub/fusion/AddTo']
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testRandomNormalInitalizer(self):
  # Initialises a scalar variable with a normal initializer (mean 2.0,
  # stddev 0.01), checks the value is close to 2.0 and looks for the
  # expected compute set names in the trace captured right after
  # initialisation.
  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      initializer = init_ops.random_normal_initializer(mean=2.0, stddev=0.01)
      z = variable_scope.get_variable(
          "z1", shape=[], dtype=np.float32, initializer=initializer)

  tu.configure_ipu_system()

  expected_compute_sets = [
      '__seed*',
      'vs/z1/Initializer/random_normal/RandomStandardNormal/fusion/normal',
  ]

  with tu.ipu_session() as sess:
    # Clean existing reports
    sess.run(report)

    sess.run(variables.global_variables_initializer())
    # The trace is captured before the variable is read back.
    init_trace = sess.run(report)

    value = sess.run(z)
    self.assertAllClose(value, 2.0, 0.2, 0.2)

    trace_strings = tu.extract_all_strings_from_event_trace(init_trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testBatchNormalizeLayerFusedFp16(self):
  # Fused batch normalisation on a float16 input: checks the numeric result
  # for a zero input and that no reported compute set matches the
  # blacklisted cast pattern.
  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("", use_resource=True):
      x = array_ops.placeholder(np.float16, [4, 64, 64, 4], name="a")
      normed = layers_norm.batch_normalization(x, fused=True)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  blacklist = ['*convert*/Cast*']

  with tu.ipu_session() as sess:
    # Fetch the trace once before the run under test.
    sess.run(report)
    sess.run(variables.global_variables_initializer())

    normed_value = sess.run(normed, {x: np.zeros([4, 64, 64, 4])})
    self.assertAllClose(normed_value, np.zeros([4, 64, 64, 4]))

    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_compute_sets_not_in_blacklist(compute_sets, blacklist))
def testDontOutlineInplaceExpression(self):
  # Evaluates `pa + pb - pc + pd` on scalars, checks the numeric result and
  # the number of trace entries, and expects one AddTo compute set per
  # add/sub op.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [])
    pb = array_ops.placeholder(np.float32, [])
    pc = array_ops.placeholder(np.float32, [])
    pd = array_ops.placeholder(np.float32, [])
    e = pa + pb - pc + pd

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fd = {pa: 1, pb: 2, pc: 3, pd: 4}
    result = sess.run(e, fd)
    self.assertAllClose(result, 4)

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*', 'add/add.*/AddTo', 'sub/subtract.*/AddTo',
        'add_1/add.*/AddTo'
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testSigmoidGrad(self):
  # Runs gen_math_ops.sigmoid_grad on two length-3 inputs, checks the
  # numeric result and the number of trace entries, and expects a single
  # NonLinearityGrad compute set besides the seed.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [3], name="grad")
    pb = array_ops.placeholder(np.float32, [3], name="in")
    c = gen_math_ops.sigmoid_grad(pa, pb)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fd = {pa: [2.0, 0.5, 1.0], pb: [-1.0, 1.0, 6.0]}
    result = sess.run(c, fd)
    self.assertAllClose(result, [2.0, 0.25, 0.0])

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = ['__seed*', 'SigmoidGrad/custom-call/NonLinearityGrad']
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testDepthwiseConvBackpropInput1x1(self):
  # Depthwise-conv input gradient with a 1x1 filter, fed with zeros: checks
  # the numeric result and passes the reported compute sets to
  # tu.check_all_compute_sets_and_list with the expected name patterns.
  with ops.device("/device:IPU:0"):
    # Input sizes.
    input_sizes = constant_op.constant([1, 8, 8, 3], dtype=np.int32)
    filters = array_ops.placeholder(np.float32, [1, 1, 3, 2], name="b")
    out_backprop = array_ops.placeholder(np.float32, [1, 8, 8, 6], name="c")
    input_grad = nn.depthwise_conv2d_native_backprop_input(
        input_sizes,
        filters,
        out_backprop,
        strides=[1, 1, 1, 1],
        padding="SAME")

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  expected_compute_sets = [
      '__seed*',
      'DepthwiseConv2dNativeBackpropInput/fusion*/WeightTranspose',
      'DepthwiseConv2dNativeBackpropInput/fusion*/Conv_1x1',
      'Copy_',
  ]

  with tu.ipu_session() as sess:
    # Fetch the trace once before the run under test.
    sess.run(report)

    feed = {
        filters: np.zeros([1, 1, 3, 2]),
        out_backprop: np.zeros([1, 8, 8, 6]),
    }
    self.assertAllClose(sess.run(input_grad, feed), np.zeros([1, 8, 8, 3]))

    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testSigmoidNotInplace(self):
  # Computes `sigmoid(pa) + pa`, so the input is used again after the
  # sigmoid. Checks the numeric result and the number of trace entries, and
  # expects an on-tile copy of the argument among the compute sets.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [3], name="a")
    c = math_ops.sigmoid(pa) + pa

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fd = {pa: [-6.0, 0.0, 6.0]}
    result = sess.run(c, fd)
    self.assertAllClose(result, [-5.997527, 0.5, 6.997527])

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*', 'Sigmoid/custom-call/Nonlinearity',
        'Copy_XLA_Args/arg0.*_to_Sigmoid/custom-call.clone/OnTileCopy-0',
        'add/add.*/AddTo'
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testScaledSubtractFromVariable(self):
  # Computes `pa - pc * pb` in float16 where the scale `pc` is a fed
  # placeholder of shape [1]. Checks the numeric result and the number of
  # trace entries, and expects a 'sub/fusion/AddTo' compute set.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float16, [3])
    pb = array_ops.placeholder(np.float16, [3])
    pc = array_ops.placeholder(np.float16, [1])
    c = pa - pc * pb

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fd = {pa: [2.0, 0.5, 1.0], pb: [1.0, 2.0, 3.0], pc: [2.0]}
    result = sess.run(c, fd)
    self.assertAllClose(result, [0.0, -3.5, -5.0])

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = ['__seed*', 'host-exchange-local-copy-', 'sub/fusion/AddTo']
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testCborReport(self):
  # With text reports disabled and CBOR reports enabled, the first element
  # of both the compilation report and the execution report must equal the
  # first element of bytes(bytearray([217])).
  with ops.device("/device:IPU:0"):
    lhs = array_ops.placeholder(np.float32, [2, 2], name="a")
    rhs = array_ops.placeholder(np.float32, [2, 2], name="b")
    total = math_ops.add(lhs, rhs)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(text_report=False, cbor_report=True)

  # Expected leading element of each report; built via bytes/bytearray so
  # the comparison uses whatever element type indexing `bytes` yields.
  leading = bytes(bytearray([217]))[0]

  with tu.ipu_session() as sess:
    feed = {
        lhs: [[1., 1.], [2., 3.]],
        rhs: [[0., 1.], [4., 5.]],
    }
    # Fetch the trace once before the run under test.
    sess.run(report, feed)

    sess.run(total, feed)

    events = tu.extract_all_events(sess.run(report, feed))
    self.assertEqual(len(events), 3)  # begin, end, execute

    compile_end_event, execute_event = events[1], events[2]
    self.assertEqual(compile_end_event.compile_end.compilation_report[0],
                     leading)
    self.assertEqual(execute_event.execute.execution_report[0], leading)
def testConvolutionBiasApplyVariableLR(self):
  # Two stacked 1x1 convolutions with bias, trained for one step with
  # gradient descent whose learning rate is fed through a placeholder.
  # Checks the number of trace entries and the reported compute set names.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    lr = array_ops.placeholder(np.float32, shape=[])

    with variable_scope.variable_scope("vs", use_resource=True):
      y = x
      # Two identical layers applied back to back.
      for _ in range(2):
        y = layers.Conv2D(
            2,
            1,
            use_bias=True,
            kernel_initializer=init_ops.ones_initializer())(y)

    loss = math_ops.reduce_sum(y)
    optimizer = gradient_descent.GradientDescentOptimizer(lr)
    train = optimizer.minimize(loss)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  expected_compute_sets = [
      '__seed*',
      'Copy_',
      'host-exchange-local-copy-',
      'vs/conv2d/BiasAdd/fusion*/addToChannel',
      'vs/conv2d/Conv2D/convolution*',
      'vs/conv2d_1/BiasAdd/fusion.2/addToChannel',
      'GradientDescent/update_vs/conv2d/bias/ResourceApplyGradientDescent/fusion.3/ReduceFinalStage/IntermediateToOutput/Reduce',
      'GradientDescent/update_vs/conv2d/bias/ResourceApplyGradientDescent/fusion*/negate/Op/Negate',
      'gradients/vs/conv2d_1/Conv2D_grad/Conv2DBackpropFilter/fusion*/Conv_4x4/',
      'gradients/vs/conv2d_1/Conv2D_grad/Conv2DBackpropFilter/fusion*/AddTo',
      'GradientDescent/update_vs/conv2d_1/bias/ResourceApplyGradientDescent/multiply*/Op/Multiply',
      'GradientDescent/update_vs/conv2d_1/bias/ResourceApplyGradientDescent/subtract*/AddTo',
      'vs/conv2d/BiasAdd/fusion*/addToChannel',
      'Sum/reduce*/ReduceFinalStage/IntermediateToOutput/Reduce',
  ]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    # Fetch the trace once before the training step.
    sess.run(report)

    sess.run([train, loss], {x: np.zeros([1, 4, 4, 2]), lr: 0.1})

    trace = sess.run(report)
    # 2xcompile, 1xupload, 1xload, 1xdownload, 1xexecute
    self.assertEqual(len(trace), 6)

    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testConvolutionsMatchFwdBwdWu(self):
  # Three stacked bias-free 1x1 convolutions (conv1..conv3) trained for one
  # gradient-descent step. The reported compute sets are compared with the
  # expected name patterns; the original built that list but never asserted
  # on it, so the comparison is now actually performed.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      y = x
      # Same layer configuration three times; only the name differs.
      for layer_name in ('conv1', 'conv2', 'conv3'):
        y = layers.Conv2D(
            2,
            1,
            use_bias=False,
            kernel_initializer=init_ops.ones_initializer(),
            name=layer_name)(y)

    loss = math_ops.reduce_sum(y)
    optimizer = gradient_descent.GradientDescentOptimizer(0.1)
    train = optimizer.minimize(loss)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    sess.run(report)

    sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

    result = sess.run(report)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    # Fwd and BackpropInput should be shared
    # Weight transpose for BackpropInput should be present
    # Both BackpropFilter should be shared
    ok = [
        '__seed*', 'host-exchange-local-copy-', 'Copy_',
        'vs/conv1/Conv2D/convolution.*/Conv_1x1',
        'Sum/reduce.*/ReduceOnTile/InToIntermediateNoExchange/Reduce',
        'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
        'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropInput/fusion.*/WeightTranspose',
        'gradients/vs/conv2/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
        'gradients/vs/conv2/Conv2D_grad/Conv2DBackpropFilter/fusion.*/DeltasPartialTranspose',
        'gradients/vs/conv2/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo'
    ]
    # Without this assertion the expected list above is dead code and the
    # test cannot fail on a compute-set mismatch.
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testBatchNormAndGroupNormalizeMixedInference(self):
  # conv -> group-norm inference -> conv -> fused batch norm, all under the
  # "vs" scope. Checks the reported compute set names against the expected
  # patterns.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      net = convolutional.conv2d(
          x,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer())

      # Constant normalisation statistics for the group norm.
      gamma = constant_op.constant([0.5, 0.5], np.float32)
      beta = constant_op.constant([0.5, 0.5], np.float32)
      mean = constant_op.constant([0.5, 0.5], np.float32)
      inv_std_dev = constant_op.constant([0.5, 0.5], np.float32)
      net = gen_popnn_ops.popnn_group_norm_inference(
          inputs=net,
          gamma=gamma,
          beta=beta,
          mean=mean,
          inv_std_dev=inv_std_dev,
          data_format="NHWC",
          epsilon=0.0015,
          num_groups=2)

      net = convolutional.conv2d(
          net,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer())
      net = layers_norm.batch_normalization(net, fused=True)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  # Would fail if there were two batch norms in the graph
  expected_compute_sets = [
      '__seed*',
      'host-exchange-local-copy',
      'Copy_',
      'vs/conv2d/Conv2D/convolution.*/Conv_1x1/Convolve',
      'vs/PopnnGroupNormInference/custom-call*/',
      'vs/batch_normalization/FusedBatchNorm/batch-norm-inference.*/',
  ]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    # Fetch the trace once before the run under test.
    sess.run(report)

    sess.run(net, {x: np.zeros([1, 4, 4, 2])})

    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))
def testNonModifiedResourceIsNotOverwrittenInPlaceOp(self):
  # This test verifies that if we have a resource variable (w) which is
  # marked as not modified then a copy is inserted to make sure it is not
  # overwritten between executions if it is used by an inplace op.
  w_val = [1, 2, 3, 4]
  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      w = variable_scope.get_variable(
          "w",
          shape=[4],
          dtype=np.float32,
          initializer=init_ops.constant_initializer(
              np.array(w_val, dtype=np.float32)))

    px = array_ops.placeholder(np.float32, shape=[4])
    y = w + px

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    sess.run(report)
    xs = [
        np.array([7, 3, 5, 9], dtype=np.float32),
        np.array([1, 8, 3, 4], dtype=np.float32),
        np.array([9, 2, 2, 6], dtype=np.float32)
    ]

    # If w were overwritten by the in-place add, later iterations would no
    # longer produce x + w_val.
    for x in xs:
      out = sess.run(y, {px: x})
      self.assertAllClose(out, x + w_val)

    rep = sess.run(report)
    # Materialise the events once: if the helper returns a lazy iterator,
    # filtering it a second time would see nothing and the device-to-host
    # check below would pass vacuously.
    io_evts = list(tu.extract_all_io_events(rep))

    host_to_device = [
        evt for evt in io_evts
        if evt[0] == IpuTraceEvent.HOST_TO_DEVICE_TRANSFER
    ]
    self.assertEqual(len(host_to_device), 1)

    device_to_host = [
        evt for evt in io_evts
        if evt[0] == IpuTraceEvent.DEVICE_TO_HOST_TRANSFER
    ]
    self.assertEqual(len(device_to_host), 0)

    # w should be copied to device once and that should be the only io event
    w_dl = "1.0"
    w_transfers = [evt for evt in host_to_device if evt[1] == w_dl]
    self.assertEqual(len(w_transfers), 1)
def testFwdAndBwdMaxPool(self):
  # Runs a 2x2, stride-2 max pool and its gradient in a single session call,
  # checks both numeric results and the number of trace entries, and passes
  # the reported compute sets to tu.check_all_compute_sets_and_list.
  # Named `in_data` rather than `input` so the builtin is not shadowed.
  in_data = np.arange(16).reshape(1, 4, 4, 1)
  output_grad = np.full((1, 2, 2, 1), 0.1)

  # NOTE(review): shapes and ksize look NHWC-like while data_format is
  # 'NCHW' -- confirm this is intentional.
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [1, 4, 4, 1], name="a")
    pb = array_ops.placeholder(np.float32, [1, 2, 2, 1], name="b")
    c = nn.max_pool(
        pa,
        ksize=[1, 2, 2, 1],
        strides=[1, 2, 2, 1],
        data_format='NCHW',
        padding='SAME')
    d = gen_nn_ops.max_pool_grad(
        pa,
        c,
        pb,
        ksize=[1, 2, 2, 1],
        strides=[1, 2, 2, 1],
        data_format='NCHW',
        padding='SAME')

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    fe = {
        pa: in_data,
        pb: output_grad,
    }
    output, input_grad = sess.run((c, d), fe)
    self.assertAllClose(output, [[[[5.], [7.]], [[13.], [15.]]]])
    # The incoming gradient (0.1) lands only on the positions that held the
    # maximum of each window.
    self.assertAllClose(
        input_grad, [[[[0.], [0.], [0.], [0.]],
                      [[0.], [0.1], [0.], [0.1]],
                      [[0.], [0.], [0.], [0.]],
                      [[0.], [0.1], [0.], [0.1]]]])

    result = sess.run(report)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*', 'Copy_*', 'MaxPool/custom-call*/maxPool2x2/',
        'MaxPoolGrad/custom-call*/maxPool2x2'
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testWideConstantWithAllocationTarget(self):
  # This test will fail if the dynamic slice is not mapped correctly.
  #
  # A while loop slices a (512, 2, 2048) constant twice and accumulates the
  # mean of each slice into `y`. The numeric result, the number of trace
  # entries, the compute set names and the maximum tile size are checked.
  dtype = np.float32
  shape = (512, 2, 2048)

  def my_net(y):
    # `x` and `y` are unused in cond but required by the loop signature.
    def cond(i, x, y):
      return i < 2

    def body(i, x, y):
      s = array_ops.slice(x, [i, i, i], [1, 1, 2048])
      y = y + math_ops.reduce_mean(s)
      i = i + 1
      return (i, x, y)

    i = 0
    c = constant_op.constant(4, shape=shape, dtype=dtype, name="c")
    return control_flow_ops.while_loop(cond, body, (i, c, y))[2]

  with ops.device('cpu'):
    y = array_ops.placeholder(dtype, [1])
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with ops.device("/device:IPU:0"):
    r = xla.compile(my_net, inputs=[y])

  with tu.ipu_session() as sess:
    sess.run(report)

    # Keep the placeholder name `y` intact instead of rebinding it to the
    # run result. Two iterations each add mean(4) == 4 to 10.
    out = sess.run(r, {y: [10]})
    self.assertAllClose(out[0], [18])

    result = sess.run(report)
    # assertEqual / assertLess report the actual values on failure.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*', 'Copy_*_to_*', 'while/Slice/dynamic-slice*/dynamicSlice',
        'while/Mean/reduce', 'while/Mean/multiply', 'while/add*/add*/AddTo'
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))

    max_tile_size = tu.get_maximum_tile_size_from_events(s)
    self.assertLess(max_tile_size, 60000)
def testBatchNormalizeInferenceDontMatchDifferentTypes(self):
  # conv -> fused batch norm in float32, a cast to float16, then conv ->
  # fused batch norm again. Both batch norms are expected to appear
  # separately in the reported compute sets.
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      net = convolutional.conv2d(
          x,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer())
      net = layers_norm.batch_normalization(net, fused=True)

      net = math_ops.cast(net, np.float16)

      net = convolutional.conv2d(
          net,
          2,
          1,
          use_bias=False,
          kernel_initializer=init_ops.ones_initializer())
      net = layers_norm.batch_normalization(net, fused=True)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  # Matches two convolutions
  expected_compute_sets = [
      '__seed*',
      'host-exchange-local-copy-',
      'Copy_',
      'vs/conv2d/Conv2D/convolution.*/Conv_1x1',
      'vs/batch_normalization/FusedBatchNorm/batch-norm-inference.*/',
      'vs/Cast/convert.*/Cast',
      'vs/conv2d_1/Conv2D/convolution.*/Conv_1x1',
      'vs/batch_normalization_1/FusedBatchNormV2/batch-norm-inference.*/',
  ]

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    # Fetch the trace once before the run under test.
    sess.run(report)

    sess.run(net, {x: np.zeros([1, 4, 4, 2])})

    trace_strings = tu.extract_all_strings_from_event_trace(sess.run(report))
    compute_sets = tu.get_compute_sets_from_report(trace_strings)
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets,
                                           expected_compute_sets))