def get_config(opts):
  """Build the IPU session options from parsed command-line opts.

  Profiling (with a report after every execution) is toggled as a whole
  by ``opts.report``.
  """
  profile = opts.report
  config = utils.create_ipu_config(profiling=profile,
                                   profile_execution=profile,
                                   report_every_nth_execution=1)

  # Either auto-select a pool of devices or pin to an explicit device id.
  if opts.device_id == -1:
    config = utils.auto_select_ipus(config, opts.shards * opts.replicas)
  else:
    config = utils.select_ipus(config, [opts.device_id])

  # Optional per-op Poplar option overrides, supplied as JSON strings.
  if opts.convolution_options:
    config = utils.set_convolution_options(config,
                                           json.loads(opts.convolution_options))
  if opts.matmul_options:
    config = utils.set_matmul_options(config, json.loads(opts.matmul_options))

  if opts.enable_half_partials:
    config = utils.set_matmul_options(config, {"partialsType": 'half'})
    config = utils.set_convolution_options(config, {"partialsType": 'half'})
  return config
def testCrossReplicaAndStatefulGradientAccumulate(self):
  """Cross-replica sum feeding a stateful gradient accumulator in a loop."""
  with self.session() as sess:
    dtype = np.float32

    def my_net(y):

      def cond(i, y):
        del y
        return i < 10

      def body(i, y):
        # Sum ones across the replicas, then accumulate over mini-batches.
        summed = gen_popops_ops.ipu_cross_replica_sum(array_ops.ones_like(y))
        accumulated = gen_poputil_ops.ipu_stateful_gradient_accumulate(
            summed, num_mini_batches=5)
        return (i + 1, y + accumulated)

      return control_flow_ops.while_loop(cond, body, (0, y))

    with ops.device('cpu'):
      y = array_ops.placeholder(dtype, [1])

    opts = utils.create_ipu_config()
    opts = utils.auto_select_ipus(opts, num_ipus=2)
    utils.configure_ipu_system(opts)

    with ops.device("/device:IPU:0"):
      r = xla.compile(my_net, inputs=[y])

    y = sess.run(r, {y: [10]})
    self.assertEqual(y[0], 10)
    self.assertAllEqual(y[1], [30])
def testCborReport(self):
  """Reports requested in CBOR format should begin with byte 217 (0xD9)."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      pa = array_ops.placeholder(np.float32, [2, 2], name="a")
      pb = array_ops.placeholder(np.float32, [2, 2], name="b")
      out = math_ops.add(pa, pb)

    with ops.device('cpu'):
      report = gen_ipu_ops.ipu_event_trace()

    opts = utils.create_ipu_config(profiling=True,
                                   profile_execution=True,
                                   use_poplar_text_report=False,
                                   use_poplar_cbor_report=True)
    utils.configure_ipu_system(opts)

    feeds = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, feeds)  # drain any queued events
    sess.run(out, feeds)
    raw = sess.run(report, feeds)

    events = utils.extract_all_events(raw)
    self.assertEqual(len(events), 4)  # engine, begin, end, execute

    cbor_first_byte = bytes(bytearray([217]))[0]
    self.assertEqual(events[1].compile_end.compilation_report[0],
                     cbor_first_byte)
    self.assertEqual(events[3].execute.execution_report[0], cbor_first_byte)
def testIpuModelDeviceWithMultipleReport(self):
  """Two distinct computations should each produce their own trace events."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      pa = array_ops.placeholder(np.float32, [2, 2], name="a")
      pb = array_ops.placeholder(np.float32, [2, 2], name="b")
      out1 = pa + pb
      out2 = pa - pb

    with ops.device('cpu'):
      with ops.control_dependencies([out1, out2]):
        report = gen_ipu_ops.ipu_event_trace()

    opts = utils.create_ipu_config(profiling=True, profile_execution=True)
    utils.configure_ipu_system(opts)

    feeds = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, feeds)  # drain any queued events

    result = sess.run(out1, feeds)
    self.assertAllClose(result, [[1., 2.], [6., 8.]])

    result, trace = sess.run([out2, report], feeds)
    self.assertAllClose(result, [[1., 0.], [-2., -2.]])

    # 2x engine, 2x compile_begin, 2x compile_end, 2x load engine
    self.assertEqual(len(trace), 8)
def testPrefixPathWithTranspose(self):
  """Conv output transposed before an elementwise add with a placeholder."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      z = array_ops.placeholder(np.float32, shape=[4, 4, 2, 1])
      with variable_scope.variable_scope("vs", use_resource=True):
        y = layers.Conv2D(
            2,
            1,
            use_bias=True,
            kernel_initializer=init_ops.ones_initializer())(x)
      res = array_ops.transpose(y, [1, 2, 3, 0]) + z

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    sess.run(variables.global_variables_initializer())
    result = sess.run(
        res, {
            x: np.reshape(np.arange(32), [1, 4, 4, 2]),
            z: np.ones([4, 4, 2, 1])
        })
    self.assertAllClose(
        result,
        [[[[2.], [2.]], [[6.], [6.]], [[10.], [10.]], [[14.], [14.]]],
         [[[18.], [18.]], [[22.], [22.]], [[26.], [26.]], [[30.], [30.]]],
         [[[34.], [34.]], [[38.], [38.]], [[42.], [42.]], [[46.], [46.]]],
         [[[50.], [50.]], [[54.], [54.]], [[58.], [58.]], [[62.], [62.]]]])
def get_config(report_n=1):
  """Build IPU options: profiling off, reporting every ``report_n`` runs."""
  config = utils.create_ipu_config(profiling=False,
                                   use_poplar_text_report=False,
                                   report_every_nth_execution=report_n)
  return utils.auto_select_ipus(config, [1])
def testIoTilesAreExcludedFromShard(self):
  """Tensors must only be mapped onto compute tiles, never the IO tiles."""

  def my_net(a, b):
    with ipu_shard(0):
      aa = math_ops.matmul(a, a, transpose_b=True, name="aa")
    with ipu_shard(1):
      bb = math_ops.matmul(b, b, transpose_b=True, name="bb")
    return aa, bb

  input_a = array_ops.placeholder(np.float32, [1216, 1])
  input_b = array_ops.placeholder(np.float32, [1216, 1])

  with ops.device("/device:IPU:0"):
    compiled_net = ipu_compiler.compile(my_net, inputs=[input_a, input_b])

  num_io_tiles = 128
  cfg = ipu_utils.create_ipu_config(profiling=True)
  cfg = ipu_utils.set_gcl_options(cfg, num_io_tiles=num_io_tiles)
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=2)
  ipu_utils.configure_ipu_system(cfg)

  with session.Session() as sess:
    report = ReportJSON(self, sess, configure_device=False)
    report.reset()

    sess.run(
        compiled_net, {
            input_a: np.ones(input_a.shape),
            input_b: np.ones(input_b.shape)
        })

    report.parse_log()
    num_compute_tiles = report.get_num_tiles_per_ipu() - num_io_tiles
    # Every tensor must fit entirely within the compute-tile range.
    for t in report.get_tensor_map().all_tensors():
      self.assertLessEqual(len(t.tiles), num_compute_tiles)
def testPrefixPathWithReshape(self):
  """Conv output reshaped to rank 1 before an elementwise add."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      z = array_ops.placeholder(np.float32, shape=[32])
      with variable_scope.variable_scope("vs", use_resource=True):
        y = layers.Conv2D(
            2,
            1,
            use_bias=True,
            kernel_initializer=init_ops.ones_initializer())(x)
      res = gen_array_ops.reshape(y, [32]) + z

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    sess.run(variables.global_variables_initializer())
    result = sess.run(res, {
        x: np.reshape(np.arange(32), [1, 4, 4, 2]),
        z: np.ones([32])
    })
    # Confirmed with values on the CPU.
    self.assertAllClose(result, [
        2., 2., 6., 6., 10., 10., 14., 14., 18., 18., 22., 22., 26., 26.,
        30., 30., 34., 34., 38., 38., 42., 42., 46., 46., 50., 50., 54., 54.,
        58., 58., 62., 62.
    ])
def testStatefulGradientAccumulate(self):
  """Accumulator flushes every 5 mini-batches; plain add runs every step."""
  with self.session() as sess:
    dtype = np.float32

    def my_net(y):

      def cond(i, x, y):
        del x
        del y
        return i < 10

      def body(i, x, y):
        # x only receives the accumulated value on flush iterations,
        # y is incremented unconditionally every iteration.
        x = x + gen_poputil_ops.ipu_stateful_gradient_accumulate(
            array_ops.ones_like(x), num_mini_batches=5, verify_usage=False)
        y = y + array_ops.ones_like(x)
        return (i + 1, x, y)

      return control_flow_ops.while_loop(cond, body, (0, y, y))

    with ops.device('cpu'):
      y = array_ops.placeholder(dtype, [1])

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    with ops.device("/device:IPU:0"):
      r = xla.compile(my_net, inputs=[y])

    y = sess.run(r, {y: [10]})
    self.assertEqual(y[0], 10)
    self.assertAllEqual(y[1], [20])
    self.assertAllEqual(y[2], [20])
def testStatefulGradientAccumulateInvalidUse(self):
  """Without verify_usage=False this accumulator pattern must be rejected."""
  with self.session() as sess:
    dtype = np.float32

    def my_net(y):

      def cond(i, x, y):
        del x
        del y
        return i < 10

      def body(i, x, y):
        x = x + gen_poputil_ops.ipu_stateful_gradient_accumulate(
            array_ops.ones_like(x), num_mini_batches=5)
        y = y + array_ops.ones_like(x)
        return (i + 1, x, y)

      return control_flow_ops.while_loop(cond, body, (0, y, y))

    with ops.device('cpu'):
      y = array_ops.placeholder(dtype, [1])

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    with ops.device("/device:IPU:0"):
      r = xla.compile(my_net, inputs=[y])

    with self.assertRaisesRegex(errors.FailedPreconditionError,
                                "The .*IpuStatefulGradientAccumulate op"):
      sess.run(r, {y: [10]})
def testTrainReplicated(self):
  """Replicated training: loss is the cross-replica sum, averaged per loop."""
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_model_fn(features, labels, mode):  # pylint: disable=unused-argument
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
    loss = ipu.ops.cross_replica_ops.cross_replica_sum(features, name="loss")
    train_op = array_ops.identity(loss)
    return model_fn_lib.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op)

  def my_input_fn():
    dataset = tu.create_dual_increasing_dataset(10,
                                                data_shape=[1],
                                                label_shape=[1])
    return dataset.batch(batch_size=1, drop_remainder=True)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options),
      log_step_count_steps=1,
      save_summary_steps=1)

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
  session_run_counter = _SessionRunCounter()

  num_steps = 6
  estimator.train(input_fn=my_input_fn,
                  steps=num_steps,
                  hooks=[session_run_counter])

  # One session run per loop of `iterations_per_loop` steps.
  self.assertEqual(session_run_counter.num_session_runs,
                   num_steps // config.ipu_run_config.iterations_per_loop)

  model_dir = estimator.model_dir
  events_file = glob.glob(model_dir + "/*tfevents*")
  assert len(events_file) == 1
  events_file = events_file[0]
  loss_output = list()
  for e in summary_iterator.summary_iterator(events_file):
    for v in e.summary.value:
      if "loss" in v.tag:
        loss_output.append(v.simple_value)

  # loss is averaged across iterations per loop
  self.assertEqual(loss_output, [14.0, 16.0, 18.0])
def testNumUniqueDevicesBelowNumShardsRange(self):
  """num_shards larger than the devices the pipeline needs must raise."""

  def model_fn_with_zero_stages(mode):

    def optimizer_function():
      pass

    # device_mapping uses only 2 unique devices, yet num_shards is 4.
    return IPUPipelineEstimatorSpec(mode,
                                    computational_stages=[],
                                    gradient_accumulation_count=1,
                                    device_mapping=[0, 1, 0],
                                    optimizer_function=optimizer_function)

  def my_input_fn():
    return dataset_ops.Dataset.from_tensor_slices(([0], [0]))

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(num_shards=4,
                                                 iterations_per_loop=1,
                                                 ipu_options=ipu_options))

  estimator = IPUPipelineEstimator(model_fn=model_fn_with_zero_stages,
                                   config=config)

  with self.assertRaisesRegex(
      ValueError, r"This pipeline requires 2 devices, but "
      "`IPURunConfig.num_shards` was set to 4"):
    estimator.train(input_fn=my_input_fn, steps=1)
def testPrefixPathWithElementwiseInPath(self):
  """Conv output combined with a scaled elementwise term on the path."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      z = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      s = array_ops.placeholder(np.float32, shape=[])
      with variable_scope.variable_scope("vs", use_resource=True):
        y = layers.Conv2D(
            2,
            1,
            use_bias=True,
            kernel_initializer=init_ops.ones_initializer())(x)
      res = y + z * s

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    sess.run(variables.global_variables_initializer())
    result = sess.run(
        res, {
            x: np.reshape(np.arange(32), [1, 4, 4, 2]),
            z: np.reshape(np.arange(32), [1, 4, 4, 2]),
            s: 2.0
        })
    # Confirmed with values on the CPU.
    self.assertAllClose(
        result,
        [[[[1., 3.], [9., 11.], [17., 19.], [25., 27.]],
          [[33., 35.], [41., 43.], [49., 51.], [57., 59.]],
          [[65., 67.], [73., 75.], [81., 83.], [89., 91.]],
          [[97., 99.], [105., 107.], [113., 115.], [121., 123.]]]])
def testIpuEventsWithoutPoplarReporting(self):
  """With profiling off but events on, traces exist yet carry no reports."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      pa = array_ops.placeholder(np.float32, [2, 2], name="a")
      pb = array_ops.placeholder(np.float32, [2, 2], name="b")
      out = math_ops.add(pa, pb)

    with ops.device('cpu'):
      report = gen_ipu_ops.ipu_event_trace()

    opts = utils.create_ipu_config(profiling=False, enable_ipu_events=True)
    utils.configure_ipu_system(opts)

    feeds = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, feeds)  # drain any queued events
    sess.run(out, feeds)
    raw = sess.run(report, feeds)

    events = utils.extract_all_events(raw)
    self.assertEqual(len(events), 3)  # compile begin, compile end, execute

    # Events are emitted, but the Poplar report payloads must be empty.
    for e in events:
      if e.type == IpuTraceEvent.COMPILE_END:
        self.assertFalse(e.compile_end.compilation_report)
      if e.type == IpuTraceEvent.EXECUTE:
        self.assertFalse(e.execute.execution_report)

    sess.close()
def testSendScalar(self, dtype):
  """A scalar sent from the IPU is received unchanged on the CPU host."""
  with self.session() as sess:

    def device_fn(x):
      return gen_sendrecv_ops.ipu_send_to_host(
          x,
          tensor_name="test_tensor",
          send_device="/device:IPU:0",
          send_device_incarnation=0,
          recv_device="/device:CPU:0")

    inputs = array_ops.placeholder(dtype=dtype, shape=())

    with ipu_scope("/device:IPU:0"):
      send_op = ipu_compiler.compile(device_fn, inputs=[inputs])

    with ops.device("/device:CPU:0"):
      recv_op = gen_sendrecv_ops.ipu_recv_at_host(
          T=dtype,
          tensor_name="test_tensor",
          send_device="/device:IPU:0",
          send_device_incarnation=0,
          recv_device="/device:CPU:0")

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    sent, received = sess.run([send_op, recv_op], feed_dict={inputs: 1})
    self.assertIsNone(sent)  # Send op has no output
    self.assertEqual(dtype, received.dtype)
    self.assertEqual(0, len(received.shape))
    self.assertEqual(1, received)
def test_ipu_horovod_strategy(self):
  """Horovod strategy: broadcast initial value, then SUM/MEAN reductions."""
  hvd_size = hvd.size()
  hvd_rank = hvd.rank()

  strategy = IPUHorovodStrategy()
  self.assertEqual(strategy.num_replicas_in_sync, hvd_size)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with strategy.scope():

    def per_replica_fn():
      # Initializer differs per rank; the strategy should broadcast rank 0's.
      w = variable_scope.get_variable(name="w", initializer=hvd_rank + 1.0)
      self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
      return w * w

    per_replica_val = strategy.experimental_run_v2(per_replica_fn)
    strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
    strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())

      # All workers should have the initial value from the first worker.
      self.assertEqual([1.0], sess.run(variables.global_variables()))
      self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
      self.assertEqual(1.0, strategy_mean.eval())
def testVectorInputOutput(self):
  """Vector round-trips through an outside-compilation float64 section."""
  with self.session() as sess:

    def device_fn(x):
      with ipu_scope("/device:IPU:0"):
        x = x + x
        with outside_compilation_scope():
          # Use float64 which is not supported on IPU
          x = math_ops.cast(x, dtype=dtypes.float64)
          c = constant_op.constant(2.0, dtype=dtypes.float64, shape=(2,))
          x += c
          x = math_ops.cast(x, dtype=dtypes.float32)
        x = x + 2.0
      return x

    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=(2,))
    [device_out] = ipu_compiler.compile(device_fn, inputs=[inputs])

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    result = sess.run(device_out, feed_dict={inputs: [1.0, 2.0]})
    self.assertEqual((2,), result.shape)
    self.assertAllEqual([6.0, 8.0], result)
def testSentTensorIsUsedAfterReceive(self):
  """A tensor sent out of the IPU must stay valid for later IPU use."""
  with self.session() as sess:

    def device_fn(x):
      with ipu_scope("/device:IPU:0"):
        x *= x  # 4
        with outside_compilation_scope():
          y = x + 1.0  # 5
        # Use `x` after receiving `y` and make sure that we still have the
        # correct value of `x` (i.e. it is not overwritten by the receive,
        # in which case we would get 25).
        z = x * y  # 20
        return z

    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=())
    [out] = ipu_compiler.compile(device_fn, inputs=[inputs])

    opts = utils.create_ipu_config()
    utils.configure_ipu_system(opts)

    res = sess.run(out, feed_dict={inputs: 2.0})
    self.assertEqual(20.0, res)
def testTwoInputsTwoOutputs(self):
  """Two values each pass through one outside-compilation round trip."""
  with self.session() as sess:

    def device_fn(x1, x2):
      with ipu_scope("/device:IPU:0"):
        x1 *= x1
        x2 *= x2
        with outside_compilation_scope():
          x1 += 1.0
          x2 += 2.0
        x1 *= 1.0
        x2 *= 2.0
      return x1, x2

    input1 = array_ops.placeholder(dtype=dtypes.float32, shape=())
    input2 = array_ops.placeholder(dtype=dtypes.float32, shape=())
    out1, out2 = ipu_compiler.compile(device_fn, inputs=[input1, input2])

    opts = utils.create_ipu_config()
    opts = utils.set_optimization_options(opts, max_send_recv_cluster_size=8)
    utils.configure_ipu_system(opts)

    res1, res2 = sess.run([out1, out2],
                          feed_dict={
                              input1: 1.0,
                              input2: 2.0
                          })
    self.assertEqual(2.0, res1)
    self.assertEqual(12.0, res2)
def testReportEveryNthExecution_Every1(self):
  """report_every_nth_execution=1 yields one report for each execution."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      pa = array_ops.placeholder(np.float32, [2, 2], name="a")
      pb = array_ops.placeholder(np.float32, [2, 2], name="b")
      out = math_ops.add(pa, pb)

    with ops.device('cpu'):
      report = gen_ipu_ops.ipu_event_trace()

    opts = utils.create_ipu_config(profiling=True,
                                   profile_execution=True,
                                   report_every_nth_execution=1,
                                   use_poplar_text_report=False)
    utils.configure_ipu_system(opts)

    feeds = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, feeds)  # drain any queued events

    # Execute the same engine five times.
    for _ in range(5):
      sess.run(out, feeds)
    raw = sess.run(report, feeds)

    r = tu.ReportJSON(self)
    types = r.parse_events(raw)
    self.assertEqual(types[IpuTraceEvent.EXECUTE], 5)
    self.assertEqual(len(r.get_execution_reports()), 5,
                     "Every execution should have generated a report")
def testPipelineIterationsNotMultiple(self):
  """A pipeline depth not divisible by the stage count must be rejected."""
  dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
  dataset = dataset.batch(batch_size=2, drop_remainder=True)

  def dataset_parser(value):
    a = value
    b = (value + 10.) / 2.0
    return {"a": a, "b": b}

  dataset = dataset.map(dataset_parser)
  infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

  def stage1(c, **kwargs):
    with variable_scope.variable_scope("vs", use_resource=True):
      y = layers.Conv2D(2,
                        1,
                        use_bias=True,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')(kwargs["a"])
      return y + kwargs["b"], c

  def stage2(x, c):
    return math_ops.reduce_sum(x) + c

  def stage3(x):
    return x

  def my_net(c):
    # 10 iterations over a 3-stage pipeline: not a multiple, should fail.
    return pipelining_ops.pipeline(
        [stage1, stage2, stage3],
        10,
        inputs=[c],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

  with ops.device('cpu'):
    c = array_ops.placeholder(np.float32, shape=[])

  with tu.ipu_session() as sess:
    with ops.device("/device:IPU:0"):
      r = ipu_compiler.compile(my_net, inputs=[c])

    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    cfg = utils.auto_select_ipus(cfg, 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    sess.run(variables.global_variables_initializer())
    sess.run(infeed_queue.initializer)
    with self.assertRaisesRegex(
        errors.FailedPreconditionError,
        'The pipeline depth of the pipeline must be a multiple of 3'):
      sess.run(r, {c: 10.01})
def _gradient_accumulation_loop(test_wrapper, fwd_fn, inputs_fn, input_values,
                                repeat_count, num_batches_to_accumulate,
                                dataset_fn, optimizer, num_iterations=None):
  """Run ``fwd_fn`` in a repeat loop with gradient accumulation and return
  the dequeued outfeed (the per-iteration losses)."""
  graph = ops.Graph()

  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  with graph.as_default(), test_wrapper.test_session(graph=graph) as sess:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
            optimizer, num_batches_to_accumulate)
        # Pass through the non-infeed args, plus the enqueue/minimize ops.
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    # Only profile when running on the IPU model.
    profiling = utils.running_on_ipu_model()
    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    sess.run(variables.global_variables_initializer())
    sess.run(infeed_queue.initializer)
    sess.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
    return sess.run(outfeed_op)
def testTrainWithAutomaticSharding(self):
  """Autosharded regression training should reduce the loss over time."""
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_model_fn(features, labels, mode):
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

    with variable_scope.variable_scope("vs", use_resource=True):
      predictions = layers.Dense(units=1)(features)

    loss = losses.mean_squared_error(labels=labels, predictions=predictions)
    sharded_optimizer_obj = sharded_optimizer.ShardedOptimizer(
        gradient_descent.GradientDescentOptimizer(0.1))
    train_op = sharded_optimizer_obj.minimize(loss)
    return model_fn_lib.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op)

  def my_input_fn():
    dataset = dataset_ops.Dataset.from_tensor_slices(
        _create_regression_dataset(num_samples=1000, num_features=5))
    return dataset.batch(batch_size=2, drop_remainder=True).repeat()

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                 num_shards=4,
                                                 autosharding=True,
                                                 ipu_options=ipu_options),
      log_step_count_steps=1,
      save_summary_steps=1)

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
  estimator.train(input_fn=my_input_fn, steps=10)

  model_dir = estimator.model_dir
  events_file = glob.glob(model_dir + "/*tfevents*")
  assert len(events_file) == 1
  events_file = events_file[0]
  loss_output = list()
  for e in summary_iterator.summary_iterator(events_file):
    for v in e.summary.value:
      if "loss" in v.tag:
        loss_output.append(v.simple_value)

  # Training should make progress: final loss below the initial loss.
  self.assertTrue(loss_output[0] > loss_output[-1])
def get_ipu_config(fp_exceptions=True,
                   stochastic_rounding=True,
                   xla_recompute=False,
                   available_memory_proportion=None,
                   disable_graph_outlining=False,
                   num_ipus_required=0,
                   max_cross_replica_sum_buffer_size=0,
                   scheduler_selection='',
                   compile_only=False,
                   partials_type="half"):
  """Builds ipu_options"""
  config = utils.create_ipu_config(
      max_report_size=3001819596000,
      merge_infeed_io_copies=True,
      always_rearrange_copies_on_the_host=False,
      selection_order=utils.SelectionOrder.AUTO,
      disable_graph_outlining=disable_graph_outlining,
      max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size,
      scheduler_selection=scheduler_selection)

  config = utils.auto_select_ipus(config, num_ipus_required)
  config = utils.set_matmul_options(config, clear_pass_type=True)

  if available_memory_proportion is not None:
    # Apply the same memory/partials settings to convolutions and matmuls.
    poplar_opts = {
        "availableMemoryProportion": str(available_memory_proportion),
        "partialsType": partials_type
    }
    config = utils.set_convolution_options(config, poplar_opts)
    config = utils.set_matmul_options(config, poplar_opts)

  config = utils.set_norm_options(config, use_stable_statistics=True)
  config = utils.set_recomputation_options(config,
                                           allow_recompute=xla_recompute)

  if compile_only:
    # Compile without attaching to a device.
    config = utils.set_ipu_connection_type(config,
                                           utils.DeviceConnectionType.NEVER,
                                           ipu_version=2,
                                           enable_remote_buffers=True)

  config = utils.set_floating_point_behaviour_options(
      config,
      inv=fp_exceptions,
      div0=fp_exceptions,
      oflo=fp_exceptions,
      esr=stochastic_rounding,
      nanoo=fp_exceptions)

  return config
def configureIPU(self, serialization_folder=None, offline_compilation=True):
  """Configure the IPU system, optionally for offline compilation and
  executable serialization."""
  opts = utils.create_ipu_config()
  if offline_compilation:
    # Never attach to a physical device; compile only.
    opts = utils.set_ipu_connection_type(opts,
                                         utils.DeviceConnectionType.NEVER, 1)
  if serialization_folder:
    opts = utils.set_serialization_options(opts, serialization_folder)
  utils.configure_ipu_system(opts)
def _configureIPU(self, serialization_folder, verification_options=None):
  """Configure an offline-compilation IPU system that serializes executables,
  optionally with transfer verification enabled."""
  opts = utils.create_ipu_config()
  # Compile only — never attach to a physical device.
  opts = utils.set_ipu_connection_type(opts,
                                       utils.DeviceConnectionType.NEVER, 1)
  opts = utils.set_serialization_options(opts, serialization_folder)
  if verification_options:
    opts = utils.set_transfer_options(opts, True)
    opts = utils.set_verification_options(opts, verification_options)
  utils.configure_ipu_system(opts)
def testInput():
  """Load a frozen InceptionV3 graph, run inference on the IPU and report
  throughput.

  BUG FIX: the original code built a config, applied auto_select_ipus() to
  it, and then threw both away by calling create_ipu_config() a second time
  — so the device selection never reached configure_ipu_system(). The
  config is now created once with the profiling options and then passed
  through auto_select_ipus() before being applied.
  """
  config = utils.create_ipu_config(profiling=True, use_poplar_text_report=True)
  config = utils.auto_select_ipus(config, 1)
  utils.configure_ipu_system(config)
  # config = utils.set_convolution_options(config, {"partialsType": str('half')})
  # config = utils.set_matmul_options(config, {"partialsType": str('half')})

  gdv = tf.Graph()
  with gdv.as_default():
    g1 = tf.GraphDef()
    # Load model with pywrap instead? https://github.com/graphcore/examples/blob/master/applications/tensorflow/cnns/training/weight_avg.py#L33
    with tf.gfile.GFile('model.pb', 'rb') as fid:
      serialized_graph = fid.read()
      g1.ParseFromString(serialized_graph)
      tf.import_graph_def(g1, name='')

  with tf.Session(graph=gdv) as sess:
    inp_tensor = gdv.get_tensor_by_name('input:0')
    out_tensor = gdv.get_tensor_by_name('InceptionV3/Predictions/Softmax:0')

    image_np = getExamples()
    #image_np = getSyntheticExamples()
    np.set_printoptions(threshold=np.inf)

    import time
    tic = time.time()
    # This is new and doesn't crash
    # But doesn't seem to do anything either
    with ipu_scope("/device:IPU:0"):
      proba = sess.run(out_tensor, {inp_tensor: image_np})
      print(proba)
    toc = time.time()

    duration = toc - tic
    num_images = len(image_np)
    print("Total time taken: {0} seconds".format(duration))
    print("Number of examples: {0}".format(num_images))
    print("Throughput: {0} im/s".format(num_images / duration))
def testReplicatedEvaluationOnHost(self):
  """Host-side metrics are computed correctly across 4 replicas."""
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    features = [0, 0, 0, 1, 0, 0, 0, 1]
    labels = [0, 1, 0, 1, 0, 1, 0, 1]
    return dataset_ops.Dataset.from_tensor_slices(
        (features, labels)).batch(2, drop_remainder=True)

  def my_metrics_fn(features, labels):
    labels64 = math_ops.cast(labels, np.int64)
    return {
        "accuracy": metrics_impl.accuracy(labels, features),
        "precision": metrics_impl.precision(labels, features),
        "recall": metrics_impl.recall(labels, features),
        "recall_at_1": metrics_impl.recall_at_k(labels64, features, k=1),
        "recall_at_2": metrics_impl.recall_at_k(labels64, features, k=2),
        "mse": metrics_impl.mean_squared_error(labels, features),
        "rmse": metrics_impl.root_mean_squared_error(labels, features),
    }

  def my_model_fn(features, labels, mode):
    loss = math_ops.cast(replication_ops.replication_index(), np.float32)
    eval_metrics = (my_metrics_fn, [features, labels])
    return ipu_estimator.IPUEstimatorSpec(mode,
                                          loss=loss,
                                          eval_metrics=eval_metrics)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
  scores = estimator.evaluate(my_input_fn, steps=1)

  self.assertEqual(0.75, scores["accuracy"])
  self.assertEqual(1.0, scores["precision"])
  self.assertEqual(0.5, scores["recall"])
  self.assertEqual(0.5, scores["recall_at_1"])
  self.assertEqual(1.0, scores["recall_at_2"])
  self.assertEqual(0.25, scores["mse"])
  self.assertEqual(0.5, scores["rmse"])
  # Loss is the mean replication index: (0 + 1 + 2 + 3) / 4.
  self.assertEqual(1.5, scores[model_fn_lib.LOSS_METRIC_KEY])
def testResetSeed(self):
  """Every dropout draw across replicas, repeats and executions is unique.

  IMPROVEMENT: `ndarray.tostring()` is deprecated (NumPy >= 1.19) in favour
  of the byte-identical `ndarray.tobytes()`; updated to avoid the
  DeprecationWarning without changing behaviour.
  """
  # The dataset for feeding the graphs
  ds = dataset_ops.Dataset.from_tensors(
      array_ops.constant(1.0, shape=[SIZE]))
  ds = ds.map(lambda x: [x, x])
  ds = ds.repeat()

  # The host side queues
  infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
      ds, feed_name="infeed", replication_factor=REPLICAS)
  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
      feed_name="outfeed", replication_factor=REPLICAS)

  # The device side
  def body(x1, x2):
    d1 = rand_ops.dropout(x1)
    d2 = rand_ops.dropout(x2)
    outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
    return outfeed

  def my_net():
    r = loops.repeat(REPEATS, body, [], infeed_queue)
    return r

  with scopes.ipu_scope('/device:IPU:0'):
    res = ipu_compiler.compile(my_net, inputs=[])

  # The outfeed dequeue has to happen after the outfeed enqueue
  dequeue_outfeed = outfeed_queue.dequeue()

  # Configure the hardware
  config = utils.create_ipu_config(profiling=True)
  config = utils.auto_select_ipus(config, REPLICAS)
  config = utils.set_floating_point_behaviour_options(config)
  utils.configure_ipu_system(config)

  with session.Session() as sess:
    res_all = set()
    total = 0

    sess.run(infeed_queue.initializer)

    for _ in range(EXECS):
      sess.run(res)
      outfed_result = sess.run(dequeue_outfeed)
      for r in np.array(list(outfed_result.values())).reshape([-1, SIZE]):
        total += 1
        # tobytes() gives a hashable key for uniqueness checking.
        res_all.add(r.tobytes())

    # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
    expected = 2 * REPLICAS * REPEATS * EXECS
    self.assertEqual(total, expected)
    self.assertEqual(len(res_all), expected)
def generic_train_graph(opts, is_training):
  """Build the DIN training graph, configure the IPU system and return the
  graph handles plus the embedding objects."""
  data_type = 'float32'
  train_graph = tf.Graph()

  with train_graph.as_default():
    placeholders = {}
    placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                             shape=[])

    uid_embedding, mid_embedding, cat_embedding = id_embedding(
        opts, is_training, seed)

    if opts['use_synthetic_data']:
      dataset_train = get_synthetic_dataset(opts)
    else:
      dataset_train = get_dataset_embed(opts, is_training=True)

    infeed_train = ipu_infeed_queue.IPUInfeedQueue(
        dataset_train,
        feed_name='DIN_dataset_infeed_train',
        replication_factor=(opts['replicas']))

    with ipu_scope('/device:IPU:0'):

      def comp_fn():

        def body(total_loss, total_aux_loss, total_accuracy, uids, mids,
                 cats, mid_his, cat_his, mid_mask, target, seqlen):
          prob, loss, aux_loss, accuracy, grad_op = graph_builder(
              opts,
              uid_embedding,
              mid_embedding,
              cat_embedding,
              placeholders['learning_rate'],
              uids,
              mids,
              cats,
              mid_his,
              cat_his,
              mid_mask,
              target,
              seqlen,
              use_negsampling=False)
          # Ensure the gradient update runs before the running totals move on.
          with tf.control_dependencies([grad_op]):
            return (total_loss + loss, total_aux_loss + aux_loss,
                    total_accuracy + accuracy)

        return loops.repeat(opts['batches_per_step'], body,
                            [tf.constant(0, getattr(np, 'float32'))] * 3,
                            infeed_train)

      outputs_train = ipu_compiler.compile(comp_fn, [])
      # Convert accumulated totals into per-step averages.
      avg_loss, avg_aux_loss, avg_accuracy = [
          x / opts['batches_per_step'] for x in outputs_train
      ]
      outfeed = None

    saver = tf.compat.v1.train.Saver()
    utils.move_variable_initialization_to_cpu()
    init = tf.compat.v1.global_variables_initializer()

  if opts['use_ipu_model']:
    os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

  ipu_options = utils.create_ipu_config()
  ipu_options = utils.set_optimization_options(ipu_options,
                                               combine_embedding_lookups=True)
  ipu_options = utils.set_recomputation_options(ipu_options,
                                                allow_recompute=True)
  ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
  utils.configure_ipu_system(ipu_options)
  if seed is not None:
    utils.reset_ipu_seed(seed)

  ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
  sess = tf.compat.v1.Session(graph=train_graph)
  return GraphOps(sess, init, ops_train, placeholders, infeed_train, outfeed,
                  saver), uid_embedding, mid_embedding, cat_embedding