def setUpClass(cls):
    """Build DenseNet-121, restore its weights and import a frozen copy.

    Populates ``cls.batch_size``, ``cls.num_classes``,
    ``cls.placeholder_input``, ``cls.densenet_model`` and ``cls.output``.
    The restored graph is frozen, stripped of training nodes, has its batch
    norms folded, and is then re-imported under the IPU device scope.
    """
    # Input geometry and model layout.
    side = 224
    channels = 3
    block_layout = (6, 12, 24, 16)
    cls.batch_size = 1
    cls.num_classes = 1000

    # Half-precision image placeholder, laid out as (batch, height, width, channels).
    input_shape = (cls.batch_size, side, side, channels)
    cls.placeholder_input = tf.placeholder(dtype=tf.float16,
                                           shape=input_shape,
                                           name="image_input")

    # Device selection has to be configured before any session is created.
    ipu_config = IPUConfig()
    ipu_config.auto_select_ipus = [1]
    ipu_config.configure_ipu_system()

    # Instantiate the model and call it once on the placeholder so the
    # graph (and its variables) exist before the Saver is created.
    cls.densenet_model = DenseNet(blocks=block_layout,
                                  num_classes=cls.num_classes,
                                  image_width=side,
                                  image_height=side,
                                  image_channels=channels)
    cls.densenet_model(cls.placeholder_input)

    # Fall back to downloading the weights when no local checkpoint index exists.
    weights_path = CHECKPOINT_PATH
    index_file = Path(weights_path + ".index")
    if not index_file.exists():
        print('Checkpoint file does not exist, attempting to download pre-trained weights')
        weights_path = get_densenet_weights(Path(weights_path))

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, weights_path)
        logging.info('Restored imagenet weights.')

        logging.info('Starting graph optimization.')
        graph_def = tf.get_default_graph().as_graph_def()
        graph_util = tf.compat.v1.graph_util
        frozen = graph_util.convert_variables_to_constants(
            sess, graph_def, output_node_names=["output-prob"])
        # Identity ops in initializers must go first, otherwise the batch
        # norms cannot be fused with the convolutions below.
        frozen = graph_util.remove_training_nodes(frozen)
        optimized_graph_def = optimize_for_infer.fold_batch_norms(frozen)
        logging.info('Completed graph optimization.')

    # Start from an empty default graph and import the optimized one on the IPU.
    tf.reset_default_graph()
    with tf.device('/device:IPU:0'), tf.variable_scope('', use_resource=True):
        imported = tf.import_graph_def(optimized_graph_def,
                                       input_map={},
                                       name="optimized",
                                       return_elements=["output-prob:0"])
        cls.output = imported[0]
def get_config(fp_exceptions,
               enable_recomputation,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               max_reduce_scatter_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id,
               available_memory_proportion=None,
               partials_type="half",
               minimum_remote_tensor_size=128):
    """Build and return an ``IPUConfig`` from the given options.

    The config is returned without calling ``configure_ipu_system``.

    Args:
        fp_exceptions: Value applied to the inv/div0/oflo/nanoo flags.
        enable_recomputation: Assigned to ``allow_recompute``.
        disable_graph_outlining: Graph outlining is enabled when this is falsy.
        num_required_ipus: Passed to ``auto_select_ipus`` when no ``ipu_id``.
        enable_stochastic_rounding: Assigned to the ``esr`` flag.
        max_cross_replica_sum_buffer_size: Cross-replica sum buffer limit.
        max_reduce_scatter_buffer_size: Reduce-scatter buffer limit.
        scheduler_selection: Member name looked up in ``SchedulingAlgorithm``.
        compile_only: Accepted but not used by this function.
        ipu_id: Specific device to select; any falsy value means auto-select.
        available_memory_proportion: When not None, sets the Poplar options
            of both convolutions and matmuls.
        partials_type: ``partialsType`` used with the option above.
        minimum_remote_tensor_size: Assigned to the matching optimization.
    """
    ipu_config = IPUConfig()

    # NOTE(review): this is a truthiness test, so ipu_id == 0 also falls
    # through to auto-selection - confirm that is intended.
    if not ipu_id:
        ipu_config.auto_select_ipus = num_required_ipus
    else:
        ipu_config.select_ipus = [ipu_id]

    ipu_config.allow_recompute = enable_recomputation
    ipu_config.scheduling.algorithm = SchedulingAlgorithm[scheduler_selection]
    ipu_config.norms.use_stable_statistics = True
    ipu_config.matmuls.clear_pass_type = True

    # All floating-point exception flags share one switch; stochastic
    # rounding is controlled independently.
    fp_behaviour = ipu_config.floating_point_behaviour
    for flag in ("inv", "div0", "oflo", "nanoo"):
        setattr(fp_behaviour, flag, fp_exceptions)
    fp_behaviour.esr = enable_stochastic_rounding

    optimizations = ipu_config.optimizations
    optimizations.merge_remote_buffers = MergeRemoteBuffersBehaviour.MERGE
    optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_sum_buffer_size
    optimizations.maximum_reduce_scatter_buffer_size = max_reduce_scatter_buffer_size
    optimizations.merge_infeed_io_copies = True
    optimizations.enable_graph_outlining = not disable_graph_outlining
    optimizations.minimum_remote_tensor_size = minimum_remote_tensor_size

    if available_memory_proportion is not None:
        # Each target gets its own dict instance with identical contents.
        for target in (ipu_config.convolutions, ipu_config.matmuls):
            target.poplar_options = {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type,
            }

    return ipu_config
def create_estimator(args):
    """Create an ``IPUPipelineEstimator`` configured from ``args``.

    Reads ``stochastic_rounding``, ``allow_recompute``, ``num_replicas_train``,
    ``num_ipus_in_pipeline_train``, ``partials_type``, ``batches_per_step``,
    ``gradient_accumulation_batches``, ``log_interval``, ``summary_interval``
    and ``model_dir`` from ``args``.
    """
    ipu_config = IPUConfig()

    # Floating-point exception flags are always on; stochastic rounding
    # follows the command line.
    fp_behaviour = ipu_config.floating_point_behaviour
    for flag in ('inv', 'div0', 'oflo', 'nanoo'):
        setattr(fp_behaviour, flag, True)
    fp_behaviour.esr = bool(args.stochastic_rounding)

    ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 20000000

    if args.allow_recompute:
        ipu_config.allow_recompute = True

    # One device per pipeline stage, for every replica.
    num_replicas = args.num_replicas_train
    num_shards = args.num_ipus_in_pipeline_train
    ipu_config.auto_select_ipus = num_replicas * num_shards

    connection = ipu_config.device_connection
    connection.version = 'ipu' + str(2)
    connection.type = ipu.utils.DeviceConnectionType.ALWAYS

    # Convolutions and matmuls use the same partials type, each via its own dict.
    if args.partials_type == 'float16':
        partials = 'half'
    else:
        partials = 'float'
    ipu_config.convolutions.poplar_options = {'partialsType': partials}
    ipu_config.matmuls.poplar_options = {'partialsType': partials}

    run_config = ipu.ipu_run_config.RunConfig(
        ipu_run_config=ipu.ipu_run_config.IPURunConfig(
            iterations_per_loop=(args.batches_per_step *
                                 args.gradient_accumulation_batches),
            num_replicas=num_replicas,
            num_shards=num_shards,
            ipu_options=ipu_config,
        ),
        log_step_count_steps=args.log_interval,
        save_summary_steps=args.summary_interval,
        model_dir=args.model_dir,
        tf_random_seed=42)

    bound_model_fn = partial(model_fn, args=args)
    return ipu.ipu_pipeline_estimator.IPUPipelineEstimator(
        config=run_config,
        model_fn=bound_model_fn,
        params={},
    )
def get_ipu_option_dict(ipu_id=None, prng=False, n_ipus=1):
    """Wrap a freshly built IPU config in a one-entry dict.

    The dict is meant to be splatted as ``**kwargs`` (per the original
    note, into ``tf.ConfigProto``).

    Args:
        ipu_id: Specific device to select; ``None`` auto-selects ``n_ipus``.
        prng: Value assigned to the stochastic rounding (``esr``) flag.
        n_ipus: Number of IPUs to auto-select when ``ipu_id`` is ``None``.

    Returns:
        dict of config, with the single key ``'ipu_options'``.
    """
    ipu_config = IPUConfig()

    optimizations = ipu_config.optimizations
    optimizations.prefetch_data_streams = True
    optimizations.merge_infeed_io_copies = True

    if ipu_id is not None:
        ipu_config.select_ipus = [ipu_id]
    else:
        ipu_config.auto_select_ipus = [n_ipus]

    ipu_config.floating_point_behaviour.esr = prng
    return {'ipu_options': ipu_config}
def get_config(opts, training=True):
    """Build an ``IPUConfig`` for a training or validation process.

    ``opts.select_ipus[0] == -1`` requests auto-selection; otherwise the
    listed device ids are used. With ``opts.multiprocessing`` only the
    device for the current role (``training`` or not) is configured.
    """
    ipu_config = IPUConfig()
    requested = opts.select_ipus

    if requested[0] != -1:
        # Explicit device ids: index 0 is training, index 1 is validation.
        if opts.multiprocessing:
            requested = [requested[0] if training else requested[1]]
        ipu_config.select_ipus = requested
    else:
        n_train = 1  # opts.shards
        n_valid = 1  # This might want an option to control
        if opts.multiprocessing:
            ipu_config.auto_select_ipus = [n_train if training else n_valid]
        else:
            ipu_config.auto_select_ipus = [n_train, n_valid]

    ipu_config.floating_point_behaviour.esr = opts.prng
    return ipu_config
def run_language_model(opts):
    """Configure the IPU system, then train and/or test the sparse transformer.

    Raises:
        ValueError: if both ``opts.compile_only`` and ``opts.on_demand`` are set.
        AttributeError: if ``opts.compile_only`` is set without
            ``opts.compile_only_ipu_version``.
    """
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        # FIX-ME enable sparse models using multiple shards
        opts.num_shards = 1

    # Round the request up to a power of two, whatever the number of
    # shards/stages, because attachment fails otherwise.
    num_ipus = 1
    while num_ipus < opts.num_shards:
        num_ipus *= 2
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    ipu_config = IPUConfig()
    connection = ipu_config.device_connection
    connection.enable_remote_buffers = True

    if opts.compile_only and opts.on_demand:
        raise ValueError("Can only provide one of --on-demand, --compile-only.")

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set.")
        connection.version = opts.compile_only_ipu_version
        connection.type = utils.DeviceConnectionType.NEVER

    if opts.on_demand:
        connection.type = utils.DeviceConnectionType.ON_DEMAND

    ipu_config.auto_select_ipus = num_ipus
    ipu_config.allow_recompute = opts.recompute

    # Only stochastic rounding is enabled; every exception flag is off.
    fp_behaviour = ipu_config.floating_point_behaviour
    for flag in ("inv", "div0", "oflo", "nanoo"):
        setattr(fp_behaviour, flag, False)
    fp_behaviour.esr = True

    ipu_config = sparse.set_system_config(
        ipu_config, custom_op_debug_printing=opts.debug_dense_grad)
    ipu_config.configure_ipu_system()

    transformer = DynsparseTransformer(opts)
    if opts.mode in ("all", "train"):
        run_training(opts, transformer)
    if opts.mode in ("all", "test"):
        run_testing(opts, transformer)
Graph compile calls """ # Compiles graph and targets IPU(s) inference_output = ipu.ipu_compiler.compile(ssd_model, inputs=[]) # Compiles decoder on host (CPU) decoder = decoder_component(input_detection) # Assignment operator for trained weight file param_setters = dict() for var in tf.trainable_variables(): placeholder = tf.placeholder(var.dtype, var.shape, var.name.split(':')[0] + '_setter') param_setters[var.name] = (tf.assign(var, placeholder), placeholder) # Setup IPU configuration and build session cfg = IPUConfig() cfg.auto_select_ipus = NUM_IPUS cfg.convolutions.poplar_options = {'availableMemoryProportion': '0.4'} cfg.configure_ipu_system() ipu.utils.move_variable_initialization_to_cpu() outfeed = outfeed_queue.dequeue() # Calculate total flops for graph (experimental) run_meta = tf.RunMetadata() opts = tf.profiler.ProfileOptionBuilder.float_operation() flops = tf.profiler.profile(tf.get_default_graph(), run_meta=run_meta, cmd='op', options=opts) print("Total FLOPs reported by TF is: ", flops.total_float_ops)
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=50 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               conv_output=False,
               enable_recomputation=False,
               seed=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               num_io_tiles=0,
               number_of_distributed_batch_norm_replicas=1,
               min_remote_tensor_size=128,
               compile_only=False,
               nanoo=True,
               scheduling_algorithm=SchedulingAlgorithm.CHOOSE_BEST,
               max_reduce_many_buffer_size=0):
    """Build and return an ``IPUConfig`` from keyword options.

    The config is returned without calling ``configure_ipu_system``.
    With ``compile_only`` the function may also print a warning and
    append a default ``--executable_cache_path`` to the
    ``TF_POPLAR_FLAGS`` environment variable.
    """
    ipu_config = IPUConfig()
    optimizations = ipu_config.optimizations
    optimizations.merge_infeed_io_copies = merge_infeed_io_copies

    # CHOOSE_BEST is swapped for SHORTEST_PATH on arch 2 only; on other
    # architectures it is kept (workaround to avoid OOM on MK1).
    if (scheduling_algorithm == SchedulingAlgorithm.CHOOSE_BEST
            and get_ipu_arch() == 2):
        scheduling_algorithm = SchedulingAlgorithm.SHORTEST_PATH
    ipu_config.scheduling.algorithm = scheduling_algorithm

    ipu_config.experimental.always_rearrange_copies_on_the_host = False
    optimizations.minimum_remote_tensor_size = min_remote_tensor_size
    optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_buffer_size
    optimizations.maximum_reduce_many_buffer_size = max_reduce_many_buffer_size

    # -1 means "no specific device": auto-select one IPU per shard per replica.
    if ipu_id != -1:
        ipu_config.select_ipus = [ipu_id]
    else:
        ipu_config.auto_select_ipus = number_of_replicas * shards

    worker_mode = 'portable' if seed is not None else 'false'
    ipu_config.compilation_poplar_options = {
        'target.deterministicWorkers': worker_mode
    }
    if internalExchangeOptimisationTarget is not None:
        ipu_config.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internalExchangeOptimisationTarget

    if num_io_tiles != 0:
        io_tiles = ipu_config.io_tiles
        io_tiles.place_ops_on_io_tiles = True
        io_tiles.num_io_tiles = num_io_tiles

    # Convolution options start empty and are filled in through the
    # config attribute, one flag at a time.
    convolutions = ipu_config.convolutions
    convolutions.poplar_options = {}
    if availableMemoryProportion is not None:
        convolutions.poplar_options['availableMemoryProportion'] = str(
            availableMemoryProportion)
    if half_partials:
        convolutions.poplar_options['partialsType'] = 'half'
        ipu_config.matmuls.poplar_options['partialsType'] = 'half'
    if conv_dithering:
        convolutions.poplar_options['enableConvDithering'] = 'true'
    if conv_output:
        convolutions.poplar_options['gatherConvOutput'] = 'true'

    if stable_norm:
        ipu_config.norms.use_stable_statistics = True

    if enable_recomputation:
        ipu_config.allow_recompute = True

    if compile_only:
        connection = ipu_config.device_connection
        connection.version = 'ipu2'
        connection.enable_remote_buffers = True
        # PRE_COMPILE allows compiling executables without a device being online.
        connection.type = DeviceConnectionType.PRE_COMPILE

        # An executable cache path is required; add a default if none is set.
        poplar_flags = os.environ.get("TF_POPLAR_FLAGS") or ''
        if '--executable_cache_path' not in poplar_flags:
            print("Warning: --executable_cache_path not set. " +
                  "Defaulting to '/tmp/tf_cache'.")
            poplar_flags = f"{poplar_flags} --executable_cache_path=/tmp/tf_cache"
            os.environ["TF_POPLAR_FLAGS"] = poplar_flags

    fp_behaviour = ipu_config.floating_point_behaviour
    for flag in ('inv', 'div0', 'oflo'):
        setattr(fp_behaviour, flag, fp_exceptions)
    fp_behaviour.esr = prng
    fp_behaviour.nanoo = nanoo

    ipu_config.norms.experimental.distributed_batch_norm_replica_group_size = (
        number_of_distributed_batch_norm_replicas)

    return ipu_config
dataset=test_set, number_of_epochs=opts.epochs, elements_per_epochs=num_test, print_stats=False, apply_options=True) logging.info("Starting benchmarks...\n") with tf.Session() as sess: logger.info("Benchmarking training dataset") train_results = sess.run(ds_perf_train) process_benchmark_results(train_results, opts) logger.info("Benchmarking training infeed") train_results = sess.run(infeed_perf_train) process_benchmark_results(train_results, opts) logger.info("Benchmarking test dataset") test_results = sess.run(ds_perf_test) process_benchmark_results(test_results, opts) logger.info("Benchmarking test infeed") test_results = sess.run(infeed_perf_test) process_benchmark_results(test_results, opts) # Set config config = IPUConfig() config.auto_select_ipus = 1 config.configure_ipu_system() # Now run on device make_and_run_on_device_benchmark(opts, train=True) make_and_run_on_device_benchmark(opts, train=False)
def run_mnist(opts):
    """Train and/or test the (optionally sparse) fully connected MNIST model.

    Loads MNIST through Keras, permutes and flattens the images, builds the
    train/test loops on the IPU, then runs training (``opts.test_mode`` in
    "all"/"training") and testing (``opts.test_mode`` in "all"/"tests") in
    a single session.

    Raises:
        ValueError: if pipelining is requested with fewer than 4 gradient
            accumulation steps, or if ``opts.steps_per_epoch`` does not
            divide the number of batches per epoch (only checked when
            ``opts.batches_per_step_override`` is None).
    """
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    # opts.batch_size is split across the accumulation count, so
    # batch_size below is the per-accumulation-step batch.
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)

    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)

    # Reorder the pixel columns of every image with the same permutation.
    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]

    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size *
                                      opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)

    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch
        if not (batches_per_epoch % opts.steps_per_epoch) == 0:
            raise ValueError(
                f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
            )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        # Returns a zero-argument callable, as from_generator requires.
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))

        # Only the training set is shuffled (and cached after shuffling).
        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()
        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)

    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    # Exception flags off; only stochastic rounding (esr) is enabled.
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        # NOTE(review): log_file is never closed in this function.
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    # Restoring reuses the named checkpoint directory; otherwise a new
    # timestamped directory is used.
    if opts.restore:
        logpath = os.path.join(opts.checkpoint_path, opts.restore)
    else:
        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer)

        if opts.restore:
            saver.restore(sess, logpath + '/model.ckpt')

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            start = opts.start_epoch if opts.restore else 0
            progress = tqdm(
                range(start, opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    # Time one on-device loop of batches_per_step batches.
                    t1 = time.perf_counter()
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    t2 = time.perf_counter()
                    sess_time = t2 - t1
                    batch_time = sess_time / batches_per_step
                    throughput = batch_size / batch_time
                    logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                f"Time per batch: {batch_time:0.6f} "
                                f"Throughput: {throughput}")

                    # Early exit: leaves the whole function after one step.
                    if opts.single_train_step_only:
                        return

                    train_outputs = sess.run(dequeue_train_outfeed)
                    if opts.pipelining:
                        train_outputs = train_outputs[-1]

                    # Get the last value for all items:
                    for k, v in train_outputs.items():
                        train_outputs[k] = v[-1]
                    logger.debug(f"Train outputs: {train_outputs.keys()}")

                    # Merge prune and grow fetches with last fetches:
                    if dequeue_prune_and_grow_outfeed is not None:
                        png_data = sess.run(dequeue_prune_and_grow_outfeed)
                        for k in png_data:
                            png_data[k] = png_data[k][-1]
                        logger.debug(
                            f"Prune and grow outputs: {png_data.keys()}")

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            # NOTE(review): png_data is only bound when the
                            # prune-and-grow outfeed exists; with
                            # disable_dense_grad_override set and sparse
                            # layers present this looks unbound - confirm.
                            var_name = fc.get_values_var().name
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                            )
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                )
                            # Meta-info is fetched only on the first step
                            # of epoch opts.start_epoch.
                            if i == 0 and e == opts.start_epoch:
                                metainfo = sess.run(fc.get_metainfo_var())
                            else:
                                metainfo = None
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_sched = prune_and_grow(name,
                                                             fc,
                                                             png_data,
                                                             random_gen,
                                                             steps,
                                                             total_steps,
                                                             opts,
                                                             metainfo=metainfo)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )
                                logger.info(
                                    f"Pruned proportion: {prune_sched}")
                                if opts.use_wandb:
                                    wandb.log({'Prune Schedule': prune_sched},
                                              commit=False)

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {train_outputs['acc']}\n")
                    if opts.use_wandb:
                        wandb.log(
                            {
                                'Loss': train_outputs['mean_loss'],
                                'Accuracy': train_outputs['acc'],
                                'Throughput': throughput
                            },
                            commit=True)
                    progress.set_description(
                        f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                    )

                    # Only need to feed an updated sparsity representation if we are running rig-L:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                # Checkpoint every opts.checkpoint_freq epochs (including epoch 0).
                if e % opts.checkpoint_freq == 0:
                    logger.info(f"Saving...")
                    saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer)
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            # Report the last entry of each dequeued metric.
            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
            )
            if opts.use_wandb:
                wandb.run.summary["Test Loss"] = test_loss
                wandb.run.summary["Test Accuracy"] = test_acc
def create_ipu_config():
    """Configure the IPU system to auto-select a single IPU.

    The config is applied immediately; nothing is returned.
    """
    ipu_config = IPUConfig()
    ipu_config.auto_select_ipus = 1
    ipu_config.configure_ipu_system()
def generic_train_graph(opts, is_training):
    """Build the training graph and session without negative sampling.

    Uses a ``seed`` name defined outside this function. The IPU system is
    configured as a side effect, and ``TF_POPLAR_FLAGS`` is overwritten
    when ``opts['use_ipu_model']`` is set.

    Returns:
        A tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.
    """
    float_type = 'float32'
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(float_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            train_data = get_synthetic_dataset(opts)
        else:
            train_data = get_dataset_embed(opts, is_training=True)
        infeed_train = ipu_infeed_queue.IPUInfeedQueue(train_data)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                # The first three arguments are running totals carried
                # between iterations; the rest come from the infeed.
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)
                    # The returned sums depend on the gradient op so that
                    # it runs every iteration.
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                initial_totals = [tf.constant(0, np.float32)] * 3
                return loops.repeat(opts['batches_per_step'], body,
                                    initial_totals, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            # Turn the accumulated totals into per-batch averages.
            averages = [x / opts['batches_per_step'] for x in outputs_train]
            avg_loss, avg_aux_loss, avg_accuracy = averages
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_config = IPUConfig()
    ipu_config.optimizations.combine_embedding_lookups = True
    ipu_config.allow_recompute = True
    ipu_config.auto_select_ipus = [opts['replicas']]
    ipu_config.configure_ipu_system()

    if seed is not None:
        utils.reset_ipu_seed(seed)

    sess = tf.compat.v1.Session(graph=graph)
    graph_ops = GraphOps(sess, init, [avg_loss, avg_aux_loss, avg_accuracy],
                         placeholders, infeed_train, outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def run_testing(opts, transformer, x_test, y_test):
    """Evaluate the latest training checkpoint on the test set.

    Builds a dedicated testing graph, restores the latest checkpoint found
    in ``opts.train_checkpoint_path``, runs the whole test set in a single
    session call, logs loss/accuracy/throughput and asserts that the
    accuracy is at least 0.85.

    Raises:
        AssertionError: if the test accuracy is below the threshold.
    """
    # Full batches only; the remainder is dropped by the dataset below.
    batches_per_epoch = len(y_test) // opts.batch_size
    testing_graph = tf.Graph()
    with testing_graph.as_default():
        with tf.device("cpu"):
            input_shape = [None, *x_test.shape[1:]]
            place_x = tf.placeholder(dtype=opts.dtype, shape=input_shape, name="input")
            place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")

            # Create dataset and IPU feeds:
            dataset = tf.data.Dataset.from_tensor_slices(
                (place_x, place_y)).cache()
            dataset = dataset.batch(opts.batch_size, drop_remainder=True)
            test_infeed = IPUInfeedQueue(dataset)
            test_outfeed = IPUOutfeedQueue()

            # Helper function
            def loop_builder(iterations, builder_func, infeed):
                return loops.repeat(iterations, builder_func, [], infeed)

        # Compile the forward pass for testing
        with scopes.ipu_scope("/device:IPU:0"):
            # Bind the fixed forward_pass arguments first, then wrap the
            # result in the repeat loop over the infeed.
            test_loop = partial(forward_pass, opts, transformer, None, batches_per_epoch, False, test_outfeed, None)
            test_loop = partial(loop_builder, batches_per_epoch, test_loop, test_infeed)
            test_loop = ipu_compiler.compile(test_loop, inputs=[])

        # Metrics
        with tf.device("cpu"):
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
            metrics_initializer = tf.variables_initializer(
                var_list=metrics_vars)
            saver = tf.train.Saver()

        # Created before finalize() because no ops can be added afterwards.
        test_outfeed_dequeue = test_outfeed.dequeue()

    # Setup and acquire an IPU device:
    config = IPUConfig()
    config.auto_select_ipus = opts.num_shards
    config.configure_ipu_system()

    logpath = os.path.join(opts.train_checkpoint_path, "test")
    # NOTE(review): latest_checkpoint may return None when no checkpoint
    # exists; that case is not handled here - confirm.
    checkpoint = tf.train.latest_checkpoint(opts.train_checkpoint_path)
    summary_writer = tf.summary.FileWriter(logpath)

    testing_graph.finalize()  # no more new ops added from here on out
    with tf.Session(graph=testing_graph) as sess:
        logger.info(f"Testing...")
        # The sparsity will also be streamed from the checkpoint
        # The host and device sparsity are not in sync here
        saver.restore(sess, checkpoint)
        sess.run(test_infeed.initializer, feed_dict={
            place_x: x_test,
            place_y: y_test
        })
        sess.run(metrics_initializer)

        # Run inference (whole dataset in one session call)
        dt = time.perf_counter()
        sess.run(test_loop)
        dt = time.perf_counter() - dt
        session_outputs = sess.run(test_outfeed_dequeue)

        # Test set performance
        # Throughput counts source_sequence_length tokens per test example.
        throughput = transformer.source_sequence_length * len(y_test) / dt
        # Loss is averaged over all dequeued entries; accuracy takes the last one.
        test_loss = session_outputs['mean_loss'].mean()
        test_acc = session_outputs['acc'][-1]
        desc = f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}"
        logger.info(desc + f" Throughput {throughput:.1f} token/s")

        # Regression tests
        accuracy_threshold = 0.85
        assert test_acc >= accuracy_threshold, f"Test accuracy ({test_acc:3.2f}) is below threshold of ({accuracy_threshold:3.2f})"
        print("All asserts pass.")
def run_training(opts, transformer, x_train, y_train):
    """Train the sparse transformer and checkpoint it after every epoch.

    Builds a dedicated training graph, runs ``opts.nepochs`` epochs of
    ``opts.steps_per_epoch`` device loops each, and, when
    ``transformer.prune_ratio`` is not None, performs a host-side prune and
    regrow after every step.

    Raises:
        ValueError: if ``opts.steps_per_epoch`` does not divide the number
            of batches per epoch exactly.
    """
    # Calculate dataset length
    num_train = len(y_train)
    batches_per_epoch = num_train // opts.batch_size
    batches_per_step = batches_per_epoch // (opts.steps_per_epoch)
    total_steps = (opts.steps_per_epoch) * opts.nepochs
    logging.info(
        f"Batches per epoch: {batches_per_epoch} Batches per step: {batches_per_step}"
    )

    if not batches_per_epoch % (opts.steps_per_epoch) == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Construct the training graph
    training_graph = tf.Graph()
    with training_graph.as_default():
        with tf.device("cpu"):
            input_shape = [None, *x_train.shape[1:]]
            place_x = tf.placeholder(dtype=opts.dtype, shape=input_shape, name="input")
            place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
            lr_placeholder = tf.placeholder(opts.dtype, shape=[])

            # Create dataset and IPU feeds:
            dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
            dataset = dataset.shuffle(buffer_size=len(y_train),
                                      reshuffle_each_iteration=True,
                                      seed=opts.random_seed).cache()
            dataset = dataset.repeat().batch(opts.batch_size, drop_remainder=True)

            # Queues for streaming from host to device and back
            train_infeed = IPUInfeedQueue(dataset)
            train_outfeed = IPUOutfeedQueue()
            png_outfeed = IPUOutfeedQueue()

            # Helper function
            def loop_builder(iterations, builder_func, infeed):
                return loops.repeat(iterations, builder_func, [], infeed)

        # Compile the forward and backward pass for training
        with scopes.ipu_scope("/device:IPU:0"):
            # Bind the fixed forward_pass arguments first, then wrap the
            # result in the repeat loop over the infeed.
            train_loop = partial(forward_pass, opts, transformer, lr_placeholder, batches_per_step, True, train_outfeed, png_outfeed)
            train_loop = partial(loop_builder, batches_per_step, train_loop, train_infeed)
            train_loop = ipu_compiler.compile(train_loop, inputs=[])
            transformer.buildSparsityUpdateOps()

        # Metrics
        with tf.device("cpu"):
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
            metrics_initializer = tf.variables_initializer(
                var_list=metrics_vars)
            saver = tf.train.Saver(max_to_keep=5)

        # These ops are declared here so that the graph can be frozen afterwards
        global_initializer = tf.global_variables_initializer()
        train_outfeed_dequeue = train_outfeed.dequeue()
        png_outfeed_dequeue = png_outfeed.dequeue()

    # Setup and acquire an IPU device:
    config = IPUConfig()
    config.auto_select_ipus = opts.num_shards
    config.configure_ipu_system()

    logpath = os.path.join(opts.train_checkpoint_path, "train")
    summary_writer = tf.summary.FileWriter(logpath)

    # Run the model:
    training_graph.finalize()  # no more new ops added from here on out
    with tf.Session(graph=training_graph) as sess:
        logger.info(f"Creating training session")
        sess.run(global_initializer)
        sess.run(train_infeed.initializer, feed_dict={
            place_x: x_train,
            place_y: y_train
        })

        progress = tqdm(range(opts.nepochs),
                        bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
        for e in progress:
            for i in range(opts.steps_per_epoch):
                # Train the model
                sess.run(metrics_initializer)
                dt = time.perf_counter()
                sess.run(train_loop, feed_dict={
                    lr_placeholder: learning_rate_schedule(e, opts)
                })
                dt = time.perf_counter() - dt
                session_outputs = sess.run(train_outfeed_dequeue)
                logger.debug(f"Train outputs: {session_outputs}")

                # Calculate avg throughput
                num_tokens = transformer.source_sequence_length * batches_per_step * opts.batch_size
                throughput = num_tokens / dt
                # The progress bar shows the last entry of each dequeued metric.
                desc = f"Loss {session_outputs['mean_loss'][-1]:.5f} " \
                    f"Accuracy {session_outputs['acc'][-1]:.5f} " \
                    f"Iteration: {session_outputs['iteration'][-1]}"
                progress.set_description(
                    desc + f" Throughput {throughput:.1f} token/s")

                # Perform pruning (if using RigL the dense grads from session_outputs are used)
                # step is the 1-based global step count across epochs.
                step = 1 + i + e * (opts.steps_per_epoch)
                if transformer.prune_ratio is not None:
                    t0 = time.perf_counter()
                    png_results = sess.run(png_outfeed_dequeue)
                    t1 = time.perf_counter()
                    # Keep only the last dequeued value for every key.
                    for k in png_results:
                        png_results[k] = png_results[k][-1]
                    logger.debug(
                        f"Prune and grow outputs: {png_results.keys()}")
                    logger.info(
                        f"Downloaded the prune and grow data from Device to Host in {t1-t0:0.3f} seconds"
                    )
                    transformer.syncPruneAndRegrowOnHost(
                        opts.cosine_prune_schedule, step, total_steps,
                        png_results)
                    transformer.streamSparsityFromHostToDevice()

            # Save at the end of each epoch
            logger.info(f"Saving model")
            saver.save(sess, os.path.join(opts.train_checkpoint_path, 'model.ckpt'))
def generic_infer_graph(opts, is_training):
    """Build the validation graph, configure the IPU and open a session.

    Args:
        opts: Option mapping. This function reads ``use_synthetic_data``,
            ``batches_per_step``, ``use_ipu_model`` and ``replicas``.
        is_training: Forwarded unchanged to ``id_embedding``.

    Returns:
        A tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.
    """
    # NOTE(review): `seed` is not bound inside this function; presumably it
    # is a module-level name - confirm against the rest of the file.
    dtype_name = 'float32'
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(dtype_name, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        # Validation data: synthetic on request, otherwise the real dataset.
        dataset_val = (get_synthetic_dataset(opts)
                       if opts['use_synthetic_data']
                       else get_dataset_embed(opts, is_training=False))
        infeed_val = ipu_infeed_queue.IPUInfeedQueue(dataset_val)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def validation_step(uids, mids, cats, mid_his, cat_his, mid_mask,
                            target, seqlen):
            # Only prob and accuracy are consumed from the builder outputs.
            prob, loss_total, _, accuracy, _ = graph_builder(
                opts, uid_embedding, mid_embedding, cat_embedding,
                placeholders['learning_rate'], uids, mids, cats, mid_his,
                cat_his, mid_mask, target, seqlen, use_negsampling=False)
            return outfeed_queue.enqueue((prob, target, accuracy))

        def comp_fn_validate():
            return loops.repeat(opts['batches_per_step'], validation_step,
                                [], infeed_val)

        with ipu_scope('/device:IPU:0'):
            outputs_val = ipu_compiler.compile(comp_fn_validate, [])

        outfeed = outfeed_queue.dequeue()
        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    # Device configuration.
    ipu_options = IPUConfig()
    ipu_options.optimizations.combine_embedding_lookups = True
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    session = tf.compat.v1.Session(graph=graph)
    graph_ops = GraphOps(session, init, [outputs_val], placeholders,
                         infeed_val, outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def generic_graph(opts):
    """Build the training graph, configure the IPU and open a session.

    Args:
        opts: Option mapping. This function reads ``seed``,
            ``use_synthetic_data``, ``batches_per_step``, ``use_ipu_model``
            and ``replicas``.

    Returns:
        A tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.
    """
    data_type = get_tf_datatype(opts)
    steps = opts['batches_per_step']
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {"learning_rate": tf.placeholder(data_type, shape=[])}
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])

        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        def train_step(total_loss, total_aux_loss, total_accuracy, uids, mids,
                       cats, mid_his, cat_his, mid_mask, target, seqlen,
                       noclk_mids, noclk_cats):
            prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                opts, uid_embedding, mid_embedding, cat_embedding,
                placeholders['learning_rate'], uids, mids, cats, mid_his,
                cat_his, mid_mask, target, seqlen, noclk_mids, noclk_cats,
                use_negsampling=True)
            # Make the accumulated metrics depend on the gradient update.
            with tf.control_dependencies([grad_op]):
                return (total_loss + loss,
                        total_aux_loss + aux_loss,
                        total_accuracy + accuracy)

        def comp_fn():
            # Three running totals, each starting at zero.
            initial_totals = [tf.constant(0, data_type)] * 3
            return loops.repeat(steps, train_step, initial_totals, infeed)

        with ipu_scope('/device:IPU:0'):
            outputs_train = ipu_compiler.compile(comp_fn, [])

        # Turn the per-step totals into averages.
        avg_loss, avg_aux_loss, avg_accuracy = [
            total / opts['batches_per_step'] for total in outputs_train
        ]
        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()
    utils.reset_ipu_seed(opts['seed'])

    session = tf.Session(graph=graph)
    graph_ops = GraphOps(session, init,
                         [avg_loss, avg_aux_loss, avg_accuracy],
                         placeholders, infeed, saver, feed_dict_values)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
from spektral.layers import EdgeConditionedConv, GlobalSumPool
from spektral.utils import label_to_one_hot

from qm9_argparser import get_argparser

################################################################################
# PARAMETERS (defaults set in get_argparser())
################################################################################
parser = get_argparser()
args = parser.parse_args()

# With --profile the run uses a gradient accumulation count of 1 and 2 epochs;
# otherwise it uses a count of 6 and the epoch count from the command line.
gradient_accumulation_count, epochs = (1, 2) if args.profile else (6, args.epochs)

################################################################################
# CONFIGURE THE DEVICE
################################################################################
# Request args.num_ipus IPUs and apply the configuration.
cfg = IPUConfig()
cfg.auto_select_ipus = args.num_ipus
cfg.configure_ipu_system()

# Mixed precision support
tf.keras.backend.set_floatx('float16')

################################################################################
# LOAD DATA
################################################################################
A, X, E, y = qm9.load_data(return_type='numpy',
                           nf_keys='atomic_num',
                           ef_keys='type',
                           self_loops=True,
                           amount=args.amount)  # Set to None to train on whole dataset
def set_up_ipu_devices(opts):
    """Acquire a single IPU and seed it for stochastic rounding.

    Args:
        opts: Options object; only ``opts.seed`` is read. When it is ``None``
            the device seed is left untouched.
    """
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.configure_ipu_system()

    # Set the seed for the stochastic rounding. reset_ipu_seed has to be
    # *called*: assigning to it (as the code previously did) never seeds the
    # device and replaces the library function with a plain value, breaking
    # every later caller.
    if opts.seed is not None:
        ipu.utils.reset_ipu_seed(opts.seed)
# Make estimator
estimator = create_estimator(args)

if args.training:
    print("\nTraining...")
    train(estimator, args)

if args.evaluation:
    print("\nEvaluating...")
    evaluate(estimator, args)

if not (args.training or args.evaluation):
    # Configure IPU system for inference only
    # (no need to do this if an Estimator was already initialized)
    cfg = IPUConfig()
    if args.allow_recompute:
        cfg.allow_recompute = True
    cfg.auto_select_ipus = (args.num_replicas_infer *
                            args.num_ipus_in_pipeline_infer)
    cfg.device_connection.version = 'ipu' + str(2)
    cfg.device_connection.type = ipu.utils.DeviceConnectionType.ALWAYS
    # Convolutions and matmuls share the same partials precision. The
    # convolution option previously read the misspelt `args.artials_type`,
    # which raised AttributeError on this inference-only path.
    partials_type = 'half' if args.partials_type == 'float16' else 'float'
    cfg.convolutions.poplar_options = {'partialsType': partials_type}
    cfg.matmuls.poplar_options = {'partialsType': partials_type}
    cfg.configure_ipu_system()
current_state=initial_chain_state, kernel=hmc_kernel) # Compile the graph [p], kernel_results = ipu_compiler.compile(hmc_graph, []) return (p, kernel_results) # Place the graphs on IPUs ops = [] for i in range(args.num_ipus): with ipu_scope('/device:IPU:'+str(i)): ops.append(build_graph(scope_id=str(i))) # Configure IPU config = IPUConfig() # Create num_chips TF devices, with 1 IPU per device config.auto_select_ipus = [1] * args.num_ipus config.configure_ipu_system() utils.move_variable_initialization_to_cpu() # Initialize variables init_g = tf.global_variables_initializer() sess.run(init_g) # Warm up print("\nWarming up...") sess.run(ops) print("Done\n") # Sample
def generic_graph(opts, data, trainFlag):
    """Build a training or validation graph and its IPU session.

    Args:
        opts: Options object (batch counts, replication factor, dtypes, log
            path, IPU selection and compile-only settings are read from it).
        data: Object providing ``get_dataset(opts, mode=...)``.
        trainFlag: Mode value; compared against ``util.Modes.TRAIN``.

    Returns:
        A ``GraphOps`` bundle for the constructed graph.
    """
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    if training:
        mode_name = 'training'
        batches_per_step = opts.batches_per_step
    else:
        mode_name = 'validation'
        batches_per_step = opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only
    # need to do 1/N batches in each stream. For this reason,
    # batches_per_step must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        def step(total_loss, total_rmse, batch):
            # The last column is the ground truth, the rest is observed.
            loss, rmse, grad_op = graph_builder(
                opts,
                observed=batch[:, :-1],
                ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                learning_rate=placeholders['learning_rate'] if training else None,
                mode=trainFlag)
            if training:
                with tf.control_dependencies([grad_op]):
                    return total_loss + loss, total_rmse + rmse
            return total_loss + loss, total_rmse + rmse

        def comp_fn():
            zeros = [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2
            return loops.repeat(batches_per_step, step, zeros, infeed)

        with ipu_scope(f'/device:IPU:0'):
            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [total / batches_per_step for total in outputs]

        # Loss and learning rate are only summarised when training; the
        # RMSPE scalar is summarised in both modes.
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None

        writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                       graph=graph,
                                       flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use
    # their devices as IPU:0
    if training or opts.multiprocessing:
        ipu_config = IPUConfig()
        optimizations = ipu_config.optimizations
        optimizations.maximum_cross_replica_sum_buffer_size = 10000000
        optimizations.maximum_inter_ipu_copies_buffer_size = 10000000

        if opts.compile_only:
            connection = ipu_config.device_connection
            connection.version = opts.compile_only_ipu_version
            connection.enable_remote_buffers = True
            connection.type = ipu_utils.DeviceConnectionType.PRE_COMPILE

        if opts.select_ipus == 'AUTO':
            ipu_config.auto_select_ipus = [opts.replication_factor]
        else:
            # Indexed by a bool: entry 0 when training, entry 1 otherwise.
            ipu_config.select_ipus = [opts.select_ipus[not training]]

        ipu_config.floating_point_behaviour.esr = opts.prng
        ipu_config.configure_ipu_system()

    if training:
        graph_outputs = [avg_loss, summary]
    else:
        graph_outputs = [avg_rmse, summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if training else None, infeed, saver, writer,
                    trainFlag)
def bs_matmul_test(opts):
    """Run a block-sparse matmul custom op next to a dense reference matmul.

    Both computations are compiled for ``/device:IPU:0`` and executed in one
    session. When ``opts.scenario`` is longer than three characters the
    gradients of the summed output with respect to both operands are fetched
    as well.

    Args:
        opts: Options object. Reads the matrix and block sizes, ``sparsity`` /
            ``sparsity_mask``, ``scenario``, ``transposed_rhs``,
            ``group_dims``, the data types and the partitioning settings.

    Returns:
        ``(out, lhs_grad, rhs_grad, out_ref, lhs_grad_ref, rhs_grad_ref)``.
        The gradient entries are ``None`` when gradients are not computed.
    """
    data_type = opts.data_type
    partial_data_type = opts.partial_data_type

    # dim = [lhs rows, lhs cols (= rhs rows), rhs cols]; block_size follows
    # the same order. Every dimension must be a whole number of blocks.
    dim = [opts.lhs_rows, opts.lhs_cols, opts.rhs_cols]
    block_size = [opts.lhs_block_row, opts.lhs_block_col, opts.rhs_block_col]
    block_dim = [0] * 3
    for i in range(3):
        assert (dim[i] > 0)
        assert (block_size[i] > 0)
        assert (dim[i] % block_size[i] == 0)
        block_dim[i] = dim[i] // block_size[i]

    # Either an explicit mask (string of digits, one per block) or a sparsity
    # value is handed to the tensor generator.
    if opts.sparsity_mask is not None:
        sparsity_or_mask = list(int(c) for c in opts.sparsity_mask)
    else:
        sparsity_or_mask = opts.sparsity

    inner_group_size = opts.inner_group_size
    partition_method = opts.partition_method
    memory_cycle_ratio = opts.memory_cycle_ratio

    tf_type = tf.float32 if data_type == "float" else tf.float16

    # A scenario starting with "dds" selects the sparse-output op
    # ("BuildDDS"); anything else uses "BuildDSD". Any scenario string longer
    # than three characters also enables gradient computation.
    sparse_out = False
    op_name = "BuildDSD"
    compute_grads = False
    if opts.scenario[:3] == "dds":
        sparse_out = True
        op_name = "BuildDDS"
    if len(opts.scenario) > 3:
        compute_grads = True

    # A transposed rhs is only honoured when the output is dense.
    transposed_rhs = False
    if (not sparse_out):
        transposed_rhs = opts.transposed_rhs

    # Shape of the block-sparse matrix: the rhs for a dense output (possibly
    # transposed), the result for a sparse output.
    if (not sparse_out):
        if not transposed_rhs:
            dim_dense = [dim[1], dim[2]]
            block_size_sparse = [block_size[1], block_size[2]]
            dim_sparse_mask = [block_dim[1], block_dim[2]]
        else:
            dim_dense = [dim[2], dim[1]]
            block_size_sparse = [block_size[2], block_size[1]]
            dim_sparse_mask = [block_dim[2], block_dim[1]]
    else:
        dim_dense = [dim[0], dim[2]]
        block_size_sparse = [block_size[0], block_size[2]]
        dim_sparse_mask = [block_dim[0], block_dim[2]]

    if opts.group_dims is not None:
        dim_dense = opts.group_dims + dim_dense
        dim_sparse_mask = opts.group_dims + dim_sparse_mask

    sparse_matrix, dense_masked_matrix, sparsity_mask = utils.create_block_sparse_tensor(
        dim_dense, block_size_sparse, sparsity_or_mask)
    if transposed_rhs:
        # Swap the last two axes of the dense reference matrix.
        sparse_transposed_indices = list(range(len(dim_dense)))
        sparse_transposed_indices[-2], sparse_transposed_indices[
            -1] = sparse_transposed_indices[-1], sparse_transposed_indices[-2]
        dense_masked_matrix = dense_masked_matrix.transpose(
            sparse_transposed_indices)
        # sparsity_mask is deliberately left in its transposed form

    # Number of non-zero blocks (sum of the mask entries).
    nz = reduce(add, sparsity_mask, 0)
    logger.debug(f"sparsity_mask: {sparsity_mask}, nz blocks: {nz}")

    dim_lhs = [dim[0], dim[1]]
    dim_block_sparse = [nz, block_size_sparse[0] * block_size_sparse[1]]
    if opts.group_dims is not None:
        dim_lhs = opts.group_dims + dim_lhs
    if (not sparse_out):
        dim_rhs = dim_block_sparse
        dim_res = [dim[0], dim[2]]
        if opts.group_dims is not None:
            dim_res = opts.group_dims + dim_res
    else:
        dim_rhs = [dim[1], dim[2]]
        if opts.group_dims is not None:
            dim_rhs = opts.group_dims + dim_rhs
        dim_res = dim_block_sparse

    lhs_np = utils.create_dense_tensor(dim_lhs)
    lhs = tf.Variable(lhs_np, dtype=tf_type)
    if (not sparse_out):
        # Dense output: the custom op gets the block-sparse rhs, the
        # reference gets the masked dense rhs.
        rhs = tf.Variable(sparse_matrix, dtype=tf_type)
        rhs_ref = tf.Variable(dense_masked_matrix, dtype=tf_type)
    else:
        rhs_np = utils.create_dense_tensor(dim_rhs)
        rhs = tf.Variable(rhs_np, dtype=tf_type)
        rhs_ref = rhs
        # Expand the block mask to element resolution so the dense reference
        # result can be masked.
        sparsity_mask_2d = np.reshape(sparsity_mask, dim_sparse_mask)
        block_one = np.ones([block_size[0], block_size[2]], dtype=np.float32)
        res_mask_np = np.kron(sparsity_mask_2d, block_one)
        res_mask = tf.constant(res_mask_np, dtype=tf_type)

    # Reference computation; for sparse output the dense product is masked.
    if (not sparse_out):
        if compute_grads:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    s = tf.reduce_sum(c)
                    a_grad = tf.gradients(s, a)
                    b_grad = tf.gradients(s, b)
                    return c, a_grad, b_grad
        else:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    return c
    else:
        if compute_grads:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    c = c * res_mask
                    s = tf.reduce_sum(c)
                    a_grad = tf.gradients(s, a)
                    b_grad = tf.gradients(s, b)
                    return c, a_grad, b_grad
        else:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    c = c * res_mask
                    return c

    # Attributes passed to the custom op as a JSON string.
    bs_matmul_args = {
        "dim": dim,
        "block_size": block_size,
        "sparsity_mask": "".join(str(c) for c in sparsity_mask),
        "transposed_rhs": transposed_rhs,
        "data_type": data_type,
        "partial_data_type": partial_data_type,
        "inner_group_size": inner_group_size,
        "partition_method": partition_method,
        "memory_cycle_ratio": memory_cycle_ratio
    }
    json_attribs = json.dumps(bs_matmul_args)

    logger.debug(f"json_attribs: {json_attribs}")

    # Block-sparse computation via the precompiled user op.
    if compute_grads:

        def bs_matmul(a, b):
            outputs = {
                "output_types": [tf_type],
                "output_shapes": [tf.TensorShape(dim_res)]
            }
            lib_path = utils.get_lib_path("block_sparse")

            with tf.variable_scope("bs_matmul",
                                   reuse=tf.AUTO_REUSE,
                                   use_resource=True):
                c = ipu.custom_ops.precompiled_user_op(
                    [a, b],
                    lib_path,
                    outs=outputs,
                    op_name=op_name,
                    separate_gradients=False,
                    inputs_with_gradients=[0, 1],
                    attributes=json_attribs,
                    gradient_attributes=json_attribs)
                s = tf.reduce_sum(c)
                a_grad = tf.gradients(s, a)
                b_grad = tf.gradients(s, b)
                return c, a_grad, b_grad
    else:

        def bs_matmul(a, b):
            outputs = {
                "output_types": [tf_type],
                "output_shapes": [tf.TensorShape(dim_res)]
            }
            lib_path = utils.get_lib_path("block_sparse")

            with tf.variable_scope("bs_matmul",
                                   reuse=tf.AUTO_REUSE,
                                   use_resource=True):
                c = ipu.custom_ops.precompiled_user_op(
                    [a, b],
                    lib_path,
                    outs=outputs,
                    op_name=op_name,
                    separate_gradients=False,
                    inputs_with_gradients=[],
                    attributes=json_attribs)
                return c

    # Configure the IPU:
    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()

    with ipu.scopes.ipu_scope("/device:IPU:0"):
        dense_matmul_fetches = ipu.ipu_compiler.compile(
            dense_matmul, [lhs, rhs_ref])
        bs_matmul_fetches = ipu.ipu_compiler.compile(bs_matmul, [lhs, rhs])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        results_ref = sess.run(dense_matmul_fetches)
        results = sess.run(bs_matmul_fetches)

    # Unpack the fetched values; note the two result lists are indexed
    # differently for the output (results[0][0] vs results_ref[0]).
    if compute_grads:
        out_ref, lhs_grad_ref, rhs_grad_ref = (results_ref[0],
                                               results_ref[1][0],
                                               results_ref[2][0])
        out, lhs_grad, rhs_grad = (results[0][0], results[1][0],
                                   results[2][0])
    else:
        out_ref, lhs_grad_ref, rhs_grad_ref = (results_ref[0], None, None)
        out, lhs_grad, rhs_grad = (results[0], None, None)

    # Convert whichever reference quantity has a block-sparse counterpart
    # into block-sparse form so the two can be compared directly.
    if (sparse_out):
        out_ref = utils.to_block_sparse(np.array(out_ref), block_size_sparse,
                                        sparsity_mask)
    else:
        if compute_grads:
            rhs_grad_ref = np.array(rhs_grad_ref)
            if transposed_rhs:
                rhs_grad_ref = rhs_grad_ref.transpose(
                    sparse_transposed_indices)
            rhs_grad_ref = utils.to_block_sparse(rhs_grad_ref,
                                                 block_size_sparse,
                                                 sparsity_mask)

    return out, lhs_grad, rhs_grad, out_ref, lhs_grad_ref, rhs_grad_ref
def run_model(opts):
    """Train and/or test the MNIST model on a single IPU.

    Args:
        opts: Options object. Reads ``test_mode`` ("all", "training" or
            "tests"), ``sparsity`` and ``epochs``.

    Raises:
        ValueError: If the fixed number of IPU steps per epoch does not divide
            the number of batches per epoch.
    """
    # NOTE(review): `h1Size`, `block_size`, `model`, `loop_builder` and
    # `scheduler` are not bound in this function; presumably module-level
    # names - confirm.
    training = opts.test_mode in ["all", "training"]
    testing = opts.test_mode in ["all", "tests"]

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = 16
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]
    w_dense_shape = [num_pixels, h1Size]

    # Batch and weight dimensions must be whole numbers of blocks.
    assert (batch_size % block_size[0] == 0)
    assert (w_dense_shape[0] % block_size[1] == 0)
    assert (w_dense_shape[1] % block_size[2] == 0)
    block_rows = w_dense_shape[0] // block_size[1]
    block_cols = w_dense_shape[1] // block_size[2]

    # A negative sparsity leaves the mask unset.
    sparsity_mask = None
    if opts.sparsity >= 0.0:
        sparsity_mask = utils.create_random_sparse_mask(
            opts.sparsity, block_rows, block_cols).flatten()

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    epochs = opts.epochs
    ipu_steps_per_epoch = 15
    batches_per_epoch = num_train // batch_size
    test_batches = num_test // batch_size

    batches_per_step = batches_per_epoch // ipu_steps_per_epoch
    if not batches_per_epoch % ipu_steps_per_epoch == 0:
        raise ValueError(
            f"IPU steps per epoch {ipu_steps_per_epoch} must divide batches per epoch {batches_per_epoch}."
        )

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32,
                                 shape=data_shape,
                                 name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.cache().repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Use function binding to create all the builder functions that are
    # needed:
    if training:
        bound_train_model = partial(model, lr_placeholder,
                                    outfeed_train_queue, True, sparsity_mask)
        bound_train_loop = partial(loop_builder, batches_per_step,
                                   bound_train_model, infeed_train_queue)
    if testing:
        bound_test_model = partial(model, lr_placeholder, outfeed_test_queue,
                                   False, sparsity_mask)
        bound_test_loop = partial(loop_builder, test_batches,
                                  bound_test_model, infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        if training:
            train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        if testing:
            test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    if training:
        dequeue_train_outfeed = outfeed_train_queue.dequeue()
    if testing:
        dequeue_test_outfeed = outfeed_test_queue.dequeue()

    # Create a benchmark program for the infeed to determine maximum
    # achievable throughput:
    infeed_perf = dataset_benchmark.infeed_benchmark(infeed_train_queue,
                                                     epochs, num_train, True)

    print(
        f"\nImage shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    print(
        f"Epochs: {epochs} Batch-size: {batch_size} Steps-per-epoch: {ipu_steps_per_epoch} Batches-per-step: {batches_per_step}"
    )

    # Run the model:
    with tf.Session() as sess:
        print("Benchmarking the infeed...")
        sess.run(infeed_perf,
                 feed_dict={
                     place_x: x_train_flat,
                     place_y: y_train
                 })

        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer,
                 feed_dict={
                     place_x: x_train_flat,
                     place_y: y_train
                 })

        if training:
            print("Training...")
            progress = tqdm(
                range(epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                sess.run(metrics_initializer)
                for _ in range(ipu_steps_per_epoch):
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e)})
                    result = sess.run(dequeue_train_outfeed)
                    # Only update the progress bar when the outfeed produced
                    # values. The `!= 0` belongs outside len(): the code
                    # previously took the length of an element-wise
                    # comparison instead of comparing the length with zero.
                    if len(result['mean_loss']) != 0 and len(
                            result['acc']) != 0:
                        progress.set_description(
                            f"Loss {result['mean_loss'][0]:.5f} Accuracy {result['acc'][0]:.5f}"
                        )

            print("Saving...")
            saver.save(sess, "model")

        if testing:
            print("Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer,
                     feed_dict={
                         place_x: x_test_flat,
                         place_y: y_test
                     })
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = np.mean(result['mean_loss'])
            test_acc = np.mean(result['acc'])
            print(f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
def test_gru(self):
    """Run one step of the GRU model on the IPU and compare with fixed values.

    The GRU kernel is overwritten with a known matrix, the compiled model is
    run once, and both the outputs and the kernel read back afterwards are
    compared (via the mean difference) with hard-coded expectations.
    """
    max_steps = 2
    batch = 3
    initial_kernel = np.array([[
        0.36324948, 0.34305102, -0.47945526, 0.29105264, -0.55362725,
        0.33607864
    ], [
        -0.20881158, 0.79369456, 0.3866263, -0.55099547, 0.41944432,
        0.39612126
    ], [
        0.48400682, 0.16632384, -0.78809285, 0.47519642, 0.4464376,
        -0.63623476
    ], [
        -0.57933414, -0.29082513, -0.7381171, 0.77089626, -0.24111485,
        0.9164796
    ]])
    expected_outputs = np.array([[[-0.03196924, 0.06592286], [-0, 0]],
                                 [[-0.03196924, 0.06592286],
                                  [-0.06241067, 0.12973404]],
                                 [[-0.03196924, 0.06592286],
                                  [-0.06241067, 0.12973404]]])
    expected_kernel = np.array([[
        0.35011762, 0.37606436, -0.4793783, 0.29105875, -0.6845508,
        0.3001622
    ], [
        -0.22194342, 0.8267079, 0.38670325, -0.55098933, 0.28852075,
        0.36020482
    ], [
        0.48412853, 0.16602053, -0.7880953, 0.4751962, 0.4473563,
        -0.6360037
    ], [
        -0.57958513, -0.2901997, -0.73811203, 0.7708967, -0.24294817,
        0.9160184
    ]])

    # Feed values: all-ones inputs with per-example sequence lengths.
    feed_inputs = np.array(
        [[[1., 1.], [1., 1.]], [[1., 1.], [1., 1.]], [[1., 1.], [1., 1.]]],
        np.float32)
    feed_seq_len = np.array([1, 2, 2], np.int32)

    inputs = tf.placeholder(shape=[batch, max_steps, self.HIDDEN_SIZE],
                            dtype=self.model_dtype)
    seq_len = tf.placeholder(shape=[batch], dtype=tf.int32)

    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()
    utils.move_variable_initialization_to_cpu()

    with ops.device("/device:IPU:0"):
        train_ipu = ipu_compiler.compile(self.gru_model,
                                         inputs=[inputs, seq_len])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Locate the GRU kernel variable by name.
        for candidate in tf.global_variables():
            if candidate.name == 'popnn_dynamic_gru/kernel:0':
                gru_kernel_var = candidate
        sess.run(tf.assign(gru_kernel_var, initial_kernel))

        outputs = sess.run(train_ipu,
                           feed_dict={
                               inputs: feed_inputs,
                               seq_len: feed_seq_len
                           })
        updated_kernel = sess.run(gru_kernel_var)

        self.assertAlmostEqual(np.mean(outputs - expected_outputs),
                               np.float32(0.0),
                               delta=1e-7)
        self.assertAlmostEqual(np.mean(expected_kernel - updated_kernel),
                               np.float32(0.0),
                               delta=1e-8)
def training_graph(opts, training_data, device_index=0, learning_rate=0.001):
    """Build the training graph, configure one IPU and open a session.

    Args:
        opts: Options object. Reads ``batches_per_step``, ``logs_path``,
            ``fp_exceptions`` and ``prng``.
        training_data: Object providing ``get_dataset(opts, is_training=True)``.
        device_index: Suffix for the summary directory (``train<index>``).
        learning_rate: Value written to the ``learning_rate`` summary. The
            optimiser itself is driven by the ``learning_rate`` placeholder.

    Returns:
        A ``GraphOps`` bundle whose outputs are ``[loss, summary, rmse]``.
    """
    graph = tf.Graph()
    steps = opts.batches_per_step
    with graph.as_default():
        dataset, _, placeholders = training_data.get_dataset(
            opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        def step(total_loss_, sum_rmse_metric, *args):
            # The first dataset tensor carries the observed ratings.
            observed_ratings = args[0]
            loss, rmse_metric, apply_grads_ = graph_builder(
                opts,
                observed_ratings=observed_ratings,
                learning_rate=placeholders["learning_rate"])
            with tf.control_dependencies([apply_grads_]):
                return total_loss_ + loss, sum_rmse_metric + rmse_metric

        def comp_fn():
            start = [tf.constant(0, tf.float32), tf.constant(0, tf.float32)]
            return loops.repeat(steps, step, start, infeed)

        with ipu_scope('/device:IPU:0'):
            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        # Per-batch averages of the accumulated totals.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("RMSE/train", rmse)
        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(
        opts.logs_path + '/train{0}'.format(device_index),
        graph=graph,
        flush_secs=30)

    ipu_options = IPUConfig()
    fp_behaviour = ipu_options.floating_point_behaviour
    fp_behaviour.inv = opts.fp_exceptions
    fp_behaviour.div0 = opts.fp_exceptions
    fp_behaviour.oflo = opts.fp_exceptions
    fp_behaviour.esr = opts.prng
    fp_behaviour.nanoo = True
    ipu_options.auto_select_ipus = 1
    ipu_options.configure_ipu_system()

    train_sess = tf.Session(graph=graph)
    return GraphOps(graph, train_sess, train_init,
                    [loss, train_summary, rmse], placeholders, infeed,
                    train_saver, train_writer)
def test_augru(self):
    """Run one step of the AUGRU model on the IPU and compare with fixed values.

    The AUGRU kernel is overwritten with a known matrix, the compiled model
    is run once, and both the outputs and the kernel read back afterwards are
    compared (via the mean difference) with hard-coded expectations.
    """
    steps = 3
    batch = 3
    initial_kernel = np.array([[
        0.3188401, 0.8256132, -0.12287354, 0.8648142, -0.17983055,
        -0.45415568
    ], [
        -0.29249465, 0.65579015, -0.75681853, 0.4331085, -0.07700777,
        -0.47652483
    ], [
        -0.20116574, 0.52735907, -0.08258069, -0.21897888, -0.54514384,
        0.32709408
    ], [
        -0.43361932, -0.62175727, 0.28278595, 0.13071388, -0.29585528,
        -0.14058399
    ]])
    expected_outputs = np.array([[[-0.15881832, -0.39365855], [0., 0.],
                                  [0., 0.]],
                                 [[-0.15881832, -0.39365855],
                                  [-0.1270374, -0.56743807],
                                  [-0.09283338, -0.6407641]],
                                 [[-0.15881832, -0.39365855],
                                  [-0.1270374, -0.56743807], [0., 0.]]])
    expected_kernel = np.array([[
        0.31478855, 0.81888944, -0.12453551, 0.863326, -0.40852502,
        -0.5518727
    ], [
        -0.2965462, 0.6490664, -0.7584805, 0.4316203, -0.30570224,
        -0.5742418
    ], [
        -0.20129025, 0.52758944, -0.08233033, -0.21876118, -0.5368969,
        0.3306306
    ], [
        -0.43399453, -0.6211322, 0.28351453, 0.13140172, -0.25127774,
        -0.12138209
    ]])

    # Feed values: all-ones inputs, per-example lengths, attention scores 0.5.
    feed_inputs = np.ones([batch, steps, self.HIDDEN_SIZE], np.float32)
    feed_seq_len = np.array([1, 3, 2], np.int32)
    feed_alphas = np.ones([steps, batch], np.float32)
    feed_alphas = feed_alphas * 0.5

    inputs = tf.placeholder(shape=[batch, steps, self.HIDDEN_SIZE],
                            dtype=self.model_dtype)
    seq_len = tf.placeholder(shape=[batch], dtype=tf.int32)
    alphas = tf.placeholder(shape=[steps, batch], dtype=self.model_dtype)

    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()
    utils.move_variable_initialization_to_cpu()

    with ops.device("/device:IPU:0"):
        train_ipu = ipu_compiler.compile(self.augru_model,
                                         inputs=[inputs, seq_len, alphas])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Locate the AUGRU kernel variable by name.
        for candidate in tf.global_variables():
            if candidate.name == 'popnn_augru/kernel:0':
                augru_kernel_var = candidate
        sess.run(tf.assign(augru_kernel_var, initial_kernel))

        outputs = sess.run(train_ipu,
                           feed_dict={
                               inputs: feed_inputs,
                               seq_len: feed_seq_len,
                               alphas: feed_alphas
                           })
        updated_kernel = sess.run(augru_kernel_var)

        self.assertAlmostEqual(np.mean(outputs - expected_outputs),
                               np.float32(0.0),
                               delta=1e-7)
        self.assertAlmostEqual(np.mean(expected_kernel - updated_kernel),
                               np.float32(0.0),
                               delta=1e-8)
def main(args):
    """Check that the sparse transformer layer matches a dense equivalent.

    A sparse model is built and run on the IPU, its weights are written to a
    checkpoint in dense form, and a dense model restored from that checkpoint
    is run on the same input. Output activations, input gradients and
    per-layer weight gradients are then compared with
    ``np.testing.assert_allclose``.

    Args:
        args: Parsed options. Reads ``random_seed``, ``batch_size``,
            ``source_sequence_length``, ``hidden_length`` and ``dtype``; the
            whole object is also handed to both transformer constructors.

    Returns:
        ``(sparse_result, dense_result)``, the fetched results of both runs.

    Raises:
        AssertionError: If any comparison exceeds the tolerances.
    """
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)
    random_seed = args.random_seed
    # The checkpoint lives in a fresh temporary directory (it is not removed
    # by this function).
    checkpoint_path = os.path.join(tempfile.mkdtemp(), "model.ckpt")

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=random_seed)
    activations_np = random_gen.uniform(-0.1, 0.1, size=(args.batch_size, args.source_sequence_length, args.hidden_length))

    # Configure the IPU
    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()

    # Build IPU graphs
    sparse_decoder_graph = tf.Graph()
    sparse_transformer = DynsparseTransformer(args)
    with sparse_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weight placeholders are created inside sparse_transformer
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            sparse_decoder = partial(sparse_transformer_fwd_and_grad, sparse_transformer)
            sparse_decoder_fetches = ipu.ipu_compiler.compile(sparse_decoder, [inputs_ph])
        ipu.utils.move_variable_initialization_to_cpu()

    # sparse-decoder
    with tf.Session(graph=sparse_decoder_graph) as sess:
        # initialize weights
        sess.run(tf.global_variables_initializer())

        # Save the sparse weights to checkpoint as dense
        sparse_transformer.checkpointAsDense(checkpoint_path)

        # run sparse decoder
        sparse_result = sess.run(sparse_decoder_fetches, feed_dict={inputs_ph: activations_np})

    # Create a dense transformer and initialize the weights to the values that
    # the sparse model was initialized with originally
    dense_decoder_graph = tf.Graph()
    dense_transformer = DenseTransformer(args)
    with dense_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weights will get streamed from checkpoint
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            dense_decoder_fetches = partial(dense_transformer_fwd_and_grad, dense_transformer)
            dense_graph = ipu.ipu_compiler.compile(dense_decoder_fetches, [inputs_ph])

        ipu.utils.move_variable_initialization_to_cpu()

        with tf.device("cpu"):
            # We will only load the trainable variables, not momentum etc.
            loader = tf.train.Saver(tf.trainable_variables())

    # dense-decoder
    with tf.Session(graph=dense_decoder_graph) as sess:
        # Initialize momentums, which are not part of the checkpoint
        sess.run(tf.global_variables_initializer())
        # Restore saved trainable variables
        loader.restore(sess, checkpoint_path)
        dense_result = sess.run(dense_graph, feed_dict={inputs_ph: activations_np})

    # TEST
    # Looser tolerances are used when running in float16.
    rtol = 1e-05
    atol = 1e-05
    if args.dtype == tf.float16:
        rtol = 1e-04
        atol = 1e-02
    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"], dense_result["output_activation"],
                               atol=atol, rtol=rtol, err_msg="Output activations do not match.")

    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"], dense_result["input_grad"],
                               atol=atol, rtol=rtol, err_msg="Grads wrt. inputs do not match")

    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(sparse_grad_w, dense_grad, atol=atol, rtol=rtol,
                                   err_msg=f"Dense grads for layer {name} do not match")

        # Compare the sparse grads
        sparse_grad_padded = sparse_result[name + "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(sparse_layer.weights.spec, sparse_grad_data, sparse_layer.weights.matmul_options)

        # Convert dense grads to blocks
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        nx, ny = dense_grad.shape[0] // block_size, dense_grad.shape[1] // block_size
        strides = np.array(dense_grad.strides)  # strides are in bytes
        # The first two strides step between blocks, the last two step within
        # a block, giving a (nx, ny, block_size, block_size) view.
        strides = tuple(strides * block_size) + tuple(strides)
        blocked_dense_grad = np.lib.stride_tricks.as_strided(dense_grad, (nx, ny, block_size, block_size), strides)
        if block_size == 1:
            # Drop the two trailing block axes (both of length 1 here).
            blocked_dense_grad = np.squeeze(np.copy(blocked_dense_grad), axis=(-2, -1))
        np.testing.assert_allclose(sparse_grad, blocked_dense_grad[i, j], atol=atol, rtol=rtol,
                                   err_msg=f"Sparse grads for layer {name} do not match")

    print("All results match.")
    return sparse_result, dense_result
def generic_graph(opts, is_training):
    """Build the inference graph, configure the IPU and open a session.

    Args:
        opts: Option mapping. This function reads ``seed``,
            ``use_synthetic_data``, ``batches_per_step``, ``use_ipu_model``
            and ``replicas``.
        is_training: Forwarded unchanged to ``id_embedding``.

    Returns:
        A tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.
    """
    master_dtype = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.placeholder(master_dtype, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, opts['seed'])

        dataset = (get_synthetic_dataset(opts)
                   if opts['use_synthetic_data']
                   else get_dataset_embed(opts, False))
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def infer_step(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                       sl):
            prob, accuracy = graph_builder(
                opts, uid_embedding, mid_embedding, cat_embedding,
                placeholders['learning_rate'], uids, mids, cats, mid_his,
                cat_his, mid_mask, target, sl, use_negsampling=False)
            # The enqueue is created under a control dependency on prob.
            with tf.control_dependencies([prob]):
                return outfeed_queue.enqueue((prob, target, accuracy))

        def comp_fn():
            return loops.repeat(opts['batches_per_step'], infer_step, [],
                                infeed)

        with ipu_scope('/device:IPU:0'):
            outputs = ipu_compiler.compile(comp_fn, [])

        outfeed = outfeed_queue.dequeue()
        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()

    session = tf.Session(graph=graph)
    graph_ops = GraphOps(graph, session, init, [outputs], placeholders,
                         infeed, outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def run_inference(loop_op: tf.Operation,
                  infeed_queue_initializer: tf.Operation,
                  outfeed_op: tf.Operation,
                  batch_size: int,
                  batches_per_step: int,
                  network_name: str,
                  decode_predictions: Callable,
                  ground_truth: Tuple[str],
                  num_iterations: Optional[int] = 500,
                  num_ipus: Optional[int] = 1,
                  mode: Optional[str] = "single_ipu",
                  data: Optional[str] = "real",
                  available_memory_proportion: Optional[float] = 0.6) -> None:
    """Run inference on device and decode predictions.

    Args:
        loop_op: Inference op.
        infeed_queue_initializer: Initializer for the infeed queue.
        outfeed_op: Outfeed operator to extract results.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        network_name: Name of this network, to use in frames_per_second plot filename.
        decode_predictions: Function to decode predictions with.
        ground_truth: Ground-truth labels.
        num_iterations: Number of iterations to run the inference, if running in a loop.
        num_ipus: Number of ipus to run the inference on.
        mode: Mode of inference - {"single_ipu", "replicated"}
        data: Run on real data transferred from host or on random synthetic
            data generated on device.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution

    The closing summary skips the first 20 iterations as warm-up. When 20 or
    fewer iterations were run it reports over all of them instead, and when
    no iteration was run it prints a notice and returns.
    """
    # Number of leading iterations treated as warm-up in the summary.
    warmup_iterations = 20

    # Set compile and device options
    opts = IPUConfig()
    opts.matmuls.poplar_options = {
        'availableMemoryProportion': str(available_memory_proportion)
    }
    opts.convolutions.poplar_options = {
        'availableMemoryProportion': str(available_memory_proportion)
    }

    # Only replicated mode multiplies the per-step throughput by the IPU count.
    if mode == 'replicated':
        num_replicas = num_ipus
    else:
        num_replicas = 1

    opts.auto_select_ipus = num_ipus
    opts.configure_ipu_system()

    fps = []
    predictions = None
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        for iter_count in range(num_iterations):
            start = time.time()
            session.run(loop_op)
            predictions = session.run(outfeed_op)
            stop = time.time()
            duration = stop - start

            fps.append(batch_size * batches_per_step * num_replicas / duration)
            logging.info(
                "Iter {4}: {0} Throughput using {1} data = {2:.1f} imgs/sec at batch size = {3}"
                .format(network_name, data, fps[-1], batch_size, iter_count))

            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += " {:5f} images/sec.".format(fps[-1])
            print(report_string)
            print("Total time: {}".format(duration))

            # Decode a random prediction per step to check functional correctness.
            if data == 'real':
                predictions = np.reshape(predictions, (-1, predictions.shape[-1]))
                index = np.random.randint(0, len(predictions))
                if network_name in ("inceptionv1", "efficientnet-s",
                                    "efficientnet-m", "efficientnet-l"):
                    # These models encode background in 0th index.
                    decoded_predictions = decode_predictions(
                        predictions[index:index + 1, 1:], top=3)
                else:
                    decoded_predictions = decode_predictions(
                        predictions[index:index + 1, :], top=3)
                labels_and_probs = [
                    (label, prob) for _, label, prob in decoded_predictions[0]
                ]
                print(
                    'Actual: ',
                    ground_truth[(index + num_replicas * iter_count *
                                  batches_per_step * batch_size) %
                                 len(ground_truth)])
                print('Predicted: ', labels_and_probs)

    if not fps:
        # Nothing ran: there is no throughput sample and no predictions dtype
        # to report, so skip the summary rather than fail on empty data.
        print("No iterations were run; throughput statistics are unavailable.")
        return

    if len(fps) > warmup_iterations:
        print("Average statistics excluding the 1st 20 iterations.")
        fps = fps[warmup_iterations:]
    else:
        # Dropping the warm-up would leave no samples and min()/max() would
        # raise ValueError, so fall back to every recorded iteration.
        print("Average statistics over all {} iterations "
              "(too few to exclude the 1st 20).".format(len(fps)))
    print(
        "-------------------------------------------------------------------------------------------"
    )
    print("Throughput at bs={}, data_mode={}, data_type={}, mode={},"
          " num_ipus={}, of {}: min={}, max={}, mean={}, std={}.".format(
              batch_size, data, predictions.dtype, mode, num_ipus,
              network_name, min(fps), max(fps), np.mean(fps), np.std(fps)))