def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False):
    """Builds ipu_options"""
    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        config = utils.set_gcl_options(config, num_io_tiles=128,
                                       gcl_options={"useGclCollectives": "true"})

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        # Assign the result back so the recomputation setting is not silently dropped.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
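
# A minimal usage sketch, not part of the original source: it assumes the same
# Graphcore TensorFlow 1.x `utils` module used above, and the argument values
# are illustrative only. `utils.configure_ipu_system` (which also appears in
# the surrounding code) applies the returned IpuOptions before any IPU graph
# is run.
def configure_ipu_from_defaults():
    config = get_config(prng=True, shards=2, profile="TILE_PROFILE")
    utils.configure_ipu_system(config)
    return config
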
def testEngineCompilationOptions(self):
    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [480], name="a")
            pb = array_ops.placeholder(np.float32, [480], name="b")
            output = pa + pb

        opts = utils.create_ipu_config()
        opts = utils.set_compilation_options(opts, {"some_option": "some_value"})
        utils.configure_ipu_system(opts)

        fd = {pa: np.zeros([480]), pb: np.zeros([480])}
        with self.assertRaisesRegex(errors.InvalidArgumentError,
                                    "Unrecognised option"):
            sess.run(output, fd)
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=False,
               availableMemoryProportion=None):
    """Builds ipu_options"""
    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile,
        profile_execution=profile)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers": "false" if seed is None else "true",
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config, {"availableMemoryProportion": str(availableMemoryProportion)})

    if xla_recompute:
        # Assign the result back so the recomputation setting is not silently dropped.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def get_config(opts, training=True, profiling=False):
    """Builds ipu_options"""
    config = utils.create_ipu_config(profiling=profiling)

    ipus = opts.select_ipus
    if ipus[0] == -1:
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control it
        if not opts.multiprocessing:
            config = utils.auto_select_ipus(config, [train_ipus, valid_ipus])
        else:
            ipus = train_ipus if training else valid_ipus
            config = utils.auto_select_ipus(config, [ipus])
    else:
        if opts.multiprocessing:
            ipus = [ipus[0] if training else ipus[1]]
        config = utils.select_ipus(config, ipus)

    config = utils.set_compilation_options(
        config, {"prng.enable": "true" if opts.prng else "false"})

    return config
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               limitVertexState=None):
    """Builds ipu_options"""
    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE
    }

    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers": "false" if seed is None else "portable",
        })

    if internalExchangeOptimisationTarget is not None:
        # Assign the result back so the option is not silently dropped.
        config = utils.set_compilation_options(
            config, {
                "opt.internalExchangeOptimisationTarget":
                    internalExchangeOptimisationTarget
            })

    if limitVertexState is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.limitVertexStateToLower256K":
                    "true" if limitVertexState else "false"
            })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config, {"availableMemoryProportion": str(availableMemoryProportion)})

    if half_partials:
        config = utils.set_convolution_options(config, {"partialsType": 'half'})
        config = utils.set_matmul_options(config, {"partialsType": 'half'})

    if conv_dithering:
        config = utils.set_convolution_options(config,
                                               {"enableConvDithering": "true"})

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only
    # need to do 1/N batches in each stream. For this reason, batches_per_step
    # must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        kwargs = {} if opts.replication_factor == 1 else {
            'replication_factor': opts.replication_factor}
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, f"{mode_name}_dataset_infeed", **kwargs)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=placeholders['learning_rate'] if training else None,
                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse
                return loops.repeat(
                    batches_per_step,
                    body,
                    [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2,
                    infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None
        if opts.compiler_report:
            if training:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

        writer = tf.summary.FileWriter(
            opts.logs_path + f'/{mode_name}', graph=graph, flush_secs=30)

        # Attach to IPUs and configure system.
        # Subprocesses must set up IPU systems in their own scopes, then use
        # their devices as IPU:0.
        if (not training and opts.multiprocessing) or training:
            config = ipu_utils.create_ipu_config(
                profiling=training,
                use_poplar_text_report=True,
                max_cross_replica_sum_buffer_size=10000000,
                max_inter_ipu_copies_buffer_size=10000000)
            if opts.select_ipus == 'AUTO':
                config = ipu_utils.auto_select_ipus(config, [opts.replication_factor])
            else:
                config = ipu_utils.select_ipus(config, [opts.select_ipus[not training]])
            config = ipu_utils.set_compilation_options(
                config, {"prng.enable": str(opts.prng).lower()})
            ipu_utils.configure_ipu_system(config)

        graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
        sess = tf.Session(graph=graph)

    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if training else None,
                    infeed, saver, writer, report, trainFlag)
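
# A hedged usage sketch, not from the original source: the attribute names on
# GraphOps used here (`session`, `init`, `infeed`, `ops`) are assumptions
# inferred from the positional construction above and may differ in the real
# namedtuple.
def run_one_step(graph_ops, feed_dict=None):
    # Variables and the infeed queue must each be initialised once per session
    # before the compiled loop can execute.
    graph_ops.session.run(graph_ops.init)
    graph_ops.session.run(graph_ops.infeed.initializer)
    # Each run executes batches_per_step batches on the IPU and returns the
    # averaged loss/RMSE plus the merged summary.
    return graph_ops.session.run(graph_ops.ops, feed_dict=feed_dict or {})
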
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None):
    """Builds ipu_options"""
    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        # The GCL_NUM_IO_TILES environment variable sets how many tiles on the
        # IPU are reserved for Graphcore Communication Library (GCL) collectives.
        iotiles = int(os.environ['GCL_NUM_IO_TILES'])
        if iotiles % 2 or iotiles < 32 or iotiles > 192:
            raise ValueError(
                'GCL IO tiles must be an even number between 32 and 192, '
                'got {}.'.format(iotiles))
        config = utils.set_gcl_options(config, num_io_tiles=iotiles,
                                       gcl_options={"useGclCollectives": "true"})

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "portable",
    })

    if internalExchangeOptimisationTarget is not None:
        # Assign the result back so the option is not silently dropped.
        config = utils.set_compilation_options(config, {
            "opt.internalExchangeOptimisationTarget": internalExchangeOptimisationTarget
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if half_partials:
        config = utils.set_convolution_options(config, {"partialsType": 'half'})
        config = utils.set_matmul_options(config, {"partialsType": 'half'})

    if conv_dithering:
        config = utils.set_convolution_options(config, {"enableConvDithering": "true"})

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
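
# An illustrative sketch, not from the original source, of enabling the GCL
# collectives path that get_config checks for. Both environment variables
# appear in the function above; the tile count of 64 is an arbitrary valid
# value (even and within [32, 192]), and the replica count is illustrative.
def enable_gcl_collectives(num_io_tiles=64):
    os.environ["GCL_REAL_COLLECTIVES"] = "1"
    os.environ["GCL_NUM_IO_TILES"] = str(num_io_tiles)
    return get_config(number_of_replicas=2)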