def get_config(opts):
    """Assemble the IPU configuration described by ``opts``.

    Attributes read from ``opts``: ``report``, ``device_id``, ``shards``,
    ``replicas``, ``convolution_options``, ``matmul_options`` and
    ``enable_half_partials``. The two ``*_options`` attributes are decoded
    with ``json.loads`` whenever they are truthy.

    Returns:
        The configuration object handed back by the last ``utils`` call.
    """
    report_enabled = opts.report
    ipu_options = utils.create_ipu_config(
        profiling=report_enabled,
        profile_execution=report_enabled,
        report_every_nth_execution=1,
    )

    # -1 means "no explicit device": ask for shards * replicas IPUs instead.
    if opts.device_id != -1:
        ipu_options = utils.select_ipus(ipu_options, [opts.device_id])
    else:
        ipu_options = utils.auto_select_ipus(
            ipu_options, opts.shards * opts.replicas)

    # User-supplied option overrides arrive as JSON text.
    if opts.convolution_options:
        conv_overrides = json.loads(opts.convolution_options)
        ipu_options = utils.set_convolution_options(
            ipu_options, conv_overrides)

    if opts.matmul_options:
        matmul_overrides = json.loads(opts.matmul_options)
        ipu_options = utils.set_matmul_options(ipu_options, matmul_overrides)

    # Half partials are applied last, to matmuls first and then convolutions.
    if opts.enable_half_partials:
        ipu_options = utils.set_matmul_options(
            ipu_options, {"partialsType": 'half'})
        ipu_options = utils.set_convolution_options(
            ipu_options, {"partialsType": 'half'})

    return ipu_options
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False):
    """Builds ipu_options.

    Args:
        prng: Sets the "prng.enable" compilation option and the ``esr``
            floating-point behaviour flag.
        ipu_id: Device id passed to ``utils.select_ipus``; -1 selects
            ``number_of_replicas * shards`` IPUs automatically.
        shards: Multiplied with ``number_of_replicas`` for auto selection.
        number_of_replicas: See ``shards``.
        max_cross_replica_buffer_size: Forwarded to ``create_ipu_config`` as
            ``max_cross_replica_sum_buffer_size``.
        merge_infeed_io_copies: Forwarded to ``create_ipu_config``.
        fp_exceptions: Value used for the ``inv``, ``div0`` and ``oflo``
            floating-point behaviour flags.
        xla_recompute: When truthy, recomputation is allowed.
        seed: Only tested against ``None``; a non-None seed sets
            "target.deterministicWorkers" to "true".
        profile: ``None``/falsy disables execution profiling, otherwise one
            of "NO_PROFILE", "TILE_PROFILE", "DEVICE_PROFILE", "IPU_PROFILE".
        availableMemoryProportion: When not ``None``, its ``str()`` is set as
            the convolution option "availableMemoryProportion".
        stable_norm: When truthy, enables stable statistics for norms.

    Returns:
        The configuration object produced by the ``utils`` helpers.

    Raises:
        KeyError: If ``profile`` is truthy but not a known profile mode name.
    """
    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE,
    }

    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    # GCL collectives are opted into purely through the environment.
    if "GCL_REAL_COLLECTIVES" in os.environ:
        config = utils.set_gcl_options(
            config,
            num_io_tiles=128,
            gcl_options={"useGclCollectives": "true"})

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        # Keep the returned configuration like every other setter call in
        # this function, rather than relying on in-place mutation.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def get_config(opts):
    """Create the IPU configuration for the given options.

    Attributes read from ``opts``: ``cycle_report``, ``device_id``,
    ``shards`` and ``convolution_options`` (JSON text, decoded only when
    truthy).

    Returns:
        The configuration object handed back by the last ``utils`` call.
    """
    want_report = opts.cycle_report
    ipu_options = utils.create_ipu_config(
        profiling=want_report,
        profile_execution=want_report,
        report_every_nth_execution=1,
    )

    # An explicit device id wins; -1 falls back to automatic selection with
    # a single-entry list holding the shard count (1 when shards is falsy).
    if opts.device_id != -1:
        ipu_options = utils.select_ipus(ipu_options, [opts.device_id])
    else:
        ipu_options = utils.auto_select_ipus(ipu_options, [opts.shards or 1])

    if opts.convolution_options:
        conv_overrides = json.loads(opts.convolution_options)
        ipu_options = utils.set_convolution_options(
            ipu_options, conv_overrides)

    return ipu_options
def get_config(fp_exceptions,
               xla_recompute,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id):
    """Build the IPU options object.

    Args:
        fp_exceptions: Value used for the ``inv``, ``div0``, ``oflo`` and
            ``nanoo`` floating-point behaviour flags.
        xla_recompute: Passed as ``allow_recompute``.
        disable_graph_outlining: Forwarded to ``create_ipu_config``.
        num_required_ipus: IPU count used when no device id is given.
        enable_stochastic_rounding: Passed as the ``esr`` flag.
        max_cross_replica_sum_buffer_size: Forwarded to
            ``set_optimization_options``.
        scheduler_selection: Forwarded to ``create_ipu_config``.
        compile_only: When truthy, the connection type is set to ``NEVER``
            so no device is acquired.
        ipu_id: Explicit device id; any falsy value selects automatically.

    Returns:
        The configuration object handed back by the last ``utils`` call.
    """
    ipu_options = utils.create_ipu_config(
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        disable_graph_outlining=disable_graph_outlining,
        selection_order=utils.SelectionOrder.AUTO,
        scheduler_selection=scheduler_selection,
    )

    # NOTE(review): this is a truthiness test, so a device id of 0 also takes
    # the automatic path - confirm that is intended.
    if not ipu_id:
        ipu_options = utils.auto_select_ipus(ipu_options, num_required_ipus)
    else:
        ipu_options = utils.select_ipus(ipu_options, [ipu_id])

    ipu_options = utils.set_recomputation_options(
        ipu_options, allow_recompute=xla_recompute)

    # A simple way to skip the big `Transpose` operation due to bad
    # allocation was left disabled here:
    # utils.set_matmul_options(config, clear_pass_type=True)
    ipu_options = utils.set_norm_options(
        ipu_options, use_stable_statistics=True)

    fp_flags = {
        "inv": fp_exceptions,
        "div0": fp_exceptions,
        "oflo": fp_exceptions,
        "esr": enable_stochastic_rounding,
        "nanoo": fp_exceptions,
    }
    ipu_options = utils.set_floating_point_behaviour_options(
        ipu_options, **fp_flags)

    ipu_options = utils.set_optimization_options(
        ipu_options,
        merge_remote_buffers=True,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size)

    if not compile_only:
        return ipu_options

    # Compile only: never attach to a device.
    return utils.set_ipu_connection_type(
        ipu_options,
        utils.DeviceConnectionType.NEVER,
        ipu_version=2,
        enable_remote_buffers=True)
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=False,
               availableMemoryProportion=None):
    """Builds ipu_options.

    Args:
        prng: Sets the "prng.enable" compilation option and the ``esr``
            floating-point behaviour flag.
        ipu_id: Device id passed to ``utils.select_ipus``; -1 selects
            ``number_of_replicas * shards`` IPUs automatically.
        shards: Multiplied with ``number_of_replicas`` for auto selection.
        number_of_replicas: See ``shards``.
        max_cross_replica_buffer_size: Forwarded to ``create_ipu_config`` as
            ``max_cross_replica_sum_buffer_size``.
        merge_infeed_io_copies: Forwarded to ``create_ipu_config``.
        fp_exceptions: Value used for the ``inv``, ``div0`` and ``oflo``
            floating-point behaviour flags.
        xla_recompute: When truthy, recomputation is allowed.
        seed: Only tested against ``None``; a non-None seed sets
            "target.deterministicWorkers" to "true".
        profile: Passed as both ``profiling`` and ``profile_execution``.
        availableMemoryProportion: When not ``None``, its ``str()`` is set as
            the convolution option "availableMemoryProportion".

    Returns:
        The configuration object produced by the ``utils`` helpers.
    """
    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile,
        profile_execution=profile)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers":
            "false" if seed is None else "true",
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if xla_recompute:
        # Keep the returned configuration like every other setter call in
        # this function, rather than relying on in-place mutation.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def get_config(opts, training=True, profiling=False):
    """Build the IPU options for a training or validation process.

    Args:
        opts: Options object; ``select_ipus``, ``multiprocessing`` and
            ``prng`` are read from it.
        training: Chooses between the training and validation entry when
            ``opts.multiprocessing`` is set.
        profiling: Forwarded to ``utils.create_ipu_config``.

    Returns:
        The configuration object handed back by
        ``utils.set_compilation_options``.
    """
    ipu_options = utils.create_ipu_config(profiling=profiling)
    requested = opts.select_ipus

    if requested[0] != -1:
        # Explicit device ids. Under multiprocessing each process keeps only
        # its own entry: index 0 for training, index 1 for validation.
        if opts.multiprocessing:
            requested = [requested[0] if training else requested[1]]
        ipu_options = utils.select_ipus(ipu_options, requested)
    else:
        # Automatic selection with fixed counts.
        train_count = 1  # opts.shards
        valid_count = 1  # This might want an option to control
        if opts.multiprocessing:
            counts = [train_count if training else valid_count]
        else:
            counts = [train_count, valid_count]
        ipu_options = utils.auto_select_ipus(ipu_options, counts)

    prng_flag = "true" if opts.prng else "false"
    return utils.set_compilation_options(
        ipu_options, {"prng.enable": prng_flag})
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               limitVertexState=None):
    """Builds ipu_options.

    Args:
        prng: Sets the "prng.enable" compilation option and the ``esr``
            floating-point behaviour flag.
        ipu_id: Device id passed to ``utils.select_ipus``; -1 selects
            ``number_of_replicas * shards`` IPUs automatically.
        shards: Multiplied with ``number_of_replicas`` for auto selection.
        number_of_replicas: See ``shards``.
        max_cross_replica_buffer_size: Forwarded to
            ``set_optimization_options`` as
            ``max_cross_replica_sum_buffer_size``.
        merge_infeed_io_copies: Forwarded to ``create_ipu_config``.
        fp_exceptions: Value used for the ``inv``, ``div0`` and ``oflo``
            floating-point behaviour flags.
        half_partials: When truthy, sets "partialsType" to 'half' for
            convolutions and matmuls.
        conv_dithering: When truthy, sets "enableConvDithering" to "true".
        xla_recompute: When truthy, recomputation is allowed.
        seed: Only tested against ``None``; a non-None seed sets
            "target.deterministicWorkers" to "portable".
        profile: ``None``/falsy disables execution profiling, otherwise one
            of "NO_PROFILE", "TILE_PROFILE", "DEVICE_PROFILE", "IPU_PROFILE".
        availableMemoryProportion: When not ``None``, its ``str()`` is set as
            the convolution option "availableMemoryProportion".
        stable_norm: When truthy, enables stable statistics for norms.
        internalExchangeOptimisationTarget: When not ``None``, passed
            verbatim as "opt.internalExchangeOptimisationTarget".
        limitVertexState: When not ``None``, its truthiness is written to
            "opt.limitVertexStateToLower256K" as "true"/"false".

    Returns:
        The configuration object produced by the ``utils`` helpers.

    Raises:
        KeyError: If ``profile`` is truthy but not a known profile mode name.
    """
    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE
    }

    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers":
            "false" if seed is None else "portable",
        })

    if internalExchangeOptimisationTarget is not None:
        # Keep the returned configuration, as the other setter calls do,
        # rather than relying on in-place mutation.
        config = utils.set_compilation_options(
            config, {
                "opt.internalExchangeOptimisationTarget":
                internalExchangeOptimisationTarget
            })

    if limitVertexState is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.limitVertexStateToLower256K":
                "true" if limitVertexState else "false"
            })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if half_partials:
        config = utils.set_convolution_options(config,
                                               {"partialsType": 'half'})
        config = utils.set_matmul_options(config, {"partialsType": 'half'})

    if conv_dithering:
        config = utils.set_convolution_options(
            config, {"enableConvDithering": "true"})

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        # Same as above: do not drop the returned configuration.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def generic_graph(opts, data, trainFlag):
    """Build the graph, session and supporting ops for one run mode.

    Creates a fresh ``tf.Graph`` containing a dataset infeed, an IPU-compiled
    loop that calls ``graph_builder`` ``batches_per_step`` times, averaged
    loss/RMSE outputs, summaries, a saver and a variable initializer. It then
    creates a summary ``FileWriter``, optionally configures the IPU system,
    opens a ``tf.Session`` and bundles everything into a ``GraphOps``.

    Args:
        opts: Options object. Attributes read here: ``batches_per_step``,
            ``validation_batches_per_step``, ``replication_factor``,
            ``dtypes``, ``compiler_report``, ``logs_path``,
            ``multiprocessing``, ``select_ipus`` and ``prng``.
        data: Object whose ``get_dataset(opts, mode=...)`` returns a
            ``(dataset, placeholders)`` pair.
        trainFlag: Mode value; training mode is ``util.Modes.TRAIN``.

    Returns:
        A ``GraphOps`` built from the graph, session, initializer, output
        tensors, placeholders (``None`` unless training), infeed, saver,
        writer, report op (``None`` unless ``opts.compiler_report``) and
        ``trainFlag``.
    """
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only need to do 1/N batches in each stream.
    # For this reason, batches_per_step must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        # replication_factor is only passed to the infeed queue when it is not 1.
        kwargs = {} if opts.replication_factor == 1 else {'replication_factor': opts.replication_factor}
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset, f"{mode_name}_dataset_infeed", **kwargs)

        with ipu_scope(f'/device:IPU:0'):
            def comp_fn():
                # Loop body: the last column of each batch is the ground
                # truth, every other column is the observed input.
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(opts,
                                                        observed=batch[:, :-1],
                                                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                                                        learning_rate=placeholders['learning_rate'] if training else None,
                                                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    # In training the accumulated outputs are created under a
                    # control dependency on grad_op.
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse
                # Two zero accumulators (loss, rmse) of dtype np.<opts.dtypes[0]>.
                return loops.repeat(batches_per_step,
                                    body,
                                    [tf.constant(0, getattr(np, opts.dtypes[0]))]*2,
                                    infeed)
            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        # The report op stays None unless a compiler report was requested.
        report = None
        if opts.compiler_report:
            if training:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

    writer = tf.summary.FileWriter(
        opts.logs_path + f'/{mode_name}',
        graph=graph,
        flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    # The condition holds when training, or when validating with multiprocessing.
    if (not training and opts.multiprocessing) or training:
        config = ipu_utils.create_ipu_config(profiling=training,
                                             use_poplar_text_report=True,
                                             max_cross_replica_sum_buffer_size=10000000,
                                             max_inter_ipu_copies_buffer_size=10000000)
        if opts.select_ipus == 'AUTO':
            config = ipu_utils.auto_select_ipus(config, [opts.replication_factor])
        else:
            # Boolean index: entry 0 when training, entry 1 otherwise.
            config = ipu_utils.select_ipus(config, [opts.select_ipus[not training]])
        config = ipu_utils.set_compilation_options(config, {"prng.enable": str(opts.prng).lower()})
        ipu_utils.configure_ipu_system(config)

    # Training reports the average loss, validation the average RMSE; the
    # merged summary is appended in both cases.
    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph,
                    sess,
                    init,
                    graph_outputs,
                    placeholders if training else None,
                    infeed,
                    saver,
                    writer,
                    report,
                    trainFlag)
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None):
    """Builds ipu_options.

    Args:
        prng: Sets the "prng.enable" compilation option and the ``esr``
            floating-point behaviour flag.
        ipu_id: Device id passed to ``utils.select_ipus``; -1 selects
            ``number_of_replicas * shards`` IPUs automatically.
        shards: Multiplied with ``number_of_replicas`` for auto selection.
        number_of_replicas: See ``shards``.
        max_cross_replica_buffer_size: Forwarded to
            ``set_optimization_options`` as
            ``max_cross_replica_sum_buffer_size``.
        merge_infeed_io_copies: Forwarded to ``create_ipu_config``.
        fp_exceptions: Value used for the ``inv``, ``div0`` and ``oflo``
            floating-point behaviour flags.
        half_partials: When truthy, sets "partialsType" to 'half' for
            convolutions and matmuls.
        conv_dithering: When truthy, sets "enableConvDithering" to "true".
        xla_recompute: When truthy, recomputation is allowed.
        seed: Only tested against ``None``; a non-None seed sets
            "target.deterministicWorkers" to "portable".
        profile: ``None``/falsy disables execution profiling, otherwise one
            of "NO_PROFILE", "TILE_PROFILE", "DEVICE_PROFILE", "IPU_PROFILE".
        availableMemoryProportion: When not ``None``, its ``str()`` is set as
            the convolution option "availableMemoryProportion".
        stable_norm: When truthy, enables stable statistics for norms.
        internalExchangeOptimisationTarget: When not ``None``, passed
            verbatim as "opt.internalExchangeOptimisationTarget".

    Environment:
        GCL_REAL_COLLECTIVES: If present, GCL collectives are enabled and
            GCL_NUM_IO_TILES is read.
        GCL_NUM_IO_TILES: Number of IO tiles; must be an even integer in
            the range 32..192.

    Returns:
        The configuration object produced by the ``utils`` helpers.

    Raises:
        KeyError: If ``profile`` is truthy but not a known profile mode
            name, or GCL_REAL_COLLECTIVES is set without GCL_NUM_IO_TILES.
        ValueError: If GCL_NUM_IO_TILES is not an integer or is outside the
            accepted values.
    """
    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE,
    }

    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        # The GCL_NUM_IO_TILES environment variable sets how many tiles in
        # the IPU are reserved for Graphcore Communication Library (GCL)
        # collectives.
        iotiles = int(os.environ['GCL_NUM_IO_TILES'])
        if iotiles % 2 or iotiles < 32 or iotiles > 192:
            # Report the rejected value; the original message had no
            # placeholder, so .format() silently discarded it.
            raise ValueError(
                'GCL IO Tiles must be a multiple of 2 in between 32 and 192 '
                '(got {}).'.format(iotiles))
        config = utils.set_gcl_options(config,
                                       num_io_tiles=iotiles,
                                       gcl_options={
                                           "useGclCollectives": "true",
                                       })

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "portable",
    })

    if internalExchangeOptimisationTarget is not None:
        # Keep the returned configuration, as the other setter calls do,
        # rather than relying on in-place mutation.
        config = utils.set_compilation_options(config, {
            "opt.internalExchangeOptimisationTarget": internalExchangeOptimisationTarget
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if half_partials:
        config = utils.set_convolution_options(config, {
            "partialsType": 'half'
        })
        config = utils.set_matmul_options(config, {
            "partialsType": 'half'
        })

    if conv_dithering:
        config = utils.set_convolution_options(config, {
            "enableConvDithering": "true"
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        # Same as above: do not drop the returned configuration.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config