Example #1
def get_config(fp_exceptions,
               enable_recomputation,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               max_reduce_scatter_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id,
               available_memory_proportion=None,
               partials_type="half",
               minimum_remote_tensor_size=128):

    # Builds ipu_options
    cfg = IPUConfig()

    if ipu_id:
        cfg.select_ipus = [ipu_id]
    else:
        cfg.auto_select_ipus = num_required_ipus

    cfg.allow_recompute = enable_recomputation
    cfg.scheduling.algorithm = SchedulingAlgorithm[scheduler_selection]
    cfg.norms.use_stable_statistics = True
    cfg.matmuls.clear_pass_type = True

    # Floating-point exceptions
    cfg.floating_point_behaviour.inv = fp_exceptions
    cfg.floating_point_behaviour.div0 = fp_exceptions
    cfg.floating_point_behaviour.oflo = fp_exceptions
    cfg.floating_point_behaviour.nanoo = fp_exceptions

    # Stochastic rounding
    cfg.floating_point_behaviour.esr = enable_stochastic_rounding
    cfg.optimizations.merge_remote_buffers = MergeRemoteBuffersBehaviour.MERGE
    cfg.optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_sum_buffer_size
    cfg.optimizations.maximum_reduce_scatter_buffer_size = max_reduce_scatter_buffer_size
    cfg.optimizations.merge_infeed_io_copies = True
    cfg.optimizations.enable_graph_outlining = not disable_graph_outlining
    cfg.optimizations.minimum_remote_tensor_size = minimum_remote_tensor_size

    if available_memory_proportion is not None:
        cfg.convolutions.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }
        cfg.matmuls.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }

    return cfg
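
A minimal usage sketch for the helper above. The argument values are illustrative placeholders, and the import line reflects where these names typically live in Graphcore's TensorFlow port; adjust both to your own code base.

from tensorflow.python.ipu.config import (IPUConfig, MergeRemoteBuffersBehaviour,
                                           SchedulingAlgorithm)

cfg = get_config(fp_exceptions=False,
                 enable_recomputation=True,
                 disable_graph_outlining=False,
                 num_required_ipus=4,
                 enable_stochastic_rounding=True,
                 max_cross_replica_sum_buffer_size=10 * 1024 * 1024,
                 max_reduce_scatter_buffer_size=0,
                 scheduler_selection="CHOOSE_BEST",  # looked up in SchedulingAlgorithm by name
                 compile_only=False,
                 ipu_id=None,                        # falsy, so IPUs are auto-selected
                 available_memory_proportion=0.6)
cfg.configure_ipu_system()                           # apply the options to the attached IPU system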
Example #2
def get_ipu_option_dict(ipu_id=None, prng=False, n_ipus=1):
    """
    Collates IPU config into single dict, to be used as **kwargs input to tf.ConfigProto

    Returns:
        dict of config
    """
    options = IPUConfig()
    options.optimizations.prefetch_data_streams = True
    options.optimizations.merge_infeed_io_copies = True

    if ipu_id is None:
        options.auto_select_ipus = [n_ipus]
    else:
        options.select_ipus = [ipu_id]
    options.floating_point_behaviour.esr = prng

    return {'ipu_options': options}
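
A hypothetical way the returned dict might be consumed: pull the IPUConfig out and apply it before creating a session. The attach step below is an assumption, not code from the original repository.

ipu_kwargs = get_ipu_option_dict(n_ipus=2, prng=True)  # auto-select 2 IPUs, enable stochastic rounding
ipu_kwargs['ipu_options'].configure_ipu_system()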
Example #3
def get_config(opts, training=True):
    """Builds ipu_options
    """
    config = IPUConfig()

    ipus = opts.select_ipus
    if ipus[0] == -1:
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control
        if not opts.multiprocessing:
            config.auto_select_ipus = [train_ipus, valid_ipus]
        else:
            ipus = train_ipus if training else valid_ipus
            config.auto_select_ipus = [ipus]
    else:
        if opts.multiprocessing:
            ipus = [ipus[0] if training else ipus[1]]
        config.select_ipus = ipus

    config.floating_point_behaviour.esr = opts.prng

    return config
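
For illustration only, a minimal opts object carrying just the attributes this helper reads; the real caller presumably passes a much richer argparse namespace.

from types import SimpleNamespace

# select_ipus == [-1] triggers auto-selection of one IPU each for training and validation.
opts = SimpleNamespace(select_ipus=[-1], multiprocessing=False, prng=False)
config = get_config(opts, training=True)
config.configure_ipu_system()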
Example #4
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only need to do 1/N batches in each stream.
    # For this reason, batches_per_step must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=placeholders['learning_rate']
                        if training else None,
                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse

                return loops.repeat(
                    batches_per_step, body,
                    [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2, infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None

    writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                   graph=graph,
                                   flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    if (not training and opts.multiprocessing) or training:
        ipu_config = IPUConfig()

        ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
        ipu_config.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000

        if opts.compile_only:
            ipu_config.device_connection.version = opts.compile_only_ipu_version
            ipu_config.device_connection.enable_remote_buffers = True
            ipu_config.device_connection.type = ipu_utils.DeviceConnectionType.PRE_COMPILE

        if opts.select_ipus == 'AUTO':
            ipu_config.auto_select_ipus = [opts.replication_factor]
        else:
            ipu_config.select_ipus = [opts.select_ipus[not training]]

        ipu_config.floating_point_behaviour.esr = opts.prng
        ipu_config.configure_ipu_system()

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if training else None, infeed, saver, writer,
                    trainFlag)
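
GraphOps is not shown in this snippet; inferred purely from the positional arguments passed above, a plausible (assumed) definition is a plain namedtuple:

from collections import namedtuple

# Field names are a guess based on the call at the end of generic_graph.
GraphOps = namedtuple(
    'GraphOps',
    ['graph', 'session', 'init', 'ops', 'placeholders', 'iterator', 'saver', 'writer', 'mode'])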
Example #5
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=50 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               conv_output=False,
               enable_recomputation=False,
               seed=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               num_io_tiles=0,
               number_of_distributed_batch_norm_replicas=1,
               min_remote_tensor_size=128,
               compile_only=False,
               nanoo=True,
               scheduling_algorithm=SchedulingAlgorithm.CHOOSE_BEST,
               max_reduce_many_buffer_size=0):
    """Builds ipu_options"""
    config = IPUConfig()

    config.optimizations.merge_infeed_io_copies = merge_infeed_io_copies
    if scheduling_algorithm == SchedulingAlgorithm.CHOOSE_BEST:
        if get_ipu_arch() == 2:
            scheduling_algorithm = SchedulingAlgorithm.SHORTEST_PATH
        else:
            # workaround to avoid OOM on MK1
            scheduling_algorithm = SchedulingAlgorithm.CHOOSE_BEST
    config.scheduling.algorithm = scheduling_algorithm
    config.experimental.always_rearrange_copies_on_the_host = False
    config.optimizations.minimum_remote_tensor_size = min_remote_tensor_size
    config.optimizations.maximum_cross_replica_sum_buffer_size = (
        max_cross_replica_buffer_size)
    config.optimizations.maximum_reduce_many_buffer_size = (
        max_reduce_many_buffer_size)

    if ipu_id == -1:
        config.auto_select_ipus = number_of_replicas * shards
    else:
        config.select_ipus = [ipu_id]
    config.compilation_poplar_options = {
        'target.deterministicWorkers': 'false' if seed is None else 'portable'
    }

    if internalExchangeOptimisationTarget is not None:
        config.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internalExchangeOptimisationTarget

    if num_io_tiles != 0:
        config.io_tiles.place_ops_on_io_tiles = True
        config.io_tiles.num_io_tiles = num_io_tiles

    config.convolutions.poplar_options = {}

    if availableMemoryProportion is not None:
        config.convolutions.poplar_options['availableMemoryProportion'] = str(
            availableMemoryProportion)

    if half_partials:
        config.convolutions.poplar_options['partialsType'] = 'half'
        config.matmuls.poplar_options['partialsType'] = 'half'
    if conv_dithering:
        config.convolutions.poplar_options['enableConvDithering'] = 'true'
    if conv_output:
        config.convolutions.poplar_options['gatherConvOutput'] = 'true'

    if stable_norm:
        config.norms.use_stable_statistics = True

    if enable_recomputation:
        config.allow_recompute = True

    if compile_only:
        config.device_connection.version = 'ipu2'
        config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows executables to be compiled for the graph without the IPUs being online
        config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache path, defaulting if it isn't already set
        tf_poplar_flags = os.environ.get("TF_POPLAR_FLAGS") or ''

        if '--executable_cache_path' not in tf_poplar_flags:
            print("Warning: --executable_cache_path not set. " +
                  "Defaulting to '/tmp/tf_cache'.")

            tf_poplar_flags = f"{tf_poplar_flags} --executable_cache_path=/tmp/tf_cache"
            os.environ["TF_POPLAR_FLAGS"] = tf_poplar_flags

    config.floating_point_behaviour.inv = fp_exceptions
    config.floating_point_behaviour.div0 = fp_exceptions
    config.floating_point_behaviour.oflo = fp_exceptions
    config.floating_point_behaviour.esr = prng
    config.floating_point_behaviour.nanoo = nanoo

    config.norms.experimental.distributed_batch_norm_replica_group_size = (
        number_of_distributed_batch_norm_replicas)

    return config
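
A short usage sketch under the same assumptions as above (Graphcore TensorFlow port installed; SchedulingAlgorithm, DeviceConnectionType and get_ipu_arch already in scope); the argument values are placeholders only.

config = get_config(prng=True,                   # enable stochastic rounding
                    number_of_replicas=4,        # 4 replicas on auto-selected IPUs
                    half_partials=True,          # fp16 partials for convs and matmuls
                    enable_recomputation=True,
                    availableMemoryProportion=0.6)
config.configure_ipu_system()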