Example #1
def get_config(opts):
    """Builds ipu_options"""
    profile = opts.report

    config = utils.create_ipu_config(profiling=profile,
                                     profile_execution=profile,
                                     report_every_nth_execution=1)
    if opts.device_id == -1:
        config = utils.auto_select_ipus(config, opts.shards * opts.replicas)
    else:
        config = utils.select_ipus(config, [opts.device_id])

    if opts.convolution_options:
        config = utils.set_convolution_options(
            config, json.loads(opts.convolution_options))

    if opts.matmul_options:
        config = utils.set_matmul_options(config,
                                          json.loads(opts.matmul_options))

    if opts.enable_half_partials:
        config = utils.set_matmul_options(config, {"partialsType": 'half'})
        config = utils.set_convolution_options(config,
                                               {"partialsType": 'half'})
    return config
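Each of these snippets returns an IpuOptions protobuf that must still be handed to the runtime before any session is created. A minimal usage sketch, assuming utils is tensorflow.python.ipu.utils (as in these snippets) and a hypothetical argparse namespace opts with the fields read above:

from tensorflow.python.ipu import utils

# Sketch only: build the options and attach to the IPU system.
# Example #8 below applies its config the same way.
config = get_config(opts)
utils.configure_ipu_system(config)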
Example #2
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False):
    """Builds ipu_options"""

    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
                                     merge_infeed_io_copies=merge_infeed_io_copies,
                                     always_rearrange_copies_on_the_host=False,
                                     profiling=profile is not None,
                                     profile_execution=profile_exec_modes[profile] if profile else None)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        config = utils.set_gcl_options(config, num_io_tiles=128,
                                       gcl_options={"useGclCollectives": "true"})

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config, inv=fp_exceptions, div0=fp_exceptions,
                                                        oflo=fp_exceptions, esr=prng, nanoo=True)

    return config
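Here profile is a string key into profile_exec_modes rather than a boolean, so callers select an execution profile type by name. An illustrative call (the argument values are assumptions):

# Sketch: request tile-level execution profiling by its string key,
# which maps to ExecutionProfileType.TILE_PROFILE above.
config = get_config(prng=True,
                    number_of_replicas=2,
                    profile="TILE_PROFILE")
utils.configure_ipu_system(config)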
Example #3
def get_config(opts):
    """Builds ipu_options"""
    profile = opts.cycle_report

    config = utils.create_ipu_config(profiling=profile,
                                     profile_execution=profile,
                                     report_every_nth_execution=1)
    if opts.device_id == -1:
        config = utils.auto_select_ipus(config, [opts.shards or 1])
    else:
        config = utils.select_ipus(config, [opts.device_id])

    if opts.convolution_options:
        config = utils.set_convolution_options(
            config, json.loads(opts.convolution_options))
    return config
Example #4
def get_config(fp_exceptions,
               xla_recompute,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id):

    """Builds ipu_options"""
    config = utils.create_ipu_config(
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        disable_graph_outlining=disable_graph_outlining,
        selection_order=utils.SelectionOrder.AUTO,
        scheduler_selection=scheduler_selection
    )

    if ipu_id:
        config = utils.select_ipus(config, [ipu_id])
    else:
        config = utils.auto_select_ipus(config, num_required_ipus)

    config = utils.set_recomputation_options(
        config, allow_recompute=xla_recompute)
    # simple way to skip the big `Transpose` operation due to bad allocation
    # config = utils.set_matmul_options(config, clear_pass_type=True)
    config = utils.set_norm_options(config, use_stable_statistics=True)
    config = utils.set_floating_point_behaviour_options(
        config,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=enable_stochastic_rounding,
        nanoo=fp_exceptions)
    config = utils.set_optimization_options(
        config,
        merge_remote_buffers=True,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size)

    # Do not acquire a device, compile only.
    if compile_only:
        config = utils.set_ipu_connection_type(
            config, utils.DeviceConnectionType.NEVER, ipu_version=2, enable_remote_buffers=True)

    return config
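The compile_only branch lets a program compile an executable without acquiring a device: DeviceConnectionType.NEVER tells the runtime never to attach. A hedged sketch of that path (all argument values are illustrative):

# Sketch: compile-only configuration; no IPU is acquired.
config = get_config(fp_exceptions=False,
                    xla_recompute=True,
                    disable_graph_outlining=False,
                    num_required_ipus=4,
                    enable_stochastic_rounding=True,
                    max_cross_replica_sum_buffer_size=10 * 1024 * 1024,
                    scheduler_selection='',
                    compile_only=True,
                    ipu_id=None)
utils.configure_ipu_system(config)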
Example #5
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=False,
               availableMemoryProportion=None):
    """Builds ipu_options"""
    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile,
        profile_execution=profile)
    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers": "false" if seed is None else "true",
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)

    return config
Example #6
def get_config(opts, training=True, profiling=False):
    """Builds ipu_options
    """
    config = utils.create_ipu_config(profiling=profiling)

    ipus = opts.select_ipus
    if ipus[0] == -1:
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control it
        if not opts.multiprocessing:
            config = utils.auto_select_ipus(config, [train_ipus, valid_ipus])
        else:
            ipus = train_ipus if training else valid_ipus
            config = utils.auto_select_ipus(config, [ipus])
    else:
        if opts.multiprocessing:
            ipus = [ipus[0] if training else ipus[1]]
        config = utils.select_ipus(config, ipus)

    config = utils.set_compilation_options(
        config, {"prng.enable": "true" if opts.prng else "false"})

    return config
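Under multiprocessing, each process builds its own config and then addresses its device as IPU:0 (see the comment in Example #8). An illustrative sketch, assuming an opts namespace with multiprocessing enabled:

# Sketch: one config per process; each process attaches separately.
train_config = get_config(opts, training=True)
valid_config = get_config(opts, training=False)
utils.configure_ipu_system(train_config)  # in the training process only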
Example #7
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               limitVertexState=None):
    """Builds ipu_options"""

    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE
    }

    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers":
            "false" if seed is None else "portable",
        })

    if internalExchangeOptimisationTarget is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.internalExchangeOptimisationTarget":
                internalExchangeOptimisationTarget
            })

    if limitVertexState is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.limitVertexStateToLower256K":
                "true" if limitVertexState else "false"
            })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if half_partials:
        config = utils.set_convolution_options(config,
                                               {"partialsType": 'half'})
        config = utils.set_matmul_options(config, {"partialsType": 'half'})

    if conv_dithering:
        config = utils.set_convolution_options(config,
                                               {"enableConvDithering": "true"})

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)

    return config
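Compared with Example #5, this variant adds flags for half-precision partials, convolution dithering, and stable norm statistics. An illustrative invocation (the values are assumptions):

# Sketch: fp16 partials for matmuls and convolutions, plus dithering.
config = get_config(number_of_replicas=4,
                    half_partials=True,
                    conv_dithering=True,
                    stable_norm=True,
                    seed=42)
utils.configure_ipu_system(config)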
Example #8
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so each
    # stream only needs to process 1/N of the batches.
    # For this reason, batches_per_step must be at least N.
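    # For example, with batches_per_step=1000 and replication_factor=4,
    # each replica's stream runs 250 batches per step.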
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        kwargs = {} if opts.replication_factor == 1 else {'replication_factor': opts.replication_factor}
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset, f"{mode_name}_dataset_infeed", **kwargs)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(opts,
                                                        observed=batch[:, :-1],
                                                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                                                        learning_rate=placeholders['learning_rate'] if training else None,
                                                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse
                return loops.repeat(batches_per_step,
                                    body,
                                    [tf.constant(0, getattr(np, opts.dtypes[0]))]*2,
                                    infeed)
            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None
        if opts.compiler_report:
            if training:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

    writer = tf.summary.FileWriter(
        opts.logs_path + f'/{mode_name}',
        graph=graph,
        flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    if (not training and opts.multiprocessing) or training:
        config = ipu_utils.create_ipu_config(profiling=training,
                                             use_poplar_text_report=True,
                                             max_cross_replica_sum_buffer_size=10000000,
                                             max_inter_ipu_copies_buffer_size=10000000)
        if opts.select_ipus == 'AUTO':
            config = ipu_utils.auto_select_ipus(config, [opts.replication_factor])
        else:
            config = ipu_utils.select_ipus(config, [opts.select_ipus[not training]])
        config = ipu_utils.set_compilation_options(config, {"prng.enable": str(opts.prng).lower()})
        ipu_utils.configure_ipu_system(config)

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph,
                    sess,
                    init,
                    graph_outputs,
                    placeholders if training else None,
                    infeed,
                    saver,
                    writer,
                    report,
                    trainFlag)
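A hedged sketch of driving the returned tuple; the field order mirrors the GraphOps(...) call above, and the local names here are illustrative:

# Sketch: unpack by position and run one training step.
train = generic_graph(opts, data, util.Modes.TRAIN)
graph, sess, init, outputs, placeholders, infeed, *_ = train

sess.run(init)
sess.run(infeed.initializer)  # infeed queues need explicit initialization
loss, summary = sess.run(
    outputs, feed_dict={placeholders['learning_rate']: 1e-3})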
Example #9
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None):
    """Builds ipu_options"""

    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(merge_infeed_io_copies=merge_infeed_io_copies,
                                     always_rearrange_copies_on_the_host=False,
                                     profiling=profile is not None,
                                     profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(config,
                                            max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        # The GCL_NUM_IO_TILES environment variable sets how many IPU tiles
        # are reserved for Graphcore Communication Library (GCL) collectives.
        iotiles = int(os.environ['GCL_NUM_IO_TILES'])
        if iotiles % 2 or iotiles < 32 or iotiles > 192:
            raise ValueError(
                'GCL IO tiles must be an even number between 32 and 192, '
                'got {}.'.format(iotiles))

        config = utils.set_gcl_options(config, num_io_tiles=iotiles,
                                       gcl_options={"useGclCollectives": "true"})

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "portable",
    })

    if internalExchangeOptimisationTarget is not None:
        config = utils.set_compilation_options(config, {
            "opt.internalExchangeOptimisationTarget": internalExchangeOptimisationTarget
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if half_partials:
        config = utils.set_convolution_options(config, {
            "partialsType": 'half'
        })
        config = utils.set_matmul_options(config, {
            "partialsType": 'half'
        })

    if conv_dithering:
        config = utils.set_convolution_options(config, {
            "enableConvDithering": "true"
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config, inv=fp_exceptions, div0=fp_exceptions,
                                                        oflo=fp_exceptions, esr=prng, nanoo=True)

    return config
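The GCL branch is driven purely by environment variables: only the presence of GCL_REAL_COLLECTIVES is checked, while GCL_NUM_IO_TILES must pass the validation above. A sketch that opts in before building the config (values are illustrative):

import os

# 128 passes the check above: even and within [32, 192].
os.environ["GCL_REAL_COLLECTIVES"] = "true"
os.environ["GCL_NUM_IO_TILES"] = "128"

config = get_config(number_of_replicas=4, shards=1, seed=1234)
utils.configure_ipu_system(config)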