Example #1
def create_popdist_strategy():
    """
    Creates a distribution strategy for use with popdist. We use the
    Horovod-based IPUMultiReplicaStrategy. Horovod is used for the initial
    broadcast of the weights and when reductions are requested on the host.
    Imports are placed here so they are only done when required, as Horovod
    might not always be available.
    """

    # Lazy imports, as described in the docstring above. The Horovod bindings
    # here are the ones shipped with the IPU TensorFlow wheel.
    from tensorflow.python.ipu import horovod as hvd
    from tensorflow.python.ipu.horovod import popdist_strategy

    hvd.init()

    # We add the IPU cross replica reductions explicitly in the IPUOptimizer,
    # so disable them in the IPUMultiReplicaStrategy.
    return popdist_strategy.IPUMultiReplicaStrategy(
        add_ipu_cross_replica_reductions=False)
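
A short usage sketch, assuming a Keras model is built and trained elsewhere in the program: the returned strategy is used like any other tf.distribute strategy, with model construction placed inside its scope. The model shown here is purely illustrative.

import tensorflow as tf

# Hypothetical usage of create_popdist_strategy() defined above: build and
# compile the Keras model inside the strategy scope so its variables are
# created under the distribution strategy.
strategy = create_popdist_strategy()
with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
    model.compile(optimizer='sgd', loss='mse')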
Example #2
    @classmethod
    def setUpClass(cls):
        hvd.init()
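
For context, a minimal sketch of the test class this fragment typically lives in; the class name and the rank/size check are assumptions, not from the source.

import unittest

from tensorflow.python.ipu import horovod as hvd


class MultiInstanceTest(unittest.TestCase):  # hypothetical class name
    @classmethod
    def setUpClass(cls):
        # Initialise Horovod once for the whole test class.
        hvd.init()

    def test_rank_is_consistent(self):
        # After init(), every instance knows its rank and the world size.
        self.assertTrue(0 <= hvd.rank() < hvd.size())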
Example #3
    def ipu_prog(num_replicas, gradient_accumulation):
        import logging
        import sys
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        popdist_on = popdist.isPopdistEnvSet()

        num_global_replicas = (popdist.getNumTotalReplicas()
                               if popdist_on else num_replicas)
        num_instances = popdist.getNumInstances() if popdist_on else 1

        dataset_size = global_batch_size = 16
        micro_batch_size = int(global_batch_size / num_global_replicas /
                               gradient_accumulation)

        X = np.arange(1, dataset_size + 1, 1, dtype=float)
        Y = [0] * dataset_size
        ds = tf.data.Dataset.from_tensor_slices((X, Y))
        if popdist_on:
            ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
        ds = ds.batch(micro_batch_size, drop_remainder=True)
        ds = ds.repeat()

        cfg = ipu.config.IPUConfig()
        if popdist_on:
            cfg = popdist.tensorflow.set_ipu_config(
                cfg,
                ipus_per_replica=popdist.getNumIpusPerReplica(),
                configure_device=True)
            hvd.init()
        else:
            cfg.auto_select_ipus = num_global_replicas
        cfg.configure_ipu_system()

        strategy = (popdist_strategy.PopDistStrategy()
                    if popdist_on else ipu.ipu_strategy.IPUStrategy())

        with strategy.scope():

            def get_model():
                input_layer = tf.keras.Input(shape=1)
                kernel_initializer = tf.keras.initializers.Constant(1)
                x = tf.keras.layers.Dense(
                    1, use_bias=False,
                    kernel_initializer=kernel_initializer)(input_layer)
                return tf.keras.Model(input_layer, x)

            model = get_model()
            model.set_gradient_accumulation_options(
                gradient_accumulation_steps_per_replica=gradient_accumulation)
            model.build(input_shape=(micro_batch_size, 1))

            if popdist_on:

                def gradient_normalizer(grads_and_vars):
                    return [(grad / gradient_accumulation, var)
                            for grad, var in grads_and_vars]
            else:

                def gradient_normalizer(grads_and_vars):
                    return [
                        (grad / num_global_replicas / gradient_accumulation,
                         var) for grad, var in grads_and_vars
                    ]

            optimizer = tf.keras.optimizers.SGD(
                learning_rate=1.0, gradient_transformers=[gradient_normalizer])

            loss_class = tf.keras.losses.MeanSquaredError
            loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
            loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
            loss = loss_class()

            micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
            steps_per_execution = dataset_size // (
                micro_batch_size * micro_batches_per_weight_update
            ) * micro_batches_per_weight_update

            model.compile(optimizer=optimizer,
                          loss=loss,
                          metrics=[tf.keras.losses.MSE],
                          steps_per_execution=steps_per_execution)

            callbacks = [
                OutFeedQueueCallback(queue=loss_outfeed_queue,
                                     name='average_loss')
            ]
            if num_instances > 1:
                callbacks += [AllReduceMetricsCallback()]
            callbacks += [LoggingCallback(1)]

            model.fit(ds,
                      steps_per_epoch=steps_per_execution,
                      callbacks=callbacks)

            return model.get_weights()[0][0][0]
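
A rough sketch of how a program like this is usually exercised: called directly it falls back to a single-process IPUStrategy, while launching it under poprun sets the popdist environment so the PopDistStrategy branch is taken. The exact poprun invocation is an assumption and should be checked against your Poplar SDK documentation.

# Single-process run: popdist.isPopdistEnvSet() is False, so an IPUStrategy
# with `num_replicas` local replicas is used. With global_batch_size = 16,
# num_replicas = 2 and gradient_accumulation = 2, the micro batch size is
# 16 / 2 / 2 = 4.
final_weight = ipu_prog(num_replicas=2, gradient_accumulation=2)

# Multi-instance run (assumption): launch the same script under poprun, e.g.
#   poprun --num-instances 2 --num-replicas 4 python this_test.py
# so that the PopDistStrategy branch and the dataset sharding are exercised.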
Example #4
            available_memory_proportion[0] / 100)
    if gather_conv_output:
        cfg.convolutions.poplar_options['gatherConvOutput'] = 'true'
    cfg.floating_point_behaviour.inv = fp_exceptions
    cfg.floating_point_behaviour.div0 = fp_exceptions
    cfg.floating_point_behaviour.oflo = fp_exceptions
    cfg.compilation_poplar_options[
        'target.deterministicWorkers'] = 'false' if seed is None else 'portable'
    if internal_exchange_optimization_target is not None:
        cfg.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internal_exchange_optimization_target

    if distributed_training:
        popdist.tensorflow.set_ipu_config(
            cfg, ipus_per_replica=num_ipus_per_replica, configure_device=True)
        hvd.init()
    else:
        cfg.auto_select_ipus = num_ipus_per_replica * num_replicas

    cfg.configure_ipu_system()

    set_seed(seed)

    batch_config = BatchConfig(micro_batch_size, num_replicas,
                               gradient_accumulation_count, global_batch_size)

    logging.info(f'micro batch size {batch_config.micro_batch_size}')
    logging.info(f'global batch size {batch_config.global_batch_size}')
    logging.info(
        f'gradient accumulation {batch_config.gradient_accumulation_count}')
    logging.info(f'num replicas {batch_config.num_replicas}')
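
The values logged above are tied together by a single identity, which BatchConfig (a helper from the surrounding application) presumably enforces. A small worked example of the arithmetic, with illustrative numbers:

# global batch = micro batch * replicas * gradient accumulation (illustrative values)
micro_batch_size = 4
num_replicas = 4
gradient_accumulation_count = 8
global_batch_size = micro_batch_size * num_replicas * gradient_accumulation_count
assert global_batch_size == 128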
Example #5
def validation_graph(model, opts):
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # Datasets must be defined outside the IPU device scope.
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})

        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if opts['latency']:
                        timestamp_enqueue = timestamp_queue.enqueue(
                            data_dict['timestamp'])
                        return (total_accuracy +
                                (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"]),
                                timestamp_enqueue)
                    else:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['total_replicas'] * opts['shards'] > 1 and not opts.get(
                        'inference', False):
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])
        else:
            latency_per_batch = None

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            broadcast_weights = []
            for var in tf.global_variables():
                broadcast_weights.append(
                    var.assign(hvd.broadcast(var, root_rank=0)))
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            broadcast_weights = None
            broadcast_global_batch_size, global_batch_size_ph = None, None
            broadcast_num_files, num_files_ph = None, None
            broadcast_iteration, iteration_ph = None, None

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    if opts['on_demand'] and reconfigure:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }

    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }

    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)
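
A rough driver sketch using the objects created inside validation_graph(); in the original application these are reached through the returned train.GraphOps, so the direct variable access here is a simplification and the loop itself is an assumption.

# Hypothetical validation driver (simplified).
valid_sess.run(valid_init)
valid_sess.run(valid_iterator.initializer)
if opts['use_popdist']:
    # Make sure every instance evaluates the same weights.
    valid_sess.run(broadcast_weights)
val_accuracy = valid_sess.run(accuracy)
print(f'validation accuracy: {val_accuracy:.2f}%')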