Example #1
def configure_ipu(opts):
    """Set the IPU configuration based on execution options."""
    global_amp = None
    if opts['available_memory_proportion'] and len(
            opts['available_memory_proportion']) == 1:
        global_amp = opts['available_memory_proportion'][0]

    ipu_options = get_config(
        ipu_id=opts['select_ipu'],
        prng=not opts['no_stochastic_rounding'],
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts['max_cross_replica_buffer_size'],
        fp_exceptions=opts['fp_exceptions'],
        half_partials=opts['enable_half_partials'],
        conv_dithering=opts['enable_conv_dithering'],
        enable_recomputation=opts['enable_recomputation'],
        seed=opts['seed'],
        availableMemoryProportion=global_amp,
        stable_norm=opts['stable_norm'],
        compile_only=opts['compile_only'],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts['num_io_tiles'],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts['saturate_on_overflow'],
    )

    if opts['on_demand']:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    ipu_options.configure_ipu_system()
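The snippet below is a minimal, illustrative driver for the function above. It is a sketch, not part of the original source: the option values are placeholders, and it assumes configure_ipu() and its get_config() helper are defined in the current module as shown.

# Hypothetical opts dict; every key mirrors a lookup made inside configure_ipu().
opts = {
    'select_ipu': -1,                # let the driver choose a device
    'no_stochastic_rounding': False,
    'shards': 1,
    'total_replicas': 1,
    'max_cross_replica_buffer_size': 0,
    'fp_exceptions': False,
    'enable_half_partials': True,
    'enable_conv_dithering': False,
    'enable_recomputation': False,
    'seed': None,
    'available_memory_proportion': [0.6],
    'stable_norm': False,
    'compile_only': False,
    'internal_exchange_optimisation_target': None,
    'num_io_tiles': 0,
    'saturate_on_overflow': False,
    'on_demand': False,
}

configure_ipu(opts)  # configure the IPU system once, before creating sessions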
Example #2
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=opts['replicas'] * opts['shards'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])
    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
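A hedged driver sketch for the validation graph above, not taken from the original source. It assumes a model and opts as used throughout these examples, that train.GraphOps is a namedtuple-like container whose fields follow the positional order used in the return statement, and that the infeed queue exposes the usual initializer op.

graph, sess, init, ops, _, iterator, _, saver, _ = validation_graph(model, opts)
sess.run(init)                     # initialise variables (moved to CPU above)
sess.run(iterator.initializer)     # fill the infeed before the first step
(mean_accuracy,) = sess.run(ops)   # mean accuracy for the step, already in percent
print(f"validation accuracy: {mean_accuracy:.2f}%")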
Example #3
def training_graph(model, opts, iterations_per_step=1):

    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # Use the compute part of the precision string (e.g. '16.16' -> fp16)
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver)
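For context on the datatype selection above: the precision option in these examples appears to be a dotted string such as '16.16' or '32.32', where the first field selects the compute datatype. A small hedged sketch of that parsing, under that assumption:

import tensorflow as tf  # imported as tf, as in the examples above

def compute_dtype(precision):
    """Map a precision string like '16.16' to the matching TF compute dtype (sketch)."""
    return tf.float16 if precision.split('.')[0] == '16' else tf.float32

assert compute_dtype('16.16') == tf.float16
assert compute_dtype('32.32') == tf.float32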
Example #4
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # Use the compute part of the precision string (e.g. '16.16' -> fp16)
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, profile_report)
Example #5
def build_graph(opts, is_training=True):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))

        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(tf.float32, shape=[]),
            'loss_scaling': tf.placeholder(tf.float32, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input, datasets must be defined outside the ipu device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()

        # Build the network with pipelining
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling,
                                 is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)

        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_config = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        minimum_remote_tensor_size=opts['min_remote_tensor_size'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_config = popdist.tensorflow.set_ipu_config(ipu_config,
                                                       opts['shards'],
                                                       configure_device=False)

    # Do not acquire a device, compile only.
    if opts["compile_only"]:
        ipu_config.device_connection.version = "ipu2"
        ipu_config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows executables to be compiled without attaching to an IPU device
        ipu_config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache dir, defaulting if not given
        if ("TF_POPLAR_FLAGS" in os.environ):
            if ("--executable_cache_path"
                    not in os.environ["TF_POPLAR_FLAGS"]):
                print(
                    "Warning: --executable_cache_path in TF_POPLAR_FLAGS "
                    "(for 'poprun --mpi_local_args') not set. Setting to "
                    "default path: /tmp/tf_cache")
                os.environ[
                    "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache"

        # Sometimes TF_POPLAR_FLAGS might not even exist
        else:
            print(
                "Warning: TF_POPLAR_FLAGS environment variable (for 'poprun "
                "--mpi_local_args') not set. --executable_cache_path must be "
                "defined when using --compile-only. Setting to default path: "
                "/tmp/tf_cache")
            os.environ[
                "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache"

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
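The ipu_utils.next_power_of_two() helper used above is not shown in this listing. Examples #6-#8 below inline the equivalent rounding; a minimal sketch of such a helper (an assumption, not the actual ipu_utils implementation) is:

import math

def next_power_of_two(n):
    """Round n up to the nearest power of two, leaving exact powers unchanged (sketch)."""
    return 1 if n <= 1 else 2 ** int(math.ceil(math.log2(n)))

assert next_power_of_two(3) == 4   # e.g. a 3-IPU pipeline still needs a 4-IPU partition
assert next_power_of_two(8) == 8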
Example #6
def build_graph(opts, iterations_per_step=1, is_training=True):

    train_graph = tf.Graph()
    with train_graph.as_default():
        bert_config = bert_ipu.BertConfig.from_dict(
            opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        learning_rate = None
        opts['version_2_with_negative'] = False
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # Build the network, pipelined or not depending on the inference config
        if not should_be_pipeline_when_inference(opts):

            def bert_net():
                return build_infer_network_without_pipeline(
                    train_iterator,
                    outfeed_queue,
                    iterations_per_step,
                    bert_config=bert_config,
                    opts=opts)
        else:

            def bert_net():
                return build_network(train_iterator, outfeed_queue,
                                     iterations_per_step, bert_config, opts,
                                     learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            embedded = opts["embedded_runtime"]

            if embedded and is_training:
                raise ValueError(
                    "embedded_runtime is only to be used for inference.")

            train = ipu.ipu_compiler.compile(bert_net,
                                             []) if not embedded else None

        poplar_exec_filepath = get_exec_path(
            opts['seq_length'], opts['micro_batch_size'],
            opts['device_mapping'], should_be_pipeline_when_inference(opts))
        exec_path = poplar_exec_filepath
        compile_op = application_compile_op.experimental_application_compile_op(
            bert_net, output_path=exec_path, freeze_variables=True)

        outfeed = outfeed_queue.dequeue()

        restore = tf.train.Saver(var_list=tf.global_variables())

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))
    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        partials_type=opts["partials_type"],
        available_memory_proportion=opts['available_memory_proportion'])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)
    _ = train_sess.run(train_init, [])
    # -----------------
    # Checkpoint restore and save
    init_checkpoint_path = opts['init_checkpoint']
    logger.info(f"Checkpoint location: {init_checkpoint_path}")
    if init_checkpoint_path:
        logger.info("Loading checkpoint...")
        if os.path.isfile(init_checkpoint_path):
            init_checkpoint_path = os.path.splitext(init_checkpoint_path)[0]
            logger.info(f"checkpoint path: {init_checkpoint_path}")

        (assignment_map, initialized_variable_names
         ) = bert_ipu.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint_path)

        for var in tvars:
            if var.name in initialized_variable_names:
                mark = "*"
            else:
                mark = " "
            logger.info("%-60s [%s]\t%s (%s)", var.name, mark, var.shape,
                        var.dtype.name)

        reader = tf.train.NewCheckpointReader(init_checkpoint_path)
        load_vars = reader.get_variable_to_shape_map()

        saver_restore = tf.train.Saver(assignment_map)
        saver_restore.restore(train_sess, init_checkpoint_path)
    # -----------------
    if compile_op is not None:
        logger.info(
            f"Compiling and saving Poplar executable to {poplar_exec_filepath}"
        )
        _ = train_sess.run(compile_op, [])
    else:
        exec_path = None
    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, restore, tvars,
                    exec_path), ipu_config
Example #7
def build_graph(opts, iterations_per_step=1, is_training=True):

    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                           shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        # Load the GLUE task label list here
        label_list = opts["pass_in"][1]
        bert_config.num_lables = len(label_list)
        if opts['do_training'] and opts['current_mode'] == 'train':
            input_file = os.path.join(opts["output_dir"],
                                      f"train_{opts['task_type']}.tf_record")
        elif opts['do_eval'] and opts['current_mode'] == 'eval':
            input_file = os.path.join(opts["output_dir"],
                                      f"eval_{opts['task_type']}.tf_record")
        elif opts['do_predict'] and opts['current_mode'] == 'predict':
            input_file = os.path.join(
                opts["output_dir"], f"predict_{opts['task_type']}.tf_record")
        else:
            raise NotImplementedError()

        opts['input_file'] = input_file
        opts['drop_remainder'] = True

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def bert_net():
            return build_network(train_iterator, outfeed_queue,
                                 iterations_per_step, bert_config, opts,
                                 learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        log.print_trainable_variables(opts)

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()
    """calculate the number of required IPU"""
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))
    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        available_memory_proportion=opts["available_memory_proportion"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
Example #8
def build_graph(opts, iterations_per_step=1, is_training=True):

    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                           shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # Build the network, pipelined or not depending on the inference config
        if not should_be_pipeline_when_inference(opts):

            def bert_net():
                return build_infer_network_without_pipeline(
                    train_iterator,
                    outfeed_queue,
                    iterations_per_step,
                    bert_config=bert_config,
                    opts=opts)
        else:

            def bert_net():
                return build_network(train_iterator, outfeed_queue,
                                     iterations_per_step, bert_config, opts,
                                     learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()
    """calculate the number of required IPU"""
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))
    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        partials_type=opts["partials_type"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
Example #9
def validation_graph(model, opts):
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})

        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if opts['latency']:
                        timestamp_enqueue = timestamp_queue.enqueue(
                            data_dict['timestamp'])
                        return (total_accuracy +
                                (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"]),
                                timestamp_enqueue)
                    else:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['total_replicas'] * opts['shards'] > 1 and not opts.get(
                        'inference', False):
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])
        else:
            latency_per_batch = None

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            broadcast_weights = []
            for var in tf.global_variables():
                broadcast_weights.append(
                    var.assign(hvd.broadcast(var, root_rank=0)))
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            broadcast_weights = None
            broadcast_global_batch_size, global_batch_size_ph = None, None
            broadcast_num_files, num_files_ph = None, None
            broadcast_iteration, iteration_ph = None, None

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    if opts['on_demand'] and reconfigure:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }

    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }

    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)
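When opts['latency'] is enabled, the ops dict returned above also exposes latency_per_batch, a flat tensor of per-batch latencies in seconds (assuming tf.timestamp() and relative_timer.get_start() share the same time base). A hedged post-processing sketch, not from the original source:

import numpy as np

def summarise_latency(latency_per_batch_s):
    """Return simple latency statistics in milliseconds for one validation step (sketch)."""
    lat_ms = np.asarray(latency_per_batch_s, dtype=np.float64) * 1000.0
    return {
        'mean_ms': float(lat_ms.mean()),
        'p50_ms': float(np.percentile(lat_ms, 50)),
        'p99_ms': float(np.percentile(lat_ms, 99)),
    }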
Example #10
def build_graph(opts, is_training=True, feed_name=None):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        bert_config = bert_ipu.BertConfig.from_dict(opts)
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(bert_config.dtype, shape=[]),
            'loss_scaling': tf.placeholder(bert_config.dtype, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input, datasets must be defined outside the ipu device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.load(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])
        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out", replication_factor=opts['replicas'])

        # Build the network with pipelining
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling,
                                 is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)

        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_options = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    ipu.utils.configure_ipu_system(ipu_options)

    # This is a workaround for bug https://github.com/tensorflow/tensorflow/issues/23780
    from tensorflow.core.protobuf import rewriter_config_pb2
    sess_cfg = tf.ConfigProto()
    sess_cfg.graph_options.rewrite_options.memory_optimization = (
        rewriter_config_pb2.RewriterConfig.OFF)

    train_sess = tf.Session(graph=train_graph, config=sess_cfg)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)