Example #1
def get_config(fp_exceptions,
               enable_recomputation,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               max_reduce_scatter_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id,
               available_memory_proportion=None,
               partials_type="half",
               minimum_remote_tensor_size=128):

    # Builds ipu_options
    cfg = IPUConfig()

    if ipu_id:
        cfg.select_ipus = [ipu_id]
    else:
        cfg.auto_select_ipus = num_required_ipus

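    # Recomputation, scheduling, norm and matmul options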
    cfg.allow_recompute = enable_recomputation
    cfg.scheduling.algorithm = SchedulingAlgorithm[scheduler_selection]
    cfg.norms.use_stable_statistics = True
    cfg.matmuls.clear_pass_type = True

    # Floating-point exceptions
    cfg.floating_point_behaviour.inv = fp_exceptions
    cfg.floating_point_behaviour.div0 = fp_exceptions
    cfg.floating_point_behaviour.oflo = fp_exceptions
    cfg.floating_point_behaviour.nanoo = fp_exceptions

    # Stochastic rounding
    cfg.floating_point_behaviour.esr = enable_stochastic_rounding
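
    # Buffer merging, outlining and remote-tensor optimisations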
    cfg.optimizations.merge_remote_buffers = MergeRemoteBuffersBehaviour.MERGE
    cfg.optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_sum_buffer_size
    cfg.optimizations.maximum_reduce_scatter_buffer_size = max_reduce_scatter_buffer_size
    cfg.optimizations.merge_infeed_io_copies = True
    cfg.optimizations.enable_graph_outlining = not disable_graph_outlining
    cfg.optimizations.minimum_remote_tensor_size = minimum_remote_tensor_size

    if available_memory_proportion is not None:
        cfg.convolutions.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }
        cfg.matmuls.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }

    return cfg
Example #2
def create_estimator(args):
    cfg = IPUConfig()
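    # Trap floating-point exceptions; stochastic rounding follows args.stochastic_rounding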
    cfg.floating_point_behaviour.inv = True
    cfg.floating_point_behaviour.div0 = True
    cfg.floating_point_behaviour.oflo = True
    cfg.floating_point_behaviour.esr = bool(args.stochastic_rounding)
    cfg.floating_point_behaviour.nanoo = True

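    # Cap the merged cross-replica sum buffer at 20 MB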
    cfg.optimizations.maximum_cross_replica_sum_buffer_size = 20000000

    if args.allow_recompute:
        cfg.allow_recompute = True

    num_replicas = args.num_replicas_train
    num_shards = args.num_ipus_in_pipeline_train

    cfg.auto_select_ipus = num_replicas * num_shards

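    # Target IPU2 hardware and always attach to a physical device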
    cfg.device_connection.version = 'ipu2'
    cfg.device_connection.type = ipu.utils.DeviceConnectionType.ALWAYS

    cfg.convolutions.poplar_options = {
        'partialsType': 'half' if args.partials_type == 'float16' else 'float'
    }
    cfg.matmuls.poplar_options = {
        'partialsType': 'half' if args.partials_type == 'float16' else 'float'
    }

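    # Run this many on-device iterations per host-side loop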
    iterations_per_loop = (args.batches_per_step *
                           args.gradient_accumulation_batches)

    ipu_run_config = ipu.ipu_run_config.IPURunConfig(
        iterations_per_loop=iterations_per_loop,
        num_replicas=num_replicas,
        num_shards=num_shards,
        ipu_options=cfg,
    )

    config = ipu.ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config,
        log_step_count_steps=args.log_interval,
        save_summary_steps=args.summary_interval,
        model_dir=args.model_dir,
        tf_random_seed=42)

    return ipu.ipu_pipeline_estimator.IPUPipelineEstimator(
        config=config,
        model_fn=partial(model_fn, args=args),
        params={},
    )
Example #3
def run_language_model(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIXME: enable sparse models using multiple shards

    # Make sure that no matter the number of shards/stages required, we always
    # acquire a power-of-two number of IPUs (otherwise attachment will fail)
    k = 0
    while 2**k < opts.num_shards:
        k += 1
    num_ipus = 2**k
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")
    config = IPUConfig()
    config.device_connection.enable_remote_buffers = True

    if opts.compile_only and opts.on_demand:
        raise ValueError("Can only provide one of --on-demand, --compile-only.")

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set.")

        config.device_connection.version = opts.compile_only_ipu_version
        config.device_connection.type = utils.DeviceConnectionType.NEVER

    if opts.on_demand:
        config.device_connection.type = utils.DeviceConnectionType.ON_DEMAND

    config.auto_select_ipus = num_ipus
    config.allow_recompute = opts.recompute
    # Enable stochastic rounding and disable floating-point exceptions
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config = sparse.set_system_config(config, custom_op_debug_printing=opts.debug_dense_grad)
    config.configure_ipu_system()

    transformer = DynsparseTransformer(opts)
    if opts.mode in ["all", "train"]:
        run_training(opts, transformer)

    if opts.mode in ["all", "test"]:
        run_testing(opts, transformer)
Example #4
def generic_train_graph(opts, is_training):
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                                 shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(dataset_train)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)

                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, np.float32)] * 3,
                                    infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = IPUConfig()
    ipu_options.optimizations.combine_embedding_lookups = True
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess, init, ops_train, placeholders, infeed_train, outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example #5
def generic_graph(opts, is_training):
    master_dtype = get_tf_datatype(opts)
    graph = tf.Graph()

    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(master_dtype, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, opts['seed'])
        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts)
        else:
            dataset = get_dataset_embed(opts, False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         sl):
                    prob, accuracy = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        sl,
                        use_negsampling=False)
                    with tf.control_dependencies([prob]):
                        return outfeed_queue.enqueue((prob, target, accuracy))

                return loops.repeat(opts['batches_per_step'], body, [], infeed)

            outputs = ipu_compiler.compile(comp_fn, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.train.Saver()

        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        if opts['use_ipu_model']:
            os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

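    # Configure the IPU system: recomputation plus capped cross-replica and inter-IPU copy buffers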
    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()

    graph_outputs = [outputs]

    sess = tf.Session(graph=graph)

    return GraphOps(graph, sess, init, graph_outputs, placeholders, infeed,
                    outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example #6
def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                                 shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(dataset_val)

        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue(
                        (prob, target, accuracy))
                    return outfeed_op

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = IPUConfig()
    ipu_options.optimizations.combine_embedding_lookups = True
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]

    sess = tf.compat.v1.Session(graph=infer_graph)

    return GraphOps(sess, init, ops_val, placeholders, infeed_val, outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example #7
def generic_graph(opts):
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])
        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        noclk_mids,
                        noclk_cats,
                        use_negsampling=True)
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, data_type)] * 3, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        if opts['use_ipu_model']:
            os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

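    # Configure the IPU system: recomputation plus capped collective buffer sizes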
    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()
    utils.reset_ipu_seed(opts['seed'])

    graph_outputs = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.Session(graph=graph)

    return GraphOps(
        sess, init, graph_outputs, placeholders, infeed, saver,
        feed_dict_values), uid_embedding, mid_embedding, cat_embedding
Example #8
    estimator = create_estimator(args)

    if args.training:
        print("\nTraining...")
        train(estimator, args)

    if args.evaluation:
        print("\nEvaluating...")
        evaluate(estimator, args)

    if not (args.training or args.evaluation):
        # Configure IPU system for inference only
        # (no need to do this if an Estimator was already initialized)
        cfg = IPUConfig()
        if args.allow_recompute:
            cfg.allow_recompute = True
        cfg.auto_select_ipus = (args.num_replicas_infer *
                                args.num_ipus_in_pipeline_infer)
        cfg.device_connection.version = 'ipu2'
        cfg.device_connection.type = ipu.utils.DeviceConnectionType.ALWAYS
        cfg.convolutions.poplar_options = {
            'partialsType':
            'half' if args.partials_type == 'float16' else 'float'
        }
        cfg.matmuls.poplar_options = {
            'partialsType':
            'half' if args.partials_type == 'float16' else 'float'
        }
        cfg.configure_ipu_system()

    if args.inference:
Example #9
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=50 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               conv_output=False,
               enable_recomputation=False,
               seed=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               num_io_tiles=0,
               number_of_distributed_batch_norm_replicas=1,
               min_remote_tensor_size=128,
               compile_only=False,
               nanoo=True,
               scheduling_algorithm=SchedulingAlgorithm.CHOOSE_BEST,
               max_reduce_many_buffer_size=0):
    """Builds ipu_options"""
    config = IPUConfig()

    config.optimizations.merge_infeed_io_copies = merge_infeed_io_copies
    if scheduling_algorithm == SchedulingAlgorithm.CHOOSE_BEST:
        if get_ipu_arch() == 2:
            scheduling_algorithm = SchedulingAlgorithm.SHORTEST_PATH
        else:
            # workaround to avoid OOM on Mk1
            scheduling_algorithm = SchedulingAlgorithm.CHOOSE_BEST
    config.scheduling.algorithm = scheduling_algorithm
    config.experimental.always_rearrange_copies_on_the_host = False
    config.optimizations.minimum_remote_tensor_size = min_remote_tensor_size
    config.optimizations.maximum_cross_replica_sum_buffer_size = (
        max_cross_replica_buffer_size)
    config.optimizations.maximum_reduce_many_buffer_size = (
        max_reduce_many_buffer_size)

    if ipu_id == -1:
        config.auto_select_ipus = number_of_replicas * shards
    else:
        config.select_ipus = [ipu_id]
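    # Use a deterministic worker mapping when a seed is provided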
    config.compilation_poplar_options = {
        'target.deterministicWorkers': 'false' if seed is None else 'portable'
    }

    if internalExchangeOptimisationTarget is not None:
        config.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internalExchangeOptimisationTarget

    if num_io_tiles != 0:
        config.io_tiles.place_ops_on_io_tiles = True
        config.io_tiles.num_io_tiles = num_io_tiles

    config.convolutions.poplar_options = {}

    if availableMemoryProportion is not None:
        config.convolutions.poplar_options['availableMemoryProportion'] = str(
            availableMemoryProportion)

    if half_partials:
        config.convolutions.poplar_options['partialsType'] = 'half'
        config.matmuls.poplar_options['partialsType'] = 'half'
    if conv_dithering:
        config.convolutions.poplar_options['enableConvDithering'] = 'true'
    if conv_output:
        config.convolutions.poplar_options['gatherConvOutput'] = 'true'

    if stable_norm:
        config.norms.use_stable_statistics = True

    if enable_recomputation:
        config.allow_recompute = True

    if compile_only:
        config.device_connection.version = 'ipu2'
        config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows for running executables on the graph without being online
        config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache path, defaulting if it isn't set
        tf_poplar_flags = os.environ.get("TF_POPLAR_FLAGS") or ''

        if '--executable_cache_path' not in tf_poplar_flags:
            print("Warning: --executable_cache_path not set. " +
                  "Defaulting to '/tmp/tf_cache'.")

            tf_poplar_flags = f"{tf_poplar_flags} --executable_cache_path=/tmp/tf_cache"
            os.environ["TF_POPLAR_FLAGS"] = tf_poplar_flags

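    # Floating-point exception trapping, stochastic rounding (esr) and NaN-on-overflow behaviour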
    config.floating_point_behaviour.inv = fp_exceptions
    config.floating_point_behaviour.div0 = fp_exceptions
    config.floating_point_behaviour.oflo = fp_exceptions
    config.floating_point_behaviour.esr = prng
    config.floating_point_behaviour.nanoo = nanoo

    config.norms.experimental.distributed_batch_norm_replica_group_size = (
        number_of_distributed_batch_norm_replicas)

    return config