Example #1
0
    def setUpClass(cls):
        """Build the frozen, batch-norm-folded DenseNet graph used by the tests.

        Steps, in order:
          1. create a float16 image placeholder and configure one IPU,
          2. construct the DenseNet model and call it on the placeholder,
          3. restore weights from ``CHECKPOINT_PATH`` (calling
             ``get_densenet_weights`` first when the ``.index`` file is missing),
          4. freeze variables to constants, strip training nodes and fold batch
             norms,
          5. reset the default graph and re-import the optimised graph under
             ``/device:IPU:0``.

        Sets ``cls.batch_size``, ``cls.num_classes``, ``cls.placeholder_input``,
        ``cls.densenet_model`` and ``cls.output``.
        """
        # Input geometry: square RGB images
        image_side = 224
        image_channels = 3
        densenet_121_layout = (6, 12, 24, 16)
        cls.batch_size = 1
        cls.num_classes = 1000

        # Image input placeholder, laid out as (batch, height, width, channels)
        input_shape = (cls.batch_size, image_side, image_side, image_channels)
        cls.placeholder_input = tf.placeholder(dtype=tf.float16,
                                               shape=input_shape,
                                               name="image_input")

        # Compile and device options: request a single IPU
        ipu_config = IPUConfig()
        ipu_config.auto_select_ipus = [1]
        ipu_config.configure_ipu_system()

        # Construct the model and build its graph on the placeholder
        cls.densenet_model = DenseNet(blocks=densenet_121_layout,
                                      num_classes=cls.num_classes,
                                      image_width=image_side,
                                      image_height=image_side,
                                      image_channels=image_channels)
        cls.densenet_model(cls.placeholder_input)

        # Locate the weights, fetching them if the checkpoint index is absent
        weights_path = CHECKPOINT_PATH
        checkpoint_index = Path(weights_path + ".index")
        if not checkpoint_index.exists():
            print('Checkpoint file does not exist, attempting to download pre-trained weights')
            weights_path = get_densenet_weights(Path(weights_path))

        restorer = tf.train.Saver()

        with tf.Session() as session:
            restorer.restore(session, weights_path)
            logging.info('Restored imagenet weights.')

            # Optimise the inference graph
            logging.info('Starting graph optimization.')
            graph_util = tf.compat.v1.graph_util
            live_graph_def = tf.get_default_graph().as_graph_def()
            constant_graph_def = graph_util.convert_variables_to_constants(
                session, live_graph_def, output_node_names=["output-prob"])
            # Identity ops in the initializers must go first so that batch norm
            # can be fused with the preceding conv by fold_batch_norms
            pruned_graph_def = graph_util.remove_training_nodes(constant_graph_def)
            folded_graph_def = optimize_for_infer.fold_batch_norms(pruned_graph_def)

            logging.info('Completed graph optimization.')

        # Start from a clean graph and import the optimised one onto the IPU
        tf.reset_default_graph()
        with tf.device('/device:IPU:0'), tf.variable_scope('', use_resource=True):
            imported_tensors = tf.import_graph_def(folded_graph_def,
                                                   input_map={},
                                                   name="optimized",
                                                   return_elements=["output-prob:0"])
            cls.output = imported_tensors[0]
Example #2
0
def get_config(fp_exceptions,
               enable_recomputation,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               max_reduce_scatter_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id,
               available_memory_proportion=None,
               partials_type="half",
               minimum_remote_tensor_size=128):
    """Assemble an ``IPUConfig`` from the given options and return it.

    The config is only built here; ``configure_ipu_system`` is not called.

    Args:
        fp_exceptions: value written to the ``inv``, ``div0``, ``oflo`` and
            ``nanoo`` floating-point behaviour flags.
        enable_recomputation: value for ``allow_recompute``.
        disable_graph_outlining: graph outlining is enabled when this is falsy.
        num_required_ipus: passed to ``auto_select_ipus`` when ``ipu_id`` is
            falsy.
        enable_stochastic_rounding: value for the ``esr`` flag.
        max_cross_replica_sum_buffer_size: cross-replica sum buffer limit.
        max_reduce_scatter_buffer_size: reduce-scatter buffer limit.
        scheduler_selection: key used to index ``SchedulingAlgorithm``.
        compile_only: accepted but not used by this function.
        ipu_id: when truthy, that single device is selected explicitly. Note
            that both ``None`` and ``0`` fall back to automatic selection.
        available_memory_proportion: when not ``None``, convolution and matmul
            Poplar options are set from it and ``partials_type``.
        partials_type: ``partialsType`` Poplar option; only applied together
            with ``available_memory_proportion``.
        minimum_remote_tensor_size: value for the matching optimisation option.

    Returns:
        The populated ``IPUConfig``.
    """
    cfg = IPUConfig()

    # Device selection: explicit id if one was given, otherwise auto-select
    if not ipu_id:
        cfg.auto_select_ipus = num_required_ipus
    else:
        cfg.select_ipus = [ipu_id]

    cfg.allow_recompute = enable_recomputation
    cfg.scheduling.algorithm = SchedulingAlgorithm[scheduler_selection]
    cfg.norms.use_stable_statistics = True
    cfg.matmuls.clear_pass_type = True

    # Floating-point exceptions all share one switch
    fp_behaviour = cfg.floating_point_behaviour
    for exception_flag in ("inv", "div0", "oflo", "nanoo"):
        setattr(fp_behaviour, exception_flag, fp_exceptions)

    # Stochastic rounding
    fp_behaviour.esr = enable_stochastic_rounding

    optimizations = cfg.optimizations
    optimizations.merge_remote_buffers = MergeRemoteBuffersBehaviour.MERGE
    optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_sum_buffer_size
    optimizations.maximum_reduce_scatter_buffer_size = max_reduce_scatter_buffer_size
    optimizations.merge_infeed_io_copies = True
    optimizations.enable_graph_outlining = not disable_graph_outlining
    optimizations.minimum_remote_tensor_size = minimum_remote_tensor_size

    if available_memory_proportion is not None:
        poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type,
        }
        # Convolutions and matmuls each get their own dict object
        cfg.convolutions.poplar_options = dict(poplar_options)
        cfg.matmuls.poplar_options = dict(poplar_options)

    return cfg
Example #3
0
def create_estimator(args):
    """Create an ``IPUPipelineEstimator`` configured from ``args``.

    Reads from ``args``: ``stochastic_rounding``, ``allow_recompute``,
    ``num_replicas_train``, ``num_ipus_in_pipeline_train``, ``partials_type``,
    ``batches_per_step``, ``gradient_accumulation_batches``, ``log_interval``,
    ``summary_interval`` and ``model_dir``.

    Returns:
        An ``IPUPipelineEstimator`` whose model function is ``model_fn`` bound
        to ``args``.
    """
    ipu_config = IPUConfig()

    # Trap floating-point exceptions; stochastic rounding follows the option
    fp_behaviour = ipu_config.floating_point_behaviour
    fp_behaviour.inv = True
    fp_behaviour.div0 = True
    fp_behaviour.oflo = True
    fp_behaviour.esr = bool(args.stochastic_rounding)
    fp_behaviour.nanoo = True

    ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 20000000

    if args.allow_recompute:
        ipu_config.allow_recompute = True

    # One pipeline of `shard_count` IPUs per replica
    replica_count = args.num_replicas_train
    shard_count = args.num_ipus_in_pipeline_train
    ipu_config.auto_select_ipus = replica_count * shard_count

    connection = ipu_config.device_connection
    connection.version = 'ipu2'
    connection.type = ipu.utils.DeviceConnectionType.ALWAYS

    # Same partials type for convolutions and matmuls, as separate dicts
    partials = 'half' if args.partials_type == 'float16' else 'float'
    ipu_config.convolutions.poplar_options = {'partialsType': partials}
    ipu_config.matmuls.poplar_options = {'partialsType': partials}

    loop_iterations = args.batches_per_step * args.gradient_accumulation_batches

    ipu_run_config = ipu.ipu_run_config.IPURunConfig(
        iterations_per_loop=loop_iterations,
        num_replicas=replica_count,
        num_shards=shard_count,
        ipu_options=ipu_config,
    )

    run_config = ipu.ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config,
        log_step_count_steps=args.log_interval,
        save_summary_steps=args.summary_interval,
        model_dir=args.model_dir,
        tf_random_seed=42)

    bound_model_fn = partial(model_fn, args=args)
    return ipu.ipu_pipeline_estimator.IPUPipelineEstimator(
        config=run_config,
        model_fn=bound_model_fn,
        params={},
    )
Example #4
0
def get_ipu_option_dict(ipu_id=None, prng=False, n_ipus=1):
    """
    Build an IPUConfig and wrap it in a single-entry dict.

    Per the original note, the dict is meant to be passed on as **kwargs
    (to tf.ConfigProto).

    Args:
        ipu_id: explicit device id to select; when None, `n_ipus` devices are
            auto-selected instead
        prng: value written to the `esr` floating-point behaviour flag
        n_ipus: number of IPUs to auto-select when no `ipu_id` is given

    Returns:
        dict with the config stored under the 'ipu_options' key
    """
    ipu_config = IPUConfig()

    # Explicit device wins over automatic selection
    if ipu_id is not None:
        ipu_config.select_ipus = [ipu_id]
    else:
        ipu_config.auto_select_ipus = [n_ipus]

    ipu_config.floating_point_behaviour.esr = prng

    optimizations = ipu_config.optimizations
    optimizations.prefetch_data_streams = True
    optimizations.merge_infeed_io_copies = True

    return dict(ipu_options=ipu_config)
Example #5
0
def get_config(opts, training=True):
    """Builds ipu_options

    Device selection is driven by ``opts.select_ipus``:

    * first entry ``-1``: auto-select. Without multiprocessing two devices
      groups are requested (training and validation); with multiprocessing
      only the one for the current role.
    * otherwise: select explicitly. With multiprocessing the first entry is
      used for training and the second for validation; without it the list is
      passed through unchanged.

    ``opts.prng`` is written to the ``esr`` floating-point behaviour flag.
    """
    ipu_config = IPUConfig()
    requested = opts.select_ipus

    if requested[0] != -1:
        # Explicit device ids
        if opts.multiprocessing:
            requested = [requested[0] if training else requested[1]]
        ipu_config.select_ipus = requested
    else:
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control
        if opts.multiprocessing:
            # Each process only needs the devices for its own role
            role_ipus = train_ipus if training else valid_ipus
            ipu_config.auto_select_ipus = [role_ipus]
        else:
            ipu_config.auto_select_ipus = [train_ipus, valid_ipus]

    ipu_config.floating_point_behaviour.esr = opts.prng

    return ipu_config
Example #6
0
def run_language_model(opts):
    """Configure the IPU system and run training and/or testing of the model.

    Side effects: may reset the IPU seed, forces ``opts.num_shards`` to 1 when
    pipelining is off, and configures the IPU system.

    Raises:
        ValueError: if both ``opts.compile_only`` and ``opts.on_demand`` are set.
        AttributeError: if ``opts.compile_only`` is set without
            ``opts.compile_only_ipu_version``.
    """
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIX-ME enable sparse models using multiple shards

    # Round the request up to a power of two: whatever the number of
    # shards/stages, attachment fails for other device counts
    num_ipus = 1
    while num_ipus < opts.num_shards:
        num_ipus *= 2
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    config = IPUConfig()
    connection = config.device_connection
    connection.enable_remote_buffers = True

    if opts.compile_only and opts.on_demand:
        raise ValueError("Can only provide one of --on-demand, --compile-only.")

    # The two modes are mutually exclusive (checked above)
    if opts.compile_only:
        ipu_version = opts.compile_only_ipu_version
        if ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set.")
        connection.version = ipu_version
        connection.type = utils.DeviceConnectionType.NEVER
    elif opts.on_demand:
        connection.type = utils.DeviceConnectionType.ON_DEMAND

    config.auto_select_ipus = num_ipus
    config.allow_recompute = opts.recompute

    # Floating-point exceptions off, stochastic rounding on
    fp_behaviour = config.floating_point_behaviour
    fp_behaviour.inv = False
    fp_behaviour.div0 = False
    fp_behaviour.oflo = False
    fp_behaviour.esr = True
    fp_behaviour.nanoo = False

    config = sparse.set_system_config(config, custom_op_debug_printing=opts.debug_dense_grad)
    config.configure_ipu_system()

    transformer = DynsparseTransformer(opts)
    mode = opts.mode
    if mode in ("all", "train"):
        run_training(opts, transformer)

    if mode in ("all", "test"):
        run_testing(opts, transformer)
Example #7
0
    Graph compile calls
"""
# Compile `ssd_model` (called with no explicit inputs) for execution on the IPU(s)
inference_output = ipu.ipu_compiler.compile(ssd_model, inputs=[])
# Build the decoder from `input_detection`; per the original note this part
# runs on the host (CPU)
decoder = decoder_component(input_detection)

# Weight-loading hooks for a trained weight file: for every trainable variable
# create a placeholder and an op assigning the placeholder to the variable.
# Entries are keyed by variable name and hold (assign_op, placeholder); the
# placeholder is named after the variable (text before ':') plus '_setter'.
param_setters = dict()
for var in tf.trainable_variables():
    placeholder = tf.placeholder(var.dtype, var.shape,
                                 var.name.split(':')[0] + '_setter')
    param_setters[var.name] = (tf.assign(var, placeholder), placeholder)

# Set up and apply the IPU configuration: auto-select NUM_IPUS devices and set
# the convolution 'availableMemoryProportion' Poplar option to 0.4
cfg = IPUConfig()
cfg.auto_select_ipus = NUM_IPUS
cfg.convolutions.poplar_options = {'availableMemoryProportion': '0.4'}
cfg.configure_ipu_system()
ipu.utils.move_variable_initialization_to_cpu()
# Op used to read results back from the outfeed queue
outfeed = outfeed_queue.dequeue()

# Calculate total flops for graph (experimental)
run_meta = tf.RunMetadata()
# NOTE: `opts` here holds the profiler options, not command-line options
opts = tf.profiler.ProfileOptionBuilder.float_operation()
flops = tf.profiler.profile(tf.get_default_graph(),
                            run_meta=run_meta,
                            cmd='op',
                            options=opts)
print("Total FLOPs reported by TF is: ", flops.total_float_ops)
Example #8
0
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=50 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               conv_output=False,
               enable_recomputation=False,
               seed=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               num_io_tiles=0,
               number_of_distributed_batch_norm_replicas=1,
               min_remote_tensor_size=128,
               compile_only=False,
               nanoo=True,
               scheduling_algorithm=SchedulingAlgorithm.CHOOSE_BEST,
               max_reduce_many_buffer_size=0):
    """Builds ipu_options

    Returns a populated ``IPUConfig``; ``configure_ipu_system`` is not called.

    Notable behaviour:
      * ``scheduling_algorithm``: ``CHOOSE_BEST`` is replaced by
        ``SHORTEST_PATH`` when ``get_ipu_arch()`` reports 2; otherwise the
        given value is used as is.
      * ``ipu_id``: ``-1`` auto-selects ``number_of_replicas * shards`` IPUs,
        any other value selects that single device.
      * ``seed``: only decides the ``target.deterministicWorkers`` compilation
        option (``'portable'`` when a seed is given, ``'false'`` otherwise).
      * ``availableMemoryProportion``, ``conv_dithering`` and ``conv_output``
        affect convolution Poplar options only; ``half_partials`` sets
        ``partialsType`` for both convolutions and matmuls.
      * ``fp_exceptions`` drives the ``inv``/``div0``/``oflo`` flags, ``nanoo``
        and ``prng`` (stochastic rounding, ``esr``) are set independently.
      * ``compile_only``: targets ``ipu2`` with a ``PRE_COMPILE`` connection and
        makes sure ``TF_POPLAR_FLAGS`` in the environment carries an
        ``--executable_cache_path`` (defaulting to ``/tmp/tf_cache``).
    """
    config = IPUConfig()
    optimizations = config.optimizations

    optimizations.merge_infeed_io_copies = merge_infeed_io_copies

    # Only an arch-2 device swaps CHOOSE_BEST for SHORTEST_PATH; elsewhere
    # CHOOSE_BEST is kept (original note: work around to avoid OOM on MK1)
    if (scheduling_algorithm == SchedulingAlgorithm.CHOOSE_BEST
            and get_ipu_arch() == 2):
        scheduling_algorithm = SchedulingAlgorithm.SHORTEST_PATH
    config.scheduling.algorithm = scheduling_algorithm

    config.experimental.always_rearrange_copies_on_the_host = False
    optimizations.minimum_remote_tensor_size = min_remote_tensor_size
    optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_buffer_size
    optimizations.maximum_reduce_many_buffer_size = max_reduce_many_buffer_size

    # Device selection
    if ipu_id != -1:
        config.select_ipus = [ipu_id]
    else:
        config.auto_select_ipus = number_of_replicas * shards

    # Compilation options
    config.compilation_poplar_options = {
        'target.deterministicWorkers': 'portable' if seed is not None else 'false'
    }
    if internalExchangeOptimisationTarget is not None:
        config.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internalExchangeOptimisationTarget

    if num_io_tiles != 0:
        config.io_tiles.place_ops_on_io_tiles = True
        config.io_tiles.num_io_tiles = num_io_tiles

    # Convolution options start empty and are filled in on the config's dict
    config.convolutions.poplar_options = {}
    conv_options = config.convolutions.poplar_options

    if availableMemoryProportion is not None:
        conv_options['availableMemoryProportion'] = str(availableMemoryProportion)

    if half_partials:
        config.matmuls.poplar_options['partialsType'] = 'half'

    conv_switches = (
        (half_partials, 'partialsType', 'half'),
        (conv_dithering, 'enableConvDithering', 'true'),
        (conv_output, 'gatherConvOutput', 'true'),
    )
    for enabled, option_name, option_value in conv_switches:
        if enabled:
            conv_options[option_name] = option_value

    if stable_norm:
        config.norms.use_stable_statistics = True

    if enable_recomputation:
        config.allow_recompute = True

    if compile_only:
        connection = config.device_connection
        connection.version = 'ipu2'
        connection.enable_remote_buffers = True
        # PRE_COMPILE allows executables to be built without a device being online
        connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce an executable cache path, adding a default if none is set
        poplar_flags = os.environ.get("TF_POPLAR_FLAGS") or ''
        if '--executable_cache_path' not in poplar_flags:
            print("Warning: --executable_cache_path not set. "
                  "Defaulting to '/tmp/tf_cache'.")
            poplar_flags = f"{poplar_flags} --executable_cache_path=/tmp/tf_cache"
            os.environ["TF_POPLAR_FLAGS"] = poplar_flags

    fp_behaviour = config.floating_point_behaviour
    for exception_flag in ('inv', 'div0', 'oflo'):
        setattr(fp_behaviour, exception_flag, fp_exceptions)
    fp_behaviour.esr = prng
    fp_behaviour.nanoo = nanoo

    config.norms.experimental.distributed_batch_norm_replica_group_size = (
        number_of_distributed_batch_norm_replicas)

    return config
Example #9
0
            dataset=test_set,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_test,
            print_stats=False,
            apply_options=True)

        logging.info("Starting benchmarks...\n")
        with tf.Session() as sess:
            logger.info("Benchmarking training dataset")
            train_results = sess.run(ds_perf_train)
            process_benchmark_results(train_results, opts)
            logger.info("Benchmarking training infeed")
            train_results = sess.run(infeed_perf_train)
            process_benchmark_results(train_results, opts)

            logger.info("Benchmarking test dataset")
            test_results = sess.run(ds_perf_test)
            process_benchmark_results(test_results, opts)
            logger.info("Benchmarking test infeed")
            test_results = sess.run(infeed_perf_test)
            process_benchmark_results(test_results, opts)

    # Set config
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.configure_ipu_system()

    # Now run on device
    make_and_run_on_device_benchmark(opts, train=True)
    make_and_run_on_device_benchmark(opts, train=False)
Example #10
0
def run_mnist(opts):
    """Train and/or test the fully-connected MNIST model on a single IPU.

    The function loads MNIST through Keras, flattens the images and applies a
    pixel permutation, builds infeed/outfeed queues and the compiled train and
    test loops, configures one IPU, and then runs the requested phases inside
    a single session:

    * training when ``opts.test_mode`` is ``"all"`` or ``"training"``; after
      every step the sparse layers may be pruned and regrown on the host and
      the updated representation is fed back to the device,
    * testing when ``opts.test_mode`` is ``"all"`` or ``"tests"``.

    Args:
        opts: options object; the attributes read are visible in the body
            (e.g. ``batch_size``, ``gradient_accumulation_count``,
            ``steps_per_epoch``, ``epochs``, ``pipelining``, ``seed``).

    Returns:
        None. Returns early, straight after the first training step, when
        ``opts.single_train_step_only`` is set.

    Raises:
        ValueError: if pipelining is requested with fewer than 4 gradient
            accumulation steps, or if ``opts.steps_per_epoch`` does not divide
            the number of batches per epoch.
    """
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    # `batch_size` is the micro-batch: the requested batch size split across
    # the gradient accumulation steps
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)

    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)

    # Reorder the pixel columns of every image in place using `permutation`
    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]

    if opts.records_path:
        # Keep the permutation alongside the other records
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size *
                                      opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)

    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch

    # NOTE(review): this divisibility check also runs when
    # batches_per_step_override is supplied - confirm that is intended.
    if not (batches_per_epoch % opts.steps_per_epoch) == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        # Zero-argument callable yielding (feature, label) pairs
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))

        # Only the training data is shuffled (and cached)
        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()

        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)

    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    # (one IPU, floating-point exceptions off, stochastic rounding on)
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        # NOTE(review): log_file is never closed in this function (including
        # on the early return below) - consider a context manager.
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    # Checkpoints and summaries go to the restored run's directory, or to a
    # fresh timestamped one
    if opts.restore:
        logpath = os.path.join(opts.checkpoint_path, opts.restore)
    else:
        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer)

        if opts.restore:
            saver.restore(sess, logpath + '/model.ckpt')

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            start = opts.start_epoch if opts.restore else 0
            progress = tqdm(
                range(start, opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    # Time one device step (batches_per_step batches)
                    t1 = time.perf_counter()
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    t2 = time.perf_counter()
                    sess_time = t2 - t1
                    batch_time = sess_time / batches_per_step
                    throughput = batch_size / batch_time
                    logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                f"Time per batch: {batch_time:0.6f} "
                                f"Throughput: {throughput}")

                    if opts.single_train_step_only:
                        return

                    train_outputs = sess.run(dequeue_train_outfeed)
                    if opts.pipelining:
                        train_outputs = train_outputs[-1]

                    # Get the last value for all items:
                    for k, v in train_outputs.items():
                        train_outputs[k] = v[-1]
                    logger.debug(f"Train outputs: {train_outputs.keys()}")

                    # Merge prune and grow fetches with last fetches:
                    if dequeue_prune_and_grow_outfeed is not None:
                        png_data = sess.run(dequeue_prune_and_grow_outfeed)
                        for k in png_data:
                            png_data[k] = png_data[k][-1]
                        logger.debug(
                            f"Prune and grow outputs: {png_data.keys()}")

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    # NOTE(review): png_data is only assigned when the prune and
                    # grow outfeed exists; with disable_dense_grad_override set
                    # and a sparse layer present the loop below would read an
                    # unbound name - confirm that combination cannot occur.
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            var_name = fc.get_values_var().name
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                            )
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                )
                            # Metainfo is fetched only on the first step of
                            # epoch opts.start_epoch.
                            # NOTE(review): when not restoring, the loop starts
                            # at epoch 0 regardless of opts.start_epoch - verify
                            # this comparison is what is wanted.
                            if i == 0 and e == opts.start_epoch:
                                metainfo = sess.run(fc.get_metainfo_var())
                            else:
                                metainfo = None
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_sched = prune_and_grow(name,
                                                             fc,
                                                             png_data,
                                                             random_gen,
                                                             steps,
                                                             total_steps,
                                                             opts,
                                                             metainfo=metainfo)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )
                                logger.info(
                                    f"Pruned proportion: {prune_sched}")
                                if opts.use_wandb:
                                    wandb.log({'Prune Schedule': prune_sched},
                                              commit=False)

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {train_outputs['acc']}\n")
                    if opts.use_wandb:
                        wandb.log(
                            {
                                'Loss': train_outputs['mean_loss'],
                                'Accuracy': train_outputs['acc'],
                                'Throughput': throughput
                            },
                            commit=True)
                    progress.set_description(
                        f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                    )

                    # Only need to feed an updated sparsity representation if we are running rig-L:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                # Checkpoint every opts.checkpoint_freq epochs
                if e % opts.checkpoint_freq == 0:
                    logger.info(f"Saving...")
                    saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer)
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            # Report the last entry of each dequeued metric
            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
            )
            if opts.use_wandb:
                wandb.run.summary["Test Loss"] = test_loss
                wandb.run.summary["Test Accuracy"] = test_acc
Example #11
0
def create_ipu_config():
    """Configure the IPU system to use one automatically selected IPU.

    The config object is applied via ``configure_ipu_system`` and is not
    returned; the function returns ``None``.
    """
    single_ipu_config = IPUConfig()
    single_ipu_config.auto_select_ipus = 1
    single_ipu_config.configure_ipu_system()
Example #12
0
def generic_train_graph(opts, is_training):
    """Build the training graph, configure the IPU system and open a session.

    Inside a fresh ``tf.Graph`` this creates the learning-rate placeholder,
    the id embeddings, the training dataset with its infeed queue, and a
    compiled on-device loop that runs ``opts['batches_per_step']`` training
    steps while summing loss, auxiliary loss and accuracy. The sums are then
    divided by the number of steps to give per-step averages.

    Note: the function reads a ``seed`` name that is not defined locally, so
    it has to be available at module level.

    Args:
        opts: dict of options; keys read here are ``use_synthetic_data``,
            ``batches_per_step``, ``use_ipu_model`` and ``replicas``.
        is_training: forwarded to ``id_embedding`` (the dataset itself is
            always built with ``is_training=True``).

    Returns:
        A tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.
    """
    float_type = 'float32'
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(float_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if not opts['use_synthetic_data']:
            dataset_train = get_dataset_embed(opts, is_training=True)
        else:
            dataset_train = get_synthetic_dataset(opts)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(dataset_train)

        with ipu_scope('/device:IPU:0'):

            def accumulate_step(total_loss, total_aux_loss, total_accuracy,
                                uids, mids, cats, mid_his, cat_his, mid_mask,
                                target, seqlen):
                # One training step: build the model on this batch and add its
                # metrics to the running totals
                prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                    opts,
                    uid_embedding,
                    mid_embedding,
                    cat_embedding,
                    placeholders['learning_rate'],
                    uids,
                    mids,
                    cats,
                    mid_his,
                    cat_his,
                    mid_mask,
                    target,
                    seqlen,
                    use_negsampling=False)

                # Make the returned totals depend on the gradient update
                with tf.control_dependencies([grad_op]):
                    new_loss = total_loss + loss
                    new_aux_loss = total_aux_loss + aux_loss
                    new_accuracy = total_accuracy + accuracy
                    return new_loss, new_aux_loss, new_accuracy

            def training_loop():
                # All three accumulators start from the same zero constant
                zero = tf.constant(0, np.float32)
                return loops.repeat(opts['batches_per_step'], accumulate_step,
                                    [zero, zero, zero], infeed_train)

            summed_metrics = ipu_compiler.compile(training_loop, [])
            steps_per_run = opts['batches_per_step']
            avg_loss, avg_aux_loss, avg_accuracy = (
                total / steps_per_run for total in summed_metrics)

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    # No outfeed is used by this graph
    outfeed = None

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_config = IPUConfig()
    ipu_config.optimizations.combine_embedding_lookups = True
    ipu_config.allow_recompute = True
    ipu_config.auto_select_ipus = [opts['replicas']]
    ipu_config.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    session = tf.compat.v1.Session(graph=graph)
    train_ops = [avg_loss, avg_aux_loss, avg_accuracy]
    graph_ops = GraphOps(session, init, train_ops, placeholders, infeed_train,
                         outfeed, saver)

    return graph_ops, uid_embedding, mid_embedding, cat_embedding
Example #13
0
def run_testing(opts, transformer, x_test, y_test):
    """Evaluate the most recent training checkpoint on the test set.

    Builds a dedicated inference graph, streams the test set through the IPU
    in a single session call, logs loss / accuracy / throughput and finally
    checks the accuracy against a fixed regression threshold.

    Args:
        opts: Run options. Reads ``batch_size``, ``dtype``, ``num_shards``
            and ``train_checkpoint_path``.
        transformer: Model object handed on to ``forward_pass``; its
            ``source_sequence_length`` is used for the throughput figure.
        x_test: Test inputs; the first axis indexes samples.
        y_test: Integer test labels, one per sample.

    Raises:
        ValueError: If no checkpoint is found under
            ``opts.train_checkpoint_path``.
        AssertionError: If the test accuracy is below the 0.85 threshold.
    """
    # Only whole batches are run: the dataset below drops the remainder.
    batches_per_epoch = len(y_test) // opts.batch_size
    testing_graph = tf.Graph()
    with testing_graph.as_default():
        with tf.device("cpu"):
            input_shape = [None, *x_test.shape[1:]]
            place_x = tf.placeholder(dtype=opts.dtype,
                                     shape=input_shape,
                                     name="input")
            place_y = tf.placeholder(dtype=tf.int32,
                                     shape=[None],
                                     name="label")

            # Create dataset and IPU feeds:
            dataset = tf.data.Dataset.from_tensor_slices(
                (place_x, place_y)).cache()
            dataset = dataset.batch(opts.batch_size, drop_remainder=True)
            test_infeed = IPUInfeedQueue(dataset)
            test_outfeed = IPUOutfeedQueue()

            # Helper function
            def loop_builder(iterations, builder_func, infeed):
                return loops.repeat(iterations, builder_func, [], infeed)

            # Compile the forward pass for testing
            with scopes.ipu_scope("/device:IPU:0"):
                test_loop = partial(forward_pass, opts, transformer, None,
                                    batches_per_epoch, False, test_outfeed,
                                    None)
                test_loop = partial(loop_builder, batches_per_epoch, test_loop,
                                    test_infeed)
                test_loop = ipu_compiler.compile(test_loop, inputs=[])

            # Metrics
            with tf.device("cpu"):
                metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                                 scope="metrics")
                metrics_initializer = tf.variables_initializer(
                    var_list=metrics_vars)
                saver = tf.train.Saver()

                # Declared here so the graph can be finalized before running
                test_outfeed_dequeue = test_outfeed.dequeue()

    # Setup and acquire an IPU device:
    config = IPUConfig()
    config.auto_select_ipus = opts.num_shards
    config.configure_ipu_system()

    logpath = os.path.join(opts.train_checkpoint_path, "test")
    checkpoint = tf.train.latest_checkpoint(opts.train_checkpoint_path)
    if checkpoint is None:
        # latest_checkpoint() returns None when nothing has been saved yet;
        # report that here instead of failing later inside saver.restore().
        raise ValueError(
            f"No checkpoint found in '{opts.train_checkpoint_path}'; "
            "run training before testing.")
    # NOTE(review): nothing is written through this writer below; it is kept
    # because constructing it presumably creates the log directory - confirm
    # before removing.
    summary_writer = tf.summary.FileWriter(logpath)

    testing_graph.finalize()  # no more new ops added from here on out
    with tf.Session(graph=testing_graph) as sess:
        logger.info("Testing...")
        # The sparsity will also  be streamed from the checkpoint
        # The host and device sparsity are not in sync here
        saver.restore(sess, checkpoint)
        sess.run(test_infeed.initializer,
                 feed_dict={
                     place_x: x_test,
                     place_y: y_test
                 })
        sess.run(metrics_initializer)

        # Run inference (whole dataset in one session call)
        dt = time.perf_counter()
        sess.run(test_loop)
        dt = time.perf_counter() - dt
        session_outputs = sess.run(test_outfeed_dequeue)

        # Test set performance. Count only the samples that were actually
        # processed (whole batches), matching how run_training measures it.
        num_tokens = (transformer.source_sequence_length * batches_per_epoch *
                      opts.batch_size)
        throughput = num_tokens / dt
        test_loss = session_outputs['mean_loss'].mean()
        test_acc = session_outputs['acc'][-1]
        desc = f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}"
        logger.info(desc + f" Throughput {throughput:.1f} token/s")

    # Regression tests
    accuracy_threshold = 0.85
    assert test_acc >= accuracy_threshold, f"Test accuracy ({test_acc:3.2f}) is below threshold of ({accuracy_threshold:3.2f})"
    print("All asserts pass.")
Example #14
0
def run_training(opts, transformer, x_train, y_train):
    """Train the model on an IPU, pruning/regrowing between steps if enabled.

    Each epoch is split into ``opts.steps_per_epoch`` device loops. After
    every loop the training metrics are dequeued and shown on the progress
    bar; when ``transformer.prune_ratio`` is set, the prune-and-grow data is
    also dequeued, applied on the host and streamed back to the device. A
    checkpoint is written at the end of every epoch.

    Args:
        opts: Run options. Reads ``batch_size``, ``steps_per_epoch``,
            ``nepochs``, ``dtype``, ``random_seed``, ``num_shards``,
            ``train_checkpoint_path`` and ``cosine_prune_schedule``.
        transformer: Model object handed on to ``forward_pass``; also
            provides the sparsity update / prune-and-regrow hooks.
        x_train: Training inputs; the first axis indexes samples.
        y_train: Integer training labels, one per sample.

    Raises:
        ValueError: If ``opts.steps_per_epoch`` does not divide the number of
            batches per epoch exactly.
    """
    # Calculate dataset length
    num_train = len(y_train)
    batches_per_epoch = num_train // opts.batch_size
    batches_per_step = batches_per_epoch // (opts.steps_per_epoch)
    total_steps = (opts.steps_per_epoch) * opts.nepochs
    logging.info(
        f"Batches per epoch: {batches_per_epoch} Batches per step: {batches_per_step}"
    )

    if not batches_per_epoch % (opts.steps_per_epoch) == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Construct the training graph
    training_graph = tf.Graph()
    with training_graph.as_default():
        with tf.device("cpu"):
            input_shape = [None, *x_train.shape[1:]]
            place_x = tf.placeholder(dtype=opts.dtype,
                                     shape=input_shape,
                                     name="input")
            place_y = tf.placeholder(dtype=tf.int32,
                                     shape=[None],
                                     name="label")
            lr_placeholder = tf.placeholder(opts.dtype, shape=[])

            # Create dataset and IPU feeds.
            # Cache BEFORE shuffling: a cache placed after shuffle() stores
            # the first shuffled order and replays it on every repeat, which
            # silently disables reshuffle_each_iteration.
            dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
            dataset = dataset.cache().shuffle(buffer_size=len(y_train),
                                              reshuffle_each_iteration=True,
                                              seed=opts.random_seed)
            dataset = dataset.repeat().batch(opts.batch_size,
                                             drop_remainder=True)

            # Queues for streaming from host to device and back
            train_infeed = IPUInfeedQueue(dataset)
            train_outfeed = IPUOutfeedQueue()
            png_outfeed = IPUOutfeedQueue()

            # Helper function
            def loop_builder(iterations, builder_func, infeed):
                return loops.repeat(iterations, builder_func, [], infeed)

            # Compile the forward and backward pass for training
            with scopes.ipu_scope("/device:IPU:0"):
                train_loop = partial(forward_pass, opts, transformer,
                                     lr_placeholder, batches_per_step, True,
                                     train_outfeed, png_outfeed)
                train_loop = partial(loop_builder, batches_per_step,
                                     train_loop, train_infeed)
                train_loop = ipu_compiler.compile(train_loop, inputs=[])
                transformer.buildSparsityUpdateOps()

            # Metrics
            with tf.device("cpu"):
                metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                                 scope="metrics")
                metrics_initializer = tf.variables_initializer(
                    var_list=metrics_vars)
                saver = tf.train.Saver(max_to_keep=5)

                # These ops are declared here so that the graph can be frozen afterwards
                global_initializer = tf.global_variables_initializer()
                train_outfeed_dequeue = train_outfeed.dequeue()
                png_outfeed_dequeue = png_outfeed.dequeue()

    # Setup and acquire an IPU device:
    config = IPUConfig()
    config.auto_select_ipus = opts.num_shards
    config.configure_ipu_system()

    logpath = os.path.join(opts.train_checkpoint_path, "train")
    # NOTE(review): nothing is written through this writer below; it is kept
    # because constructing it presumably creates the log directory - confirm
    # before removing.
    summary_writer = tf.summary.FileWriter(logpath)

    # Run the model:
    training_graph.finalize()  # no more new ops added from here on out
    with tf.Session(graph=training_graph) as sess:
        logger.info("Creating training session")
        sess.run(global_initializer)
        sess.run(train_infeed.initializer,
                 feed_dict={
                     place_x: x_train,
                     place_y: y_train
                 })

        progress = tqdm(range(opts.nepochs),
                        bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
        for e in progress:
            for i in range(opts.steps_per_epoch):
                # Train the model
                sess.run(metrics_initializer)
                dt = time.perf_counter()
                sess.run(train_loop,
                         feed_dict={
                             lr_placeholder: learning_rate_schedule(e, opts)
                         })
                dt = time.perf_counter() - dt
                session_outputs = sess.run(train_outfeed_dequeue)
                logger.debug(f"Train outputs: {session_outputs}")

                # Calculate avg throughput
                num_tokens = transformer.source_sequence_length * batches_per_step * opts.batch_size
                throughput = num_tokens / dt
                desc = f"Loss {session_outputs['mean_loss'][-1]:.5f} " \
                    f"Accuracy {session_outputs['acc'][-1]:.5f} " \
                    f"Iteration: {session_outputs['iteration'][-1]}"
                progress.set_description(
                    desc + f" Throughput {throughput:.1f} token/s")

                # Perform pruning (if using RigL the dense grads from session_outputs are used)
                step = 1 + i + e * (opts.steps_per_epoch)
                if transformer.prune_ratio is not None:
                    t0 = time.perf_counter()
                    png_results = sess.run(png_outfeed_dequeue)
                    t1 = time.perf_counter()
                    # Keep only the last dequeued entry for every key
                    for k in png_results:
                        png_results[k] = png_results[k][-1]
                    logger.debug(
                        f"Prune and grow outputs: {png_results.keys()}")
                    logger.info(
                        f"Downloaded the prune and grow data from Device to Host in {t1-t0:0.3f} seconds"
                    )

                    transformer.syncPruneAndRegrowOnHost(
                        opts.cosine_prune_schedule, step, total_steps,
                        png_results)
                    transformer.streamSparsityFromHostToDevice()

            # Save at the end of each epoch
            logger.info("Saving model")
            saver.save(sess,
                       os.path.join(opts.train_checkpoint_path, 'model.ckpt'))
Example #15
0
def generic_infer_graph(opts, is_training):
    """Build the validation graph, configure the IPU and open a session.

    The compiled loop runs ``opts['batches_per_step']`` batches from the
    validation infeed and enqueues ``(prob, target, accuracy)`` for each of
    them on an outfeed queue.

    Args:
        opts: Option dictionary. Reads ``use_synthetic_data``,
            ``batches_per_step``, ``use_ipu_model`` and ``replicas``.
        is_training: Forwarded to ``id_embedding``. The real dataset is
            always requested with ``is_training=False``.

    Returns:
        A tuple ``(graph_ops, uid_embedding, mid_embedding, cat_embedding)``
        where ``graph_ops`` is the ``GraphOps`` bundle for the session.

    Note:
        ``seed`` is not a parameter; it is resolved from the enclosing
        scope (presumably a module-level value - confirm).
    """
    float_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(float_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if not opts['use_synthetic_data']:
            dataset_val = get_dataset_embed(opts, is_training=False)
        else:
            dataset_val = get_synthetic_dataset(opts)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(dataset_val)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn_validate():
                # Parameter names are kept as-is in case the infeed passes
                # the dataset elements by keyword.
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    # Only the probabilities and the accuracy are needed for
                    # validation; losses and the gradient op are discarded.
                    prob, _loss, _aux_loss, accuracy, _grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    return outfeed_queue.enqueue((prob, target, accuracy))

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    # Attach to the requested number of IPUs
    ipu_options = IPUConfig()
    ipu_options.optimizations.combine_embedding_lookups = True
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    sess = tf.compat.v1.Session(graph=infer_graph)
    graph_ops = GraphOps(sess, init, [outputs_val], placeholders, infeed_val,
                         outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
Example #16
0
def generic_graph(opts):
    """Build the training graph, configure the IPU and open a session.

    The compiled loop runs ``opts['batches_per_step']`` training batches and
    accumulates loss, auxiliary loss and accuracy, which are then averaged
    over the number of batches.

    Args:
        opts: Option dictionary. Reads ``seed``, ``use_synthetic_data``,
            ``batches_per_step``, ``use_ipu_model`` and ``replicas``.

    Returns:
        A tuple ``(graph_ops, uid_embedding, mid_embedding, cat_embedding)``
        where ``graph_ops`` is the ``GraphOps`` bundle for the session.
    """
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.placeholder(data_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])

        if not opts['use_synthetic_data']:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)
        else:
            # Synthetic data needs nothing fed from the host
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                # The first three arguments are the running totals carried by
                # the loop; the rest come from the infeed (names kept as-is in
                # case the elements are passed by keyword).
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        noclk_mids, noclk_cats,
                        use_negsampling=True)
                    # Tie the returned totals to the weight update
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                initial_totals = [tf.constant(0, data_type)] * 3
                return loops.repeat(opts['batches_per_step'], body,
                                    initial_totals, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            # Turn the accumulated totals into per-batch averages
            avg_loss, avg_aux_loss, avg_accuracy = [
                total / opts['batches_per_step'] for total in outputs_train
            ]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        if opts['use_ipu_model']:
            os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    # Attach to the requested number of IPUs
    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()
    utils.reset_ipu_seed(opts['seed'])

    sess = tf.Session(graph=graph)
    graph_ops = GraphOps(sess, init, [avg_loss, avg_aux_loss, avg_accuracy],
                         placeholders, infeed, saver, feed_dict_values)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
Example #17
0
from spektral.layers import EdgeConditionedConv, GlobalSumPool
from spektral.utils import label_to_one_hot

from qm9_argparser import get_argparser

################################################################################
# PARAMETERS (defaults set in get_argparser())
################################################################################
parser = get_argparser()
args = parser.parse_args()
# A profiling run uses no gradient accumulation and just two epochs;
# otherwise accumulate over six batches for the requested number of epochs.
if args.profile:
    gradient_accumulation_count, epochs = 1, 2
else:
    gradient_accumulation_count, epochs = 6, args.epochs

################################################################################
# CONFIGURE THE DEVICE
################################################################################
cfg = IPUConfig()
cfg.auto_select_ipus = args.num_ipus
cfg.configure_ipu_system()

# Make float16 the default Keras float type
tf.keras.backend.set_floatx('float16')

################################################################################
# LOAD DATA
################################################################################
# `amount` limits how much of QM9 is loaded; None trains on the whole dataset
A, X, E, y = qm9.load_data(
    return_type='numpy',
    nf_keys='atomic_num',
    ef_keys='type',
    self_loops=True,
    amount=args.amount,
)
Example #18
0
def set_up_ipu_devices(opts):
    """Acquire a single IPU and seed it.

    Args:
        opts: Run options; ``opts.seed`` is the seed to apply. When it is
            ``None`` the device seed is left untouched.
    """
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.configure_ipu_system()
    # Set the seed for the stochastic rounding.
    # reset_ipu_seed must be *called*: assigning to it would replace the
    # library function with the seed value and leave the device unseeded.
    if opts.seed is not None:
        ipu.utils.reset_ipu_seed(opts.seed)
Example #19
0
    # Make estimator
    estimator = create_estimator(args)

    if args.training:
        print("\nTraining...")
        train(estimator, args)

    if args.evaluation:
        print("\nEvaluating...")
        evaluate(estimator, args)

    if not (args.training or args.evaluation):
        # Configure IPU system for inference only
        # (no need to do this if an Estimator was already initialized)
        cfg = IPUConfig()
        if args.allow_recompute:
            cfg.allow_recompute = True
        # One pipeline's worth of IPUs for every inference replica
        cfg.auto_select_ipus = (args.num_replicas_infer *
                                args.num_ipus_in_pipeline_infer)
        cfg.device_connection.version = 'ipu2'
        cfg.device_connection.type = ipu.utils.DeviceConnectionType.ALWAYS
        # Convolutions and matmuls share the same partials type. Both read
        # `args.partials_type`; the convolution branch previously read the
        # misspelled `args.artials_type`.
        partials_type = 'half' if args.partials_type == 'float16' else 'float'
        cfg.convolutions.poplar_options = {'partialsType': partials_type}
        cfg.matmuls.poplar_options = {'partialsType': partials_type}
        cfg.configure_ipu_system()
Example #20
0
                current_state=initial_chain_state,
                kernel=hmc_kernel)

        # Compile the graph
        [p], kernel_results = ipu_compiler.compile(hmc_graph, [])
    return (p, kernel_results)

# Build one copy of the graph per IPU, each under its own device scope
ops = []
for i in range(args.num_ipus):
    with ipu_scope('/device:IPU:{}'.format(i)):
        ops.append(build_graph(scope_id=str(i)))


# Configure the IPU system:
# args.num_ipus separate TF devices, each backed by a single IPU
config = IPUConfig()
config.auto_select_ipus = [1] * args.num_ipus
config.configure_ipu_system()
utils.move_variable_initialization_to_cpu()

# Initialize all variables in the session
init_g = tf.global_variables_initializer()
sess.run(init_g)

# Run everything once before sampling
print("\nWarming up...")
sess.run(ops)
print("Done\n")

# Sample
Example #21
0
def generic_graph(opts, data, trainFlag):
    """Build a training or validation graph and open a session for it.

    The compiled loop accumulates loss and RMSE over ``batches_per_step``
    batches (divided by the replication factor); both are then averaged and
    the RMSE (plus loss and learning rate when training) is added to the
    TensorFlow summaries.

    Args:
        opts: Run options. Reads ``batches_per_step``,
            ``validation_batches_per_step``, ``replication_factor``,
            ``dtypes``, ``logs_path``, ``multiprocessing``, ``compile_only``,
            ``compile_only_ipu_version``, ``select_ipus`` and ``prng``.
        data: Object providing ``get_dataset(opts, mode=...)``, which returns
            the dataset and its placeholders.
        trainFlag: Mode selector; compared against ``util.Modes.TRAIN``.

    Returns:
        A ``GraphOps`` bundle holding the graph, session, initializer,
        outputs, placeholders (training only), infeed, saver, summary writer
        and the mode.
    """
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    if training:
        mode_name = 'training'
        batches_per_step = opts.batches_per_step
    else:
        mode_name = 'validation'
        batches_per_step = opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only need to do 1/N batches in each stream.
    # For this reason, batches_per_step must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    # The last column of a batch is the ground truth, the
                    # remaining columns are the observed values.
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=(placeholders['learning_rate']
                                       if training else None),
                        mode=trainFlag)
                    if training:
                        # Tie the returned totals to the weight update
                        with tf.control_dependencies([grad_op]):
                            return total_loss + loss, total_rmse + rmse
                    return total_loss + loss, total_rmse + rmse

                initial_totals = [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2
                return loops.repeat(batches_per_step, body, initial_totals,
                                    infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [total / batches_per_step for total in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                   graph=graph,
                                   flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    # (validation only configures a device when it runs in its own process)
    if training or opts.multiprocessing:
        ipu_config = IPUConfig()

        ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
        ipu_config.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000

        if opts.compile_only:
            ipu_config.device_connection.version = opts.compile_only_ipu_version
            ipu_config.device_connection.enable_remote_buffers = True
            ipu_config.device_connection.type = ipu_utils.DeviceConnectionType.PRE_COMPILE

        if opts.select_ipus != 'AUTO':
            # Index 0 is used for training, index 1 for validation
            ipu_config.select_ipus = [opts.select_ipus[not training]]
        else:
            ipu_config.auto_select_ipus = [opts.replication_factor]

        ipu_config.floating_point_behaviour.esr = opts.prng
        ipu_config.configure_ipu_system()

    graph_outputs = [avg_loss if training else avg_rmse, summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if training else None, infeed, saver, writer,
                    trainFlag)
Example #22
0
def bs_matmul_test(opts):
    """Run a block-sparse matmul on the IPU next to a dense reference.

    Two computations are built and executed: a plain ``tf.matmul`` reference
    (masked when the output is sparse) and the precompiled block-sparse
    custom op. Depending on ``opts.scenario`` the op is ``BuildDSD``
    (block-sparse right-hand side) or ``BuildDDS`` (scenario starting with
    ``"dds"``, block-sparse result); a scenario name longer than three
    characters additionally requests gradients for both operands.

    Args:
        opts: Test options. Reads ``data_type``, ``partial_data_type``,
            ``lhs_rows``, ``lhs_cols``, ``rhs_cols``, ``lhs_block_row``,
            ``lhs_block_col``, ``rhs_block_col``, ``sparsity_mask``,
            ``sparsity``, ``inner_group_size``, ``partition_method``,
            ``memory_cycle_ratio``, ``scenario``, ``transposed_rhs`` and
            ``group_dims``.

    Returns:
        A tuple ``(out, lhs_grad, rhs_grad, out_ref, lhs_grad_ref,
        rhs_grad_ref)``. The gradient entries are ``None`` when gradients
        were not requested. Reference values that correspond to a sparse
        tensor are converted with ``utils.to_block_sparse``.
    """
    data_type = opts.data_type
    partial_data_type = opts.partial_data_type

    # dim = [M, K, N] of the (M x K) @ (K x N) product; block_size holds the
    # matching block extents. Every dimension must be a whole number of blocks.
    dim = [opts.lhs_rows, opts.lhs_cols, opts.rhs_cols]
    block_size = [opts.lhs_block_row, opts.lhs_block_col, opts.rhs_block_col]
    block_dim = [0] * 3
    for i in range(3):
        assert (dim[i] > 0)
        assert (block_size[i] > 0)
        assert (dim[i] % block_size[i] == 0)
        block_dim[i] = dim[i] // block_size[i]

    # Either an explicit mask (one digit per block) or a sparsity level
    if opts.sparsity_mask is not None:
        sparsity_or_mask = list(int(c) for c in opts.sparsity_mask)
    else:
        sparsity_or_mask = opts.sparsity

    inner_group_size = opts.inner_group_size
    partition_method = opts.partition_method
    memory_cycle_ratio = opts.memory_cycle_ratio

    tf_type = tf.float32 if data_type == "float" else tf.float16

    # Decode the scenario: the first three characters pick the op, any
    # further characters switch on gradient computation.
    sparse_out = False
    op_name = "BuildDSD"
    compute_grads = False
    if opts.scenario[:3] == "dds":
        sparse_out = True
        op_name = "BuildDDS"
    if len(opts.scenario) > 3:
        compute_grads = True

    # A transposed right-hand side is only honoured when the rhs is the
    # sparse operand.
    transposed_rhs = False
    if (not sparse_out):
        transposed_rhs = opts.transposed_rhs

    # Shape, block size and mask shape of the tensor that is block-sparse:
    # the rhs (K x N, or N x K when transposed) or the result (M x N).
    if (not sparse_out):
        if not transposed_rhs:
            dim_dense = [dim[1], dim[2]]
            block_size_sparse = [block_size[1], block_size[2]]
            dim_sparse_mask = [block_dim[1], block_dim[2]]
        else:
            dim_dense = [dim[2], dim[1]]
            block_size_sparse = [block_size[2], block_size[1]]
            dim_sparse_mask = [block_dim[2], block_dim[1]]
    else:
        dim_dense = [dim[0], dim[2]]
        block_size_sparse = [block_size[0], block_size[2]]
        dim_sparse_mask = [block_dim[0], block_dim[2]]
    # Optional leading group dimensions are prepended
    if opts.group_dims is not None:
        dim_dense = opts.group_dims + dim_dense
        dim_sparse_mask = opts.group_dims + dim_sparse_mask

    sparse_matrix, dense_masked_matrix, sparsity_mask = utils.create_block_sparse_tensor(
        dim_dense, block_size_sparse, sparsity_or_mask)
    if transposed_rhs:
        # Swap the last two axes so the dense reference is in K x N layout
        sparse_transposed_indices = list(range(len(dim_dense)))
        sparse_transposed_indices[-2], sparse_transposed_indices[
            -1] = sparse_transposed_indices[-1], sparse_transposed_indices[-2]
        dense_masked_matrix = dense_masked_matrix.transpose(
            sparse_transposed_indices)
        # sparsity_mask is deliberately left in transposed form

    # Sum of the mask entries, i.e. the number of non-zero blocks
    # (assuming the mask holds 0/1 values)
    nz = reduce(add, sparsity_mask, 0)
    logger.debug(f"sparsity_mask: {sparsity_mask}, nz blocks: {nz}")

    # Shapes of the operands and of the result. The block-sparse tensor is
    # stored as [number of non-zero blocks, elements per block].
    dim_lhs = [dim[0], dim[1]]
    dim_block_sparse = [nz, block_size_sparse[0] * block_size_sparse[1]]
    if opts.group_dims is not None:
        dim_lhs = opts.group_dims + dim_lhs
    if (not sparse_out):
        dim_rhs = dim_block_sparse
        dim_res = [dim[0], dim[2]]
        if opts.group_dims is not None:
            dim_res = opts.group_dims + dim_res
    else:
        dim_rhs = [dim[1], dim[2]]
        if opts.group_dims is not None:
            dim_rhs = opts.group_dims + dim_rhs
        dim_res = dim_block_sparse

    lhs_np = utils.create_dense_tensor(dim_lhs)
    lhs = tf.Variable(lhs_np, dtype=tf_type)

    if (not sparse_out):
        # Sparse rhs for the custom op, masked dense rhs for the reference
        rhs = tf.Variable(sparse_matrix, dtype=tf_type)
        rhs_ref = tf.Variable(dense_masked_matrix, dtype=tf_type)
    else:
        # Both operands are dense and shared by the two computations
        rhs_np = utils.create_dense_tensor(dim_rhs)
        rhs = tf.Variable(rhs_np, dtype=tf_type)
        rhs_ref = rhs
        # Expand the block mask to element resolution so the dense reference
        # result can be masked like the sparse output
        sparsity_mask_2d = np.reshape(sparsity_mask, dim_sparse_mask)
        block_one = np.ones([block_size[0], block_size[2]], dtype=np.float32)
        res_mask_np = np.kron(sparsity_mask_2d, block_one)
        res_mask = tf.constant(res_mask_np, dtype=tf_type)

    # Dense reference computation. Four variants: with/without gradients and
    # with/without masking of the result (masking only for sparse output).
    if (not sparse_out):
        if compute_grads:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    # Gradients of the summed result w.r.t. both operands
                    s = tf.reduce_sum(c)
                    a_grad = tf.gradients(s, a)
                    b_grad = tf.gradients(s, b)
                    return c, a_grad, b_grad
        else:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    return c
    else:
        if compute_grads:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    # Zero the elements outside the non-zero blocks
                    c = c * res_mask
                    s = tf.reduce_sum(c)
                    a_grad = tf.gradients(s, a)
                    b_grad = tf.gradients(s, b)
                    return c, a_grad, b_grad
        else:

            def dense_matmul(a, b):
                with tf.variable_scope("matmul",
                                       reuse=tf.AUTO_REUSE,
                                       use_resource=True):
                    c = tf.matmul(a, b)
                    # Zero the elements outside the non-zero blocks
                    c = c * res_mask
                    return c

    # Attributes handed to the custom op as a JSON string; the mask is
    # serialised as one character per block.
    bs_matmul_args = {
        "dim": dim,
        "block_size": block_size,
        "sparsity_mask": "".join(str(c) for c in sparsity_mask),
        "transposed_rhs": transposed_rhs,
        "data_type": data_type,
        "partial_data_type": partial_data_type,
        "inner_group_size": inner_group_size,
        "partition_method": partition_method,
        "memory_cycle_ratio": memory_cycle_ratio
    }
    json_attribs = json.dumps(bs_matmul_args)

    logger.debug(f"json_attribs: {json_attribs}")

    # Block-sparse computation through the precompiled custom op, with or
    # without gradients for both inputs.
    if compute_grads:

        def bs_matmul(a, b):
            outputs = {
                "output_types": [tf_type],
                "output_shapes": [tf.TensorShape(dim_res)]
            }
            lib_path = utils.get_lib_path("block_sparse")

            with tf.variable_scope("bs_matmul",
                                   reuse=tf.AUTO_REUSE,
                                   use_resource=True):
                c = ipu.custom_ops.precompiled_user_op(
                    [a, b],
                    lib_path,
                    outs=outputs,
                    op_name=op_name,
                    separate_gradients=False,
                    inputs_with_gradients=[0, 1],
                    attributes=json_attribs,
                    gradient_attributes=json_attribs)

                s = tf.reduce_sum(c)
                a_grad = tf.gradients(s, a)
                b_grad = tf.gradients(s, b)
            return c, a_grad, b_grad
    else:

        def bs_matmul(a, b):
            outputs = {
                "output_types": [tf_type],
                "output_shapes": [tf.TensorShape(dim_res)]
            }
            lib_path = utils.get_lib_path("block_sparse")

            with tf.variable_scope("bs_matmul",
                                   reuse=tf.AUTO_REUSE,
                                   use_resource=True):
                c = ipu.custom_ops.precompiled_user_op(
                    [a, b],
                    lib_path,
                    outs=outputs,
                    op_name=op_name,
                    separate_gradients=False,
                    inputs_with_gradients=[],
                    attributes=json_attribs)
            return c

    # Configure the IPU:
    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()

    # Compile both computations for the same device; they share `lhs`
    with ipu.scopes.ipu_scope("/device:IPU:0"):
        dense_matmul_fetches = ipu.ipu_compiler.compile(
            dense_matmul, [lhs, rhs_ref])
        bs_matmul_fetches = ipu.ipu_compiler.compile(bs_matmul, [lhs, rhs])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        results_ref = sess.run(dense_matmul_fetches)
        results = sess.run(bs_matmul_fetches)

    # Unpack the fetched values. tf.gradients returns a list, hence the [0]
    # on the gradient entries; the custom op result carries one extra level
    # of nesting in the gradient case (presumably because the op returns a
    # list of outputs - confirm).
    if compute_grads:
        out_ref, lhs_grad_ref, rhs_grad_ref = (results_ref[0],
                                               results_ref[1][0],
                                               results_ref[2][0])
        out, lhs_grad, rhs_grad = (results[0][0], results[1][0], results[2][0])
    else:
        out_ref, lhs_grad_ref, rhs_grad_ref = (results_ref[0], None, None)
        out, lhs_grad, rhs_grad = (results[0], None, None)

    # Bring the reference values into the block-sparse form of whichever
    # tensor is sparse: the output (dds) or the rhs gradient (dsd).
    if (sparse_out):
        out_ref = utils.to_block_sparse(np.array(out_ref), block_size_sparse,
                                        sparsity_mask)
    else:
        if compute_grads:
            rhs_grad_ref = np.array(rhs_grad_ref)
            if transposed_rhs:
                # Back to the transposed layout the mask is expressed in
                rhs_grad_ref = rhs_grad_ref.transpose(
                    sparse_transposed_indices)
            rhs_grad_ref = utils.to_block_sparse(rhs_grad_ref,
                                                 block_size_sparse,
                                                 sparsity_mask)

    return out, lhs_grad, rhs_grad, out_ref, lhs_grad_ref, rhs_grad_ref
Example #23
0
def run_model(opts):
    """Train and/or evaluate the block-sparse MNIST classifier on one IPU.

    MNIST is loaded through Keras, divided by 255 and flattened. Depending on
    ``opts.test_mode`` a training loop, a test loop, or both are compiled for
    ``/device:IPU:0`` and executed. After training, the variables are saved to
    a checkpoint named ``"model"``.

    Relies on module-level names that are not defined in this function:
    ``h1Size``, ``block_size``, ``model``, ``loop_builder`` and ``scheduler``.

    Args:
        opts: Options object. The attributes read here are ``test_mode``
            (``"all"``, ``"training"`` or ``"tests"``), ``sparsity`` and
            ``epochs``.

    Raises:
        ValueError: If the number of batches per epoch is not a multiple of
            the fixed number of IPU steps per epoch.
    """
    training = opts.test_mode in ["all", "training"]
    testing = opts.test_mode in ["all", "tests"]

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = 16
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # The dense weight matrix must tile exactly into blocks.
    w_dense_shape = [num_pixels, h1Size]
    assert (batch_size % block_size[0] == 0)
    assert (w_dense_shape[0] % block_size[1] == 0)
    assert (w_dense_shape[1] % block_size[2] == 0)
    block_rows = w_dense_shape[0] // block_size[1]
    block_cols = w_dense_shape[1] // block_size[2]

    # A negative sparsity leaves the mask unset.
    sparsity_mask = None
    if opts.sparsity >= 0.0:
        sparsity_mask = utils.create_random_sparse_mask(
            opts.sparsity, block_rows, block_cols).flatten()

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    epochs = opts.epochs
    ipu_steps_per_epoch = 15
    batches_per_epoch = num_train // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // ipu_steps_per_epoch
    if batches_per_epoch % ipu_steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {ipu_steps_per_epoch} must divide batches per epoch {batches_per_epoch}."
        )

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32,
                                 shape=data_shape,
                                 name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])

    # Create dataset and IPU feeds. Both infeeds wrap the same placeholder
    # backed dataset; which data they stream is decided by the feed_dict
    # supplied when each queue's initializer is run.
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.cache().repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Use function binding to create all the builder functions that are needed:
    if training:
        bound_train_model = partial(model, lr_placeholder, outfeed_train_queue,
                                    True, sparsity_mask)
        bound_train_loop = partial(loop_builder, batches_per_step,
                                   bound_train_model, infeed_train_queue)
    if testing:
        bound_test_model = partial(model, lr_placeholder, outfeed_test_queue,
                                   False, sparsity_mask)
        bound_test_loop = partial(loop_builder, test_batches, bound_test_model,
                                  infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        if training:
            train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        if testing:
            test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    if training:
        dequeue_train_outfeed = outfeed_train_queue.dequeue()
    if testing:
        dequeue_test_outfeed = outfeed_test_queue.dequeue()

    # Create a benchmark program for the infeed to determine maximum achievable throughput:
    infeed_perf = dataset_benchmark.infeed_benchmark(infeed_train_queue,
                                                     epochs, num_train, True)

    print(
        f"\nImage shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    print(
        f"Epochs: {epochs} Batch-size: {batch_size} Steps-per-epoch: {ipu_steps_per_epoch} Batches-per-step: {batches_per_step}"
    )

    # Run the model:
    with tf.Session() as sess:
        print("Benchmarking the infeed...")
        sess.run(infeed_perf,
                 feed_dict={
                     place_x: x_train_flat,
                     place_y: y_train
                 })

        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer,
                 feed_dict={
                     place_x: x_train_flat,
                     place_y: y_train
                 })

        if training:
            print("Training...")
            progress = tqdm(
                range(epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:

                sess.run(metrics_initializer)
                for _ in range(ipu_steps_per_epoch):
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e)})
                    result = sess.run(dequeue_train_outfeed)
                    # Only update the progress bar when the outfeed actually
                    # returned values. The length (not an element-wise
                    # comparison) is what must be non-zero here.
                    if len(result['mean_loss']) != 0 and len(
                            result['acc']) != 0:
                        progress.set_description(
                            f"Loss {result['mean_loss'][0]:.5f} Accuracy {result['acc'][0]:.5f}"
                        )

            print("Saving...")
            saver.save(sess, "model")

        if testing:
            print("Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer,
                     feed_dict={
                         place_x: x_test_flat,
                         place_y: y_test
                     })
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = np.mean(result['mean_loss'])
            test_acc = np.mean(result['acc'])
            print(f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
Example #24
0
 def test_gru(self):
     """Run the dynamic GRU model once on the IPU and check its results.

     The kernel variable named ``popnn_dynamic_gru/kernel:0`` is overwritten
     with fixed values, the compiled ``self.gru_model`` is executed on
     all-ones inputs, and both the fetched outputs and the kernel value read
     back afterwards are compared (via the mean of the differences) with
     stored reference values.
     """
     seq_length = 2
     batch = 3
     feed_inputs = np.ones((batch, seq_length, 2), np.float32)
     feed_seq_len = np.array([1, 2, 2], np.int32)

     # Fixed starting value for the GRU kernel.
     initial_kernel = np.array([
         [0.36324948, 0.34305102, -0.47945526, 0.29105264, -0.55362725, 0.33607864],
         [-0.20881158, 0.79369456, 0.3866263, -0.55099547, 0.41944432, 0.39612126],
         [0.48400682, 0.16632384, -0.78809285, 0.47519642, 0.4464376, -0.63623476],
         [-0.57933414, -0.29082513, -0.7381171, 0.77089626, -0.24111485, 0.9164796],
     ])
     # Reference values for the model outputs and for the kernel after the run.
     outputs_expected = np.array([
         [[-0.03196924, 0.06592286], [0.0, 0.0]],
         [[-0.03196924, 0.06592286], [-0.06241067, 0.12973404]],
         [[-0.03196924, 0.06592286], [-0.06241067, 0.12973404]],
     ])
     gru_kernel_expected = np.array([
         [0.35011762, 0.37606436, -0.4793783, 0.29105875, -0.6845508, 0.3001622],
         [-0.22194342, 0.8267079, 0.38670325, -0.55098933, 0.28852075, 0.36020482],
         [0.48412853, 0.16602053, -0.7880953, 0.4751962, 0.4473563, -0.6360037],
         [-0.57958513, -0.2901997, -0.73811203, 0.7708967, -0.24294817, 0.9160184],
     ])

     inputs_ph = tf.placeholder(shape=[batch, seq_length, self.HIDDEN_SIZE],
                                dtype=self.model_dtype)
     seq_len_ph = tf.placeholder(shape=[batch], dtype=tf.int32)

     ipu_cfg = IPUConfig()
     ipu_cfg.auto_select_ipus = 1
     ipu_cfg.configure_ipu_system()
     utils.move_variable_initialization_to_cpu()

     with ops.device("/device:IPU:0"):
         train_ipu = ipu_compiler.compile(self.gru_model,
                                          inputs=[inputs_ph, seq_len_ph])

     with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())

         # Locate the kernel variable by name so it can be overwritten.
         for candidate in tf.global_variables():
             if candidate.name == 'popnn_dynamic_gru/kernel:0':
                 gru_kernel_var = candidate
         sess.run(tf.assign(gru_kernel_var, initial_kernel))

         feeds = {inputs_ph: feed_inputs, seq_len_ph: feed_seq_len}
         outputs = sess.run(train_ipu, feed_dict=feeds)
         gru_kernel_updated = sess.run(gru_kernel_var)

         output_error = np.mean(outputs - outputs_expected)
         kernel_error = np.mean(gru_kernel_expected - gru_kernel_updated)
         self.assertAlmostEqual(output_error, np.float32(0.0), delta=1e-7)
         self.assertAlmostEqual(kernel_error, np.float32(0.0), delta=1e-8)
Example #25
0
def training_graph(opts, training_data, device_index=0, learning_rate=0.001):
    """Assemble the training graph, its session and its summary writer.

    Args:
        opts: Options object; ``batches_per_step``, ``logs_path``,
            ``fp_exceptions`` and ``prng`` are read here, and it is also
            forwarded to ``training_data.get_dataset`` and ``graph_builder``.
        training_data: Object whose ``get_dataset(opts, is_training=True)``
            returns the dataset and a placeholder dictionary.
        device_index: Only used to name the summary log directory.
        learning_rate: Value recorded in the ``"learning_rate"`` summary. The
            rate fed to ``graph_builder`` is ``placeholders["learning_rate"]``.

    Returns:
        A ``GraphOps`` bundling the graph, session, initializer, fetches
        ``[loss, summary, rmse]``, placeholders, infeed, saver and writer.
    """
    train_graph = tf.Graph()

    with train_graph.as_default():
        dataset, _, placeholders = training_data.get_dataset(opts,
                                                             is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        def train_step(loss_so_far, rmse_so_far, *batch):
            # The first infeed element holds the observed ratings.
            step_loss, step_rmse, apply_op = graph_builder(
                opts,
                observed_ratings=batch[0],
                learning_rate=placeholders["learning_rate"])
            # Make the returned accumulators depend on the weight update.
            with tf.control_dependencies([apply_op]):
                return loss_so_far + step_loss, rmse_so_far + step_rmse

        def build_loop():
            initial_accumulators = [tf.constant(0, tf.float32),
                                    tf.constant(0, tf.float32)]
            return loops.repeat(opts.batches_per_step, train_step,
                                initial_accumulators, infeed)

        with ipu_scope('/device:IPU:0'):
            total_loss, sum_rmse_metric = ipu_compiler.compile(build_loop, [])

        # Average the accumulated metrics over the batches in one step.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("RMSE/train", rmse)

        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    log_dir = opts.logs_path + '/train{0}'.format(device_index)
    train_writer = tf.summary.FileWriter(log_dir,
                                         graph=train_graph,
                                         flush_secs=30)

    # Configure floating point behaviour and acquire a single IPU.
    ipu_options = IPUConfig()
    fp_behaviour = ipu_options.floating_point_behaviour
    fp_behaviour.inv = opts.fp_exceptions
    fp_behaviour.div0 = opts.fp_exceptions
    fp_behaviour.oflo = opts.fp_exceptions
    fp_behaviour.esr = opts.prng
    fp_behaviour.nanoo = True
    ipu_options.auto_select_ipus = 1
    ipu_options.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)
    fetches = [loss, train_summary, rmse]

    return GraphOps(train_graph, train_sess, train_init, fetches,
                    placeholders, infeed, train_saver, train_writer)
Example #26
0
    def test_augru(self):
        """Run the AUGRU model once on the IPU and check its results.

        The kernel variable named ``popnn_augru/kernel:0`` is overwritten with
        fixed values, the compiled ``self.augru_model`` is executed on
        all-ones inputs with attention scores of 0.5, and both the fetched
        outputs and the kernel value read back afterwards are compared (via
        the mean of the differences) with stored reference values.
        """
        seq_length = 3
        batch = 3
        feed_inputs = np.ones([batch, seq_length, self.HIDDEN_SIZE],
                              np.float32)
        feed_seq_len = np.array([1, 3, 2], np.int32)
        feed_alphas = np.full([seq_length, batch], 0.5, np.float32)

        # Fixed starting value for the AUGRU kernel.
        initial_kernel = np.array([
            [0.3188401, 0.8256132, -0.12287354, 0.8648142, -0.17983055, -0.45415568],
            [-0.29249465, 0.65579015, -0.75681853, 0.4331085, -0.07700777, -0.47652483],
            [-0.20116574, 0.52735907, -0.08258069, -0.21897888, -0.54514384, 0.32709408],
            [-0.43361932, -0.62175727, 0.28278595, 0.13071388, -0.29585528, -0.14058399],
        ])
        # Reference values for the outputs and for the kernel after the run.
        outputs_expected = np.array([
            [[-0.15881832, -0.39365855], [0., 0.], [0., 0.]],
            [[-0.15881832, -0.39365855], [-0.1270374, -0.56743807],
             [-0.09283338, -0.6407641]],
            [[-0.15881832, -0.39365855], [-0.1270374, -0.56743807], [0., 0.]],
        ])
        augru_kernel_expected = np.array([
            [0.31478855, 0.81888944, -0.12453551, 0.863326, -0.40852502, -0.5518727],
            [-0.2965462, 0.6490664, -0.7584805, 0.4316203, -0.30570224, -0.5742418],
            [-0.20129025, 0.52758944, -0.08233033, -0.21876118, -0.5368969, 0.3306306],
            [-0.43399453, -0.6211322, 0.28351453, 0.13140172, -0.25127774, -0.12138209],
        ])

        inputs_ph = tf.placeholder(shape=[batch, seq_length, self.HIDDEN_SIZE],
                                   dtype=self.model_dtype)
        seq_len_ph = tf.placeholder(shape=[batch], dtype=tf.int32)
        alphas_ph = tf.placeholder(shape=[seq_length, batch],
                                   dtype=self.model_dtype)

        ipu_cfg = IPUConfig()
        ipu_cfg.auto_select_ipus = 1
        ipu_cfg.configure_ipu_system()
        utils.move_variable_initialization_to_cpu()

        with ops.device("/device:IPU:0"):
            train_ipu = ipu_compiler.compile(
                self.augru_model, inputs=[inputs_ph, seq_len_ph, alphas_ph])

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # Locate the kernel variable by name so it can be overwritten.
            for candidate in tf.global_variables():
                if candidate.name == 'popnn_augru/kernel:0':
                    augru_kernel_var = candidate
            sess.run(tf.assign(augru_kernel_var, initial_kernel))

            feeds = {
                inputs_ph: feed_inputs,
                seq_len_ph: feed_seq_len,
                alphas_ph: feed_alphas,
            }
            outputs = sess.run(train_ipu, feed_dict=feeds)
            augru_kernel_updated = sess.run(augru_kernel_var)

            output_error = np.mean(outputs - outputs_expected)
            kernel_error = np.mean(augru_kernel_expected -
                                   augru_kernel_updated)
            self.assertAlmostEqual(output_error, np.float32(0.0), delta=1e-7)
            self.assertAlmostEqual(kernel_error, np.float32(0.0), delta=1e-8)
def main(args):
    """Check that the sparse transformer agrees with an equivalent dense one.

    A ``DynsparseTransformer`` is built and run on random activations, and
    its weights are written to a temporary checkpoint in dense form. A
    ``DenseTransformer`` is then built, restored from that checkpoint and run
    on the same activations. Output activations, input gradients and the
    per-layer weight gradients of both runs are compared.

    The temporary checkpoint directory is removed once the dense model has
    been restored and run.

    Args:
        args: Options object. The attributes read here are ``random_seed``,
            ``batch_size``, ``source_sequence_length``, ``hidden_length`` and
            ``dtype``; it is also passed to both transformer constructors.

    Returns:
        Tuple ``(sparse_result, dense_result)`` holding what each compiled
        model returned from ``sess.run``.

    Raises:
        AssertionError: From ``np.testing.assert_allclose`` if any compared
            pair of results differs by more than the tolerances.
    """
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=args.random_seed)
    activations_np = random_gen.uniform(
        -0.1, 0.1,
        size=(args.batch_size, args.source_sequence_length, args.hidden_length))

    # Configure the IPU
    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()

    # Build the sparse IPU graph
    sparse_decoder_graph = tf.Graph()
    sparse_transformer = DynsparseTransformer(args)
    with sparse_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weight placeholders are created inside sparse_transformer
            sparse_inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            sparse_decoder = partial(sparse_transformer_fwd_and_grad, sparse_transformer)
            sparse_decoder_fetches = ipu.ipu_compiler.compile(sparse_decoder, [sparse_inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()

    # The checkpoint is only needed to carry the weights from the sparse
    # model to the dense one, so keep it in a directory that is cleaned up
    # automatically instead of leaking a mkdtemp() directory.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")

        # sparse-decoder
        with tf.Session(graph=sparse_decoder_graph) as sess:
            # initialize weights
            sess.run(tf.global_variables_initializer())

            # Save the sparse weights to checkpoint as dense
            sparse_transformer.checkpointAsDense(checkpoint_path)

            # run sparse decoder
            sparse_result = sess.run(sparse_decoder_fetches,
                                     feed_dict={sparse_inputs_ph: activations_np})

        # Create a dense transformer and initialize the weights to the values
        # that the sparse model was initialized with originally
        dense_decoder_graph = tf.Graph()
        dense_transformer = DenseTransformer(args)
        with dense_decoder_graph.as_default():
            with tf.device("cpu"):
                # placeholder for activations
                # weights will get streamed from checkpoint
                dense_inputs_ph = tf.placeholder(args.dtype, activations_np.shape)

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                dense_decoder = partial(dense_transformer_fwd_and_grad, dense_transformer)
                dense_decoder_fetches = ipu.ipu_compiler.compile(dense_decoder, [dense_inputs_ph])
                ipu.utils.move_variable_initialization_to_cpu()

            with tf.device("cpu"):
                # We will only load the trainable variables, not momentum etc.
                loader = tf.train.Saver(tf.trainable_variables())

        # dense-decoder
        with tf.Session(graph=dense_decoder_graph) as sess:
            # Initialize momentums which are not part of the checkpoint
            sess.run(tf.global_variables_initializer())
            # Restore saved trainable variables
            loader.restore(sess, checkpoint_path)
            dense_result = sess.run(dense_decoder_fetches,
                                    feed_dict={dense_inputs_ph: activations_np})

    # TEST
    rtol = 1e-05
    atol = 1e-05
    if args.dtype == tf.float16:
        # Looser tolerances for half precision.
        rtol = 1e-04
        atol = 1e-02
    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"], dense_result["output_activation"],
                               atol=atol, rtol=rtol, err_msg="Output activations do not match.")

    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"], dense_result["input_grad"],
                               atol=atol, rtol=rtol, err_msg="Grads wrt. inputs do not match")

    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(sparse_grad_w, dense_grad, atol=atol, rtol=rtol,
                                   err_msg=f"Dense grads for layer {name} do not match")

        # Compare the sparse grads
        sparse_grad_padded = sparse_result[name + "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(
            sparse_layer.weights.spec, sparse_grad_data, sparse_layer.weights.matmul_options)

        # Convert dense grads to blocks: view the 2-D gradient as an
        # (nx, ny, block_size, block_size) array without copying.
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        nx, ny = dense_grad.shape[0] // block_size, dense_grad.shape[1] // block_size
        strides = np.array(dense_grad.strides)  # strides are in bytes
        strides = tuple(strides * block_size) + tuple(strides)
        blocked_dense_grad = np.lib.stride_tricks.as_strided(dense_grad, (nx, ny, block_size, block_size), strides)
        if block_size == 1:
            # Drop the two trailing block axes, which both have length one.
            blocked_dense_grad = np.squeeze(np.copy(blocked_dense_grad), axis=(-2, -1))
        np.testing.assert_allclose(sparse_grad, blocked_dense_grad[i, j], atol=atol, rtol=rtol,
                                   err_msg=f"Sparse grads for layer {name} do not match")

    print("All results match.")
    return sparse_result, dense_result
Example #28
0
def generic_graph(opts, is_training):
    """Build the graph, session and feeds for running the model on the IPU.

    Args:
        opts: Mapping of options. The keys read here are ``'seed'``,
            ``'use_synthetic_data'``, ``'batches_per_step'``,
            ``'use_ipu_model'`` and ``'replicas'``; it is also forwarded to
            the dataset, embedding and graph builder helpers.
        is_training: Forwarded to ``id_embedding``.

    Returns:
        Tuple ``(graph_ops, uid_embedding, mid_embedding, cat_embedding)``
        where ``graph_ops`` is a ``GraphOps`` holding the graph, session,
        initializer, compiled outputs, placeholders, infeed, outfeed dequeue
        op and saver.
    """
    master_dtype = get_tf_datatype(opts)
    graph = tf.Graph()

    with graph.as_default():
        placeholders = {
            "learning_rate": tf.placeholder(master_dtype, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, opts['seed'])

        dataset = (get_synthetic_dataset(opts)
                   if opts['use_synthetic_data']
                   else get_dataset_embed(opts, False))
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # One iteration of the on-device loop: run the model on a batch and
        # push its results to the outfeed.
        def body(uids, mids, cats, mid_his, cat_his, mid_mask, target, sl):
            prob, accuracy = graph_builder(opts,
                                           uid_embedding,
                                           mid_embedding,
                                           cat_embedding,
                                           placeholders['learning_rate'],
                                           uids,
                                           mids,
                                           cats,
                                           mid_his,
                                           cat_his,
                                           mid_mask,
                                           target,
                                           sl,
                                           use_negsampling=False)
            with tf.control_dependencies([prob]):
                return outfeed_queue.enqueue((prob, target, accuracy))

        def comp_fn():
            return loops.repeat(opts['batches_per_step'], body, [], infeed)

        with ipu_scope('/device:IPU:0'):
            outputs = ipu_compiler.compile(comp_fn, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.train.Saver()

        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    # Must be set before the IPU system is configured below.
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    optimizations = ipu_options.optimizations
    optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()

    sess = tf.Session(graph=graph)
    graph_ops = GraphOps(graph, sess, init, [outputs], placeholders, infeed,
                         outfeed, saver)

    return graph_ops, uid_embedding, mid_embedding, cat_embedding
Example #29
0
def run_inference(loop_op: tf.Operation,
                  infeed_queue_initializer: tf.Operation,
                  outfeed_op: tf.Operation,
                  batch_size: int,
                  batches_per_step: int,
                  network_name: str,
                  decode_predictions: Callable,
                  ground_truth: Tuple[str],
                  num_iterations: Optional[int] = 500,
                  num_ipus: Optional[int] = 1,
                  mode: Optional[str] = "single_ipu",
                  data: Optional[str] = "real",
                  available_memory_proportion: Optional[float] = 0.6) -> None:
    """Run inference on device and decode predictions.

    After the loop, throughput statistics are printed. The first 20
    iterations are excluded as warm-up when more than 20 iterations were run;
    with fewer iterations all measurements are used, and with none a short
    notice is printed instead of statistics.

    Args:
        loop_op: Inference op.
        infeed_queue_initializer: Initializer for the infeed queue.
        outfeed_op: Outfeed operator to extract results.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        network_name: Name of this network, used in the log and summary
            output and to decide whether index 0 is a background class.
        decode_predictions: Function to decode predictions with.
        ground_truth: Ground-truth labels.
        num_iterations: Number of iterations to run the inference, if running in a loop.
        num_ipus: Number of ipus to run the inference on.
        mode: Mode of inference - {"single_ipu", "replicated"}
        data: Run on real data transferred from host or on random synthetic data generated on device.
        available_memory_proportion: Proportion of tile memory available as temporary memory for
        matmul and convolution execution

    """
    # Number of leading measurements dropped from the final statistics.
    warmup_iterations = 20

    # Set compile and device options
    opts = IPUConfig()
    opts.matmuls.poplar_options = {
        'availableMemoryProportion': str(available_memory_proportion)
    }
    opts.convolutions.poplar_options = {
        'availableMemoryProportion': str(available_memory_proportion)
    }

    num_replicas = num_ipus if mode == 'replicated' else 1
    opts.auto_select_ipus = num_ipus
    opts.configure_ipu_system()

    fps = []
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        for iter_count in range(num_iterations):
            start = time.time()
            session.run(loop_op)
            predictions = session.run(outfeed_op)
            stop = time.time()
            duration = stop - start
            fps.append(batch_size * batches_per_step * num_replicas /
                       duration)
            logging.info(
                "Iter {4}: {0} Throughput using {1} data = {2:.1f} imgs/sec at batch size = {3}"
                .format(network_name, data, fps[-1], batch_size, iter_count))
            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += "   {:5f} images/sec.".format(fps[-1])
            print(report_string)
            print("Total time: {}".format(duration))

            # Decode a random prediction per step to check functional correctness.
            if data == 'real':
                predictions = np.reshape(predictions,
                                         (-1, predictions.shape[-1]))
                index = np.random.randint(0, len(predictions))
                if network_name in ("inceptionv1", "efficientnet-s",
                                    "efficientnet-m", "efficientnet-l"):
                    # These models encode background in 0th index.
                    decoded_predictions = decode_predictions(
                        predictions[index:index + 1, 1:], top=3)
                else:
                    decoded_predictions = decode_predictions(
                        predictions[index:index + 1, :], top=3)
                labels_and_probs = [
                    (label, prob) for _, label, prob in decoded_predictions[0]
                ]
                # Offset the sampled index by the number of images consumed
                # in earlier iterations, wrapping around the label list.
                print(
                    'Actual: ',
                    ground_truth[(index + num_replicas * iter_count *
                                  batches_per_step * batch_size) %
                                 len(ground_truth)])
                print('Predicted: ', labels_and_probs)

    if not fps:
        # Nothing was measured, so there is nothing to summarise (and
        # `predictions` was never assigned).
        print("No inference iterations were run; no throughput statistics to report.")
        return

    if len(fps) > warmup_iterations:
        print("Average statistics excluding the 1st 20 iterations.")
        fps = fps[warmup_iterations:]
    else:
        # Too few measurements to discard a warm-up period without ending up
        # with an empty list, which min()/max() would reject.
        print("Average statistics over all {} iterations (too few to exclude warm-up).".format(len(fps)))
    print(
        "-------------------------------------------------------------------------------------------"
    )
    print("Throughput at bs={}, data_mode={}, data_type={}, mode={},"
          " num_ipus={}, of {}: min={}, max={}, mean={}, std={}.".format(
              batch_size, data, predictions.dtype, mode, num_ipus,
              network_name, min(fps), max(fps), np.mean(fps), np.std(fps)))