Example 1
def make_and_run_on_device_benchmark(opts, train=True):
    name = "training" if train else "test"
    logging.info(f"Creating the {name} benchmark for running with a device")
    graph = tf.Graph()

    with graph.as_default():
        ds, num_ds, *_ = make_dataset(opts, use_synthetic_data=False, training=train)
        num_ds = num_ds // opts.batch_size
        infeed = ipu_infeed_queue.IPUInfeedQueue(ds)

        def empty_loop():
            def empty_body(data_infeed):
                return tf.no_op()
            return ipu.loops.repeat(opts.repeat_count, empty_body, [], infeed)

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            benchmark_op = ipu.ipu_compiler.compile(empty_loop, inputs=[])

    with tf.Session(graph=graph) as sess:
        # run a first un-monitored epoch to force compile
        sess.run(benchmark_op)
        times = []
        for _ in range(opts.epochs):
            progress = tqdm.tqdm(range(num_ds // opts.repeat_count))
            for _ in progress:
                t0 = time.perf_counter()
                sess.run(benchmark_op)
                t1 = time.perf_counter()
                times.append(t1 - t0)
        avg_time = np.mean(times)
        token_throughput = opts.source_sequence_length * opts.batch_size * opts.repeat_count / avg_time
        # 4 bytes per token: assumes 4-byte (e.g. float32/int32) elements
        bytes_throughput = token_throughput * 4 / (2**30)

    logging.info(f"On device throughput: {token_throughput:0.2f} tokens/s = {bytes_throughput:0.2f} GB/s")
Example 2
def training_graph(opts, training_data):
    train_graph = tf.Graph()

    with train_graph.as_default():

        dataset, train_iterator, placeholders = training_data.get_dataset(
            opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args, **kwargs):
                    data_tensors = args
                    observed_ratings = data_tensors[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(opts,
                                                                    observed_ratings=observed_ratings,
                                                                    learning_rate=placeholders["learning_rate"],
                                                                    type='TRAIN')
                    with tf.control_dependencies([apply_grads_]):
                        return total_loss_ + loss, sum_rmse_metric + rmse_metric

                return loops.repeat(opts.batches_per_step,
                                    body,
                                    [tf.constant(0, tf.float32),
                                     tf.constant(0, tf.float32)],
                                    infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar("RMSE/train", rmse)

        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(
        opts.logs_path + '/train',
        graph=train_graph,
        flush_secs=30)

    ipu_options = util.get_config(opts)
    ipu_options.configure_ipu_system()
    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph,
                    train_sess,
                    train_init,
                    [loss, train_summary, rmse],
                    placeholders,
                    infeed,
                    train_saver,
                    train_writer)
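# A hypothetical driver for the GraphOps returned above. The attribute names
# assume GraphOps is a namedtuple with fields in the order the values are
# passed (graph, session, init, ops, placeholders, infeed, saver, writer);
# total_steps and the learning rate are illustrative.
train = training_graph(opts, training_data)
train.session.run(train.init)
train.session.run(train.infeed.initializer)  # prime the infeed once
for step in range(total_steps):
    loss, summary, rmse = train.session.run(
        train.ops, feed_dict={train.placeholders["learning_rate"]: 0.001})
    train.writer.add_summary(summary, step)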
Example 3
    def _build_dataset(self):
        if not self.config['use_synthetic_data']:
            with open(self.config['dict_path'], 'r') as fp:
                for item in fp.readlines():
                    item = item.strip().split(' ')
                    self.char_dict[int(item[1])] = item[0]

        self.data_loader = Dataloader(
            self.config['data_path'],
            self.config['maxlen_in'],
            self.config['maxlen_tgt'],
            self.config['vocab_size'],
            self.config['fbank_size'],
            training=self.training,
            dtype=self.config['dtype'],
            use_synthetic_data=self.config['use_synthetic_data'])
        self.data_loader.load_data()

        output_types = (self.dtype, tf.int32, tf.int32, tf.int32)
        output_shapes = (tf.TensorShape([self.config['maxlen_in'], 83,
                                         1]), tf.TensorShape([]),
                         tf.TensorShape([self.config['maxlen_tgt']]),
                         tf.TensorShape([]))
        dataset = tf.data.Dataset.from_generator(self.data_loader,
                                                 output_types,
                                                 output_shapes=output_shapes)
        dataset = dataset.batch(self.config['batch_size'], drop_remainder=True)

        self.infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                            prefetch_depth=15)
Example 4
    def _build_dataset(self):
        self.start_id = start_id(self.output_vocab)
        self.end_id = end_id(self.output_vocab)
        data_file = ("./data/validation.csv"
                     if self.opts.infer else "./data/training.csv")
        data = Data(data_file, self.input_vocab, self.output_vocab)
        data.load()
        transform(data)
        vocab = (self.input_vocab, self.output_vocab)
        self.generator = DataGenerator(data, vocab, self.opts, self.start_id,
                                       self.end_id)
        items = next(self.generator)
        output_types = {i: tf.dtypes.as_dtype(items[i].dtype) for i in items}
        output_shapes = {i: tf.TensorShape(items[i].shape) for i in items}
        total_bytes = 0
        for i in items:
            total_bytes += items[i].nbytes
        dataset = tf.data.Dataset.from_generator(self.generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                       "InfeedQueue",
                                                       replication_factor=1)
        data_init = infeed_queue.initializer

        return dataset, infeed_queue, data_init, vocab
Example 5
    def testPipelineIterationsNotMultiple(self):
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)

        def dataset_parser(value):
            a = value
            b = (value + 10.) / 2.0
            return {"a": a, "b": b}

        dataset = dataset.map(dataset_parser)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                y = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            return pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                10,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with tu.ipu_session() as sess:

            with ops.device("/device:IPU:0"):
                r = ipu_compiler.compile(my_net, inputs=[c])

            cfg = utils.create_ipu_config(profiling=True,
                                          profile_execution=True)
            cfg = utils.auto_select_ipus(cfg, 4)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())
            sess.run(infeed_queue.initializer)
            with self.assertRaisesRegex(
                    errors.FailedPreconditionError,
                    'The pipeline depth of the pipeline must be a multiple of 3'
            ):
                sess.run(r, {c: 10.01})
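        # The precondition failure above is expected: the pipeline has three
        # stages, so its depth (the iteration count, 10 in my_net) must be a
        # multiple of 3. A passing variant is sketched below with the count
        # changed to 12:
        def my_net_fixed(c):
            return pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                12,  # 12 % 3 == 0, so the pipeline depth precondition holds
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)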
Example 6
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=opts['replicas'] * opts['shards'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])
    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
Example 7
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
  g = ops.Graph()

  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
            optimizer, num_batches_to_accumulate)
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    profiling = utils.running_on_ipu_model()

    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
    return session.run(outfeed_op)
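# A minimal invocation sketch; the dataset, forward function and optimizer
# below are illustrative placeholders, not taken from the original source.
# Assumed imports: dataset_ops, init_ops, math_ops, variable_scope and np as
# in the surrounding tests, plus gradient_descent from tensorflow.python.training.
def _dataset_fn():
  return dataset_ops.Dataset.from_tensors(np.ones([4], np.float32)).repeat()

def _fwd_fn(x):
  w = variable_scope.get_variable("w",
                                  shape=[],
                                  dtype=np.float32,
                                  initializer=init_ops.ones_initializer())
  return math_ops.reduce_sum(w * x)

losses = _gradient_accumulation_loop(
    test_wrapper=self,            # would be a tf.test.TestCase instance
    fwd_fn=_fwd_fn,
    inputs_fn=lambda: [],         # no extra placeholder inputs
    input_values=[],
    repeat_count=2,
    num_batches_to_accumulate=4,  # apply gradients once every 4 batches
    dataset_fn=_dataset_fn,
    optimizer=gradient_descent.GradientDescentOptimizer(0.1))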
Example 8
def validation_graph(opts, valid_data):
    # Do not apply dropout during validation
    opts.apply_dropout = False

    valid_graph = tf.Graph()
    tf_device_ordinal = 0 if opts.multiprocessing else 1
    with valid_graph.as_default():
        dataset, _, _ = valid_data.get_dataset(opts, is_training=False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, device_ordinal=tf_device_ordinal)

        with ipu_scope('/device:IPU:{}'.format(tf_device_ordinal)):
            def comp_fn():
                def body(sum_rmse_metric, *args, **kwargs):
                    data_tensors = args
                    observed_ratings, ground_truth = tf.split(
                        data_tensors[0], num_or_size_splits=2, axis=1)
                    rmse_metric = graph_builder(opts,
                                                observed_ratings=observed_ratings,
                                                ground_truth=ground_truth,
                                                type='VALID')
                    return sum_rmse_metric + rmse_metric

                return loops.repeat(opts.validation_batches_per_step,
                                    body,
                                    [tf.constant(0, tf.float32)],
                                    infeed)

            (sum_rmse_metric,) = ipu_compiler.compile(comp_fn, [])

        # Accuracy Ops
        rmse = sum_rmse_metric / opts.validation_batches_per_step

        valid_summary = tf.summary.scalar("RMSE/validation", rmse)
        valid_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_writer = tf.summary.FileWriter(
        opts.logs_path + '/valid',
        graph=valid_graph,
        flush_secs=30)

    ipu_options = util.get_config(opts)
    if opts.multiprocessing:
        ipu_options.configure_ipu_system()
    valid_sess = tf.Session(graph=valid_graph)

    return GraphOps(valid_graph,
                    valid_sess,
                    valid_init,
                    [rmse, valid_summary],
                    None,
                    infeed,
                    valid_saver,
                    valid_writer)
Example 9
    def testResetSeed(self):
        # The dataset for feeding the graphs
        ds = dataset_ops.Dataset.from_tensors(
            array_ops.constant(1.0, shape=[SIZE]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            ds, feed_name="infeed", replication_factor=REPLICAS)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=REPLICAS)

        # The device side
        def body(x1, x2):
            d1 = rand_ops.dropout(x1)
            d2 = rand_ops.dropout(x2)
            outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
            return outfeed

        def my_net():
            r = loops.repeat(REPEATS, body, [], infeed_queue)
            return r

        with scopes.ipu_scope('/device:IPU:0'):
            res = ipu_compiler.compile(my_net, inputs=[])

        # The outfeed dequeue has to happen after the outfeed enqueue
        dequeue_outfeed = outfeed_queue.dequeue()

        # Configure the hardware
        config = utils.create_ipu_config(profiling=True)
        config = utils.auto_select_ipus(config, REPLICAS)
        config = utils.set_floating_point_behaviour_options(config)
        utils.configure_ipu_system(config)

        with session.Session() as sess:
            res_all = set()
            total = 0

            sess.run(infeed_queue.initializer)

            for _ in range(EXECS):
                sess.run(res)
                outfed_result = sess.run(dequeue_outfeed)
                for r in np.array(list(outfed_result.values())).reshape(
                    [-1, SIZE]):
                    total += 1
                    res_all.add(r.tostring())

            # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
            expected = 2 * REPLICAS * REPEATS * EXECS
            self.assertEqual(total, expected)
            self.assertEqual(len(res_all), expected)
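            # The test above relies on module-level constants defined outside
            # the snippet; plausible stand-in values (assumptions, for
            # illustration only) would be:
            #   SIZE = 100     # elements per dropout input tensor
            #   REPLICAS = 2   # replication factor of both feeds
            #   REPEATS = 5    # on-device loop iterations per execution
            #   EXECS = 3      # number of sess.run(res) executions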
Example 10
def generic_train_graph(opts, is_training):
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy,
                         uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)

                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, np.float32)] * 3,
                                    infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [x / opts['batches_per_step'] for x in outputs_train]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(ipu_options,
                                                 combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess,
                    init,
                    ops_train,
                    placeholders,
                    infeed_train,
                    outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example 11
def training_graph(model, opts, iterations_per_step=1):

    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # opts["precision"] is e.g. "16.32"; the first field selects the dtype
        datatype = (tf.float16 if opts["precision"].split('.')[0] == '16'
                    else tf.float32)
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver)
Example 12
def train():
    graph = tf.Graph()
    with graph.as_default():
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])
        with ipu_scope('/device:IPU:0'):

            def compile_fn():
                def body(x, y):
                    # x and y from the infeed are unused here; the loop only
                    # exercises model2 with the host-fed time_steps_ph.
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                return loops.repeat(1, body, [], infeed_queue)

            outputs = ipu_compiler.compile(compile_fn, [])

        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        dequeue_outfeed = outfeed_queue.dequeue()
    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    for _ in range(steps):
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
Example 13
  def testSyntheticDataWithOutfeeds(self):
    poplar_flags = os.environ.get("TF_POPLAR_FLAGS", "")
    poplar_flags += " --use_ipu_model"
    poplar_flags += " --use_synthetic_data"
    poplar_flags += " --synthetic_data_initializer=random"

    with test.mock.patch.dict("os.environ", {"TF_POPLAR_FLAGS": poplar_flags}):

      # The device side main
      def body(x1, x2):
        d1 = x1 + x2
        d2 = x1 - x2
        outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
        return outfeed

      def my_net():
        r = loops.repeat(5, body, [], infeed_queue)
        return r

      with ops.device('cpu'):
        # The dataset for feeding the graphs
        ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[10]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed2")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed2")

      with scopes.ipu_scope('/device:IPU:0'):
        run_loop = ipu_compiler.compile(my_net, inputs=[])

      # The outfeed dequeue has to happen after the outfeed enqueue
      dequeue_outfeed = outfeed_queue.dequeue()

      # Configure the hardware
      config = utils.create_ipu_config()
      config = utils.auto_select_ipus(config, 1)
      utils.configure_ipu_system(config)

      with tf.Session() as sess:
        sess.run(infeed_queue.initializer)
        sess.run(run_loop)
        result = sess.run(dequeue_outfeed)
        self.assertAllEqual(len(result['d1']), 0)
Example 14
    def test_optimizer(self):
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        strategy = ipu_strategy.IPUStrategy()

        report = tu.ReportJSON(self, eager_mode=True, replicated=True)
        report.reset()

        with strategy.scope():
            initial_variable = 2.0
            variable = variables.Variable(initial_variable)
            learning_rate = 0.5
            num_iterations = 3

            data = [1.0, 2.0]
            dataset = dataset_ops.Dataset.from_tensor_slices((data))
            dataset = dataset.repeat(num_iterations)
            infeed = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                     feed_name="feed",
                                                     replication_factor=2)

            optimizer = keras.optimizer_v2.gradient_descent.SGD(learning_rate)

            @def_function.function(experimental_compile=True)
            def apply_gradient():
                gradient = infeed._dequeue()  # pylint: disable=protected-access
                optimizer.apply_gradients([(gradient, variable)])

            # The optimizers in v2 will sum the gradients, and not average them.
            expected_gradient = np.sum(data)
            expected_variable = initial_variable

            infeed.initializer  # pylint: disable=pointless-statement

            for _ in range(num_iterations):
                strategy.experimental_run_v2(apply_gradient)
                expected_variable -= learning_rate * expected_gradient
                self.assertEqual(expected_variable, variable.numpy())
Example 15
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver)
Example 16
def run(benchmark, opts):
    '''
    Run the benchmark.

    benchmark - An instance of Benchmark
    opts - Namespace from argparse generated from parse_opts
    '''
    with ipu_scope('/device:IPU:0'):
        # Build graph
        with tf.device('cpu'):
            dataset = tf.data.Dataset \
                .range((opts.steps + 2) * opts.batches_per_step) \
                .map(lambda i: benchmark.inputs(opts, i)) \
                .prefetch(opts.batches_per_step)

        if opts.batches_per_step > 1:
            with tf.device('cpu'):
                infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                    dataset, feed_name="benchmark_dataset_infeed")
                data_init = infeed_queue.initializer

            with tf.Graph().as_default():  # To get the shape and dtype
                dummy_opts = copy.deepcopy(opts)
                dummy_opts.shards = 1
                d = benchmark.inputs(dummy_opts, tf.constant(0))
                out = benchmark.graph_builder(dummy_opts, d)
            input = tf.constant(0, out.dtype, shape=out.shape)

            def body(inout, *args, **kwargs):
                with tf.control_dependencies([inout]):
                    # Run graph
                    with tf.variable_scope("MainGraph"):
                        out = benchmark.graph_builder(opts, kwargs)
                return out

            out = ipu_compiler.compile(
                lambda: loops.repeat(opts.batches_per_step, body, [input],
                                     infeed_queue), [])
        else:
            with tf.device('cpu'):
                data_tensors = dataset.make_one_shot_iterator().get_next()
                data_init = tf.no_op()
            out = ipu_compiler.compile(
                lambda: benchmark.graph_builder(opts, data_tensors), [])
            opts.batches_per_step = 1

    # Report
    report = gen_ipu_ops.ipu_event_trace()

    # Dump the graph to a logdir
    if opts.save_graph:
        writer = tf.summary.FileWriter(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs',
                         time.strftime('%Y%m%d_%H%M%S_%Z')))
        writer.add_graph(tf.get_default_graph())

    utils.configure_ipu_system(get_config(opts))
    with tf.Session() as sess:
        # Setup
        sess.run([benchmark.initializer(), data_init])
        sess.run(report)

        # Warmup
        print("Compiling and Warmup...")
        start = time.time()
        sess.run(out)
        duration = time.time() - start
        print("Duration: {:.3f} seconds\n".format(duration))

        # Cycle Report
        if opts.cycle_report:
            rep = sess.run(report)
            return extract_runtimes_from_report(
                rep, opts,
                display=True)  # Only run once if producing cycle report

        print("Executing...")
        average_batches_per_sec = 0
        # steps
        for i in range(opts.steps):
            # Run
            start = time.time()
            sess.run(out)
            duration = time.time() - start

            average_batches_per_sec += (opts.batches_per_step /
                                        duration) / opts.steps
            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += "   " + benchmark.iteration_report(opts, duration)
            print(report_string)

        return average_batches_per_sec
Example 17
def training_graph(opts, training_data, device_index=0, learning_rate=0.001):
    train_graph = tf.Graph()

    with train_graph.as_default():

        dataset, _, placeholders = training_data.get_dataset(opts,
                                                             is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "training_dataset_infeed{0}".format(device_index), 0)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args):
                    data_tensors = args
                    observed_ratings = data_tensors[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        learning_rate=placeholders["learning_rate"])
                    with tf.control_dependencies([apply_grads_]):
                        return total_loss_ + loss, sum_rmse_metric + rmse_metric

                return loops.repeat(
                    opts.batches_per_step, body,
                    [tf.constant(0, tf.float32),
                     tf.constant(0, tf.float32)], infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("RMSE/train", rmse)

        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(opts.logs_path +
                                         '/train{0}'.format(device_index),
                                         graph=train_graph,
                                         flush_secs=30)

    ipu_options = ipu_utils.create_ipu_config(profiling=False)
    ipu_options = ipu_utils.set_floating_point_behaviour_options(
        ipu_options,
        inv=opts.fp_exceptions,
        div0=opts.fp_exceptions,
        oflo=opts.fp_exceptions,
        esr=opts.prng,
        nanoo=True)
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, 1)
    ipu_utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init,
                    [loss, train_summary, rmse], placeholders, infeed,
                    train_saver, train_writer)
Example 18
def create_poplar_exec(model, opts, poplar_exec_path):
    """Create graph and save it to the file."""
    valid_graph = tf.Graph()

    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        if opts['generated_data']:
            # create dummy dataset with images only
            dummy_image = np.zeros((opts['micro_batch_size'],
                                    opts['image_size'], opts['image_size'], 3),
                                   dtype=np.uint8)
            inference_dataset = tf.data.Dataset.from_tensors(
                {"image": dummy_image})
        else:
            # create dataset with images and labels
            inference_dataset = dataset.data(opts, is_training=False)
        inference_dataset = inference_dataset.map(lambda x: {'data_dict': x})

        inference_infeed_iterator = \
            ipu_infeed_queue.IPUInfeedQueue(inference_dataset,
                                            prefetch_depth=opts['prefetch_depth'])

        acc_queue = ipu_outfeed_queue.IPUOutfeedQueue()
        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    accuracy_enqueue = acc_queue.enqueue(accuracy)
                    return accuracy_enqueue

                accuracy = loops.repeat(
                    int(opts['validation_batches_per_step']), body, [],
                    inference_infeed_iterator)
                return accuracy

        filenames, _ = get_ckpt_filenames(opts)

        accuracy = application_compile_op.experimental_application_compile_op(
            comp_fn, output_path=poplar_exec_path, freeze_variables=True)

        outfeed = acc_queue.dequeue()
        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()

    with tf.Session(graph=valid_graph, config=tf.ConfigProto()) as sess:
        if len(filenames) == 1:
            print("Restoring from a snapshot: ", filenames[0])
            sess.run(inference_infeed_iterator.initializer)
            init = tf.global_variables_initializer()
            sess.run(init)
            valid_saver.restore(sess, filenames[0])
        else:
            print(
                "Warning: no restore point found - randomly initialising weights instead"
            )
            init = tf.global_variables_initializer()
            sess.run(init)

        path = sess.run(accuracy)
        print(f"Poplar executable: {path}")

    valid_graph.finalize()
Example 19
def run_model(opts):
    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = 16
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    epochs = 5
    ipu_steps_per_epoch = 15
    batches_per_epoch = num_train // batch_size
    train_batches = (num_train * epochs) // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // ipu_steps_per_epoch
    if batches_per_epoch % ipu_steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {ipu_steps_per_epoch} must divide "
            f"batches per epoch {batches_per_epoch}.")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32, shape=data_shape, name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.cache().repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(model, lr_placeholder, outfeed_train_queue, True)
    bound_train_loop = partial(
        loop_builder, batches_per_step, bound_train_model, infeed_train_queue)
    bound_test_model = partial(model, lr_placeholder, outfeed_test_queue, False)
    bound_test_loop = partial(loop_builder, test_batches,
                              bound_test_model, infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(
            tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_train_outfeed = outfeed_train_queue.dequeue()
    dequeue_test_outfeed = outfeed_test_queue.dequeue()

    # Create a benchmark program for the infeed to determine maximum achievable throughput:
    infeed_perf = dataset_benchmark.infeed_benchmark(
        infeed_train_queue, epochs, num_train, True)

    print(f"\nImage shape: {image_shape} Training examples: {num_train} Test examples: {num_test}")
    print(f"Epochs: {epochs} Batch-size: {batch_size} Steps-per-epoch: {ipu_steps_per_epoch} Batches-per-step: {batches_per_step}")

    # Run the model:
    with tf.Session() as sess:
        print(f"Benchmarking the infeed...")
        sess.run(infeed_perf, feed_dict={place_x: x_train_flat, place_y: y_train})

        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer, feed_dict={
                 place_x: x_train_flat, place_y: y_train})

        if opts.test_mode in ["all", "training"]:
            print(f"Training...")
            progress = tqdm(
                range(epochs), bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:

                sess.run(metrics_initializer)
                for i in range(ipu_steps_per_epoch):
                    sess.run(train_loop, feed_dict={lr_placeholder: scheduler(e)})
                    result = sess.run(dequeue_train_outfeed)
                    if len(result['mean_loss']) != 0 and len(result['acc']) != 0:
                        progress.set_description(f"Loss {result['mean_loss'][0]:.5f} Accuracy {result['acc'][0]:.5f}")

            print(f"Saving...")
            saver.save(sess, "model")

        if opts.test_mode in ["all", "tests"]:
            print(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer, feed_dict={
                     place_x: x_test_flat, place_y: y_test})
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = np.mean(result['mean_loss'])
            test_acc = np.mean(result['acc'])
            print(f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
Example 20
from tensorflow.python.ipu.ops import normalization_ops

BATCH_SIZE = 50
# Generate random stand-in training data

train_images = np.random.normal(0, 1, (60000, 4))

print(np.shape(train_images))

train_images = train_images.reshape(train_images.shape[0], 1,
                                    4).astype("float32")

train_dataset = (tf.data.Dataset.from_tensor_slices(train_images).batch(
    BATCH_SIZE, drop_remainder=True).repeat(10))

infeed_GAN = ipu_infeed_queue.IPUInfeedQueue(train_dataset, feed_name="in_GAN")

outfeed_FULL = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="out_FULL")

outfeed_test = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="out_test")

with tf.device("cpu"):
    numPoints = tf.placeholder(np.int32, shape=(), name="numPoints")

from tensorflow.keras.layers import (
    Input,
    Flatten,
    Dense,
    Reshape,
    Dropout,
    LeakyReLU,
Example 21
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.getLevelName('INFO'),
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # Parse options
    opts = parse_args()

    if not opts.on_device_only:
        logger.info("Creating training dataset, infeed queue and benchmark.")
        # Create training dataset and infeed queue
        train_set, num_train, *_ = make_dataset(opts, use_synthetic_data=False, training=True)
        num_train = num_train // opts.batch_size

        infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_set)
        # Benchmark it
        infeed_perf_train = dataset_benchmark.infeed_benchmark(
            infeed_queue=infeed_train_queue,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_train,
            print_stats=False)
        ds_perf_train = dataset_benchmark.dataset_benchmark(
            dataset=train_set,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_train,
            print_stats=False,
            apply_options=True)

        logger.info("Creating test dataset, infeed queue and benchmark.")
        # Create test dataset
Example 22
def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                                 shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=(opts['replicas']))

        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue(
                        (prob, target, accuracy))
                    return outfeed_op

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]

    sess = tf.compat.v1.Session(graph=infer_graph)

    return GraphOps(sess, init, ops_val, placeholders, infeed_val, outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example 23
from tensorflow.python.ipu import ipu_compiler
from tensorflow.python.ipu import ipu_infeed_queue
from tensorflow.python.ipu import ipu_outfeed_queue
from tensorflow.python.ipu import loops
from tensorflow.python.ipu import scopes
from tensorflow.python.ipu import utils
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# The dataset for feeding the graphs
ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[800]))
ds = ds.map(lambda x: [x, x])
ds = ds.repeat()

# The host side queues
infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed")
outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed")


# The device side main
def body(x1, x2):
  d1 = x1 + x2
  d2 = x1 - x2
  outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
  return outfeed


def my_net():
  r = loops.repeat(10, body, [], infeed_queue)
  return r
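# The snippet stops after defining my_net; a hedged completion, mirroring the
# compile / dequeue / configure / run pattern used in Example 13:
with scopes.ipu_scope('/device:IPU:0'):
  run_loop = ipu_compiler.compile(my_net, inputs=[])

# The outfeed dequeue has to happen after the outfeed enqueue
dequeue_outfeed = outfeed_queue.dequeue()

# Configure the hardware
config = utils.create_ipu_config()
config = utils.auto_select_ipus(config, 1)
utils.configure_ipu_system(config)

with tf.Session() as sess:
  sess.run(infeed_queue.initializer)
  sess.run(run_loop)
  result = sess.run(dequeue_outfeed)
  print(result['d1'])  # one row of shape [800] per loop iteration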
Example 24
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training."""

    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        items = next(data_generator)
        output_types = tuple((tf.dtypes.as_dtype(i.dtype) for i in items))
        output_shapes = tuple((tf.TensorShape(i.shape) for i in items))
        total_bytes = 0
        for i in items:
            total_bytes += i.nbytes
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))
    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time
    total_time = 0.0
    total_samples = 0
    # Initially the infeed may buffer extra input data, and the first IPU run
    # includes the XLA compile, so skip these iterations when computing items/sec.
    skip_iterations = 5
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # extract the report
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step * replication_factor)
            print("Average %.1f items/sec" % (total_samples / total_time))
Example 25
def run_mnist(opts):
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    batches_per_epoch = num_train // batch_size
    train_batches = (num_train * opts.epochs) // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // opts.steps_per_epoch
    if batches_per_epoch % opts.steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32,
                                 shape=data_shape,
                                 name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])
        for fc in fc_layers.values():
            fc.create_placeholders(tf.float32)

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.shuffle(buffer_size=num_train, seed=opts.seed).cache()
    dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed_last_itr")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(model, fc_layers, opts.droprate,
                                lr_placeholder, batches_per_step, True,
                                outfeed_train_queue)
    bound_train_loop = partial(loop_builder, batches_per_step,
                               bound_train_model, infeed_train_queue)
    bound_test_model = partial(model, fc_layers, opts.droprate, lr_placeholder,
                               batches_per_step, False, outfeed_test_queue)
    bound_test_loop = partial(loop_builder, test_batches, bound_test_model,
                              infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    # Merge the feeds needed for all layers:
    sparse_feed = {}
    for fc in fc_layers.values():
        sparse_feed.update(fc.feed_dict())

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    logpath = os.path.join(opts.checkpoint_path,
                           datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer,
                 feed_dict={
                     place_x: x_train_flat,
                     place_y: y_train
                 })
        # Must initialise the sparse layers separately:
        sess.run(update_representation, feed_dict=sparse_feed)

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            progress = tqdm(
                range(opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)
                    # Only need to feed an updated sparsity representation if we are running RigL:
                    if not opts.disable_pruning:
                        sess.run(update_representation, feed_dict=sparse_feed)
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    last = sess.run(dequeue_train_outfeed)

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(last[name+'_non_zeros'][0])}"
                            )
                            logger.info(
                                f"Average momentum for layer {name} : {np.mean(last[name+'_momentum'][0])}"
                            )
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_and_grow(name, fc, last, random_gen,
                                               steps, total_steps, opts)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {last['acc'][0]}\n")
                    progress.set_description(
                        f"Loss {last['mean_loss'][0]:.5f} Accuracy {last['acc'][0]:.5f}"
                    )

            logger.info(f"Saving...")
            saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            test_feed = {}
            for fc in fc_layers.values():
                test_feed.update(fc.feed_dict())

            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer,
                     feed_dict={
                         place_x: x_test_flat,
                         place_y: y_test
                     })
            sess.run(test_loop, feed_dict=test_feed)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
Example n. 26
def run_mnist(opts):
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)

    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)

    x_train_flat = x_train_flat[:, permutation]
    x_test_flat = x_test_flat[:, permutation]

    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size *
                                      opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)

    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch

    if batches_per_epoch % opts.steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )
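    # Illustrative: with 60k training examples, opts.batch_size=32 and
    # gradient_accumulation_count=4, the micro-batch size is 8 and
    # batches_per_epoch = 60000 // (8 * 4) = 1875 accumulated batches.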

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))

        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()

        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)

    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
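    # Floating-point exception traps (inv/div0/oflo) and NaN-on-overflow stay
    # off; stochastic rounding (esr) is enabled, which fp16 training
    # generally benefits from.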
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    if opts.restore:
        logpath = os.path.join(opts.checkpoint_path, opts.restore)
    else:
        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer)

        if opts.restore:
            saver.restore(sess, logpath + '/model.ckpt')

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            start = opts.start_epoch if opts.restore else 0
            progress = tqdm(
                range(start, opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    t1 = time.perf_counter()
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    t2 = time.perf_counter()
                    sess_time = t2 - t1
                    batch_time = sess_time / batches_per_step
                    throughput = batch_size / batch_time
                    logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                f"Time per batch: {batch_time:0.6f} "
                                f"Throughput: {throughput}")

                    if opts.single_train_step_only:
                        return

                    train_outputs = sess.run(dequeue_train_outfeed)
                    if opts.pipelining:
                        train_outputs = train_outputs[-1]

                    # Get the last value for all items:
                    for k, v in train_outputs.items():
                        train_outputs[k] = v[-1]
                    logger.debug(f"Train outputs: {train_outputs.keys()}")

                    # Merge prune and grow fetches with last fetches:
                    if dequeue_prune_and_grow_outfeed is not None:
                        png_data = sess.run(dequeue_prune_and_grow_outfeed)
                        for k in png_data:
                            png_data[k] = png_data[k][-1]
                        logger.debug(
                            f"Prune and grow outputs: {png_data.keys()}")

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            var_name = fc.get_values_var().name
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                            )
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                )
                            if i == 0 and e == opts.start_epoch:
                                metainfo = sess.run(fc.get_metainfo_var())
                            else:
                                metainfo = None
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_sched = prune_and_grow(name,
                                                             fc,
                                                             png_data,
                                                             random_gen,
                                                             steps,
                                                             total_steps,
                                                             opts,
                                                             metainfo=metainfo)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )
                                logger.info(
                                    f"Pruned proportion: {prune_sched}")
                                if opts.use_wandb:
                                    wandb.log({'Prune Schedule': prune_sched},
                                              commit=False)

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {train_outputs['acc']}\n")
                    if opts.use_wandb:
                        wandb.log(
                            {
                                'Loss': train_outputs['mean_loss'],
                                'Accuracy': train_outputs['acc'],
                                'Throughput': throughput
                            },
                            commit=True)
                    progress.set_description(
                        f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                    )

                    # Only need to feed an updated sparsity representation if we are running RigL:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                if e % opts.checkpoint_freq == 0:
                    logger.info(f"Saving...")
                    saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer)
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
            )
            if opts.use_wandb:
                wandb.run.summary["Test Loss"] = test_loss
                wandb.run.summary["Test Accuracy"] = test_acc
Example n. 27
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # Precision strings are assumed to look like "16.16"; the first
        # field selects the compute dtype.
        datatype = tf.float16 if opts["precision"].split('.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, profile_report)
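A driver for the returned GraphOps might look like the following sketch. The field names here are hypothetical, assuming GraphOps is a namedtuple whose fields follow the positional order of the return statement above:

train = training_graph(model, opts, iterations_per_step=100)
train.session.run(train.init)
train.session.run(train.iterator.initializer)
for _ in range(num_steps):  # num_steps chosen by the caller
    train.session.run(train.ops,
                      feed_dict={train.placeholders['learning_rate']: 0.01})
    results = train.session.run(train.outfeed)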
Example n. 28
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only need to do 1/N batches in each stream.
    # For this reason, batches_per_step must be a minimum of N.
    batches_per_step = batches_per_step // opts.replication_factor
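    # e.g. with replication_factor=4 and batches_per_step=1000, each of the
    # four replica streams runs 250 loop iterations per step.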

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        kwargs = {} if opts.replication_factor == 1 else {'replication_factor': opts.replication_factor}
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset, f"{mode_name}_dataset_infeed", **kwargs)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(opts,
                                                        observed=batch[:, :-1],
                                                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                                                        learning_rate=placeholders['learning_rate'] if training else None,
                                                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse
                return loops.repeat(batches_per_step,
                                    body,
                                    [tf.constant(0, getattr(np, opts.dtypes[0]))]*2,
                                    infeed)
            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None
        if opts.compiler_report:
            if training:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

    writer = tf.summary.FileWriter(
        opts.logs_path + f'/{mode_name}',
        graph=graph,
        flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    if training or opts.multiprocessing:
        config = ipu_utils.create_ipu_config(profiling=training,
                                             use_poplar_text_report=True,
                                             max_cross_replica_sum_buffer_size=10000000,
                                             max_inter_ipu_copies_buffer_size=10000000)
        if opts.select_ipus == 'AUTO':
            config = ipu_utils.auto_select_ipus(config, [opts.replication_factor])
        else:
            config = ipu_utils.select_ipus(config, [opts.select_ipus[not training]])
        config = ipu_utils.set_compilation_options(config, {"prng.enable": str(opts.prng).lower()})
        ipu_utils.configure_ipu_system(config)

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph,
                    sess,
                    init,
                    graph_outputs,
                    placeholders if training else None,
                    infeed,
                    saver,
                    writer,
                    report,
                    trainFlag)
Example n. 29
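# This snippet assumes `vocab`, `text` and `char2idx` (a character-to-index
# mapping) have been built upstream from the corpus being modelled.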
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text]).astype(np.int32)

sequence_length = 100
batch_size = 16
replication_factor = 2

#  Create training examples / targets
ds = tf.data.Dataset.from_tensor_slices(text_as_int)
ds = ds.batch(sequence_length, drop_remainder=True)
ds = ds.shuffle(batch_size * batch_size)
ds = ds.batch(batch_size, drop_remainder=True)
ds = ds.repeat()

# The host side queues
infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
    ds, feed_name="infeed", replication_factor=replication_factor)

# Set the learning rate
lr = 0.0001

# Create a momentum optimiser for replication
optimizer = cross_replica_optimizer.CrossReplicaOptimizer(
    tf.train.MomentumOptimizer(lr, 0.99))

# Create a host embedding object
embedding = embedding_ops.create_host_embedding(
    "char_embedding",
    shape=[256, 256],
    dtype=tf.float32,
    partition_strategy="TOKEN",
    optimizer_spec=embedding_ops.HostEmbeddingOptimizerSpec(lr))
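To complete the picture, a host embedding like this is typically looked up inside the IPU loop body and registered with the session while the loop runs. A minimal sketch under those assumptions, using the lookup/register interface of the object returned by create_host_embedding and the same loops/scopes/ipu_compiler helpers as the other examples; it also assumes the IPU system has been configured for two IPUs to match replication_factor, and the dense layer is a stand-in so the optimiser has variables to update:

def body(sequence):
    # The lookup executes against the host-side embedding table.
    activations = embedding.lookup(sequence)
    logits = tf.layers.dense(activations, 64)  # stand-in model
    loss = tf.reduce_mean(logits)              # stand-in loss
    return optimizer.minimize(loss)

def loop():
    return loops.repeat(100, body, [], infeed_queue)

with scopes.ipu_scope("/device:IPU:0"):
    run_loop = ipu_compiler.compile(loop, inputs=[])

with tf.Session() as sess:
    sess.run(infeed_queue.initializer)
    sess.run(tf.global_variables_initializer())
    with embedding.register(sess):
        sess.run(run_loop)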
Example n. 30
def construct_graph(
        network_class: Type[InferenceNetwork], config: Path,
        checkpoint_dir: str, batch_size: int, batches_per_step: int,
        image_filenames: Tuple[str, ...], loop: bool, preprocess_fn: Callable,
        num_ipus: int, mode: str, save_graph_pb: bool
) -> Tuple[tf.Operation, tf.Operation, tf.Operation]:
    """Create inference graph on the device, set up in-feeds and out-feeds, connect dataset iterator to the graph.

    This function also exports the frozen graph into an event file, to be viewed in Tensorboard in the
    `network_name_graph` directory.

    Args:
        network_class: Class corresponding to chosen model.
        config: Path to config file.
        checkpoint_dir: Checkpoint location.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        image_filenames: Collection of paths to images.
        loop: Run inference in a loop.
        preprocess_fn: Pre-process function to apply to the image before feeding into the graph.
        num_ipus: Number of ipus.
        mode: Inference mode.
        save_graph_pb: If true, export frozen graph to event file to view in Tensorboard.

    Returns: Compiled loop operator to run repeated inference over the dataset, infeed_queue initializer, outfeed op.

    """
    # Model specific config
    with open(config.as_posix()) as file_stream:
        try:
            config_dict = yaml.safe_load(file_stream)
        except yaml.YAMLError as exc:
            tf.logging.error(exc)
            raise

    config_dict['network_name'] = config.stem
    if 'dtype' not in config_dict:
        config_dict["dtype"] = 'float16'

    # Create inference optimized frozen graph definition
    network = network_class(input_shape=config_dict["input_shape"],
                            num_outputs=1000,
                            batch_size=batch_size,
                            data_type=config_dict['dtype'],
                            config=config_dict,
                            checkpoint_dir=checkpoint_dir)

    # Export frozen graph to event file to view in Tensorboard
    if save_graph_pb:
        log_dir = Path(f"{config_dict['network_name']}_graph")
        graph_filename = f"{log_dir}/{config_dict['network_name']}_graph.pb"
        if not log_dir.exists():
            log_dir.mkdir()
        with tf.io.gfile.GFile(graph_filename, "wb") as f:
            f.write(network.optimized_graph.SerializeToString())
        logging.info("%d ops in the final graph." %
                     len(network.optimized_graph.node))
        import_to_tensorboard(graph_filename, log_dir=log_dir.as_posix())

    # Reset graph before creating one on the IPU
    tf.reset_default_graph()

    # Create dataset
    dataset = get_dataset(image_filenames,
                          batch_size,
                          loop=loop,
                          preprocess_fn=preprocess_fn,
                          img_width=config_dict["input_shape"][1],
                          img_height=config_dict["input_shape"][0],
                          dtype=config_dict['dtype'])

    # Set up graph on device, connect infeed and outfeed to the graph.
    num_replicas = num_ipus if mode == 'replicated' else 1
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset,
        device_ordinal=0,
        feed_name="infeed",
        replication_factor=num_replicas)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        device_ordinal=0,
        feed_name="outfeed",
        outfeed_all=True,
        replication_factor=num_replicas)

    def comp_fn():
        def body(img):
            with scopes.ipu_scope('/device:IPU:0'):
                if mode == 'sharded':
                    with autoshard.ipu_autoshard():
                        probs = tf.import_graph_def(
                            network.optimized_graph,
                            input_map={network.graph_input: img},
                            name="optimized",
                            return_elements=[network.graph_output])[0]
                    autoshard.automatic_sharding(num_shards=num_ipus,
                                                 input_ts=img,
                                                 loss_ts=probs,
                                                 frozen_inference=True)
                    outfeed_op = outfeed_queue.enqueue(probs)
                    outfeed_op._set_attr(
                        sharding._XLA_SHARDING,
                        attr_value_pb2.AttrValue(
                            s=probs.op.get_attr('_XlaSharding')))
                else:
                    probs = tf.import_graph_def(
                        network.optimized_graph,
                        input_map={network.graph_input: img},
                        name="optimized",
                        return_elements=[network.graph_output])[0]
                    outfeed_op = outfeed_queue.enqueue(probs)
                # Note that enqueue happens on the IPU.
                return outfeed_op

        return loops.repeat(batches_per_step, body, [], infeed_queue)

    loop_op = ipu_compiler.compile(comp_fn, [])

    # The dequeue of the outfeed needs to happen on the CPU.
    with tf.device('cpu'):
        outfeed_dequeue = outfeed_queue.dequeue()

    ipu_utils.move_variable_initialization_to_cpu()
    return loop_op, infeed_queue.initializer, outfeed_dequeue
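The returned triple is then driven from a plain session loop, along these lines (an illustrative sketch; the arguments are placeholders and the IPU system is assumed to be configured elsewhere, as in the earlier examples):

loop_op, infeed_initializer, outfeed_dequeue = construct_graph(
    network_class, config, checkpoint_dir, batch_size, batches_per_step,
    image_filenames, loop, preprocess_fn, num_ipus, mode, save_graph_pb)

with tf.Session() as sess:
    sess.run(infeed_initializer)
    sess.run(loop_op)                   # batches_per_step forward passes
    probs = sess.run(outfeed_dequeue)   # fetch the results on the CPU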