Example #1
 def __init__(self,
              name,
              outfeed_mode=None,
              replication_factor=1,
              filters=None):
     """ Construct a MaybeOutfeedQueue.
     Args:
         name: The name to use for the wrapped IPUOutfeedQueue.
         outfeed_mode: The outfeed_mode for the wrapped IPUOutfeedQueue.
         replication_factor: The replication_factor for the wrapped IPUOutfeedQueue.
        filters: Optional list of strings. If not None, a key,value pair is only
            added to the dictionary that will be enqueued if its key contains at
            least one of these strings.
     """
     self._queue = ipu_outfeed_queue.IPUOutfeedQueue(
         name,
         replication_factor=replication_factor,
         outfeed_mode=outfeed_mode)
     self._vals = {}
     self.enqueued = False
     if filters is not None:
         self._filters = []
         self._filters.extend(filters)
     else:
         self._filters = None
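A minimal sketch of how the filters might be applied; the rest of the class is not shown in this example, and the method names maybe_outfeed and maybe_enqueue below are hypothetical, not part of the original API.
 def maybe_outfeed(self, key, value):
     # Hypothetical helper: keep the pair only when no filters are set
     # or the key contains one of the filter strings.
     if self._filters is None or any(f in key for f in self._filters):
         self._vals[key] = value

 def maybe_enqueue(self):
     # Hypothetical helper: enqueue the accumulated dictionary on the
     # wrapped IPUOutfeedQueue, if anything survived the filters.
     if self._vals:
         self.enqueued = True
         return self._queue.enqueue(self._vals)
     return None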
Example #2
def make_graph(fc_weights):
    graph = tf.Graph()

    with graph.as_default():
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="sparse_outfeed")
        fc, weights["init_weights"] = create_sparse_layers(opts, fc_weights)

        model_op = partial(model,
                           fc=fc,
                           opts=opts,
                           outfeed_queue=outfeed_queue,
                           dtype=dtype)

        with tf.device("cpu"):
            x_fc = tf.placeholder(dtype,
                                  shape=[opts.batchsize, opts.input_size])

        with ipu_scope('/device:IPU:0'):
            test_op = ipu_compiler.compile(model_op, inputs=[x_fc])

        with tf.device("cpu"):
            fc.create_placeholders()

        with ipu_scope('/device:IPU:0'):
            upload_sparse = build_update_op(fc)

        sparse_feed = {}
        sparse_feed.update(fc.feed_dict())

        dequeue = outfeed_queue.dequeue()
        ipu.utils.move_variable_initialization_to_cpu()

    return graph, outfeed_queue, fc, x_fc, test_op, upload_sparse, dequeue
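A hedged usage sketch for the tuple returned by make_graph; fc_weights and input_batch here are assumptions for illustration, as is running the sparse upload before the compiled test op.
graph, outfeed_queue, fc, x_fc, test_op, upload_sparse, dequeue = \
    make_graph(fc_weights)
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    # Push the sparse weight representation to the device first:
    sess.run(upload_sparse, feed_dict=fc.feed_dict())
    sess.run(test_op, feed_dict={x_fc: input_batch})
    # Retrieve everything the model enqueued on the outfeed:
    results = sess.run(dequeue)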
Example #3
  def testIllegalCapture(self):
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed8")

    with ops.device('cpu'):
      y = array_ops.placeholder(np.float32, shape=[])

    def stage1(x):
      return x * y

    def stage2(x):
      return x

    def model_pipeline(x):
      return pipelining_ops.pipeline(
          [stage1, stage2],
          10,
          inputs=[x],
          outfeed_queue=outfeed_queue,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      y = array_ops.placeholder(np.float32, shape=[])

    with ops.device("/device:IPU:0"):
      with self.assertRaisesRegex(ValueError, 'Trying to capture the tensor'):
        ipu_compiler.compile(model_pipeline, inputs=[x])
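The assertion above fires because stage1 closes over the placeholder y rather than receiving it as a pipeline argument. A sketch of one way to avoid the capture, with hypothetical *_ok names, is to pass y through inputs:
    def stage1_ok(x, y):
      return x * y  # y now arrives as a pipeline input, not a capture

    def stage2_ok(x):
      return x

    def model_pipeline_ok(x, y):
      return pipelining_ops.pipeline(
          [stage1_ok, stage2_ok],
          10,
          inputs=[x, y],
          outfeed_queue=outfeed_queue,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)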
Example #4
def make_graph(fc_weights):
    graph = tf.Graph()

    with graph.as_default():
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()
        fc, fc_pool = create_sparse_layers(opts)

        model_op = partial(model,
                           fc=fc,
                           fc_pool=fc_pool,
                           opts=opts,
                           outfeed_queue=outfeed_queue,
                           dtype=dtype)

        with tf.device("cpu"):
            x_fc = tf.placeholder(dtype,
                                  shape=[opts.batchsize, opts.input_size])

        with ipu_scope('/device:IPU:0'):
            test_op = ipu_compiler.compile(model_op, inputs=[x_fc])

        with tf.device("cpu"):
            fc.create_placeholders()
            fc_pool.create_placeholders()

        dequeue = outfeed_queue.dequeue()
        ipu.utils.move_variable_initialization_to_cpu()

    return graph, outfeed_queue, fc, fc_pool, x_fc, test_op, dequeue
Example #5
    def testPipelineIterationsNotMultiple(self):
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)

        def dataset_parser(value):
            a = value
            b = (value + 10.) / 2.0
            return {"a": a, "b": b}

        dataset = dataset.map(dataset_parser)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                y = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            return pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                10,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with tu.ipu_session() as sess:

            with ops.device("/device:IPU:0"):
                r = ipu_compiler.compile(my_net, inputs=[c])

            cfg = utils.create_ipu_config(profiling=True,
                                          profile_execution=True)
            cfg = utils.auto_select_ipus(cfg, 4)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())
            sess.run(infeed_queue.initializer)
            with self.assertRaisesRegex(
                    errors.FailedPreconditionError,
                    'The pipeline depth of the pipeline must be a multiple of 3'
            ):
                sess.run(r, {c: 10.01})
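The FailedPreconditionError is the point of this test: a pipeline depth of 10 is not a multiple of the 3 stages. A sketch of the passing variant, changing only the depth, would be:
        def my_net_fixed(c):
            return pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                12,  # a multiple of the number of stages, unlike 10
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)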
Example #6
  def testResnetLike(self):
    # Check that we get all classifications for a small resnet correct

    def stage1(img, label):
      with variable_scope.variable_scope("stage1", use_resource=True):
        x = conv(img, 7, 2, 16)
        x = nn.relu(x)
        x = max_pool(x, ksize=3, stride=2)
        return x, label

    def stage2(x, label):
      with variable_scope.variable_scope("stage2", use_resource=True):
        x = block("b", 2, 64, 1, x)
        return x, label

    def stage3(x, label):
      with variable_scope.variable_scope("stage3", use_resource=True):
        x = math_ops.reduce_mean(x, axis=[1, 2])
        x = fc(x, 100)
        loss = math_ops.reduce_mean(
            nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                        labels=label))
        return loss

    def optimizer_function(loss):
      opt = gradient_descent.GradientDescentOptimizer(0.01)
      return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # Run the pipeline twice.
    def model_pipeline(x, lr):
      return pipelining_ops.pipeline([stage1, stage2, stage3],
                                     12,
                                     inputs=[x, lr],
                                     outfeed_queue=outfeed_queue,
                                     optimizer_function=optimizer_function)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      l = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:

      with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                       inputs=[x, l])

      tu.move_variable_initialization_to_cpu()
      outfeed_queue.dequeue()

      report = tu.ReportJSON(self, sess, pipelining=True)
      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(compiled_model_pipeline, {x: np.ones(x.shape), l: [1]})
      report.parse_log()

      # 1 conv in stage1, 2 conv in stage2, 1 matmul in stage3 = 4
      self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
Example #7
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
  g = ops.Graph()

  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
            optimizer, num_batches_to_accumulate)
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    profiling = utils.running_on_ipu_model()

    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
    return session.run(outfeed_op)
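A hedged usage sketch of the helper above, assumed to be called from inside a tf.test.TestCase method; the forward function, dataset and values are illustrative. Note that fwd_fn receives the compiled inputs first, followed by the infeed elements that loops.repeat appends.
def _example_fwd_fn(lr, x):
  # Hypothetical forward pass with one trainable variable so that
  # opt.minimize(loss) has a weight to update.
  with variable_scope.variable_scope("fwd", use_resource=True):
    w = variable_scope.get_variable(
        "w", shape=[4], initializer=init_ops.ones_initializer())
  return lr * math_ops.reduce_sum(w * x)

losses = _gradient_accumulation_loop(
    test_wrapper=self,  # assumes we are inside a tf.test.TestCase
    fwd_fn=_example_fwd_fn,
    inputs_fn=lambda: [array_ops.placeholder(np.float32, shape=[])],
    input_values=[0.1],
    repeat_count=2,
    num_batches_to_accumulate=4,
    dataset_fn=lambda: tu.create_single_increasing_dataset(8, shape=[4]),
    optimizer=gradient_descent.GradientDescentOptimizer(0.01))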
Example #8
    def testResetSeed(self):
        # The dataset for feeding the graphs
        ds = dataset_ops.Dataset.from_tensors(
            array_ops.constant(1.0, shape=[SIZE]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            ds, feed_name="infeed", replication_factor=REPLICAS)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=REPLICAS)

        # The device side
        def body(x1, x2):
            d1 = rand_ops.dropout(x1)
            d2 = rand_ops.dropout(x2)
            outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
            return outfeed

        def my_net():
            r = loops.repeat(REPEATS, body, [], infeed_queue)
            return r

        with scopes.ipu_scope('/device:IPU:0'):
            res = ipu_compiler.compile(my_net, inputs=[])

        # The outfeed dequeue has to happen after the outfeed enqueue
        dequeue_outfeed = outfeed_queue.dequeue()

        # Configure the hardware
        config = utils.create_ipu_config(profiling=True)
        config = utils.auto_select_ipus(config, REPLICAS)
        config = utils.set_floating_point_behaviour_options(config)
        utils.configure_ipu_system(config)

        with session.Session() as sess:
            res_all = set()
            total = 0

            sess.run(infeed_queue.initializer)

            for _ in range(EXECS):
                sess.run(res)
                outfed_result = sess.run(dequeue_outfeed)
                for r in np.array(list(outfed_result.values())).reshape(
                    [-1, SIZE]):
                    total += 1
                    res_all.add(r.tostring())

            # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
            expected = 2 * REPLICAS * REPEATS * EXECS
            self.assertEqual(total, expected)
            self.assertEqual(len(res_all), expected)
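A hedged sketch of the behaviour the test name hints at: resetting the IPU seed before each execution (with identical input data) should make the dropout masks repeat across executions, so res_all would collapse to the unique masks of a single execution.
            for _ in range(EXECS):
                utils.reset_ipu_seed(SEED)  # assumed: same SEED constant
                sess.run(res)
                outfed_result = sess.run(dequeue_outfeed)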
Example #9
def training_graph(model, opts, iterations_per_step=1):

    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver)
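A hedged usage sketch for the value returned above, assuming GraphOps is a namedtuple whose fields follow the return statement (graph, session, init, ops, placeholders, iterator, outfeed, saver):
train = training_graph(model, opts, iterations_per_step=100)
train.session.run(train.init)
train.session.run(train.iterator.initializer)
train.session.run(train.ops,
                  feed_dict={train.placeholders['learning_rate']: 0.01})
losses = train.session.run(train.outfeed)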
Example #10
    def __init__(self,
                 every_n_iter=None,
                 every_n_secs=None,
                 at_end=False,
                 formatter=None,
                 logging_mode=LoggingMode.LAST,
                 feed_name="logging_hook",
                 replication_factor=1):
        """Initializes the hook.

    Args:
      every_n_iter: `int`, print the tensor values once every N steps.
      every_n_secs: `int` or `float`, print the tensor values once every N
        seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
        provided (unless `at_end` is True).
      at_end: `bool` specifying whether to print the tensor values at the
        end of the run.
      formatter: function that takes a dict with tensor names and values and
        returns a string. If None, uses default formatting.
      logging_mode: `IPULoggingTensorHook.LoggingMode` that determines the
        behaviour when enqueuing multiple tensor values between dequeues
        (e.g. print all of them or only the last one).
      feed_name: `string`. The name of the outfeed queue. Must be unique.
      replication_factor: `int`, the number of replicas from which logging
        is performed.
    """
        if (every_n_iter is not None) and (every_n_secs is not None):
            raise ValueError(
                "Cannot provide both every_n_iter and every_n_secs")
        if every_n_iter is None and every_n_secs is None and not at_end:
            raise ValueError(
                "Either every_n_iter, every_n_secs or at_end should be provided"
            )

        only_log_at_end = (at_end and (every_n_iter is None)
                           and (every_n_secs is None))

        self._timer = (NeverTriggerTimer() if only_log_at_end else
                       SecondOrStepTimer(every_secs=every_n_secs,
                                         every_steps=every_n_iter))
        self._log_at_end = at_end
        self._formatter = formatter

        self._outfeed = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name,
            outfeed_mode=logging_mode,
            replication_factor=replication_factor)

        self._dequeue_op = None
        self._deleter_op = None
        self._iter_count = 0
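A hedged usage sketch for the hook whose constructor is shown above; it assumes the surrounding class is Graphcore's IPULoggingTensorHook, whose log() method returns an enqueue op that must run inside the compiled graph.
hook = IPULoggingTensorHook(every_n_iter=10, feed_name="loss_logging")

def training_step(loss):
    # Enqueue the loss on the hook's outfeed; the hook dequeues and
    # prints it on the host according to its timer settings.
    log_op = hook.log({"loss": loss})
    with ops.control_dependencies([log_op]):
        return array_ops.identity(loss)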
Example #11
def train():
    graph = tf.Graph()
    with graph.as_default():
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        #         dataset = tf.data.Dataset.from_tensors(np.array([1,2,3,4,5,6,7,8,9,0]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])
        with ipu_scope('/device:IPU:0'):

            def compile_fn():
                def body(x, y):
                    #                     z1, z2 = model1(x, y, time_steps_ph)
                    #                     outfeed = outfeed_queue.enqueue({'z1':z1, 'z2':z2})
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                return loops.repeat(1, body, [], infeed_queue)

        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        outputs = ipu_compiler.compile(compile_fn, [])

        dequeue_outfeed = outfeed_queue.dequeue()
    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    i = 0
    while i < steps:
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
        i = i + 1
        break
Example #12
  def testDuplicateInputsOutputs(self):
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

    def stage1(x, y):
      return x, y, y, x

    # The above should be optimised to a single copy for each duplicate output.
    def stage2(x1, y1, y2, x2):
      return x1, y1, y2, x2

    # Same for this stage
    def stage3(_x1, _y1, y2, x2):
      return x2, y2

    def model_pipeline(x, y):
      return pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          12,
          inputs=[x, y],
          outfeed_queue=outfeed_queue,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      y = array_ops.placeholder(np.float32, shape=[1, 2])

    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                     inputs=[x, y])

    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    cfg = utils.auto_select_ipus(cfg, 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    #TODO(T10784) test how many IPU copies are here once we insert IPU copies.
    outfeed_op = outfeed_queue.dequeue()
    with tu.ipu_session() as sess:
      sess.run(compiled_model_pipeline, {
          x: np.ones(x.shape),
          y: np.ones(y.shape)
      })
      output = sess.run(outfeed_op)
      for i in range(12):
        self.assertAllClose(output[0][i], np.ones(x.shape))
        self.assertAllClose(output[1][i], np.ones(y.shape))
Example #13
  def testSyntheticDataWithOutfeeds(self):
    poplar_flags = os.environ.get("TF_POPLAR_FLAGS", "")
    poplar_flags += " --use_ipu_model"
    poplar_flags += " --use_synthetic_data"
    poplar_flags += " --synthetic_data_initializer=random"

    with test.mock.patch.dict("os.environ", {"TF_POPLAR_FLAGS": poplar_flags}):

      # The device side main
      def body(x1, x2):
        d1 = x1 + x2
        d2 = x1 - x2
        outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
        return outfeed

      def my_net():
        r = loops.repeat(5, body, [], infeed_queue)
        return r

      with ops.device('cpu'):
        # The dataset for feeding the graphs
        ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[10]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed2")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed2")

      with scopes.ipu_scope('/device:IPU:0'):
        run_loop = ipu_compiler.compile(my_net, inputs=[])

      # The outfeed dequeue has to happen after the outfeed enqueue
      dequeue_outfeed = outfeed_queue.dequeue()

      # Configure the hardware
      config = utils.create_ipu_config()
      config = utils.auto_select_ipus(config, 1)
      utils.configure_ipu_system(config)

      with tf.Session() as sess:
        sess.run(infeed_queue.initializer)
        sess.run(run_loop)
        result = sess.run(dequeue_outfeed)
        self.assertAllEqual(len(result['d1']), 0)
Example #14
def run_mnist(opts):
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    batches_per_epoch = num_train // batch_size
    train_batches = (num_train * opts.epochs) // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // opts.steps_per_epoch
    if batches_per_epoch % opts.steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32,
                                 shape=data_shape,
                                 name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])
        for fc in fc_layers.values():
            fc.create_placeholders(tf.float32)

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.shuffle(buffer_size=num_train, seed=opts.seed).cache()
    dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed_last_itr")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(model, fc_layers, opts.droprate,
                                lr_placeholder, batches_per_step, True,
                                outfeed_train_queue)
    bound_train_loop = partial(loop_builder, batches_per_step,
                               bound_train_model, infeed_train_queue)
    bound_test_model = partial(model, fc_layers, opts.droprate, lr_placeholder,
                               batches_per_step, False, outfeed_test_queue)
    bound_test_loop = partial(loop_builder, test_batches, bound_test_model,
                              infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    # Merge the feeds needed for all layers:
    sparse_feed = {}
    for fc in fc_layers.values():
        sparse_feed.update(fc.feed_dict())

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    logpath = os.path.join(opts.checkpoint_path,
                           datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer,
                 feed_dict={
                     place_x: x_train_flat,
                     place_y: y_train
                 })
        # Must initialise the sparse layers separately:
        sess.run(update_representation, feed_dict=sparse_feed)

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            progress = tqdm(
                range(opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)
                    # Only need to feed an updated sparsity representation if we are running rig-L:
                    if not opts.disable_pruning:
                        sess.run(update_representation, feed_dict=sparse_feed)
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    last = sess.run(dequeue_train_outfeed)

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(last[name+'_non_zeros'][0])}"
                            )
                            logger.info(
                                f"Average momentum for layer {name} : {np.mean(last[name+'_momentum'][0])}"
                            )
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_and_grow(name, fc, last, random_gen,
                                               steps, total_steps, opts)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {last['acc'][0]}\n")
                    progress.set_description(
                        f"Loss {last['mean_loss'][0]:.5f} Accuracy {last['acc'][0]:.5f}"
                    )

            logger.info(f"Saving...")
            saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            test_feed = {}
            for fc in fc_layers.values():
                test_feed.update(fc.feed_dict())

            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer,
                     feed_dict={
                         place_x: x_test_flat,
                         place_y: y_test
                     })
            sess.run(test_loop, feed_dict=test_feed)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
Example #15
def run_mnist(opts):
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)

    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)

    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]

    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size *
                                      opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)

    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch

    if batches_per_epoch % opts.steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))

        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()

        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)

    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    if opts.restore:
        logpath = os.path.join(opts.checkpoint_path, opts.restore)
    else:
        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer)

        if opts.restore:
            saver.restore(sess, logpath + '/model.ckpt')

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            start = opts.start_epoch if opts.restore else 0
            progress = tqdm(
                range(start, opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    t1 = time.perf_counter()
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    t2 = time.perf_counter()
                    sess_time = t2 - t1
                    batch_time = sess_time / batches_per_step
                    throughput = batch_size / batch_time
                    logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                f"Time per batch: {batch_time:0.6f} "
                                f"Throughput: {throughput}")

                    if opts.single_train_step_only:
                        return

                    train_outputs = sess.run(dequeue_train_outfeed)
                    if opts.pipelining:
                        train_outputs = train_outputs[-1]

                    # Get the last value for all items:
                    for k, v in train_outputs.items():
                        train_outputs[k] = v[-1]
                    logger.debug(f"Train outputs: {train_outputs.keys()}")

                    # Merge prune and grow fetches with last fetches:
                    if dequeue_prune_and_grow_outfeed is not None:
                        png_data = sess.run(dequeue_prune_and_grow_outfeed)
                        for k in png_data:
                            png_data[k] = png_data[k][-1]
                        logger.debug(
                            f"Prune and grow outputs: {png_data.keys()}")

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            var_name = fc.get_values_var().name
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                            )
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                )
                            if i == 0 and e == opts.start_epoch:
                                metainfo = sess.run(fc.get_metainfo_var())
                            else:
                                metainfo = None
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_sched = prune_and_grow(name,
                                                             fc,
                                                             png_data,
                                                             random_gen,
                                                             steps,
                                                             total_steps,
                                                             opts,
                                                             metainfo=metainfo)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )
                                logger.info(
                                    f"Pruned proportion: {prune_sched}")
                                if opts.use_wandb:
                                    wandb.log({'Prune Schedule': prune_sched},
                                              commit=False)

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {train_outputs['acc']}\n")
                    if opts.use_wandb:
                        wandb.log(
                            {
                                'Loss': train_outputs['mean_loss'],
                                'Accuracy': train_outputs['acc'],
                                'Throughput': throughput
                            },
                            commit=True)
                    progress.set_description(
                        f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                    )

                    # Only need to feed an updated sparsity representation if we are running rig-L:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                if e % opts.checkpoint_freq == 0:
                    logger.info(f"Saving...")
                    saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer)
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
            )
            if opts.use_wandb:
                wandb.run.summary["Test Loss"] = test_loss
                wandb.run.summary["Test Accuracy"] = test_acc
Example #16
    def run_with_pipeline(self):
        self._build_dataset()
        self._build_computational_stages()
        self.get_global_batch_size()
        self.outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def train(lr, infeed, outfeed, gradient_accumulation_count):
            pipeline_op = pipelining_ops.pipeline(
                self.computational_stages,
                gradient_accumulation_count=gradient_accumulation_count,
                gradient_accumulation_dtype=self.dtype,
                inputs=[lr],
                infeed_queue=infeed,
                outfeed_queue=outfeed,
                device_mapping=self.device_mapping,
                optimizer_function=self.optimizer_function,
                offload_weight_update_variables=False)

            return pipeline_op

        def infer(lr, infeed, outfeed, gradient_accumulation_count):
            pipeline_op = pipelining_ops.pipeline(
                self.computational_stages,
                gradient_accumulation_count=gradient_accumulation_count,
                gradient_accumulation_dtype=self.dtype,
                inputs=[lr],
                infeed_queue=infeed,
                outfeed_queue=outfeed,
                device_mapping=self.device_mapping)

            return pipeline_op

        model = train if self.training else infer
        with tf.compat.v1.device("cpu"):
            lr = tf.compat.v1.placeholder(np.float32, [])
        pipeline_md = partial(model,
                              lr=lr,
                              infeed=self.infeed_queue,
                              outfeed=self.outfeed_queue,
                              gradient_accumulation_count=self.config[
                                  'gradient_accumulation_count'])

        with ipu_scope('/device:IPU:0'):
            compiled = ipu_compiler.compile(pipeline_md, [])
        outfeed = self.outfeed_queue.dequeue()
        saver = tf.compat.v1.train.Saver()
        total_parameters = 0
        variables = tf.compat.v1.trainable_variables()

        if not os.path.exists('logs'):
            os.mkdir('logs')
        with open('logs/' + self.config['logfile'], 'w') as fp:
            for var in variables:
                fp.write(str(var) + '\n')
                total_parameters += np.prod(var.shape)
            fp.write('\nTotal Parameters : ' + str(total_parameters) + '\n')

        # Create ipu_options
        # we assume one ipu for one stage here
        ipu_options = get_config(num_ipus=len(self.device_mapping) *
                                 self.config['replica'])
        ipu_options.configure_ipu_system()

        total_steps = self.data_loader.num_utts * self.config[
            'epochs'] // self.global_batch_size
        print('total_steps: ', total_steps)
        if self.config['wandb_name'] is not None:
            try:
                import wandb
            except ImportError:
                raise ImportError('wandb not installed')
            wandb.init(self.config['wandb_name'])
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(self.infeed_queue.initializer)
            step_per_epoch = self.data_loader.num_utts // self.global_batch_size
            for epoch in range(1, self.config['epochs'] + 1):
                for step in range(1, step_per_epoch + 1):
                    global_step = (epoch - 1) * step_per_epoch + step
                    step_lr = self.get_lr(global_step)
                    start = time.time()
                    _ = sess.run(compiled, {lr: step_lr})
                    result = sess.run(outfeed)
                    duration = time.time() - start
                    if step % 10 == 0:
                        tput = self.global_batch_size / duration
                        print(
                            'epoch: {}/{}, global_step: {}/{}, loss: {}, through_put: {}'
                            .format(epoch, self.config['epochs'], global_step,
                                    total_steps, np.mean(result[1]), tput))
                kl_acc = self.get_kl_acc(result[2], result[3])
                if self.config['wandb_name'] is not None:
                    wandb.log({
                        "loss": np.mean(result[1]),
                        'acc': kl_acc,
                    })
                if self.config['save_checkpoint']:
                    saver.save(sess,
                               'logs/model.ckpt',
                               global_step=global_step)
            if self.config['freeze']:
                self.save_pb(sess, self.output_names)
Example #17
def construct_graph(
        network_class: Type[InferenceNetwork], config: Path,
        checkpoint_dir: str, batch_size: int, batches_per_step: int,
        image_filenames: Tuple[str], loop: bool, preprocess_fn: Callable,
        num_ipus: int, mode: str, save_graph_pb: bool
) -> Tuple[tf.Operation, tf.Operation, tf.Operation]:
    """Create inference graph on the device, set up in-feeds and out-feeds, connect dataset iterator to the graph.

    This function also exports the frozen graph into an event file, to be viewed in Tensorboard in the
    `network_name_graph` directory.

    Args:
        network_class: Class corresponding to chosen model.
        config: Path to config file.
        checkpoint_dir: Checkpoint location.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        image_filenames: Collection of path to images.
        loop: Run inference in a loop.
        preprocess_fn: Pre-process function to apply to the image before feeding into the graph.
        num_ipus: Number of ipus.
        mode: Inference mode.
        save_graph_pb: If true, export frozen graph to event file to view in Tensorboard

    Returns: Compiled loop operator to run repeated inference over the dataset, infeed_queue initializer, outfeed op.

    """
    # Model specific config
    with open(config.as_posix()) as file_stream:
        try:
            config_dict = yaml.safe_load(file_stream)
        except yaml.YAMLError as exc:
            tf.logging.error(exc)
            raise

    config_dict['network_name'] = config.stem
    if 'dtype' not in config_dict:
        config_dict["dtype"] = 'float16'

    # Create inference optimized frozen graph definition
    network = network_class(input_shape=config_dict["input_shape"],
                            num_outputs=1000,
                            batch_size=batch_size,
                            data_type=config_dict['dtype'],
                            config=config_dict,
                            checkpoint_dir=checkpoint_dir)

    # Export frozen graph to event file to view in Tensorboard
    if save_graph_pb:
        log_dir = Path(f"{config_dict['network_name']}_graph")
        graph_filename = f"{log_dir}/{config_dict['network_name']}_graph.pb"
        if not log_dir.exists():
            log_dir.mkdir()
        with tf.io.gfile.GFile(graph_filename, "wb") as f:
            f.write(network.optimized_graph.SerializeToString())
        logging.info("%d ops in the final graph." %
                     len(network.optimized_graph.node))
        import_to_tensorboard(graph_filename, log_dir=log_dir.as_posix())

    # Reset graph before creating one on the IPU
    tf.reset_default_graph()

    # Create dataset
    dataset = get_dataset(image_filenames,
                          batch_size,
                          loop=loop,
                          preprocess_fn=preprocess_fn,
                          img_width=config_dict["input_shape"][1],
                          img_height=config_dict["input_shape"][0],
                          dtype=config_dict['dtype'])

    # Set up graph on device, connect infeed and outfeed to the graph.
    num_replicas = num_ipus if mode == 'replicated' else 1
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset,
        device_ordinal=0,
        feed_name="infeed",
        replication_factor=num_replicas)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        device_ordinal=0,
        feed_name="outfeed",
        outfeed_all=True,
        replication_factor=num_replicas)

    def comp_fn():
        def body(img):
            with scopes.ipu_scope('/device:IPU:0'):
                if mode == 'sharded':
                    with autoshard.ipu_autoshard():
                        probs = tf.import_graph_def(
                            network.optimized_graph,
                            input_map={network.graph_input: img},
                            name="optimized",
                            return_elements=[network.graph_output])[0]
                    autoshard.automatic_sharding(num_shards=num_ipus,
                                                 input_ts=img,
                                                 loss_ts=probs,
                                                 frozen_inference=True)
                    outfeed_op = outfeed_queue.enqueue(probs)
                    outfeed_op._set_attr(
                        sharding._XLA_SHARDING,
                        attr_value_pb2.AttrValue(
                            s=probs.op.get_attr('_XlaSharding')))
                else:
                    probs = tf.import_graph_def(
                        network.optimized_graph,
                        input_map={network.graph_input: img},
                        name="optimized",
                        return_elements=[network.graph_output])[0]
                    outfeed_op = outfeed_queue.enqueue(probs)
                # Note that enqueue happens on the IPU.
                return outfeed_op

        return loops.repeat(batches_per_step, body, [], infeed_queue)

    loop_op = ipu_compiler.compile(comp_fn, [])

    # The dequeue of the outfeed needs to happen on the CPU.
    with tf.device('cpu'):
        outfeed_dequeue = outfeed_queue.dequeue()

    ipu_utils.move_variable_initialization_to_cpu()
    return loop_op, infeed_queue.initializer, outfeed_dequeue
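A hedged sketch of driving the three operations returned by construct_graph, following its docstring; the argument variables are assumed to be defined as in the signature above:
loop_op, infeed_initializer, outfeed_dequeue = construct_graph(
    network_class, config, checkpoint_dir, batch_size, batches_per_step,
    image_filenames, loop, preprocess_fn, num_ipus, mode, save_graph_pb)

with tf.Session() as sess:
    sess.run(infeed_initializer)
    sess.run(loop_op)                  # batches_per_step forward passes
    probs = sess.run(outfeed_dequeue)  # dequeue happens on the CPU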
Example #18
  def testPipelineWithInfeedsKwargs(self):
    with tu.ipu_session() as sess:
      dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
      dataset = dataset.batch(batch_size=2, drop_remainder=True)

      def dataset_parser(value):
        a = value
        b = (value + 10.) / 2.0
        return {"a": a, "b": b}

      dataset = dataset.map(dataset_parser)
      infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed6")
      outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed6")

      def stage1(c, **kwargs):
        with variable_scope.variable_scope("vs", use_resource=True):
          y = layers.Conv2D(2,
                            1,
                            use_bias=True,
                            kernel_initializer=init_ops.ones_initializer(),
                            name='conv1')(kwargs["a"])
          return y + kwargs["b"], c

      def stage2(x, c):
        return math_ops.reduce_sum(x) + c

      def stage3(x):
        return x

      def my_net(c):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            12,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

      with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

      with ops.device("/device:IPU:0"):
        r = ipu_compiler.compile(my_net, inputs=[c])

      cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
      cfg = utils.auto_select_ipus(cfg, 4)
      utils.configure_ipu_system(cfg)
      utils.move_variable_initialization_to_cpu()

      outfeed_op = outfeed_queue.dequeue()

      report = tu.ReportJSON(self, sess, configure_device=False)
      report.reset()
      sess.run(variables.global_variables_initializer())
      sess.run(infeed_queue.initializer)
      sess.run(r, {c: 10.01})
      losses_pipeline = sess.run(outfeed_op)
      self.assertAllClose(losses_pipeline, [[
          410.01, 730.01, 650.01, 570.01, 890.01, 410.01, 730.01, 650.01,
          570.01, 890.01, 410.01, 730.01
      ]])
      report.parse_log()
      report.assert_pipeline_stages_on_expected_ipu((0, 1, 3))
def create_poplar_exec(model, opts, poplar_exec_path):
    """Create graph and save it to the file."""
    valid_graph = tf.Graph()

    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        if opts['generated_data']:
            # create dummy dataset with images only
            dummy_image = np.zeros((opts['micro_batch_size'],
                                    opts['image_size'], opts['image_size'], 3),
                                   dtype=np.uint8)
            inference_dataset = tf.data.Dataset.from_tensors(
                {"image": dummy_image})
        else:
            # create dataset with images and labels
            inference_dataset = dataset.data(opts, is_training=False)
        inference_dataset = inference_dataset.map(lambda x: {'data_dict': x})

        inference_infeed_iterator = \
            ipu_infeed_queue.IPUInfeedQueue(inference_dataset,
                                            prefetch_depth=opts['prefetch_depth'])

        acc_queue = ipu_outfeed_queue.IPUOutfeedQueue()
        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    accuracy_enqueue = acc_queue.enqueue(accuracy)
                    return accuracy_enqueue

                accuracy = loops.repeat(
                    int(opts['validation_batches_per_step']), body, [],
                    inference_infeed_iterator)
                return accuracy

        filenames, _ = get_ckpt_filenames(opts)

        compile_op = application_compile_op.experimental_application_compile_op(
            comp_fn, output_path=poplar_exec_path, freeze_variables=True)

        outfeed = acc_queue.dequeue()
        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()

    with tf.Session(graph=valid_graph, config=tf.ConfigProto()) as sess:
        if len(filenames) == 1:
            print("Restoring from a snapshot: ", filenames[0])
            sess.run(inference_infeed_iterator.initializer)
            init = tf.global_variables_initializer()
            sess.run(init)
            valid_saver.restore(sess, filenames[0])
        else:
            print(
                "Warning: no restore point found - randomly initialising weights instead"
            )
            init = tf.global_variables_initializer()
            sess.run(init)

        path = sess.run(compile_op)
        print(f"Poplar executable: {path}")

    valid_graph.finalize()
Exemple #20
0
    def train(self):
        # Configure the IPU options.
        ipu_options = ipu_utils.get_ipu_config(
            ipu_id=self.opts["select_ipu"],
            num_ipus_required=len(self.opts["train"]["device_mapping"]) *
            self.opts["train"]["replicas"],
            fp_exceptions=False,
            stochastic_rounding=True,
            xla_recompute=True,
            available_memory_proportion=0.2,
            max_cross_replica_buffer_size=16 * 1024 * 1024,
            scheduler_selection="Clustering",
            compile_only=False,
            partials_type="half")
        # configure the replication strategy
        if self.opts["use_popdist"]:
            strategy = create_popdist_strategy()
            ipu_options = strategy.update_ipu_config(ipu_options)
            ipu_options = popdist.tensorflow.set_ipu_config(
                ipu_options,
                len(self.opts["train"]["device_mapping"]),
                configure_device=False)
        ipu_options.configure_ipu_system()

        self.sess = tf.Session(config=tf.ConfigProto())

        stop_flag = []
        data_threads = []
        ds = self.get_dataset_on_the_fly(stop_flag, data_threads)

        global_step_holder = tf.placeholder(dtype=tf.int32, shape=())

        # We write this wrapper because self.model_func takes "self" as its first
        # parameter, which would cause an error when calling ipu_compiler.compile.
        def model_wrapper():
            self.model_func(self.model, self.opts, global_step_holder,
                            self.infeed_queue, self.outfeed_queue)

        with ExitStack() as stack:
            if self.opts["use_popdist"]:
                stack.enter_context(strategy.scope())
            self.infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds)
            self.outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                if self.opts["use_popdist"]:

                    def distributed_per_replica_func():
                        return ipu_compiler.compile(model_wrapper, inputs=[])

                    compiled_model = strategy.experimental_run_v2(
                        distributed_per_replica_func, args=[])
                else:
                    compiled_model = ipu_compiler.compile(model_wrapper,
                                                          inputs=[])
            # The outfeed dequeue has to happen after the outfeed enqueue (i.e. after calling compile).
            dequeue_outfeed = self.outfeed_queue.dequeue()

            if self.opts["use_popdist"]:
                # Take the mean of all the outputs across the distributed workers
                dequeue_outfeed = [
                    strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                    for v in dequeue_outfeed
                ]

            with tf.name_scope("loader_and_saver"):
                self.loader, self.saver = self.get_loader_and_saver()

        self.sess.run(self.infeed_queue.initializer)
        self.sess.run(tf.global_variables_initializer())

        begin_epoch = 0

        if self.opts["train"]["load_type"] == "resume":
            # resume a half-trained run
            ckpts = []
            if os.path.exists("./checkpoint"):
                ckpts = sorted([
                    path for path in os.listdir("./checkpoint")
                    if "meta" in path
                ])
            if len(ckpts) == 0:
                logger.info("fail to resume, not find any ckpt")
                return
            ckpt_path = "./checkpoint/" + ckpts[-1].replace(".meta", "")
            logger.info("=> Resume training from: %s ... " % ckpt_path)
            self.loader.restore(self.sess, ckpt_path)
            begin_epoch = int(
                re.search("epoch=([0-9]+)", ckpt_path).groups()[0])
        elif self.opts["train"]["load_type"] in [
                "yolov3", "darknet53", "phase1"
        ]:
            # load a pretrained checkpoint
            if self.initial_weight and os.path.exists(self.initial_weight +
                                                      ".meta"):
                logger.info("=> Restoring weights from: %s ... " %
                            self.initial_weight)
                self.loader.restore(self.sess, self.initial_weight)
            else:
                raise Exception("can't find ckpt to load")

        elif self.opts["train"]["load_type"] == "empty":
            logger.info("=> no checkpoint to load !!!")
            logger.info("=> Now it starts to train YOLOV3 from scratch ...")
        else:
            raise Exception(
                "'load_type' is not one of expected values: yolov3, darknet53, phase1, resume, empty"
            )

        total_epochs = self.epochs
        total_batch_size = self.opts["train"]["pipeline_depth"] * \
            self.batch_size * \
            self.opts["train"]["replicas"] * \
            self.opts["distributed_worker_count"]
        samples_per_interaction = total_batch_size * self.repeat_count
        samples_per_epoch = len(self.trainset) * self.batch_size
        interactions_per_epoch = samples_per_epoch // samples_per_interaction
        if self.for_speed_test:
            interactions_per_epoch = 30
            total_epochs = 1
        steps_per_epoch = interactions_per_epoch * self.repeat_count
        logger.info("total epochs: {}".format(total_epochs))
        logger.info("steps_per_epoch: {}".format(steps_per_epoch))
        moving_loss = deque(maxlen=30)

        if self.opts["distributed_worker_index"] == 0:
            # we only write logs to TensorBoard on the main worker
            summary_writer = tf.summary.FileWriter(
                "./tf_log/" + datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
                session=self.sess)

        train_begin_time = time.time()
        for epoch in range(begin_epoch, total_epochs):
            logger.info("epoch {}:".format(epoch + 1))

            start_time = time.time()
            for interaction_count in range(interactions_per_epoch):
                global_step = epoch * steps_per_epoch + interaction_count * self.repeat_count
                self.sess.run(compiled_model,
                              feed_dict={global_step_holder: global_step})
                result = self.sess.run(dequeue_outfeed)

                if self.opts["distributed_worker_index"] == 0:
                    giou_loss = np.mean(result[0])
                    conf_loss = np.mean(result[1])
                    prob_loss = np.mean(result[2])
                    lr = np.mean(result[3])
                    total_loss = giou_loss + conf_loss + prob_loss
                    moving_loss.append(total_loss)
                    end_time = time.time()
                    duration = end_time - start_time
                    start_time = time.time()
                    total_samples = global_step * total_batch_size
                    logger.info(
                        "epoch:{}, global_steps:{}, total_samples:{}, lr:{:.3e}, \
 moving_total_loss:{:.2f}, duration:{:.2f}, samples/s:{:.2f},\
 total_time:{:.2f}".format(epoch + 1, global_step, total_samples, lr,
                           np.mean(moving_loss), duration,
                           samples_per_interaction / duration,
                           time.time() - train_begin_time))

                    train_summary = tf.Summary()
                    train_summary.value.add(tag="giou_loss",
                                            simple_value=giou_loss)
                    train_summary.value.add(tag="conf_loss",
                                            simple_value=conf_loss)
                    train_summary.value.add(tag="prob_loss",
                                            simple_value=prob_loss)
                    train_summary.value.add(tag="total_loss",
                                            simple_value=total_loss)
                    train_summary.value.add(tag="lr", simple_value=lr)
                    train_summary.value.add(
                        tag="samples_per_sec",
                        simple_value=samples_per_interaction / duration)
                    summary_writer.add_summary(train_summary, total_samples)
                    summary_writer.flush()
            if (not self.for_speed_test) and (
                    epoch % self.opts["train"]["epochs_per_ckpt"] == 0
                    or epoch == total_epochs - 1):
                if self.opts["distributed_worker_index"] == 0:
                    ckpt_loss = np.mean(moving_loss)
                else:
                    # If save is not called on all instances there will be an
                    # all-reduce error, but saving on every worker is pointless,
                    # so only the ckpt saved by worker 0 gets a name that
                    # includes the loss value.
                    ckpt_loss = 0.0
                ckpt_file = "./checkpoint/yolov3-{}-epoch={}-moving_total_loss={:.4f}.ckpt".format(
                    datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), epoch + 1,
                    ckpt_loss)

                logger.info("saving to: " + ckpt_file)
                model_path = self.saver.save(self.sess,
                                             ckpt_file,
                                             global_step=global_step)
                if self.opts["distributed_worker_index"] == 0:
                    log.save_model_statistics(model_path, summary_writer,
                                              global_step * total_batch_size)
        # tell threads to stop
        stop_flag.append(0)
        for data_thread in data_threads:
            data_thread.join()
        self.sess.close()
Exemple #21
0
                    default=250,
                    help="Number of iterations")
opts = parser.parse_args()

# Make a simple linear regression tf Dataset, of N noisy x = y lines, squashed into range [0, 1]
fx = np.tile(np.linspace(0, 1, opts.num_features), [opts.num_data_points, 1])
x = (fx + np.random.uniform(-1, 1, fx.shape)).astype(np.float32)[:, None]
y = (fx + np.random.uniform(-1, 1, fx.shape)).astype(np.float32)[:, None]
dataset = tf.data.Dataset.from_tensor_slices((x, y))
dataset = dataset.map(lambda x, y: (tf.nn.sigmoid(x), tf.nn.sigmoid(y)))
dataset = dataset.repeat()
# Make the IPU infeed and outfeed
# To use replication, we make as many feeds as there are replicated IPUs by passing in replication_factor
infeed = ipu_infeed_queue.IPUInfeedQueue(
    dataset, replication_factor=opts.replication_factor, feed_name='in')
outfeed = ipu_outfeed_queue.IPUOutfeedQueue(
    replication_factor=opts.replication_factor, feed_name='out')


# Make a basic linear model
def model(X, Y):
    Yp = tf.layers.dense(X, opts.num_features)
    loss = tf.losses.mean_squared_error(Y, Yp)
    optimizer = tf.train.GradientDescentOptimizer(1e-3)
    # To use replication, we wrap our optimizer with the IPU custom CrossReplicaOptimizer,
    # ...which averages the gradients determined by all IPUs together
    training_op = ipu_optimizer.CrossReplicaOptimizer(optimizer).minimize(loss)
    # We can also use the CrossReplicaGradientAccumulationOptimizer instead, which accumulates gradients
    # ...every N mini_batches before updating parameters, to effectively increase the batch size
    # ...For replication, this reduces the number of inter-IPU synchs by the factor N.
    # training_op = gao.CrossReplicaGradientAccumulationOptimizer(optimizer, num_mini_batches=8).minimize(loss)
    # Enqueue the loss to be dequeued later off the IPU
    return outfeed.enqueue(loss), training_op
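
The example is truncated at this point. A minimal continuation, assuming loops, scopes and ipu_compiler are imported from tensorflow.python.ipu and that the parser above stores the iteration count as opts.iterations, would compile a repeat loop around model and dequeue the replicated losses on the host:

# Sketch of the missing training loop (assumed names noted above).
def my_net():
    return loops.repeat(opts.iterations, model, [], infeed)

with scopes.ipu_scope('/device:IPU:0'):
    train_loop = ipu_compiler.compile(my_net, inputs=[])

dequeued_losses = outfeed.dequeue()

with tf.Session() as sess:
    sess.run(infeed.initializer)
    sess.run(tf.global_variables_initializer())
    sess.run(train_loop)
    # With replication_factor > 1 the dequeued tensor gains a leading replica dimension.
    print(sess.run(dequeued_losses))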
Exemple #22
0
def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                                 shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=opts['replicas'])

        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue(
                        (prob, target, accuracy))
                    return outfeed_op

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])

        # The dequeue of the outfeed needs to happen on the CPU, outside the IPU scope.
        outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]

    sess = tf.compat.v1.Session(graph=infer_graph)

    return GraphOps(sess, init, ops_val, placeholders, infeed_val, outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
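
A usage sketch for the handles built above, written with the local names from the function body rather than the GraphOps fields; the learning-rate value fed is arbitrary, since validation does not step the optimizer:

# Sketch only: run one validation step and fetch the outfeed results.
sess.run(init)
sess.run(infeed_val.initializer)
sess.run(outputs_val, feed_dict={placeholders['learning_rate']: 0.0})
prob, target, accuracy = sess.run(outfeed)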
Exemple #23
0
from tensorflow.python.ipu import ipu_infeed_queue
from tensorflow.python.ipu import ipu_outfeed_queue
from tensorflow.python.ipu import loops
from tensorflow.python.ipu import scopes
from tensorflow.python.ipu import utils
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# The dataset for feeding the graphs
ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[800]))
ds = ds.map(lambda x: [x, x])
ds = ds.repeat()

# The host side queues
infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed")
outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed")


# The device side main
def body(x1, x2):
  d1 = x1 + x2
  d2 = x1 - x2
  outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
  return outfeed


def my_net():
  r = loops.repeat(10, body, [], infeed_queue)
  return r
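
The snippet stops before the compile and run steps. A plausible continuation, assuming ipu_compiler is also imported from tensorflow.python.ipu:

# Compile the device-side loop and wire up the host-side dequeue.
with scopes.ipu_scope('/device:IPU:0'):
    run_loop = ipu_compiler.compile(my_net, inputs=[])

dequeued = outfeed_queue.dequeue()

cfg = utils.create_ipu_config()
cfg = utils.auto_select_ipus(cfg, 1)
utils.configure_ipu_system(cfg)

with tf.Session() as sess:
    sess.run(infeed_queue.initializer)
    sess.run(run_loop)
    result = sess.run(dequeued)
    print(result['d1'].shape, result['d2'].shape)  # ten iterations of outputs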

Exemple #24
0
BATCH_SIZE = 50
# load dataset

train_images = np.random.normal(0, 1, (60000, 4))

print(np.shape(train_images))

train_images = train_images.reshape(train_images.shape[0], 1,
                                    4).astype("float32")

train_dataset = (tf.data.Dataset.from_tensor_slices(train_images).batch(
    BATCH_SIZE, drop_remainder=True).repeat(10))

infeed_GAN = ipu_infeed_queue.IPUInfeedQueue(train_dataset, feed_name="in_GAN")

outfeed_FULL = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="out_FULL")

outfeed_test = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="out_test")

with tf.device("cpu"):
    numPoints = tf.placeholder(np.int32, shape=(), name="numPoints")

from tensorflow.keras.layers import (
    Input,
    Flatten,
    Dense,
    Reshape,
    Dropout,
    LeakyReLU,
    Conv2DTranspose,
    Conv2D,
)
Exemple #25
0
  def testOutlinedFunction(self):
    # Check that we get all classifications for a simple conv

    def stage1(x, label):
      with variable_scope.variable_scope("stage1", use_resource=True):
        weight = variable_scope.get_variable(
            "w0",
            shape=[224, 48],
            dtype=np.float32,
            initializer=init_ops.ones_initializer())
        a = ipu_math_ops.serialized_matmul(
            x, weight, 2, serialization_dimension="a_rows_b_columns")
        a = nn.relu(a)
        b = fc(x, 48)
        b = nn.relu(b)
        return a + b, label

    def stage2(x, label):
      with variable_scope.variable_scope("stage2", use_resource=True):
        a = fc(x, 100)
        a = nn.relu(a)
        b = fc(x, 100)
        b = nn.relu(b)
        return a + b, label

    def stage3(x, label):
      with variable_scope.variable_scope("stage3", use_resource=True):
        loss = math_ops.reduce_mean(
            nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                        labels=label))
        return loss

    def optimizer_function(loss):
      opt = gradient_descent.GradientDescentOptimizer(0.01)
      return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # Run the pipeline twice.
    def model_pipeline(x, lr):
      return pipelining_ops.pipeline([stage1, stage2, stage3],
                                     12,
                                     inputs=[x, lr],
                                     outfeed_queue=outfeed_queue,
                                     optimizer_function=optimizer_function)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 224])
      l = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:

      with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                      inputs=[x, l])

      tu.move_variable_initialization_to_cpu()
      outfeed_queue.dequeue()

      report = tu.ReportJSON(self, sess, pipelining=True)
      sess.run(variables.global_variables_initializer())
      report.reset()
      sess.run(compiled_model_pipeline, {x: np.ones(x.shape), l: [1]})
      report.parse_log()

      # 3 matmuls in stage 1, 2 matmuls in stage 2 = 5 (5x updates, 5x grads)
      self.assertAllEqual(report.get_ml_type_counts(), [0, 5, 2, 5])
Exemple #26
0
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # opts["precision"] is a string like "16.32"; split('.') returns a list,
        # so compare its first element rather than the whole list.
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, profile_report)
Exemple #27
0
def run_model(opts):
    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = 16
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    epochs = 5
    ipu_steps_per_epoch = 15
    batches_per_epoch = num_train // batch_size
    train_batches = (num_train * epochs) // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // ipu_steps_per_epoch
    if batches_per_epoch % ipu_steps_per_epoch != 0:
        raise ValueError(
            f"IPU steps per epoch {ipu_steps_per_epoch} must divide "
            f"batches per epoch {batches_per_epoch}.")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32, shape=data_shape, name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.cache().repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(model, lr_placeholder, outfeed_train_queue, True)
    bound_train_loop = partial(
        loop_builder, batches_per_step, bound_train_model, infeed_train_queue)
    bound_test_model = partial(model, lr_placeholder, outfeed_test_queue, False)
    bound_test_loop = partial(loop_builder, test_batches,
                              bound_test_model, infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(
            tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_train_outfeed = outfeed_train_queue.dequeue()
    dequeue_test_outfeed = outfeed_test_queue.dequeue()

    # Create a benchmark program for the infeed to determine maximum achievable throughput:
    infeed_perf = dataset_benchmark.infeed_benchmark(
        infeed_train_queue, epochs, num_train, True)

    print(f"\nImage shape: {image_shape} Training examples: {num_train} Test examples: {num_test}")
    print(f"Epochs: {epochs} Batch-size: {batch_size} Steps-per-epoch: {ipu_steps_per_epoch} Batches-per-step: {batches_per_step}")

    # Run the model:
    with tf.Session() as sess:
        print(f"Benchmarking the infeed...")
        sess.run(infeed_perf, feed_dict={place_x: x_train_flat, place_y: y_train})

        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer, feed_dict={
                 place_x: x_train_flat, place_y: y_train})

        if opts.test_mode in ["all", "training"]:
            print(f"Training...")
            progress = tqdm(
                range(epochs), bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:

                sess.run(metrics_initializer)
                for i in range(ipu_steps_per_epoch):
                    sess.run(train_loop, feed_dict={lr_placeholder: scheduler(e)})
                    result = sess.run(dequeue_train_outfeed)
                    if len(result['mean_loss']) != 0 and len(result['acc']) != 0:
                        progress.set_description(
                            f"Loss {result['mean_loss'][0]:.5f} "
                            f"Accuracy {result['acc'][0]:.5f}")

            print(f"Saving...")
            saver.save(sess, "model")

        if opts.test_mode in ["all", "tests"]:
            print(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer, feed_dict={
                     place_x: x_test_flat, place_y: y_test})
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = np.mean(result['mean_loss'])
            test_acc = np.mean(result['acc'])
            print(f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
Exemple #28
0
  def testPipelineInvalidDeviceMapping(self):
    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
      a = value
      b = (value + 10.) / 2.0
      return {"a": a, "b": b}

    dataset = dataset.map(dataset_parser)
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed3")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed3")

    def stage1(c, **kwargs):
      with variable_scope.variable_scope("vs", use_resource=True):
        y = layers.Conv2D(2,
                          1,
                          use_bias=True,
                          kernel_initializer=init_ops.ones_initializer(),
                          name='conv1')(kwargs["a"])
        return y + kwargs["b"], c

    def stage2(x, c):
      return math_ops.reduce_sum(x) + c

    def stage3(x):
      return x

    with ops.device('cpu'):
      c = array_ops.placeholder(np.float32, shape=[])

    # Wrong type:
    with self.assertRaisesRegex(
        TypeError, 'device_mapping argument needs to be a list or a tuple'):
      pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          3,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          device_mapping=1,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    # Too many values:
    with self.assertRaisesRegex(ValueError,
                                'Each stage must be mapped to an IPU'):
      pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          3,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          device_mapping=list(range(4)),
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    # Not enough values:
    with self.assertRaisesRegex(ValueError,
                                'Each stage must be mapped to an IPU'):
      pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          3,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          device_mapping=tuple(range(1)),
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)
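
For contrast with the failing cases above, a device_mapping is accepted when it is a list or tuple with exactly one IPU index per stage; a sketch of a valid call, assuming at least two IPUs are available:

# Valid: three stages, one device index each.
pipelining_ops.pipeline(
    [stage1, stage2, stage3],
    3,
    inputs=[c],
    infeed_queue=infeed_queue,
    outfeed_queue=outfeed_queue,
    device_mapping=[0, 1, 0],
    pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)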
Exemple #29
0
image_class = Input(shape=(1, ), dtype='int32')
emb = Flatten()(Embedding(2,
                          latent_size,
                          input_length=1,
                          embeddings_initializer='glorot_normal')(image_class))

h = Multiply()([latent, emb])

fake_image = loc(h)

generator = Model(inputs=[latent, image_class], outputs=[fake_image])

generator.summary()

outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
    feed_name="outfeed%d" % np.random.randint(low=0, high=99999))

with tf.device("cpu"):
    numPoints = tf.placeholder(np.int32, shape=(), name="numPoints")


def body():

    noise = tf.random.normal((batch_size, 200), 0, 1)
    class_i = tf.ones((batch_size, 1))

    logits = generator([noise, class_i], training=False)

    images = tf.squeeze(logits)

    outfeed = outfeed_queue.enqueue(images)
    return outfeed
Exemple #30
0
    raise ValueError(
        f"IPU steps per epoch {ipu_steps_per_epoch} must divide batches per epoch {batches_per_epoch}."
    )

# Put placeholders on the CPU host:
with tf.device("cpu"):
    place_x = tf.placeholder(dtype=tf.float32, shape=data_shape, name="input")
    place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
    lr_placeholder = tf.placeholder(tf.float32, shape=[])

# Create dataset and IPU feeds:
dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
dataset = dataset.cache().repeat().batch(batch_size, drop_remainder=True)
infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                     feed_name="train_infeed")
outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
    feed_name="train_outfeed")
infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                    feed_name="test_infeed")
outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
    feed_name="test_outfeed")

# Use function binding to create all the builder functions that are needed:
bound_train_model = partial(model, lr_placeholder, outfeed_train_queue, True)
bound_train_loop = partial(loop_builder, batches_per_step, bound_train_model,
                           infeed_train_queue)
bound_test_model = partial(model, lr_placeholder, outfeed_test_queue, False)
bound_test_loop = partial(loop_builder, test_batches, bound_test_model,
                          infeed_test_queue)

# Use the bound builder functions to place the model on the IPU:
with scopes.ipu_scope("/device:IPU:0"):