Example #1
def training_graph(opts, training_data):
    train_graph = tf.Graph()

    with train_graph.as_default():

        dataset, train_iterator, placeholders = training_data.get_dataset(
            opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args, **kwargs):
                    data_tensors = args
                    observed_ratings = data_tensors[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(opts,
                                                                    observed_ratings=observed_ratings,
                                                                    learning_rate=placeholders["learning_rate"],
                                                                    type='TRAIN')
                    with tf.control_dependencies([apply_grads_]):
                        return total_loss_ + loss, sum_rmse_metric + rmse_metric

                return loops.repeat(opts.batches_per_step,
                                    body,
                                    [tf.constant(0, tf.float32),
                                     tf.constant(0, tf.float32)],
                                    infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar("RMSE/train", rmse)

        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(
        opts.logs_path + '/train',
        graph=train_graph,
        flush_secs=30)

    ipu_options = util.get_config(opts)
    ipu_options.configure_ipu_system()
    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph,
                    train_sess,
                    train_init,
                    [loss, train_summary, rmse],
                    placeholders,
                    infeed,
                    train_saver,
                    train_writer)
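The GraphOps container returned above is defined elsewhere in the original repository. A minimal sketch of what it might look like, assuming it is a plain namedtuple whose eight fields match the positional arguments used in this example:

from collections import namedtuple

# Hypothetical container (not part of the original listing); the real
# definition lives in the source repository.
GraphOps = namedtuple(
    "GraphOps",
    ["graph", "session", "init", "ops", "placeholders",
     "infeed", "saver", "writer"])

# Hypothetical driver, assuming the fields above:
# train = training_graph(opts, training_data)
# train.session.run(train.init)
# train.session.run(train.infeed.initializer)
# loss, summary, rmse = train.session.run(
#     train.ops, feed_dict={train.placeholders["learning_rate"]: 1e-3})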
Example #2
    def testPipelineIterationsNotMultiple(self):
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)

        def dataset_parser(value):
            a = value
            b = (value + 10.) / 2.0
            return {"a": a, "b": b}

        dataset = dataset.map(dataset_parser)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                y = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            return pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                10,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with tu.ipu_session() as sess:

            with ops.device("/device:IPU:0"):
                r = ipu_compiler.compile(my_net, inputs=[c])

            cfg = utils.create_ipu_config(profiling=True,
                                          profile_execution=True)
            cfg = utils.auto_select_ipus(cfg, 4)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())
            sess.run(infeed_queue.initializer)
            with self.assertRaisesRegex(
                    errors.FailedPreconditionError,
                    'The pipeline depth of the pipeline must be a multiple of 3'
            ):
                sess.run(r, {c: 10.01})
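The assertion fires because the pipeline is asked to run 10 iterations over 3 stages, and the error message states the pipeline depth must be a multiple of 3. A minimal sketch of a valid variant, reusing the stages and queues defined in the test, simply uses a multiple of 3:

# Sketch only: same stages and queues as above, but 12 iterations
# (a multiple of the 3 pipeline stages), so no precondition error is raised.
def my_valid_net(c):
    return pipelining_ops.pipeline(
        [stage1, stage2, stage3],
        12,
        inputs=[c],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)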
Example #3
    def testKerasLenet(self):
        """Check that the output of PoplarExecutableRunner produces the same output as the original Graph execution.
    """
        if utils.running_on_ipu_model():
            self.skipTest(
                "PoplarExecutableRunner only works with physical IPUs")

        with tempfile.TemporaryDirectory() as tmp:
            poplar_binaries_folder = os.path.join(tmp, "poplar")
            model_path = os.path.join(tmp, "model")
            weights_file = os.path.join(tmp, "weights.bin")
            output_path = os.path.join(tmp, "output")
            input_values = np.random.uniform(size=(1, 32, 32, 1))
            input_file = "%s/input.bin" % tmp

            with self.session() as sess:

                self.configureIPU(poplar_binaries_folder, False)
                with ops.device("/device:IPU:0"):
                    out, inp, model = instantiate_lenet()

                utils.move_variable_initialization_to_cpu()
                sess.run(global_variables_initializer())

                utils.export_inputs_to_file([inp], input_file,
                                            {inp: input_values})

                # Run the model once to generate the poplar binaries.
                reference_values = sess.run(out, {inp: input_values})

                # Export the model & weights.
                saved_model.save(model, model_path)

            metadata_file = self.getSingleFileWithExt(poplar_binaries_folder,
                                                      "json")
            executable_file = self.getSingleFileWithExt(
                poplar_binaries_folder, "ipu_bin")

            self.runPythonCommand(
                (("./tensorflow/compiler/plugin/poplar/tools/"
                  "tensorflow_weights_extractor.py -o %s -s %s -m %s") %
                 (weights_file, model_path, metadata_file)).split())

            self.runCommand((("./third_party/ipus/tools/PoplarExecutableRunner"
                              " --binaries %s,%s,%s "
                              "--output_folder=%s --strict") % (
                                  executable_file,
                                  weights_file,
                                  input_file,
                                  output_path,
                              )).split())

            output_file = self.getSingleFileWithExt(output_path, "data")
            with open(output_file, 'r') as f:
                runner_values = np.array(json.load(f))
                logging.info("Reference %s\nRunner: %s", reference_values,
                             runner_values)
                self.assertAllClose(reference_values, runner_values)
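The test relies on small helpers, getSingleFileWithExt, runPythonCommand and runCommand, that are not part of this listing. Hypothetical implementations, assuming glob, subprocess and sys are imported in the test module, might look like:

    # Hypothetical helpers (assumed, not from the original listing).
    def getSingleFileWithExt(self, folder, ext):
        # Return the single file in `folder` with the given extension.
        files = glob.glob(os.path.join(folder, "*.%s" % ext))
        assert len(files) == 1, "expected one .%s file, got %s" % (ext, files)
        return files[0]

    def runPythonCommand(self, cmd):
        # Run a Python tool in a subprocess; fail loudly on a non-zero exit.
        subprocess.check_call([sys.executable] + cmd)

    def runCommand(self, cmd):
        subprocess.check_call(cmd)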
Example #4
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
  g = ops.Graph()

  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
            optimizer, num_batches_to_accumulate)
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    profiling = utils.running_on_ipu_model()

    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
    return session.run(outfeed_op)
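The helper above names its queues with next_feed_id(), since every infeed and outfeed in a process must have a unique feed name. A hypothetical implementation, assuming nothing beyond the standard library:

# Hypothetical helper (assumed): generate unique feed names from a counter.
import itertools

_feed_counter = itertools.count()

def next_feed_id():
  return "feed%d" % next(_feed_counter)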
Example #5
def validation_graph(opts, valid_data):
    # Do not apply dropout during validation
    opts.apply_dropout = False

    valid_graph = tf.Graph()
    tf_device_ordinal = 0 if opts.multiprocessing else 1
    with valid_graph.as_default():
        dataset, _, _ = valid_data.get_dataset(opts, is_training=False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, device_ordinal=tf_device_ordinal)

        with ipu_scope('/device:IPU:{}'.format(tf_device_ordinal)):
            def comp_fn():
                def body(sum_rmse_metric, *args, **kwargs):
                    data_tensors = args
                    observed_ratings, ground_truth = tf.split(
                        data_tensors[0], num_or_size_splits=2, axis=1)
                    rmse_metric = graph_builder(opts,
                                                observed_ratings=observed_ratings,
                                                ground_truth=ground_truth,
                                                type='VALID')
                    return sum_rmse_metric + rmse_metric

                return loops.repeat(opts.validation_batches_per_step,
                                    body,
                                    [tf.constant(0, tf.float32)],
                                    infeed)

            (sum_rmse_metric,) = ipu_compiler.compile(comp_fn, [])

        # Accuracy Ops
        rmse = sum_rmse_metric / opts.validation_batches_per_step

        valid_summary = tf.summary.scalar("RMSE/validation", rmse)
        valid_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_writer = tf.summary.FileWriter(
        opts.logs_path + '/valid',
        graph=valid_graph,
        flush_secs=30)

    ipu_options = util.get_config(opts)
    if opts.multiprocessing:
        ipu_options.configure_ipu_system()
    valid_sess = tf.Session(graph=valid_graph)

    return GraphOps(valid_graph,
                    valid_sess,
                    valid_init,
                    [rmse, valid_summary],
                    None,
                    infeed,
                    valid_saver,
                    valid_writer)
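Training (Example #1) and validation run in separate tf.Graph instances, so weights typically move between them through a checkpoint. A hedged sketch of that hand-over, assuming the GraphOps fields sketched after Example #1 and that `train` and `valid` are the objects returned by the two builders:

# Sketch only: push the latest training weights into the validation graph.
ckpt_path = train.saver.save(train.session, opts.logs_path + "/latest.ckpt")
valid.saver.restore(valid.session, ckpt_path)
valid.session.run(valid.infeed.initializer)
rmse, summary = valid.session.run(valid.ops)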
Example #6
def generic_train_graph(opts, is_training):
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(opts, uid_embedding, mid_embedding, cat_embedding, placeholders['learning_rate'], uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen, use_negsampling=False)

                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body, [tf.constant(0, getattr(np, 'float32'))] * 3, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [x / opts['batches_per_step'] for x in outputs_train]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(ipu_options,
                                                 combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess,
                    init,
                    ops_train,
                    placeholders,
                    infeed_train,
                    outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example #7
def train():
    graph = tf.Graph()
    with graph.as_default():
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        #         dataset = tf.data.Dataset.from_tensors(np.array([1,2,3,4,5,6,7,8,9,0]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])
        with ipu_scope('/device:IPU:0'):

            def compile_fn():
                def body(x, y):
                    #                     z1, z2 = model1(x, y, time_steps_ph)
                    #                     outfeed = outfeed_queue.enqueue({'z1':z1, 'z2':z2})
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                return loops.repeat(1, body, [], infeed_queue)

        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        outputs = ipu_compiler.compile(compile_fn, [])

        dequeue_outfeed = outfeed_queue.dequeue()
    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    i = 0
    while i < steps:
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
        i = i + 1
        break
Example #8
  def testDuplicateInputsOutputs(self):
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

    def stage1(x, y):
      return x, y, y, x

    # The above should be optimised to a single copy for each duplicate output.
    def stage2(x1, y1, y2, x2):
      return x1, y1, y2, x2

    # Same for this stage
    def stage3(_x1, _y1, y2, x2):
      return x2, y2

    def model_pipeline(x, y):
      return pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          12,
          inputs=[x, y],
          outfeed_queue=outfeed_queue,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      y = array_ops.placeholder(np.float32, shape=[1, 2])

    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                     inputs=[x, y])

    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    cfg = utils.auto_select_ipus(cfg, 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    #TODO(T10784) test how many IPU copies are here once we insert IPU copies.
    outfeed_op = outfeed_queue.dequeue()
    with tu.ipu_session() as sess:
      sess.run(compiled_model_pipeline, {
          x: np.ones(x.shape),
          y: np.ones(y.shape)
      })
      output = sess.run(outfeed_op)
      for i in range(12):
        self.assertAllClose(output[0][i], np.ones(x.shape))
        self.assertAllClose(output[1][i], np.ones(y.shape))
Example #9
    def test_augru(self):
        seqlen = 3
        bs = 3
        inputs_value = np.ones([bs, seqlen, self.HIDDEN_SIZE], np.float32)
        seq_len_value = np.array([1, 3, 2], np.int32)

        alphas_value = np.ones([seqlen, bs], np.float32)
        alphas_value = alphas_value * 0.5
        inputs = tf.placeholder(shape=[bs, seqlen, self.HIDDEN_SIZE], dtype=self.model_dtype)
        seq_len = tf.placeholder(shape=[bs], dtype=tf.int32)
        alphas = tf.placeholder(shape=[seqlen, bs], dtype=self.model_dtype)

        cfg = utils.create_ipu_config(profiling=False, profile_execution=False)
        cfg = utils.auto_select_ipus(cfg, 1)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        with ops.device("/device:IPU:0"):
            train_ipu = ipu_compiler.compile(self.augru_model, inputs=[inputs, seq_len, alphas])
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for var in tf.global_variables():
                if var.name == 'popnn_augru/kernel:0':
                    augru_kernel = np.array([[0.3188401, 0.8256132, -0.12287354, 0.8648142, -0.17983055, -0.45415568],
                                            [-0.29249465, 0.65579015, -0.75681853, 0.4331085, -0.07700777, -0.47652483],
                                            [-0.20116574, 0.52735907, -0.08258069, -0.21897888, -0.54514384, 0.32709408],
                                            [-0.43361932, -0.62175727, 0.28278595, 0.13071388, -0.29585528, -0.14058399]])
                    augru_kernel_var = var
            sess.run(tf.assign(augru_kernel_var, augru_kernel))
            outputs_expected = np.array([[[-0.15881832, -0.39365855], [0., 0.], [0., 0.]],
                                        [[-0.15881832, -0.39365855], [-0.1270374, -0.56743807], [-0.09283338, -0.6407641]],
                                        [[-0.15881832, -0.39365855], [-0.1270374, -0.56743807], [0., 0.]]])
            outputs = sess.run(train_ipu, feed_dict={inputs: inputs_value, seq_len: seq_len_value, alphas: alphas_value})
            augru_kernel_updated = sess.run(augru_kernel_var)
            augru_kernel_expected = np.array([[0.31478855, 0.81888944, -0.12453551, 0.863326, -0.40852502, -0.5518727],
                                             [-0.2965462, 0.6490664, -0.7584805, 0.4316203, -0.30570224, -0.5742418],
                                             [-0.20129025, 0.52758944, -0.08233033, -0.21876118, -0.5368969, 0.3306306],
                                             [-0.43399453, -0.6211322, 0.28351453, 0.13140172, -0.25127774, -0.12138209]])
            self.assertAlmostEqual(np.mean(outputs - outputs_expected),
                                   np.float32(0.0), delta=1e-7)
            self.assertAlmostEqual(np.mean(augru_kernel_expected - augru_kernel_updated),
                                   np.float32(0.0), delta=1e-8)
Example #10
    def testWeightsExportersNoMetadata(self):
        """ Check that the weights extractor produces the same output with
     TF v1 and v2 models."""
        # Disable the IPU model
        poplar_flags = os.environ.get("TF_POPLAR_FLAGS",
                                      "").replace("--use_ipu_model", "")
        with test.mock.patch.dict("os.environ",
                                  {"TF_POPLAR_FLAGS": poplar_flags
                                   }), tempfile.TemporaryDirectory() as tmp:
            model_path_keras = os.path.join(tmp, "model_keras")
            model_path_session = os.path.join(tmp, "model_session")
            weights_keras = os.path.join(tmp, "weights_keras.bin")
            weights_session = os.path.join(tmp, "weights_session.bin")

            with self.session() as sess:
                self.configureIPU()
                with ops.device("/device:IPU:0"):
                    _, _, model = instantiate_lenet()
                utils.move_variable_initialization_to_cpu()
                sess.run(global_variables_initializer())

                # Export the model & weights.
                saved_model.save(model, model_path_keras)
                Saver().save(sess, model_path_session)

            self.runPythonCommand(
                (("./tensorflow/compiler/plugin/poplar/tools/"
                  "tensorflow_weights_extractor.py -o %s -s %s") %
                 (weights_keras, model_path_keras)).split())

            self.runPythonCommand(
                (("./tensorflow/compiler/plugin/poplar/tools/"
                  "tensorflow_weights_extractor.py -o %s -s %s") %
                 (weights_session, model_path_session)).split())

            with open(weights_session, 'rb') as s, open(weights_keras,
                                                        'rb') as k:
                self.assertEqual(s.read(), k.read())
Example #11
    def testWeightsExportersMetadataLive(self):
        """Export weights directly from a live model.
    """
        poplar_flags = os.environ.get("TF_POPLAR_FLAGS",
                                      "").replace("--use_ipu_model", "")
        with test.mock.patch.dict("os.environ",
                                  {"TF_POPLAR_FLAGS": poplar_flags
                                   }), tempfile.TemporaryDirectory() as tmp:
            poplar_binaries_folder = os.path.join(tmp, "poplar")
            weights_keras = os.path.join(tmp, "weights_keras.bin")
            weights_session = os.path.join(tmp, "weights_session.bin")

            with self.session() as sess:
                self.configureIPU(poplar_binaries_folder)
                with ops.device("/device:IPU:0"):
                    out, inp, model = instantiate_lenet_fix_weights()

                utils.move_variable_initialization_to_cpu()
                sess.run(global_variables_initializer())

                # Run the model once to generate the poplar binaries.
                try:
                    sess.run(out, {inp: np.ones((1, 32, 32, 1))})
                except errors.InvalidArgumentError:
                    pass

            metadata_file = self.getSingleFileWithExt(poplar_binaries_folder,
                                                      "json")

            with self.session() as sess:
                self.configureIPU()
                with ops.device("/device:IPU:0"):
                    _, _, _ = instantiate_lenet_fix_weights()

                utils.move_variable_initialization_to_cpu()
                sess.run(global_variables_initializer())

                utils.export_variables_from_live_session(
                    sess, weights_session, metadata_file)

            with self.session() as sess:
                self.configureIPU()
                with ops.device("/device:IPU:0"):
                    _, _, model = instantiate_lenet_fix_weights()

                utils.move_variable_initialization_to_cpu()
                sess.run(global_variables_initializer())
                utils.export_variables_from_live_model(model, weights_keras,
                                                       metadata_file)

            with open(weights_session, 'rb') as s, open(weights_keras,
                                                        'rb') as k:
                self.assertEqual(s.read(), k.read())
Example #12
def run_mnist(opts):
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)

    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)

    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]

    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size *
                                      opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)

    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch

    if not (batches_per_epoch % opts.steps_per_epoch) == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))

        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()

        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)

    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    if opts.restore:
        logpath = os.path.join(opts.checkpoint_path, opts.restore)
    else:
        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer)

        if opts.restore:
            saver.restore(sess, logpath + '/model.ckpt')

        if opts.test_mode in ["all", "training"]:
            logger.info(f"Training...")
            start = opts.start_epoch if opts.restore else 0
            progress = tqdm(
                range(start, opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    t1 = time.perf_counter()
                    sess.run(train_loop,
                             feed_dict={lr_placeholder: scheduler(e, opts)})
                    t2 = time.perf_counter()
                    sess_time = t2 - t1
                    batch_time = sess_time / batches_per_step
                    throughput = batch_size / batch_time
                    logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                f"Time per batch: {batch_time:0.6f} "
                                f"Throughput: {throughput}")

                    if opts.single_train_step_only:
                        return

                    train_outputs = sess.run(dequeue_train_outfeed)
                    if opts.pipelining:
                        train_outputs = train_outputs[-1]

                    # Get the last value for all items:
                    for k, v in train_outputs.items():
                        train_outputs[k] = v[-1]
                    logger.debug(f"Train outputs: {train_outputs.keys()}")

                    # Merge prune and grow fetches with last fetches:
                    if dequeue_prune_and_grow_outfeed is not None:
                        png_data = sess.run(dequeue_prune_and_grow_outfeed)
                        for k in png_data:
                            png_data[k] = png_data[k][-1]
                        logger.debug(
                            f"Prune and grow outputs: {png_data.keys()}")

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            var_name = fc.get_values_var().name
                            logger.info(
                                f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                            )
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                )
                            if i == 0 and e == opts.start_epoch:
                                metainfo = sess.run(fc.get_metainfo_var())
                            else:
                                metainfo = None
                            if not opts.disable_pruning:
                                logger.info(
                                    f"Starting prune and grow for layer {name}"
                                )
                                t0 = time.perf_counter()
                                prune_sched = prune_and_grow(name,
                                                             fc,
                                                             png_data,
                                                             random_gen,
                                                             steps,
                                                             total_steps,
                                                             opts,
                                                             metainfo=metainfo)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                )
                                logger.info(
                                    f"Pruned proportion: {prune_sched}")
                                if opts.use_wandb:
                                    wandb.log({'Prune Schedule': prune_sched},
                                              commit=False)

                    if opts.log:
                        log_file.write(
                            f"{batches_processed} {train_outputs['acc']}\n")
                    if opts.use_wandb:
                        wandb.log(
                            {
                                'Loss': train_outputs['mean_loss'],
                                'Accuracy': train_outputs['acc'],
                                'Throughput': throughput
                            },
                            commit=True)
                    progress.set_description(
                        f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                    )

                    # Only need to feed an updated sparsity representation if we are running RigL:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                if e % opts.checkpoint_freq == 0:
                    logger.info(f"Saving...")
                    saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            logger.info(f"Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer)
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
            )
            if opts.use_wandb:
                wandb.run.summary["Test Loss"] = test_loss
                wandb.run.summary["Test Accuracy"] = test_acc
Example #13
 def test_gru(self):
     seqLen = 2
     bs = 3
     inputs_value = np.array(
         [[[1., 1.], [1., 1.]], [[1., 1.], [1., 1.]], [[1., 1.], [1., 1.]]],
         np.float32)
     seq_len_value = np.array([1, 2, 2], np.int32)
     inputs = tf.placeholder(shape=[bs, seqLen, self.HIDDEN_SIZE],
                             dtype=self.model_dtype)
     seq_len = tf.placeholder(shape=[bs], dtype=tf.int32)
     cfg = IPUConfig()
     cfg.auto_select_ipus = 1
     cfg.configure_ipu_system()
     utils.move_variable_initialization_to_cpu()
     with ops.device("/device:IPU:0"):
         train_ipu = ipu_compiler.compile(self.gru_model,
                                          inputs=[inputs, seq_len])
     with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())
         for var in tf.global_variables():
             if var.name == 'popnn_dynamic_gru/kernel:0':
                 gru_kernel = np.array([[
                     0.36324948, 0.34305102, -0.47945526, 0.29105264,
                     -0.55362725, 0.33607864
                 ],
                                        [
                                            -0.20881158, 0.79369456,
                                            0.3866263, -0.55099547,
                                            0.41944432, 0.39612126
                                        ],
                                        [
                                            0.48400682, 0.16632384,
                                            -0.78809285, 0.47519642,
                                            0.4464376, -0.63623476
                                        ],
                                        [
                                            -0.57933414, -0.29082513,
                                            -0.7381171, 0.77089626,
                                            -0.24111485, 0.9164796
                                        ]])
                 gru_kernel_var = var
         sess.run(tf.assign(gru_kernel_var, gru_kernel))
         outputs_expected = np.array([[[-0.03196924, 0.06592286], [-0, 0]],
                                      [[-0.03196924, 0.06592286],
                                       [-0.06241067, 0.12973404]],
                                      [[-0.03196924, 0.06592286],
                                       [-0.06241067, 0.12973404]]])
         outputs = sess.run(train_ipu,
                            feed_dict={
                                inputs: inputs_value,
                                seq_len: seq_len_value
                            })
         gru_kernel_updated = sess.run(gru_kernel_var)
         gru_kernel_expected = np.array([[
             0.35011762, 0.37606436, -0.4793783, 0.29105875, -0.6845508,
             0.3001622
         ],
                                         [
                                             -0.22194342, 0.8267079,
                                             0.38670325, -0.55098933,
                                             0.28852075, 0.36020482
                                         ],
                                         [
                                             0.48412853, 0.16602053,
                                             -0.7880953, 0.4751962,
                                             0.4473563, -0.6360037
                                         ],
                                         [
                                             -0.57958513, -0.2901997,
                                             -0.73811203, 0.7708967,
                                             -0.24294817, 0.9160184
                                         ]])
         self.assertAlmostEqual(np.mean(outputs - outputs_expected),
                                np.float32(0.0),
                                delta=1e-7)
         self.assertAlmostEqual(np.mean(gru_kernel_expected -
                                        gru_kernel_updated),
                                np.float32(0.0),
                                delta=1e-8)
Example #14
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, the data stream is split into N streams, so each stream
    # only needs to process 1/N of the batches; batches_per_step must therefore
    # be at least N (the replication factor).
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        kwargs = {} if opts.replication_factor == 1 else {'replication_factor': opts.replication_factor}
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset, f"{mode_name}_dataset_infeed", **kwargs)

        with ipu_scope(f'/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(opts,
                                                        observed=batch[:, :-1],
                                                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                                                        learning_rate=placeholders['learning_rate'] if training else None,
                                                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse
                return loops.repeat(batches_per_step,
                                    body,
                                    [tf.constant(0, getattr(np, opts.dtypes[0]))]*2,
                                    infeed)
            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None
        if opts.compiler_report:
            if training:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

    writer = tf.summary.FileWriter(
        opts.logs_path + f'/{mode_name}',
        graph=graph,
        flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    if (not training and opts.multiprocessing) or training:
        config = ipu_utils.create_ipu_config(profiling=training,
                                             use_poplar_text_report=True,
                                             max_cross_replica_sum_buffer_size=10000000,
                                             max_inter_ipu_copies_buffer_size=10000000)
        if opts.select_ipus == 'AUTO':
            config = ipu_utils.auto_select_ipus(config, [opts.replication_factor])
        else:
            config = ipu_utils.select_ipus(config, [opts.select_ipus[not training]])
        config = ipu_utils.set_compilation_options(config, {"prng.enable": str(opts.prng).lower()})
        ipu_utils.configure_ipu_system(config)

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph,
                    sess,
                    init,
                    graph_outputs,
                    placeholders if training else None,
                    infeed,
                    saver,
                    writer,
                    report,
                    trainFlag)
Example #15
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, the data stream is split into N streams, so each stream
    # only needs to process 1/N of the batches; batches_per_step must therefore
    # be at least N (the replication factor).
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope(f'/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=placeholders['learning_rate']
                        if training else None,
                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse

                return loops.repeat(
                    batches_per_step, body,
                    [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2, infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None

    writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                   graph=graph,
                                   flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    if (not training and opts.multiprocessing) or training:
        ipu_config = IPUConfig()

        ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
        ipu_config.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000

        if opts.compile_only:
            ipu_config.device_connection.version = opts.compile_only_ipu_version
            ipu_config.device_connection.enable_remote_buffers = True
            ipu_config.device_connection.type = ipu_utils.DeviceConnectionType.PRE_COMPILE

        if opts.select_ipus == 'AUTO':
            ipu_config.auto_select_ipus = [opts.replication_factor]
        else:
            ipu_config.select_ipus = [opts.select_ipus[not training]]

        ipu_config.floating_point_behaviour.esr = opts.prng
        ipu_config.configure_ipu_system()

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if training else None, infeed, saver, writer,
                    trainFlag)
Example #16
                                 initializer=tf.constant_initializer(0.0),
                                 dtype=datatype)

        return tf.nn.xw_plus_b(x, weights, biases)


if __name__ == "__main__":
    args = parse_args()

    x = tf.placeholder(datatype, shape=[1, NUM_UNITS_IN])

    with scopes.ipu_scope("/device:IPU:0"):
        logits = model(x)

    if args.var_init_on_cpu:
        utils.move_variable_initialization_to_cpu()

    with tf.device('cpu'):
        # Event trace
        trace = gen_ipu_ops.ipu_event_trace()

    # Create a config with profiling on
    opts = utils.create_ipu_config(profiling=True,
                                   use_poplar_text_report=not args.json_report,
                                   profile_execution=args.profile_execution)
    opts = utils.auto_select_ipus(opts, 1)
    utils.configure_ipu_system(opts)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        # The "trace" op constantly profiles everything that happens on the IPU, from the moment it's created.
        # Executing the trace op flushes everything it has recorded up to that point and outputs it.
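        # Hedged sketch of the usual continuation (not in the original listing):
        # run the model, then fetch and decode the accumulated trace events.
        # Assumes numpy is imported as np.
        session.run(logits, feed_dict={x: np.ones([1, NUM_UNITS_IN])})
        raw_events = session.run(trace)
        print(utils.extract_all_strings_from_event_trace(raw_events))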
Example #17
def run(benchmark, opts):
    '''
    Run the benchmark.

    benchmark - An instance of Benchmark
    opts - Namespace from argparse generated from parse_opts
    '''
    # Build graph
    with tf.device('cpu'):
        dataset = tf.data.Dataset \
            .range((opts.steps + 2) * opts.batches_per_step) \
            .map(lambda i: benchmark.inputs(opts, i)) \
            .repeat() \
            .prefetch(opts.batches_per_step)

        if opts.batches_per_step > 1 or opts.replicas > 1:
            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                dataset,
                feed_name="benchmark_dataset_infeed",
                replication_factor=opts.replicas)
            data_init = infeed_queue.initializer
        else:
            data_tensor = dataset.make_one_shot_iterator().get_next()
            data_init = tf.no_op()

    with ipu_scope('/device:IPU:0'):
        if opts.batches_per_step > 1:
            with tf.Graph().as_default():  # To get the shape and dtype
                dummy_opts = copy.deepcopy(opts)
                dummy_opts.shards = 1
                d = benchmark.inputs(dummy_opts, tf.constant(0))
                out = benchmark.graph_builder(dummy_opts, d)
            input = tf.constant(0, out.dtype, shape=out.shape)

            def body(inout, *args, **kwargs):
                with tf.control_dependencies([inout]):
                    # Run graph
                    out = benchmark.graph_builder(opts, kwargs)
                return out

            out = ipu_compiler.compile(
                lambda: loops.repeat(opts.batches_per_step, body, [input],
                                     infeed_queue), [])
        else:
            opts.batches_per_step = 1
            if opts.replicas > 1:
                out = ipu_compiler.compile(
                    lambda: benchmark.graph_builder(opts, infeed_queue), [])
            else:
                out = ipu_compiler.compile(
                    lambda: benchmark.graph_builder(opts, data_tensor), [])

    # Report
    if opts.report:
        report = gen_ipu_ops.ipu_event_trace()

    # Dump the graph to a logdir
    if opts.save_graph:
        writer = tf.summary.FileWriter(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs',
                         time.strftime('%Y%m%d_%H%M%S_%Z')))
        writer.add_graph(tf.get_default_graph())

    utils.configure_ipu_system(get_config(opts))
    utils.move_variable_initialization_to_cpu()

    with tf.Session() as sess:
        # Setup
        sess.run(data_init)
        if benchmark.initializer is not None:
            sess.run(benchmark.initializer())
        if benchmark.initializer_sess is not None:
            benchmark.initializer_sess(sess)
        if opts.report:
            sess.run(report)

        # Warmup
        print("Compiling and Warmup...")
        start = time.time()
        sess.run(out)
        duration = time.time() - start
        print("Duration: {:.3f} seconds\n".format(duration))

        # Cycle Report
        if opts.report:
            rep = sess.run(report)
            return extract_runtimes_from_report(
                rep, opts,
                display=True)  # Only run once if producing cycle report

        print("Executing...")
        average_batches_per_sec = 0
        # steps
        for i in range(opts.steps):
            # Run
            start = time.time()
            sess.run(out)
            duration = time.time() - start

            average_batches_per_sec += (opts.batches_per_step * opts.replicas /
                                        duration) / opts.steps
            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += "   " + benchmark.iteration_report(opts, duration)
            print(report_string)

        return average_batches_per_sec
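run() only assumes a small interface on the benchmark object it is given. A hypothetical minimal benchmark, with method names taken from the calls above and an assumed opts.batch_size attribute, might be:

# Hypothetical minimal benchmark satisfying the interface used by run() above
# (single-replica, batches_per_step > 1 path).
class SumBenchmark:
    initializer = None        # optional: callable returning an init op
    initializer_sess = None   # optional: callable taking the tf.Session

    def inputs(self, opts, index):
        # One dict of input tensors per dataset element.
        return {"x": tf.random_uniform([opts.batch_size, 128])}

    def graph_builder(self, opts, inputs):
        # The computation to be timed.
        return tf.reduce_sum(inputs["x"])

    def iteration_report(self, opts, duration):
        items = opts.batch_size * opts.batches_per_step * opts.replicas
        return "{:.1f} items/sec".format(items / duration)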
Example #18
def test_embedding(config, phase):
    # define input
    indices = np.random.randint(
        0, test_config.vocab_size,
        (test_config.batch_size, test_config.sequence_length)).astype(np.int32)
    positions = np.reshape(
        np.arange(test_config.sequence_length),
        (test_config.batch_size, test_config.sequence_length)).astype(np.int32)
    segments = np.random.randint(
        0, 2,
        (test_config.batch_size, test_config.sequence_length)).astype(np.int32)
    inputs = [indices, positions, segments]

    # build model
    # PyTorch model
    torch_config = TorchBertConfig(
        vocab_size_or_config_json_file=test_config.vocab_size,
        hidden_size=test_config.hidden_size,
        hidden_act=test_config.hidden_act,
        num_attention_heads=test_config.num_attention_heads,
        hidden_dropout_prob=test_config.hidden_dropout_prob,
        max_position_embeddings=test_config.max_position_embeddings,
        type_vocab_size=test_config.type_vocab_size,
        update_embedding_dict=True,
        layer_norm_eps=test_config.layer_norm_eps)
    torch_model = TorchBertEmbeddings(torch_config)
    torch_model.eval()

    # TF model
    tf_config = TFBertConfig(
        vocab_size=test_config.vocab_size,
        hidden_size=test_config.hidden_size,
        hidden_act=test_config.hidden_act,
        num_attention_heads=test_config.num_attention_heads,
        max_position_embeddings=test_config.max_position_embeddings,
        max_predictions_per_seq=test_config.max_predictions_per_seq,
        hidden_dropout_prob=test_config.hidden_dropout_prob,
        type_vocab_size=test_config.type_vocab_size,
        initializer_range=test_config.initializer_range,
        dtype=test_config.dtype,
        matmul_serialize_factor=test_config.matmul_serialize_factor,
        static_mask=False)

    # forward check
    if phase == "fwd":
        torch_outputs = run_fwd_model(inputs, torch_model)

        with tf.Graph().as_default():
            tf_model = TFBertModel(tf_config, is_training=True)

            with ops.device('cpu'):
                input_ids = tf.placeholder(shape=[
                    test_config.batch_size, test_config.sequence_length
                ],
                                           dtype=tf.int32)
                position_ids = tf.placeholder(shape=[
                    test_config.batch_size, test_config.sequence_length
                ],
                                              dtype=tf.int32)
                segment_ids = tf.placeholder(shape=[
                    test_config.batch_size, test_config.sequence_length
                ],
                                             dtype=tf.int32)
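            # Configure a single IPU and keep variable initialisation on the host
            # before compiling the embeddings layer for the device.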
            cfg = utils.create_ipu_config()
            cfg = utils.auto_select_ipus(cfg, 1)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()
            with ops.device("/device:IPU:0"):
                opt = ipu_compiler.compile(
                    tf_model.embeddings_layer,
                    inputs=[input_ids, position_ids, segment_ids])

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                # copy PyTorch weights to TF
                var_and_init = copy_torch_weights_to_tf(
                    torch_model, tf_model, TF_TO_TORCH, {}, sess)
                sess.run(var_and_init)
                # run the TF forward pass
                tf_outputs = sess.run(
                    opt, {
                        input_ids: indices,
                        position_ids: positions,
                        segment_ids: segments
                    })
                # compare tf output with pytorch output
                check_tensors(tf_outputs, torch_outputs, margin=1.5e-8)

    # backward check
    elif phase == "bwd":
        l1_lambda = 0.1
        base_lr = 0.01
        optim = torch.optim.SGD(torch_model.parameters(),
                                base_lr,
                                weight_decay=0.0,
                                momentum=0.0)

        torch_output = torch_model(
            *[torch.from_numpy(t).long() for t in inputs])
        # pytorch backward
        torch_loss = l1_lambda * torch.norm(torch_output, 1)
        torch_loss.backward()  # calculate gradients
        optim.step()  # update gradients
        torch_outputs = [torch_output.detach().numpy()]

        # TF
        with tf.Graph().as_default():
            tf_model = TFBertModel(tf_config, is_training=True)
            with ops.device('cpu'):
                input_ids = tf.placeholder(shape=[
                    test_config.batch_size, test_config.sequence_length
                ],
                                           dtype=tf.int32)
                position_ids = tf.placeholder(shape=[
                    test_config.batch_size, test_config.sequence_length
                ],
                                              dtype=tf.int32)
                segment_ids = tf.placeholder(shape=[
                    test_config.batch_size, test_config.sequence_length
                ],
                                             dtype=tf.int32)
            cfg = utils.create_ipu_config()
            cfg = utils.auto_select_ipus(cfg, 1)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            def embedding_graph(input_ids, position_ids, segment_ids):
                embedding_output = tf_model.embeddings_layer(
                    input_ids, position_ids, segment_ids)
                l1_loss = l1_lambda * tf.norm(embedding_output, 1)
                optimizer = tf.train.GradientDescentOptimizer(base_lr)
                train_step = optimizer.minimize(l1_loss)
                return embedding_output, l1_loss, train_step

            with ops.device("/device:IPU:0"):
                opt = ipu_compiler.compile(
                    embedding_graph,
                    inputs=[input_ids, position_ids, segment_ids])

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                var_and_init = copy_torch_weights_to_tf(
                    torch_model, tf_model, TF_TO_TORCH, {}, sess)
                sess.run(var_and_init)
                tvars = sess.run({v.name: v for v in tf.trainable_variables()})
                print(tvars)
                tf_outputs, tf_loss = sess.run(
                    opt, {
                        input_ids: indices,
                        position_ids: positions,
                        segment_ids: segments
                    })
                # sess.run(opt, {input_ids: indices, position_ids: positions, segment_ids: segments})
                # Compare the TF and PyTorch model weights after the update
                check_tf_torch_model(sess,
                                     torch_model,
                                     TF_TO_TORCH,
                                     margin=5e-7)
            check_tensors(torch_outputs, tf_outputs, margin=5e-7)
    else:
        raise ValueError(
            "`phase` can only be set to `fwd` or `bwd`, meaning forward or backward respectively."
        )
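# A minimal usage sketch, assuming `test_config` is the module-level configuration
# object that test_embedding reads: run both the forward and backward checks.
for phase in ("fwd", "bwd"):
    test_embedding(test_config, phase)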
Exemple #19
0
  def testPipelineWithInfeedsKwargs(self):
    with tu.ipu_session() as sess:
      dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
      dataset = dataset.batch(batch_size=2, drop_remainder=True)

      def dataset_parser(value):
        a = value
        b = (value + 10.) / 2.0
        return {"a": a, "b": b}

      dataset = dataset.map(dataset_parser)
      infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed6")
      outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed6")

      def stage1(c, **kwargs):
        with variable_scope.variable_scope("vs", use_resource=True):
          y = layers.Conv2D(2,
                            1,
                            use_bias=True,
                            kernel_initializer=init_ops.ones_initializer(),
                            name='conv1')(kwargs["a"])
          return y + kwargs["b"], c

      def stage2(x, c):
        return math_ops.reduce_sum(x) + c

      def stage3(x):
        return x

      def my_net(c):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            12,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

      with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

      with ops.device("/device:IPU:0"):
        r = ipu_compiler.compile(my_net, inputs=[c])

      cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
      cfg = utils.auto_select_ipus(cfg, 4)
      utils.configure_ipu_system(cfg)
      utils.move_variable_initialization_to_cpu()

      outfeed_op = outfeed_queue.dequeue()

      report = tu.ReportJSON(self, sess, configure_device=False)
      report.reset()
      sess.run(variables.global_variables_initializer())
      sess.run(infeed_queue.initializer)
      sess.run(r, {c: 10.01})
      losses_pipeline = sess.run(outfeed_op)
      self.assertAllClose(losses_pipeline, [[
          410.01, 730.01, 650.01, 570.01, 890.01, 410.01, 730.01, 650.01,
          570.01, 890.01, 410.01, 730.01
      ]])
      report.parse_log()
      report.assert_pipeline_stages_on_expected_ipu((0, 1, 3))
Exemple #20
0
def generic_graph(opts, is_training):
    master_dtype = get_tf_datatype(opts)
    graph = tf.Graph()

    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(master_dtype, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, opts['seed'])
        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts)
        else:
            dataset = get_dataset_embed(opts, False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()
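        # The infeed streams dataset batches to the IPU; the outfeed returns
        # per-batch results to the host.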

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         sl):
                    prob, accuracy = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        sl,
                        use_negsampling=False)
                    with tf.control_dependencies([prob]):
                        return outfeed_queue.enqueue((prob, target, accuracy))

                return loops.repeat(opts['batches_per_step'], body, [], infeed)

            outputs = ipu_compiler.compile(comp_fn, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.train.Saver()

        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        if opts['use_ipu_model']:
            os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()

    graph_outputs = [outputs]

    sess = tf.Session(graph=graph)

    return GraphOps(graph, sess, init, graph_outputs, placeholders, infeed,
                    outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Exemple #21
0
    def train(self):
        with tf.device("cpu"):
            dataset, infeed_queue, data_init, vocab = self._build_dataset()
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
                feed_name="outfeed")
        if self.host_embeddings:
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding",
            )
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding",
            )

        def build_common(src_embedding, tgt_embedding, source, target, label,
                         mask):
            nonlocal outfeed_queue
            input_, encoder_outputs, encoder_state = self._build_encoder(
                src_embedding, source)
            samples, logits = self._build_decoder(encoder_outputs,
                                                  encoder_state,
                                                  tgt_embedding,
                                                  target,
                                                  train=True)
            loss = self._build_optimiser(logits, label, mask)
            outfeed = outfeed_queue.enqueue({"loss": loss, "logits": logits})
            return outfeed

        def build_train(source, target, label, mask):
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding",
            )
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding",
            )
            return build_common(src_embedding, tgt_embedding, source, target,
                                label, mask)

        def build_train_host_embeddings(source, target, label, mask):
            nonlocal src_embedding, tgt_embedding
            return build_common(src_embedding, tgt_embedding, source, target,
                                label, mask)

        with ipu_scope("/device:IPU:0"):
            build = build_train_host_embeddings if self.host_embeddings else build_train
            batch = ipu_compiler.compile(lambda: loops.repeat(
                self.opts.batches_per_step,
                build,
                infeed_queue=infeed_queue,
                inputs=[],
            ))

        # Create a restoring object
        saver = tf.train.Saver()

        if self.opts.save_graph:
            # Dump the graph to a logdir
            writer = tf.summary.FileWriter(
                os.path.join("./logs", "NMT",
                             time.strftime("%Y%m%d_%H%M%S_%Z")))
            writer.add_graph(tf.get_default_graph())

        ipu_options = util.get_config(report_n=0)
        utils.configure_ipu_system(ipu_options)
        session = tf.Session()
        checkpoint = CHECKPOINT_FILE + ("host_ckpt" if
                                        self.opts.host_embeddings else "ckpt")
        if self.opts.ckpt:
            saver.restore(session, checkpoint)
        else:
            utils.move_variable_initialization_to_cpu()
            session.run(tf.global_variables_initializer())
        session.run(data_init)
        print("Init done.")
        if self.host_embeddings:
            batch = [
                batch,
                src_embedding(self.opts.batches_per_step, 1),
                tgt_embedding(self.opts.batches_per_step, 1),
            ]
        result_queue = outfeed_queue.dequeue()
        session.run(batch)  # Warmup
        best_loss = float("Inf")
        for e in range(self.opts.iterations):
            start = time.time()
            session.run(batch)
            result = session.run(result_queue)
            l = result["loss"]
            avg_loss = np.mean(l)
            duration = (time.time() - start) / self.opts.batches_per_step

            print(
                "Step: {:>5}. Average Loss {:.3}. Items/sec {:.4}. Tokens/sec {}"
                .format(
                    (e + 1),
                    avg_loss,
                    self.opts.batch_size / duration,
                    self.opts.batch_size *
                    (self.src_length + self.tgt_length) / duration,
                ))
            if avg_loss < best_loss:
                best_loss = avg_loss
                saver.save(session, checkpoint)
Exemple #22
0
    def train(self):
        def build_train():
            embedding = Nmt._build_embedding(self.src_vocab_size,
                                             self.opts.embedding_size,
                                             name="source_embedding")
            input_, encoder_outputs, encoder_state = self._build_encoder(
                embedding)
            embedding = Nmt._build_embedding(self.tgt_vocab_size,
                                             self.opts.embedding_size,
                                             name="tgt_embedding")
            samples, logits = self._build_decoder(encoder_outputs,
                                                  encoder_state,
                                                  embedding,
                                                  train=True)
            loss, update = self._build_optimiser(logits)
            return loss, samples, logits, update

        with ipu_scope('/device:IPU:0'):
            data, _ = self._build_inputs()
            batch = ipu_compiler.compile(build_train, [])

        # Create a restoring object
        saver = tf.train.Saver()

        if self.opts.save_graph:
            # Dump the graph to a logdir
            writer = tf.summary.FileWriter(
                os.path.join('./logs', 'NMT',
                             time.strftime('%Y%m%d_%H%M%S_%Z')))
            writer.add_graph(tf.get_default_graph())

        ipu_options = util.get_config(report_n=0)
        utils.configure_ipu_system(ipu_options)
        session = tf.Session()
        checkpoint = CHECKPOINT_FILE + 'ckpt'
        if self.opts.ckpt:
            saver.restore(session, checkpoint)
        else:
            utils.move_variable_initialization_to_cpu()
            session.run(tf.global_variables_initializer())
        print("Init done.")

        session.run(batch, feed_dict=next(data))  # Warmup
        duration = 0
        avg_loss = 0
        best_loss = float('Inf')
        for e in range(1, 1 + self.opts.steps):
            start = time.time()
            l, _, _ = session.run(batch, feed_dict=next(data))
            duration += time.time() - start
            avg_loss += l
            if (e <= 1000 and not e % 100) or not e % 1000:
                duration /= 100 if e <= 1000 else 1000
                avg_loss /= 100 if e <= 1000 else 1000
                print(
                    "Step: {:>5}. Average Loss {:.3}. Items/sec {:.4}. Tokens/sec {}"
                    .format(
                        e, avg_loss, self.opts.batch_size / duration,
                        self.opts.batch_size *
                        (self.src_length + self.tgt_length) / duration))
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    saver.save(session, checkpoint)
                duration = 0
                avg_loss = 0
Exemple #23
0
    def test_pipelining(self):
        gradient_accumulation_count = 4
        local_batch_size = 2

        features = np.ones((1, 20), dtype=np.float32) * hvd.rank()
        labels = np.ones(1, dtype=np.int32) * hvd.rank()
        dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
        dataset = dataset.repeat().batch(local_batch_size, drop_remainder=True)

        loss_vals = []

        strategy = IPUHorovodStrategy()

        with strategy.scope():

            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "infeed")
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

            def stage1(lr, images, labels):
                partial = keras.layers.Dense(32, activation="relu")(images)
                partial = keras.layers.Dense(16, activation="relu")(partial)
                return lr, partial, labels

            def stage2(lr, partial, labels):
                logits = keras.layers.Dense(10)(partial)
                per_example_loss = keras.losses.sparse_categorical_crossentropy(
                    y_true=labels, y_pred=logits, from_logits=True)
                # In a custom training loop, the optimiser does an allreduce *sum*, not
                # average, of the gradients across the distributed workers. Therefore
                # we want to divide the loss here by the *global* batch size, which is
                # done by the `tf.nn.compute_average_loss()` function.
                loss = nn.compute_average_loss(per_example_loss)
                return lr, loss

            def optimizer_function(lr, loss):
                optimizer = GradientDescentOptimizer(lr)
                return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)

            def model(lr):
                pipeline_op = pipelining_ops.pipeline(
                    computational_stages=[stage1, stage2],
                    device_mapping=[0, 0],
                    gradient_accumulation_count=gradient_accumulation_count,
                    inputs=[lr],
                    infeed_queue=infeed_queue,
                    repeat_count=2,
                    outfeed_queue=outfeed_queue,
                    optimizer_function=optimizer_function,
                    name="Pipeline")
                return pipeline_op

            def compiled_model(lr):
                with ipu_scope("/device:IPU:0"):
                    return ipu_compiler.compile(model, inputs=[lr])

            with ops.device("cpu"):
                lr = array_ops.placeholder(np.float32, [])

            train_op = strategy.experimental_run_v2(compiled_model, args=[lr])
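            # Each run of train_op processes repeat_count * gradient_accumulation_count
            # batches from the infeed on the IPU.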

            _, per_worker_losses = outfeed_queue.dequeue()

            # Mean across the local `gradient_accumulation_count` batches:
            per_worker_loss = math_ops.reduce_mean(per_worker_losses)

            # Global mean across the distributed workers (since it is already
            # divided by the global batch size above, we do a sum here):
            global_loss = strategy.reduce(ReduceOp.SUM, per_worker_loss)

            config = ipu_utils.create_ipu_config()
            config = ipu_utils.auto_select_ipus(config, num_ipus=1)
            ipu_utils.configure_ipu_system(config)
            ipu_utils.move_variable_initialization_to_cpu()

            with session.Session() as sess:
                sess.run(infeed_queue.initializer)
                sess.run(variables.global_variables_initializer())

                for _ in range(10):
                    sess.run(train_op, {lr: 0.01})
                    global_loss_val = sess.run(global_loss)

                    if loss_vals:
                        # Check that the loss decreases monotonically.
                        self.assertLess(global_loss_val, loss_vals[-1])
                    loss_vals.append(global_loss_val)

                sess.run(infeed_queue.deleter)
                sess.run(outfeed_queue.deleter)

                # Check all variables are equal across workers.
                for variable in variables.global_variables():
                    self.assertAllRanksEqual(variable.eval(), variable.name)
Exemple #24
0
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training."""

    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        items = next(data_generator)
        output_types = tuple((tf.dtypes.as_dtype(i.dtype) for i in items))
        output_shapes = tuple((tf.TensorShape(i.shape) for i in items))
        total_bytes = 0
        for i in items:
            total_bytes += i.nbytes
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))
    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time
    total_time = 0.0
    total_samples = 0
    # Initially the infeed may buffer extra input data, and the first IPU run
    # includes XLA compilation, so skip these iterations when calculating items/sec.
    skip_iterations = 5
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # extract the report
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step * replication_factor)
            print("Average %.1f items/sec" % (total_samples / total_time))
Exemple #25
0
def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                                 shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=(opts['replicas']))

        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue(
                        (prob, target, accuracy))
                    return outfeed_op

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()
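            # Dequeue returns all (prob, target, accuracy) tuples enqueued during
            # the compiled validation loop.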

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]

    sess = tf.compat.v1.Session(graph=infer_graph)

    return GraphOps(sess, init, ops_val, placeholders, infeed_val, outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Exemple #26
0
def construct_graph(
        network_class: Type[InferenceNetwork], config: Path,
        checkpoint_dir: str, batch_size: int, batches_per_step: int,
        image_filenames: Tuple[str], loop: bool, preprocess_fn: Callable,
        num_ipus: int, mode: str, save_graph_pb: bool
) -> Tuple[tf.Operation, tf.Operation, tf.Operation]:
    """Create inference graph on the device, set up in-feeds and out-feeds, connect dataset iterator to the graph.

    This function also exports the frozen graph into an event file, to be viewed in Tensorboard in `network_name_graph`
    directory.

    Args:
        network_class: Class corresponding to chosen model.
        config: Path to config file.
        checkpoint_dir: Checkpoint location.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        image_filenames: Collection of path to images.
        loop: Run inference in a loop.
        preprocess_fn: Pre-process function to apply to the image before feeding into the graph.
        num_ipus: Number of ipus.
        mode: Inference mode.
        save_graph_pb: If true, export frozen graph to event file to view in Tensorboard

    Returns: Compiled loop operator to run repeated inference over the dataset, infeed_queue initializer, outfeed op.

    """
    # Model specific config
    with open(config.as_posix()) as file_stream:
        try:
            config_dict = yaml.safe_load(file_stream)
        except yaml.YAMLError as exc:
            tf.logging.error(exc)

    config_dict['network_name'] = config.stem
    if 'dtype' not in config_dict:
        config_dict["dtype"] = 'float16'

    # Create inference optimized frozen graph definition
    network = network_class(input_shape=config_dict["input_shape"],
                            num_outputs=1000,
                            batch_size=batch_size,
                            data_type=config_dict['dtype'],
                            config=config_dict,
                            checkpoint_dir=checkpoint_dir)

    # Export frozen graph to event file to view in Tensorboard
    if save_graph_pb:
        log_dir = Path(f"{config_dict['network_name']}_graph")
        graph_filename = f"{log_dir}/{config_dict['network_name']}_graph.pb"
        if not log_dir.exists():
            log_dir.mkdir()
        with tf.io.gfile.GFile(graph_filename, "wb") as f:
            f.write(network.optimized_graph.SerializeToString())
        logging.info("%d ops in the final graph." %
                     len(network.optimized_graph.node))
        import_to_tensorboard(graph_filename, log_dir=log_dir.as_posix())

    # Reset graph before creating one on the IPU
    tf.reset_default_graph()

    # Create dataset
    dataset = get_dataset(image_filenames,
                          batch_size,
                          loop=loop,
                          preprocess_fn=preprocess_fn,
                          img_width=config_dict["input_shape"][1],
                          img_height=config_dict["input_shape"][0],
                          dtype=config_dict['dtype'])

    # Set up graph on device, connect infeed and outfeed to the graph.
    num_replicas = num_ipus if mode == 'replicated' else 1
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset,
        device_ordinal=0,
        feed_name="infeed",
        replication_factor=num_replicas)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        device_ordinal=0,
        feed_name="outfeed",
        outfeed_all=True,
        replication_factor=num_replicas)

    def comp_fn():
        def body(img):
            with scopes.ipu_scope('/device:IPU:0'):
                if mode == 'sharded':
                    with autoshard.ipu_autoshard():
                        probs = tf.import_graph_def(
                            network.optimized_graph,
                            input_map={network.graph_input: img},
                            name="optimized",
                            return_elements=[network.graph_output])[0]
                    autoshard.automatic_sharding(num_shards=num_ipus,
                                                 input_ts=img,
                                                 loss_ts=probs,
                                                 frozen_inference=True)
                    outfeed_op = outfeed_queue.enqueue(probs)
                    outfeed_op._set_attr(
                        sharding._XLA_SHARDING,
                        attr_value_pb2.AttrValue(
                            s=probs.op.get_attr('_XlaSharding')))
                else:
                    probs = tf.import_graph_def(
                        network.optimized_graph,
                        input_map={network.graph_input: img},
                        name="optimized",
                        return_elements=[network.graph_output])[0]
                    outfeed_op = outfeed_queue.enqueue(probs)
                # Note that enqueue happens on the IPU.
                return outfeed_op

        return loops.repeat(batches_per_step, body, [], infeed_queue)

    loop_op = ipu_compiler.compile(comp_fn, [])

    # The dequeue of the outfeed needs to happen on the CPU.
    with tf.device('cpu'):
        outfeed_dequeue = outfeed_queue.dequeue()

    ipu_utils.move_variable_initialization_to_cpu()
    return loop_op, infeed_queue.initializer, outfeed_dequeue
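# A minimal sketch of driving the returned operations; the arguments below are
# hypothetical placeholders (network class, config path, image list), and the
# infeed must be initialised before running the compiled loop:
loop_op, infeed_init, outfeed_dequeue = construct_graph(
    MyInferenceNetwork, Path("configs/my_model.yml"), "./checkpoints",
    batch_size=1, batches_per_step=100, image_filenames=("images/example.jpg",),
    loop=True, preprocess_fn=None, num_ipus=1, mode="single",
    save_graph_pb=False)
with tf.Session() as sess:
    sess.run(infeed_init)
    sess.run(loop_op)
    probabilities = sess.run(outfeed_dequeue)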
Exemple #27
0
def training_graph(opts, training_data, device_index=0, learning_rate=0.001):
    train_graph = tf.Graph()

    with train_graph.as_default():

        dataset, _, placeholders = training_data.get_dataset(opts,
                                                             is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "training_dataset_infeed{0}".format(device_index), 0)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args):
                    data_tensors = args
                    observed_ratings = data_tensors[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        learning_rate=placeholders["learning_rate"])
                    with tf.control_dependencies([apply_grads_]):
                        return total_loss_ + loss, sum_rmse_metric + rmse_metric

                return loops.repeat(
                    opts.batches_per_step, body,
                    [tf.constant(0, tf.float32),
                     tf.constant(0, tf.float32)], infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("RMSE/train", rmse)

        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(opts.logs_path +
                                         '/train{0}'.format(device_index),
                                         graph=train_graph,
                                         flush_secs=30)

    ipu_options = ipu_utils.create_ipu_config(profiling=False)
    ipu_options = ipu_utils.set_floating_point_behaviour_options(
        ipu_options,
        inv=opts.fp_exceptions,
        div0=opts.fp_exceptions,
        oflo=opts.fp_exceptions,
        esr=opts.prng,
        nanoo=True)
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, 1)
    ipu_utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init,
                    [loss, train_summary, rmse], placeholders, infeed,
                    train_saver, train_writer)
Exemple #28
0
def run_training(opts, transformer):
    # Construct the training graph
    training_graph = tf.Graph()
    with training_graph.as_default():
        with tf.device("cpu"):
            dataset, num_train, vocab = data_utils.make_dataset(
                opts,
                use_synthetic_data=opts.use_synthetic_data,
                training=True)

        # Calculate dataset length
        batch_size = opts.batch_size
        if opts.pipeline:
            batch_size *= opts.gradient_accumulation_count
        batches_per_epoch = num_train // batch_size
        io_steps_per_epoch = batches_per_epoch // opts.repeat_count
        total_io_steps = opts.nepochs * io_steps_per_epoch
        total_global_steps = opts.nepochs * io_steps_per_epoch * opts.repeat_count
        logger.info(f"Effective batch-size (global batch): {batch_size}, "
                    f"IO steps per epoch: {io_steps_per_epoch}, "
                    f"Total IO steps: {total_io_steps} "
                    f"Total global steps: {total_global_steps}")

        if opts.prune_ratio is not None and opts.prune_ratio > 0:
            # Compute the pruning ratio when the learning rate will reach a minimum
            lr_decay_steps = opts.cooldown_steps + opts.warmup_steps
            lr_min_epochs = lr_decay_steps / (io_steps_per_epoch *
                                              opts.repeat_count)
            remaining_prune_ratio = opts.prune_ratio * sparse_training.cosine_prune_function(
                lr_decay_steps, total_global_steps, opts.cosine_prune_schedule)
            logger.warn(
                f"\n\nThe learning rate schedule will reach a minimum after {lr_min_epochs:0.2f} epochs, "
                f"at which point the pruning ratio will be {remainining_prune_ratio:0.3f}\n\n"
            )
            logger.info(
                f"Cosine prune schedule options: {opts.cosine_prune_schedule}")

        logger.info("Creating infeed and outfeed queues")
        # Queues for streaming from host to device and back
        train_infeed = IPUInfeedQueue(dataset, feed_name="train_infeed")
        train_outfeed = IPUOutfeedQueue(feed_name="train_outfeed")
        prune_and_grow_outfeed = IPUOutfeedQueue(
            feed_name="prune_and_grow_outfeed")

        # Helper: wraps builder_func in an on-device loop that consumes
        # `iterations` batches from the infeed per session run
        def loop_builder(iterations, builder_func, infeed):
            return loops.repeat(iterations, builder_func, [], infeed)

        # Compile the forward and backward pass for training
        with scopes.ipu_scope("/device:IPU:0"):
            if opts.pipeline:
                logger.info("Creating pipelined training graph")
                train_loop = partial(forward_pass, opts, transformer,
                                     opts.repeat_count, True, train_outfeed,
                                     prune_and_grow_outfeed, train_infeed)
            else:
                logger.info("Creating training graph")
                train_body = partial(forward_pass, opts, transformer,
                                     opts.repeat_count, True, train_outfeed,
                                     prune_and_grow_outfeed)
                train_loop = partial(loop_builder, opts.repeat_count,
                                     train_body, train_infeed)
            train_loop = ipu_compiler.compile(train_loop, inputs=[])
            transformer.buildSparsityUpdateOps()

        # Metrics
        with tf.device("cpu"):
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="metrics")
            metrics_initializer = tf.variables_initializer(
                var_list=metrics_vars)
            saver = tf.train.Saver()

            # These ops are declared here so that the graph can be frozen afterwards
            global_initializer = tf.global_variables_initializer()
            train_outfeed_dequeue = train_outfeed.dequeue()
            if opts.prune_ratio is not None and opts.prune_ratio > 0:
                prune_and_grow_dequeue = prune_and_grow_outfeed.dequeue()
            utils.move_variable_initialization_to_cpu()

            # Tensorboard
            log_name = "logs/" + datetime.now().isoformat()
            summary_writer = tf.summary.FileWriter(logdir=os.path.join(
                opts.train_checkpoint_path, log_name),
                                                   flush_secs=5)

    # Run the model:
    training_graph.finalize()  # no more new ops added from here on out
    with tf.Session(graph=training_graph) as sess:
        logger.info(f"Initializing training session")
        sess.run(global_initializer)
        sess.run(train_infeed.initializer)
        logger.info(f"Training...")
        progress = tqdm(range(opts.nepochs))
        for e in progress:
            sess.run(metrics_initializer)
            for io_step in range(io_steps_per_epoch):
                # Train the model
                step_start_time = time.perf_counter()
                sess.run(train_loop)
                ipu_train_time = time.perf_counter() - step_start_time

                session_outputs = sess.run(train_outfeed_dequeue)[-1]
                logger.debug(f"Train outputs: {session_outputs.keys()}")

                # Calculate avg throughput
                num_tokens = transformer.source_sequence_length * opts.repeat_count * batch_size
                throughput = num_tokens / ipu_train_time

                # Log progress - average stats over the last accumulation step only:
                start_point = -1 if not opts.pipeline else -opts.gradient_accumulation_count
                lr = np.mean(session_outputs["learning_rate"][start_point:])
                training_loss = np.mean(
                    session_outputs['training_loss'][start_point:])
                std_training_loss = np.std(
                    session_outputs['training_loss'][start_point:])
                nll_loss = np.mean(session_outputs['nll_loss'][start_point:])
                perplexity = np.mean(
                    session_outputs["perplexity"][start_point:])
                token_accuracy = np.mean(
                    session_outputs['token_accuracy'][start_point:])
                global_step = session_outputs['global_step'][start_point:][-1]
                logger.info(
                    f"\nEpoch {e}: io_step {io_step+1}/{io_steps_per_epoch}"
                    f"\nGlobal step: {global_step}/{total_global_steps}"
                    f"\nTraining loss : {training_loss:.4f}"
                    f"\nTraining loss standard deviation: {std_training_loss:.4f}"
                    f"\nXentropy loss : {nll_loss:.4f}"
                    f"\nPerplexity : {perplexity:.3f}"
                    f"\nToken accuracy: {token_accuracy:.2f}"
                    f"\nLearning rate: {lr:3.4e}"
                    f"\nThroughput {throughput:.1f} token/s")

                if opts.decode and logger.level <= logging.INFO:
                    try:
                        text_pred, text_target = data_utils.decode_prediction(
                            prediction=session_outputs['predictions'][-1],
                            target=session_outputs['target'][-1],
                            vocab=vocab)
                        logger.info(
                            f"\nTarget: {text_target}\n\nPrediction: {text_pred}\n"
                        )
                    except Exception as ex:
                        logger.warn(f"Decoding failed: {ex}")

                summary_value = [
                    tf.Summary.Value(tag="perplexity",
                                     simple_value=perplexity),
                    tf.Summary.Value(tag="training_loss",
                                     simple_value=training_loss),
                    tf.Summary.Value(tag="stddev_training_loss",
                                     simple_value=std_training_loss),
                    tf.Summary.Value(tag="xentropy_loss",
                                     simple_value=nll_loss),
                    tf.Summary.Value(tag="token_accuracy",
                                     simple_value=token_accuracy),
                    tf.Summary.Value(tag="learning_rate", simple_value=lr),
                    tf.Summary.Value(tag="throughput",
                                     simple_value=throughput),
                    tf.Summary.Value(tag="epoch", simple_value=e)
                ]

                # If we just completed the last io step we do not
                # prune and grow regardless, otherwise check the prune ratio:
                if io_step + 1 < io_steps_per_epoch and transformer.prune_ratio is not None and transformer.prune_ratio > 0:
                    # Retrieve p and g results from the conditional queue:
                    prune_and_grow_data = sess.run(prune_and_grow_dequeue)
                    for k in prune_and_grow_data:
                        prune_and_grow_data[k] = prune_and_grow_data[k][-1]
                    logger.debug(
                        f"Prune and grow outputs: {prune_and_grow_data.keys()}"
                    )

                    prune_and_grow_time, cosine_schedule_factor = transformer.syncPruneAndRegrowOnHost(
                        opts.cosine_prune_schedule, global_step,
                        total_global_steps, prune_and_grow_data)
                    transformer.streamSparsityFromHostToDevice()
                    summary_value.extend([
                        tf.Summary.Value(tag="prune+grow_time",
                                         simple_value=prune_and_grow_time),
                        tf.Summary.Value(tag="cosine_schedule_factor",
                                         simple_value=cosine_schedule_factor)
                    ])

                    for layer_name, sparse_layer in transformer.sparse_layers.items(
                    ):
                        values_var = sparse_layer.get_values_var()
                        grad_w_name = values_var.name.replace(
                            'nz_values:0', 'grad_w')
                        grad_w = np.array(prune_and_grow_data[grad_w_name])
                        if (opts.log_histograms):
                            histogram = tf_utils.make_histogram_proto(
                                grad_w, bins_count=opts.bins_count)
                            summary_value.extend([
                                tf.Summary.Value(tag=layer_name +
                                                 "/dense_grad_w",
                                                 histo=histogram)
                            ])

                        summary_value.extend([
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_stddev",
                                             simple_value=np.std(grad_w)),
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_mean",
                                             simple_value=np.mean(grad_w)),
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_min",
                                             simple_value=np.min(grad_w)),
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_max",
                                             simple_value=np.max(grad_w))
                        ])

                        for slot_name, slot in sparse_layer.get_slot_var_dict(
                        ).items():
                            slot_val = prune_and_grow_data[
                                slot.tf_variable.name]
                            if opts.log_histograms:
                                histogram = tf_utils.make_histogram_proto(
                                    slot_val, bins_count=opts.bins_count)
                                summary_value.extend([
                                    tf.Summary.Value(tag=slot_name,
                                                     histo=histogram)
                                ])
                            summary_value.extend([
                                tf.Summary.Value(
                                    tag=slot_name + "/stddev",
                                    simple_value=np.std(slot_val)),
                                tf.Summary.Value(
                                    tag=slot_name + "/mean",
                                    simple_value=np.mean(slot_val)),
                                tf.Summary.Value(
                                    tag=slot_name + "/min",
                                    simple_value=np.min(slot_val)),
                                tf.Summary.Value(tag=slot_name + "/max",
                                                 simple_value=np.max(slot_val))
                            ])

                # Log to tensorboard (outside any graph)
                summary = tf.Summary(value=summary_value)
                summary_writer.add_summary(summary, np.mean(global_step))
                if opts.use_wandb:
                    wandb.tensorflow.log(summary.SerializeToString())
                logger.info(
                    f"Total time for step {time.perf_counter() - step_start_time}"
                )
                logger.info(f"IPU train time for step {ipu_train_time}")

            logger.info(f"Saving model after epoch {e}")
            saver.save(
                sess,
                os.path.join(opts.train_checkpoint_path,
                             'model_' + str(e) + '.ckpt'))
            os.sys.stdout.flush()
        logger.info(f"Training complete.")
Exemple #29
0
def initializer():
    utils.move_variable_initialization_to_cpu()
    return tf.global_variables_initializer()
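# A minimal sketch of using this helper, assuming a model has already been built
# in the current default graph: the returned op is run once per session.
init_op = initializer()
with tf.Session() as sess:
    sess.run(init_op)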
Exemple #30
0
def generic_graph(opts):
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])
        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset,
            feed_name='DIEN_dataset_infeed',
            replication_factor=(opts['replicas']))

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        noclk_mids,
                        noclk_cats,
                        use_negsampling=True)
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, data_type)] * 3, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        if opts['use_ipu_model']:
            os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(opts['seed'])

    graph_outputs = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.Session(graph=graph)

    return GraphOps(
        sess, init, graph_outputs, placeholders, infeed, saver,
        feed_dict_values), uid_embedding, mid_embedding, cat_embedding