def comp_fn():
    """Build the on-device inference loop.

    Each iteration splices ``network.optimized_graph`` into the current graph
    with the infeed image mapped onto ``network.graph_input`` and enqueues the
    ``network.graph_output`` tensor on ``outfeed_queue``.  When ``mode`` is
    ``'sharded'`` the import runs under ``autoshard.ipu_autoshard()``,
    ``automatic_sharding`` is applied over ``num_ipus`` shards, and the
    enqueue op is tagged with the ``_XlaSharding`` attribute read from the op
    producing the output.

    Returns:
        Whatever ``loops.repeat`` returns for ``batches_per_step`` iterations
        of the body fed from ``infeed_queue``.
    """

    def import_network(img):
        # Same import call for both modes; only the enclosing scope differs.
        return tf.import_graph_def(
            network.optimized_graph,
            input_map={network.graph_input: img},
            name="optimized",
            return_elements=[network.graph_output])[0]

    def body(img):
        with scopes.ipu_scope('/device:IPU:0'):
            if mode != 'sharded':
                probs = import_network(img)
                # Note that enqueue happens on the IPU.
                return outfeed_queue.enqueue(probs)

            with autoshard.ipu_autoshard():
                probs = import_network(img)
            autoshard.automatic_sharding(num_shards=num_ipus,
                                         input_ts=img,
                                         loss_ts=probs,
                                         frozen_inference=True)
            outfeed_op = outfeed_queue.enqueue(probs)
            # Copy the sharding annotation of the output op onto the enqueue.
            sharding_attr = attr_value_pb2.AttrValue(
                s=probs.op.get_attr('_XlaSharding'))
            outfeed_op._set_attr(sharding._XLA_SHARDING, sharding_attr)
            # Note that enqueue happens on the IPU.
            return outfeed_op

    return loops.repeat(batches_per_step, body, [], infeed_queue)
Exemple #2
0
            def comp_fn():
                """Build the training loop that accumulates three metrics.

                Every iteration calls ``graph_builder`` on one infeed batch
                and adds its loss, auxiliary loss and accuracy to the running
                totals; the additions are gated on the gradient op so the
                update runs each iteration.

                Returns:
                    The result of ``loops.repeat`` over
                    ``opts['batches_per_step']`` iterations.
                """

                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    batch = (uids, mids, cats, mid_his, cat_his, mid_mask,
                             target, seqlen)
                    # The first output (prob) is not used by this loop.
                    _, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], *batch,
                        use_negsampling=False)

                    with tf.control_dependencies([grad_op]):
                        next_loss = total_loss + loss
                        next_aux_loss = total_aux_loss + aux_loss
                        next_accuracy = total_accuracy + accuracy
                    return next_loss, next_aux_loss, next_accuracy

                # One zero constant, referenced three times as initial state.
                zero = tf.constant(0, np.float32)
                return loops.repeat(opts['batches_per_step'], body,
                                    [zero] * 3, infeed_train)
Exemple #3
0
            def compile_fn():
                """Build a single-iteration loop that enqueues ``model2``'s output.

                Returns:
                    The result of ``loops.repeat`` for one iteration fed from
                    ``infeed_queue``.
                """

                def body(x, y):
                    # x and y are accepted but unused: the model1 path that
                    # consumed them (z1, z2 = model1(x, y, time_steps_ph),
                    # enqueued as {'z1': z1, 'z2': z2}) is disabled.
                    z3 = model2(time_steps_ph)
                    return outfeed_queue.enqueue({'z3': z3})

                return loops.repeat(1, body, inputs=[],
                                    infeed_queue=infeed_queue)
Exemple #4
0
 def comp_fn():
     """Build the validation loop that averages accuracy over the step.

     Each iteration adds ``accuracy / validation_batches_per_step`` to the
     running total.  When ``opts['replicas']`` exceeds 1 the result is summed
     across replicas and divided by ``replicas * shards``.
     """

     def body(total_accuracy, image, label):
         batch_accuracy = tf.cast(
             validation_graph_builder(model, image, label, opts), tf.float32)
         return total_accuracy + (
             batch_accuracy / opts["validation_batches_per_step"])

     initial_state = [tf.constant(0, tf.float32)]
     accuracy = loops.repeat(int(opts["validation_batches_per_step"]),
                             body, initial_state, valid_iterator)
     if opts['replicas'] > 1:
         replica_total = cross_replica_ops.cross_replica_sum(accuracy)
         return replica_total / (opts['replicas'] * opts['shards'])
     return accuracy
            def comp_fn():
                """Build the validation loop that streams accuracy to ``acc_queue``.

                Returns:
                    The result of ``loops.repeat`` over
                    ``int(opts['validation_batches_per_step'])`` iterations.
                """

                def body(data_dict):
                    # Each iteration's accuracy is pushed to the outfeed
                    # rather than accumulated in loop state.
                    return acc_queue.enqueue(
                        validation_graph_builder(model, data_dict, opts))

                iterations = int(opts['validation_batches_per_step'])
                return loops.repeat(iterations, body, inputs=[],
                                    infeed_queue=inference_infeed_iterator)
Exemple #6
0
    def comp_fn():
        """Build the inference loop over the imported frozen graph.

        Each iteration imports ``network.optimized_graph`` with the infeed
        image mapped onto ``network.graph_input`` and enqueues the
        ``network.graph_output`` tensor on ``outfeed_queue``.
        """

        def body(img):
            with scopes.ipu_scope('/device:IPU:0'):
                imported = tf.import_graph_def(
                    network.optimized_graph,
                    input_map={network.graph_input: img},
                    name="optimized",
                    return_elements=[network.graph_output])
                probs = imported[0]
                # Note that enqueue happens on the IPU.
                return outfeed_queue.enqueue(probs)

        return loops.repeat(batches_per_step, body, inputs=[],
                            infeed_queue=infeed_queue)
Exemple #7
0
            def comp_fn():
                """Build the validation loop that sums the RMSE metric.

                Returns:
                    The result of ``loops.repeat`` over
                    ``opts.validation_batches_per_step`` iterations.
                """

                def body(sum_rmse_metric, *args, **kwargs):
                    # The first infeed element is split in two along axis 1:
                    # the first part is passed as observed ratings, the
                    # second as ground truth.
                    ratings = args[0]
                    observed_ratings, ground_truth = tf.split(
                        ratings, num_or_size_splits=2, axis=1)
                    batch_rmse = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        ground_truth=ground_truth,
                        type='VALID')
                    return sum_rmse_metric + batch_rmse

                initial_state = [tf.constant(0, tf.float32)]
                return loops.repeat(opts.validation_batches_per_step, body,
                                    initial_state, infeed)
            def comp_fn():
                """Build the training loop that sums loss and RMSE.

                Returns:
                    The result of ``loops.repeat`` over
                    ``opts.batches_per_step`` iterations.
                """

                def body(total_loss_, sum_rmse_metric, *args):
                    # Only the first infeed element is consumed.
                    loss, rmse_metric, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=args[0],
                        learning_rate=placeholders["learning_rate"])
                    # Gate the running totals on the gradient application.
                    with tf.control_dependencies([apply_grads_]):
                        next_loss = total_loss_ + loss
                        next_rmse = sum_rmse_metric + rmse_metric
                    return next_loss, next_rmse

                initial_loss = tf.constant(0, tf.float32)
                initial_rmse = tf.constant(0, tf.float32)
                return loops.repeat(opts.batches_per_step, body,
                                    [initial_loss, initial_rmse], infeed)
Exemple #9
0
 def comp_fn():
     """Build the loop that sums loss and RMSE for training or evaluation.

     When ``training`` is true the running totals are gated on the gradient
     op; otherwise they are returned without that dependency.
     """

     def body(total_loss, total_rmse, batch):
         # All but the last column are passed as the observed values; the
         # last column, kept as a column vector, is the ground truth.
         loss, rmse, grad_op = graph_builder(
             opts,
             observed=batch[:, :-1],
             ground_truth=tf.expand_dims(batch[:, -1], axis=1),
             learning_rate=(placeholders['learning_rate']
                            if training else None),
             mode=trainFlag)
         if training:
             with tf.control_dependencies([grad_op]):
                 return total_loss + loss, total_rmse + rmse
         return total_loss + loss, total_rmse + rmse

     # A single zero constant is reused for both accumulators.
     zero = tf.constant(0, getattr(np, opts.dtypes[0]))
     return loops.repeat(batches_per_step, body, [zero, zero], infeed)
Exemple #10
0
            def comp_fn():
                """Build the loop that enqueues (prob, target, accuracy) per batch.

                Returns:
                    The result of ``loops.repeat`` over
                    ``opts['batches_per_step']`` iterations fed from
                    ``infeed``.
                """

                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         sl):
                    batch = (uids, mids, cats, mid_his, cat_his, mid_mask,
                             target, sl)
                    prob, accuracy = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], *batch,
                        use_negsampling=False)
                    # The enqueue is created under a dependency on prob.
                    with tf.control_dependencies([prob]):
                        enqueue_op = outfeed_queue.enqueue(
                            (prob, target, accuracy))
                    return enqueue_op

                return loops.repeat(opts['batches_per_step'], body,
                                    inputs=[], infeed_queue=infeed)
Exemple #11
0
            def comp_fn():
                """Build the validation loop, optionally emitting timestamps.

                Each iteration adds ``accuracy / validation_batches_per_step``
                to the running total.  With ``opts['latency']`` set, the
                batch's ``'timestamp'`` entry is also enqueued on
                ``timestamp_queue``.  Unless ``opts['inference']`` is truthy,
                a multi-device result is summed across replicas and divided
                by ``total_replicas * shards``.
                """

                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if not opts['latency']:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                    timestamp_enqueue = timestamp_queue.enqueue(
                        data_dict['timestamp'])
                    running_accuracy = total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])
                    return (running_accuracy, timestamp_enqueue)

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)

                device_count = opts['total_replicas'] * opts['shards']
                if device_count > 1 and not opts.get('inference', False):
                    replica_total = cross_replica_ops.cross_replica_sum(
                        accuracy)
                    accuracy = replica_total / device_count
                return accuracy
Exemple #12
0
            def comp_fn_validate():
                """Build the validation loop that enqueues (prob, target, accuracy).

                Returns:
                    The result of ``loops.repeat`` over
                    ``opts['batches_per_step']`` iterations fed from
                    ``infeed_val``.
                """

                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    batch = (uids, mids, cats, mid_his, cat_his, mid_mask,
                             target, seqlen)
                    # Only the first and fourth outputs are used here.
                    prob, _loss, _, accuracy, _ = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], *batch,
                        use_negsampling=False)
                    return outfeed_queue.enqueue((prob, target, accuracy))

                return loops.repeat(opts['batches_per_step'], body,
                                    inputs=[], infeed_queue=infeed_val)
Exemple #13
0
 def my_net(v):
     """Run ``body`` twice in a repeat loop, starting from ``v``."""
     initial_state = [v]
     return loops.repeat(2, body, inputs=initial_state)
Exemple #14
0
def loop_builder(iterations, builder_func, infeed):
    """Wrap ``builder_func`` in a repeat loop with no loop-carried state.

    Args:
        iterations: Number of loop iterations.
        builder_func: Loop body handed to ``loops.repeat``.
        infeed: Infeed queue handed to ``loops.repeat``.

    Returns:
        The result of ``loops.repeat``.
    """
    repeated = loops.repeat(iterations, builder_func, inputs=[],
                            infeed_queue=infeed)
    return repeated
Exemple #15
0
def my_net():
    """Run ``body`` 100 times in a repeat loop fed from ``infeed_queue``."""
    return loops.repeat(100, body, inputs=[], infeed_queue=infeed_queue)
Exemple #16
0
 def my_net():
     """Run ``body`` ``REPEATS`` times in a repeat loop fed from ``infeed_queue``."""
     return loops.repeat(REPEATS, body, inputs=[], infeed_queue=infeed_queue)
Exemple #17
0
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training and print the running throughput.

    Builds an infeed from an ``EnvGenerator``, compiles ``build_train_op``
    into a ``loops.repeat`` loop of ``batch_per_step`` iterations on the IPU,
    then runs that loop up to ``num_iter`` times.  When ``profile`` is set, a
    single iteration is run, its event trace is written out and the loop
    stops.

    Args:
        replication_factor: Passed to the infeed queue and to IPU selection;
            also scales the sample count used for items/sec.
        batch_size: Batch size handed to ``EnvGenerator``.
        batch_per_step: Number of loop iterations per ``sess.run``.
        profile: Enables profiling options and the event-trace report.
        num_iter: Maximum number of ``sess.run`` calls.
        time_steps: Second argument handed to ``EnvGenerator``.

    NOTE(review): ``use_poplar_text_report``, ``report_dest`` and ``DTYPE``
    are not parameters or locals; presumably module-level names - confirm.
    """

    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        # One sample batch is drawn to derive dtypes, shapes and its size.
        items = next(data_generator)
        output_types = tuple(tf.dtypes.as_dtype(i.dtype) for i in items)
        output_shapes = tuple(tf.TensorShape(i.shape) for i in items)
        total_bytes = sum(i.nbytes for i in items)
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))
    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True)

    # The context manager closes the session when training finishes or fails.
    with tf.Session(config=session_config) as sess:
        # Initialize variables
        utils.move_variable_initialization_to_cpu()
        sess.run([tf.global_variables_initializer(), data_init])

        # Run training and time
        total_time = 0.0
        total_samples = 0
        # Initially the infeed may buffer extra input data and the first run
        # for IPU includes XLA compile, so the first `skip_iterations` runs
        # are excluded when calculating items/sec.
        skip_iterations = 5
        for iters in range(num_iter):
            data_generator.reset_counter()
            t0 = time.perf_counter()
            sess.run(total_loss)
            t1 = time.perf_counter()

            if profile:
                raw_reports = sess.run(report)
                if use_poplar_text_report:
                    # extract the report
                    rep = utils.extract_all_strings_from_event_trace(
                        raw_reports)
                    print("Writing profiling report to %s" % report_dest)
                    with open(report_dest, "w") as f:
                        f.write(rep)
                else:
                    os.makedirs('profile_rl', exist_ok=True)
                    save_tf_report(raw_reports, log_dir='profile_rl')
                    print("Writing profiling report to profile_rl")
                # Profiling only needs a single iteration.
                break

            # `>=` skips exactly `skip_iterations` runs (indices 0..4); the
            # previous `>` dropped one extra run from the statistics.
            if iters >= skip_iterations:
                total_time += (t1 - t0)
                total_samples += (batch_size * batch_per_step *
                                  replication_factor)
                print("Average %.1f items/sec" % (total_samples / total_time))
Exemple #18
0
 def my_net():
     """Run ``model_func`` ``self.repeat_count`` times, fed from ``infeed_queue``."""
     return loops.repeat(self.repeat_count, model_func, inputs=[],
                         infeed_queue=infeed_queue)
Exemple #19
0
 def model():
     """Run ``body`` twice in a repeat loop, starting from the value 1.0."""
     initial_state = [1.0]
     return loops.repeat(2, body, inputs=initial_state)
def my_net():
    """Run ``training`` ``num_iterations`` times, carrying a scalar loss.

    The loop state starts as a zero scalar constant and each iteration is
    fed from ``infeed_queue``.
    """
    initial_loss = tf.constant(0.0, shape=[])
    return loops.repeat(num_iterations, training, inputs=[initial_loss],
                        infeed_queue=infeed_queue)
Exemple #21
0
 def compiled_fn():
     """Repeat ``training_step_loop`` with ``outfeed`` bound as a keyword.

     Returns:
         The result of ``loops.repeat`` over ``iterations_per_step``
         iterations fed from ``train_iterator``.
     """
     step_fn = partial(training_step_loop, outfeed=outfeed)
     return loops.repeat(iterations_per_step, step_fn, inputs=[],
                         infeed_queue=train_iterator)
Exemple #22
0
    def infer(self):
        """Restore a checkpoint and print model predictions.

        Builds the dataset and an outfeed queue, compiles a one-iteration
        ``loops.repeat`` loop around the encoder/decoder, restores variables
        from ``CHECKPOINT_FILE`` plus a ``"host_ckpt"`` or ``"ckpt"`` suffix,
        then runs the loop and prints the dequeued ``"samples"`` via
        ``print_data``.  The loop keeps running while ``self.opts.interact``
        is truthy.

        NOTE(review): both ``self.host_embeddings`` and
        ``self.opts.host_embeddings`` are read here; presumably they carry
        the same setting - confirm.
        """
        with tf.device("cpu"):
            dataset, infeed_queue, data_init, vocab = self._build_dataset()
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
                feed_name="outfeed")
        # With host embeddings, the embedding objects are created here,
        # outside the compiled function, and reused after compilation below.
        if self.host_embeddings:
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding",
            )
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding",
            )

        def build_common(src_embedding, tgt_embedding, source):
            """Build encoder and decoder and enqueue the decoded samples."""
            input_, encoder_outputs, encoder_state = self._build_encoder(
                src_embedding, source)
            # The decoder is built with no target (None) and train=False.
            samples, logits = self._build_decoder(encoder_outputs,
                                                  encoder_state,
                                                  tgt_embedding,
                                                  None,
                                                  train=False)
            outfeed = outfeed_queue.enqueue({"samples": samples})
            return outfeed

        def build_infer(source):
            """Loop body that creates its own embeddings before building."""
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding",
            )
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding",
            )
            return build_common(src_embedding, tgt_embedding, source)

        def build_infer_host_embeddings(source):
            """Loop body that reuses the embeddings created in ``infer``."""
            nonlocal src_embedding, tgt_embedding
            return build_common(src_embedding, tgt_embedding, source)

        with ipu_scope("/device:IPU:0"):
            build = build_infer_host_embeddings if self.host_embeddings else build_infer
            # One loop iteration per session.run, with no loop-carried state.
            batch = ipu_compiler.compile(lambda: loops.repeat(
                1, build, infeed_queue=infeed_queue, inputs=[]))

        # Create a restoring object
        saver = tf.train.Saver()

        ipu_options = util.get_config(report_n=0)
        utils.configure_ipu_system(ipu_options)
        session = tf.Session()
        checkpoint = CHECKPOINT_FILE + ("host_ckpt" if
                                        self.opts.host_embeddings else "ckpt")
        saver.restore(session, checkpoint)
        session.run(data_init)
        # With host embeddings, the values returned by calling the embedding
        # objects are run together with the compiled loop.
        # NOTE(review): the meaning of the (1, 1, False) arguments is not
        # visible here - confirm against _build_embedding.
        if self.host_embeddings:
            batch = [
                batch,
                src_embedding(1, 1, False),
                tgt_embedding(1, 1, False)
            ]
        result_queue = outfeed_queue.dequeue()
        # Run a dummy value to force the graph compilation
        session.run(batch)
        result = session.run(result_queue)
        predictions = result["samples"]
        print_data(self.generator.query, vocab[0], predictions, vocab[1])

        # Runs at least once more; repeats for as long as interact is set.
        while True:
            session.run(batch)
            result = session.run(result_queue)
            predictions = result["samples"]
            print_data(self.generator.query, vocab[0], predictions, vocab[1])
            if not self.opts.interact:
                break
Exemple #23
0
def getImages(numPoints):
    """Run ``body`` ``numPoints`` times in a repeat loop with no loop state."""
    return loops.repeat(numPoints, body, inputs=[])
Exemple #24
0
 def my_net(i, w):
     """Run ``training`` ``num_iterations`` times over (loss, i, w).

     The loop state starts as a zero scalar loss followed by ``i`` and ``w``.
     """
     initial_loss = array_ops.constant(0.0, shape=[])
     initial_state = [initial_loss, i, w]
     return loops.repeat(num_iterations, training, inputs=initial_state)
def my_net():
    """Run ``body`` 10 times, carrying a counter that starts at 0.

    Returns:
        The result of ``loops.repeat`` fed from ``infeed_queue``.
    """
    initial_count = 0
    return loops.repeat(10, body, inputs=[initial_count],
                        infeed_queue=infeed_queue)
Exemple #26
0
def training_loop():
    """Run ``model`` ``opts.num_iters`` times, fed from ``infeed``."""
    repeated = loops.repeat(opts.num_iters, model, infeed_queue=infeed)
    return repeated
Exemple #27
0
def training_loop_FULL(numPoints):
    """Run ``train_model`` ``numPoints`` times, fed from ``infeed_GAN``."""
    return loops.repeat(numPoints, train_model, infeed_queue=infeed_GAN)
Exemple #28
0
    def train(self):
        """Train the model, checkpointing whenever the average loss improves.

        Builds the dataset and an outfeed queue, compiles a
        ``loops.repeat`` loop of ``self.opts.batches_per_step`` iterations
        around encoder, decoder and optimiser, then either restores the
        checkpoint (when ``self.opts.ckpt`` is set) or initialises variables.
        After one warm-up run it executes ``self.opts.iterations`` steps,
        printing the mean dequeued loss and throughput for each, and saves a
        checkpoint each time the mean loss beats the best seen so far.

        NOTE(review): both ``self.host_embeddings`` and
        ``self.opts.host_embeddings`` are read here; presumably they carry
        the same setting - confirm.
        """
        with tf.device("cpu"):
            dataset, infeed_queue, data_init, vocab = self._build_dataset()
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
                feed_name="outfeed")
        # With host embeddings, the embedding objects are created here,
        # outside the compiled function, and reused after compilation below.
        if self.host_embeddings:
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding",
            )
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding",
            )

        def build_common(src_embedding, tgt_embedding, source, target, label,
                         mask):
            """Build encoder, decoder and optimiser; enqueue loss and logits."""
            nonlocal outfeed_queue
            input_, encoder_outputs, encoder_state = self._build_encoder(
                src_embedding, source)
            samples, logits = self._build_decoder(encoder_outputs,
                                                  encoder_state,
                                                  tgt_embedding,
                                                  target,
                                                  train=True)
            loss = self._build_optimiser(logits, label, mask)
            outfeed = outfeed_queue.enqueue({"loss": loss, "logits": logits})
            return outfeed

        def build_train(source, target, label, mask):
            """Loop body that creates its own embeddings before building."""
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding",
            )
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding",
            )
            return build_common(src_embedding, tgt_embedding, source, target,
                                label, mask)

        def build_train_host_embeddings(source, target, label, mask):
            """Loop body that reuses the embeddings created in ``train``."""
            nonlocal src_embedding, tgt_embedding
            return build_common(src_embedding, tgt_embedding, source, target,
                                label, mask)

        with ipu_scope("/device:IPU:0"):
            build = build_train_host_embeddings if self.host_embeddings else build_train
            batch = ipu_compiler.compile(lambda: loops.repeat(
                self.opts.batches_per_step,
                build,
                infeed_queue=infeed_queue,
                inputs=[],
            ))

        # Create a restoring object
        saver = tf.train.Saver()

        if self.opts.save_graph:
            # Dump the graph to a logdir
            writer = tf.summary.FileWriter(
                os.path.join("./logs", "NMT",
                             time.strftime("%Y%m%d_%H%M%S_%Z")))
            writer.add_graph(tf.get_default_graph())

        ipu_options = util.get_config(report_n=0)
        utils.configure_ipu_system(ipu_options)
        session = tf.Session()
        checkpoint = CHECKPOINT_FILE + ("host_ckpt" if
                                        self.opts.host_embeddings else "ckpt")
        # Resume from the checkpoint when requested; otherwise start from
        # freshly initialised variables.
        if self.opts.ckpt:
            saver.restore(session, checkpoint)
        else:
            utils.move_variable_initialization_to_cpu()
            session.run(tf.global_variables_initializer())
        session.run(data_init)
        print("Init done.")
        # With host embeddings, the values returned by calling the embedding
        # objects are run together with the compiled loop.
        # NOTE(review): the meaning of the (batches_per_step, 1) arguments is
        # not visible here - confirm against _build_embedding.
        if self.host_embeddings:
            batch = [
                batch,
                src_embedding(self.opts.batches_per_step, 1),
                tgt_embedding(self.opts.batches_per_step, 1),
            ]
        result_queue = outfeed_queue.dequeue()
        session.run(batch)  # Warmup
        best_loss = float("Inf")
        for e in range(self.opts.iterations):
            start = time.time()
            session.run(batch)
            result = session.run(result_queue)
            l = result["loss"]
            avg_loss = np.mean(l)
            # Wall time per loop iteration; it includes the dequeue above.
            duration = (time.time() - start) / self.opts.batches_per_step

            print(
                "Step: {:>5}. Average Loss {:.3}. Items/sec {:.4}. Tokens/sec {}"
                .format(
                    (e + 1),
                    avg_loss,
                    self.opts.batch_size / duration,
                    self.opts.batch_size *
                    (self.src_length + self.tgt_length) / duration,
                ))
            # Overwrite the checkpoint only when the mean loss improves.
            if avg_loss < best_loss:
                best_loss = avg_loss
                saver.save(session, checkpoint)
Exemple #29
0
def run(benchmark, opts):
    '''
    Run the benchmark.

    Builds the benchmark graph on the IPU (inside a repeat loop fed by an
    infeed queue when opts.batches_per_step > 1, otherwise as a single
    compiled call), runs one warm-up step and then opts.steps timed steps.

    benchmark - An instance of Benchmark
    opts - Namespace from argparse generated from parse_opts

    Returns the result of extract_runtimes_from_report when
    opts.cycle_report is set; otherwise the average batches per second
    over the timed steps.
    '''
    with ipu_scope('/device:IPU:0'):
        # Build graph
        with tf.device('cpu'):
            # Two extra steps' worth of elements beyond the timed steps.
            dataset = tf.data.Dataset \
                .range((opts.steps + 2) * opts.batches_per_step) \
                .map(lambda i: benchmark.inputs(opts, i)) \
                .prefetch(opts.batches_per_step)

        if opts.batches_per_step > 1:
            with tf.device('cpu'):
                infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                    dataset, feed_name="benchmark_dataset_infeed")
                data_init = infeed_queue.initializer

            # Build the graph once in a throwaway tf.Graph, with shards
            # forced to 1, purely to learn the output's dtype and shape.
            with tf.Graph().as_default():  # To get the shape and dtype
                dummy_opts = copy.deepcopy(opts)
                dummy_opts.shards = 1
                d = benchmark.inputs(dummy_opts, tf.constant(0))
                out = benchmark.graph_builder(dummy_opts, d)
            # NOTE(review): `input` shadows the builtin of the same name.
            input = tf.constant(0, out.dtype, shape=out.shape)

            def body(inout, *args, **kwargs):
                # Each iteration is ordered after the previous output via the
                # control dependency on `inout`.
                # NOTE(review): only `kwargs` is forwarded to graph_builder;
                # presumably the infeed yields a dict - confirm.
                with tf.control_dependencies([inout]):
                    # Run graph
                    with tf.variable_scope("MainGraph"):
                        out = benchmark.graph_builder(opts, kwargs)
                return out

            out = ipu_compiler.compile(
                lambda: loops.repeat(opts.batches_per_step, body, [input],
                                     infeed_queue), [])
        else:
            with tf.device('cpu'):
                data_tensors = dataset.make_one_shot_iterator().get_next()
                data_init = tf.no_op()
            out = ipu_compiler.compile(
                lambda: benchmark.graph_builder(opts, data_tensors), [])
            # Normalise any value <= 1 so the throughput maths below is sane.
            opts.batches_per_step = 1

    # Report
    report = gen_ipu_ops.ipu_event_trace()

    # Dump the graph to a logdir
    if opts.save_graph:
        writer = tf.summary.FileWriter(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs',
                         time.strftime('%Y%m%d_%H%M%S_%Z')))
        writer.add_graph(tf.get_default_graph())

    utils.configure_ipu_system(get_config(opts))
    with tf.Session() as sess:
        # Setup
        sess.run([benchmark.initializer(), data_init])
        # The report op is run once here and its result discarded, before
        # the warm-up step.
        sess.run(report)

        # Warmup
        print("Compiling and Warmup...")
        start = time.time()
        sess.run(out)
        duration = time.time() - start
        print("Duration: {:.3f} seconds\n".format(duration))

        # Cycle Report
        if opts.cycle_report:
            rep = sess.run(report)
            return extract_runtimes_from_report(
                rep, opts,
                display=True)  # Only run once if producing cycle report

        print("Executing...")
        average_batches_per_sec = 0
        # steps
        for i in range(opts.steps):
            # Run
            start = time.time()
            sess.run(out)
            duration = time.time() - start

            # Accumulate the mean incrementally: each step contributes its
            # own rate divided by the number of steps.
            average_batches_per_sec += (opts.batches_per_step /
                                        duration) / opts.steps
            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += "   " + benchmark.iteration_report(opts, duration)
            print(report_string)

        return average_batches_per_sec
Exemple #30
0
def training_loop_test():
    """Run ``test_model`` once in a repeat loop."""
    return loops.repeat(1, test_model)