Code Example #1
    def end(self, session):
        # Called once when the session finishes: drain the IPU event trace,
        # write a human-readable report, and pass the raw events to gcprofile.
        import os
        raw_report = session.run(self._report_op)
        write_file = os.path.join(self._write_dir, f'{self._name}_report.txt')
        with open(write_file, 'w') as f:
            f.write(ipu.utils.extract_all_strings_from_event_trace(raw_report))

        from gcprofile import save_tf_report
        save_tf_report(raw_report)

        print(f"Wrote profiling report to {write_file}")
Code Example #2
File: run_benchmark.py  Project: xerothermic/examples
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as temporary memory
        for matmul and convolution execution

    """
    # Set compile and device options.
    # Append via os.environ.get so a missing env var does not raise KeyError;
    # --use_ipu_model runs on the IPU simulator rather than hardware.
    os.environ["TF_POPLAR_FLAGS"] = os.environ.get("TF_POPLAR_FLAGS", "") + " --use_ipu_model"
    use_poplar_text_report = report_mode == 'text'  # report_mode is set at module level
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(opts,
                                        matmul_options={
                                            "availableMemoryProportion":
                                            str(available_memory_proportion)
                                        })
    opts = ipu_utils.set_convolution_options(
        opts,
        convolution_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })
    ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    session = tf.Session()
    session.run(infeed_queue_initializer)
    session.run(loop_op, options=run_options)
    session.run(outfeed_op, options=run_options)
    out = session.run(report)
    if report_mode == 'text':
        # Extract the human-readable report from the raw event trace.
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s", report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
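A plausible call site for get_report, assuming the three ops come from a compiled inference loop and its infeed/outfeed queues; all names here are illustrative:

# Hypothetical wiring: benchmark_op from ipu_compiler.compile(),
# queues from ipu_infeed_queue / ipu_outfeed_queue.
get_report(loop_op=benchmark_op,
           infeed_queue_initializer=infeed_queue.initializer,
           outfeed_op=outfeed_queue.dequeue(),
           report_dest="profile_report.txt",
           available_memory_proportion=0.6)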
Code Example #3
File: ssd_model.py  Project: xerothermic/examples
 print("Compiling and Warmup...")
 start = time.time()
 sess.run(inference_output)
 convolution_predictions = sess.run(outfeed)
 # convolution_predictions = sess.run(inference_output, feed_dict={input_image: np_image})
 raw_output = sess.run(
     decoder, feed_dict={input_detection: convolution_predictions[0]})
 filtered_output = process_detections(raw_output)
 draw_detections(original_image, original_image_dims[0],
                 original_image_dims[1], filtered_output)
 print("Done running inference.")
 duration = time.time() - start
 print("Duration: {:.3f} seconds\n".format(duration))
 if REPORT:
     rep_out = sess.run(report)
     save_tf_report(rep_out)
     rep = utils.extract_all_strings_from_event_trace(rep_out)
     report_name = f"{WIDTH}x{HEIGHT}_ipus{NUM_IPUS}_ssd_report.txt"
     with open(report_name, "w") as f:
         f.write(rep)
 # Performance runs
 print("Executing...")
 for iter_count in range(N_ITERATIONS):
     print("Running iteration: ", iter_count)
     # Run
     start = time.time()
     sess.run(inference_output)
     convolution_predictions = sess.run(outfeed)
     raw_output = sess.run(
         decoder, feed_dict={input_detection: convolution_predictions[0]})
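The report tensor this fragment runs is defined outside the excerpt; in the other examples on this page it is created on the CPU before the session starts, sketched here:

from tensorflow.compiler.plugin.poplar.ops import gen_ipu_ops

with tf.device('cpu'):
    # sess.run(report) drains the IPU events accumulated so far.
    report = gen_ipu_ops.ipu_event_trace()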
Code Example #4
File: rl_benchmark.py  Project: inejc/examples
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training."""

    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        items = next(data_generator)
        output_types = tuple(tf.dtypes.as_dtype(i.dtype) for i in items)
        output_shapes = tuple(tf.TensorShape(i.shape) for i in items)
        total_bytes = sum(i.nbytes for i in items)
        print(f'Input data size = {total_bytes / 1e6} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))
    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time
    total_time = 0.0
    total_samples = 0
    # Initially the infeed may buffer extra input data, and the first IPU run
    # includes the XLA compile, so skip these iterations when computing items/sec.
    skip_iterations = 5
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # Extract the human-readable report from the raw event trace.
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step * replication_factor)
            print("Average %.1f items/sec" % (total_samples / total_time))
Code Example #5
File: train.py  Project: muzzynine/examples-1
def train_with_session(input_fn, cosmoflow_config):

    with tf.device('cpu'):
        # Note: differences in tf.data.Dataset construction change throughput.
        infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(
            input_fn(),
            feed_name="training_infeed",
            replication_factor=cosmoflow_config['ipu_config']['num_ipus'])

    outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
        'outfeed',
        replication_factor=cosmoflow_config['ipu_config']['num_ipus'])

    def cosmoflow_training_loop():
        def body(loss, features, labels):
            with tf.variable_scope("MainGraph"):
                model = get_model(**cosmoflow_config['model'])
                outputs = model(features, training=True)
            train_config = cosmoflow_config['train']
            loss_name = train_config['loss']
            if loss_name == "mse":
                loss = tf.losses.mean_squared_error(labels=labels,
                                                    predictions=outputs)
            else:
                raise NotImplementedError("loss: %s" % loss_name)

            optimizer = tf.train.GradientDescentOptimizer(
                cosmoflow_config['optimizer']['lr'])
            if cosmoflow_config['ipu_config']['num_ipus'] > 1:
                optimizer = CrossReplicaOptimizer(optimizer)
            train_op = optimizer.minimize(loss=loss)
            with tf.control_dependencies([train_op]):
                return loss, outfeed_queue.enqueue(loss)

        loss = 0.0
        return ipu.loops.repeat(
            cosmoflow_config['ipu_config']['iterations_per_loop'], body,
            [loss], infeed_queue)

    # Compile model
    with ipu.scopes.ipu_scope('/device:IPU:0'):
        res = ipu.ipu_compiler.compile(cosmoflow_training_loop, inputs=[])

    dequeue_outfeed = outfeed_queue.dequeue()

    ipu_options = get_ipu_options(cosmoflow_config)

    ipu.utils.configure_ipu_system(ipu_options)
    ipu.utils.move_variable_initialization_to_cpu()

    data_config = cosmoflow_config['data']
    # Remember that the effective batch size is batch_size * num_ipus.
    # Also note that num_loops differs from the num_steps given to the IPUEstimator.
    num_loops = ((data_config["n_epochs"] * data_config["n_train"]) //
                 (data_config["batch_size"] *
                  cosmoflow_config['ipu_config']['num_ipus'] *
                  cosmoflow_config['ipu_config']['iterations_per_loop']))

    if cosmoflow_config['ipu_config']['profiling']:
        with tf.device('cpu'):
            from tensorflow.compiler.plugin.poplar.ops import gen_ipu_ops
            # Event trace
            trace = gen_ipu_ops.ipu_event_trace()

    with tf.Session() as sess:
        sess.run(infeed_queue.initializer)
        sess.run(tf.global_variables_initializer())

        # Warm up
        print("Compiling and Warmup...")
        start = time.time()
        sess.run(res)
        if cosmoflow_config['ipu_config']['profiling']:
            report = sess.run(trace)
            from gcprofile import save_tf_report
            save_tf_report(report)
        duration = time.time() - start
        print("Duration: {:.3f} seconds\n".format(duration))
        print("Executing...")
        losses = []
        average_batches_per_sec = []
        start = time.time()
        for i in range(num_loops):
            t0 = time.time()
            sess.run(res)
            local_losses = sess.run(dequeue_outfeed)
            duration = time.time() - t0
            average_batches_per_sec.append(
                cosmoflow_config['ipu_config']['iterations_per_loop'] /
                duration)
            report_string = "{:<7.3} sec/itr.".format(duration)
            print(report_string)
            losses.append(local_losses)

        t1 = time.time()
        duration_seconds = t1 - start

        logging.info("Took {:.2f} minutes".format(duration_seconds / 60))
        print('Iteration, Batches/Second, Samples/Second')
        for loop_idx, bps in enumerate(average_batches_per_sec):
            print('{}, {}, {}'.format(
                loop_idx, bps, bps * data_config["batch_size"] *
                cosmoflow_config['ipu_config']['num_ipus']))

        samples_per_second = np.mean(average_batches_per_sec) * data_config[
            "batch_size"] * cosmoflow_config['ipu_config']['num_ipus']
        print(
            "Took {:.2f} minutes, i.e. {:.0f} samples per second for batch-size {} and no. IPUs = {}"
            .format(duration_seconds / 60, samples_per_second,
                    cosmoflow_config['data']['batch_size'],
                    cosmoflow_config['ipu_config']['num_ipus']))

        # Finalize
        logging.info('All done!')

    return
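get_ipu_options is defined elsewhere in train.py; a minimal sketch of what it plausibly returns, built only from the ipu.utils calls seen in the other examples on this page:

def get_ipu_options(cosmoflow_config):
    """Hypothetical reconstruction: enable profiling and select the IPUs."""
    profiling = cosmoflow_config['ipu_config']['profiling']
    opts = ipu.utils.create_ipu_config(profiling=profiling,
                                       profile_execution=profiling)
    opts = ipu.utils.auto_select_ipus(
        opts, cosmoflow_config['ipu_config']['num_ipus'])
    return opts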
Code Example #6
File: train.py  Project: muzzynine/examples-1
 def end(self, session):
     # Drain the IPU event trace and hand it to gcprofile when the session ends.
     raw_reports = session.run(self._report)
     from gcprofile import save_tf_report
     save_tf_report(raw_reports)
Code Example #7
File: train.py  Project: sabarahimi2019/examples
def train_process(model, LR_Class, opts):

    # --------------- OPTIONS ---------------------
    epochs = opts["epochs"]
    iterations_per_epoch = DATASET_CONSTANTS[
        opts['dataset']]['NUM_IMAGES'] // opts["total_batch_size"]
    if not opts['iterations']:
        iterations = epochs * iterations_per_epoch
        log_freq = iterations_per_epoch // opts['logs_per_epoch']
    else:
        iterations = opts['iterations']
        log_freq = opts['log_freq']

    if log_freq < opts['batches_per_step']:
        iterations_per_step = log_freq
    else:
        iterations_per_step = log_freq // int(
            round(log_freq / opts['batches_per_step']))

    iterations_per_valid = iterations_per_epoch
    iterations_per_ckpt = iterations_per_epoch // opts[
        'ckpts_per_epoch'] if opts['ckpts_per_epoch'] else np.inf

    LR = LR_Class(opts, iterations)

    batch_accs = deque(maxlen=iterations_per_epoch // iterations_per_step)
    batch_losses = deque(maxlen=iterations_per_epoch // iterations_per_step)
    batch_times = deque(maxlen=iterations_per_epoch // iterations_per_step)
    start_all = None

    # -------------- BUILD TRAINING GRAPH ----------------

    train = training_graph(
        model, opts, iterations_per_step * opts["gradients_to_accumulate"])
    train.session.run(train.init)
    train.session.run(train.iterator.initializer)

    # -------------- BUILD VALIDATION GRAPH ----------------

    if opts['validation']:
        valid = validation.initialise_validation(model, opts)

    # -------------- SAVE AND RESTORE --------------

    if opts['ckpts_per_epoch']:
        filepath = train.saver.save(train.session,
                                    opts["checkpoint_path"],
                                    global_step=0)
        print("Saved checkpoint to {}".format(filepath))

    if opts.get('restoring'):
        filename_pattern = re.compile(".*ckpt-[0-9]+$")
        ckpt_pattern = re.compile(".*ckpt-([0-9]+)$")
        filenames = sorted(
            [
                os.path.join(opts['logs_path'], f[:-len(".index")])
                for f in os.listdir(opts['logs_path'])
                if filename_pattern.match(f[:-len(".index")])
                and f[-len(".index"):] == ".index"
            ],
            key=lambda x: int(ckpt_pattern.match(x).groups()[0]))
        latest_checkpoint = filenames[-1]
        logging.print_to_file_and_screen(
            "Restoring training from latest checkpoint: {}".format(
                latest_checkpoint), opts)
        i = int(ckpt_pattern.match(latest_checkpoint).groups()[0]) + 1
        train.saver.restore(train.session, latest_checkpoint)
        epoch = float(opts["total_batch_size"] *
                      (i + iterations_per_step)) / DATASET_CONSTANTS[
                          opts['dataset']]['NUM_IMAGES']
    else:
        i = 0

    # ------------- TRAINING LOOP ----------------

    print_format = (
        "step: {step:6d}, iteration: {iteration:6d}, epoch: {epoch:6.2f}, lr: {lr:6.4g}, loss: {loss_avg:6.3f}, top-1 accuracy: {train_acc_avg:6.3f}%"
        ", img/sec: {img_per_sec:6.2f}, time: {it_time:8.6f}, total_time: {total_time:8.1f}"
    )

    step = 0
    validation_points = []
    start_all = time.time()
    while i < iterations:
        step += opts["gradients_to_accumulate"]
        log_this_step = ((i // log_freq) <
                         ((i + iterations_per_step) // log_freq) or (i == 0)
                         or ((i + (2 * iterations_per_step)) >= iterations))
        ckpt_this_step = (opts["ckpts_per_epoch"] and (
            (i // iterations_per_ckpt) <
            ((i + iterations_per_step) // iterations_per_ckpt) or (i == 0) or
            ((i + (2 * iterations_per_step)) >= iterations)))
        valid_this_step = (opts['validation'] and (
            (i // iterations_per_valid) <
            ((i + iterations_per_step) // iterations_per_valid) or (i == 0) or
            ((i + (2 * iterations_per_step)) >= iterations)))

        # Run Training
        try:
            batch_loss, batch_acc, batch_time, current_lr = training_step(
                train, i + 1, LR.feed_dict_lr(i))
            if opts['pipeline_depth'] > 1:
                current_lr *= opts["loss_scaling"]
        except tf.errors.OpError as e:
            raise tf.errors.ResourceExhaustedError(e.node_def, e.op, e.message)

        batch_time /= iterations_per_step

        # Calculate Stats
        batch_accs.append([batch_acc])
        batch_losses.append([batch_loss])

        if i != 0:
            batch_times.append([batch_time])

        # Print loss
        if log_this_step:
            train_acc = np.mean(batch_accs)
            train_loss = np.mean(batch_losses)

            if len(batch_times) != 0:
                avg_batch_time = np.mean(batch_times)
            else:
                avg_batch_time = batch_time

            # flush times every time it is reported
            batch_times.clear()

            total_time = time.time() - start_all
            epoch = float(opts["total_batch_size"] *
                          (i + iterations_per_step)) / DATASET_CONSTANTS[
                              opts['dataset']]['NUM_IMAGES']

            stats = OrderedDict([
                ('step', step),
                ('iteration', i + iterations_per_step),
                ('epoch', epoch),
                ('lr', current_lr),
                ('loss_batch', batch_loss),
                ('loss_avg', train_loss),
                ('train_acc_batch', batch_acc),
                ('train_acc_avg', train_acc),
                ('it_time', avg_batch_time),
                ('img_per_sec', opts['total_batch_size'] / avg_batch_time),
                ('total_time', total_time),
            ])

            logging.print_to_file_and_screen(print_format.format(**stats),
                                             opts)
            logging.write_to_csv(stats, i == 0, True, opts)

        if ckpt_this_step:
            filepath = train.saver.save(train.session,
                                        opts["checkpoint_path"],
                                        global_step=i + iterations_per_step)
            print("Saved checkpoint to {}".format(filepath))

        # Eval
        if valid_this_step:
            validation_points.append(
                (i + iterations_per_step, epoch, i == 0, filepath))

        i += iterations_per_step

    # ------------ COLLECT PROFILE -----------
    if opts["profile"]:
        from gcprofile import save_tf_report
        save_tf_report(train.session.run(train.profile))

    # ------------ RUN VALIDATION ------------
    if opts['validation'] and validation_points:
        for iteration, epoch, first_run, filepath in validation_points:
            validation.validation_run(valid, filepath, iteration, epoch,
                                      first_run, opts)

    # --------------- CLEANUP ----------------
    train.session.close()
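The log_this_step / ckpt_this_step / valid_this_step tests all use the same integer-division trick: they fire whenever the interval [i, i + iterations_per_step) crosses a multiple of the period. A quick worked check:

# Logging happens once per log_freq iterations even when
# iterations_per_step does not divide log_freq evenly.
log_freq, iterations_per_step = 100, 30
for i in range(0, 300, iterations_per_step):
    crosses = (i // log_freq) < ((i + iterations_per_step) // log_freq)
    print(i, crosses)   # True at i = 90, 180 and 270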
Code Example #8
 def end(self, session):
     # Drain the IPU event trace when the session ends.
     raw_reports = session.run(self._report)
     from gcprofile import save_tf_report
     save_tf_report(raw_reports)
Code Example #9
    # With pipelining, IPU-level profiling is needed to correctly visualise the execution trace.
    # For pipelined models, either the SNAKE or HOOF IPU selection order is advised;
    # the latter works best when the first and last stages live on the same IPU.
    # For more information, see ipu/utils.py or the TensorFlow documentation:
    # https://www.graphcore.ai/docs/targeting-the-ipu-from-tensorflow#tensorflow.python.ipu.utils.SelectionOrder.
    cfg = ipu.utils.create_ipu_config(
        profiling=args.profile,
        profile_execution=ipu.utils.ExecutionProfileType.IPU_PROFILE
        if args.profile else False,
        selection_order=ipu.utils.SelectionOrder.SNAKE)
    # Auto select as many IPUs as we want to pipeline across
    cfg = ipu.utils.auto_select_ipus(cfg, 2)
    ipu.utils.configure_ipu_system(cfg)

    with tf.Session() as sess:
        # Initialize
        sess.run(init_op)
        sess.run(infeed_queue.initializer)
        # Run
        for step in range(steps):
            sess.run(compiled_model, {lr: args.learning_rate})
            if args.profile and gcprofile_present:
                # Profile the first step only, then stop training.
                raw_reports = sess.run(report)
                save_tf_report(raw_reports)
                break
            # Read the outfeed for the training losses
            losses = sess.run(outfeed_op)
            epoch = float(examples_per_step * step / n_examples)
            print("Epoch {:.1f}, Mean loss: {:.3f}".format(
                epoch, np.mean(losses)))
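gcprofile_present is set outside this excerpt; the usual guard is an import probe, sketched here under that assumption:

# Hypothetical probe behind the snippet's gcprofile_present flag.
try:
    from gcprofile import save_tf_report
    gcprofile_present = True
except ImportError:
    gcprofile_present = False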