Esempio n. 1
0
def validation_run(valid, filepath, i, epoch, first_run, opts):
    """Evaluate the validation graph and log the averaged top-1 accuracy.

    Args:
        valid: object exposing ``saver``, ``session`` and ``ops``; the first
            element of ``session.run(ops)`` is taken as the step accuracy.
        filepath: checkpoint to restore before evaluating; skipped when falsy.
        i: iteration number reported in the log line.
        epoch: epoch value reported in the log line.
        first_run: forwarded to ``logging.write_to_csv``.
        opts: options dict; reads ``validation_iterations``,
            ``validation_batches_per_step`` and
            ``validation_total_batch_size``.

    Raises:
        tf.errors.ResourceExhaustedError: re-raised in place of any
            ``tf.errors.OpError`` coming out of ``session.run``.
    """
    if filepath:
        valid.saver.restore(valid.session, filepath)

    num_steps = opts["validation_iterations"]

    # Accumulate the per-step accuracy while timing the whole sweep.
    running_total = 0.0
    t_begin = time.time()
    for _ in range(num_steps):
        try:
            step_acc = valid.session.run(valid.ops)[0]
        except tf.errors.OpError as err:
            raise tf.errors.ResourceExhaustedError(err.node_def, err.op,
                                                   err.message)
        running_total += step_acc
    val_time = time.time() - t_begin
    accuracy = running_total / num_steps

    images_seen = (num_steps * opts["validation_batches_per_step"] *
                   opts['validation_total_batch_size'])

    # Insertion order defines the CSV column order.
    stats = OrderedDict()
    stats['iteration'] = i
    stats['epoch'] = epoch
    stats['val_acc'] = accuracy
    stats['val_time'] = val_time
    stats['img_per_sec'] = images_seen / val_time

    valid_format = (
        "Validation top-1 accuracy (iteration: {iteration:6d}, epoch: {epoch:6.2f}, img/sec: {img_per_sec:6.2f},"
        " time: {val_time:8.6f}): {val_acc:6.3f}%")
    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
Esempio n. 2
0
def train_process(model, LR_Class, opts):
    """Build the training graph, run the training loop, then run validation.

    The loop advances ``iterations_per_step`` iterations per call to
    ``training_step``. On selected steps it logs averaged statistics, saves a
    checkpoint and/or records a validation point. All recorded validation
    points are evaluated only after training has finished.

    Args:
        model: model object forwarded to ``training_graph`` and
            ``validation.initialise_validation``.
        LR_Class: learning-rate schedule class; instantiated as
            ``LR_Class(opts, iterations)`` and queried with
            ``feed_dict_lr(i)``.
        opts: options dict. Keys read here: ``epochs``, ``dataset``,
            ``total_batch_size``, ``iterations``, ``logs_per_epoch``,
            ``log_freq``, ``batches_per_step``, ``ckpts_per_epoch``,
            ``gradients_to_accumulate``, ``validation``, ``checkpoint_path``,
            ``restoring`` (optional) and ``logs_path``.

    Raises:
        tf.errors.ResourceExhaustedError: re-raised in place of any
            ``tf.errors.OpError`` coming out of ``training_step``.
    """

    # --------------- OPTIONS ---------------------
    epochs = opts["epochs"]
    iterations_per_epoch = DATASET_CONSTANTS[
        opts['dataset']]['NUM_IMAGES'] // opts["total_batch_size"]
    if not opts['iterations']:
        # No explicit iteration budget: derive it from the epoch count.
        iterations = epochs * iterations_per_epoch
        log_freq = iterations_per_epoch // opts['logs_per_epoch']
    else:
        iterations = opts['iterations']
        log_freq = opts['log_freq']

    # Choose a step size that divides log_freq and is close to
    # batches_per_step, so that log boundaries fall on step boundaries.
    if log_freq < opts['batches_per_step']:
        iterations_per_step = log_freq
    else:
        iterations_per_step = log_freq // int(
            round(log_freq / opts['batches_per_step']))

    iterations_per_valid = iterations_per_epoch
    # np.inf disables the periodic checkpoint test (i // inf never increases).
    iterations_per_ckpt = iterations_per_epoch // opts[
        'ckpts_per_epoch'] if opts['ckpts_per_epoch'] else np.inf

    LR = LR_Class(opts, iterations)

    # Sliding windows covering (at most) one epoch worth of steps.
    batch_accs = deque(maxlen=iterations_per_epoch // iterations_per_step)
    batch_losses = deque(maxlen=iterations_per_epoch // iterations_per_step)
    batch_times = deque(maxlen=iterations_per_epoch // iterations_per_step)

    # -------------- BUILD TRAINING GRAPH ----------------

    train = training_graph(
        model, opts, iterations_per_step * opts["gradients_to_accumulate"])
    train.session.run(train.init)
    train.session.run(train.iterator.initializer)

    # -------------- BUILD VALIDATION GRAPH ----------------

    if opts['validation']:
        valid = validation.initialise_validation(model, opts)

    # -------------- SAVE AND RESTORE --------------

    if opts['ckpts_per_epoch']:
        filepath = train.saver.save(train.session,
                                    opts["checkpoint_path"],
                                    global_step=0)
        print("Saved checkpoint to {}".format(filepath))

    if opts.get('restoring'):
        filename_pattern = re.compile(".*ckpt-[0-9]+$")
        ckpt_pattern = re.compile(".*ckpt-([0-9]+)$")
        # Collect every "<prefix>ckpt-<N>.index" file in logs_path, strip the
        # ".index" suffix and sort by the numeric checkpoint index N.
        filenames = sorted(
            [
                os.path.join(opts['logs_path'], f[:-len(".index")])
                for f in os.listdir(opts['logs_path'])
                if filename_pattern.match(f[:-len(".index")])
                and f[-len(".index"):] == ".index"
            ],
            key=lambda x: int(ckpt_pattern.match(x).groups()[0]))
        latest_checkpoint = filenames[-1]
        logging.print_to_file_and_screen(
            "Restoring training from latest checkpoint: {}".format(
                latest_checkpoint), opts)
        # Resume from the iteration following the one encoded in the name.
        i = int(ckpt_pattern.match(latest_checkpoint).groups()[0]) + 1
        train.saver.restore(train.session, latest_checkpoint)
        epoch = float(opts["total_batch_size"] *
                      (i + iterations_per_step)) / DATASET_CONSTANTS[
                          opts['dataset']]['NUM_IMAGES']
    else:
        i = 0

    # ------------- TRAINING LOOP ----------------

    print_format = (
        "step: {step:6d}, iteration: {iteration:6d}, epoch: {epoch:6.2f}, lr: {lr:6.4g}, loss: {loss_avg:6.3f}, accuracy: {train_acc_avg:6.3f}%"
        ", img/sec: {img_per_sec:6.2f}, time: {it_time:8.6f}, total_time: {total_time:8.1f}"
    )

    # Created up front so the validation phase below is well defined even
    # when the loop body never executes (e.g. a restored run that is already
    # at or past `iterations`).
    validation_points = []

    step = 0
    start_all = time.time()
    while i < iterations:
        step += opts["gradients_to_accumulate"]
        # Each flag fires when this step crosses a multiple of its period,
        # on the very first iteration, or on the last step(s) of the run.
        log_this_step = ((i // log_freq) <
                         ((i + iterations_per_step) // log_freq) or (i == 0)
                         or ((i + (2 * iterations_per_step)) >= iterations))
        ckpt_this_step = ((i // iterations_per_ckpt) <
                          ((i + iterations_per_step) // iterations_per_ckpt)
                          or (i == 0)
                          or ((i + (2 * iterations_per_step)) >= iterations))
        valid_this_step = (opts['validation'] and (
            (i // iterations_per_valid) <
            ((i + iterations_per_step) // iterations_per_valid) or (i == 0) or
            ((i + (2 * iterations_per_step)) >= iterations)))

        # Run Training
        try:
            batch_loss, batch_acc, batch_time, current_lr, scaled_lr = training_step(
                train, i + 1, LR.feed_dict_lr(i))
        except tf.errors.OpError as e:
            raise tf.errors.ResourceExhaustedError(e.node_def, e.op, e.message)

        # Convert the step time into a per-iteration time.
        batch_time /= iterations_per_step

        # Calculate Stats
        batch_accs.append([batch_acc])
        batch_losses.append([batch_loss])

        # The first step's time is kept out of the timing window.
        if i != 0:
            batch_times.append([batch_time])

        # Print loss
        if log_this_step:
            train_acc = np.mean(batch_accs)
            train_loss = np.mean(batch_losses)

            if len(batch_times) != 0:
                avg_batch_time = np.mean(batch_times)
            else:
                avg_batch_time = batch_time

            # flush times every time it is reported
            batch_times.clear()

            total_time = time.time() - start_all
            epoch = float(opts["total_batch_size"] *
                          (i + iterations_per_step)) / DATASET_CONSTANTS[
                              opts['dataset']]['NUM_IMAGES']

            stats = OrderedDict([
                ('step', step),
                ('iteration', i + iterations_per_step),
                ('epoch', epoch),
                ('lr', current_lr),
                ('scaled_lr', scaled_lr),
                ('loss_batch', batch_loss),
                ('loss_avg', train_loss),
                ('train_acc_batch', batch_acc),
                ('train_acc_avg', train_acc),
                ('it_time', avg_batch_time),
                ('img_per_sec', opts['total_batch_size'] / avg_batch_time),
                ('total_time', total_time),
            ])

            logging.print_to_file_and_screen(print_format.format(**stats),
                                             opts)
            logging.write_to_csv(stats, i == 0, True, opts)

        if ckpt_this_step:
            filepath = train.saver.save(train.session,
                                        opts["checkpoint_path"],
                                        global_step=i + iterations_per_step)
            print("Saved checkpoint to {}".format(filepath))

        # Eval
        if valid_this_step:
            # `epoch` and `filepath` hold the values from the most recent
            # logged step / saved checkpoint, which may be older than this
            # step.
            # NOTE(review): on a restored run with ckpts_per_epoch disabled,
            # `filepath` looks like it can still be unbound here - confirm.
            validation_points.append(
                (i + iterations_per_step, epoch, i == 0, filepath))

        i += iterations_per_step

    # ------------ RUN VALIDATION ------------
    if opts['validation']:
        for iteration, epoch, first_run, filepath in validation_points:
            validation.validation_run(valid, filepath, iteration, epoch,
                                      first_run, opts)

    # --------------- CLEANUP ----------------
    train.session.close()
Esempio n. 3
0
                                              args['lr_schedule'])
    except ImportError:
        raise ValueError("LR_Schedules/{}.py not found".format(
            args['lr_schedule']))

    # Large number of deprecation warnings that cannot be resolved yet.
    tf.logging.set_verbosity(tf.logging.ERROR)

    parser = create_parser(model, lr_schedule, parser)
    opts = vars(parser.parse_args())
    if opts['help']:
        parser.print_help()
    else:
        if opts['gradients_to_accumulate'] > 1 and opts['pipeline_depth'] > 1:
            raise ValueError(
                "gradients-to-accumulate can't be specified when using --pipeline-depth > 1"
            )
        if opts['pipeline_depth'] > 1 and opts['shards'] == 1:
            raise ValueError(
                "--pipeline-depth can only be used if --shards > 1")
        opts["command"] = ' '.join(sys.argv)
        set_defaults(model, lr_schedule, opts)

        logging.print_to_file_and_screen("Command line: " + opts["command"],
                                         opts)
        logging.print_to_file_and_screen(opts["summary_str"].format(**opts),
                                         opts)
        opts["summary_str"] = ""
        logging.print_to_file_and_screen(opts, opts)
        train_process(model, lr_schedule.LearningRate, opts)
Esempio n. 4
0
def inference_run(exec_filename, ckpt_name, iteration, epoch, first_run, opts):
    """Run inference for multiple iterations and collect latency values.

    Starts an embedded-runtime engine from ``exec_filename``, runs one
    untimed warm-up call, then launches ``opts['num_inference_thread']``
    threads that each run ``opts['iterations']`` timed calls. Latency and
    accuracy figures from all threads are merged, logged and returned.

    Args:
        exec_filename: executable passed to
            ``embedded_runtime.embedded_runtime_start``.
        ckpt_name: reported as ``name`` in the statistics.
        iteration: reported as ``iteration`` in the statistics.
        epoch: reported as ``epoch``; ``round(epoch)`` is used as the MLPerf
            ``epoch_num``.
        first_run: forwarded to ``logging.write_to_csv``.
        opts: options dict. Keys read here: ``micro_batch_size``,
            ``image_size``, ``iterations``, ``generated_data``,
            ``num_inference_thread``, ``validation_total_batch_size``,
            ``wandb`` and ``distributed_worker_index``.

    Returns:
        OrderedDict with keys ``name``, ``iteration``, ``epoch``, ``val_acc``
        (a percentage, or -1 when ``opts['generated_data']`` is set),
        ``val_time``, ``val_size``, ``img_per_sec`` and ``latency`` (mean
        latency in milliseconds).
    """
    logging.mlperf_logging(key="EVAL_START",
                           log_type="start",
                           metadata={"epoch_num": round(epoch)})
    engine_name = "my_engine"
    ctx = embedded_runtime.embedded_runtime_start(exec_filename, [],
                                                  engine_name,
                                                  timeout=1000)

    input_placeholder = tf.placeholder(
        tf.uint8,
        (opts['micro_batch_size'], opts['image_size'], opts['image_size'], 3))

    num_iters = opts['iterations']
    if opts['generated_data']:
        # Synthetic input: one random image batch and no labels.
        placeholders = [input_placeholder]
        images = np.random.normal(size=(opts['micro_batch_size'],
                                        opts['image_size'], opts['image_size'],
                                        3)).astype(np.uint8)
        labels = None
    else:
        label_placeholder = tf.placeholder(tf.int32,
                                           (opts['micro_batch_size']))
        placeholders = [input_placeholder, label_placeholder]

        # Materialise the dataset in a separate graph.
        with tf.Graph().as_default():
            inference_dataset = dataset.data(
                opts, is_training=False).map(lambda x: {'data_dict': x})
            images, labels = dataset_to_list(
                inference_dataset, num_iters * opts['micro_batch_size'])

    call_result = embedded_runtime.embedded_runtime_call(placeholders, ctx)

    ipu.config.reset_ipu_configuration()
    gc.collect()

    # Each worker thread posts one (latencies, accuracies) tuple here.
    thread_queue = Queue()
    with tf.Session() as session:
        # do not include time of the first iteration in stats
        initial_feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              opts['micro_batch_size'],
                                              opts['generated_data'], 0)
        session.run(call_result, initial_feed_dict)

        def runner(session, thread_idx):
            """Run `num_iters` timed inference calls and enqueue the results."""
            thread_channel = pvti.createTraceChannel(f"Thread {thread_idx}")
            latencies = []
            accuracies = []
            for iter_idx in range(num_iters):
                feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              opts['micro_batch_size'],
                                              opts['generated_data'], iter_idx)
                with pvti.Tracepoint(thread_channel, f"Iteration {iter_idx}"):
                    start_iter = time.time()
                    _, predictions = session.run(call_result, feed_dict)
                    end_iter = time.time()
                latencies.append(end_iter - start_iter)
                if not opts['generated_data']:
                    expected = feed_dict[label_placeholder]
                    accuracy = np.mean(
                        np.equal(predictions, expected).astype(np.float32))
                    accuracies.append(accuracy)
            thread_queue.put((latencies, accuracies), timeout=10)

        thp = [
            Thread(target=runner, args=(session, thread_idx))
            for thread_idx in range(opts['num_inference_thread'])
        ]
        inference_start = time.time()
        for idx, _thread in enumerate(thp):
            _thread.start()
            print(f"Thread {idx} started")

        for idx, _thread in enumerate(thp):
            _thread.join()
            print(f"Thread {idx} joined")
        val_time = time.time() - inference_start

    # All threads have been joined, so draining until empty sees every result.
    latencies, accuracies = [], []
    while not thread_queue.empty():
        lat_acc = thread_queue.get()
        latencies.extend(lat_acc[0])
        accuracies.extend(lat_acc[1])

    if opts['generated_data']:
        # No labels are available for generated data.
        total_accuracy = -1
    else:
        total_accuracy = sum(accuracies) / len(accuracies)
        total_accuracy *= 100

    # convert latencies to milliseconds
    latencies = [1000 * latency_s for latency_s in latencies]

    max_latency = max(latencies)
    mean_latency = np.mean(latencies)
    perc_99 = np.percentile(latencies, 99)
    perc_99_9 = np.percentile(latencies, 99.9)

    print(
        f"Latencies - avg: {mean_latency:8.4f}, 99th percentile: {perc_99:8.4f}, "
        f"99.9th percentile: {perc_99_9:8.4f}, max: {max_latency:8.4f}")

    # The ")" after the latency field closes the "(" opened before
    # "iteration:"; it was missing, leaving the log line unbalanced.
    valid_format = (
        "Validation top-1 accuracy [{name}] (iteration: {iteration:6d}, epoch: {epoch:6.2f}, "
        "img/sec: {img_per_sec:6.2f}, time: {val_time:8.6f}, "
        "latency (ms): {latency:8.4f}): {val_acc:6.3f}%")

    val_size = (num_iters * opts['num_inference_thread'] *
                opts['validation_total_batch_size'])

    stats = OrderedDict([
        ('name', ckpt_name),
        ('iteration', iteration),
        ('epoch', epoch),
        ('val_acc', total_accuracy),
        ('val_time', val_time),
        ('val_size', val_size),
        ('img_per_sec', val_size / val_time),
        ('latency', mean_latency),
    ])
    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
    if opts['wandb'] and opts['distributed_worker_index'] == 0:
        logging.log_to_wandb(stats)
    logging.mlperf_logging(key="EVAL_STOP",
                           log_type="stop",
                           metadata={"epoch_num": round(epoch)})
    # val_acc is a percentage; MLPerf receives it as a fraction.
    logging.mlperf_logging(key="EVAL_ACCURACY",
                           value=float(stats['val_acc']) / 100,
                           metadata={"epoch_num": round(epoch)})
    return stats
Esempio n. 5
0
    else:
        # backwards compatibility
        if opts['batch_size'] and opts['micro_batch_size']:
            raise ValueError(
                'Both --batch-size and --micro-batch-size arguments were given, '
                'use --micro-batch-size, as --batch-size is deprecated and kept '
                'for backwards compatibility.')
        elif opts['batch_size']:
            opts['micro_batch_size'] = opts['batch_size']
        opts['use_popdist'] = False
        opts['total_replicas'] = opts['replicas']

        opts['command'] = ' '.join(sys.argv)
        set_defaults(model, opts)

        if opts['dataset'] == 'imagenet':
            if opts['image_size'] is None:
                opts['image_size'] = 224
        elif 'cifar' in opts['dataset']:
            opts['image_size'] = 32

        if opts['wandb'] and opts['distributed_worker_index'] == 0:
            logging.initialise_wandb(opts)
        logging.print_to_file_and_screen("Command line: " + opts['command'],
                                         opts)
        logging.print_to_file_and_screen(opts['summary_str'].format(**opts),
                                         opts)
        opts['summary_str'] = ""
        logging.print_to_file_and_screen(opts, opts)
        inference_only_process(model, opts)
Esempio n. 6
0
def _checkpoint_already_validated(csv_path, name):
    """Return True if `csv_path` exists and has a row whose 'name' is `name`."""
    if not os.path.exists(csv_path):
        return False
    # The legacy 'rU' mode is rejected by open() on Python 3.11+; the csv
    # module documentation recommends opening files with newline=''.
    with open(csv_path, 'r', newline='') as infile:
        # read the file as a dictionary for each row ({header : value})
        return any(row['name'] == name for row in csv.DictReader(infile))


def validation_run(valid, filepath, i, epoch, first_run, opts, latency_thread):
    """Evaluate a checkpoint on the validation graph and log the results.

    When ``filepath`` is given, the checkpoint is restored and its basename
    is looked up in ``<logs_path>/validation.csv``; if a row with that name
    already exists the run is skipped.

    Args:
        valid: object exposing ``saver``, ``session`` and an ``ops`` mapping
            with an ``'accuracy'`` entry (and ``'broadcast_weights'`` when
            ``opts['use_popdist']`` is set).
        filepath: checkpoint to restore; when falsy nothing is restored and
            the reported name is ``None``.
        i: iteration number reported in the statistics.
        epoch: epoch value reported in the statistics; ``round(epoch)`` is
            used as the MLPerf ``epoch_num``.
        first_run: forwarded to ``logging.write_to_csv``.
        opts: options dict. Keys read here: ``logs_path``, ``use_popdist``,
            ``validation_iterations``, ``compile_only``,
            ``validation_batches_per_step``, ``validation_global_batch_size``,
            ``dataset``, ``wandb`` and ``distributed_worker_index``.
        latency_thread: object with ``start()``, ``join()`` and
            ``get_latency()``; started just before the evaluation loop.

    Returns:
        OrderedDict of statistics, or ``None`` when the run was skipped.

    Raises:
        tf.errors.ResourceExhaustedError: re-raised in place of any
            ``tf.errors.OpError`` coming out of ``session.run`` (except the
            compile-only case, which exits the process with status 0).
    """
    run = True
    if filepath:
        valid.saver.restore(valid.session, filepath)
        name = filepath.split('/')[-1]

        csv_path = os.path.join(opts['logs_path'], 'validation.csv')
        if _checkpoint_already_validated(csv_path, name):
            run = False
            print('Skipping validation run on checkpoint: {}'.format(name))
    else:
        name = None

    if not run:
        return None

    if opts['use_popdist']:
        # synchronise the model weights across all instances
        valid.session.run(valid.ops['broadcast_weights'])

    logging.mlperf_logging(key="EVAL_START",
                           log_type="start",
                           metadata={"epoch_num": round(epoch)})
    # Gather accuracy statistics
    accuracy = 0.0

    # start latency thread
    latency_thread.start()

    start = relative_timer.now()
    for __ in range(opts["validation_iterations"]):
        try:
            a = valid.session.run(valid.ops['accuracy'])
        except tf.errors.OpError as e:
            if opts['compile_only'] and 'compilation only' in e.message:
                print("Validation graph successfully compiled")
                print("Exiting...")
                sys.exit(0)
            raise tf.errors.ResourceExhaustedError(e.node_def, e.op,
                                                   e.message)

        accuracy += a
    val_time = relative_timer.now() - start
    accuracy /= opts["validation_iterations"]

    # wait for all dequeues and latency computation
    latency_thread.join()
    latency = latency_thread.get_latency()

    valid_format = (
        "Validation top-1 accuracy [{name}] (iteration: {iteration:6d}, epoch: {epoch:6.2f}, img/sec: {img_per_sec:6.2f},"
        " time: {val_time:8.6f}, latency (ms): {latency:8.4f}): {val_acc:6.3f}%"
    )

    val_size = (opts["validation_iterations"] *
                opts["validation_batches_per_step"] *
                opts["validation_global_batch_size"])

    count = int(
        DATASET_CONSTANTS[opts['dataset']]['NUM_VALIDATION_IMAGES'])

    # When more samples were evaluated than the dataset holds, rescale the
    # accuracy by val_size / count; the unscaled value is kept as raw_acc.
    raw_accuracy = accuracy
    if count < val_size:
        accuracy = accuracy * val_size / count

    stats = OrderedDict([
        ('name', name),
        ('iteration', i),
        ('epoch', epoch),
        ('val_acc', accuracy),
        ('raw_acc', raw_accuracy),
        ('val_time', val_time),
        ('val_size', val_size),
        ('img_per_sec', val_size / val_time),
        ('latency', latency * 1000),
    ])
    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
    if opts["wandb"] and opts["distributed_worker_index"] == 0:
        logging.log_to_wandb(stats)
    logging.mlperf_logging(key="EVAL_STOP",
                           log_type="stop",
                           metadata={"epoch_num": round(epoch)})
    logging.mlperf_logging(key="EVAL_ACCURACY",
                           value=float(stats["val_acc"]) / 100,
                           metadata={"epoch_num": round(epoch)})
    return stats