Example #1
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer, evaluate, and log the results.

    Args:
        args: options object; ``num_epochs`` and ``epoch_size`` are read.
        epoch: current epoch number, used in log messages and counters.
        train_model: model whose net is run once per iteration.
        test_model: model whose net is run 100 times for evaluation, or
            None to skip testing (test accuracy is then logged as -1).
        total_batch_size: divides the iteration count and is the numerator
            of the images/sec figure.
        num_shards: further divides the number of iterations per epoch.
        expname: not used by this function; kept so callers do not break.
        explog: object exposing ``log(input_count=..., batch_count=...,
            additional_values=...)``.

    Returns:
        ``epoch + 1``.

    Raises:
        AssertionError: if the last fetched training loss is not below 40.
    '''
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    # Blobs of the first training device are used for all training metrics.
    prefix = "{}_{}".format(train_model._device_prefix,
                            train_model._devices[0])

    # Pre-bind the loop index so the batch counter below is defined even
    # when the epoch contains no iterations.
    i = 0
    for i in range(epoch_iters):
        # The first iteration is given a longer timeout than later ones.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        # A coarse clock can report a zero duration; do not divide by it.
        images_per_sec = total_batch_size / dt if dt > 0 else float('inf')
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, images_per_sec))
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])

    # -1 marks "no test result available".
    test_accuracy = -1
    if test_model is not None:
        # Run 100 iters of testing, averaging over every test device.
        accuracy_sum = 0
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ``.item()`` replaces ``np.asscalar``, which newer NumPy
                # releases no longer provide.
                accuracy_sum += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) +
                    '/accuracy').item()
                ntests += 1
        if ntests:
            test_accuracy = accuracy_sum / ntests

    explog.log(input_count=num_images,
               batch_count=(i + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
               })
    # Raised explicitly so the check is not stripped under ``python -O``.
    if not loss < 40:
        raise AssertionError("Exploded gradients :(")
    return epoch + 1
Example #2
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
    best_accuracy,
):
    '''
    Run one epoch of the trainer and track the best test accuracy.
    TODO: add checkpointing here.

    Args:
        args: options object; ``num_epochs``, ``epoch_size`` and
            ``test_iters`` are read.
        epoch: current epoch number, used in log messages and counters.
        train_model: model whose net is run once per iteration.
        test_model: model whose net is run ``args.test_iters`` times for
            evaluation, or None to skip testing.
        total_batch_size: divides the iteration count and is the numerator
            of the images/sec figure.
        num_shards: further divides the number of iterations per epoch.
        expname: not used by this function; kept so callers do not break.
        explog: object exposing ``log(input_count=..., batch_count=...,
            additional_values=...)``.
        best_accuracy: best test accuracy seen by the caller so far.

    Returns:
        Tuple ``(epoch + 1, best_accuracy)`` where ``best_accuracy`` is
        the larger of the incoming value and this epoch's test accuracy.
        The test accuracy is -1 when no test iteration was run.

    Raises:
        AssertionError: if the last fetched training loss is not below 40.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    # Blobs of the first training device are used for all training metrics.
    prefix = "{}_{}".format(
        train_model._device_prefix,
        train_model._devices[0])

    # Pre-bind the loop index so the batch counter below is defined even
    # when the epoch contains no iterations.
    i = 0
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        # A coarse clock can report a zero duration; do not divide by it.
        images_per_sec = total_batch_size / dt if dt > 0 else float('inf')
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, images_per_sec))
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )

    # -1 marks "no test result available".
    test_accuracy = -1
    if test_model is not None:
        # Average the accuracy over every test iteration and device.
        accuracy_sum = 0
        ntests = 0
        for _ in range(0, args.test_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ``.item()`` replaces ``np.asscalar``, which newer NumPy
                # releases no longer provide.
                accuracy_sum += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ).item()
                ntests += 1
        # ``test_iters == 0`` leaves nothing to average.
        if ntests:
            test_accuracy = accuracy_sum / ntests
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
            'best_accuracy': best_accuracy,
        }
    )
    # Raised explicitly so the check is not stripped under ``python -O``.
    if not loss < 40:
        raise AssertionError("Exploded gradients :(")

    # TODO: add checkpointing
    return epoch + 1, best_accuracy
Example #3
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer with periodic throughput reporting.
    TODO: add checkpointing here.

    The first 10 iterations are excluded from the timing statistics.
    After that, every ``args.notify_frequency`` iterations the current,
    maximum, mean and median timings are logged together with the
    training loss and accuracy.

    Args:
        args: options object; ``num_epochs``, ``epoch_size`` and
            ``notify_frequency`` are read.
        epoch: current epoch number, used in log messages and counters.
        train_model: model whose net is run once per iteration.
        test_model: model whose net is run 100 times for evaluation, or
            None to skip testing (test accuracy is then logged as -1).
        total_batch_size: divides the iteration count and scales the
            images/sec figure.
        num_shards: further divides the number of iterations per epoch.
        expname: not used by this function; kept so callers do not break.
        explog: object exposing ``log(input_count=..., batch_count=...,
            additional_values=...)``.

    Returns:
        ``epoch + 1``.

    Raises:
        AssertionError: if the last fetched training loss is not below 40.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    ts = time.time()
    drop = 10  # number of leading iterations ignored by the statistics
    max_speed = 0.0  # named so the builtin ``max`` is not shadowed
    spans = []  # per-iteration durations recorded after the ignored ones
    notify_every = args.notify_frequency
    # This timeout is required (temporarily) since CUDA-NCCL
    # operators might deadlock when synchronizing between GPUs.
    timeout = 3600
    # Blobs of the first training device are used for all training metrics.
    prefix = "{}_{}".format(train_model._device_prefix,
                            train_model._devices[0])

    # Pre-bind the loop index so the batch counter below is defined even
    # when the epoch contains no iterations.
    i = 0
    for i in range(epoch_iters):
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            if i > drop:
                spans.append(dt)
        if i == drop:
            # Restart the window timer once the ignored iterations are done.
            ts = time.time()
        if (i - drop) % notify_every == 0 and i > drop:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec), max = {:.2f}, avg = {:.2f}, median = {}. medthru = {}, avgthru = {}"
            te = time.time()
            td = te - ts
            curr_speed = notify_every * total_batch_size / td
            if max_speed < curr_speed:
                max_speed = curr_speed
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, curr_speed, max_speed,
                           np.mean(spans), np.median(spans),
                           1. / np.median(spans), 1. / np.mean(spans)))
            ts = time.time()

            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            loss = workspace.FetchBlob(prefix + '/loss')
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])

    # -1 marks "no test result available".
    test_accuracy = -1
    if test_model is not None:
        # Run 100 iters of testing, averaging over every test device.
        accuracy_sum = 0
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ``.item()`` replaces ``np.asscalar``, which newer NumPy
                # releases no longer provide.
                accuracy_sum += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) +
                    '/accuracy').item()
                ntests += 1
        if ntests:
            test_accuracy = accuracy_sum / ntests

    explog.log(input_count=num_images,
               batch_count=(i + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
               })
    # Raised explicitly so the check is not stripped under ``python -O``.
    if not loss < 40:
        raise AssertionError("Exploded gradients :(")

    # TODO: add checkpointing
    print("accuracy = %s. test_acc = %s. loss = %s" %
          (accuracy, test_accuracy, loss))
    return epoch + 1
def RunEpoch(args,
             epoch,
             train_model,
             test_model,
             total_batch_size,
             num_shards,
             explog,
             plt_kernel):
    '''
    Run one epoch of the trainer, or an LFW verification pass.
    TODO: add checkpointing here.

    The work done depends on ``args.test_data_type``:

    * ``'VAL'``: train for one epoch, run 100 test iterations and log the
      results through ``explog``.
    * ``'LFW'`` (with ``args.load_model_path`` set): collect embeddings
      from the test net and evaluate them with ``lfw.evaluate``.
    * ``'MEGAFACE'`` (with ``args.load_model_path`` set): nothing yet.

    Args:
        args: options object; ``test_data_type``, ``num_epochs``,
            ``epoch_size``, ``load_model_path`` and ``feature_dim`` are
            read.
        epoch: current epoch number, used in log messages and counters.
        train_model: model whose net is run once per training iteration.
        test_model: model whose net is run for evaluation; may be None in
            the 'VAL' branch (test accuracy is then logged as -1).
        total_batch_size: divides the iteration count and is the numerator
            of the images/sec figure.
        num_shards: further divides the number of iterations per epoch.
        explog: object exposing ``log(input_count=..., batch_count=...,
            additional_values=...)``.
        plt_kernel: passed through to ``display_activation_map`` when
            ``DEBUG_TRAINING`` is set.

    Returns:
        Tuple ``(epoch + 1, epoch_loss, epoch_accuracy)``. The two lists
        hold the per-iteration training loss and accuracy and are empty
        when no training iteration was run.

    Raises:
        AssertionError: if the final training loss is not below 40, or if
            the LFW image count is not a multiple of ``total_batch_size``.
    '''
    # TODO: add loading from checkpoint
    # Bound up front so the return statement is valid for every branch,
    # not only the 'VAL' one.
    epoch_loss = []
    epoch_accuracy = []

    if args.test_data_type == 'VAL':
        log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
        epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
        # Blobs of the first training device supply the training metrics.
        prefix = "{}_{}".format(
            train_model._device_prefix,
            train_model._devices[0])

        # Pre-bind the loop index so the batch counter below is defined
        # even when the epoch contains no iterations.
        i = 0
        for i in range(epoch_iters):
            # This timeout is required (temporarily) since CUDA-NCCL
            # operators might deadlock when synchronizing between GPUs.
            timeout = 600.0 if i == 0 else 60.0
            with timeout_guard.CompleteInTimeOrDie(timeout):
                t1 = time.time()
                workspace.RunNet(train_model.net.Proto().name)
                t2 = time.time()
                dt = t2 - t1

            # A coarse clock can report a zero duration; do not divide
            # by it.
            images_per_sec = (total_batch_size / dt
                              if dt > 0 else float('inf'))
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
            log.info(fmt.format(i + 1, epoch_iters, epoch, images_per_sec))
            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            loss = workspace.FetchBlob(prefix + '/loss')
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(train_fmt.format(loss, accuracy))
            epoch_loss.append(loss)
            epoch_accuracy.append(accuracy)

        num_images = epoch * epoch_iters * total_batch_size
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        learning_rate = workspace.FetchBlob(
            data_parallel_model.GetLearningRateBlobNames(train_model)[0]
        )

        # -1 marks "no test result available".
        test_accuracy = -1
        if test_model is not None:
            # Run 100 iters of testing, averaging over every test device.
            accuracy_sum = 0
            ntests = 0
            for _ in range(0, 100):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    # ``.item()`` replaces ``np.asscalar``, which newer
                    # NumPy releases no longer provide.
                    accuracy_sum += workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                    ).item()
                    ntests += 1
            if ntests:
                test_accuracy = accuracy_sum / ntests

        explog.log(
            input_count=num_images,
            batch_count=(i + epoch * epoch_iters),
            additional_values={
                'accuracy': accuracy,
                'loss': loss,
                'learning_rate': learning_rate,
                'epoch': epoch,
                'test_accuracy': test_accuracy,
            }
        )
        # Raised explicitly so the check is not stripped under
        # ``python -O``.
        if not loss < 40:
            raise AssertionError("Exploded gradients :(")
        if DEBUG_TRAINING:
            device_name = "{}_{}".format(test_model._device_prefix, test_model._devices[0])
            display_activation_map(plt_kernel, channel=0, batch_num=16, device_name=device_name)
            plt.pause(0.001)

    # lfw verification test
    elif args.test_data_type == 'LFW' and args.load_model_path is not None:
        lfw_pairs = os.path.join(os.path.abspath('../dataset'), 'lfw_pairs.txt')
        if not os.path.exists(lfw_pairs):
            log.error('There is no lfw_pairs.txt in folder dataset/lfw!!!')
        else:
            actual_issame = lfw.get_issame_list(lfw.read_pairs(lfw_pairs))
            # Each pair contributes two images.
            num_test_images = len(actual_issame) * 2
            if num_test_images % total_batch_size != 0:
                raise AssertionError(
                    'The number of lfw test images must be interger multiple of the test bach size')
            num_batches = num_test_images // total_batch_size
            emb_array = np.zeros((num_test_images, args.feature_dim))
            for _ in range(0, num_batches):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    label = workspace.FetchBlob('{}_{}'.format(test_model._device_prefix, g) + '/label')
                    embedding = workspace.FetchBlob('{}_{}'.format(test_model._device_prefix, g) + '/fc5')
                    # The fetched label blob is used as the row index of
                    # each embedding.
                    emb_array[label] = embedding

            _, _, test_accuracy, test_val, val_std, far = lfw.evaluate(emb_array,
                                                                       actual_issame,
                                                                       nrof_folds=10)
            log.info('Accuracy: %1.3f+-%1.3f' % (np.mean(test_accuracy), np.std(test_accuracy)))
            log.info('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (test_val, val_std, far))

    # megaface verification test
    elif args.test_data_type == 'MEGAFACE' and args.load_model_path is not None:
        pass

    return epoch + 1, epoch_loss, epoch_accuracy
    def run_model(self, devices, gpu):
        '''
        Build a small data-parallel model over ``devices``, train it for
        ten steps on seeded random data, and return the ``fc_w`` blob of
        device 0. Helper function for test_equiv.
        '''
        def input_builder_fun(model):
            # Inputs are fed by hand below, so there is nothing to build.
            return None

        def model_build_fun(model, loss_scale):
            const_fill = ("ConstantFill", {})
            dense = model.FC("data", "fc", 16, 1, const_fill,
                             ("ConstantFill", {}))
            flat = model.FlattenToVec(dense, "fc_fl")
            activated = model.Sigmoid(flat, "sigm")
            distance = model.SquaredL2Distance([activated, "label"], "sq")
            averaged = model.AveragedLoss(distance, "loss")
            scaled = model.Scale(averaged, scale=loss_scale)

            # For testing explicit sync
            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
            return [scaled]

        def add_optimizer(model):
            sgd = optimizer.build_sgd(
                model,
                0.1,
                policy="fixed",
                max_gradient_norm=5.0,
                allow_lr_injection=True,
            )
            return sgd

        workspace.ResetWorkspace()
        model_name = "test{}".format(devices)
        model = cnn.CNNModelHelper(order="NHWC", name=model_name)
        use_cpu = not gpu
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=use_cpu,
            shared_model=not gpu,
        )
        data_parallel_model.AddBlobSync(model, ["sync_num"])

        # Light test for LR names
        self.assertGreater(
            len(data_parallel_model.GetLearningRateBlobNames(model)), 0)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for step in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            per_device = batch_size // len(devices)

            # Give every device its own contiguous slice of the batch.
            for (slot, device_id) in enumerate(devices):
                lo = slot * per_device
                hi = lo + per_device
                device_data = full_data[lo:hi, :].astype(np.float32)
                device_labels = full_labels[lo:hi].astype(np.float32)
                scope = core.DeviceOption(model._device_type, device_id)
                with core.DeviceScope(scope):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, device_id),
                        device_data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, device_id),
                        device_labels)

            # Nets are initialised only after the first batch has been fed.
            if step == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            expected_sync = step * 2
            workspace.FeedBlob(
                model._device_prefix + "_0/sync_num",
                np.array([expected_sync]).astype(np.float32),
                device_option=core.DeviceOption(model._device_type, 0))
            workspace.RunNet(model.net.Proto().name)

            # Test AddBlobSync
            for device_id in model._devices:
                blob_name = "{}_{}/sync_num".format(
                    model._device_prefix, device_id)
                sync = workspace.FetchBlob(blob_name)[0]
                self.assertTrue(abs(sync - expected_sync) < 0.01)

        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
Example #6
0
def RunEpoch(args, epoch, train_model, test_model, explog,
             elapsed_training_time):
    """
    Run a training epoch on the training model, and then compute the accuracy on a test model.

    :param args: the script's parameters; ``epoch_count``, ``epoch_size``,
        ``test_epoch_size``, ``batch_size``, ``num_shards``,
        ``per_device_optimization`` and ``target_accuracy`` are read
    :param epoch: the zero-based index of the current epoch
    :param train_model: the model on which training will be performed
    :param test_model: the model on which testing will be performed, or
        None to skip testing (both test accuracies are then reported as 0)
    :param explog: the log object wrapping the file
    :param elapsed_training_time: training time accumulated before this
        epoch; the time spent running the training net is added to it
    :return: a tuple ``(elapsed_training_time, on_target)`` where
        ``on_target`` tells whether the top-1 test accuracy reached
        ``args.target_accuracy``
    :raises AssertionError: if the final training loss is not below 40
    """
    log.info("Starting epoch {}/{}".format(epoch + 1, args.epoch_count))
    epoch_iters = int(args.epoch_size / args.batch_size / args.num_shards)
    test_epoch_iters = int(args.test_epoch_size / args.batch_size /
                           args.num_shards)
    prefix = "{}_{}".format(train_model._device_prefix,
                            train_model._devices[0])

    total_time = 0.

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600 if i == 0 else 300
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            total_time += dt

        # Log the time it took to run the current batch. A coarse clock
        # can report a zero duration; do not divide by it.
        images_per_sec = args.batch_size / dt if dt > 0 else float('inf')
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(
            fmt.format(i + 1, epoch_iters, epoch + 1, images_per_sec))

        # Get the accuracy and loss for this particular device
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')

        # Write the training loss and accuracy for this batch
        log.info("Training loss: {}, accuracy: {}".format(loss, accuracy))

    # Compute the total number of images computed for this epoch; get the accuracy and the loss
    num_images = (epoch + 1) * epoch_iters * args.batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')

    try:
        learning_rate = workspace.FetchBlob(
            (prefix if args.per_device_optimization else '') +
            data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    except AttributeError:
        log.error(
            "The learning rate could not be found on this peer; this is likely due to the "
            "--per_device_optimization=True option.")
        learning_rate = 'unknown'

    # Prepare the parameters required for testing
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:

        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)

            # Aggregate the accuracy across all the devices involved in testing
            for g in test_model._devices:
                device_prefix = "{}_{}".format(test_model._device_prefix, g)
                # ``.item()`` replaces ``np.asscalar``, which newer NumPy
                # releases no longer provide.
                test_accuracy += workspace.FetchBlob(
                    device_prefix + '/accuracy').item()
                test_accuracy_top5 += workspace.FetchBlob(
                    device_prefix + '/accuracy_top5').item()
                ntests += 1

        # Compute the average test_accuracy and the average top-5 test accuracy across
        # a test epoch, and across all devices involved in it. With no
        # test iterations both sums are still 0 and are left untouched.
        if ntests:
            test_accuracy /= ntests
            test_accuracy_top5 /= ntests

    # Log the results to stdout, update total training time
    elapsed_training_time += total_time
    on_target = test_accuracy >= args.target_accuracy
    log.info("Finished testing on epoch {}. Obtained:\nAccuracy (Local - Training): {}\n" \
        "Loss (Local - Training): {}\nTop-1 Acc: {}\nTop-5 Acc: {}\nOn target: {}\n Elapsed training time: {}"
        .format(epoch + 1, accuracy, loss, test_accuracy, test_accuracy_top5, on_target, elapsed_training_time))

    # Log this epoch's results
    explog.log(input_count=num_images,
               batch_count=((epoch + 1) * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch + 1,
                   'top1_test_accuracy': test_accuracy,
                   'top5_test_accuracy': test_accuracy_top5,
                   'target_accuracy': args.target_accuracy,
                   'on_target': on_target,
                   'elapsed_training_time': elapsed_training_time,
               })

    # Raised explicitly so the check is not stripped under ``python -O``.
    if not loss < 40:
        raise AssertionError("Exploded gradients")

    return elapsed_training_time, on_target
Example #7
0
                        labels_device)
            if i == 0 and e == 0:
                workspace.RunNetOnce(train_model.param_init_net)
                workspace.CreateNet(train_model.net)
                workspace.RunNetOnce(test_model.param_init_net)
                workspace.CreateNet(test_model.net, overwrite=True)
                workspace.RunNetOnce(deploy_model.param_init_net)
                workspace.CreateNet(deploy_model.net, overwrite=True)

            workspace.RunNet(train_model.net.Proto().name)
            loss_sum += workspace.FetchBlob("gpu_0/loss")
            correct += workspace.FetchBlob("gpu_0/accuracy")

        time_ep = time.time() - time_ep
        lr = workspace.FetchBlob(
            data_parallel_model.GetLearningRateBlobNames(train_model)[0])

        values = [
            e + 1,
            lr,
            loss_sum / batch_num,
            correct / batch_num,
            test_res['loss'],
            test_res['accuracy'],
            time_ep,
        ]
        table = tabulate.tabulate([values],
                                  columns,
                                  tablefmt='simple',
                                  floatfmt='8.4f')
        if e % 25 == 0: