Example #1
def compute_multi_device_topk_accuracy(blob_name,
                                       top_k,
                                       split,
                                       input_db=None,
                                       generate_json=False,
                                       out_json_pred=None):
    top_k_accuracy = 0.0
    device_prefix, _ = helpers.get_prefix_and_device()
    for idx in range(0, cfg.NUM_DEVICES):
        prefix = '{}{}'.format(device_prefix, idx)
        softmax = workspace.FetchBlob(prefix + '/' + blob_name)
        indices = workspace.FetchBlob(prefix + '/' + 'db_indices')
        if cfg.TEST.TEN_CROP and split in ['test', 'val']:
            softmax = np.reshape(
                softmax, (int(softmax.shape[0] / 10), 10, softmax.shape[1]))
            softmax = np.mean(softmax, axis=1)
        labels = workspace.FetchBlob(prefix + '/labels')
        paths = None
        batch_size = softmax.shape[0]
        assert labels.shape[0] == batch_size, \
            "Something went wrong with data loading"
        acc, out_json_pred = compute_topk_accuracy(top_k, softmax.copy(),
                                                   labels, input_db, paths,
                                                   generate_json,
                                                   out_json_pred, indices)
        top_k_accuracy += acc
    accuracy = float(top_k_accuracy) / cfg.NUM_DEVICES
    return accuracy, out_json_pred
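
A minimal sketch of the ten-crop branch above, showing how the ten per-crop softmax rows of each image are collapsed into one averaged prediction (dummy data; workspace and cfg are not needed for the arithmetic):

import numpy as np

# Hypothetical mini-batch: 2 images x 10 crops, 5 classes.
num_images, num_classes = 2, 5
softmax = np.random.rand(num_images * 10, num_classes).astype(np.float32)

# Same reshape-and-average as in compute_multi_device_topk_accuracy: group the
# 10 crop rows of each image together, then average over the crop axis.
softmax = softmax.reshape(softmax.shape[0] // 10, 10, softmax.shape[1])
softmax = softmax.mean(axis=1)
print(softmax.shape)  # (2, 5): one prediction per image
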
Example #2
def scale_momentum(scale, model):
    """
    The MomentumSGDUpdate op implements the update of the momentum history V as:

        V := mu * V + lr * grad,

    where mu is the momentum factor, lr is the learning rate, and grad is the
    stochastic gradient. Since V is not defined independently of the learning
    rate (as it should ideally be), when the learning rate is changed we should
    scale the update history V in order to make it compatible in scale with
    lr * grad.
    """
    # For the LR warm-up in distributed training: when the LR changes after
    # warm-up, the momentum history must be rescaled accordingly.
    logger.info('Scaling momentum: {}'.format(scale))
    prefix, device = helpers.get_prefix_and_device()
    for idx in range(0, cfg.NUM_DEVICES):
        with core.DeviceScope(core.DeviceOption(device, idx)):
            with core.NameScope("{}{}".format(prefix, idx)):
                params = model.GetParams()
                for param in params:
                    op = core.CreateOperator(
                        'Scale', [param + '_momentum'], [param + '_momentum'],
                        scale=scale)
                    workspace.RunOperatorOnce(op)
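
To see why the history is multiplied by new_lr / old_lr, compare a plain SGD-with-momentum step before and after an LR change. A small NumPy sketch of the arithmetic the Scale op performs on each momentum blob (not the Caffe2 op itself):

import numpy as np

mu, old_lr, new_lr = 0.9, 0.1, 0.01
grad = np.array([1.0, -2.0])

# History accumulated at old_lr: one step of V := mu * V + lr * grad from V = 0,
# so the learning rate is folded into V.
V = old_lr * grad

# Without correction, the first update after the LR drop still carries a term
# of size ~old_lr inside V, i.e. 10x larger than the new lr * grad term.
uncorrected = mu * V + new_lr * grad

# scale_momentum multiplies V by new_lr / old_lr so the history is expressed
# in units of the new learning rate before the next MomentumSGDUpdate.
V_scaled = V * (new_lr / old_lr)
corrected = mu * V_scaled + new_lr * grad
print(uncorrected, corrected)
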
Example #3
def save_model_params(model, params_file, checkpoint_dir, model_iter):
    logger.info("Saving model params to weights file {}".format(params_file))
    prefix, _ = helpers.get_prefix_and_device()
    master_device = helpers.get_master_device_prefix(prefix)
    save_params = [str(param) for param in model.GetParams(master_device)]
    save_computed_params = [
        str(param) for param in model.GetComputedParams(master_device)]
    save_blobs = {}
    # also save total model iterations so far
    save_blobs['model_iter'] = model_iter + 1
    save_blobs['lr'] = workspace.FetchBlob(prefix + '{}/lr'.format(0))
    # save param momentum as well
    for param in save_params:
        for scoped_blob_name in get_momentum_blobs(param):
            unscoped_blob_name = helpers.unscope_name(scoped_blob_name)
            if unscoped_blob_name not in save_blobs:
                if workspace.HasBlob(scoped_blob_name):
                    data = workspace.FetchBlob(scoped_blob_name)
                    save_blobs[unscoped_blob_name] = data

    for param in save_params + save_computed_params:
        scoped_blob_name = str(param)
        unscoped_blob_name = helpers.unscope_name(scoped_blob_name)
        if unscoped_blob_name not in save_blobs:
            data = workspace.FetchBlob(scoped_blob_name)
            save_blobs[unscoped_blob_name] = data
    save_object(dict(blobs=save_blobs), params_file)
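
The file written by save_object above wraps everything in a dict of the form {'blobs': {...}}, with unscoped blob names mapping to NumPy arrays plus the bookkeeping keys 'model_iter' and 'lr'. A hedged sketch of inspecting such a checkpoint, assuming it is a plain pickle (adjust if save_object uses a different serializer):

import pickle

def inspect_checkpoint(params_file):
    # Assumes the checkpoint is a pickled dict: {'blobs': {name: ndarray, ...}}.
    with open(params_file, 'rb') as f:
        checkpoint = pickle.load(f)
    blobs = checkpoint['blobs']
    print('model_iter:', blobs.get('model_iter'))
    print('lr:', blobs.get('lr'))
    for name, value in blobs.items():
        if hasattr(value, 'shape'):
            print('{}: {}'.format(name, value.shape))
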
Example #4
def broadcast_parameters(model, num_devices):
    if num_devices == 1:
        return
    prefix, device = helpers.get_prefix_and_device()
    master_device = helpers.get_master_device_prefix(prefix)
    # get all params to initialize: model params + param momentum
    all_model_params = model.GetAllParams(master_device)
    all_params_momentum = []
    if 'test' not in model.net.Name():
        for param in model.GetParams(master_device):
            for mom_blob in get_momentum_blobs(param):
                all_params_momentum.append(mom_blob)
    all_params = all_model_params + all_params_momentum

    # load the value of params
    for param in all_params:
        if workspace.HasBlob(str(param)):
            data = workspace.FetchBlob(str(param))
            unscoped_name = helpers.unscope_name(str(param))
            logger.info('Broadcasting {} to'.format(str(param)))
            for idx in range(1, num_devices):
                with core.NameScope(prefix + str(idx)):
                    with core.DeviceScope(core.DeviceOption(device, idx)):
                        device_scoped_name = helpers.scoped_name(unscoped_name)
                        if workspace.HasBlob(device_scoped_name):
                            logger.info(' |-> {}'.format(device_scoped_name))
                            workspace.FeedBlob(device_scoped_name, data)
                        else:
                            logger.info('Blob non-existent. Not broadcasting')
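
The broadcast relies on the scoping convention <prefix><device_id>/<blob_name>: a blob is fetched once from the master device and fed to the same unscoped name under every other device's name scope. A toy sketch of just the renaming step (these two helpers are hypothetical stand-ins for helpers.unscope_name / helpers.scoped_name):

def unscope_name(name):
    # Drop the leading '<prefix><id>/' scope, e.g. 'gpu_0/conv1_w' -> 'conv1_w'.
    return name.split('/', 1)[-1]

def device_scoped_name(prefix, device_id, unscoped_name):
    return '{}{}/{}'.format(prefix, device_id, unscoped_name)

master_blob = 'gpu_0/conv1_w'
targets = [device_scoped_name('gpu_', idx, unscope_name(master_blob))
           for idx in range(1, 4)]
print(targets)  # ['gpu_1/conv1_w', 'gpu_2/conv1_w', 'gpu_3/conv1_w']
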
Example #5
def sum_multi_device_blob(blob_name):
    """Sum the values of a blob across all devices"""
    value = 0
    prefix, _ = helpers.get_prefix_and_device()
    for idx in range(0, cfg.NUM_DEVICES):
        value += workspace.FetchBlob('{}{}/{}'.format(prefix, idx, blob_name))
    return value
Example #6
    def shutdown_dataloader(self):
        self.coordinator.request_stop()
        self.coordinator.wait_for_stop()
        prefix, _ = helpers.get_prefix_and_device()
        for idx in range(0, self._num_devices):
            with core.NameScope("{}{}".format(prefix, idx)):
                self.close_blobs_queue()
        self.join()
Example #7
    def create_threads(self):
        # "worker" threads to construct (partial) minibatches and put them on
        # minibatch CPU queue in CPU memory (limited by queue size).
        self._worker_ids = self.get_worker_ids()
        self._workers = [
            threading.Thread(
                target=self.minibatch_loader,
                name='worker_{}'.format(worker_id),
                args=[worker_id],
            ) for worker_id in self._worker_ids
        ]

        # create one BlobsQueue per device; it holds the training data in GPU
        # memory and feeds it to the net
        prefix, device = helpers.get_prefix_and_device()
        # the root device id = 0
        for device_id in range(0, self._num_devices):
            with core.NameScope('{}{}'.format(prefix, device_id)):
                self.create_blobs_queue(
                    queue_name=self._blobs_queue_name,
                    num_blobs=len(self._blobs_idx_map),
                    capacity=self._device_blobs_queue_capacity)

        # launch enqueuer threads
        # Create one blob for each (blob_name, enqueuer_thread_id) pair:
        #  <train/test>_<blob_name>_enqueue_<enqueuer_thread_id>
        # The train/test distinction matters here: the EnqueueBlobs op needs
        # distinct blob names per split, otherwise data from the splits can mix.
        blob_names = self._blobs_idx_map.keys()
        enqueue_blobs_names = [[
            '{}_{}_enqueue_{}'.format(self._split, blob_name, idx)
            for blob_name in blob_names
        ] for idx in range(self._num_enqueuers)]
        for device_id in range(0, self._num_devices):
            # NameScope is prepended to all the blobs in the workspace
            with core.NameScope('{}{}'.format(prefix, device_id)):
                with core.DeviceScope(core.DeviceOption(device, device_id)):
                    for blob_list in enqueue_blobs_names:
                        for blob in blob_list:
                            scoped_blob_name = scope.CurrentNameScope() + blob
                            workspace.CreateBlob(scoped_blob_name)
        # create the enqueuer threads
        self._enqueuers = [
            threading.Thread(target=self.enqueue_blobs_thread,
                             args=(device_id, enqueue_blobs_names[idx]))
            for device_id in range(0, self._num_devices)
            for idx in range(self._num_enqueuers)
        ]
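
The naming scheme described in the comments above yields one blob per (split, blob_name, enqueuer_thread) triple, later prefixed by the device name scope. A small sketch of the names that loop generates (the blob names here are made up; the real ones come from self._blobs_idx_map):

split = 'train'
blob_names = ['data', 'labels', 'db_indices']   # hypothetical minibatch blobs
num_enqueuers, num_devices = 2, 2

enqueue_blobs_names = [
    ['{}_{}_enqueue_{}'.format(split, blob_name, idx) for blob_name in blob_names]
    for idx in range(num_enqueuers)
]
# Inside the device NameScope these become e.g. 'gpu_0/train_data_enqueue_0'.
for device_id in range(num_devices):
    for blob_list in enqueue_blobs_names:
        for blob in blob_list:
            print('gpu_{}/{}'.format(device_id, blob))
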
Example #8
    def enqueue_blobs(self, device_id, enqueue_blobs_names, blob_values):
        # enqueue blob names are of the form <blob_name>_enqueue_<enqueuer_thread_id>;
        # for the current name scope, feed these blobs with the given blob values
        prefix, device = helpers.get_prefix_and_device()
        enqueue_blobs_names = [
            '{}{}/{}'.format(prefix, device_id, enqueue_blob_name)
            for enqueue_blob_name in enqueue_blobs_names
        ]
        deviceOption = core.DeviceOption(device, device_id)
        for (blob_name, blob) in zip(enqueue_blobs_names, blob_values):
            workspace.FeedBlob(blob_name, blob, device_option=deviceOption)

        workspace.RunOperatorOnce(
            core.CreateOperator(
                'EnqueueBlobs',
                ['{}{}/{}'.format(prefix, device_id, self._blobs_queue_name)] +
                enqueue_blobs_names,
                enqueue_blobs_names,
                device_option=deviceOption,
            ))
Example #9
def compute_multi_device_mAP(split):
    predictions, truth = [], []
    device_prefix, _ = helpers.get_prefix_and_device()
    for idx in range(0, cfg.NUM_DEVICES):
        prefix = '{}{}'.format(device_prefix, idx)
        preds = workspace.FetchBlob(prefix + '/pred')
        if cfg.TEST.TEN_CROP and split in ['test', 'val']:
            preds = np.reshape(
                preds, (int(preds.shape[0] / 10), 10, preds.shape[1]))
            preds = np.sum(preds, axis=1)
        labels = workspace.FetchBlob(prefix + '/labels')
        batch_size = preds.shape[0]
        assert labels.shape[0] == batch_size, \
            "Something went wrong with data loading"
        predictions.append(preds)
        truth.append(labels)
    mAP = compute_mAP(predictions, truth)
    # return the concatenated output (batch_size * num_gpus) x 20
    cat_preds = np.concatenate(predictions, axis=0)
    cat_gts = np.concatenate(truth, axis=0)
    return mAP, cat_preds, cat_gts
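
compute_mAP is defined elsewhere in this codebase; as a rough reference, mean average precision over independent labels can be sketched with scikit-learn's average_precision_score (an approximation of the intent, not necessarily the exact metric used here):

import numpy as np
from sklearn.metrics import average_precision_score

def compute_map_sketch(predictions, truth):
    # predictions / truth: lists of (batch_size, num_classes) arrays, one per device.
    preds = np.concatenate(predictions, axis=0)
    gts = np.concatenate(truth, axis=0)
    aps = []
    for cls in range(preds.shape[1]):
        if gts[:, cls].max() > 0:  # skip classes with no positive ground truth
            aps.append(average_precision_score(gts[:, cls], preds[:, cls]))
    return float(np.mean(aps))
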
Example #10
def add_variable_stepsize_lr(
    curr_iter, num_devices, lr_iters, start_model_iter, epoch_iters=None,
    model=None, prev_checkpointed_lr=None,
):
    global CURRENT_LR
    # if the model is resumed from a checkpoint, load the checkpointed LR into
    # CURRENT_LR at the start of training only
    if prev_checkpointed_lr is not None and (curr_iter == start_model_iter):
        CURRENT_LR = prev_checkpointed_lr

    if curr_iter <= lr_iters[0]:
        gamma_pow = 0
    else:
        idx = 0
        while idx < len(lr_iters) and lr_iters[idx] < curr_iter:
            idx += 1
        gamma_pow = idx

    learning_rate = (cfg.SOLVER.BASE_LR * math.pow(cfg.SOLVER.GAMMA, gamma_pow))
    new_lr = learning_rate
    if curr_iter == 1:
        prev_lr = new_lr
    else:
        prev_lr = CURRENT_LR
    if cfg.SOLVER.SCALE_MOMENTUM and new_lr != prev_lr:
        scale = new_lr / float(prev_lr)
        scale_momentum(scale, model)

    CURRENT_LR = new_lr
    prefix, device = helpers.get_prefix_and_device()
    for idx in range(0, num_devices):
        with core.DeviceScope(core.DeviceOption(device, idx)):
            workspace.FeedBlob(
                '{}{}/lr'.format(prefix, idx),
                np.array(learning_rate, dtype=np.float32)
            )
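
The stepsize logic above decays the base LR by GAMMA ** k once the iteration passes the k-th boundary in lr_iters. A standalone sketch of that schedule, with hypothetical values in place of cfg.SOLVER:

import math

base_lr, gamma = 0.1, 0.1            # hypothetical cfg.SOLVER.BASE_LR / GAMMA
lr_iters = [30000, 60000, 90000]     # hypothetical step boundaries

def step_lr(curr_iter):
    if curr_iter <= lr_iters[0]:
        gamma_pow = 0
    else:
        gamma_pow = sum(1 for boundary in lr_iters if boundary < curr_iter)
    return base_lr * math.pow(gamma, gamma_pow)

for it in (1, 30000, 30001, 60001, 90001):
    print(it, step_lr(it))  # 0.1, 0.1, 0.01, 0.001, 0.0001
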
Example #11
def create_multi_gpu_blob(blob_name):
    prefix, device = helpers.get_prefix_and_device()
    for idx in range(0, cfg.NUM_DEVICES):
        with core.DeviceScope(core.DeviceOption(device, idx)):
            workspace.CreateBlob('{}{}/{}'.format(prefix, idx, blob_name))
Example #12
def test_net(opts):
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    np.random.seed(cfg.RNG_SEED)

    prefix, device = helpers.get_prefix_and_device()

    ############################################################################
    name = '{}_test'.format(cfg.MODEL.MODEL_NAME)
    logger.info(
        '=================Creating model: {}============='.format(name))
    data_type, batch_size = cfg['TEST'].DATA_TYPE, cfg['TEST'].BATCH_SIZE
    test_model = model_builder.ModelBuilder(
        name=name,
        train=False,
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        split=data_type,
        ws_nbytes_limit=(cfg.CUDNN_WORKSPACE_LIMIT * 1024 * 1024),
    )

    test_model.build_model()
    test_model.create_net()
    test_model.start_data_loader()

    assert cfg.METRICS.TYPE in ['topk', 'AP'], "Invalid metrics type"
    test_metrics_calculator = None
    if cfg.METRICS.TYPE == 'topk':
        test_metrics_calculator = metrics_topk.TopkMetricsCalculator(
            model=test_model,
            split=data_type,
            batch_size=batch_size,
            prefix=prefix)
    else:
        test_metrics_calculator = metrics_ap.APMetricsCalculator(
            model=test_model,
            split=data_type,
            batch_size=batch_size,
            prefix=prefix)

    test_timer = Timer()
    total_test_iters = helpers.get_num_test_iter(test_model.input_db)
    logger.info('Test epoch iters: {}'.format(total_test_iters))

    # save proto for debugging
    helpers.save_model_proto(test_model)

    ############################################################################
    # initialize the model from the checkpoint
    if cfg.TEST.PARAMS_FILE:
        checkpoints.load_model_from_params_file(
            test_model, params_file=cfg.TEST.PARAMS_FILE, checkpoint_dir=None)
    else:
        logger.info('No params files specified for testing model. Aborting!')
        os._exit(0)

    ############################################################################
    logger.info("Testing model...")
    test_metrics_calculator.reset()
    for test_iter in range(0, total_test_iters):
        test_timer.tic()
        workspace.RunNet(test_model.net.Proto().name)
        test_timer.toc()
        if test_iter == 0:
            helpers.print_net(test_model)
        rem_test_iters = total_test_iters - test_iter - 1
        test_metrics_calculator.calculate_and_log_test_iter_metrics(
            test_iter, test_timer, rem_test_iters, total_test_iters)
    test_metrics_calculator.finalize_metrics()
    test_metrics_calculator.compute_and_log_epoch_best_metric(
        model_iter=test_iter)
    test_metrics_calculator.log_best_model_metrics(test_iter, total_test_iters)
    logger.info('Total images tested: {}'.format(
        test_metrics_calculator.split_N))
    logger.info('Done!!!')
    test_model.data_loader.shutdown_dataloader()
Example #13
def train_net(opts):
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    logging.getLogger(__name__)
    np.random.seed(cfg.RNG_SEED)

    prefix, device = helpers.get_prefix_and_device()
    device_prefix = '{}{}'.format(prefix, 0)

    ############################################################################
    total_test_iters, test_metrics_calculator = 0, None
    if cfg.MODEL.TEST_MODEL:
        # Build test_model: we do this first so we don't overwrite init (if any)
        test_model, test_metrics_calculator, test_timer = build_wrapper(
            is_train=False, prefix=device_prefix)
        total_test_iters = helpers.get_num_test_iter(test_model.input_db)
        logger.info('Test epoch iters: {}'.format(total_test_iters))

    ############################################################################
    # create training model and metrics
    train_model, train_metrics_calculator, train_timer = build_wrapper(
        is_train=True, prefix=device_prefix)
    # save proto for debugging
    helpers.save_model_proto(train_model)

    ############################################################################
    # setup the checkpoint directory and load model from checkpoint
    if cfg.CHECKPOINT.CHECKPOINT_ON:
        checkpoint_dir = checkpoints.get_checkpoint_directory()
        logger.info('Checkpoint directory: {}'.format(checkpoint_dir))

    # checkpoint_exists tracks whether we are resuming a previously interrupted
    # run: load_model_from_params_file looks in checkpoint_dir for existing
    # model checkpoints and sets it to True if any are found.
    start_model_iter, prev_checkpointed_lr, checkpoint_exists = 0, None, False
    if cfg.CHECKPOINT.RESUME or cfg.TRAIN.PARAMS_FILE:
        start_model_iter, prev_checkpointed_lr, checkpoint_exists = (
            checkpoints.load_model_from_params_file(
                train_model,
                params_file=cfg.TRAIN.PARAMS_FILE,
                checkpoint_dir=checkpoint_dir))
        # when fine-tuning, training can still get stopped/killed and resumed;
        # only reset the start iteration if no checkpoint of this run exists yet
        if cfg.MODEL.FINE_TUNE and not checkpoint_exists:
            start_model_iter = 0

    ############################################################################
    logger.info("=> Training model...")
    model_flops, model_params = 0, 0
    lr_iters = lr_utils.get_lr_steps()
    for curr_iter in range(start_model_iter, cfg.SOLVER.NUM_ITERATIONS):
        # set LR
        lr_utils.add_variable_stepsize_lr(curr_iter + 1, cfg.NUM_DEVICES,
                                          lr_iters, start_model_iter + 1,
                                          cfg.TRAIN.EVALUATION_FREQUENCY,
                                          train_model, prev_checkpointed_lr)

        # run the model training iteration
        train_timer.tic()
        workspace.RunNet(train_model.net.Proto().name)
        train_timer.toc(average=False)

        # logging after 1st iteration
        if curr_iter == start_model_iter:
            helpers.print_net(train_model)
            os.system('nvidia-smi')
            model_flops, model_params = helpers.get_flops_params(train_model)

        # check for NaN losses
        helpers.check_nan_losses(cfg.NUM_DEVICES)

        # log metrics every cfg.LOGGER_FREQUENCY iterations
        rem_train_iters = cfg.SOLVER.NUM_ITERATIONS - curr_iter - 1
        train_metrics_calculator.calculate_and_log_train_iter_metrics(
            curr_iter, train_timer, rem_train_iters, cfg.SOLVER.NUM_ITERATIONS,
            train_model.data_loader.minibatch_queue_size())

        # checkpoint model at CHECKPOINT_PERIOD
        if (cfg.CHECKPOINT.CHECKPOINT_ON
                and (curr_iter + 1) % cfg.CHECKPOINT.CHECKPOINT_PERIOD == 0):
            params_file = os.path.join(
                checkpoint_dir, 'c2_model_iter{}.pkl'.format(curr_iter + 1))
            checkpoints.save_model_params(model=train_model,
                                          params_file=params_file,
                                          model_iter=curr_iter,
                                          checkpoint_dir=checkpoint_dir)

        if (curr_iter + 1) % cfg.TRAIN.EVALUATION_FREQUENCY == 0:
            train_metrics_calculator.finalize_metrics()
            # test the model if testing is enabled
            if cfg.MODEL.TEST_MODEL:
                test_metrics_calculator.reset()
                logger.info("=> Testing model...")
                for test_iter in range(0, total_test_iters):
                    # run a test iteration
                    test_timer.tic()
                    workspace.RunNet(test_model.net.Proto().name)
                    test_timer.toc()
                    rem_test_iters = (total_test_iters - test_iter - 1)
                    num_rem_iter = (cfg.SOLVER.NUM_ITERATIONS - curr_iter - 1)
                    num_rem_ep = num_rem_iter / cfg.TRAIN.EVALUATION_FREQUENCY
                    if (test_iter + 1) % cfg.LOGGER_FREQUENCY == 0:
                        rem_test_iters += int(total_test_iters * num_rem_ep)
                    test_metrics_calculator.calculate_and_log_test_iter_metrics(
                        test_iter, test_timer, rem_test_iters,
                        total_test_iters)
                test_metrics_calculator.finalize_metrics()
                test_metrics_calculator.compute_and_log_epoch_best_metric(
                    model_iter=curr_iter)
            json_stats = metrics_helper.get_json_stats_dict(
                train_metrics_calculator,
                test_metrics_calculator,
                curr_iter,
                model_flops,
                model_params,
            )

            json_stats['average_time'] = round(
                train_timer.average_time + test_timer.average_time, 3)
            metrics_helper.print_json_stats(json_stats)
            train_metrics_calculator.reset()

    if test_metrics_calculator is not None:
        test_metrics_calculator.log_best_model_metrics(
            model_iter=curr_iter,
            total_iters=cfg.SOLVER.NUM_ITERATIONS,
        )

    train_model.data_loader.shutdown_dataloader()
    if cfg.MODEL.TEST_MODEL:
        test_model.data_loader.shutdown_dataloader()

    logger.info('Training has successfully finished...exiting!')
    os._exit(0)
Example #14
def test_net(opts):
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    np.random.seed(cfg.RNG_SEED)
    prefix, device = helpers.get_prefix_and_device()

    ############################################################################
    name = '{}_test'.format(cfg.MODEL.MODEL_NAME)
    logger.info(
        '=================Creating model: {}============='.format(name))
    data_type, batch_size = cfg['TEST'].DATA_TYPE, cfg['TEST'].BATCH_SIZE
    test_model = model_builder.ModelBuilder(
        name=name,
        train=False,
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        split=data_type,
        ws_nbytes_limit=(cfg.CUDNN_WORKSPACE_LIMIT * 1024 * 1024),
    )

    test_model.build_model()
    test_model.create_net()
    test_model.start_data_loader()

    test_metrics_calculator = test_model.get_metrics_calculator(
        data_type, batch_size, prefix, generate_json=opts.generate_json)

    test_timer = Timer()
    total_test_iters = helpers.get_num_test_iter(test_model.input_db)
    if opts.generate_json:
        # run double the number of iterations so that every image is covered at
        # least once despite the multi-threaded / multi-process nature of the
        # dataloader; this does not affect accuracy in any way.
        total_test_iters = 2 * total_test_iters
    logger.info('Test epoch iters: {}'.format(total_test_iters))

    # save proto for debugging
    helpers.save_model_proto(test_model)

    ############################################################################
    # initialize the model from the checkpoint
    if cfg.TEST.PARAMS_FILE:
        checkpoints.load_model_from_params_file(
            test_model, params_file=cfg.TEST.PARAMS_FILE, checkpoint_dir=None)
    else:
        logger.info('No params files specified for testing model. Aborting!')
        os._exit(0)

    ############################################################################
    logger.info("Testing model...")
    test_metrics_calculator.reset()
    for test_iter in range(0, total_test_iters):
        test_timer.tic()
        workspace.RunNet(test_model.net.Proto().name)
        test_timer.toc()
        if test_iter == 0:
            helpers.print_net(test_model)
        rem_test_iters = total_test_iters - test_iter - 1
        test_metrics_calculator.calculate_and_log_test_iter_metrics(
            test_iter,
            test_timer,
            rem_test_iters,
            total_test_iters,
            input_db=test_model.input_db)
    test_metrics_calculator.finalize_metrics()
    test_metrics_calculator.compute_and_log_epoch_best_metric(
        model_iter=test_iter)
    test_metrics_calculator.log_best_model_metrics(test_iter, total_test_iters)
    if opts.generate_json:
        json_predictions = test_metrics_calculator.get_json_predictions()
        for bl in cfg.ACCURACY_BLOBS:
            output_file = os.path.join(opts.output_path,
                                       '{}_json_preds.json'.format(bl))
            with open(output_file, 'w') as fp:
                json.dump(json_predictions[bl], fp)
            logger.info('Saved {} json predictions to: {}'.format(
                bl, output_file))
    logger.info('Total images tested: {}'.format(
        test_metrics_calculator.split_N))
    logger.info('Done!!!')
    test_model.data_loader.shutdown_dataloader()