Example #1
def ExtractFeatures(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(",")]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    my_arg_scope = {
        "order": "NCHW",
        "use_cudnn": True,
        "cudnn_exhaustive_search": True
    }

    model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope)

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=args.decode_type,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=args.input_type == 0,
        get_optical_flow=args.input_type == 1,
        get_video_id=args.get_video_id,
        get_start_frame=args.get_start_frame,
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + "_reader",
        input_data=args.test_data,
    )

    reader, num_examples = reader_utils.create_data_reader(
        model, **reader_args)

    def input_fn(model):
        model_helper.AddVideoInput(model, reader, **video_input_args)

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of
                         if args.input_type == 1 else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == "pickle":
        model_loader.LoadModelFromPickleFile(model, args.load_model_path)
    elif args.db_type == "minidb":
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    def fetchActivations(model, outputs, num_iterations):

        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            num_devices = 1  # default for cpu
            if num_gpus > 0:
                num_devices = num_gpus
            for g in range(num_devices):
                for output_name in outputs:
                    blob_name = "gpu_{}/{}".format(g, output_name)
                    activations = workspace.FetchBlob(blob_name)
                    if output_name not in all_activations:
                        all_activations[output_name] = []
                    all_activations[output_name].append(activations)

            if counter % 20 == 0:
                log.info("{}/{} iterations".format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    outputs = [name.strip() for name in args.features.split(",")]
    assert len(outputs) > 0

    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for index in range(len(outputs)):
        log.info("Read '{}' with shape {}".format(
            outputs[index], activations[outputs[index]].shape))

    if args.output_path:
        output_path = args.output_path
    else:
        output_path = os.path.dirname(args.test_data) + "/features.pickle"

    log.info("Writing to {}".format(output_path))
    if args.save_h5:
        with h5py.File(output_path, "w") as handle:
            for name, activation in activations.items():
                handle.create_dataset(name, data=activation)
    else:
        with open(output_path, "wb") as handle:
            pickle.dump(activations, handle)

    # perform sanity check
    if args.sanity_check == 1:  # check clip accuracy
        assert args.multi_label == 0
        clip_acc = 0
        softmax = activations["softmax"]
        label = activations["label"]
        for i in range(len(softmax)):
            sorted_preds = np.argsort(softmax[i])
            sorted_preds[:] = sorted_preds[::-1]
            if sorted_preds[0] == label[i]:
                clip_acc += 1
        log.info("Sanity check --- clip accuracy: {}".format(clip_acc /
                                                             len(softmax)))
    elif args.sanity_check == 2:  # check auc
        assert args.multi_label == 1
        prob = activations["prob"]
        label = activations["label"]
        mean_auc, mean_ap, mean_wap, _ = metric.mean_ap_metric(prob, label)
        log.info("Sanity check --- AUC: {}, mAP: {}, mWAP: {}".format(
            mean_auc, mean_ap, mean_wap))
Example #2
    def test_forward_optim_tree_harder(self, input_dim, output_dim,
                                       batch_size):
        m = model_helper.ModelHelper()
        m.net.Proto().type = "dag"
        m.net.Proto().num_workers = 4
        m.net.AddExternalInput("label")
        m.net.AddExternalInput("data")

        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)

            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

            # Branch
            fc3b = brew.fc(m,
                           fc2,
                           "fc3b",
                           dim_in=output_dim,
                           dim_out=output_dim)
            fc4b = brew.fc(m,
                           fc3b,
                           "fc4b",
                           dim_in=output_dim,
                           dim_out=output_dim)
            fc5b = brew.fc(m,
                           fc4b,
                           "fc5b",
                           dim_in=output_dim,
                           dim_out=output_dim)

            fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")
            fc5sum.Relu([], "relu1") \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data"], "name_x/")

        blobs_after = count_blobs(optim_proto)

        # Extra test for when one of the parameters is also an input.
        # This caused a bug before.
        optim_proto_extra_input = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data", "name_x/fc1_w"], "name_x/")
        blobs_after_extra_input = count_blobs(optim_proto_extra_input)
        self.assertEqual(blobs_after, blobs_after_extra_input)
        ###

        print(str(optim_proto))
        self.assertLess(blobs_after, blobs_before)

        # Test networks produce exactly same results
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
Example #3
def build_embedding_encoder(
    model,
    encoder_params,
    inputs,
    input_lengths,
    vocab_size,
    embeddings,
    embedding_size,
    use_attention,
    num_gpus=0,
    scope=None,
):
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )

    assert len(encoder_params['encoder_layer_configs']) == 1
    encoder_num_units = (
        encoder_params['encoder_layer_configs'][0]['num_units'])
    with core.NameScope(scope or ''):
        encoder_initial_cell_state = model.param_init_net.ConstantFill(
            [],
            ['encoder_initial_cell_state'],
            shape=[encoder_num_units],
            value=0.0,
        )
        encoder_initial_hidden_state = model.param_init_net.ConstantFill(
            [],
            'encoder_initial_hidden_state',
            shape=[encoder_num_units],
            value=0.0,
        )
        # Choose corresponding rnn encoder function
        if encoder_params['use_bidirectional_encoder']:
            rnn_encoder_func = rnn_bidirectional_encoder
            encoder_output_dim = 2 * encoder_num_units
        else:
            rnn_encoder_func = rnn_unidirectional_encoder
            encoder_output_dim = encoder_num_units

    (
        encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
    ) = rnn_encoder_func(
        model,
        embedded_encoder_inputs,
        input_lengths,
        encoder_initial_hidden_state,
        encoder_initial_cell_state,
        embedding_size,
        encoder_num_units,
        use_attention,
        scope=scope,
    )
    weighted_encoder_outputs = None

    return (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
        encoder_output_dim,
    )
Example #4
@contextlib.contextmanager
def GpuNameScope(gpu_id):
    """Create a name scope for GPU device `gpu_id`."""
    with core.NameScope('gpu_{:d}'.format(gpu_id)):
        yield
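
With the contextlib decorator added above (it assumes `import contextlib` at module level), GpuNameScope can be used as a context manager. A minimal, hypothetical sketch of its effect on blob naming (the blob name below is made up):

from caffe2.python import core

with GpuNameScope(0):
    # Blobs created inside the scope pick up the "gpu_0/" prefix.
    weight = core.ScopedBlobReference('conv1_w')
assert str(weight) == 'gpu_0/conv1_w'
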
Example #5
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: remember to instantiate the reader outside of
                         this function so that all GPUs share the same reader
                         object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient.
                        Signature: forward_pass_builder_fun(model)
      param_update_builder_fun:
                        Function that adds operators that are run after the
                        gradient update, such as updating the weights and
                        applying weight decay. The function is also passed the
                        learning rate scaling factor; you should multiply the
                        learning rate by this factor to keep results identical
                        for the same total batch size, regardless of the
                        number of GPUs.
                        Signature: param_update_builder_fun(model, lr_scale)
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. To create rendezvous,
                        use <TBD>.
      net_type:         Network type

    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
    assert model_helper_obj.params == [], "Model needs to be empty"

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params)
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [
        param_to_grad[p] for p in model_helper_obj.params if p in param_to_grad
    ]
    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)
    _AllReduceGradients(devices, model_helper_obj, rendezvous)

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    lr_scale = 1.0 / (len(devices) * num_shards)
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj, lr_scale)

    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)

    if optimize_gradient_memory:
        _OptimizeGradientMemory(model_helper_obj, losses_by_gpu, devices)
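
A minimal, hypothetical sketch of how the three builder functions described in the docstring fit together. The toy single-FC model and every name below are illustrative only (Examples #1, #7, and #11 show real callers); it assumes at least two GPUs and that the "data" and "label" blobs are fed before the net is run.

from caffe2.python import cnn, data_parallel_model

toy_model = cnn.CNNModelHelper(order="NCHW", name="toy_parallel_model")

def add_inputs(model):
    # In real code the reader is created once, outside this function,
    # so that all GPUs share the same reader object.
    pass

def add_forward(model):
    fc = model.FC("data", "fc", dim_in=16, dim_out=10)
    _, loss = model.net.SoftmaxWithLoss([fc, "label"], ["softmax", "loss"])
    return [loss]  # must be a list of loss blob references

def add_param_update(model, lr_scale):
    # Add weight-update operators here; scale the learning rate by lr_scale.
    pass

data_parallel_model.Parallelize_GPU(
    toy_model,
    input_builder_fun=add_inputs,
    forward_pass_builder_fun=add_forward,
    param_update_builder_fun=add_param_update,
    devices=[0, 1],
)
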
Example #6
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            ('id_score_pairs',
             Map(
                 Scalar(np.int32),
                 Map(Scalar(np.int64),
                     Scalar(np.float32),
                     keys_name='ids',
                     values_name='scores'),
             )),
            # additional scalar information
            ('metadata',
             Struct(
                 ('user_id', Scalar(np.int64)),
                 ('user_embed', Scalar((np.float32, 2))),
                 ('query', Scalar(str)),
             )),
        )
        """
        This is what the flattened fields for this schema look like, along
        with their types. Each one of these fields will be stored, read and
        written as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(expected_fields, schema.field_names(),
                     schema.field_types())
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The dataset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2,
             32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        with core.NameScope('init'):
            ds.init_empty(net)

            content_blobs = NewRecord(net, contents)
            FeedRecord(content_blobs, contents)
            writer = ds.writer(init_net=net)
            writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            (
                [], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run the read net multiple times and assert that we are reading
        the entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net, True)

        for entry in entries:
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all operations on the data are expressible as Caffe2 operators,
        we don't need to load the data into Python; we can iterate through
        the dataset in a single Plan.

        Here we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter, but we add it here
        # for generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        workspace.RunNet(str(read_next_net))
        self.assertEquals(True, workspace.FetchBlob(should_stop))
        """
        8. Random Access a dataset with loop_over = true

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for _ in range(len(entries) * 3):
            workspace.RunNet(str(read_next_net))
            self.assertEquals(False, workspace.FetchBlob(should_stop))
        """
        9. Sort and shuffle a dataset

        This sorts the dataset using the score of a certain column,
        and then shuffles within each chunk of size batch_size * shuffle_size
        before shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        Trim a dataset
        """
        trim_net = core.Net('trim_ds')
        ds.trim(trim_net, multiple_of=2)
        workspace.RunNetOnce(trim_net)
        trimmed = FetchRecord(ds.content())
        EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
        actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
        self.assertEquals(EXPECTED_SIZES, actual_sizes)
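
The field-flattening convention illustrated by `expected_fields` above can also be checked in isolation. A small sketch using the same schema helpers as the test (only the `floats` Map field is reproduced here):

import numpy as np
from caffe2.python.schema import Map, Scalar, Struct

# A top-level Map named 'floats' flattens into lengths/keys/values tensors,
# matching the expected_fields list asserted in the test above.
s = Struct(('floats', Map(Scalar(np.int32), Scalar(np.float32))))
assert s.field_names() == [
    'floats:lengths', 'floats:values:keys', 'floats:values:values']
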
Example #7
    def run_model(self, V, gpu_devices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec,
                    'gpuvec_{}'.format(num),
                )
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'], ['gpu_vec_gathered_{}'.format(num)])
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill([],
                                                     "vec_{}".format(num),
                                                     shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_2", shape=[batch_per_device, 16]))
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size)
            full_labels = full_indices[:] % batch_per_device

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(vec, orig_vec)
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format(
                    g, num)) for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.
                    format(len(idx), len(grad_slice)))
Example #8
def rnn_bidirectional_encoder(model, embedded_inputs, input_lengths,
                              initial_hidden_state, initial_cell_state,
                              embedding_size, encoder_num_units,
                              use_attention):
    """ Bidirectional (forward pass and backward pass) LSTM encoder."""

    with core.NameScope('', reset=True):
        # Forward pass
        (
            outputs_fw,
            final_hidden_state_fw,
            _,
            final_cell_state_fw,
        ) = recurrent.LSTM(
            model=model,
            input_blob=embedded_inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope='forward_encoder',
            outputs_with_grads=([0] if use_attention else [1, 3]),
        )

        # Backward pass
        reversed_embedded_inputs = model.net.ReversePackedSegs(
            [embedded_inputs, input_lengths],
            ['reversed_embedded_inputs'],
        )

        (
            outputs_bw,
            final_hidden_state_bw,
            _,
            final_cell_state_bw,
        ) = recurrent.LSTM(
            model=model,
            input_blob=reversed_embedded_inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope='backward_encoder',
            outputs_with_grads=([0] if use_attention else [1, 3]),
        )

        outputs_bw = model.net.ReversePackedSegs(
            [outputs_bw, input_lengths],
            ['outputs_bw'],
        )

        # Concatenate forward and backward results
        outputs, _ = model.net.Concat(
            [outputs_fw, outputs_bw],
            ['outputs', 'outputs_dim'],
            axis=2,
        )

        final_hidden_state, _ = model.net.Concat(
            [final_hidden_state_fw, final_hidden_state_bw],
            ['final_hidden_state', 'final_hidden_state_dim'],
            axis=2,
        )

        final_cell_state, _ = model.net.Concat(
            [final_cell_state_fw, final_cell_state_bw],
            ['final_cell_state', 'final_cell_state_dim'],
            axis=2,
        )
    return outputs, final_hidden_state, final_cell_state
Example #9
def LoadModelFromPickleFile(
    model,
    pkl_file,
    use_gpu=True,
    root_gpu_id=0,
    bgr2rgb=False,
    inflating=True,
    collapsing=True,
    center_init=False,
):

    ws_blobs = workspace.Blobs()
    with open(pkl_file, 'rb') as fopen:  # binary mode, required for pickle
        blobs = pickle.load(fopen)

    if 'blobs' in blobs:
        blobs = blobs['blobs']

    unscoped_blob_names = OrderedDict()
    for blob in model.GetAllParams():
        unscoped_blob_names[unscope_name(str(blob))] = True

    if use_gpu:
        device_opt = caffe2_pb2.CUDA
    else:
        device_opt = caffe2_pb2.CPU

    with core.NameScope('gpu_{}'.format(root_gpu_id)):
        with core.DeviceScope(core.DeviceOption(device_opt, root_gpu_id)):
            for unscoped_blob_name in unscoped_blob_names.keys():
                scoped_blob_name = scoped_name(unscoped_blob_name)
                if unscoped_blob_name not in blobs:
                    log.info('{} not found'.format(unscoped_blob_name))
                    continue
                if scoped_blob_name in ws_blobs:
                    ws_blob = workspace.FetchBlob(scoped_blob_name)
                    target_shape = ws_blob.shape
                    if target_shape == blobs[unscoped_blob_name].shape:
                        log.info('copying {}'.format(unscoped_blob_name))
                        if bgr2rgb and unscoped_blob_name == 'conv1_w':
                            feeding_blob = FlipBGR2RGB(
                                blobs[unscoped_blob_name]
                            )
                        else:
                            feeding_blob = blobs[unscoped_blob_name]

                    elif ws_blob.ndim == 5:
                        # inflate from FC to 1x1x1 conv
                        if blobs[unscoped_blob_name].ndim == 2:
                            log.info('convolutionalize {}'.format(
                                unscoped_blob_name)
                            )
                            feeding_blob = blobs[unscoped_blob_name]
                            feeding_blob = np.reshape(
                                feeding_blob,
                                feeding_blob.shape + (1, 1, 1)
                            )
                        else:
                            # may need to inflate
                            if not inflating:
                                log.info(
                                    '{} found, but inflating is ignored'.format(
                                        unscoped_blob_name
                                    )
                                )
                                continue
                            feeding_blob = InflateBlob(
                                blobs[unscoped_blob_name],
                                target_shape,
                                unscoped_blob_name,
                                (0 if center_init else 1)
                            )

                    elif ws_blob.ndim == 4:
                        # may need to collapse
                        if not collapsing:
                            log.info(
                                '{} found, but collapsing is ignored'.format(
                                    unscoped_blob_name
                                )
                            )
                            continue
                        feeding_blob = CollapseBlob(
                            blobs[unscoped_blob_name],
                            target_shape,
                            unscoped_blob_name
                        )
                    # either copy, inflate, or collapse blob
                    workspace.FeedBlob(
                        scoped_blob_name,
                        feeding_blob.astype(np.float32, copy=False)
                    )
Example #10
    def read(self, read_net):
        with core.NameScope(read_net.NextName(self.name)):
            status = read_net.NextName()
            fields = read_net.SafeDequeueBlobs(
                self.blobs_queue, self._schema.field_names() + [status])
            return (fields[-1], fields[:-1])
Example #11
def Test(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        total_batch_size = args.batch_size * num_gpus
        log.info("Running on GPUs: {}".format(gpus))
        log.info("total_batch_size: {}".format(total_batch_size))
    else:
        total_batch_size = args.batch_size
        log.info("Running on CPU")
        log.info("total_batch_size: {}".format(total_batch_size))

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type == 1
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = model_builder.create_data_reader(
        test_model,
        name="test_reader",
        input_data=args.test_data,
    )

    if args.num_iter <= 0:
        num_iter = int(number_of_examples / total_batch_size)
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        model_helper.AddVideoInput(
            test_model,
            test_reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=1,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=4,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus
        )
    else:
        test_model._device_type = caffe2_pb2.CPU
        test_model._devices = [0]
        device_opt = core.DeviceOption(test_model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                test_input_fn(test_model)
                create_model_ops(test_model, 1.0)

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
            data_parallel_model.FinalizeAfterCheckpoint(test_model)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0]
            )
            data_parallel_model.FinalizeAfterCheckpoint(test_model)
        else:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=False
            )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))


    # metric counters for classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0

    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)
        num_devices = 1  # default for cpu
        if args.num_gpus > 0:
            num_devices = args.num_gpus

        for g in range(num_devices):
            # get labels
            label = workspace.FetchBlob(
                "gpu_{}".format(g) + '/label'
            )
            # get predictions
            predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * args.clip_per_video

            for j in range(args.batch_size):
                # get label for one video
                sample_label = label[j * args.clip_per_video]
                # get clip accuracy
                for k in range(args.clip_per_video):
                    c1, _ = metric.accuracy_metric(
                        predicts[j * args.clip_per_video + k, :],
                        label[j * args.clip_per_video + k])
                    clip_acc = clip_acc + c1
                # get all clip predictions for one video
                all_clips = predicts[
                    j * args.clip_per_video:(j + 1) * args.clip_per_video, :]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)
                c1, ck = metric.accuracy_metric(
                    video_pred, sample_label, args.top_k)
                video_top1 = video_top1 + c1
                video_topk = video_topk + ck

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format(
                i,
                num_iter,
                clip_acc / clip_count,
                video_top1 / video_count,
                video_topk / video_count))

    log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
        clip_acc / clip_count,
        video_top1 / video_count,
        args.top_k,
        video_topk / video_count
    ))

    if num_gpus > 0:
        flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0])
    else:
        flops, params = model_helper.GetFlopsAndParams(test_model)
    log.info('FLOPs: {}, params: {}'.format(flops, params))
Example #12
    def _build_decoder(
        self,
        model,
        step_model,
        model_params,
        scope,
        previous_tokens,
        timestep,
        fake_seq_lengths,
    ):
        attention_type = model_params['attention']
        assert attention_type in ['none', 'regular']
        use_attention = (attention_type != 'none')

        with core.NameScope(scope):
            encoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.source_vocab_size,
                embedding_size=model_params['encoder_embedding_size'],
                name='encoder_embeddings',
                freeze_embeddings=False,
            )

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        ) = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=model_params['encoder_type'],
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=encoder_embeddings,
            embedding_size=model_params['encoder_embedding_size'],
            use_attention=use_attention,
            num_gpus=0,
            scope=scope,
        )
        with core.NameScope(scope):
            # [max_source_length, beam_size, encoder_output_dim]
            encoder_outputs = model.net.Tile(
                encoder_outputs,
                'encoder_outputs_tiled',
                tiles=self.beam_size,
                axis=1,
            )
            if weighted_encoder_outputs is not None:
                weighted_encoder_outputs = model.net.Tile(
                    weighted_encoder_outputs,
                    'weighted_encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            decoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.target_vocab_size,
                embedding_size=model_params['decoder_embedding_size'],
                name='decoder_embeddings',
                freeze_embeddings=False,
            )
            embedded_tokens_t_prev = step_model.net.Gather(
                [decoder_embeddings, previous_tokens],
                'embedded_tokens_t_prev',
            )

        decoder_num_units = (
            model_params['decoder_layer_configs'][0]['num_units']
        )

        with core.NameScope(scope):
            if not use_attention and final_encoder_hidden_state is not None:
                final_encoder_hidden_state = model.net.Tile(
                    final_encoder_hidden_state,
                    'final_encoder_hidden_state_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )
            if not use_attention and final_encoder_cell_state is not None:
                final_encoder_cell_state = model.net.Tile(
                    final_encoder_cell_state,
                    'final_encoder_cell_state_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )
            initial_states = seq2seq_util.build_initial_rnn_decoder_states(
                model=model,
                encoder_num_units=encoder_output_dim,
                decoder_num_units=decoder_num_units,
                final_encoder_hidden_state=final_encoder_hidden_state,
                final_encoder_cell_state=final_encoder_cell_state,
                use_attention=use_attention,
            )

        if use_attention:
            decoder_cell = rnn_cell.LSTMWithAttentionCell(
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=model_params['decoder_embedding_size'],
                decoder_state_dim=decoder_num_units,
                name=self.scope(scope, 'decoder'),
                attention_type=attention.AttentionType.Regular,
                weighted_encoder_outputs=weighted_encoder_outputs,
                forget_bias=0.0,
                lstm_memory_optimization=False,
                attention_memory_optimization=True,
            )
            decoder_output_dim = decoder_num_units + encoder_output_dim
        else:
            decoder_cell = rnn_cell.LSTMCell(
                name=self.scope(scope, 'decoder'),
                input_size=model_params['decoder_embedding_size'],
                hidden_size=decoder_num_units,
                forget_bias=0.0,
                memory_optimization=False,
            )
            decoder_output_dim = decoder_num_units

        states_prev = step_model.net.AddExternalInputs(*[
            s + '_prev' for s in decoder_cell.get_state_names()
        ])
        _, states = decoder_cell.apply(
            model=step_model,
            input_t=embedded_tokens_t_prev,
            seq_lengths=fake_seq_lengths,
            states=states_prev,
            timestep=timestep,
        )
        if use_attention:
            with core.NameScope(scope or ''):
                decoder_outputs, _ = step_model.net.Concat(
                    [states[0], states[2]],
                    [
                        'states_and_context_combination',
                        '_states_and_context_combination_concat_dims',
                    ],
                    axis=2,
                )
        else:
            decoder_outputs = states[0]

        state_configs = [
            BeamSearchForwardOnly.StateConfig(
                initial_value=initial_state,
                state_prev_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state_prev,
                    offset=0,
                    window=1,
                ),
                state_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state,
                    offset=1,
                    window=1,
                ),
            )
            for initial_state, state_prev, state in zip(
                initial_states,
                states_prev,
                states,
            )
        ]
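        # Each StateConfig above wires one recurrent state into the beam search:
        # the step net reads the '<state>_prev' value at the current position of
        # the state tensor (offset=0) and writes the updated state one timestep
        # ahead (offset=1); window=1 exposes a single timestep at a time.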

        with core.NameScope(scope):
            decoder_outputs_flattened, _ = step_model.net.Reshape(
                [decoder_outputs],
                [
                    'decoder_outputs_flattened',
                    'decoder_outputs_and_contexts_combination_old_shape',
                ],
                shape=[-1, decoder_output_dim],
            )
            output_logits = seq2seq_util.output_projection(
                model=step_model,
                decoder_outputs=decoder_outputs_flattened,
                decoder_output_size=decoder_output_dim,
                target_vocab_size=self.target_vocab_size,
                decoder_softmax_size=model_params['decoder_softmax_size'],
            )
            # [1, beam_size, target_vocab_size]
            output_probs = step_model.net.Softmax(
                output_logits,
                'output_probs',
            )
            output_log_probs = step_model.net.Log(
                output_probs,
                'output_log_probs',
            )
            if use_attention:
                attention_weights = decoder_cell.get_attention_weights()
            else:
                attention_weights = step_model.net.ConstantFill(
                    [self.encoder_inputs],
                    'zero_attention_weights_tmp_1',
                    value=0.0,
                )
                attention_weights = step_model.net.Transpose(
                    attention_weights,
                    'zero_attention_weights_tmp_2',
                )
                attention_weights = step_model.net.Tile(
                    attention_weights,
                    'zero_attention_weights_tmp',
                    tiles=self.beam_size,
                    axis=0,
                )

        return (
            state_configs,
            output_log_probs,
            attention_weights,
        )
Example #13
def ExtractFeatures(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    log.info("Running on GPUs: {}".format(gpus))

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope)

    reader, num_examples = model_builder.create_data_reader(
        model,
        name="reader",
        input_data=args.test_data,
    )

    def input_fn(model):
        model_helper.AddVideoInput(
            model,
            reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=args.decode_type,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=args.num_decode_threads,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of
                         if args.input_type == 1 else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
            data_parallel_model.FinalizeAfterCheckpoint(model)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(model,
                                                 args.load_model_path,
                                                 use_gpu=True,
                                                 root_gpu_id=gpus[0])
        else:
            model_loader.LoadModelFromPickleFile(
                model,
                args.load_model_path,
                use_gpu=False,
            )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    def fetchActivations(model, outputs, num_iterations):

        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)
            num_devices = 1  # default for cpu

            for g in gpus if num_gpus > 0 else range(num_devices):
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(g) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    if output_name not in all_activations:
                        all_activations[output_name] = []
                    all_activations[output_name].append(activations)

            if counter % 20 == 0:
                log.info('{}/{} iterations'.format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    outputs = [name.strip() for name in args.features.split(',')]
    assert len(outputs) > 0

    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for index in range(len(outputs)):
        log.info("Read '{}' with shape {}".format(
            outputs[index], activations[outputs[index]].shape))

    if args.output_path:
        output_path = args.output_path
    else:
        output_path = os.path.dirname(args.test_data) + '/features.pickle'

    log.info('Writing to {}'.format(output_path))
    with open(output_path, 'wb') as handle:
        pickle.dump(activations, handle)

    # perform sanity check
    if args.sanity_check == 1:  # check clip accuracy
        clip_acc = 0
        softmax = activations['softmax']
        label = activations['label']
        for i in range(len(softmax)):
            sorted_preds = np.argsort(softmax[i])[::-1]
            if sorted_preds[0] == label[i]:
                clip_acc += 1
        log.info('Sanity check --- clip accuracy: {}'.format(
            clip_acc / float(len(softmax))))
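
The comment inside fetchActivations above explains why activations are appended to per-output lists and concatenated only once at the end: concatenating inside the loop would reallocate the growing array on every iteration. A minimal standalone sketch of that accumulate-then-concatenate pattern (collect_batches and the fake data below are illustrative, not part of the example):

import numpy as np

def collect_batches(batch_iter):
    # Append each minibatch array, then concatenate once: O(1) work per batch
    # instead of reallocating a growing array on every iteration.
    chunks = []
    for batch in batch_iter:
        chunks.append(batch)
    return np.concatenate(chunks)

# Fake per-iteration activations standing in for the fetched blobs:
fake_batches = (np.random.rand(4, 8).astype(np.float32) for _ in range(5))
all_feats = collect_batches(fake_batches)  # shape: (20, 8)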
Example #14
def Parallelize(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun=None,
    optimizer_builder_fun=None,
    post_sync_builder_fun=None,
    devices=None,
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
    max_concurrent_distributed_ops=16,
    cpu_device=False,
):
    '''
    Function to create a model that can run on many GPUs or CPUs.
      model_helper_obj: an object of ModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: Remember to instantiate reader outside of this
                         function so all devices share same reader object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient. Loss scale parameter
                        is passed, as you should scale the loss of your model
                        by 1.0 / the total number of devices.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying. This is called for each GPU separately.
                        Signature: param_update_builder_fun(model)
      optimizer_builder_fun:
                        Alternative to param_update_builder_fun, allows one
                        to add an optimizer for the whole model. Called only
                        once, without name or devicescope.
      post_sync_builder_fun:
                        Function applied after initial parameter sync has been
                        completed, such as keeping multi-precision parameters
                        in sync.
                        Signature: post_sync_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. To create rendezvous,
                        use <TBD>.
      net_type:         Network type
      optimize_gradient_memory: whether to apply 'memonger' to share blobs
                        in gradient computation to reduce memory footprint
      cpu_device:       Use CPU instead of GPU
    '''
    if devices is None:
        devices = list(range(0, workspace.NumCudaDevices()))

    if not cpu_device:
        for gpu in devices:
            if gpu >= workspace.NumCudaDevices():
                log.warning(
                    "** Only {} GPUs available, GPUs {} requested".format(
                        workspace.NumCudaDevices(), devices))
                break
        model_helper_obj._device_type = caffe2_pb2.CUDA
        model_helper_obj._device_prefix = "gpu"
        device_name = "GPU"
    else:
        model_helper_obj._device_type = caffe2_pb2.CPU
        model_helper_obj._device_prefix = "cpu"
        device_name = "CPU"

    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    num_workers = len(devices) * 4 + extra_workers
    max_concurrent_distributed_ops =\
        min(max_concurrent_distributed_ops, num_workers - 1)
    model_helper_obj.net.Proto().num_workers = num_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    has_parameter_updates = param_update_builder_fun is not None or \
        optimizer_builder_fun is not None
    assert not (
        param_update_builder_fun is not None
        and optimizer_builder_fun is not None
    ), 'Can only specify one of param_update_builder_fun, optimizer_builder_fun'

    for device in devices:
        device_opt = core.DeviceOption(model_helper_obj._device_type, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("{}_{}".format(model_helper_obj._device_prefix,
                                               device)):
                log.info("Model for {} : {}".format(device_name, device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if has_parameter_updates:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.GetComputedParams(''), [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        list(viewkeys(model_helper_obj._device_grouped_blobs))
    model_helper_obj._computed_param_names =\
        list(viewkeys(computed_params_grouped))

    if not has_parameter_updates:
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [
        param_to_grad[p] for p in model_helper_obj.params if p in param_to_grad
    ]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(model_helper_obj, devices,
                                       grads_ordered, non_datapar_grads)
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = list(viewkeys(gradients_grouped))
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous,
                                 use_nccl)

    if len(model_helper_obj._grad_names) > 0:
        # Gradients in reverse order
        reverse_ordered_grads = _GetReverseOrderedGrads(model_helper_obj)
        assert (len(reverse_ordered_grads) > 0)
        _AllReduceBlobs(
            reverse_ordered_grads,
            devices,
            model_helper_obj,
            model_helper_obj.net,
            rendezvous,
            use_nccl,
            max_concurrent_distributed_ops,
        )
    else:
        log.info("NOTE: Param builder function did not create any parameters.")

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']

    if param_update_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(model_helper_obj._device_type,
                                           device)
            with core.DeviceScope(device_opt):
                with core.NameScope("{}_{}".format(
                        model_helper_obj._device_prefix, device)):
                    param_update_builder_fun(model_helper_obj)
    else:
        log.info("Calling optimizer builder function")
        optimizer_builder_fun(model_helper_obj)

    (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj)
    sync_blobs_grouped = _GroupByDevice(
        model_helper_obj,
        devices,
        sync_blobs,
        [],
    )
    model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    _SyncAllParams(devices,
                   model_helper_obj,
                   model_helper_obj.param_init_net,
                   model_helper_obj.param_init_net,
                   rendezvous,
                   sync_names,
                   max_concurrent_distributed_ops=1)

    # Handle any operations that need to be done after parameter sync
    # i.e. making sure multi-precision copies of parameters are up-to-date
    if post_sync_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(model_helper_obj._device_type,
                                           device)
            with core.DeviceScope(device_opt):
                with core.NameScope("{}_{}".format(
                        model_helper_obj._device_prefix, device)):
                    post_sync_builder_fun(model_helper_obj)

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices)

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
    ]
    model_helper_obj._data_parallel_model_nets = [model_helper_obj.net]
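
The docstring above spells out the builder-function signatures that Parallelize expects. The following is only a rough usage sketch under those assumptions; the tiny FC model, the blob names, and cpu_device=True are placeholders chosen for illustration, not part of the original API surface:

from caffe2.python import brew, model_helper

def example_input_fn(model):
    # Real callers typically add data-reading ops here; a shared reader must be
    # created outside this function so all devices use the same reader object.
    model.net.AddExternalInput("data", "label")

def example_forward_fn(model, loss_scale):
    fc = brew.fc(model, "data", "fc", dim_in=16, dim_out=4)
    softmax, loss = model.SoftmaxWithLoss([fc, "label"], ["softmax", "loss"])
    # Scale the loss by 1.0 / total number of devices, as the docstring requires.
    loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
    return [loss]

def example_param_update_fn(model):
    # Plain SGD via WeightedSum: param <- param * 1 + grad * LR
    one = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    lr = model.param_init_net.ConstantFill([], "LR", shape=[1], value=-0.01)
    for param in model.GetParams():
        model.WeightedSum([param, one, model.param_to_grad[param], lr], param)

train_model = model_helper.ModelHelper(name="parallelize_sketch")
Parallelize(
    train_model,
    input_builder_fun=example_input_fn,
    forward_pass_builder_fun=example_forward_fn,
    param_update_builder_fun=example_param_update_fn,
    devices=[0],          # a single CPU "device" for this sketch
    cpu_device=True,
)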
Example #15
def initialize_gpu_0_from_weights_file(model, weights_file):
    logger.info('Loading from: {}'.format(weights_file))
    is_first_init = 'trainedCOCO' in weights_file
    ws_blobs = workspace.Blobs()
    with open(weights_file, 'rb') as f:
        src_blobs = pickle.load(f)
    if 'cfg' in src_blobs:
        saved_cfg = yaml.load(src_blobs['cfg'])
        configure_bbox_reg_weights(model, saved_cfg)
    if 'blobs' in src_blobs:
        # Backwards compat--dictionary used to be only blobs, now they are
        # stored under the 'blobs' key
        src_blobs = src_blobs['blobs']
    # Initialize weights on GPU 0 only
    unscoped_param_names = OrderedDict()  # Print these out in model order
    for blob in model.params:
        unscoped_param_names[utils.blob.unscope_name(str(blob))] = True
    with core.NameScope('gpu_0'):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for unscoped_param_name in unscoped_param_names.keys():
                if (unscoped_param_name.find(']_') >= 0 and
                        unscoped_param_name not in src_blobs):
                    # Special case for sharing initialization from a pretrained
                    # model:
                    # If a blob named '_[xyz]_foo' is in model.params and not in
                    # the initialization blob dictionary, then load source blob
                    # 'foo' into destination blob '_[xyz]_foo'
                    src_name = unscoped_param_name[
                        unscoped_param_name.find(']_') + 2:]
                else:
                    src_name = unscoped_param_name
                if src_name not in src_blobs:
                    logger.info('{:s} not found'.format(src_name))
                    continue
                dst_name = core.ScopedName(unscoped_param_name)
                has_momentum = src_name + '_momentum' in src_blobs
                has_momentum_str = ' [+ momentum]' if has_momentum else ''
                logger.info('{:s}{:} loaded from weights file into {:s}: {}'.
                            format(
                                src_name, has_momentum_str,
                                dst_name, src_blobs[src_name].shape))
                pretrained_w = src_blobs[src_name]
                if dst_name in ws_blobs:
                    # If the blob is already in the workspace, make sure that it
                    # matches the shape of the loaded blob
                    ws_blob = workspace.FetchBlob(dst_name)
                    if ws_blob.shape != src_blobs[src_name].shape:
                        pretrained_w = inflate_weights(
                            pretrained_w, ws_blob, src_name, src_blobs)
                workspace.FeedBlob(
                    dst_name,
                    pretrained_w.astype(np.float32, copy=False))
                if has_momentum and not is_first_init:
                    # when feeding momentum, we're probably resuming from
                    # previous checkpoint. So all the inflated stuff won't be
                    # needed in that case
                    workspace.FeedBlob(
                        dst_name + '_momentum',
                        src_blobs[src_name + '_momentum'].astype(
                            np.float32, copy=False))

    # Add _rm/_riv BN mean/var params, in case the pre-trained model contains them.
    # Needed to test the scratch trained models.
    for src_name in src_blobs.keys():
        if src_name.endswith('_rm') or src_name.endswith('_riv'):
            with core.NameScope('gpu_0'):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                    dst_name = core.ScopedName(src_name)
                    workspace.FeedBlob(dst_name, src_blobs[src_name])
                    logger.info('Loaded BN param {}'.format(src_name))

    # We preserve blobs that are in the weights file but not used by the current
    # model. We load these into CPU memory under the '__preserve__/' namescope.
    # These blobs will be stored when saving a model to a weights file. This
    # feature allows for alternating optimization of Faster R-CNN in which blobs
    # unused by one step can still be preserved forward and used to initialize
    # another step.
    for src_name in src_blobs.keys():
        if (src_name not in unscoped_param_names and
                not src_name.endswith('_momentum') and
                src_blobs[src_name] is not None):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                workspace.FeedBlob(
                    '__preserve__/{:s}'.format(src_name), src_blobs[src_name])
                logger.info(
                    '{:s} preserved in workspace (unused)'.format(src_name))
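
The ']_' special case above shares initialization: a destination blob named '_[xyz]_foo' that is missing from the weights file is loaded from source blob 'foo'. A tiny standalone illustration of that string rule (resolve_src_name is a made-up helper for this sketch):

def resolve_src_name(unscoped_param_name, src_blobs):
    # Mirror of the lookup rule used in initialize_gpu_0_from_weights_file.
    if unscoped_param_name.find(']_') >= 0 and unscoped_param_name not in src_blobs:
        return unscoped_param_name[unscoped_param_name.find(']_') + 2:]
    return unscoped_param_name

src_blobs = {'fcn1_w': None, '_[seg]_fcn2_w': None}
assert resolve_src_name('_[mask]_fcn1_w', src_blobs) == 'fcn1_w'        # shared init
assert resolve_src_name('_[seg]_fcn2_w', src_blobs) == '_[seg]_fcn2_w'  # direct hit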
Example #16
    def test_resnet_shared_grads(self, with_shapes, gc, dc):
        model = cnn.CNNModelHelper(
            order="NCHW",
            name="test",
            cudnn_exhaustive_search=True,
        )
        with core.NameScope("gpu_0"):
            data = model.net.AddExternalInput("gpu_0/data")
            label = model.net.AddExternalInput("gpu_0/label")
            (_softmax, loss) = resnet.create_resnet50(
                model,
                data,
                num_input_channels=3,
                num_labels=1000,
                label=label,
                is_test=False,
            )

        param_to_grad = model.AddGradientOperators([loss])

        (shapes, types) = workspace.InferShapesAndTypes(
            [model.param_init_net, model.net],
            {'gpu_0/data': [4, 3, 227, 227],
             'gpu_0/label': [4]},
        )

        count_before = count_blobs(model.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            model.net,
            ["gpu_0/loss"],
            set(model.param_to_grad.values()),
            "gpu_0/",
            share_activations=True,
            dont_share_blobs=set([str(param_to_grad["gpu_0/conv1_w"])]),
            blob_shapes=shapes if with_shapes else None,
        )
        count_after = count_blobs(optim_proto)
        self.assertTrue(count_after < count_before)

        # Run model and compare results. We check that the loss is the same
        # and also that the final gradient (conv1_w_grad) is the same.
        workspace.RunNetOnce(model.param_init_net)
        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
        label = (np.random.rand(4) * 1000).astype(np.int32)

        workspace.FeedBlob("gpu_0/data", data)
        workspace.FeedBlob("gpu_0/label", label)

        workspace.RunNetOnce(model.net)
        model.net.Proto().type = 'dag'
        model.net.Proto().num_workers = 4
        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
        workspace.FeedBlob(param_to_grad["gpu_0/conv1_w"], np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        optim_conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])

        print("before: {} after: {}".format(count_before, count_after))

        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)
Example #17
    def __init__(self,
                 model,
                 input_record,
                 seed=0,
                 modulo=None,
                 use_hashing=True,
                 use_divide_mod=False,
                 divisor=None,
                 name='sparse_feature_hash',
                 **kwargs):
        super(SparseFeatureHash, self).__init__(model, name, input_record,
                                                **kwargs)

        assert use_hashing + use_divide_mod < 2, "use_hashing and use_divide_mod cannot be set true at the same time."

        if use_divide_mod:
            assert divisor >= 1, 'Unexpected divisor: {}'.format(divisor)

            self.divisor = self.create_param(
                param_name='divisor',
                shape=[1],
                initializer=('GivenTensorInt64Fill', {
                    'values': np.array([divisor])
                }),
                optimizer=model.NoOptim)

        self.seed = seed
        self.use_hashing = use_hashing
        self.use_divide_mod = use_divide_mod

        if schema.equal_schemas(input_record, IdList):
            self.modulo = modulo or self.extract_hash_size(
                input_record.items.metadata)
            metadata = schema.Metadata(
                categorical_limit=self.modulo,
                feature_specs=input_record.items.metadata.feature_specs,
                expected_value=input_record.items.metadata.expected_value)
            with core.NameScope(name):
                self.output_schema = schema.NewRecord(model.net, IdList)
            self.output_schema.items.set_metadata(metadata)

        elif schema.equal_schemas(input_record, IdScoreList):
            self.modulo = modulo or self.extract_hash_size(
                input_record.keys.metadata)
            metadata = schema.Metadata(
                categorical_limit=self.modulo,
                feature_specs=input_record.keys.metadata.feature_specs,
                expected_value=input_record.keys.metadata.expected_value)
            with core.NameScope(name):
                self.output_schema = schema.NewRecord(model.net, IdScoreList)
            self.output_schema.keys.set_metadata(metadata)

        else:
            assert False, "Input type must be one of (IdList, IdScoreList)"

        assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)
        if input_record.lengths.metadata:
            self.output_schema.lengths.set_metadata(
                input_record.lengths.metadata)

        # operators in this layer do not have CUDA implementation yet.
        # In addition, since the sparse feature keys that we are hashing are
        # typically on CPU originally, it makes sense to have this layer on CPU.
        self.tags.update([Tags.CPU_ONLY])
Example #18
    def test_forward_optim_tree_daggy(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4

        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)

            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

            # Branch
            fc3b = brew.fc(m,
                           fc2,
                           "fc3b",
                           dim_in=output_dim,
                           dim_out=output_dim)
            fc4b = brew.fc(m,
                           fc3b,
                           "fc4b",
                           dim_in=output_dim,
                           dim_out=output_dim)
            fc5b = brew.fc(m,
                           fc4b,
                           "fc5b",
                           dim_in=output_dim,
                           dim_out=output_dim)

            fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")

            fc5.Relu([], fc5sum) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data"], "name_x")
        self.assertTrue(
            memonger.verify_graph_equality(m.net.Proto(), optim_proto))
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Test networks produce exactly same results
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
Example #19
    def run_model(self, V, gpu_devices, cpu_indices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather([self.vecs, 'indices'],
                                                    'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs,
                    "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [param_grad.values, param_momentum, param],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill([],
                                                             "vecs",
                                                             shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([
                            param, self.ONE_CPU, param_grad.indices,
                            param_grad.values, self.LR
                        ], self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16)
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(self.vecs, orig_vecs)
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                if not cpu_indices:
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs),
                                     orig_vecs))
        return [
            workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")
        ]
Example #20
    def test_cpu2gpu_gpu2cpu_gradients(self):
        model = model_helper.ModelHelper(name="copy_test")

        batch = 32
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)

        with core.NameScope("cpu"):
            with core.DeviceScope(cpu_opt):
                x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8)

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu")
                pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4)
                pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu")

        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                (softmax, loss) = model.SoftmaxWithLoss(
                    [pred_cpu, "label"],
                    ["softmax", "loss"],
                )

        gradient_map = model.AddGradientOperators([loss])

        # Add param updates (for cpu and gpu)
        init_net = model.param_init_net
        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.DeviceScope(cpu_opt):
            workspace.FeedBlob(
                'cpu/data',
                np.random.rand(batch, 16).astype(np.float32),
            )
            workspace.FeedBlob(
                'cpu/label',
                np.random.randint(4, size=batch).astype(np.int32),
            )

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)

        initial_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}
        workspace.RunNet(model.net.Proto().name)
        updated_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}

        for p in model.GetParams():
            g = gradient_map[p]
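            # WeightedSum computes param * 1 + grad * LR; with LR = -2.0 above,
            # the expected post-update value is param - 2.0 * grad.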
            expected = initial_params[p] - 2.0 * workspace.FetchBlob(g)
            actual = updated_params[p]
            self.assertTrue(
                np.array_equal(expected, updated_params[p]),
                "Mismatch: {}: {}, {}".format(p, expected, actual),
            )
Example #21
    def run(self, index, inputFeatures, accumulate=True, image_path=None):
        """
        index - index of the dataset entry
        inputFeatures - features input to the head
        accumulate - whether to save the predictions in the self.all_... members
        image_path - path to the annotated image, to which the predictions correspond
        """
        timers = self.timers
        # Format the inputs to the mask rcnn head
        features = {}
        for k, v in inputFeatures.iteritems():
            assert v.dim() == 3, 'Batch mode not allowed'
            features[k] = np.expand_dims(v.data.cpu().numpy(), axis=0)

        gpu_dev = caffe2_core.DeviceOption(caffe2_pb2.CUDA, self.gpu_id)
        name_scope = 'gpu_{}'.format(self.gpu_id)

        # Clean the workspace to make damn sure that nothing comes from the
        # possible forwarding of target features, depending on the use of this
        # module
        parameters = [str(s) for s in self.model.params] + [
            str(s) + '_momentum' for s in self.model.TrainableParams()
        ]
        for b in workspace.Blobs():
            if b not in parameters:
                workspace.FeedBlob(b, np.array([]))

        # Produce the top level of the pyramid of features
        with caffe2_core.NameScope(name_scope):
            with caffe2_core.DeviceScope(gpu_dev):
                workspace.FeedBlob(
                    caffe2_core.ScopedName("predicted_fpn_res5_2_sum"),
                    features['fpn_res5_2_sum'])
                workspace.RunOperatorOnce(self.subsampler)
                features[
                    u'fpn_res5_2_sum_subsampled_2x'] = workspace.FetchBlob(
                        caffe2_core.ScopedName(
                            "predicted_fpn_res5_2_sum_subsampled_2x"))

        # Forward the rest of the features in the head of the model
        im_info = np.array([[1024., 2048., 1.]], dtype=np.float32)
        im_scales = np.array([1.])
        im_shape = (1024, 2048, 3)
        with caffe2_core.NameScope(name_scope):
            with caffe2_core.DeviceScope(gpu_dev):
                cls_boxes_i, cls_segms_i = im_detect_all_given_features(
                    self.model, self.subsampler, features, im_info, im_scales,
                    im_shape, timers)

        # If required, store the results in the class's members
        if accumulate:
            extend_results(index, self.all_boxes_ann_frame, cls_boxes_i)
        if cls_segms_i is not None and accumulate:
            extend_results(index, self.all_segms_ann_frame, cls_segms_i)

        if image_path is not None and accumulate:
            self.id_sequences.append(image_path)

        if index % 10 == 0:
            ave_total_time = np.sum([t.average_time for t in timers.values()])
            det_time = (timers['im_detect_bbox'].average_time +
                        timers['im_detect_mask'].average_time)
            misc_time = (timers['misc_bbox'].average_time +
                         timers['misc_mask'].average_time)
            print(('im_detect: '
                   '{:d}/{:d} {:.3f}s + {:.3f}s => avg total time: {:.3f}s'
                   ).format(index, self.num_images, det_time, misc_time,
                            ave_total_time))

        return cls_boxes_i, cls_segms_i
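
Each feature tensor fed to the head above is 3-D and gets a leading batch dimension via np.expand_dims before being passed to Caffe2. A NumPy-only illustration of that step (the shape is made up):

import numpy as np

feat = np.zeros((256, 32, 64), dtype=np.float32)  # (C, H, W) feature map
batched = np.expand_dims(feat, axis=0)            # (1, C, H, W) single-image batch
assert batched.shape == (1, 256, 32, 64)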
Example #22
def _LSTM(
    cell_class,
    model,
    input_blob,
    seq_lengths,
    initial_states,
    dim_in,
    dim_out,
    scope,
    outputs_with_grads=(0, ),
    return_params=False,
    memory_optimization=False,
    forget_bias=0.0,
    forward_only=False,
    drop_states=False,
    return_last_layer_only=True,
):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    cell_class: LSTMCell or compatible subclass

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
            where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
            LSTMUnit operator

    initial_states: a list of (2 * num_layers) blobs representing the initial
            hidden and cell states of each layer. If this argument is None,
            these states will be added to the model as network parameters.

    dim_in: input dimension

    dim_out: number of units per LSTM layer
            (use int for single-layer LSTM, list of ints for multi-layer)

    outputs_with_grads : position indices of output blobs for LAST LAYER which
            will receive external error gradient during backpropagation.
            These outputs are: (h_all, h_last, c_all, c_last)

    return_params: if True, will return a dictionary of parameters of the LSTM

    memory_optimization: if enabled, the LSTM step is recomputed on backward
            step so that we don't need to store forward activations for each
            timestep. Saves memory with cost of computation.

    forget_bias: forget gate bias (default 0.0)

    forward_only: whether to create a backward pass

    drop_states: drop invalid states, passed through to LSTMUnit operator

    return_last_layer_only: only return outputs from final layer
            (so that the length of results does not depend on number of layers)
    '''
    if type(dim_out) is not list and type(dim_out) is not tuple:
        dim_out = [dim_out]
    num_layers = len(dim_out)

    cells = []
    for i in range(num_layers):
        name = '{}/layer_{}'.format(scope, i) if num_layers > 1 else scope
        cell = cell_class(
            input_size=(dim_in if i == 0 else dim_out[i - 1]),
            hidden_size=dim_out[i],
            forget_bias=forget_bias,
            memory_optimization=memory_optimization,
            name=name,
            forward_only=forward_only,
            drop_states=drop_states,
        )
        cells.append(cell)

    if num_layers > 1:
        multicell = MultiRNNCell(
            cells,
            name=scope,
            forward_only=forward_only,
        )
    else:
        multicell = cells[0]

    if initial_states is None:
        initial_states = []
        for i in range(num_layers):
            with core.NameScope(scope):
                suffix = '_{}'.format(i) if num_layers > 1 else ''
                initial_hidden = model.param_init_net.ConstantFill(
                    [],
                    'initial_hidden_state' + suffix,
                    shape=[dim_out[i]],
                    value=0.0,
                )
                initial_cell = model.param_init_net.ConstantFill(
                    [],
                    'initial_cell_state' + suffix,
                    shape=[dim_out[i]],
                    value=0.0,
                )
                initial_states.extend([initial_hidden, initial_cell])
                model.params.extend([initial_hidden, initial_cell])

    assert len(initial_states) == 2 * num_layers, \
            "Incorrect initial_states, was expecting 2 * num_layers elements" \
            + " but had only {}".format(len(initial_states))

    # outputs_with_grads argument indexes into final layer
    outputs_with_grads = [4 * (num_layers - 1) + i for i in outputs_with_grads]

    _, result = multicell.apply_over_sequence(
        model=model,
        inputs=input_blob,
        seq_lengths=seq_lengths,
        initial_states=initial_states,
        outputs_with_grads=outputs_with_grads,
    )

    if return_last_layer_only:
        result = result[4 * (num_layers - 1):]
    if return_params:
        result = list(result) + [{
            'input': cell.get_input_params(),
            'recurrent': cell.get_recurrent_params(),
        }]
    return tuple(result)
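
A rough usage sketch of _LSTM based on the docstring above. It assumes LSTMCell is the cell class from the earlier rnn_cell example, that the input blob holds a T x N x 8 sequence, and that the last-layer outputs come back as (h_all, h_last, c_all, c_last) as documented; none of these specifics are guaranteed by this excerpt:

from caffe2.python import model_helper

model = model_helper.ModelHelper(name="lstm_sketch")
input_blob = model.net.AddExternalInput("input_sequence")  # T x N x 8
seq_lengths = model.net.AddExternalInput("seq_lengths")    # N

result = _LSTM(
    LSTMCell,                # cell_class: LSTMCell or a compatible subclass
    model,
    input_blob,
    seq_lengths,
    initial_states=None,     # initial hidden/cell states become model parameters
    dim_in=8,
    dim_out=[16, 16],        # two stacked layers of 16 units each
    scope="sketch_lstm",
)
h_all, h_last, c_all, c_last = result  # outputs of the final layer only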
Example #23
    def test_simple_model(self):
        model = model_helper.ModelHelper(name="mnist")
        # how come those inputs don't break the forward pass =.=a
        workspace.FeedBlob("data",
                           np.random.randn(1, 3, 64, 64).astype(np.float32))
        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))

        with core.NameScope("conv1"):
            conv1 = brew.conv(model,
                              "data",
                              'conv1',
                              dim_in=1,
                              dim_out=20,
                              kernel=5)
            # Image size: 24 x 24 -> 12 x 12
            pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
            # Image size: 12 x 12 -> 8 x 8
            conv2 = brew.conv(model,
                              pool1,
                              'conv2',
                              dim_in=20,
                              dim_out=100,
                              kernel=5)
            # Image size: 8 x 8 -> 4 x 4
            pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
        with core.NameScope("classifier"):
            # 100 * 4 * 4 stands for dim_out from previous layer multiplied by the image size
            fc3 = brew.fc(model, pool2, 'fc3', dim_in=100 * 4 * 4, dim_out=500)
            relu = brew.relu(model, fc3, fc3)
            pred = brew.fc(model, relu, 'pred', 500, 10)
            softmax = brew.softmax(model, pred, 'softmax')
            xent = model.LabelCrossEntropy([softmax, "label"], 'xent')
            # compute the expected loss
            loss = model.AveragedLoss(xent, "loss")
        model.net.RunAllOnMKL()
        model.param_init_net.RunAllOnMKL()
        model.AddGradientOperators([loss], skip=1)
        blob_name_tracker = {}
        graph = tb.model_to_graph_def(
            model,
            blob_name_tracker=blob_name_tracker,
            shapes={},
            show_simplified=False,
        )
        #self.assertEqual(
        #    blob_name_tracker['GRADIENTS/conv1/conv1_b_grad'],
        #    'conv1/conv1_b_grad',
        #)
        self.maxDiff = None
        # We can't guarantee the order in which they appear, so we sort
        # both before we compare them
        with open('tests/expect/caffe_mnist.expect') as f:
            EXPECTED_MNIST = f.read()
        sep = "node {"
        expected = "\n".join(
            sorted(sep + "\n  " + part.strip()
                   for part in EXPECTED_MNIST.strip().split(sep)
                   if part.strip()))
        actual = "\n".join(
            sorted(sep + "\n  " + part.strip()
                   for part in str(graph).strip().split(sep) if part.strip()))
Example #24
def cudnn_LSTM(model,
               input_blob,
               initial_states,
               dim_in,
               dim_out,
               scope,
               recurrent_params=None,
               input_params=None,
               num_layers=1,
               return_params=False):
    '''
    CuDNN version of LSTM for GPUs.
    input_blob          Blob containing the input. Will need to be available
                        when param_init_net is run, because the sequence lengths
                        and batch sizes will be inferred from the size of this
                        blob.
    initial_states      tuple of (hidden_init, cell_init) blobs
    dim_in              input dimensions
    dim_out             output/hidden dimension
    scope               namescope to apply
    recurrent_params    dict of blobs containing values for recurrent
                        gate weights, biases (if None, use random init values)
                        See GetLSTMParamNames() for format.
    input_params        dict of blobs containing values for input
                        gate weights, biases (if None, use random init values)
                        See GetLSTMParamNames() for format.
    num_layers          number of LSTM layers
    return_params       if True, returns (param_extract_net, param_mapping)
                        where param_extract_net is a net that when run, will
                        populate the blobs specified in param_mapping with the
                        current gate weights and biases (input/recurrent).
                        Useful for assigning the values back to non-cuDNN
                        LSTM.
    '''
    with core.NameScope(scope):
        weight_params = GetLSTMParamNames()['weights']
        bias_params = GetLSTMParamNames()['biases']

        input_weight_size = dim_out * dim_in
        upper_layer_input_weight_size = dim_out * dim_out
        recurrent_weight_size = dim_out * dim_out
        input_bias_size = dim_out
        recurrent_bias_size = dim_out

        def init(layer, pname, input_type):
            input_weight_size_for_layer = input_weight_size if layer == 0 else \
                upper_layer_input_weight_size
            if pname in weight_params:
                sz = input_weight_size_for_layer if input_type == 'input' \
                    else recurrent_weight_size
            elif pname in bias_params:
                sz = input_bias_size if input_type == 'input' \
                    else recurrent_bias_size
            else:
                assert False, "unknown parameter type {}".format(pname)
            return model.param_init_net.UniformFill(
                [],
                "lstm_init_{}_{}_{}".format(input_type, pname, layer),
                shape=[sz])

        # Multiply by 4 since we have 4 gates per LSTM unit
        first_layer_sz = input_weight_size + recurrent_weight_size + \
                         input_bias_size + recurrent_bias_size
        upper_layer_sz = upper_layer_input_weight_size + \
                         recurrent_weight_size + input_bias_size + \
                         recurrent_bias_size
        total_sz = 4 * (first_layer_sz + (num_layers - 1) * upper_layer_sz)

        weights = model.param_init_net.UniformFill([],
                                                   "lstm_weight",
                                                   shape=[total_sz])

        model.params.append(weights)
        model.weights.append(weights)

        lstm_args = {
            'hidden_size': dim_out,
            'rnn_mode': 'lstm',
            'bidirectional': 0,  # TODO
            'dropout': 1.0,  # TODO
            'input_mode': 'linear',  # TODO
            'num_layers': num_layers,
            'engine': 'CUDNN'
        }

        param_extract_net = core.Net("lstm_param_extractor")
        param_extract_net.AddExternalInputs([input_blob, weights])
        param_extract_mapping = {}

        # Populate the weights-blob from blobs containing parameters for
        # the individual components of the LSTM, such as forget/input gate
        # weights and biases. Also, create a special param_extract_net that
        # can be used to grab those individual params from the black-box
        # weights blob. These results can be then fed to InitFromLSTMParams()
        for input_type in ['input', 'recurrent']:
            param_extract_mapping[input_type] = {}
            p = recurrent_params if input_type == 'recurrent' else input_params
            if p is None:
                p = {}
            for pname in weight_params + bias_params:
                for j in range(0, num_layers):
                    values = p[pname] if pname in p else init(
                        j, pname, input_type)
                    model.param_init_net.RecurrentParamSet(
                        [input_blob, weights, values],
                        weights,
                        layer=j,
                        input_type=input_type,
                        param_type=pname,
                        **lstm_args)
                    if pname not in param_extract_mapping[input_type]:
                        param_extract_mapping[input_type][pname] = {}
                    b = param_extract_net.RecurrentParamGet(
                        [input_blob, weights],
                        ["lstm_{}_{}_{}".format(input_type, pname, j)],
                        layer=j,
                        input_type=input_type,
                        param_type=pname,
                        **lstm_args)
                    param_extract_mapping[input_type][pname][j] = b

        (hidden_input_blob, cell_input_blob) = initial_states
        output, hidden_output, cell_output, rnn_scratch, dropout_states = \
            model.net.Recurrent(
                [input_blob, hidden_input_blob, cell_input_blob, weights],
                ["lstm_output", "lstm_hidden_output", "lstm_cell_output",
                 "lstm_rnn_scratch", "lstm_dropout_states"],
                seed=random.randint(0, 100000),  # TODO: dropout seed
                **lstm_args
            )
        model.net.AddExternalOutputs(hidden_output, cell_output, rnn_scratch,
                                     dropout_states)

    if return_params:
        param_extract = param_extract_net, param_extract_mapping
        return output, hidden_output, cell_output, param_extract
    else:
        return output, hidden_output, cell_output
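
The cuDNN wrapper above is easiest to read from a call site. Below is a minimal usage sketch; the import path (caffe2.python.rnn_cell, where this function lives in the Caffe2 tree), the blob names, and the dimensions are assumptions made for this illustration, and actually running the resulting nets requires a CUDA/cuDNN build of Caffe2.

# Hedged usage sketch for cudnn_LSTM; names and sizes are demo assumptions.
from caffe2.proto import caffe2_pb2
from caffe2.python import core, model_helper, rnn_cell

model = model_helper.ModelHelper(name="cudnn_lstm_demo")
# (T, batch, dim_in) input; it must be fed to the workspace before
# param_init_net runs, because cuDNN infers sizes from it.
seq_input = model.net.AddExternalInput("seq_input")
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
    # Initial hidden/cell states, shaped (num_layers, batch_size, dim_out).
    hidden_init = model.param_init_net.ConstantFill(
        [], "hidden_init", shape=[1, 4, 32], value=0.0)
    cell_init = model.param_init_net.ConstantFill(
        [], "cell_init", shape=[1, 4, 32], value=0.0)
    output, hidden_out, cell_out = rnn_cell.cudnn_LSTM(
        model,
        seq_input,
        (hidden_init, cell_init),
        dim_in=16,
        dim_out=32,
        scope="demo_lstm",
    )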
Example #25
    def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")
        input_to_grad = m.AddGradientOperators(
            ["name_x/loss1", "name_x/loss2"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss1", "name_x/loss2"],
            set(viewvalues(m.param_to_grad)),
            "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
            share_activations=True,
            dont_share_blobs=set([
                'name_x/fc6', 'name_x/fc5',
                str(input_to_grad["name_x/fc1_w"])
            ]),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)
        self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
        np.testing.assert_almost_equal(grad, optimized_grad)
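
Both this test and Example #29 rely on count_blobs and has_blob helpers that are not part of the snippet. A minimal sketch of plausible implementations, assuming both operate on a NetDef proto and treat every operator input and output as a blob:

# Hypothetical helpers assumed by the memonger tests; not from the original code.
def count_blobs(net_proto):
    # Number of distinct blob names touched by any operator in the net.
    blobs = set()
    for op in net_proto.op:
        blobs.update(op.input)
        blobs.update(op.output)
    return len(blobs)


def has_blob(net_proto, blob_name):
    # True if any operator in the net reads or writes the given blob.
    return any(
        blob_name in op.input or blob_name in op.output
        for op in net_proto.op
    )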
Example #26
    def _build_decoder(
        self,
        model,
        step_model,
        model_params,
        scope,
        previous_tokens,
        timestep,
        fake_seq_lengths,
    ):
        attention_type = model_params['attention']
        assert attention_type in ['none', 'regular']
        use_attention = (attention_type != 'none')

        with core.NameScope(scope):
            encoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.source_vocab_size,
                embedding_size=model_params['encoder_embedding_size'],
                name='encoder_embeddings',
                freeze_embeddings=False,
            )

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_states,
            final_encoder_cell_states,
            encoder_units_per_layer,
        ) = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=model_params['encoder_type'],
            num_decoder_layers=len(model_params['decoder_layer_configs']),
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=encoder_embeddings,
            embedding_size=model_params['encoder_embedding_size'],
            use_attention=use_attention,
            num_gpus=0,
            forward_only=True,
            scope=scope,
        )
        with core.NameScope(scope):
            if use_attention:
                # [max_source_length, beam_size, encoder_output_dim]
                encoder_outputs = model.net.Tile(
                    encoder_outputs,
                    'encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            if weighted_encoder_outputs is not None:
                weighted_encoder_outputs = model.net.Tile(
                    weighted_encoder_outputs,
                    'weighted_encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            decoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.target_vocab_size,
                embedding_size=model_params['decoder_embedding_size'],
                name='decoder_embeddings',
                freeze_embeddings=False,
            )
            embedded_tokens_t_prev = step_model.net.Gather(
                [decoder_embeddings, previous_tokens],
                'embedded_tokens_t_prev',
            )

        decoder_cells = []
        decoder_units_per_layer = []
        for i, layer_config in enumerate(model_params['decoder_layer_configs']):
            num_units = layer_config['num_units']
            decoder_units_per_layer.append(num_units)
            if i == 0:
                input_size = model_params['decoder_embedding_size']
            else:
                input_size = (
                    model_params['decoder_layer_configs'][i - 1]['num_units']
                )

            cell = rnn_cell.LSTMCell(
                forward_only=True,
                input_size=input_size,
                hidden_size=num_units,
                forget_bias=0.0,
                memory_optimization=False,
            )
            decoder_cells.append(cell)

        with core.NameScope(scope):
            if final_encoder_hidden_states is not None:
                for i in range(len(final_encoder_hidden_states)):
                    if final_encoder_hidden_states[i] is not None:
                        final_encoder_hidden_states[i] = model.net.Tile(
                            final_encoder_hidden_states[i],
                            'final_encoder_hidden_tiled_{}'.format(i),
                            tiles=self.beam_size,
                            axis=1,
                        )
            if final_encoder_cell_states is not None:
                for i in range(len(final_encoder_cell_states)):
                    if final_encoder_cell_states[i] is not None:
                        final_encoder_cell_states[i] = model.net.Tile(
                            final_encoder_cell_states[i],
                            'final_encoder_cell_tiled_{}'.format(i),
                            tiles=self.beam_size,
                            axis=1,
                        )
            initial_states = \
                seq2seq_util.build_initial_rnn_decoder_states(
                    model=model,
                    encoder_units_per_layer=encoder_units_per_layer,
                    decoder_units_per_layer=decoder_units_per_layer,
                    final_encoder_hidden_states=final_encoder_hidden_states,
                    final_encoder_cell_states=final_encoder_cell_states,
                    use_attention=use_attention,
                )

        attention_decoder = seq2seq_util.LSTMWithAttentionDecoder(
            encoder_outputs=encoder_outputs,
            encoder_output_dim=encoder_units_per_layer[-1],
            encoder_lengths=None,
            vocab_size=self.target_vocab_size,
            attention_type=attention_type,
            embedding_size=model_params['decoder_embedding_size'],
            decoder_num_units=decoder_units_per_layer[-1],
            decoder_cells=decoder_cells,
            weighted_encoder_outputs=weighted_encoder_outputs,
            name=scope,
        )
        states_prev = step_model.net.AddExternalInputs(*[
            '{}/{}_prev'.format(scope, s)
            for s in attention_decoder.get_state_names()
        ])
        decoder_outputs, states = attention_decoder.apply(
            model=step_model,
            input_t=embedded_tokens_t_prev,
            seq_lengths=fake_seq_lengths,
            states=states_prev,
            timestep=timestep,
        )

        state_configs = [
            BeamSearchForwardOnly.StateConfig(
                initial_value=initial_state,
                state_prev_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state_prev,
                    offset=0,
                    window=1,
                ),
                state_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state,
                    offset=1,
                    window=1,
                ),
            )
            for initial_state, state_prev, state in zip(
                initial_states,
                states_prev,
                states,
            )
        ]

        with core.NameScope(scope):
            decoder_outputs_flattened, _ = step_model.net.Reshape(
                [decoder_outputs],
                [
                    'decoder_outputs_flattened',
                    'decoder_outputs_and_contexts_combination_old_shape',
                ],
                shape=[-1, attention_decoder.get_output_dim()],
            )
            output_logits = seq2seq_util.output_projection(
                model=step_model,
                decoder_outputs=decoder_outputs_flattened,
                decoder_output_size=attention_decoder.get_output_dim(),
                target_vocab_size=self.target_vocab_size,
                decoder_softmax_size=model_params['decoder_softmax_size'],
            )
            # [1, beam_size, target_vocab_size]
            output_probs = step_model.net.Softmax(
                output_logits,
                'output_probs',
            )
            output_log_probs = step_model.net.Log(
                output_probs,
                'output_log_probs',
            )
            if use_attention:
                attention_weights = attention_decoder.get_attention_weights()
            else:
                attention_weights = step_model.net.ConstantFill(
                    [self.encoder_inputs],
                    'zero_attention_weights_tmp_1',
                    value=0.0,
                )
                attention_weights = step_model.net.Transpose(
                    attention_weights,
                    'zero_attention_weights_tmp_2',
                )
                attention_weights = step_model.net.Tile(
                    attention_weights,
                    'zero_attention_weights_tmp',
                    tiles=self.beam_size,
                    axis=0,
                )

        return (
            state_configs,
            output_log_probs,
            attention_weights,
        )
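
_build_decoder reads several keys from model_params. The sketch below lists those keys with placeholder values to make the expected shape explicit; the concrete values are illustrative assumptions, not defaults from the original model.

# Keys consumed by _build_decoder above; values are placeholders for illustration.
model_params = {
    'attention': 'regular',          # or 'none'
    'encoder_type': {},              # encoder config, forwarded to build_embedding_encoder
    'encoder_embedding_size': 256,
    'decoder_embedding_size': 256,
    'decoder_softmax_size': None,    # optional bottleneck before the output projection
    'decoder_layer_configs': [       # one entry per decoder LSTM layer
        {'num_units': 512},
        {'num_units': 512},
    ],
}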
Example #27
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelper, such as CNNModelHelper
      input_builder_fun:
                        Function that adds the input operators.
                        Note: instantiate the reader outside of this function
                        so that all GPUs share the same reader object.
                        Signature: input_builder_fun(model)
      forward_pass_builder_fun:
                        Function that adds the operators to the model.
                        Must return a list of loss-blob references that
                        are used to build the gradients. A loss_scale
                        parameter is passed in; scale your model's loss by
                        1.0 / the total number of GPUs.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds operators that run after the
                        gradients are computed, such as applying the weight
                        updates and weight decay.
                        Signature: param_update_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3]
      rendezvous:       used for rendezvous in distributed computation; if None,
                        only one node is used. To create a rendezvous,
                        use <TBD>.
      net_type:         Network type
      optimize_gradient_memory: whether to apply 'memonger' to share blobs
                        in gradient computation to reduce memory footprint

    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params, [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
        non_datapar_grads
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)

    if len(model_helper_obj._grad_names) > 0:
        _AllReduceGradients(devices, model_helper_obj, rendezvous, use_nccl)
    else:
        log.info("NOTE: Param builder function did not create any parameters.")


    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # The following check is necessary for ring reduce to work
    if rendezvous is not None:
        assert num_shards > 1, \
            "Please use more than one shard for distributed training"
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)

    if optimize_gradient_memory:
        _OptimizeGradientMemoryDEPRECATED(model_helper_obj, losses_by_gpu, devices)
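
For contrast with the inference-only call in Example #30 (param_update_builder_fun=None), here is a hedged sketch of a training-style call to this Parallelize_GPU. The three builder functions, the blob names, and the plain WeightedSum SGD update are illustrative assumptions, not code from the original document.

# Hedged training-side sketch; model, blob names, and the SGD update are assumptions.
from caffe2.python import brew, model_helper

def add_inputs(model):
    # Assume "data" and "label" are provided externally (e.g. by a shared reader).
    pass

def add_forward(model, loss_scale):
    fc = brew.fc(model, "data", "fc", dim_in=16, dim_out=10)
    softmax, loss = model.net.SoftmaxWithLoss([fc, "label"], ["softmax", "loss"])
    # Scale the loss by 1.0 / total number of GPUs, as the docstring requires.
    scaled_loss = model.net.Scale(loss, "scaled_loss", scale=loss_scale)
    return [scaled_loss]

def add_param_updates(model):
    # Plain SGD: param += (-lr) * grad, via WeightedSum.
    one = model.param_init_net.ConstantFill([], "one", shape=[1], value=1.0)
    neg_lr = model.param_init_net.ConstantFill([], "neg_lr", shape=[1], value=-0.01)
    for param in model.GetParams():
        grad = model.param_to_grad[param]
        model.net.WeightedSum([param, one, grad, neg_lr], param)

train_model = model_helper.ModelHelper(name="dpm_train_demo")
Parallelize_GPU(
    train_model,
    input_builder_fun=add_inputs,
    forward_pass_builder_fun=add_forward,
    param_update_builder_fun=add_param_updates,
    devices=[0, 1],
)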
Example #28
def proc1(rec):
    with core.NameScope('proc1'):
        out = NewRecord(ops, rec)
    ops.Add([rec.uid(), rec.uid()], [out.uid()])
    out.value.set(blob=rec.value(), unsafe=True)
    return out
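
proc1 depends on core.NameScope to prefix the blobs it creates. A small, hedged illustration of that prefixing behaviour (the ModelHelper, blob names, and dimensions are made up for the demo):

# Demo of the blob-name prefixing that core.NameScope provides; names are assumptions.
from caffe2.python import brew, core, model_helper

m = model_helper.ModelHelper(name="namescope_demo")
with core.NameScope("proc1"):
    fc = brew.fc(m, "data", "fc", dim_in=4, dim_out=2)
print(fc)  # blobs created inside the scope carry the prefix, e.g. "proc1/fc"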
Example #29
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5)\
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(viewvalues(m.param_to_grad)),
            "name_x/",
            share_activations=False,
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        optim_proto_wacts = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(viewvalues(m.param_to_grad)),
            "name_x/",
            share_activations=True,
            dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]),
        )
        blobs_wact_optim = count_blobs(optim_proto_wacts)
        self.assertLessEqual(blobs_wact_optim, blobs_after)

        # Check that the last activations are not shared
        self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
        self.assertTrue(
            has_blob(optim_proto_wacts, "name_x/fc5"),
            "Dont remap final activation",
        )

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("name_x/loss")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        # Run with the forward optimization
        workspace.RunNetOnce(optim_proto_wacts)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)
Example #30
def Test(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        total_batch_size = args.batch_size * num_gpus
        log.info("Running on GPUs: {}".format(gpus))
        log.info("total_batch_size: {}".format(total_batch_size))
    else:
        total_batch_size = args.batch_size
        log.info("Running on CPU")
        log.info("total_batch_size: {}".format(total_batch_size))

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=1,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=(args.input_type == 0 or args.input_type >= 3),
        get_optical_flow=(args.input_type == 1 or args.input_type >= 4),
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="test_reader",
        input_data=args.test_data,
    )

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size * args.clip_per_video *
            args.crop_per_clip,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of
                         if args.input_type == 1 else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            conv1_temporal_stride=args.conv1_temporal_stride,
            conv1_temporal_kernel=args.conv1_temporal_kernel,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = reader_utils.create_data_reader(
        test_model, **reader_args)

    if args.num_iter <= 0:
        num_iter = int(math.ceil(number_of_examples / total_batch_size))
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        model_helper.AddVideoInput(test_model, test_reader, **video_input_args)

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        test_model._device_type = caffe2_pb2.CPU
        test_model._devices = [0]
        device_opt = core.DeviceOption(test_model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                test_input_fn(test_model)
                create_model_ops(test_model, 1.0)

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(test_model,
                                                 args.load_model_path,
                                                 use_gpu=True,
                                                 root_gpu_id=gpus[0])
        else:
            model_loader.LoadModelFromPickleFile(test_model,
                                                 args.load_model_path,
                                                 use_gpu=False)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(test_model)

    # metric counters for multilabel
    all_prob_for_map = np.empty(shape=[0, args.num_labels], dtype=float)
    all_label_for_map = np.empty(shape=[0, args.num_labels], dtype=np.int32)

    # metric counters for closed-world classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0
    crop_per_video = args.clip_per_video * args.crop_per_clip
    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)

        num_devices = 1  # default for cpu
        if num_gpus > 0:
            num_devices = num_gpus

        for g in range(num_devices):
            # get labels
            label = workspace.FetchBlob("gpu_{}".format(g) + '/label')
            # get predictions
            if args.multi_label:
                predicts = workspace.FetchBlob("gpu_{}".format(g) + '/prob')
            else:
                predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * crop_per_video

            for j in range(args.batch_size):
                # get label for one video
                if args.multi_label:
                    sample_label = label[j * crop_per_video, :]
                else:
                    sample_label = label[j * crop_per_video]
                    # get clip accuracy
                    for k in range(crop_per_video):
                        sorted_preds = \
                            np.argsort(predicts[j * crop_per_video + k, :])
                        sorted_preds[:] = sorted_preds[::-1]
                        if sorted_preds[0] == label[j * crop_per_video + k]:
                            clip_acc = clip_acc + 1
                # get all clip predictions for one video
                all_clips = \
                    predicts[
                        j * crop_per_video:(j + 1) * crop_per_video, :
                    ]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)

                if args.multi_label:
                    video_pred = np.expand_dims(video_pred, axis=0)
                    sample_label = np.expand_dims(sample_label, axis=0)
                    all_prob_for_map = np.concatenate(
                        (all_prob_for_map, video_pred), axis=0)
                    all_label_for_map = np.concatenate(
                        (all_label_for_map, sample_label), axis=0)
                else:
                    sorted_video_pred = np.argsort(video_pred)
                    sorted_video_pred[:] = sorted_video_pred[::-1]
                    if sorted_video_pred[0] == sample_label:
                        video_top1 = video_top1 + 1
                    if sample_label in sorted_video_pred[0:args.top_k]:
                        video_topk = video_topk + 1

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            if args.multi_label:
                # mAP
                auc, ap, wap, aps = metric.mean_ap_metric(
                    all_prob_for_map, all_label_for_map)
                log.info(
                    'Iter {}/{}: mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}'.
                    format(i, num_iter, auc, ap, wap, np.mean(aps)))
            else:
                # accuracy
                log.info('Iter {}/{}: clip: {}, top1: {}, top{}: {}'.format(
                    i, num_iter, clip_acc / clip_count,
                    video_top1 / video_count, args.top_k,
                    video_topk / video_count))

    if args.multi_label:
        # mAP
        auc, ap, wap, aps = metric.mean_ap_metric(all_prob_for_map,
                                                  all_label_for_map)
        log.info("Test mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}".format(
            auc, ap, wap, np.mean(aps)))
        if args.print_per_class_metrics:
            log.info("Test mAP per class: {}".format(aps))
    else:
        # accuracy
        log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
            clip_acc / clip_count, video_top1 / video_count, args.top_k,
            video_topk / video_count))

    if num_gpus > 0:
        flops, params, inters = model_helper.GetFlopsAndParams(
            test_model, gpus[0])
    else:
        flops, params, inters = model_helper.GetFlopsAndParams(test_model)
    log.info('FLOPs: {}, params: {}, inters: {}'.format(flops, params, inters))
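
Test() aggregates the per-clip predictions for each video with PredictionAggregation, which is not defined in this snippet. A minimal stand-in sketch, assuming args.aggregation selects mean, max, or median pooling over the clip axis (the encoding of the option values is a guess, not taken from the original code):

# Hypothetical stand-in for PredictionAggregation; the aggregation codes are assumptions.
import numpy as np

def PredictionAggregation(clip_preds, aggregation):
    # clip_preds: (clips_per_video, num_labels) scores for a single video.
    if aggregation == 0:
        return np.mean(clip_preds, axis=0)
    elif aggregation == 1:
        return np.max(clip_preds, axis=0)
    elif aggregation == 2:
        return np.median(clip_preds, axis=0)
    raise ValueError("unknown aggregation mode: {}".format(aggregation))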