Code Example #1
 def FeedBlobWrapper(self, tag, val):
     if self.gpu_en:
         _d = core.DeviceOption(caffe2_pb2.CUDA, 0)
         workspace.FeedBlob(tag, val, device_option=_d)
     else:
         workspace.FeedBlob(tag, val)
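
The wrapper above only changes which device_option is handed to workspace.FeedBlob. A minimal standalone sketch of the same pattern, assuming the usual caffe2.python imports and falling back to CPU when no CUDA device is available:

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

arr = np.ones((2, 3), dtype=np.float32)
if workspace.NumCudaDevices() > 0:
    # Place the blob on GPU 0 via an explicit device option.
    gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
    workspace.FeedBlob("x", arr, device_option=gpu_opt)
else:
    workspace.FeedBlob("x", arr)  # the default device option is CPU
print(workspace.FetchBlob("x"))  # FetchBlob always returns a numpy array
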
Code Example #2
 def testFeedFetchBlobIDEEP(self):
     arr = np.random.randn(2, 3).astype(np.float32)
     workspace.FeedBlob(
         "testblob_ideep", arr, core.DeviceOption(caffe2_pb2.IDEEP))
     fetched = workspace.FetchBlob("testblob_ideep")
     np.testing.assert_array_equal(arr, fetched)
Code Example #3
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        # TODO(zqq): support LARS for sparse parameters
        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            lr_lars_multiplier = net.Lars(
                [param, grad],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars)
            current_scope = scope.CurrentDeviceScope()
            self.add_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                             and current_scope.device_type == caffe2_pb2.CUDA),
            )

        # We need negative sign for LR when used directly with WeightedSum
        # below.
        lr_sign = -1 if self.momentum else 1
        lr, _ = self.build_lr(net,
                              param_init_net,
                              base_learning_rate=self.base_learning_rate *
                              lr_sign,
                              policy=self.policy,
                              **(self.init_kwargs))

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        # Each GPU/CPU must have its own ONE blob, thus modify the name
        # to include device information.
        ONE = param_init_net.ConstantFill([],
                                          "ONE_{}_{}{}".format(
                                              dev.device_type, dev.cuda_gpu_id,
                                              dev.node_name),
                                          shape=[1],
                                          value=1.0)

        self._aux_params.shared.append(ONE)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(param,
                                                        str(param) +
                                                        "_momentum",
                                                        value=0.)
            self._aux_params.local.append(momentum_data)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.momentum > 0.:
                net.SparseMomentumSGDUpdate(
                    [grad.values, momentum_data, lr, param, grad.indices],
                    [grad.values, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                net.ScatterWeightedSum(
                    [param, ONE, grad.indices, grad.values, lr], param)
        else:
            if self.momentum > 0.:
                net.MomentumSGDUpdate([grad, momentum_data, lr, param],
                                      [grad, momentum_data, param],
                                      momentum=self.momentum,
                                      nesterov=self.nesterov)
            else:
                coeff = lr

                net.WeightedSum([param, ONE, grad, coeff], param)
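
In the zero-momentum dense branch, WeightedSum([param, ONE, grad, coeff], param) computes param = 1.0 * param + lr * grad in place; because build_lr (not shown here) conventionally emits a negative learning-rate blob for descent, this is plain gradient descent. A tiny numpy sketch of that arithmetic with hypothetical values, not the Caffe2 op itself:

import numpy as np

param = np.array([1.0, 2.0], dtype=np.float32)
grad = np.array([0.5, -0.5], dtype=np.float32)
lr = np.float32(-0.1)  # negative lr blob, as built for the WeightedSum path

# WeightedSum semantics: out = 1.0 * param + lr * grad, written back to param
param = 1.0 * param + lr * grad
print(param)  # [0.95 2.05]
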
Code Example #4
def CudaDevice(gpu_id):
    """Create a Cuda device."""
    return core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
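
A short usage sketch for this helper: ops created inside a core.DeviceScope inherit its device option, so wrapping net construction in the scope pins the ops to the chosen GPU. This is a minimal sketch; the net name and fill op are illustrative.

from caffe2.python import core

with core.DeviceScope(CudaDevice(1)):
    net = core.Net("gpu_net")
    # This op's device_option is inherited from the surrounding scope.
    net.ConstantFill([], "ones", shape=[4], value=1.0)
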
Code Example #5
    for i in range(num_images):
        vis_im = _generate_visualizations(
            frames[i], i, dets['all_boxes'], dets['all_keyps'],
            dets['all_tracks'])
        cv2.imwrite(
            osp.join(args.out_path, args.vid_name + '_vis',
                     '%08d.jpg' % (i + 1)),
            vis_im)

if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    args = parse_args()
    if args.out_path is None:
        args.out_path = args.video_path
    args.vid_name = args.video_path.split('/')[-1].split('.')[0]

    utils.c2.import_custom_ops()
    utils.c2.import_detectron_ops()
    utils.c2.import_contrib_ops()

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.opts is not None:
        cfg_from_list(args.opts)
    assert_and_infer_cfg()

    if osp.exists(osp.join(args.out_path, args.vid_name + '_vis')):
        shutil.rmtree(osp.join(args.out_path, args.vid_name + '_vis'))
    os.makedirs(osp.join(args.out_path, args.vid_name + '_vis'))

    num_images = _read_video(args)
    gpu_dev = core.DeviceOption(caffe2_pb2.CUDA, cfg.ROOT_GPU_ID)
    name_scope = 'gpu_{}'.format(cfg.ROOT_GPU_ID)
    main(name_scope, gpu_dev, num_images, args)
Code Example #6
class TestGradientCalculation(test_util.TestCase):
    def assertOperatorListEqual(self, operatorDefList1, operatorDefList2):
        for op in operatorDefList1:
            op.debug_info = ""
            if op.device_option:
                del op.device_option.extra_info[:]
        for op in operatorDefList2:
            op.debug_info = ""
            if op.device_option:
                del op.device_option.extra_info[:]
        self.assertEqual(operatorDefList1, operatorDefList2)

    @given(device_option=st.sampled_from(
        [None, core.DeviceOption(workspace.GpuDeviceType, 1)]))
    def testDirect(self, device_option):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        if device_option:
            for op in operators:
                op.device_option.CopyFrom(device_option)
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        if device_option:
            for op in desired_grad_operators:
                op.device_option.CopyFrom(device_option)
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testDirectImplicitGradientSource(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator("ConstantFill",
                           'out',
                           "out_autogen_grad",
                           value=1.0),
            CreateOperator('DirectGradient', 'out_autogen_grad',
                           'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        for op in desired_grad_operators:
            op.debug_info = ""
        gradients, _ = GradientRegistry.GetBackwardPass(operators, ['out'])
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testDoesNotGenerateUnnecessaryGradients(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        for op in desired_grad_operators:
            op.debug_info = ""
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'hidden': 'hidden_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testDirectButNoOutputGradientGiven(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(operators, {})
        self.assertOperatorListEqual(gradients, [])

    def testDirectInPlace(self):
        operators = [
            CreateOperator('Direct', 'in', 'in'),
            CreateOperator('Direct', 'in', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'in_grad'),
            CreateOperator('DirectGradient', 'in_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testVersionMismatch(self):
        operators = [
            CreateOperator('Direct', 'x', 'x'),
            CreateOperator('Direct', 'y', 'x'),
            CreateOperator('Direct', 'x', 'y'),
        ]
        try:
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'y': 'y_grad'})
            self.fail("Should raise an exception about the incorrect version")
        except RuntimeError as e:
            print(e)
            self.assertIn("version", str(e))

    def testUseOutput(self):
        operators = [
            CreateOperator('UseOutput', 'in', 'hidden'),
            CreateOperator('UseOutput', 'hidden', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'sink_grad', 'out_grad'),
            CreateOperator('UseOutputGradient', ['out', 'out_grad'],
                           'hidden_grad'),
            CreateOperator('UseOutputGradient', ['hidden', 'hidden_grad'],
                           'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'sink': 'sink_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testUseOutputInPlace(self):
        operators = [
            CreateOperator('UseOutput', 'in', 'in'),
            CreateOperator('UseOutput', 'in', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'sink_grad', 'out_grad'),
            CreateOperator('UseOutputGradient', ['out', 'out_grad'],
                           'in_grad'),
            CreateOperator('UseOutputGradient', ['in', 'in_grad'], 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'sink': 'sink_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testUseOutputButOutputHasBeenChanged(self):
        operators = [
            CreateOperator('UseOutput', 'in', 'hidden'),
            # Note here: we overwrite hidden, but hidden will be needed by the
            # gradient calculation of the first operator, so the gradient
            # registry should return an error.
            CreateOperator('Direct', 'hidden', 'hidden'),
            CreateOperator('UseOutput', 'hidden', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        with self.assertRaises(RuntimeError):
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'sink': 'sink_grad'})

    def testUseInput(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('UseInput', 'hidden', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'sink_grad', 'out_grad'),
            CreateOperator('UseInputGradient', ['hidden', 'out_grad'],
                           'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'sink': 'sink_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testUseInputButInputHasBeenChanged(self):
        """Test gradient for the following case:

        in -> out, with UseInput
        in -> in

        Since we overwrite in in op#1, but in will be needed by the gradient
        calculation of op#0, the gradient registry should raise an error.
        """
        operators = [
            CreateOperator('UseInput', 'in', 'out'),
            CreateOperator('Direct', 'in', 'in'),
        ]
        with self.assertRaises(RuntimeError):
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'out': 'out_grad'})

    @given(device_option=st.sampled_from(
        [None, core.DeviceOption(workspace.GpuDeviceType, 1)]))
    def testMultiUseInput(self, device_option):
        """Test gradient for the following case:

        in -> hidden1
        in -> hidden2
        hidden1, hidden2 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Direct', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'),
        ]
        if device_option:
            for op in operators:
                op.device_option.CopyFrom(device_option)
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad',
                           ['hidden1_grad', 'hidden2_grad']),
            CreateOperator('DirectGradient', 'hidden2_grad', 'in_grad'),
            CreateOperator('DirectGradient', 'hidden1_grad',
                           '_in_grad_autosplit_0'),
            CreateOperator('Sum', ['in_grad', '_in_grad_autosplit_0'],
                           'in_grad'),
        ]
        if device_option:
            for op in desired_grad_operators:
                op.device_option.CopyFrom(device_option)
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {"out": "out_grad"})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testMultiUseInputButWithNoGradient(self):
        """Test gradient for the following case:

        in -> hidden1
        in -(no gradient)-> hidden2
        hidden1, hidden2 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Nogradient', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad',
                           ['hidden1_grad', 'hidden2_grad']),
            CreateOperator('DirectGradient', 'hidden1_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testMultiUseInputAndMultipleVersions(self):
        """Test gradient for the following case:

        in -> in
        in -> hidden1, hidden2
        hidden1, hidden2 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'in'),
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Direct', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad',
                           ['hidden1_grad', 'hidden2_grad']),
            CreateOperator('DirectGradient', 'hidden2_grad', 'in_grad'),
            CreateOperator('DirectGradient', 'hidden1_grad',
                           '_in_grad_autosplit_0'),
            CreateOperator('Sum', ['in_grad', '_in_grad_autosplit_0'],
                           'in_grad'),
            CreateOperator('DirectGradient', 'in_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testMultiUseInputAndMultipleVersionsBig(self):
        """Test gradient for the following case:

        in -> in
        in -> hidden1, hidden2
        hidden1, hidden2 -> in
        in -> hidden3, hidden4, hidden5
        hidden3, hidden4, hidden5 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'in'),
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Direct', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'in'),
            CreateOperator('Direct', 'in', 'hidden3'),
            CreateOperator('Direct', 'in', 'hidden4'),
            CreateOperator('Direct', 'in', 'hidden5'),
            CreateOperator('Direct', ['hidden3', 'hidden4', 'hidden5'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad',
                           ['hidden3_grad', 'hidden4_grad', 'hidden5_grad']),
            CreateOperator('DirectGradient', 'hidden5_grad', 'in_grad'),
            CreateOperator('DirectGradient', 'hidden4_grad',
                           '_in_grad_autosplit_0'),
            CreateOperator('DirectGradient', 'hidden3_grad',
                           '_in_grad_autosplit_1'),
            CreateOperator(
                'Sum',
                ['in_grad', '_in_grad_autosplit_0', '_in_grad_autosplit_1'],
                'in_grad'),
            CreateOperator('DirectGradient', 'in_grad',
                           ['hidden1_grad', 'hidden2_grad']),
            CreateOperator('DirectGradient', 'hidden2_grad', 'in_grad'),
            CreateOperator('DirectGradient', 'hidden1_grad',
                           '_in_grad_autosplit_0'),
            CreateOperator('Sum', ['in_grad', '_in_grad_autosplit_0'],
                           'in_grad'),
            CreateOperator('DirectGradient', 'in_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        for s in gradients:
            print(str(s))
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testGradientMappingUsingSumOp(self):
        """Since Sum is used in accumulating gradients, we will test if
        it is OK to also explicitly use it in the graph."""
        operators = [
            CreateOperator('FC', ['in', 'w', 'b'], 'fc'),
            CreateOperator('Sum', 'fc', 'agg'),
            CreateOperator('AveragedLoss', 'agg', 'loss'),
        ]
        # This should run correctly.
        gradient_ops, _ = GradientRegistry.GetBackwardPass(
            operators, {'loss': 'loss_grad'})
        for s in gradient_ops:
            print(str(s))

    def testGradientCalculationWithPrint(self):
        """Test a common use case where we have Print in the forward pass."""
        operators = [
            CreateOperator('FC', ['in', 'w', 'b'], 'fc'),
            CreateOperator('Print', 'fc', []),
            CreateOperator('AveragedLoss', 'fc', 'loss'),
        ]
        desired_grad_operators = [
            CreateOperator('AveragedLossGradient', ['fc', 'loss_grad'],
                           'fc_grad'),
            CreateOperator('FCGradient', ['in', 'w', 'fc_grad'],
                           ['w_grad', 'b_grad', 'in_grad']),
        ]
        for g in desired_grad_operators:
            g.is_gradient_op = 1
        # This should run correctly.
        gradient_ops, _ = GradientRegistry.GetBackwardPass(
            operators, {'loss': 'loss_grad'})
        for s in gradient_ops:
            print(str(s))
        self.assertOperatorListEqual(gradient_ops, desired_grad_operators)

    def testStopGradient(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('StopGradient', 'hidden', 'hidden2'),
            CreateOperator('Direct', 'hidden2', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'hidden2_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testStopGradientOrphan(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('StopGradient', 'hidden', 'auto_blobx'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        with self.assertRaises(ValueError):
            # This should complain about incorrect use of StopGradient
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'out': 'out_grad'})

    def testStopGradientInplace(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('StopGradient', 'hidden', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'hidden_grad'),
        ]
        gradients, grad_map = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)
        self.assertEqual(grad_map, {'out': 'out_grad'})

    def testStopGradientWithMultiUseOperators(self):
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'hidden2'),
            CreateOperator('StopGradient', 'hidden', 'hidden3'),
            CreateOperator('Direct', ['hidden2', 'hidden3'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad',
                           ['hidden2_grad', 'hidden3_grad']),
            CreateOperator('DirectGradient', 'hidden2_grad', 'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        gradients, grad_map = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)
        self.assertEqual(
            grad_map, {
                'out': 'out_grad',
                'hidden2': 'hidden2_grad',
                'hidden3': 'hidden3_grad',
                'hidden': 'hidden_grad',
                'in': 'in_grad'
            })

    def test_zero_gradient(self):
        net = core.Net("zero_grad_test")

        hidden_prev, cell, gates, seq_lengths, timestep = \
            net.AddExternalInput("h", "c", "g", "s", "t")
        hidden, cell = net.LSTMUnit(
            [hidden_prev, cell, gates, seq_lengths, timestep],
            ["hidden_t", "cell_t"])
        with self.assertRaises(Exception):
            net.AddGradientOperators([hidden])
        net.ZeroGradient(cell, [])
        net.AddGradientOperators([hidden])

    def test_two_grads(self):
        net = core.Net("test_two_grads")
        input, two, three = net.AddExternalInput("input", "two", "three")

        m1 = net.Mul([input, two], "mul_1")
        m2 = net.Mul([m1, three], "mul_2")
        grad_map = net.AddGradientOperators([m2, m1])
        workspace.ResetWorkspace()
        workspace.blobs[input] = np.array([1]).astype(np.float32)
        workspace.blobs[two] = np.array([2]).astype(np.float32)
        workspace.blobs[three] = np.array([3]).astype(np.float32)
        workspace.RunNetOnce(net)
        print(net.Proto())
        for blob in workspace.blobs:
            print(blob, workspace.blobs[blob])
        print("Input grad: ", workspace.blobs[grad_map[str(input)]])
        assert workspace.blobs[grad_map[str(input)]] == 8.0
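
The device-option plumbing these tests rely on is a single pattern: build OperatorDef protos with CreateOperator, then stamp a DeviceOption onto each through the embedded message's CopyFrom. A minimal sketch of that pattern outside the test harness, assuming the same caffe2 imports as above:

from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.core import CreateOperator

op = CreateOperator('Relu', 'x', 'y')
# OperatorDef embeds a DeviceOption submessage; CopyFrom overwrites it.
op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 1))
assert op.device_option.device_type == caffe2_pb2.CUDA
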
Code Example #7
def rewrite_run_net_simple_xrayocr_lstm(net, ideep=True):
    # For the xrayocr model with LSTM, only rewrite the non-LSTM part of the
    # net to enable MKL, then copy the temporary output blob at the break
    # point and all external inputs for the LSTM part to CPU, and execute the
    # rest of the net (the two LSTMs) on CPU.
    # This only works for the xrayocr LSTM model, which uses the first 'Shape'
    # op to decide the break point; after the two LSTMs the outputs are
    # external_outputs directly, so there is no need to copy back to IDEEP/MKL.

    def mkl_tmp(name):
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL"
    to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU"
    copy_input_op = core.CreateOperator(from_cpu, input_blob,
                                        mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # The net may contain some external_inputs falsely added during the
    # ONNX->Caffe2 conversion. This should be taken care of early during
    # pytorch_to_caffe2, but if not it can cause issues in follow-up steps,
    # so check here to confirm.
    for input_blob in net.external_input:
        for op in net.op:
            # look for if the external_input blob is output of any op in the net
            assert input_blob not in op.output

    external_output = None
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # the first Shape op marks the starting point of the LSTM chunk of the net
        if not find_first_shape_op:
            if op.type == 'Shape':
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # any external input in the LSTM part needs to be copied to CPU
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # make sure we found the expected break point of the net
    assert external_output is not None

    # create op to copy external input blobs used in LSTM part from IDEEP to CPU
    copy_extra_input_ops = []
    for in_blob in external_inputs_to_cpu:
        copy_extra_input_ops.append(
            core.CreateOperator(to_cpu, in_blob, cpu_tmp(in_blob)))
        # rename input blobs in LSTM part to use the CPU copy
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [
                blob if blob != in_blob else cpu_tmp(in_blob)
                for blob in op.input
            ]
            op.input[:] = renamed_input

    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output
    ]

    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [
            blob if blob != output_blob else mkl_tmp(blob)
            for blob in net.op[last_producer_idx].output
        ]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # rearrange all ops in correct order
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
          + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]
    del net.op[:]
    net.op.extend(ops)

    device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN
    for op in net.op:
        # the first Shape op marks the starting point of the LSTM chunk of the net
        if op.type == 'Shape':
            # all LSTM ops should run on CPU
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # set device to CPU
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # rename inputs in op of nested net
                        renamed_input = []
                        for blob in nested_op.input:
                            renamed_input.append(
                                blob if blob not in
                                external_inputs_to_cpu else cpu_tmp(blob))
                        nested_op.input[:] = renamed_input

                    # rename external inputs of nested net
                    new_external_input = []
                    for blob in arg.n.external_input:
                        new_external_input.append(
                            blob if blob not in
                            external_inputs_to_cpu else cpu_tmp(blob))
                    arg.n.external_input[:] = new_external_input

    if ideep:
        # Temporarily disable conv+relu fusion until we verify further
        # net.ParseFromString(
        #     C.transform_optimizeForIDEEP(net.SerializeToString()))
        fix_BoxWithNMSLimit(net)
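
One idiom this rewrite uses repeatedly is renaming blobs in an op's repeated input field via slice assignment, which is how protobuf repeated scalar fields are mutated in place. A small sketch with a hypothetical op:

from caffe2.python import core

op = core.CreateOperator('Direct', ['a', 'b'], 'c')
# Replace 'a' with its CPU copy; [:] assignment mutates the repeated field.
op.input[:] = ['a__CPU__' if blob == 'a' else blob for blob in op.input]
assert list(op.input) == ['a__CPU__', 'b']
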
Code Example #8
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1,
                          ("ConstantFill", {}), ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)

            # For testing explicit sync
            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
            return [loss]

        def add_optimizer(model):
            return optimizer.build_sgd(
                model,
                0.1,
                policy="fixed",
                max_gradient_norm=5.0,
                allow_lr_injection=True,
            )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
            shared_model=not gpu,
            combine_spatial_bn=not gpu,
        )
        data_parallel_model.AddBlobSync(model, ["sync_num"])

        # Light test for LR names
        lr_names = data_parallel_model.GetLearningRateBlobNames(model)
        self.assertGreater(len(lr_names), 0)

        np.random.seed(2603)

        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data
                    )
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels
                    )

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.FeedBlob(
                model._device_prefix + "_0/sync_num",
                np.array([i * 2]).astype(np.float32),
                device_option=core.DeviceOption(model._device_type, 0))
            workspace.RunNet(model.net.Proto().name)

            # Test AddBlobSync
            for j in model._devices:
                sync = workspace.FetchBlob(
                    model._device_prefix + "_{}/sync_num".format(j))[0]
                self.assertTrue(abs(sync - i * 2) < 0.01)

        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
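
The feeding loop above encodes the data_parallel_model naming convention: each device owns a name scope of the form '{device_prefix}_{device_id}/blob', and blobs are fed under a matching DeviceScope. A minimal sketch of feeding one blob per device, assuming two CPU devices and the 'cpu' prefix that Parallelize uses when cpu_device=True:

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

for g in [0, 1]:
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        workspace.FeedBlob("cpu_{}/data".format(g),
                           np.zeros((4, 16), dtype=np.float32))
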
Code Example #9
    def run_model(self, V, gpu_devices, cpu_indices):

        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather(
                        [self.vecs, 'indices'], 'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs, "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1,
                          ("ConstantFill", {}), ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [
                            param_grad.values, param_momentum, param
                        ],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill(
                    [], "vecs", shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [], "ONE_CPU", shape=[1], value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum(
                            [param, self.ONE_CPU, param_grad.indices,
                             param_grad.values, self.LR],
                            self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16
            )
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(
                    self.vecs,
                    orig_vecs
                )
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                if not cpu_indices:
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(
            np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
        return [workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
                workspace.FetchBlob("gpu_0/fc_w")]
Code Example #10
 def test_device_scope_check(self):
     with self.assertRaises(AssertionError):
         with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
             data_parallel_model.Parallelize_GPU(None, None, None)
Code Example #11
    def run_model(self, devices, gpu):

        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"),
                np.array([self.T] * self.batch_per_device, dtype=np.int32)
            )
            model.param_init_net.ConstantFill(
                [],
                "hidden_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim]
            )
            model.param_init_net.ConstantFill(
                [],
                "cell_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim]
            )

            output, _last_hidden, _, _last_state = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            loss = model.AveragedLoss(
                model.Sub([output, "target"], "dist"),
                "loss",
            )
            loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, param_grad, LR], param)

            assert len(model.GetParams()) == len(model.params) // len(model._devices)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            name="recurrent_test{}".format(devices),
        )

        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(devices)

        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=devices,
            optimize_gradient_memory=True,
            cpu_device=not gpu,
        )

        # Change all initialization to ConstantFills so that
        # everything is deterministic
        for op in model.param_init_net.Proto().op:
            if op.type.endswith('Fill'):
                op.type = 'ConstantFill'

        # Each run has the same input, independent of the number of GPUs
        np.random.seed(20150210)
        for i in range(0, 10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(
                self.T, self.batch_size, self.hidden_dim
            )

            for (j, g) in enumerate(devices):
                st = j * self.batch_per_device
                en = st + self.batch_per_device
                data = full_data[:, st:en, :].astype(np.float32)
                targets = full_target[:, st:en, :].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data
                    )
                    workspace.FeedBlob(
                        "{}_{}/target".format(model._device_prefix, g), targets
                    )

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix))
Code Example #12
def initialize_master_xpu_model_params(model, weights_file, opts, reset_epoch):
    log.info("Initializing model params from file: {}".format(weights_file))
    # Pickle files must be opened in binary mode under Python 3.
    with open(weights_file, 'rb') as fopen:
        blobs = pickle.load(fopen)
    if 'blobs' in blobs:
        blobs = blobs['blobs']

    start_epoch = 0
    best_metric = float('-inf')
    if 'epoch' in blobs:
        log.info('epoch {} is found in model file'.format(blobs['epoch']))
        if not reset_epoch:
            start_epoch = blobs['epoch']
        else:
            log.info('Reset epoch')
    else:
        log.info('no epoch is found in model file')
    lr = opts['model_param']['base_learning_rate']
    if 'lr' in blobs:
        lr = blobs['lr']
    if 'best_metric' in blobs and not reset_epoch:
        best_metric = blobs['best_metric']

    if model is not None:
        log.info('initialize model parameters using weights file: {}'.format(
            weights_file))
        ws_blobs = workspace.Blobs()
        unscoped_blob_names = OrderedDict()
        for blob in model.GetAllParams():
            unscoped_blob_names[unscope_name(str(blob))] = True
        root_xpu_id = opts['distributed']['first_xpu_id']
        device = opts['distributed']['device']
        caffe2_pb2_DEVICE = (
            caffe2_pb2.CUDA if opts['distributed']['device'] == 'gpu'
            else caffe2_pb2.CPU)
        with core.NameScope('{}_{}'.format(device, root_xpu_id)):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2_DEVICE, 0)):
                for unscoped_blob_name in unscoped_blob_names.keys():
                    scoped_blob_name = scoped_name(unscoped_blob_name)
                    if unscoped_blob_name not in blobs:
                        log.info('{:s} not found'.format(unscoped_blob_name))
                        continue
                    log.info('{:s} loaded from weights file into: {:s}'.format(
                        unscoped_blob_name, scoped_blob_name))
                    if scoped_blob_name in ws_blobs:
                        ws_blob = workspace.FetchBlob(scoped_blob_name)
                        if ws_blob.shape != blobs[unscoped_blob_name].shape:
                            log.info(
                                ('Workspace blob {} with shape {} does '
                                 'not match weights file shape {}').format(
                                     unscoped_blob_name, ws_blob.shape,
                                     blobs[unscoped_blob_name].shape))
                        else:
                            workspace.FeedBlob(
                                scoped_blob_name,
                                blobs[unscoped_blob_name].astype(np.float32,
                                                                 copy=False))
    else:
        log.info('Skip initializing model parameters from file: {}'.format(
            weights_file))
    log.info('Complete initialize_master_xpu_model_params')
    return start_epoch, lr, best_metric
Code Example #13
    def normalize_dense_matrix(
        self,
        input_matrix: str,
        features: List[int],
        normalization_parameters: Dict[int, NormalizationParameters],
        blobname_prefix: str,
        split_expensive_feature_groups: bool,
    ) -> Tuple[str, List[str]]:
        """
        Normalizes inputs according to parameters. Expects a dense matrix whose ith
        column corresponds to feature i.

        Note that the Caffe2 BatchBoxCox operator isn't implemented on CUDA GPU so
        we need to use a CPU context.

        :param input_matrix: Input matrix to normalize.
        :param features: Array that maps feature ids to column indices.
        :param normalization_parameters: Mapping from feature names to
            NormalizationParameters.
        :param blobname_prefix: Prefix for input blobs to norm_net.
        :param num_output_features: The number of features in an output processed
            datapoint. If set to None, this function will compute it.
        """
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            feature_starts = self._get_type_boundaries(
                features, normalization_parameters)

            normalized_input_blobs = []
            parameters: List[str] = []
            for i, feature_type in enumerate(FEATURE_TYPES):
                start_index = feature_starts[i]
                if (i + 1) == len(FEATURE_TYPES):
                    end_index = len(normalization_parameters)
                else:
                    end_index = feature_starts[i + 1]
                if start_index == end_index:
                    continue  # No features of this type
                slices = []

                split_feature_group, split_intervals = self._should_split_feature_group(
                    split_expensive_feature_groups, start_index, end_index,
                    feature_type)

                if split_feature_group:
                    for j in range(len(split_intervals) - 1):
                        slice_blob = self._get_input_blob_indexed(
                            blobname_prefix, feature_type, j)
                        C2.net().Slice(
                            [input_matrix],
                            [slice_blob],
                            starts=[0, split_intervals[j]],
                            ends=[-1, split_intervals[j + 1]],
                        )
                        slices.append((slice_blob, split_intervals[j],
                                       split_intervals[j + 1]))
                else:
                    sliced_input_features = self._get_input_blob(
                        blobname_prefix, feature_type)

                    C2.net().Slice(
                        [input_matrix],
                        [sliced_input_features],
                        starts=[0, start_index],
                        ends=[-1, end_index],
                    )

                    slices.append(
                        (sliced_input_features, start_index, end_index))

                for (slice_blob, start, end) in slices:
                    normalized_input_blob, blob_parameters = self.preprocess_blob(
                        slice_blob,
                        [
                            normalization_parameters[x]
                            for x in features[start:end]
                        ],
                    )
                    logger.info(
                        "Processed split ({}, {}) for feature type {}".format(
                            start, end, feature_type))
                    parameters.extend(blob_parameters)
                    normalized_input_blobs.append(normalized_input_blob)
            for i, inp in enumerate(normalized_input_blobs):
                logger.info("input# {}: {}".format(i, inp))
            concatenated_input_blob, concatenated_input_blob_dim = C2.Concat(
                *normalized_input_blobs, axis=1)
        return concatenated_input_blob, parameters
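
The Slice calls above pass starts=[0, s] and ends=[-1, e]; in Caffe2's Slice, a -1 in ends means "through the end of that dimension", so each call extracts columns [s, e) for every row. A numpy sketch of the equivalent indexing with hypothetical values:

import numpy as np

m = np.arange(12, dtype=np.float32).reshape(3, 4)
s, e = 1, 3
# Equivalent to Slice(starts=[0, s], ends=[-1, e]) on a 2-D matrix.
print(m[:, s:e])
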
Code Example #14
    def __init__(
        self,
        cli_args,
        model=None,
        tag=None,
        enable_prof=False,
    ):
        super(Wide_and_Deep_Wrapper, self).__init__()
        self.args = cli_args

        # GPU Enable Flags
        gpu_en = self.args.use_gpu

        if gpu_en:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
            ngpus = C.num_cuda_devices  # 1
            print("(Wrapper) Using {} GPU(s)...".format(ngpus))
        else:
            device_opt = core.DeviceOption(caffe2_pb2.CPU)
            print("(Wrapper) Using CPU...")

        self.gpu_en = gpu_en

        num_tables = len(cli_args.arch_embedding_size.split("-"))

        # We require 3 data structures in Caffe2 to enable non-blocking inputs
        # for Wide_and_Deep. At a high level, each input needs an input queue.
        # Inputs are enqueued when they arrive on the "server" or "core" and
        # dequeued by the model's inference engine:
        # Input Blob -> Input Net -> ID Q ===> Wide_and_Deep model
        self.id_qs = []
        self.id_input_blobs = []
        self.id_input_nets = []

        # Same thing for the lengths inputs
        self.len_qs = []
        self.len_input_blobs = []
        self.len_input_nets = []

        for i in range(num_tables):

            q, input_blob, net = self.build_wnd_sparse_queue(tag="id", qid=i)
            self.id_qs.append(q)
            self.id_input_blobs.append(input_blob)
            self.id_input_nets.append(net)

            q, input_blob, net = self.build_wnd_sparse_queue(tag="len", qid=i)
            self.len_qs.append(q)
            self.len_input_blobs.append(input_blob)
            self.len_input_nets.append(net)

        self.fc_q, self.fc_input_blob, self.fc_input_net = self.build_wnd_fc_queue(
        )

        if self.args.queue:
            with core.DeviceScope(device_opt):
                self.wnd = Wide_and_Deep(cli_args,
                                         model,
                                         tag,
                                         enable_prof,
                                         id_qs=self.id_qs,
                                         len_qs=self.len_qs,
                                         fc_q=self.fc_q)
        else:
            with core.DeviceScope(device_opt):
                self.wnd = Wide_and_Deep(cli_args, model, tag, enable_prof)
Code Example #15
    def test_lstm_extract_predictor_net(self):
        model = ModelHelper(name="lstm_extract_test")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            output, _, _, _ = rnn_cell.LSTM(
                model=model,
                input_blob="input",
                seq_lengths="seqlengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=20,
                dim_out=40,
                scope="test",
                drop_states=True,
                return_last_layer_only=True,
            )
        # Run param init net to get the shapes for all inputs
        shapes = {}
        workspace.RunNetOnce(model.param_init_net)
        for b in workspace.Blobs():
            shapes[b] = workspace.FetchBlob(b).shape

        # But export in CPU
        (predict_net, export_blobs) = ExtractPredictorNet(
            net_proto=model.net.Proto(),
            input_blobs=["input"],
            output_blobs=[output],
            device=core.DeviceOption(caffe2_pb2.CPU, 1),
        )

        # Create the net and run once to see it is valid
        # Populate external inputs with correctly shaped random input
        # and also ensure that the export_blobs was constructed correctly.
        workspace.ResetWorkspace()
        shapes['input'] = [10, 4, 20]
        shapes['cell_init'] = [1, 4, 40]
        shapes['hidden_init'] = [1, 4, 40]

        print(predict_net.Proto().external_input)
        self.assertTrue('seqlengths' in predict_net.Proto().external_input)
        for einp in predict_net.Proto().external_input:
            if einp == 'seqlengths':
                workspace.FeedBlob("seqlengths",
                                   np.array([10] * 4, dtype=np.int32))
            else:
                workspace.FeedBlob(
                    einp,
                    np.zeros(shapes[einp]).astype(np.float32),
                )
                if einp != 'input':
                    self.assertTrue(einp in export_blobs)

        print(str(predict_net.Proto()))
        self.assertTrue(workspace.CreateNet(predict_net.Proto()))
        self.assertTrue(workspace.RunNet(predict_net.Proto().name))

        # Validate device options set correctly for the RNNs
        import google.protobuf.text_format as protobuftx
        for op in predict_net.Proto().op:
            if op.type == 'RecurrentNetwork':
                for arg in op.arg:
                    if arg.name == "step_net":
                        step_proto = caffe2_pb2.NetDef()
                        # arg.s holds the serialized text proto as bytes;
                        # text_format.Merge expects str under Python 3.
                        protobuftx.Merge(arg.s.decode('ascii'), step_proto)
                        for step_op in step_proto.op:
                            self.assertEqual(0,
                                             step_op.device_option.device_type)
                            self.assertEqual(1,
                                             step_op.device_option.cuda_gpu_id)
                    elif arg.name == 'backward_step_net':
                        self.assertEqual("", arg.s)
Code Example #16
    def run_model(self, V, gpu_devices):

        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec, 'gpuvec_{}'.format(num),
                )
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'],
                    ['gpu_vec_gathered_{}'.format(num)]
                )
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill(
                        [], "vec_{}".format(num), shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [],
                        "vec_2", shape=[batch_per_device, 16]
                    )
                )
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [], "ONE_CPU", shape=[1], value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size
            )
            full_labels = full_indices[:] % batch_per_device

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(
                        vec,
                        orig_vec
                    )
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob(
                    'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num))
                for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.format(
                        len(idx), len(grad_slice)
                    )
                )
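For reference, the sparse ScatterWeightedSum update used in param_update_fun above can be exercised standalone; a minimal CPU sketch with illustrative blob names:

# Minimal standalone sketch of the ScatterWeightedSum update above: only
# the rows of "param" selected by "indices" are modified in place.
# Blob names are illustrative.
import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob("param", np.zeros((5, 3), dtype=np.float32))
workspace.FeedBlob("one", np.ones(1, dtype=np.float32))
workspace.FeedBlob("indices", np.array([0, 2], dtype=np.int32))
workspace.FeedBlob("values", np.ones((2, 3), dtype=np.float32))
op = core.CreateOperator(
    "ScatterWeightedSum",
    ["param", "one", "indices", "values", "one"],
    ["param"],
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("param"))  # rows 0 and 2 are now all ones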
コード例 #17
0
ファイル: test_trt.py プロジェクト: zhengxle/pytorch
    def test_resnet50_core(self):
        N = 2
        warmup = 20
        repeat = 100
        print("Batch size: {}, repeat inference {} times, warmup {} times".
              format(N, repeat, warmup))
        init_net, pred_net, _ = self._get_c2_model('resnet50')
        self._add_head_tail(pred_net, 'real_data', 'real_softmax')
        input_blob_dims = (N, 3, 224, 224)
        input_name = "real_data"

        device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
        init_net.device_option.CopyFrom(device_option)
        pred_net.device_option.CopyFrom(device_option)
        for op in pred_net.op:
            op.device_option.CopyFrom(device_option)
            op.engine = 'CUDNN'
        net_outputs = pred_net.external_output
        Y_c2 = None
        data = np.random.randn(*input_blob_dims).astype(np.float32)
        c2_time = 1
        workspace.SwitchWorkspace("gpu_test", True)
        with core.DeviceScope(device_option):
            workspace.FeedBlob(input_name, data)
            workspace.RunNetOnce(init_net)
            workspace.CreateNet(pred_net)
            for _ in range(warmup):
                workspace.RunNet(pred_net.name)
            start = time.time()
            for _ in range(repeat):
                workspace.RunNet(pred_net.name)
            end = time.time()
            c2_time = end - start
            output_values = [workspace.FetchBlob(name) for name in net_outputs]
            Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values)
        workspace.ResetWorkspace()

        # Fill the workspace with the weights
        with core.DeviceScope(device_option):
            workspace.RunNetOnce(init_net)

        # Cut the graph
        start = time.time()
        pred_net_cut = transform_caffe2_net(pred_net,
                                            {input_name: input_blob_dims},
                                            build_serializable_op=True)
        del init_net, pred_net
        #_print_net(pred_net_cut)

        Y_trt = None
        input_name = pred_net_cut.external_input[0]
        print("C2 runtime: {}s".format(c2_time))
        with core.DeviceScope(device_option):
            workspace.FeedBlob(input_name, data)
            workspace.CreateNet(pred_net_cut)
            end = time.time()
            print("Conversion time: {:.2f}s".format(end - start))

            for _ in range(warmup):
                workspace.RunNet(pred_net_cut.name)
            start = time.time()
            for _ in range(repeat):
                workspace.RunNet(pred_net_cut.name)
            end = time.time()
            trt_time = end - start
            print("TRT runtime: {}s, improvement: {}%".format(
                trt_time, (c2_time - trt_time) / c2_time * 100))
            output_values = [workspace.FetchBlob(name) for name in net_outputs]
            Y_trt = namedtupledict('Outputs', net_outputs)(*output_values)
        np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
コード例 #18
0
    def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
        caffe2_res = {}

        alpha = 1.0
        mu = 0.0
        beta = 0.999
        curv_win_width = 20
        epsilon = 1e-6

        net = core.Net("net")
        param_init_net = core.Net("param_init_net")
        workspace.ResetWorkspace()

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            iteration = param_init_net.ConstantFill([],
                                                    "iteration",
                                                    shape=[1],
                                                    value=0,
                                                    dtype=core.DataType.INT64)
            iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
            net.AtomicIter([iter_mutex, iteration], [iteration])
        pre_grad = param_init_net.ConstantFill([],
                                               "pre_grad",
                                               shape=[n_dim],
                                               value=grad_coef)
        if gpu:
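            # Despite its output name "iteration_cpu", this blob is the
            # GPU copy of the iteration counter (CopyCPUToGPU runs under
            # the ambient GPU device scope).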
            iteration = net.CopyCPUToGPU([iteration], "iteration_cpu")
        iteration_float = net.Cast([iteration], "iteration_float",
                                   to=core.DataType.FLOAT)
        grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True)
        w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0)

        # a hack to create an object with __dict__
        param_info = lambda: None
        param_info.blob = w
        param_info.grad = grad

        optimizer.YellowFinOptimizer(alpha=alpha,
                                     mu=mu,
                                     beta=beta,
                                     curv_win_width=curv_win_width,
                                     epsilon=epsilon,
                                     zero_debias=zero_debias)._run(
                                         net, param_init_net, param_info)

        workspace.RunNetOnce(param_init_net)
        workspace.CreateNet(net, overwrite=True)
        for i in range(n_iter):
            workspace.RunNet(net)
            scalars_memory_blob = workspace.FetchBlob("w_scalars_memory")
            g_norm2_avg = scalars_memory_blob[1]
            g_norm2_min_avg = scalars_memory_blob[2]
            g_norm2_max_avg = scalars_memory_blob[3]
            distance_avg = scalars_memory_blob[4]
            g_avg_blob = workspace.FetchBlob("w_g_avg")
            res_lr = workspace.FetchBlob("w_lr_avg")[0]
            res_mu = workspace.FetchBlob("w_mu_avg")[0]
            g_deb = self.deb(g_avg_blob, beta, i + 1, zero_debias)
            variance = max(
                self.deb(g_norm2_avg, beta, i + 1, zero_debias) -
                g_deb.dot(g_deb), epsilon)
            if i > 0:
                caffe2_res[i] = {
                    'h_max':
                    np.exp(self.deb(g_norm2_max_avg, beta, i + 1,
                                    zero_debias)),
                    'h_min':
                    np.exp(self.deb(g_norm2_min_avg, beta, i + 1,
                                    zero_debias)),
                    'var':
                    variance,
                    'dist':
                    self.deb(distance_avg, beta, i + 1, zero_debias),
                    'lr':
                    res_lr,
                    'mu':
                    res_mu
                }
        return caffe2_res
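The self.deb helper referenced above is defined elsewhere in the test class; a hedged sketch of the zero-debias correction it presumably applies (the standard Adam-style 1 - beta^t factor):

# Hedged sketch of the zero-debias helper used above; the real self.deb
# is defined elsewhere in the test class. With zero_debias enabled, it
# divides the moving average by the Adam-style bias-correction factor.
def deb(avg, beta, t, zero_debias):
    if zero_debias:
        return avg / (1.0 - beta ** t)
    return avg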
コード例 #19
0
import sys
sys.path.insert(0, '/home/ernie/caffe2/build')
from caffe2.python import cnn, workspace, core
from caffe2.proto import caffe2_pb2
import numpy as np
import time

#device_opts = caffe2_pb2.DeviceOption()
#device_opts.device_type = caffe2_pb2.CUDA
#device_opts.cuda_gpu_id = 0
device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
net = core.Net("smoothL1Loss_test")
net.SmoothL1LossGradient(["data1", "data2", "avg_loss"],
                         "loss",
                         device_option=device_opts)

print(net.Proto())

data1 = np.load('data1.npy')
data2 = np.load('data2.npy')
avg_loss = np.ones(1, dtype=np.float32)

workspace.FeedBlob("data1", data1, device_option=device_opts)
workspace.FeedBlob("data2", data2, device_option=device_opts)
workspace.FeedBlob("avg_loss", avg_loss, device_option=device_opts)
workspace.CreateNet(net.Proto())

workspace.RunNet("smoothL1Loss_test", 1)

caffe2_out = workspace.FetchBlob('loss')
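The commented-out lines at the top of this script show the CUDA device option; a sketch of the same run on GPU (assuming a CUDA-enabled build with this operator registered for CUDA), where the net must be rebuilt so its op carries the new device option:

# GPU variant of the script above, assuming a CUDA-enabled build with a
# CUDA kernel registered for SmoothL1LossGradient. The net is rebuilt so
# its op carries the CUDA device option.
device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)
gpu_net = core.Net("smoothL1Loss_test_gpu")
gpu_net.SmoothL1LossGradient(["data1", "data2", "avg_loss"],
                             "loss",
                             device_option=device_opts)
workspace.FeedBlob("data1", data1, device_option=device_opts)
workspace.FeedBlob("data2", data2, device_option=device_opts)
workspace.FeedBlob("avg_loss", avg_loss, device_option=device_opts)
workspace.CreateNet(gpu_net.Proto())
workspace.RunNet("smoothL1Loss_test_gpu", 1)
caffe2_out_gpu = workspace.FetchBlob('loss')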
コード例 #20
0
def feature_extractor(load_model_path=None,
                      test_data=None,
                      gpu_list=None,
                      num_gpus=0,
                      batch_size=4,
                      clip_per_video=1,
                      decode_type=2,
                      clip_length_rgb=4,
                      sampling_rate_rgb=1,
                      scale_h=128,
                      scale_w=171,
                      crop_size=112,
                      video_res_type=0,
                      num_decode_threads=4,
                      multi_label=0,
                      num_labels=101,
                      input_type=0,
                      clip_length_of=8,
                      sampling_rate_of=2,
                      frame_gap_of=2,
                      do_flow_aggregation=0,
                      flow_data_type=0,
                      get_video_id=1,
                      get_start_frame=0,
                      use_local_file=1,
                      crop_per_clip=1,
                      db_type='pickle',
                      model_name='r2plus1d',
                      model_depth=18,
                      num_channels=3,
                      output_path=None,
                      use_cudnn=1,
                      layers='final_avg',
                      num_iterations=1,
                      channel_multiplier=1.0,
                      bottleneck_multiplier=1.0,
                      use_pool1=0,
                      use_convolutional_pred=0,
                      use_dropout=0,
                      **kwargs):
    """
    :param gpu_list: list of gpu ids to use
    :param batch_size: batch size
    :param clip_per_video: When clip_per_video > 1, sample this many clips uniformly in time
    :param decode_type: 0: random, 1: uniform sampling, 2: use starting frame
    :param clip_length_rgb: Length of input clips
    :param sampling_rate_rgb: Frame sampling rate
    :param scale_h: Scale image height to
    :param scale_w: Scale image width to
    :param crop_size: Input image size (to crop to)
    :param video_res_type: Video frame scaling option, 0: scaled by height x width; 1: scaled by short edge
    :param num_decode_threads: number of decoding threads
    :param multi_label: Multiple label csv file input
    :param num_labels: Number of labels
    :param input_type: 0=rgb, 1=optical flow
    :param clip_length_of: Frames of optical flow data
    :param sampling_rate_of: Sampling rate for optical flows
    :param frame_gap_of: Frame gap of optical flows
    :param do_flow_aggregation: whether to aggregate optical flow across multiple frames
    :param flow_data_type: 0=Flow2C, 1=Flow3C, 2=FlowWithGray, 3=FlowWithRGB
    :param get_video_id: Output video id
    :param get_start_frame: Output clip start frame
    :param use_local_file: use local file
    :param crop_per_clip: number of spatial crops per clip
    :param db_type: Db type of the testing model
    :param model_name: Model name
    :param model_depth: Model depth
    :param num_channels: Number of channels
    :param load_model_path: Load saved model for testing
    :param test_data: Path to the test data
    :param output_path: Path to output pickle; defaults to layers.pickle next to <test_data>
    :param use_cudnn: Use CuDNN
    :param layers: Comma-separated list of blob names to fetch
    :param num_iterations: Run only this many iterations
    :param channel_multiplier: Channel multiplier
    :param bottleneck_multiplier: Bottleneck multiplier
    :param use_pool1: use pool1 layer
    :param use_convolutional_pred: Use convolutional predictions
    :param use_dropout: Use dropout at the prediction layer
    """
    if load_model_path is None or test_data is None:
        raise Exception('Model path AND test data need to be specified')

    # Initialize Caffe2
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])

    if gpu_list is None:
        if num_gpus == 0:
            raise Exception('Must specify GPUs')
        else:
            gpus = list(range(num_gpus))
    else:
        gpus = gpu_list
        num_gpus = len(gpus)

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(name="Extract features", **my_arg_scope)

    video_input_args = dict(
        batch_size=batch_size,
        clip_per_video=clip_per_video,
        decode_type=decode_type,
        length_rgb=clip_length_rgb,
        sampling_rate_rgb=sampling_rate_rgb,
        scale_h=scale_h,
        scale_w=scale_w,
        crop_size=crop_size,
        video_res_type=video_res_type,
        short_edge=min(scale_h, scale_w),
        num_decode_threads=num_decode_threads,
        do_multi_label=multi_label,
        num_of_class=num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=input_type,
        length_of=clip_length_of,
        sampling_rate_of=sampling_rate_of,
        frame_gap_of=frame_gap_of,
        do_flow_aggregation=do_flow_aggregation,
        flow_data_type=flow_data_type,
        get_rgb=input_type == 0,
        get_optical_flow=input_type == 1,
        get_video_id=get_video_id,
        get_start_frame=get_start_frame,
        use_local_file=use_local_file,
        crop_per_clip=crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + '_reader',
        input_data=test_data,
    )

    reader, num_examples = reader_utils.create_data_reader(
        model, **reader_args)

    def input_fn(model):
        model_helper.AddVideoInput(model, reader, **video_input_args)

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=model_name,
            model_depth=model_depth,
            num_labels=num_labels,
            batch_size=batch_size,
            num_channels=num_channels,
            crop_size=crop_size,
            clip_length=(clip_length_of
                         if input_type == 1 else clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=multi_label,
            channel_multiplier=channel_multiplier,
            bottleneck_multiplier=bottleneck_multiplier,
            use_dropout=use_dropout,
            use_convolutional_pred=use_convolutional_pred,
            use_pool1=use_pool1,
        )

    ##
    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(model, load_model_path)
    elif db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(load_model_path, db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(load_model_path, db_type)
    else:
        log.warning("Unsupported db_type: {}".format(db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    ##
    def fetchActivations(model, outputs, num_iterations):

        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            num_devices = 1  # default for cpu
            if num_gpus > 0:
                num_devices = num_gpus
            for g in range(num_devices):
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(g) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    if output_name not in all_activations:
                        all_activations[output_name] = []
                    all_activations[output_name].append(activations)

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    if not isinstance(layers, list):
        layers = [layers]

    if 'video_id' not in layers:
        layers.append('video_id')

    assert len(layers) > 0

    examples_per_iteration = batch_size * num_gpus
    num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, layers, num_iterations)

    # saving extracted layers
    for index in range(len(layers)):
        log.info("Read '{}' with shape {}".format(
            layers[index], activations[layers[index]].shape))

    if output_path:
        log.info('Writing to {}'.format(output_path))
        # save_h5 is not a named parameter of this function; read it from
        # kwargs so the name is defined (assumed default: False).
        if kwargs.get('save_h5', False):
            with h5py.File(output_path, 'w') as handle:
                for name, activation in activations.items():
                    handle.create_dataset(name, data=activation)
        else:
            with open(output_path, 'wb') as handle:
                pickle.dump(activations, handle)
    else:
        return activations
コード例 #21
0
import contextlib

@contextlib.contextmanager
def CpuScope():
    """Create a CPU device scope."""
    # contextmanager makes the bare yield usable as "with CpuScope():".
    cpu_dev = core.DeviceOption(caffe2_pb2.CPU)
    with core.DeviceScope(cpu_dev):
        yield
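A brief usage sketch (blob and net names illustrative): blobs fed and operators created inside the scope pick up the CPU device option from the current device scope.

# Usage sketch: inside the scope, FeedBlob and newly created net ops
# inherit the CPU device option. Blob and net names are illustrative.
import numpy as np
from caffe2.python import core, workspace

with CpuScope():
    workspace.FeedBlob("x_cpu", np.zeros(3, dtype=np.float32))
    net = core.Net("cpu_only_net")
    net.Relu(["x_cpu"], ["y_cpu"])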
コード例 #22
0
def get_device_option(device):
    m = {
        DeviceType.CPU: caffe2_pb2.CPU,
        DeviceType.CUDA: workspace.GpuDeviceType
    }
    return core.DeviceOption(m[device.type], device.device_id)
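A hedged usage sketch; FakeDevice is a hypothetical stand-in for the module's real device wrapper, which only needs type and device_id attributes:

# Hedged usage sketch: FakeDevice is a hypothetical stand-in for the
# module's device wrapper; get_device_option only reads .type and
# .device_id from it.
from collections import namedtuple

FakeDevice = namedtuple("FakeDevice", ["type", "device_id"])
cpu_opt = get_device_option(FakeDevice(type=DeviceType.CPU, device_id=0))
assert cpu_opt.device_type == caffe2_pb2.CPU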
コード例 #23
0
ファイル: load_save_test.py プロジェクト: zyxunh/pytorch
    def load_save(self, src_device_type, src_gpu_id, dst_device_type,
                  dst_gpu_id):
        workspace.ResetWorkspace()
        dtypes = [
            np.float16, np.float32, np.float64, np.bool, np.int8, np.int16,
            np.int32, np.int64, np.uint8, np.uint16
        ]
        arrays = [
            np.random.permutation(6).reshape(2, 3).astype(T) for T in dtypes
        ]
        assume(src_device_type == caffe2_pb2.CUDA or src_gpu_id == 0)
        assume(dst_device_type == caffe2_pb2.CUDA or dst_gpu_id == 0)
        src_device_option = core.DeviceOption(src_device_type, src_gpu_id)
        dst_device_option = core.DeviceOption(dst_device_type, dst_gpu_id)

        for i, arr in enumerate(arrays):
            self.assertTrue(workspace.FeedBlob(str(i), arr, src_device_option))
            self.assertTrue(workspace.HasBlob(str(i)))

        try:
            # Saves the blobs to a local db.
            tmp_folder = tempfile.mkdtemp()
            op = core.CreateOperator("Save",
                                     [str(i) for i in range(len(arrays))], [],
                                     absolute_path=1,
                                     db=os.path.join(tmp_folder, "db"),
                                     db_type=self._db_type)
            self.assertTrue(workspace.RunOperatorOnce(op))

            # Reset the workspace so that anything we load is surely loaded
            # from the serialized proto.
            workspace.ResetWorkspace()
            self.assertEqual(len(workspace.Blobs()), 0)

            def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll):
                """A helper subfunction to test keep and not keep."""
                op = core.CreateOperator("Load", [],
                                         blobs,
                                         absolute_path=1,
                                         db=os.path.join(tmp_folder, "db"),
                                         db_type=self._db_type,
                                         device_option=dst_device_option,
                                         keep_device=keep_device,
                                         load_all=loadAll)
                self.assertTrue(workspace.RunOperatorOnce(op))
                for i, arr in enumerate(arrays):
                    self.assertTrue(workspace.HasBlob(str(i)))
                    fetched = workspace.FetchBlob(str(i))
                    self.assertEqual(fetched.dtype, arr.dtype)
                    np.testing.assert_array_equal(workspace.FetchBlob(str(i)),
                                                  arr)
                    proto = caffe2_pb2.BlobProto()
                    proto.ParseFromString(workspace.SerializeBlob(str(i)))
                    self.assertTrue(proto.HasField('tensor'))
                    self.assertEqual(proto.tensor.device_detail.device_type,
                                     device_type)
                    if device_type == caffe2_pb2.CUDA:
                        self.assertEqual(
                            proto.tensor.device_detail.cuda_gpu_id, gpu_id)

            blobs = [str(i) for i in range(len(arrays))]
            # Load using device option stored in the proto, i.e.
            # src_device_option
            _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
            # Load again, but this time load into dst_device_option.
            _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)
            # Load back to the src_device_option to see if both paths are able
            # to reallocate memory.
            _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
            # Reset the workspace, and load directly into the dst_device_option.
            workspace.ResetWorkspace()
            _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)

            # Test load all which loads all blobs in the db into the workspace.
            workspace.ResetWorkspace()
            _LoadTest(1, src_device_type, src_gpu_id, [], 1)
            # Load again making sure that overwrite functionality works.
            _LoadTest(1, src_device_type, src_gpu_id, [], 1)
            # Load again with different device.
            _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
            workspace.ResetWorkspace()
            _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
        finally:
            # clean up temp folder.
            try:
                shutil.rmtree(tmp_folder)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
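Distilled from the pattern this test exercises, a minimal standalone Save/Load round trip on CPU (paths and blob names illustrative):

# Minimal Save/Load round trip on CPU, distilled from the test above.
# Paths and blob names are illustrative.
import os
import tempfile

import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob("w", np.arange(6, dtype=np.float32).reshape(2, 3))
tmp_folder = tempfile.mkdtemp()
save_op = core.CreateOperator(
    "Save", ["w"], [], absolute_path=1,
    db=os.path.join(tmp_folder, "db"), db_type="minidb")
workspace.RunOperatorOnce(save_op)

workspace.ResetWorkspace()
load_op = core.CreateOperator(
    "Load", [], [], absolute_path=1,
    db=os.path.join(tmp_folder, "db"), db_type="minidb", load_all=1)
workspace.RunOperatorOnce(load_op)
print(workspace.FetchBlob("w"))  # the original 2x3 array, restored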
コード例 #24
0
def CpuDevice():
    """Create a Cuda device."""
    return core.DeviceOption(caffe2_pb2.CPU)
コード例 #25
0
ファイル: lstm_benchmark.py プロジェクト: shahrezjan/caffe2
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument("--forward_only",
                        action="store_true",
                        help="Whether to run only forward pass")
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="Number of LSTM layers. All output dimensions are going to be"
        "of hidden_dim size",
    )

    return parser


if __name__ == '__main__':
    args = GetArgumentParser().parse_args()

    workspace.GlobalInit([
        'caffe2', '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0', '--caffe2_gpu_memory_tracking=1'
    ])

    device = core.DeviceOption(caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU,
                               0)

    with core.DeviceScope(device):
        Benchmark(args)
コード例 #26
0
def ExtractFeatures(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope)

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=args.decode_type,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=args.input_type == 0,
        get_optical_flow=args.input_type == 1,
        get_video_id=args.get_video_id,
        get_start_frame=args.get_start_frame,
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + '_reader',
        input_data=args.test_data,
    )

    reader, num_examples = reader_utils.create_data_reader(
        model, **reader_args)

    def input_fn(model):
        model_helper.AddVideoInput(model, reader, **video_input_args)

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of
                         if args.input_type == 1 else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(model, args.load_model_path)
    elif args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    def fetchActivations(model, outputs, num_iterations):

        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            num_devices = 1  # default for cpu
            if num_gpus > 0:
                num_devices = num_gpus
            for g in range(num_devices):
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(g) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    if output_name not in all_activations:
                        all_activations[output_name] = []
                    all_activations[output_name].append(activations)

            if counter % 20 == 0:
                log.info('{}/{} iterations'.format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    outputs = [name.strip() for name in args.features.split(',')]
    assert len(outputs) > 0

    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for index in range(len(outputs)):
        log.info("Read '{}' with shape {}".format(
            outputs[index], activations[outputs[index]].shape))

    if args.output_path:
        output_path = args.output_path
    else:
        output_path = os.path.dirname(args.test_data) + '/features.pickle'

    log.info('Writing to {}'.format(output_path))
    if args.save_h5:
        with h5py.File(output_path, 'w') as handle:
            for name, activation in activations.items():
                handle.create_dataset(name, data=activation)
    else:
        with open(output_path, 'wb') as handle:
            pickle.dump(activations, handle)

    # perform sanity check
    if args.sanity_check == 1:  # check clip accuracy
        assert args.multi_label == 0
        clip_acc = 0
        softmax = activations['softmax']
        label = activations['label']
        for i in range(len(softmax)):
            sorted_preds = np.argsort(softmax[i])[::-1]
            if sorted_preds[0] == label[i]:
                clip_acc += 1
        log.info('Sanity check --- clip accuracy: {}'.format(clip_acc /
                                                             len(softmax)))
    elif args.sanity_check == 2:  # check auc
        assert args.multi_label == 1
        prob = activations['prob']
        label = activations['label']
        mean_auc, mean_ap, mean_wap, _ = metric.mean_ap_metric(prob, label)
        log.info('Sanity check --- AUC: {}, mAP: {}, mWAP: {}'.format(
            mean_auc, mean_ap, mean_wap))
コード例 #27
0
    def apply_over_sequence(
        self,
        model,
        inputs,
        seq_lengths,
        initial_states,
        outputs_with_grads=None,
    ):
        inputs = self.cell.prepare_input(model, inputs)

        # Now they are blob references - outputs of splitting the input sequence
        split_inputs = model.net.Split(
            inputs,
            [str(inputs) + "_timestep_{}".format(i)
             for i in range(self.T)],
            axis=0)
        if self.T == 1:
            split_inputs = [split_inputs]

        states = initial_states
        all_states = []
        for t in range(0, self.T):
            scope_name = "timestep_{}".format(t)
            # Parameters of all timesteps are shared
            with ParameterSharing({scope_name: ''}),\
                 scope.NameScope(scope_name):
                timestep = model.param_init_net.ConstantFill(
                    [], "timestep", value=t, shape=[1],
                    dtype=core.DataType.INT32,
                    device_option=core.DeviceOption(caffe2_pb2.CPU))
                states = self.cell._apply(
                    model=model,
                    input_t=split_inputs[t],
                    seq_lengths=seq_lengths,
                    states=states,
                    timestep=timestep,
                )
            all_states.append(states)

        all_states = zip(*all_states)
        all_states = [
            model.net.Concat(
                list(full_output),
                [
                    str(full_output[0])[len("timestep_0/"):] + "_concat",
                    str(full_output[0])[len("timestep_0/"):] + "_concat_info"

                ],
                axis=0)[0]
            for full_output in all_states
        ]
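        # Interleave the concatenated per-timestep outputs with the final
        # states. Note: terminating tuple() by letting six.next() raise
        # StopIteration inside the generator relies on pre-PEP 479
        # semantics; on Python 3.7+ this raises RuntimeError instead.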
        outputs = tuple(
            six.next(it) for it in
            itertools.cycle([iter(all_states), iter(states)])
        )
        outputs_without_grad = set(range(len(outputs))) - set(
            outputs_with_grads)
        for i in outputs_without_grad:
            model.net.ZeroGradient(outputs[i], [])
        logging.debug("Added 0 gradients for blobs:",
                      [outputs[i] for i in outputs_without_grad])

        final_output = self.cell._prepare_output_sequence(model, outputs)

        return final_output, outputs
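A hedged sketch of the timestep weight sharing used above: mapping a name scope to the root scope via ParameterSharing makes parameters created inside it resolve to the root-scope blobs, so every timestep reuses one set of weights.

# Hedged sketch of ParameterSharing as used above: mapping "step" to ""
# (the root scope) makes params created under "step" resolve to the
# root-scope blobs, so the second fc reuses y_w / y_b from the first.
from caffe2.python import brew, model_helper, scope
from caffe2.python.modeling.parameter_sharing import ParameterSharing

model = model_helper.ModelHelper(name="sharing_sketch")
brew.fc(model, "x", "y", dim_in=4, dim_out=4)      # creates y_w, y_b
with ParameterSharing({"step": ""}), scope.NameScope("step"):
    brew.fc(model, "x", "y", dim_in=4, dim_out=4)  # reuses y_w, y_b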
コード例 #28
0
def get_device_option_cpu():
    device_option = core.DeviceOption(caffe2_pb2.CPU)
    return device_option
コード例 #29
0
    def _run(self, net, param_init_net, param_info):

        # Note: SCALARS_MEMORY_SIZE is the number of persistent scalars the
        #       YellowFin optimizer keeps. It must match the scalar count
        #       assumed by the YellowFin operator itself.
        SCALARS_MEMORY_SIZE = 5

        param = param_info.blob
        grad = param_info.grad
        moment = param_init_net.ConstantFill([param],
                                             param + "_moment",
                                             value=0.0)
        curv_win = param_init_net.ConstantFill([],
                                               param + "_curv_win",
                                               shape=[self.curv_win_width],
                                               value=0.0)
        g_avg = param_init_net.ConstantFill([param],
                                            param + "_g_avg",
                                            value=0.0)
        g2_avg = param_init_net.ConstantFill([param],
                                             param + "_g2_avg",
                                             value=0.0)
        lr_avg = param_init_net.ConstantFill([],
                                             param + "_lr_avg",
                                             shape=[1],
                                             value=self.alpha)
        mu_avg = param_init_net.ConstantFill([],
                                             param + "_mu_avg",
                                             shape=[1],
                                             value=self.mu)
        scalars_memory = param_init_net.ConstantFill(
            [],
            param + "_scalars_memory",
            shape=[SCALARS_MEMORY_SIZE],
            value=0.0)

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "YellowFin does not support sparse gradients"

        if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
            # Add training operators.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [],
                    _OPTIMIZER_ITERATION_NAME,
                    shape=[1],
                    value=0,
                    dtype=core.DataType.INT64)
                iter_mutex = param_init_net.CreateMutex([],
                                                        ["iteration_mutex"])
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(moment)
        self._aux_params.local.append(lr_avg)
        self._aux_params.local.append(mu_avg)
        self._aux_params.local.append(curv_win)
        self._aux_params.local.append(g_avg)
        self._aux_params.local.append(g2_avg)
        self._aux_params.local.append(scalars_memory)

        yf_in_out_args = [
            param, moment, lr_avg, mu_avg, curv_win, g_avg, g2_avg,
            scalars_memory
        ]

        net.YellowFin(yf_in_out_args + [grad, iteration],
                      yf_in_out_args,
                      beta=self.beta,
                      epsilon=self.epsilon,
                      curv_win_width=self.curv_win_width,
                      zero_debias=self.zero_debias)
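In practice _run is not called directly; a hedged usage sketch via the optimizer module's builder (assuming build_yellowfin with these arguments, as in the Caffe2 optimizer module), where "data" and "label" blobs would be fed before running:

# Hedged usage sketch: attach YellowFin to all model parameters via the
# optimizer module, which invokes _run above once per parameter.
# Assumes optimizer.build_yellowfin exists with this signature; "data"
# and "label" blobs must be fed before running the net.
from caffe2.python import brew, model_helper, optimizer

model = model_helper.ModelHelper(name="yf_sketch")
fc = brew.fc(model, "data", "fc", dim_in=16, dim_out=4)
softmax, loss = model.net.SoftmaxWithLoss([fc, "label"],
                                          ["softmax", "loss"])
model.AddGradientOperators([loss])
optimizer.build_yellowfin(model, base_learning_rate=0.1)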
コード例 #30
0
    sys.path.append("..")
    # data generation
    from data_generator.wnd_data_caffe2 import Wide_and_DeepDataGenerator

    from utils.utils import cli

    args = cli()

    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)

    use_gpu = args.use_gpu
    if use_gpu:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
        ngpus = C.num_cuda_devices  # 1
        print("Using {} GPU(s)...".format(ngpus))
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("Using CPU...")

    ### prepare training data ###
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")

    # TODO: Make this Wide_and_Deep generator
    dc = Wide_and_DeepDataGenerator(args)
    if args.data_generation == "dataset":
        print("Error we have disabled this function currently....")
        sys.exit()
        # input and target data