コード例 #1
0
 def inc_current_step(self):
     """Advance the stored global step by one.

     Reads the current step through ``get_current_step`` and feeds the
     incremented value, wrapped in a one-element array, into the
     ``global_step`` blob.
     """
     next_step = self.get_current_step() + 1
     workspace.FeedBlob(self.global_step, np.array([next_step]))
コード例 #2
0
 def feed(b, v):
     """Feed value ``v`` into the blob named ``str(b)``.

     When the enclosing ``ws`` is set, the blob is created in that
     workspace object and fed there; otherwise the value goes to the
     global workspace.
     """
     blob_name = str(b)
     if ws is not None:
         ws.create_blob(blob_name)
         ws.blobs[blob_name].feed(v)
         return
     workspace.FeedBlob(blob_name, v)
コード例 #3
0
    def preprocess_samples(self, samples: Samples,
                           minibatch_size: int) -> List[TrainingDataPage]:
        """Normalize ``samples`` and cut them into full-size minibatches.

        The samples are shuffled in place, their sparse state/action
        dictionaries are turned into dense matrices by running a
        preprocessing net once, and the resulting arrays are sliced into
        ``TrainingDataPage`` objects of ``minibatch_size`` rows each.

        Args:
            samples: The samples to preprocess; shuffled in place.
            minibatch_size: Number of rows per returned page.

        Returns:
            One ``TrainingDataPage`` per full minibatch. A trailing partial
            minibatch (fewer than ``minibatch_size`` rows) is dropped.
        """
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "action_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_actions,
                                                     "next_action")
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "next_action_norm",
            False,
            False,
        )
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)

        # Flatten the per-sample lists of possible next actions into one list
        # and remember how many actions belong to each sample.
        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     "possible_next_actions")

        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        pna_lens_blob = "pna_lens_blob"
        workspace.FeedBlob(pna_lens_blob, pnas_lengths)

        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "possible_next_action_norm",
            False,
            False,
        )

        state_pnas_blob = preprocessor.concat_states_and_possible_actions(
            next_state_matrix, possible_next_actions_matrix, pna_lens_blob)

        # All operators above were only added to ``net``; run them now.
        workspace.RunNetOnce(net)

        states_ndarray = workspace.FetchBlob(state_matrix)
        actions_ndarray = workspace.FetchBlob(action_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        possible_next_actions_ndarray = workspace.FetchBlob(
            possible_next_actions_matrix)
        next_state_pnas_concat = workspace.FetchBlob(state_pnas_blob)
        time_diffs = np.ones(len(states_ndarray))
        episode_values = None
        if samples.reward_timelines is not None:
            # Discounted sum of each sample's reward timeline.
            episode_values = np.zeros(rewards.shape, dtype=np.float32)
            for i, reward_timeline in enumerate(samples.reward_timelines):
                for time_diff, reward in reward_timeline.items():
                    episode_values[i, 0] += reward * (DISCOUNT**time_diff)

        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                # Drop the trailing partial minibatch.
                break
            # Rows of the flattened possible-next-action arrays that belong
            # to samples [start, end).
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            # Slice the state/action concat with the same window so each page
            # only carries its own rows (previously the full matrix was handed
            # to every page).
            # NOTE(review): assumes the concat has one row per possible next
            # action, like ``possible_next_actions_ndarray`` - confirm.
            pnas_concat = next_state_pnas_concat[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    # A sample is terminal when it has no possible next action.
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    episode_values=episode_values[start:end]
                    if episode_values is not None else None,
                    time_diffs=time_diffs[start:end],
                    possible_next_actions_lengths=pnas_lengths[start:end],
                    next_state_pnas_concat=pnas_concat,
                ))
        return tdps
コード例 #4
0
    def testShapeInferenceSoftmaxWithLoss(self):
        """Check SoftmaxWithLoss shape inference for several input layouts.

        Covers column-shaped labels, flat labels, an additional weight
        tensor, and the spatial variant. Every case is verified through
        ``InferTensorRunAndCompare``.
        """
        batch, classes = 4, 3

        def feed_logits():
            # Logits are 2D: [batch_size, num_classes].
            values = np.random.rand(batch, classes)
            workspace.FeedBlob("logits", values.astype(np.float32))

        def feed_labels(shape):
            # Integer labels with all values in [0, num_classes).
            values = np.random.randint(low=0, high=classes, size=shape)
            workspace.FeedBlob("labels", values.astype(np.int32))

        net_helper = cnn.CNNModelHelper()
        net_helper.SoftmaxWithLoss(["logits", "labels"], ["softmax", "loss"])

        # Case 1: labels fed as a (batch_size, 1) column.
        feed_logits()
        feed_labels((batch, 1))
        self.InferTensorRunAndCompare(net_helper)

        # Case 2: labels fed as a 1D vector of length batch_size.
        feed_logits()
        feed_labels(batch)
        self.InferTensorRunAndCompare(net_helper)

        # Case 3: a second op on the same helper that also takes a weight
        # tensor input.
        net_helper.SoftmaxWithLoss(
            ["logits", "labels", "weight_tensor"],
            ["softmax", "loss"],
        )
        feed_logits()
        feed_labels(batch)
        weights = np.random.rand(batch)
        workspace.FeedBlob("weight_tensor", weights.astype(np.float32))
        self.InferTensorRunAndCompare(net_helper)

        # Case 4: spatial mode on a fresh helper with 4D image input.
        spatial_helper = cnn.CNNModelHelper()
        image = np.random.rand(32, 19, 33, 28).astype(np.float32)
        workspace.FeedBlob("img", image)
        image_labels = (np.random.rand(32, 33, 28) * 19).astype(np.int32)
        workspace.FeedBlob("img_labels", image_labels)
        spatial_helper.SoftmaxWithLoss(
            ["img", "img_labels"],
            ["softmax_img", "loss"],
            spatial=1,
        )
        self.InferTensorRunAndCompare(spatial_helper)
コード例 #5
0
    def test_bn(self, seed, size, input_channels, batch_size):
        """Compare the onnxified SpatialBN against the fake-fp16 reference op.

        Builds a one-operator SpatialBN net, lowers it with
        ``onnxifi_caffe2_net``, runs both it and a reference net on the same
        random inputs, and fails if the fp16-rounded outputs differ.

        Args:
            seed: Seed for ``np.random`` so a failing case can be reproduced.
            size: Height and width of the (square) input.
            input_channels: Number of channels of the input.
            batch_size: Number of examples in the input batch.

        Raises:
            AssertionError: If the lowered net does not contain exactly one
                Onnxifi op, or if the two outputs do not match.
        """
        workspace.ResetWorkspace()
        np.random.seed(seed)

        order = "NCHW"
        epsilon = 1e-3
        input_names = ["X", "scale", "bias", "mean", "var"]

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(input_names)
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator("SpatialBN",
                                input_names, ["Y"],
                                order=order,
                                is_test=True,
                                epsilon=epsilon))

        if GLOW_LOWERED_BATCHNORM:
            refopname = "SpatialBNFakeLoweredFp16NNPI"
        else:
            refopname = "SpatialBNFakeFp16NNPI"

        pred_net_ref = caffe2_pb2.NetDef()
        # Give the reference net its own name so that it and the net under
        # test are separate entries when created and run by name below.
        pred_net_ref.name = "ref"
        pred_net_ref.external_input.extend(input_names)
        # The reference operator writes "Y", so that is the net's output
        # (this used to declare the input "X" as the output).
        pred_net_ref.external_output.append("Y")
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(refopname,
                                input_names, ["Y"],
                                order=order,
                                is_test=True,
                                epsilon=epsilon))

        scale = np.random.rand(input_channels).astype(np.float32) + 0.5
        bias = np.random.rand(input_channels).astype(np.float32) - 0.5
        mean = np.random.randn(input_channels).astype(np.float32)
        var = np.random.rand(input_channels).astype(np.float32) + 0.5
        X = np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) - 0.5

        workspace.FeedBlob("scale", scale)
        workspace.FeedBlob("bias", bias)
        workspace.FeedBlob("mean", mean)
        workspace.FeedBlob("var", var)

        # Use for reference to debug
        # Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)

        pred_net_onnxified = onnxifi_caffe2_net(pred_net, {
            "X": [batch_size, input_channels, size, size],
            "scale": [input_channels],
            "bias": [input_channels],
            "mean": [input_channels],
            "var": [input_channels]
        },
                                                debug=True,
                                                adjust_batch=False,
                                                use_onnx=False)
        # The whole net is expected to collapse into a single Onnxifi op.
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)

        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(pred_net_ref)

        # Both nets write blob "Y"; fetch after each run before it is
        # overwritten by the next one.
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
            diff = np.abs(Y_glow - Y_c2).astype(np.float16)
            print_test_debug_info(
                "bn", {
                    "seed": seed,
                    "scale": scale,
                    "bias": bias,
                    "mean": mean,
                    "var": var,
                    "Y_np": Y_c2,
                    "Y_glow": Y_glow,
                    "diff": diff,
                    "rowwise_diff": np.max(np.abs(diff), -1)
                })
            # Raise explicitly: a bare ``assert`` is stripped under ``-O``.
            raise AssertionError(
                "SpatialBN output mismatch between onnxified and reference net")
コード例 #6
0
    def test_sum_reduce(self, gc, dc):
        """Check SumReduceLike against NumPy sums for several broadcast cases.

        Args:
            gc: Device option supplied by the test harness (unused here).
            dc: Device options to cross-check the operator on.
        """
        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(4, 5).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=0)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # Set broadcast with axis=0, i.e. broadcasting first dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(2, 3).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=0)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=3)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res, decimal=3)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting intermediate dimensions
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(3, 4).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # fp64 input reduced to a single element; only run on the current
        # device, no cross-device check for this case.
        X = np.random.rand(2, 3, 4, 500).astype(np.float64)
        Y = np.random.rand(1).astype(np.float64)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.array(np.sum(X))
        np.testing.assert_array_almost_equal(out, res, decimal=0)

        # broadcasting with single elem dimensions at both ends
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(1, 3, 4, 1).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2).reshape(Y.shape)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # fp64 is not supported with the CUDA/HIP op
        # A device is CPU-only when it is neither CUDA nor HIP. The previous
        # ``!= CUDA or != HIP`` test was true for every device and therefore
        # filtered nothing.
        gpu_device_types = (caffe2_pb2.CUDA, caffe2_pb2.HIP)
        dc_cpu_only = [
            d for d in dc if d.device_type not in gpu_device_types
        ]
        # NOTE(review): at this point op/X/Y are still the float32 case
        # above, not the fp64 one - confirm which case was meant.
        self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
コード例 #7
0
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        """Verify that memonger's gradient-blob sharing preserves results.

        Builds a five-layer FC network with a softmax cross-entropy loss,
        applies ``memonger.share_grad_blobs`` with and without activation
        sharing, checks that the blob count goes down, and checks that the
        loss and the first-layer weight gradient are unchanged.

        Args:
            input_dim: Width of the input data.
            output_dim: Width of every FC layer output (and label range).
            batch_size: Number of rows in the generated batch.
        """
        m = cnn.CNNModelHelper()
        with core.NameScope("name_x"):
            # Stack fc1..fc5; only the first layer consumes input_dim.
            top = "data"
            in_width = input_dim
            for layer_idx in range(1, 6):
                top = m.FC(
                    top,
                    "fc{}".format(layer_idx),
                    dim_in=in_width,
                    dim_out=output_dim,
                )
                in_width = output_dim
            activated = top.Relu([], top)
            pred = activated.Softmax([], "pred")
            xent = pred.LabelCrossEntropy(["label"], ["xent"])
            xent.AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        def share_blobs(with_activations):
            # Run memonger on the same net, toggling activation sharing.
            return memonger.share_grad_blobs(
                m.net,
                ["name_x/loss"],
                set(m.param_to_grad.values()),
                "name_x/",
                share_activations=with_activations,
            )

        def fetch_loss_and_grad():
            # Read the loss and the fc1 weight gradient from the workspace.
            loss_value = workspace.FetchBlob("name_x/loss")
            grad_value = workspace.FetchBlob(
                str(input_to_grad["name_x/fc1_w"]))
            return loss_value, grad_value

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = share_blobs(False)
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        optim_proto_wacts = share_blobs(True)
        blobs_wact_optim = count_blobs(optim_proto_wacts)
        self.assertLessEqual(blobs_wact_optim, blobs_after)

        # The final activation must survive in both optimized protos.
        self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
        self.assertTrue(
            has_blob(optim_proto_wacts, "name_x/fc5"),
            "Dont remap final activation",
        )

        # Baseline: run the unoptimized net on random data.
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss, grad = fetch_loss_and_grad()

        # Gradient-only sharing first, then sharing with forward activations;
        # each must reproduce the baseline loss and gradient.
        for candidate in (optim_proto, optim_proto_wacts):
            workspace.RunNetOnce(candidate)
            optimized_loss, optimized_grad = fetch_loss_and_grad()
            np.testing.assert_almost_equal(loss, optimized_loss)
            np.testing.assert_almost_equal(grad, optimized_grad)
コード例 #8
0
    def run_model(self, V, gpu_devices, cpu_indices):
        """Build and train a small sparse-gather model on the given GPUs.

        A ``[V, 16]`` blob of vectors is gathered by index, flattened and
        passed through an FC + sigmoid with a squared-L2 loss. The model is
        parallelized with ``data_parallel_model.Parallelize_GPU`` and run
        for 10 iterations on generated indices and labels.

        Args:
            V: Number of rows of the vector blob.
            gpu_devices: GPU ids to parallelize over.
            cpu_indices: If true, the Gather runs on CPU and its result is
                copied to GPU; otherwise the vectors are copied to GPU and
                gathered there.

        Returns:
            A two-element list: the fetched vectors (``self.vecs`` when
            ``cpu_indices`` else ``"gpu_0/gpuvecs"``) and ``"gpu_0/fc_w"``.
        """

        def input_builder_fun(model):
            # Inputs are fed directly into the workspace in the loop below.
            return None

        def model_build_fun(model, loss_scale):
            # Forward pass; returns the scaled loss as a one-element list.
            if cpu_indices:
                # Gather on CPU, then move the gathered rows to the GPU.
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather(
                        [self.vecs, 'indices'], 'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                # Copy the vectors to the GPU at init time and register the
                # GPU copy as a parameter; gather on the GPU.
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs, "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1,
                          ("ConstantFill", {}), ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            # Parameter update: WeightedSum for dense gradients,
            # SparseMomentumSGDUpdate for GradientSlice gradients.
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    # Momentum buffer shaped like the parameter, zero-filled.
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [
                            param_grad.values, param_momentum, param
                        ],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        # CPU-side blobs: iteration counter, learning rate, the vectors and
        # a constant one.
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill(
                    [], "vecs", shape=[V, 16])
                if cpu_indices:
                    # With CPU gather the CPU blob itself is the parameter.
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [], "ONE_CPU", shape=[1], value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            # Apply the sparse gradients to the CPU vectors.
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([param, self.ONE_CPU,
                                                  param_grad.indices,
                                                  param_grad.values,
                                                  self.LR],
                                                  self.vecs)
        else:
            # Copy GPU 0's vectors back into the CPU blob.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            # Indices come from a permutation, so they are unique within
            # an iteration.
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16
            )
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            # Give each device its contiguous share of the batch.
            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                # Indices live on the device that performs the Gather.
                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                # First iteration only: initialize parameters, overwrite the
                # vectors with known values and create the net.
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(
                    self.vecs,
                    orig_vecs
                )
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                if not cpu_indices:
                    # Check that the indices fed to GPU 0 are unique.
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(
            np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
        return [workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
                workspace.FetchBlob("gpu_0/fc_w")]
コード例 #9
0
    def run_model(self, V, gpu_devices):
        """Train a model whose three Gather ops share one ``indices`` blob.

        Three CPU blobs are copied to each GPU and gathered with the same
        indices; the gathered results are used as the inputs of an FC op
        followed by SoftmaxWithLoss. The model is parallelized with
        ``data_parallel_model.Parallelize_GPU`` and run for 10 iterations.
        After every iteration the gradients of the first two gathered blobs
        are checked to have as many rows as there are indices.

        Args:
            V: Number of rows of the first two vector blobs.
            gpu_devices: GPU ids to parallelize over.
        """

        def input_builder_fun(model):
            # Inputs are fed directly into the workspace in the loop below.
            return None

        def model_build_fun(model, loss_scale):
            # Forward pass; returns the scaled loss as a one-element list.
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec, 'gpuvec_{}'.format(num),
                )
                # The third blob (index 2) is not registered as a parameter.
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            # Every blob is gathered with the same 'indices' input.
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'],
                    ['gpu_vec_gathered_{}'.format(num)]
                )
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            # FC inputs in order: gathered blob 2, then blob 0, then blob 1.
            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            # Parameter update: WeightedSum for dense gradients,
            # ScatterWeightedSum for GradientSlice gradients.
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    # NOTE(review): the sparse branch weights the gradient
                    # with ONE rather than LR - confirm this is intended.
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        # CPU-side blobs: iteration counter, learning rate, the three
        # vector blobs and a constant one.
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                # NOTE(review): the shapes listed in the string below do not
                # match the code: vec_0 and vec_1 are both created as
                # [V, 16], and the arrays fed on the first iteration are
                # (V, 16), (V,) and (V, 16) - confirm which is intended.
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill(
                        [], "vec_{}".format(num), shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [],
                        "vec_2", shape=[batch_per_device, 16]
                    )
                )
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [], "ONE_CPU", shape=[1], value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        # Only the first two blobs are copied back from GPU 0 to the CPU.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            # Re-seeding every iteration makes each iteration draw the same
            # permutation.
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size
            )
            full_labels = full_indices[:] % batch_per_device

            # Give each device its contiguous share of the batch.
            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                # First iteration only: initialize parameters, overwrite the
                # vector blobs with known values and create the net.
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(
                        vec,
                        orig_vec
                    )
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            # Compare the number of indices on GPU 0 with the number of rows
            # of the gathered-blob gradients (blobs 0 and 1) on every device.
            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob(
                    'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num))
                for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.format(
                        len(idx), len(grad_slice)
                    )
                )
コード例 #10
0
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv.

        Builds a data-parallel LSTM model over `devices`, feeds ten batches
        of seeded random data and returns the "partest/i2h_w" blob fetched
        from device 0.

        devices: device ids passed to data_parallel_model.Parallelize; the
            batch of 64 samples is split evenly between them.
        gpu: when falsy, the model is parallelized with cpu_device=True.
        '''
        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            name="recurrent_test{}".format(devices),
        )

        # Problem sizes. The builder callbacks below read them from self
        # when Parallelize invokes them.
        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(devices)

        def input_builder_fun(model):
            # Inputs are fed by hand in the training loop below.
            return None

        def model_build_fun(model, loss_scale):
            # Every sequence in the per-device batch has the full length T.
            seq_lengths = np.array(
                [self.T] * self.batch_per_device, dtype=np.int32)
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"), seq_lengths)

            # Zero-initialized hidden and cell states.
            for state_blob in ("hidden_init", "cell_init"):
                model.param_init_net.ConstantFill(
                    [],
                    state_blob,
                    value=0.0,
                    shape=[1, self.batch_per_device, self.hidden_dim],
                )

            # Only the first of the four LSTM outputs is used for the loss.
            output, _, _, _ = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            dist = model.Sub([output, "target"], "dist")
            loss = model.AveragedLoss(dist, "loss")
            return [model.Scale(loss, "loss_scaled", scale=loss_scale)]

        def param_update_fun(model):
            iteration = model.Iter("ITER")
            learning_rate = model.net.LearningRate(
                [iteration],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            one = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            # param <- 1 * param + LR * grad
            for param in model.GetParams():
                model.WeightedSum(
                    [param, one, model.param_to_grad[param], learning_rate],
                    param,
                )

            assert len(model.GetParams()) == len(model.params) // len(model._devices)

        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=devices,
            optimize_gradient_memory=True,
            cpu_device=not gpu,
        )

        # Turn every *Fill initializer into a ConstantFill so that
        # everything is deterministic
        fill_ops = [
            op for op in model.param_init_net.Proto().op
            if op.type.endswith('Fill')
        ]
        for op in fill_ops:
            op.type = 'ConstantFill'

        prefix = model._device_prefix

        # Each run has same input, independent of number of gpus
        np.random.seed(20150210)
        for step in range(10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(
                self.T, self.batch_size, self.hidden_dim
            )

            # Hand every device its contiguous slice along the batch axis.
            for slot, device in enumerate(devices):
                lo = slot * self.batch_per_device
                hi = lo + self.batch_per_device
                data = full_data[:, lo:hi, :].astype(np.float32)
                targets = full_target[:, lo:hi, :].astype(np.float32)
                device_option = core.DeviceOption(model._device_type, device)
                with core.DeviceScope(device_option):
                    workspace.FeedBlob(
                        "{}_{}/data".format(prefix, device), data
                    )
                    workspace.FeedBlob(
                        "{}_{}/target".format(prefix, device), targets
                    )

            # Initialize and create the net once, after the first feed.
            if step == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("{}_0/partest/i2h_w".format(prefix))
コード例 #11
0
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv.

        Builds a small FC -> Sigmoid -> squared-L2 model parallelized over
        `devices`, trains it for ten iterations on seeded random data and
        returns the "fc_w" blob fetched from device 0. On every iteration
        it also checks that the "sync_num" blob fed on device 0 is visible
        on all devices.

        devices: device ids passed to data_parallel_model.Parallelize.
        gpu: when falsy, cpu_device, shared_model and combine_spatial_bn
            are all enabled.
        '''
        def input_builder_fun(model):
            # Inputs are fed by hand in the training loop below.
            return None

        def model_build_fun(model, loss_scale):
            weight_init = ("ConstantFill", {})
            bias_init = ("ConstantFill", {})
            fc = model.FC("data", "fc", 16, 1, weight_init, bias_init)
            flat = model.FlattenToVec(fc, "fc_fl")
            squashed = model.Sigmoid(flat, "sigm")
            distance = model.SquaredL2Distance([squashed, "label"], "sq")
            loss = model.Scale(
                model.AveragedLoss(distance, "loss"), scale=loss_scale)

            # For testing explicit sync
            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
            return [loss]

        def add_optimizer(model):
            sgd = optimizer.build_sgd(
                model,
                0.1,
                policy="fixed",
                max_gradient_norm=5.0,
                allow_lr_injection=True,
            )
            return sgd

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        on_cpu = not gpu
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=on_cpu,
            shared_model=on_cpu,
            combine_spatial_bn=on_cpu,
        )
        data_parallel_model.AddBlobSync(model, ["sync_num"])

        # Light test for LR names
        self.assertGreater(
            len(data_parallel_model.GetLearningRateBlobNames(model)), 0)

        np.random.seed(2603)
        prefix = model._device_prefix

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for step in range(10):
            full_data = np.random.rand(batch_size, 16)
            # The label is the first feature rounded to 0 or 1.
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for slot, device in enumerate(devices):
                lo = slot * batch_per_device
                hi = lo + batch_per_device
                data = full_data[lo:hi, :].astype(np.float32)
                labels = full_labels[lo:hi].astype(np.float32)
                device_option = core.DeviceOption(model._device_type, device)
                with core.DeviceScope(device_option):
                    workspace.FeedBlob(
                        "{}_{}/data".format(prefix, device), data
                    )
                    workspace.FeedBlob(
                        "{}_{}/label".format(prefix, device), labels
                    )

            # Initialize and create the net once, after the first feed.
            if step == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            expected_sync = step * 2
            workspace.FeedBlob(
                prefix + "_0/sync_num",
                np.array([expected_sync]).astype(np.float32),
                device_option=core.DeviceOption(model._device_type, 0))
            workspace.RunNet(model.net.Proto().name)

            # Test AddBlobSync
            for device in model._devices:
                blob_name = prefix + "_{}/sync_num".format(device)
                sync = workspace.FetchBlob(blob_name)[0]
                self.assertTrue(abs(sync - expected_sync) < 0.01)

        return workspace.FetchBlob("{}_0/fc_w".format(prefix))
コード例 #12
0
    def test_prepare_normalization_and_normalize(self):
        """Identify normalization parameters, normalize, and check the output.

        Normalization parameters are identified for every feature returned
        by ``preprocessing_util.read_data()``. A dense normalization net is
        then built and run on the raw feature matrix, and each normalized
        feature is validated according to its detected feature type.
        """
        features, feature_value_map = preprocessing_util.read_data()

        normalization_parameters = {
            name: normalization.identify_parameter(values, 10)
            for name, values in feature_value_map.items()
        }
        for k, v in normalization_parameters.items():
            if k == CONTINUOUS:
                self.assertEqual(v.feature_type, CONTINUOUS)
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif k == BOXCOX:
                self.assertEqual(v.feature_type, BOXCOX)
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                # Any other feature must be named after its detected type,
                # optionally carrying a "_2" suffix. Both alternatives are
                # real comparisons so the check can actually fail.
                self.assertTrue(
                    v.feature_type == k or v.feature_type + "_2" == k,
                    'feature {} was identified as {}'.format(
                        k, v.feature_type))

        norm_net = core.Net("net")
        C2.set_net(norm_net)
        preprocessor = PreprocessorNet(norm_net, False)
        input_matrix = np.zeros([10000, len(features)], dtype=np.float32)
        for i, feature in enumerate(features):
            input_matrix[:, i] = feature_value_map[feature]
        input_matrix_blob = 'input_matrix_blob'
        # Feed an empty placeholder so the blob exists while the net is
        # being built; the real matrix is fed right before the run.
        workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
        output_blob, _ = preprocessor.normalize_dense_matrix(
            input_matrix_blob, features, normalization_parameters, '')
        workspace.FeedBlob(input_matrix_blob, input_matrix)
        workspace.RunNetOnce(norm_net)
        normalized_feature_matrix = workspace.FetchBlob(output_blob)

        # Split the output matrix back into per-feature column groups:
        # ENUM features take one column per possible value, all other
        # features take a single column.
        normalized_features = {}
        on_column = 0
        for feature in features:
            norm = normalization_parameters[feature]
            if norm.feature_type == ENUM:
                column_size = len(norm.possible_values)
            else:
                column_size = 1
            normalized_features[feature] = \
                normalized_feature_matrix[:, on_column:(
                    on_column + column_size
                )]
            on_column += column_size

        self.assertTrue(
            all(
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ))
        for k, v in six.iteritems(normalized_features):
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                # The sigmoid of the normalized value must lie in (0, 1).
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {
                    possible_value: i
                    for i, possible_value in enumerate(possible_values)
                }

                # The column holding the 1 must be the index of the
                # original value among the possible values.
                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    self.assertEqual(possible_value_map[original_feature],
                                     np.where(row == 1)[0][0])
            elif feature_type == identify_types.QUANTILE:
                # NOTE(review): this iterates v[0], i.e. only the first row
                # of the normalized block - confirm whether checking every
                # row (v[:, 0]) was intended.
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    expected = self._value_to_quantile(
                        original_feature,
                        normalization_parameters[k].quantiles)
                    self.assertAlmostEqual(feature, expected, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif feature_type == identify_types.CONTINUOUS or \
                    feature_type == identify_types.BOXCOX:
                # Normalized values must have zero mean and a sample
                # standard deviation of either one or zero.
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    'mean of feature {} is {}, not 0'.format(k, np.mean(v)))
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            else:
                raise NotImplementedError()
コード例 #13
0
    def CheckSimple(self,
                    op,
                    inputs,
                    input_to_check,
                    outputs_with_grads,
                    grad_ops=None):
        """Checks the operator in a very simple fashion by stacking a sum of
        squares on the top.

        Every element of the checked input is perturbed by +/- self._stepsize
        and the central difference of the loss is compared with the gradient
        returned by self.GetLossAndGrad. The perturbation is applied in place
        to inputs[input_to_check] and undone after each element.

        Inputs:
          op: the operator to be checked. Its device_option is overwritten
              with self._device_option.
          inputs: the input data in numpy arrays.
          input_to_check: an index specifying which input blob we should
              check.
          outputs_with_grads: indices specifying which output blobs will we
              need to check gradients with. For these outputs, we will
              collect a squared sum and also feed in their gradients.
          grad_ops: the gradient operator(s). If not given, they are obtained
              from core.GradientRegistry.GetGradientDefs(op).
        Outputs:
          (ret, grad, grad_estimate): ret is True if every element satisfies
              |grad - grad_estimate| <= max(|grad|, |grad_estimate|, 1) *
              self._threshold and False otherwise; grad is the gradient
              returned by GetLossAndGrad and grad_estimate is the numerical
              estimate.
        """
        # Entering the checker workspace
        old_ws_name = workspace.CurrentWorkspace()
        switched = self._workspace_name != old_ws_name
        if switched:
            workspace.SwitchWorkspace(self._workspace_name, True)

        try:
            op.device_option.CopyFrom(self._device_option)
            if grad_ops is None:
                grad_ops = core.GradientRegistry.GetGradientDefs(op)

            dims_to_check = inputs[input_to_check].size
            # First, feed in the input.
            for i, arr in enumerate(inputs):
                workspace.FeedBlob(op.input[i], arr, self._device_option)

            # Get the loss and gradient for the original.
            input_name = op.input[input_to_check]
            loss, grad = self.GetLossAndGrad(op, grad_ops,
                                             inputs[input_to_check],
                                             input_name, outputs_with_grads)
            grad_estimate = np.zeros_like(inputs[input_to_check])
            for current_dim in range(dims_to_check):
                # Positive gradient
                inputs[input_to_check].flat[current_dim] += self._stepsize
                pos_loss, _ = self.GetLossAndGrad(op, grad_ops,
                                                  inputs[input_to_check],
                                                  input_name,
                                                  outputs_with_grads)
                # Negative gradient
                inputs[input_to_check].flat[current_dim] -= self._stepsize * 2
                neg_loss, _ = self.GetLossAndGrad(op, grad_ops,
                                                  inputs[input_to_check],
                                                  input_name,
                                                  outputs_with_grads)
                # Recover the value
                inputs[input_to_check].flat[current_dim] += self._stepsize
                grad_estimate.flat[current_dim] = (
                    pos_loss - neg_loss) / self._stepsize / 2
            # Now, check correctness. The tolerance is relative for large
            # gradients and absolute (scale 1) for small ones.
            scale = np.maximum(
                np.maximum(np.abs(grad), np.abs(grad_estimate)), 1)
            fail_mat = (np.abs(grad - grad_estimate) > scale * self._threshold)
            ret = not np.any(fail_mat)
        finally:
            # After finishing (or failing), cleaning up things.
            if switched:
                # We reset the workspace to make sure everything intermediate
                # is cleaned up. Note that there is no need to delete a
                # workspace - when empty it takes a very limited amount of
                # memory.
                workspace.ResetWorkspace()
                workspace.SwitchWorkspace(old_ws_name)
        return ret, grad, grad_estimate
コード例 #14
0
    def assertReferenceChecks(
        self,
        device_option,
        op,
        inputs,
        reference,
        input_device_options=None,
        threshold=1e-4,
        output_to_grad=None,
        grad_reference=None,
        atol=None,
        outputs_to_check=None,
    ):
        """
        Run `op` and compare its outputs with a reference implementation.

        The reference Python function is called as `reference(*inputs)` and
        must return a tuple/list of arrays. Each checked operator output is
        compared with the matching reference output: exactly for string and
        object arrays, otherwise with `rtol=threshold` and `atol` (which
        defaults to `threshold`). When `grad_reference` is given,
        `output_to_grad` must be given too and the gradient is checked as
        well. The fetched outputs are returned as a list.

        Useful for checking the implementation matches the Python
        (typically NumPy) implementation of the same functionality.

        Usage example:

            @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs)
            def test_softsign(self, X, inplace, gc, dc):
                op = core.CreateOperator(
                    "Softsign", ["X"], ["X" if inplace else "Y"])

                def softsign(X):
                    return (X / (1 + np.abs(X)),)

                self.assertReferenceChecks(gc, op, [X], softsign)
        """
        # Work on a private copy so the caller's operator is not modified.
        op = copy.deepcopy(op)
        op.device_option.CopyFrom(device_option)

        with temp_workspace():
            if len(inputs) < len(op.input):
                raise ValueError(
                    'must supply an input for each input on the op: %s vs %s' %
                    (op.input, inputs))

            if input_device_options:
                blob_devices = input_device_options
            else:
                blob_devices = core.InferOpBlobDevicesAsDict(op)[0]
            for blob_name, value in zip(op.input, inputs):
                workspace.FeedBlob(
                    blob_name,
                    value,
                    device_option=blob_devices.get(blob_name, device_option))

            net = core.Net("opnet")
            net.Proto().op.extend([op])

            try:
                (shapes, types) = workspace.InferShapesAndTypes([net])
            except RuntimeError as e:
                # Temporarily catch runtime errors when inferring shape
                # and type info
                test_shape_inference = False
                logging.warning(str(e))
                if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1':
                    raise e
            else:
                test_shape_inference = True

            workspace.RunNetOnce(net)
            reference_outputs = reference(*inputs)
            if not isinstance(reference_outputs, (tuple, list)):
                raise RuntimeError(
                    "You are providing a wrong reference implementation. A "
                    "proper one should return a tuple/list of numpy arrays.")

            # Without an explicit selection every output is checked.
            if not outputs_to_check:
                self.assertEqual(len(reference_outputs), len(op.output))
                outputs_to_check = list(range(len(op.output)))

            if atol is None:
                atol = threshold

            outs = []
            for output_index, ref in zip(outputs_to_check,
                                         reference_outputs):
                output_blob_name = op.output[output_index]
                output = workspace.FetchBlob(output_blob_name)
                if output.dtype.kind not in ('S', 'O'):
                    np.testing.assert_allclose(
                        output,
                        ref,
                        atol=atol,
                        rtol=threshold,
                        err_msg=(
                            'Output {0} is not matching the reference'.format(
                                output_blob_name, )),
                    )
                else:
                    # String/object arrays have to match exactly.
                    np.testing.assert_array_equal(output, ref)
                if test_shape_inference:
                    self._assertInferTensorChecks(output_blob_name, shapes,
                                                  types, output)
                outs.append(output)

            if grad_reference is not None:
                assert output_to_grad is not None, \
                    "If grad_reference is set," \
                    "output_to_grad has to be set as well"

                with core.DeviceScope(device_option):
                    self._assertGradReferenceChecks(op,
                                                    inputs,
                                                    reference_outputs,
                                                    output_to_grad,
                                                    grad_reference,
                                                    threshold=threshold)
            return outs
コード例 #15
0
def _prepare_rnn(t,
                 n,
                 dim_in,
                 create_rnn,
                 outputs_with_grads,
                 forget_bias,
                 memory_optim=False,
                 forward_only=False,
                 drop_states=False,
                 T=None,
                 two_d_initial_states=None,
                 dim_out=None,
                 num_states=2,
                 **kwargs):
    """Build an RNN through `create_rnn` and feed its states and lengths.

    `num_states` randomly initialized state blobs are created and fed for
    each entry of `dim_out` (which defaults to `[dim_in]`), the RNN is
    built inside the "test_name_scope" name scope, the parameter init net
    is run once, and random sequence lengths between 1 and `t` are fed.

    When `two_d_initial_states` is None it is chosen at random; it decides
    whether the initial states have shape (n, d) or (1, n, d).

    Returns:
        (outputs, net, inputs): whatever `create_rnn` returned, the model
        net, and the state blobs followed by the input blob.
    """
    dim_out = [dim_in] if dim_out is None else dim_out
    print("Dims: ", t, n, dim_in, dim_out)

    model = ModelHelper(name='external')

    if two_d_initial_states is None:
        two_d_initial_states = np.random.randint(2)

    def generate_input_state(n, d):
        state_shape = (n, d) if two_d_initial_states else (1, n, d)
        return np.random.randn(*state_shape).astype(np.float32)

    states = []
    for layer_id, d in enumerate(dim_out):
        for i in range(num_states):
            state_blob = model.net.AddExternalInput(
                "state_{}/layer_{}".format(i, layer_id))
            states.append(state_blob)
            initial_value = generate_input_state(n, d).astype(np.float32)
            workspace.FeedBlob(state_blob, initial_value)

    # Due to convoluted RNN scoping logic we make sure that things
    # work from a namescope
    with scope.NameScope("test_name_scope"):
        input_blob, seq_lengths = model.net.AddScopedExternalInputs(
            'input_blob', 'seq_lengths')

        outputs = create_rnn(
            model,
            input_blob,
            seq_lengths,
            states,
            dim_in=dim_in,
            dim_out=dim_out,
            scope="external/recurrent",
            outputs_with_grads=outputs_with_grads,
            memory_optimization=memory_optim,
            forget_bias=forget_bias,
            forward_only=forward_only,
            drop_states=drop_states,
            static_rnn_unroll_size=T,
            **kwargs
        )

    workspace.RunNetOnce(model.param_init_net)

    random_lengths = np.random.randint(1, t + 1, size=(n, ))
    workspace.FeedBlob(seq_lengths, random_lengths.astype(np.int32))
    return outputs, model.net, states + [input_blob]
コード例 #16
0
    def test_fused_ln_quantize(self, seed, batch_size, size, epsilon,
                               elementwise_affine):
        """Compare LayerNorm + Int8Quantize run through Onnxifi against the
        fused LayerNormInt8QuantizeFakeNNPI reference operator.

        The test fails, after dumping debug information, when the quantized
        data, the scale or the zero point of the two results differ.
        """
        np.random.seed(seed)

        # Start from a clean workspace
        workspace.ResetWorkspace()
        axis = 1

        dims = np.array([batch_size, size])
        X = np.random.uniform(size=dims).astype(np.float32) - 0.5
        param_shape = X.shape[axis:]
        gamma = np.random.randn(*param_shape).astype(np.float32)
        beta = np.random.randn(*param_shape).astype(np.float32)

        # Quantization parameters come from the expected layernorm output.
        Y = self._layernorm_transform(X)
        scale, zp = self._get_scale_zp(Y)

        # gamma/beta are only operator inputs in the affine case.
        if elementwise_affine:
            ln_inputs = ["X", "gamma", "beta"]
        else:
            ln_inputs = ["X"]

        # Net under test: LayerNorm followed by a separate Int8Quantize.
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", "gamma", "beta"])
        pred_net.external_output.extend(["Y_q"])
        layernorm_op = core.CreateOperator(
            "LayerNorm",
            list(ln_inputs),
            ["Y", "mean", "rstd"],
            axis=axis,
            epsilon=epsilon,
            elementwise_affine=elementwise_affine)
        pred_net.op.add().CopyFrom(layernorm_op)
        quantize_op = core.CreateOperator(
            "Int8Quantize", ["Y"], ["Y_q"],
            Y_scale=scale,
            Y_zero_point=zp)
        pred_net.op.add().CopyFrom(quantize_op)

        print(pred_net)

        # Reference net: a single fused operator.
        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred_ref"
        pred_net_ref.external_input.extend(["X", "gamma", "beta"])
        pred_net_ref.external_output.extend(["Y_q"])
        fused_op = core.CreateOperator(
            "LayerNormInt8QuantizeFakeNNPI",
            list(ln_inputs),
            ["Y_q", "mean", "rstd"],
            axis=axis,
            epsilon=epsilon,
            elementwise_affine=elementwise_affine,
            Y_scale=scale,
            Y_zero_point=zp)
        pred_net_ref.op.add().CopyFrom(fused_op)

        shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hits,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        # The whole net is expected to collapse into one Onnxifi op.
        onnxified_ops = [
            o for o in pred_net_onnxified.op if o.type == "Onnxifi"
        ]
        np.testing.assert_equal(len(onnxified_ops), 1)

        for blob_name, blob_value in (("X", X), ("gamma", gamma),
                                      ("beta", beta)):
            workspace.FeedBlob(blob_name, blob_value)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)

        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchInt8Blob("Y_q")

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchInt8Blob("Y_q")

        outputs_match = (
            np.allclose(Y_glow.data, Y_c2.data)
            and Y_glow.scale == Y_c2.scale
            and Y_glow.zero_point == Y_c2.zero_point
        )
        if outputs_match:
            return

        diff_Y = np.abs(
            Y_glow.data.astype(np.float32) - Y_c2.data.astype(np.float32))
        debug_info = {
            "seed": seed,
            "size": size,
            "batch_size": batch_size,
            "epsilon": epsilon,
            "gamma": gamma,
            "beta": beta,
            "elementwise_affine": elementwise_affine,
            "X": X,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff_Y": diff_Y,
        }
        print_test_debug_info("layernorm", debug_info)
        assert (0)
コード例 #17
0
 def init_fun(worker_coordinator, global_coordinator):
     """Feed the string 'initialized' into the 'data' blob.

     Neither coordinator argument is used.
     """
     blob_name, blob_value = 'data', 'initialized'
     workspace.FeedBlob(blob_name, blob_value)
コード例 #18
0
    def test_layernorm(self, seed, batch_size, size, epsilon,
                       elementwise_affine):
        """Compare LayerNorm run through Onnxifi against the
        LayerNormFakeFP16NNPI reference operator.

        The test fails, after dumping debug information, when the two "Y"
        outputs are not close.
        """
        np.random.seed(seed)
        # Start from a clean workspace
        workspace.ResetWorkspace()
        axis = 1

        dims = np.array([batch_size, size])
        X = np.random.uniform(size=dims).astype(np.float32) - 0.5
        param_shape = X.shape[axis:]
        gamma = np.random.randn(*param_shape).astype(np.float32)
        beta = np.random.randn(*param_shape).astype(np.float32)

        # gamma/beta are only operator inputs in the affine case.
        if elementwise_affine:
            ln_inputs = ["X", "gamma", "beta"]
        else:
            ln_inputs = ["X"]

        # Net under test.
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", "gamma", "beta"])
        pred_net.external_output.extend(["Y", "mean", "rstd"])
        layernorm_op = core.CreateOperator(
            "LayerNorm",
            list(ln_inputs),
            ["Y", "mean", "rstd"],
            axis=axis,
            epsilon=epsilon,
            elementwise_affine=elementwise_affine)
        pred_net.op.add().CopyFrom(layernorm_op)

        # Reference net.
        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred_ref"
        pred_net_ref.external_input.extend(["X", "gamma", "beta"])
        pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
        reference_op = core.CreateOperator(
            "LayerNormFakeFP16NNPI",
            list(ln_inputs),
            ["Y", "mean", "rstd"],
            axis=axis,
            epsilon=epsilon,
            elementwise_affine=elementwise_affine)
        pred_net_ref.op.add().CopyFrom(reference_op)

        shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hits,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        # The whole net is expected to collapse into one Onnxifi op.
        onnxified_ops = [
            o for o in pred_net_onnxified.op if o.type == "Onnxifi"
        ]
        np.testing.assert_equal(len(onnxified_ops), 1)

        for blob_name, blob_value in (("X", X), ("gamma", gamma),
                                      ("beta", beta)):
            workspace.FeedBlob(blob_name, blob_value)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)

        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        # Re-feed X with an extra leading axis of size 1 before running
        # the onnxified net.
        dims1 = np.array(([1, *dims]))
        X_glow = X.reshape(dims1)
        workspace.FeedBlob("X", X_glow)

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if np.allclose(Y_glow, Y_c2):
            return

        diff_Y = np.abs(Y_glow - Y_c2)
        debug_info = {
            "seed": seed,
            "size": size,
            "batch_size": batch_size,
            "epsilon": epsilon,
            "gamma": gamma,
            "beta": beta,
            "elementwise_affine": elementwise_affine,
            "X": X,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff_Y": diff_Y,
        }
        print_test_debug_info("layernorm", debug_info)
        assert (0)
コード例 #19
0
# Attach the SGD training operators (step learning-rate policy with
# momentum) to the training model.
optimizer.build_sgd(
    train_model,
    base_learning_rate=0.01,
    policy="step",
    stepsize=15000,
    gamma=0.1,
    momentum=0.9,
)

# Progress messages use print(...) with a single string argument, which
# prints the same text on Python 2 and Python 3.
print("Added training operators")

##################################################################################
#### Run the training procedure

# Initialization.
print("Initializing dataset")
train_dataset = jdh.Jester_Dataset(dictionary_file=train_dictionary, seq_size=10)
print("finished initializing dataset")

# Prime the workspace with some data so we can run init net once.
# Only the first batch is needed, hence the immediate break.
for image, label in train_dataset.read(batch_size=1):
    workspace.FeedBlob("data", image)
    workspace.FeedBlob("label", label)
    break
print("Running param init net once")
# run the param init network once
workspace.RunNetOnce(train_model.param_init_net)
# create the network
workspace.CreateNet(train_model.net, overwrite=True)


# Set the total number of iterations and track the accuracy and loss
accuracy = []
loss = []
cnt = 0
print("Beginning training")
コード例 #20
0
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
import glob
import json
import numpy as np

# Every serialized example script in the current working directory.
example_files = glob.glob('example_*.c2s')

for example_path in example_files:
    print('Running example file', example_path)
    with open(example_path, 'r') as example:
        # The first three lines are a header: a JSON list of
        # (name, shape, dtype) input specs, the name of the net to run,
        # and a JSON list of output blob names.
        inits = json.loads(example.readline())
        net_name = example.readline().strip()
        outputs = json.loads(example.readline())

        # The rest of the file is the source handed to the compilation unit.
        CU = core.C.CompilationUnit()
        CU.define(example.read())

    # Initialize workspace with required inputs
    for blob_name, blob_shape, blob_dtype in inits:
        random_values = np.random.rand(*blob_shape)
        workspace.FeedBlob(
            blob_name, random_values.astype(np.dtype(blob_dtype)))

    net = CU.create_net(net_name)
    net.run()

    print('Success! Interesting outputs:')
    for output in outputs:
        fetched = workspace.FetchBlob(output)
        print(output, fetched)
コード例 #21
0
 def testShapeInferenceTwoClass(self):
     """Check inferred tensor info for a MakeTwoClass op from "v" to "v2"
     via self.InferTensorRunAndCompare."""
     model = cnn.CNNModelHelper(name="twoclass")
     model.MakeTwoClass("v", "v2")
     # 32 random float32 values as the input blob.
     v_values = np.random.rand(32).astype(np.float32)
     workspace.FeedBlob("v", v_values)
     self.InferTensorRunAndCompare(model)
コード例 #22
0
def initialize_gpu_from_weights_file(model, weights_file, gpu_id=0):
    """Initialize a network with ops on a specific GPU.

    If you use CUDA_VISIBLE_DEVICES to target specific GPUs, Caffe2 will
    automatically map logical GPU ids (starting from 0) to the physical GPUs
    specified in CUDA_VISIBLE_DEVICES.

    Every blob in ``model.params`` that has a counterpart in the weights file
    is fed into the workspace inside the named CUDA scope of ``gpu_id``,
    together with its ``<name>_momentum`` blob when the file contains one.
    Blobs present in the file but not used by the model are fed under the
    ``__preserve__/`` prefix inside a CPU scope.
    """
    logger.info('Loading weights from: {}'.format(weights_file))
    ws_blobs = workspace.Blobs()
    src_blobs = load_object(weights_file)

    if 'cfg' in src_blobs:
        configure_bbox_reg_weights(model, load_cfg(src_blobs['cfg']))
    if 'blobs' in src_blobs:
        # Backwards compat: the file used to be the blob dictionary itself;
        # now the blobs are stored under the 'blobs' key.
        src_blobs = src_blobs['blobs']

    # Ordered so that the log output follows the model's parameter order.
    unscoped_param_names = OrderedDict(
        (c2_utils.UnscopeName(str(blob)), True) for blob in model.params)

    # Initialize weights on GPU gpu_id only.
    with c2_utils.NamedCudaScope(gpu_id):
        for param_name in unscoped_param_names:
            # Sharing initialization from a pretrained model: a parameter
            # named '_[xyz]_foo' that is missing from the weights file is
            # initialized from the source blob 'foo'.
            marker = param_name.find(']_')
            if marker >= 0 and param_name not in src_blobs:
                src_name = param_name[marker + 2:]
            else:
                src_name = param_name

            if src_name not in src_blobs:
                logger.info('{:s} not found'.format(src_name))
                continue

            dst_name = core.ScopedName(param_name)
            src_blob = src_blobs[src_name]
            momentum_name = src_name + '_momentum'
            has_momentum = momentum_name in src_blobs
            if has_momentum:
                has_momentum_str = ' [+ momentum]'
            else:
                has_momentum_str = ''
            logger.info(
                '{:s}{:} loaded from weights file into {:s}: {}'.format(
                    src_name, has_momentum_str, dst_name, src_blob.shape))

            if dst_name in ws_blobs:
                # A blob that already exists in the workspace must have the
                # same shape as the one being loaded.
                ws_blob = workspace.FetchBlob(dst_name)
                assert ws_blob.shape == src_blob.shape, \
                    ('Workspace blob {} with shape {} does not match '
                     'weights file shape {}').format(
                        src_name, ws_blob.shape, src_blob.shape)

            workspace.FeedBlob(
                dst_name, src_blob.astype(np.float32, copy=False))
            if has_momentum:
                momentum_blob = src_blobs[momentum_name]
                workspace.FeedBlob(
                    dst_name + '_momentum',
                    momentum_blob.astype(np.float32, copy=False))

    # We preserve blobs that are in the weights file but not used by the
    # current model. They are loaded into CPU memory under the
    # '__preserve__/' namescope and will be stored when saving a model to a
    # weights file. This allows alternating optimization of Faster R-CNN, in
    # which blobs unused by one step are carried forward to initialize
    # another step.
    for src_name in src_blobs.keys():
        if src_name in unscoped_param_names:
            continue
        if src_name.endswith('_momentum'):
            continue
        if src_blobs[src_name] is None:
            continue
        with c2_utils.CpuScope():
            preserved_name = '__preserve__/{:s}'.format(src_name)
            workspace.FeedBlob(preserved_name, src_blobs[src_name])
            logger.info(
                '{:s} preserved in workspace (unused)'.format(src_name))
コード例 #23
0
    def testShapeInferenceReshape(self):
        """Build a Reshape model and pass it to InferTensorRunAndCompare."""
        reshape_model = cnn.CNNModelHelper()
        target_shape = [8, 0, -1, 2]
        reshape_model.Reshape(
            "X", ["Reshaped", "Old_Shape"], shape=target_shape)

        # Input blob "X" is a random float32 tensor of shape (4, 26, 32).
        input_values = np.random.rand(4, 26, 32).astype(np.float32)
        workspace.FeedBlob("X", input_values)

        self.InferTensorRunAndCompare(reshape_model)
コード例 #24
0
    def _test_index_ops(self, entries, dtype, index_create_op):
        """Exercise the Index* operators end to end for one key type.

        Args:
            entries: indexable collection of at least eight distinct keys.
            dtype: numpy dtype used to build the key arrays.
            index_create_op: name of the operator that creates the index
                blob (called with ``max_elements=10`` for the first index).

        The test loads three keys, queries known and new keys, freezes the
        index, checks its size, stores and reloads its contents into a
        second index, and finally round-trips the index through Save/Load.
        """
        workspace.RunOperatorOnce(
            core.CreateOperator(index_create_op, [], ['index'],
                                max_elements=10))
        my_entries = np.array([entries[0], entries[1], entries[2]],
                              dtype=dtype)

        workspace.FeedBlob('entries', my_entries)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexLoad', ['index', 'entries'], ['index']))

        # Known keys return their existing ids; the unseen keys entries[3]
        # and entries[4] are expected to receive the next ids, 4 and 5.
        query1 = np.array([entries[0], entries[3], entries[0], entries[4]],
                          dtype=dtype)
        workspace.FeedBlob('query1', query1)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexGet', ['index', 'query1'], ['result1']))
        result1 = workspace.FetchBlob('result1')
        np.testing.assert_array_equal([1, 4, 1, 5], result1)

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexFreeze', ['index'], ['index']))

        # After the freeze, keys that were never seen (entries[5], [6], [7])
        # are expected to map to 0 while known keys keep their ids.
        query2 = np.array(
            [entries[5], entries[4], entries[0], entries[6], entries[7]],
            dtype=dtype)
        workspace.FeedBlob('query2', query2)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexGet', ['index', 'query2'], ['result2']))
        result2 = workspace.FetchBlob('result2')
        np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexSize', ['index'], ['index_size']))
        size = workspace.FetchBlob('index_size')
        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in Python 3.12+.
        self.assertEqual(size, 6)

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexStore', ['index'], ['stored_entries']))
        stored_actual = workspace.FetchBlob('stored_entries')
        new_entries = np.array([entries[3], entries[4]], dtype=dtype)
        np.testing.assert_array_equal(
            np.concatenate((my_entries, new_entries)), stored_actual)

        # Load the stored keys into a second index, skipping the first one.
        workspace.RunOperatorOnce(
            core.CreateOperator(index_create_op, [], ['index2']))

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexLoad', ['index2', 'stored_entries'],
                                ['index2'],
                                skip_first_entry=1))

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexSize', ['index2'], ['index2_size']))
        index2_size = workspace.FetchBlob('index2_size')
        self.assertEqual(index2_size, 5)

        # test serde
        with tempfile.NamedTemporaryFile() as tmp:
            workspace.RunOperatorOnce(
                core.CreateOperator('Save', ['index'], [],
                                    absolute_path=1,
                                    db_type='minidb',
                                    db=tmp.name))
            # frees up the blob
            workspace.FeedBlob('index', np.array([]))
            # reloads the index
            workspace.RunOperatorOnce(
                core.CreateOperator('Load', [], ['index'],
                                    absolute_path=1,
                                    db_type='minidb',
                                    db=tmp.name))
            query3 = np.array(
                [entries[0], entries[3], entries[0], entries[4], entries[4]],
                dtype=dtype)

            workspace.FeedBlob('query3', query3)
            workspace.RunOperatorOnce(
                core.CreateOperator('IndexGet', ['index', 'query3'],
                                    ['result3']))
            result3 = workspace.FetchBlob('result3')
            np.testing.assert_array_equal([1, 4, 1, 5, 5], result3)
コード例 #25
0
    def test_int8_quantize(self, n, rand_seed):
        """Compare an Int8 quantize -> FC -> dequantize chain on two paths.

        The reference path runs the NNPI-suffixed operators directly. The
        same net, with its op types renamed to the plain Int8 operators, is
        then passed through ``onnxifi_caffe2_net`` and run again. Both
        outputs "Y" must be close; otherwise debug info is printed and the
        test fails.
        """
        print("n={}, rand_seed={}".format(n, rand_seed))
        np.random.seed(rand_seed)
        workspace.ResetWorkspace()

        # X goes through float16 first, so its values are fp16-representable.
        X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)
        W_fp32 = np.identity(n, dtype=np.float32)
        b_fp32 = np.zeros((n, ), dtype=np.float32)

        X_scale, X_zero_point = self._get_scale_zp(X_fp32)

        for blob_name, blob_value in (("X", X_fp32),
                                      ("W", W_fp32),
                                      ("b", b_fp32)):
            workspace.FeedBlob(blob_name, blob_value)

        pack_weight_op = core.CreateOperator(
            "Int8FCPackWeight",
            ["W"],
            ["W_int8"],
            engine="DNNLOWP",
            save_unpacked_weights=True,
            in_scale=X_scale,
        )
        workspace.RunOperatorOnce(pack_weight_op)

        # Quantize and FC share the same output quantization parameters.
        out_qparams = dict(Y_scale=X_scale, Y_zero_point=X_zero_point)
        ref_net = core.Net("net")
        ref_net.Int8QuantizeNNPI(["X"], ["X_int8"], **out_qparams)
        ref_net.Int8FCFakeAcc32NNPI(
            ["X_int8", "W_int8", "b"], ["Y_int8"], **out_qparams)
        ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
        ref_net.Proto().external_output.append("Y")

        # run ref_net
        workspace.RunNetOnce(ref_net)
        Y_fbgemm = workspace.FetchBlob("Y")

        # run onnxifi net: rename the three ops to their plain Int8 types
        plain_types = ("Int8Quantize", "Int8FC", "Int8Dequantize")
        for op, plain_type in zip(ref_net.Proto().op, plain_types):
            op.type = plain_type
        net_onnxified = onnxifi_caffe2_net(
            ref_net.Proto(),
            {},
            debug=True,
            adjust_batch=False,
            use_onnx=False,
            weight_names=["W_int8", "b"],
        )
        onnxifi_ops = [o for o in net_onnxified.op if o.type == "Onnxifi"]
        np.testing.assert_equal(len(onnxifi_ops), 1)
        workspace.CreateNet(net_onnxified)
        workspace.RunNet(net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if np.allclose(Y_glow, Y_fbgemm):
            return

        # Mismatch: dump everything needed to reproduce it, then fail.
        diff_Y = np.abs(Y_glow - Y_fbgemm)
        debug_info = {
            "seed": rand_seed,
            "n": n,
            "X": X_fp32,
            "W": W_fp32,
            "b": b_fp32,
            "Y_fbgemm": Y_fbgemm,
            "Y_glow": Y_glow,
            "diff": diff_Y,
            "maxdiff": diff_Y.max(axis=1),
        }
        print_test_debug_info("int8_fc", debug_info)
        assert 0
コード例 #26
0
    def compare_reference(
            self,
            raw_data,
            pct_raw_data,
            pct_mapping,
            pct_upper,
            pct_lower,
            lengths,
    ):
        """Run the BisectPercentile operator and check it against a
        pure-Python reference implementation.

        ``raw_data`` is fed as the operator input; the percentile tables and
        ``lengths`` are passed as operator arguments. The operator output
        "pct_output" must be almost equal to the reference result.
        """
        def bisect_percentile_op_ref(
            raw_data,
            pct_raw_data,
            pct_mapping,
            pct_lower,
            pct_upper,
            lengths
        ):
            results = np.zeros_like(raw_data)

            # offsets[j]:offsets[j + 1] is the slice of the flat percentile
            # tables that belongs to column j of raw_data.
            offsets = [0]
            for length in lengths:
                offsets.append(offsets[-1] + length)

            for i, row in enumerate(raw_data):
                for j, val in enumerate(row):
                    begin, stop = offsets[j], offsets[j + 1]
                    col_raw = pct_raw_data[begin:stop]
                    col_lower = pct_lower[begin:stop]
                    col_upper = pct_upper[begin:stop]
                    col_mapping = pct_mapping[begin:stop]

                    if val < col_raw[0]:
                        # Below the smallest table value.
                        results[i][j] = 0
                    elif val > col_raw[-1]:
                        # Above the largest table value.
                        results[i][j] = 1.
                    else:
                        k = bisect.bisect_left(col_raw, val)
                        if col_raw[k] == val:
                            # Exact hit: use the mapped percentile.
                            results[i][j] = col_mapping[k]
                        else:
                            # Interpolate between the neighbours k-1 and k.
                            k -= 1
                            rise = col_lower[k + 1] - col_upper[k]
                            run = col_raw[k + 1] - col_raw[k]
                            results[i][j] = col_upper[k] + \
                                (rise / run) * (val - col_raw[k])

            return results

        workspace.ResetWorkspace()
        workspace.FeedBlob("raw_data", raw_data)

        workspace.RunOperatorOnce(
            core.CreateOperator(
                "BisectPercentile",
                ["raw_data"],
                ["pct_output"],
                percentile_raw=pct_raw_data,
                percentile_mapping=pct_mapping,
                percentile_lower=pct_lower,
                percentile_upper=pct_upper,
                lengths=lengths
            )
        )

        expected_output = bisect_percentile_op_ref(
            raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths)
        output = workspace.blobs['pct_output']
        np.testing.assert_array_almost_equal(output, expected_output)
コード例 #27
0
ファイル: test_retinanet.py プロジェクト: jpbirdy/Detectron
def im_detect_bbox(model, im, timers=None):
    """Generate RetinaNet detections on a single image.

    Args:
        model: model whose ``net`` is run by name in the workspace.
        im: image array; it is converted to the network input blob and its
            shape is used to clip the predicted boxes.
        timers: optional mapping of timer name to Timer; a
            ``defaultdict(Timer)`` is created when omitted.

    Returns:
        A list with ``cfg.MODEL.NUM_CLASSES`` entries. Entry 0 is left as an
        empty list; entry ``c`` (``c >= 1``) is an array whose rows are
        ``[x0, y0, x1, y1, score]`` for class ``c``. When nothing is
        detected, every such entry is an empty array with five columns.
    """
    if timers is None:
        timers = defaultdict(Timer)
    # Although anchors are input independent and could be precomputed,
    # recomputing them per image only brings a small overhead
    anchors = _create_cell_anchors()
    timers['im_detect_bbox'].tic()
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], im_scale, inputs['im_info'] = \
        blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    # Names of the per-level output blobs, ordered from k_min to k_max.
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))

    workspace.RunNet(model.net.Proto().name)
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)

    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2. ** lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        # Split the channel axis into (anchor, class) and (anchor, 4).
        cls_prob = cls_prob.reshape((
            cls_prob.shape[0], A, int(cls_prob.shape[1] / A),
            cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape((
            box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        if cfg.RETINANET.SOFTMAX:
            # Drop class index 0 along the class axis.
            cls_prob = cls_prob[:, :, 1::, :, :]

        cls_prob_ravel = cls_prob.ravel()
        # In some cases [especially for very small img sizes], it's possible that
        # candidate_ind is empty if we impose threshold 0.05 at all levels. This
        # will lead to errors since no detections are found for this image. Hence,
        # for lvl 7 which has small spatial resolution, we take the threshold 0.0
        th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
        candidate_inds = np.where(cls_prob_ravel > th)[0]
        if (len(candidate_inds) == 0):
            continue

        # Keep the pre_nms_topn highest-scoring candidates of this level.
        pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds))
        inds = np.argpartition(
            cls_prob_ravel[candidate_inds], -pre_nms_topn)[-pre_nms_topn:]
        inds = candidate_inds[inds]

        inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose()
        classes = inds_5d[:, 2]
        anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4]
        scores = cls_prob[:, anchor_ids, classes, y, x]

        # Anchor boxes: cell position scaled by the stride plus cell anchor.
        boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
        boxes *= stride
        boxes += cell_anchors[anchor_ids, :]

        if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
            box_deltas = box_pred[0, anchor_ids, :, y, x]
        else:
            box_cls_inds = classes * 4
            box_deltas = np.vstack(
                [box_pred[0, ind:ind + 4, yi, xi]
                 for ind, yi, xi in zip(box_cls_inds, y, x)]
            )
        pred_boxes = (
            box_utils.bbox_transform(boxes, box_deltas)
            if cfg.TEST.BBOX_REG else boxes)
        pred_boxes /= im_scale
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        box_scores = np.zeros((pred_boxes.shape[0], 5))
        box_scores[:, 0:4] = pred_boxes
        box_scores[:, 4] = scores

        # Class ids in boxes_all are 1-based; index 0 is never filled.
        for cls in range(1, cfg.MODEL.NUM_CLASSES):
            inds = np.where(classes == cls - 1)[0]
            if len(inds) > 0:
                boxes_all[cls].extend(box_scores[inds, :])
    timers['im_detect_bbox'].toc()

    # Combine predictions across all levels and retain the top scoring by class
    timers['misc_bbox'].tic()
    detections = []
    for cls, boxes in boxes_all.items():
        cls_dets = np.vstack(boxes).astype(dtype=np.float32)
        # do class specific nms here
        keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep, :]
        out = np.zeros((len(keep), 6))
        out[:, 0:5] = cls_dets
        out[:, 5].fill(cls)
        detections.append(out)

    # detections (N, 6) format:
    #   detections[:, :4] - boxes
    #   detections[:, 4] - scores
    #   detections[:, 5] - classes
    if detections:
        detections = np.vstack(detections)
    else:
        # np.vstack raises on an empty list; use an empty (0, 6) array so
        # the per-class split below yields empty results instead.
        detections = np.zeros((0, 6))
    # sort all again
    inds = np.argsort(-detections[:, 4])
    detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]

    # Convert the detections to image cls_ format (see core/test_engine.py)
    num_classes = cfg.MODEL.NUM_CLASSES
    cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)]
    for c in range(1, num_classes):
        inds = np.where(detections[:, 5] == c)[0]
        cls_boxes[c] = detections[inds, :5]
    timers['misc_bbox'].toc()

    return cls_boxes
コード例 #28
0
    def _run_compare_train_inference(self, model_params):
        """Check that beam-search decoding and the training forward net
        assign the same score to the same translation.

        A seq2seq model is built into a temporary directory and loaded into
        an ensemble decoder. For three random source sequences, the decoded
        targets are fed back through the training forward net and the
        resulting ``total_loss_scalar`` is compared with the beam score to
        four decimals. Decoding the same input twice must also give the
        same targets.

        Args:
            model_params: model configuration passed to both the model
                builder and the decoder.
        """
        # Local import: only needed for the temporary-directory cleanup.
        import shutil

        tmp_dir = tempfile.mkdtemp()
        try:
            model_obj, checkpoint_path = self._build_seq2seq_model(
                model_params,
                tmp_dir=tmp_dir,
                source_vocab_size=20,
                target_vocab_size=20,
                num_gpus=0,
                batch_size=2,
            )
            assert model_obj is not None

            translate_params = dict(
                ensemble_models=[
                    dict(
                        source_vocab={i: str(i)
                                      for i in range(20)},
                        target_vocab={i: str(i)
                                      for i in range(20)},
                        model_params=model_params,
                        model_file=checkpoint_path,
                    )
                ],
                decoding_params=dict(
                    beam_size=3,
                    word_reward=0,
                    unk_reward=0,
                ),
            )

            beam_decoder_model = Seq2SeqModelCaffe2EnsembleDecoder(
                translate_params)
            beam_decoder_model.load_models()

            encoder_lengths = 5
            decoder_lengths = 7

            for _ in range(3):
                # randint's upper bound is exclusive, so high=20 draws from
                # 3..19, the same inclusive range the deprecated
                # random_integers(low=3, high=19) used.
                encoder_inputs = np.random.randint(
                    low=3,  # after GO_ID (1) and EOS_ID (2)
                    high=20,
                    size=encoder_lengths,
                )
                targets, _, beam_model_score = beam_decoder_model.decode(
                    encoder_inputs,
                    decoder_lengths,
                )
                targets_2, _, beam_model_score = beam_decoder_model.decode(
                    encoder_inputs,
                    decoder_lengths,
                )
                self.assertEqual(targets, targets_2)

                # The encoder input is fed reversed, as a column vector.
                workspace.FeedBlob(
                    'encoder_inputs',
                    np.array([list(reversed(encoder_inputs))
                              ]).transpose().astype(dtype=np.int32))
                workspace.FeedBlob(
                    'encoder_lengths',
                    np.array([len(encoder_inputs)]).astype(dtype=np.int32),
                )
                # Decoder input is the target sequence shifted right by GO.
                decoder_inputs = [seq2seq_util.GO_ID] + targets[:-1]
                workspace.FeedBlob(
                    'decoder_inputs',
                    np.array([decoder_inputs]).transpose().astype(
                        dtype=np.int32),
                )
                workspace.FeedBlob(
                    'decoder_lengths',
                    np.array([len(decoder_inputs)]).astype(dtype=np.int32),
                )
                workspace.FeedBlob(
                    'targets',
                    np.array([targets]).transpose().astype(dtype=np.int32),
                )
                workspace.FeedBlob(
                    'target_weights',
                    np.array([[1.0] * len(targets)]).astype(dtype=np.float32),
                )

                workspace.RunNet(model_obj.forward_net)
                train_model_score = workspace.FetchBlob('total_loss_scalar')

                np.testing.assert_almost_equal(
                    beam_model_score,
                    train_model_score,
                    decimal=4,
                )
        finally:
            # mkdtemp never cleans up after itself; remove the directory
            # whether the test passed or failed.
            shutil.rmtree(tmp_dir, ignore_errors=True)
コード例 #29
0
    def test_layernorm(self, seed, size, input_channels, batch_size, epsilon):
        """Compare LayerNorm run through onnxifi with LayerNormFakeFP16.

        Two single-op nets are built on the same input "X": ``pred`` with
        the ``LayerNorm`` operator and ``pred_ref`` with
        ``LayerNormFakeFP16``. ``pred`` is passed through
        ``onnxifi_caffe2_net`` and the resulting net is the one executed.
        The "Y" outputs of both runs must agree after casting to float16;
        otherwise debug info is printed and the test fails.

        Args:
            seed: seed for numpy's random generator.
            size: spatial height and width of the input.
            input_channels: number of input channels.
            batch_size: number of samples in the input.
            epsilon: ``epsilon`` argument passed to both operators.
        """
        np.random.seed(seed)
        # Reset the workspace
        workspace.ResetWorkspace()

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X"])
        pred_net.external_output.extend(["Y", "mean", "rstd"])
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNorm",
                ["X"],
                ["Y", "mean", "rstd"],
                # axis=-1,
                epsilon=epsilon))

        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred_ref"
        pred_net_ref.external_input.extend(["X"])
        pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNormFakeFP16",
                ["X"],
                ["Y", "mean", "rstd"],
                # axis=-1,
                epsilon=epsilon))

        # Random input centred on zero (uniform [0, 1) shifted by -0.5).
        X = np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) - 0.5

        pred_net_onnxified = onnxifi_caffe2_net(
            pred_net, {"X": [batch_size, input_channels, size, size]},
            debug=True,
            adjust_batch=False,
            use_onnx=False)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)

        # Create the onnxified net, not the plain pred_net: otherwise the
        # "glow" results below would come from the ordinary LayerNorm op and
        # the lowered net would never be exercised.
        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(pred_net_ref)

        # Both nets write "Y", "mean" and "rstd", so fetch the reference
        # outputs before running the second net.
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")
        mean_c2 = workspace.FetchBlob("mean")
        std_c2 = workspace.FetchBlob("rstd")

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")
        mean_glow = workspace.FetchBlob("mean")
        std_glow = workspace.FetchBlob("rstd")

        if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
            diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
            diff_std = np.abs(std_glow - std_c2).astype(np.float16)
            diff_mean = np.abs(mean_glow - mean_c2).astype(np.float16)
            print_test_debug_info(
                "layernorm", {
                    "seed": seed,
                    "size": size,
                    "input_channels": input_channels,
                    "batch_size": batch_size,
                    "epsilon": epsilon,
                    "X": X,
                    "Y_glow": Y_glow,
                    "mean_glow": mean_glow,
                    "std_glow": std_glow,
                    "Y_c2": Y_c2,
                    "mean_c2": mean_c2,
                    "std_c2": std_c2,
                    "diff_Y": diff_Y,
                    "diff_mean": diff_mean,
                    "diff_std": diff_std,
                })
            assert (0)
コード例 #30
0
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustice_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="mobilenet",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        '''
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            exit_nets=None)
        '''
    else:
        rendezvous = None

    # Model building functions
    def create_mobilenet_model_ops(model, loss_scale):
        [softmax, loss
         ] = mobilenet.create_mobilenet(model,
                                        "data",
                                        num_input_channels=args.num_channels,
                                        num_labels=args.num_labels,
                                        label="label")
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(200 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        optimizer.build_sgd(model,
                            args.base_learning_rate,
                            momentum=0.9,
                            nesterov=1,
                            policy="step",
                            stepsize=stepsz,
                            gamma=0.1)

    # Input. Note that the reader must be shared with all GPUS.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    def add_post_sync_ops(model):
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_mobilenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )
    #
    # # save network graph
    # graph = net_drawer.GetPydotGraphMinimal(
    #     train_model.net.Proto().op, "mobilenet", rankdir="LR", minimal_dependency=True)
    # with open("mobilenet.png", 'wb') as fid:
    #     fid.write(graph.create_png())

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="mobilenet_test",
                                              arg_scope=test_arg_scope)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_mobilenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)
    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and mobilenet epoch

    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params

        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        x = workspace.FetchBlob('optimizer_iteration')
        workspace.FeedBlob('optimizer_iteration', x)

        # mobilenet epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("mobilenet epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")
    # else:
    #     workspace.RunNetOnce(train_model.param_init_net)
    #     workspace.CreateNet(train_model.net)

    expname = "mobilenet_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    # with open(os.path.join(root_folder, "train_net.pbtxt"), 'w') as fo:
    #     fo.write(str(train_model.net.Proto()))
    # with open(os.path.join(root_folder, "train_init_net.pbtxt"), 'w') as fo:
    #     fo.write(str(train_model.param_init_net.Proto()))
    # print(workspace.FetchBlob('iteration_mutex'))
    # print(workspace.FetchBlob('gpu_0/conv1_b'))

    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists

        if os.path.isfile(model_path + str(epoch - 3) + ".mdl"):
            os.remove(model_path + str(epoch - 3) + ".mdl")

#======= Check Flag ==========
        CheckSave()