Example #1
    def test_small_sls_acc32(self, seed):
        workspace.GlobalInit([
            "caffe2",
            "--glow_global_fp16=0",
            "--glow_global_fused_scale_offset_fp16=0",
            "--glow_global_force_sls_fp16_accum=0",
        ])
        np.random.seed(seed)
        workspace.ResetWorkspace()

        n = 2
        DIM = 3
        data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)

        lengths = np.array([n], dtype=np.int32)
        indices = np.array(range(n), dtype=np.int64)
        weights = np.random.uniform(low=0.01, high=0.5,
                                    size=[n]).astype(np.float32)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                                ["quantized_data"]))

        quantized_data = workspace.FetchBlob("quantized_data")

        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            {},
            max_batch_size=1,
            max_seq_size=n,
            debug=True,
            adjust_batch=True,
            use_onnx=False,
        )
        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(onnxified_net)
        workspace.CreateNet(ref_net)

        workspace.RunNet(onnxified_net.name)
        Y_glow = workspace.FetchBlob("Y")

        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchBlob("Y")

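        # Element-wise relative error against the reference; the 1e-8 offset
        # guards against division by zero.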
        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
        max_err = np.max(diff, axis=1)
        num_offenders = (max_err > 0).sum()
        if num_offenders > 0:
            np.set_printoptions(precision=12)
            print(
                "ref",
                Y_ref.astype(np.float16).astype(np.float32),
                "glow",
                Y_glow.astype(np.float16).astype(np.float32),
            )
            print_test_debug_info(
                "test_small_sls_acc32",
                {
                    "seed": seed,
                    "indices": indices,
                    "data": data,
                    "quantized_data": quantized_data,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_glow": Y_glow,
                    "Y_ref": Y_ref,
                    "diff": diff,
                    "rowwise_diff": np.max(diff, axis=1),
                },
            )
            assert 0
Example #2
    def _test_layernorm(self):
        size = 3
        input_channels = 2
        batch_size = 4
        seed = int(time.time())
        np.random.seed(seed)

        epsilon = 1e-3

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X"])
        pred_net.external_output.extend(["Y", "mean", "rstd"])
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNorm",
                ["X"],
                ["Y", "mean", "rstd"],
                # axis=-1,
                epsilon=epsilon))

        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred"
        pred_net_ref.external_input.extend(["X"])
        pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "LayerNormFakeFP16",
                ["X"],
                ["Y", "mean", "rstd"],
                # axis=-1,
                epsilon=epsilon))

        X = np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) - 0.5

        pred_net_onnxified = onnxifi_caffe2_net(
            pred_net, {"X": [batch_size, input_channels, size, size]},
            debug=True,
            adjust_batch=False,
            use_onnx=False)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)

        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(pred_net_ref)

        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")
        mean_c2 = workspace.FetchBlob("mean")
        std_c2 = workspace.FetchBlob("rstd")

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")
        mean_glow = workspace.FetchBlob("mean")
        std_glow = workspace.FetchBlob("rstd")

        if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
            diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
            diff_std = np.abs(std_glow - std_c2).astype(np.float16)
            diff_mean = np.abs(mean_glow - mean_c2).astype(np.float16)
            print_test_debug_info(
                "layernorm", {
                    "seed": seed,
                    "X": X,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "Y": diff_Y,
                    "mean": diff_mean,
                    "std": diff_std,
                })
            assert (0)
Example #3
    def _test_binary_op_graph(self, name, seed):
        np.random.seed(seed)
        workspace.ResetWorkspace()
        # First dimension is the batch size
        dims = np.concatenate((np.array([1]), np.random.randint(1, 20,
                                                                size=3)))
        A = np.random.uniform(low=-100.0, high=100.0,
                              size=dims).astype(np.float32)
        B = np.random.uniform(low=-100.0, high=100.0,
                              size=dims).astype(np.float32)
        # Avoid dividing by 0
        B[np.abs(B) < 1e-3] = 1e-3
        print(A.shape, B.shape)
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["A", "B"])
        pred_net.external_output.append("C")
        pred_net.op.add().CopyFrom(core.CreateOperator(name, ["A", "B"],
                                                       ["C"]))
        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "ref"
        pred_net_ref.external_input.extend(["A", "B"])
        pred_net_ref.external_output.append("C_ref")
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                name + "FakeFp16",
                ["A", "B"],
                ["C_ref"],
            ))

        shape_hints = {"A": A.shape, "B": B.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hints,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        print(pred_net_onnxified)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.SwitchWorkspace("glow_test_ws", True)
        workspace.FeedBlob("A", A)
        workspace.FeedBlob("B", B)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)
        num_iterations = 10
        for _ in range(num_iterations):
            A = np.random.uniform(low=-100.0, high=100.0,
                                  size=dims).astype(np.float32)
            B = np.random.uniform(low=-100.0, high=100.0,
                                  size=dims).astype(np.float32)
            # Avoid dividing by 0
            B[np.abs(B) < 1e-3] = 1e-3

            workspace.FeedBlob("A", A)
            workspace.FeedBlob("B", B)
            # Run caffe2 net
            workspace.RunNet(pred_net_ref.name)
            Y_c2 = workspace.FetchBlob("C_ref")

            # Run Glow net
            workspace.RunNet(pred_net_onnxified.name)
            Y_glow = workspace.FetchBlob("C")

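            # Map overflowed Glow outputs onto the representable float16 range
            # before comparing against the fp16 emulation.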
            Y_glow[Y_glow == np.Inf] = np.finfo(np.float16).max
            Y_glow[Y_glow == np.NINF] = np.finfo(np.float16).min

            # Ignore mismatches solely due to difference in precision
            fp16_finite = np.isfinite(
                A.astype(np.float16) / B.astype(np.float16))

            # Results should be identical since we are comparing with the C2 emulation
            if not np.allclose(Y_c2[fp16_finite], Y_glow[fp16_finite]):
                diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
                print_test_debug_info(
                    name, {
                        "dims": dims,
                        "iter": _,
                        "seed": seed,
                        "A": A,
                        "B": B,
                        "Y_glow": Y_glow,
                        "Y_c2": Y_c2,
                        "diff": diff
                    })
                assert (0)
Example #4
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
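        # Builds a small FC -> Sigmoid -> SquaredL2Distance model, parallelizes it
        # across the given devices, runs 10 training iterations on seeded random
        # data, and returns the FC weights from the first device.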
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def add_optimizer(model):
            optimizer.build_sgd(model, 0.1, policy="fixed")

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
        )

        np.random.seed(2603)

        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
Example #5
    def run_model(self, V, gpu_devices, cpu_indices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather([self.vecs, 'indices'],
                                                    'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs,
                    "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
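            # Dense gradients get a plain weighted-sum update; sparse gradients
            # (GradientSlice) go through SparseMomentumSGDUpdate instead.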
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [param_grad.values, param_momentum, param],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill([],
                                                             "vecs",
                                                             shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([
                            param, self.ONE_CPU, param_grad.indices,
                            param_grad.values, self.LR
                        ], self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16)
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be the same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(self.vecs, orig_vecs)
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                with open("/tmp/dump.txt", "w") as f:
                    f.write(str(model.net.Proto()))
                if not cpu_indices:
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see that the vecs were updated
        self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs),
                                     orig_vecs))
        return [
            workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")
        ]
Example #6
    def testPartialClone(self):
        params = core.Net('params')
        p1 = params.ConstantFill([], ['p1'])
        workspace.CreateNet(params)
        workspace.RunNetOnce(params)

        n = core.Net('original')
        a1 = n.AddExternalInput('a1')
        a2 = n.AddExternalInput('a2')
        b1, b2 = n.Concat([a1, a2], ['b1', 'b2'], axis=0)
        c1 = n.Sum([b1, p1], ['c1'])
        c2 = n.Sum([b2], ['c2'])
        d = n.Sum([c1, c2], ['d'])

        # test that gradient ops are ignored when partial-cloning
        n.AddGradientOperators([d])

        # test some in-place ops
        k = n.Sum([p1], ['k'])
        e = n.Sum([d], ['e'])
        e = n.Sum([e, k], [e])
        e = n.Sum([e], [e])
        f = n.Sum(e, ['f'])

        def net_assert(net, num_ops, inputs, outputs, internals):
            self.assertEqual(len(net.Proto().op), num_ops)
            self.assertEqual(set(net.Proto().external_input), inputs)
            self.assertEqual(set(net.Proto().external_output), outputs)
            all_blobs = set(net.Proto().external_input)
            all_blobs |= set(net.Proto().external_output)
            for op in net.Proto().op:
                all_blobs |= set(op.input) | set(op.output)
            self.assertEqual(all_blobs, inputs | outputs | internals)
            # create the net to make sure it's valid
            for input in inputs:
                workspace.FeedBlob(input, np.array([]))
            workspace.CreateNet(net)

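        # ClonePartial(prefix, inputs, outputs) extracts the sub-net needed to
        # compute `outputs` from `inputs`, prefixing the cloned internal blobs.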
        n2, (d22, ) = n.ClonePartial('f1', {a1: 'a11', a2: 'a22'}, [d])
        net_assert(n2, 4, {'p1', 'a11', 'a22'}, {'f1/d'},
                   {'f1/b1', 'f1/b2', 'f1/c1', 'f1/c2', 'p1'})
        self.assertTrue(isinstance(d22, core.BlobReference))
        self.assertEqual(d22.Net(), n2)
        self.assertEqual(str(d22), 'f1/d')

        n3, (d22, ) = n.ClonePartial('f2', [b1, b2], [d])
        net_assert(n3, 3, {'p1', 'b1', 'b2'}, {'f2/d'},
                   {'f2/c1', 'f2/c2', 'p1'})
        self.assertEqual(str(d22), 'f2/d')

        n4, (c22, ) = n.ClonePartial('f3', [b1], [c1])
        net_assert(n4, 1, {'p1', 'b1'}, {'f3/c1'}, {'p1'})
        self.assertEqual(str(c22), 'f3/c1')

        n5, (c11, c22) = n.ClonePartial('f4', [b1, b2], [c1, c2])
        net_assert(n5, 2, {'p1', 'b1', 'b2'}, {'f4/c1', 'f4/c2'}, {'p1'})
        self.assertEqual(str(c11), 'f4/c1')
        self.assertEqual(str(c22), 'f4/c2')

        with self.assertRaises(AssertionError):
            n.ClonePartial('f4', [a1, a2, c2], [d])

        n6, (e22, ) = n.ClonePartial('f5', [d], [e])
        net_assert(n6, 4, {'p1', 'd'}, {'f5/e'}, {'f5/k', 'p1'})
        self.assertEqual(str(e22), 'f5/e')

        n8, (e22, f22) = n.ClonePartial('f7', [d], [e, f])
        net_assert(n8, 5, {'p1', 'd'}, {'f7/e', 'f7/f'}, {'p1', 'f7/k'})
        self.assertEqual(str(e22), 'f7/e')
        self.assertEqual(str(f22), 'f7/f')

        params._CheckLookupTables()
        n._CheckLookupTables()
Example #7
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                ))
        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    # SGD
    def add_parameter_update_ops(model):
        brew.add_weight_decay(model, args.weight_decay)
        ITER = brew.iter(model, "ITER")
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=args.base_learning_rate,
            policy="step",
            stepsize=stepsz,
            gamma=0.1,
        )
        AddMomentumParameterUpdate(model, LR)

    # Input. Note that the reader must be shared with all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)
        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(
            train_model,
            GetCheckpointParams(train_model),
        )

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example #8
    def export_actor(
        cls,
        trainer,
        state_normalization_parameters,
        action_feature_ids,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch actor forward pass as one
        caffe2 net.

        :param trainer DDPGTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param min_action_range_tensor_serving pytorch tensor that specifies
            min action value for each dimension
        :param max_action_range_tensor_serving pytorch tensor that specifies
            max action value for each dimension
        :param action_feature_ids list of feature ids used as keys for the
            output actions
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """
        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)
        parameters: List[str] = []

        workspace.FeedBlob("input/float_features.lengths",
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys",
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values",
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        C2.net().Copy(["input/float_features.lengths"],
                      [input_feature_lengths])
        C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
        C2.net().Copy(["input/float_features.values"], [input_feature_values])

        preprocessor = PreprocessorNet()
        sparse_to_dense_processor = Caffe2SparseToDenseProcessor()
        sorted_features, _ = sort_features_by_normalization(
            state_normalization_parameters)
        state_dense_matrix, new_parameters = sparse_to_dense_processor(
            sorted_features,
            StackedAssociativeArray(input_feature_lengths, input_feature_keys,
                                    input_feature_values),
        )
        parameters.extend(new_parameters)
        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix(
            state_dense_matrix,
            sorted_features,
            state_normalization_parameters,
            "state_norm",
            False,
        )
        parameters.extend(new_parameters)

        (torch_init_net, torch_predict_net, new_parameters, actor_input_blob,
         actor_output_blob, min_action_training_blob, max_action_training_blob,
         min_action_serving_blob, max_action_serving_blob) = DDPGPredictor.generate_train_net(
            trainer,
            model,
            min_action_range_tensor_serving,
            max_action_range_tensor_serving,
            model_on_gpu,
        )
        parameters.extend(new_parameters)
        net.Copy([state_normalized_dense_matrix], [actor_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        # Scale the actor's actions from [-1, 1] to the serving range
        prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
        new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
        subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
        div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
        scaled_for_serving_actions = C2.Add(
            C2.Mul(div_by_prev_range, new_range), min_action_serving_blob)

        output_lengths = "output/float_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
            [output_lengths],
            value=trainer.actor.layers[-1].out_features,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        action_feature_ids_blob = C2.NextBlob("action_feature_ids")
        workspace.FeedBlob(action_feature_ids_blob,
                           np.array(action_feature_ids, dtype=np.int64))
        parameters.append(action_feature_ids_blob)

        output_keys = "output/float_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        num_examples, _ = C2.Reshape(C2.Size("input/float_features.lengths"),
                                     shape=[1])
        C2.net().Tile([action_feature_ids_blob, num_examples], [output_keys],
                      axis=0)

        output_values = "output/float_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
        C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

        workspace.CreateNet(net)
        return DDPGPredictor(net, torch_init_net, parameters)
Example #9
def main():
  init_net = core.Net("init")
  # The ground truth parameters.
  W_gt = init_net.GivenTensorFill(
      [], "W_gt", shape=[1, 2], values=[2.0, 1.5])
  B_gt = init_net.GivenTensorFill([], "B_gt", shape=[1], values=[0.5])
  # Constant value ONE is used in the weighted sum when updating parameters.
  ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
  # ITER is the iteration counter.
  ITER = init_net.ConstantFill([], "ITER", shape=[1], value=0, dtype=core.DataType.INT32)

  # For the parameters to be learned: we randomly initialize the weights
  # from [-1, 1] and initialize the bias to 0.0.
  W = init_net.UniformFill([], "W", shape=[1, 2], min=-1., max=1.)
  B = init_net.ConstantFill([], "B", shape=[1], value=0.0)
  print('Created init net.')


  train_net = core.Net("train")
  # First, we generate random samples of X and create the ground truth.
  X = train_net.GaussianFill([], "X", shape=[64, 2], mean=0.0, std=1.0, run_once=0)
  Y_gt = X.FC([W_gt, B_gt], "Y_gt")
  # We add Gaussian noise to the ground truth
  noise = train_net.GaussianFill([], "noise", shape=[64, 1], mean=0.0, std=1.0, run_once=0)
  Y_noise = Y_gt.Add(noise, "Y_noise")
  # Note that we do not need to propagate the gradients back through Y_noise,
  # so we mark StopGradient to notify the auto differentiating algorithm
  # to ignore this path.
  Y_noise = Y_noise.StopGradient([], "Y_noise")

  # Now, for the normal linear regression prediction, this is all we need.
  Y_pred = X.FC([W, B], "Y_pred")

  # The loss function is computed by a squared L2 distance, and then averaged
  # over all items in the minibatch.
  dist = train_net.SquaredL2Distance([Y_noise, Y_pred], "dist")
  loss = dist.AveragedLoss([], ["loss"])

  # Get gradients for all the computations above.
  gradient_map = train_net.AddGradientOperators([loss])

  # Increment the iteration by one.
  train_net.Iter(ITER, ITER)
  # Compute the learning rate that corresponds to the iteration.
  LR = train_net.LearningRate(ITER, "LR", base_lr=-0.1,
                              policy="step", stepsize=20, gamma=0.9)

  # Parameter updates via WeightedSum: param <- param * ONE + grad * LR.
  # Since base_lr above is negative, this is a gradient-descent step.
  train_net.WeightedSum([W, ONE, gradient_map[W], LR], W)
  train_net.WeightedSum([B, ONE, gradient_map[B], LR], B)

  workspace.RunNetOnce(init_net)
  workspace.CreateNet(train_net)

  print("Before training, W is: {}".format(workspace.FetchBlob("W")))
  print("Before training, B is: {}".format(workspace.FetchBlob("B")))

  for i in range(100):
      workspace.RunNet(train_net.Proto().name)

  print("After training, W is: {}".format(workspace.FetchBlob("W")))
  print("After training, B is: {}".format(workspace.FetchBlob("B")))
  print("Ground truth W is: {}".format(workspace.FetchBlob("W_gt")))
  print("Ground truth B is: {}".format(workspace.FetchBlob("B_gt")))
Example #10
    def export(
        cls,
        trainer,
        actions,
        state_normalization_parameters,
        int_features=False,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch DQN forward pass as one
        caffe2 net.

        :param trainer DQNTrainer
        :param actions list of action names
        :param state_normalization_parameters state NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """

        input_dim = trainer.num_features
        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            trainer.q_network, input_dim, model_on_gpu
        )
        qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
            buffer
        )
        torch_workspace = caffe2_netdef.workspace

        parameters = torch_workspace.Blobs()
        for blob_str in parameters:
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob("input/image", np.zeros([1, 1, 1, 1], dtype=np.int32))
        workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob(
                "input/int_features.lengths", np.zeros(1, dtype=np.int32)
            )
            workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [input_feature_lengths, input_feature_keys, input_feature_values],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"], [input_feature_values])

        if state_normalization_parameters is not None:
            preprocessor = PreprocessorNet(clip_anomalies=True)
            state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
                input_feature_lengths,
                input_feature_keys,
                input_feature_values,
                state_normalization_parameters,
                blobname_prefix="state_norm",
                split_sparse_to_dense=False,
                split_expensive_feature_groups=False,
                normalize=True,
            )
            parameters.extend(new_parameters)
        else:
            # Image input.  Note: Currently this does the wrong thing if
            #   more than one image is passed at a time.
            state_normalized_dense_matrix = "input/image"

        net.Copy([state_normalized_dense_matrix], [qnet_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        new_parameters, q_values = RLPredictor._forward_pass(
            model, trainer, state_normalized_dense_matrix, actions, qnet_output_blob
        )
        parameters.extend(new_parameters)

        # Get 1 x n action index tensor under the max_q policy
        max_q_act_idxs = "max_q_policy_actions"
        C2.net().Flatten([C2.ArgMax(q_values)], [max_q_act_idxs], axis=0)
        shape_of_num_of_states = "num_states_shape"
        C2.net().FlattenToVec([max_q_act_idxs], [shape_of_num_of_states])
        num_states, _ = C2.Reshape(C2.Size(shape_of_num_of_states), shape=[1])

        # Get 1 x n action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32)
        )
        tempered_q_values = C2.Div(q_values, temperature, broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = "softmax_act_idxs_nested"
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_idxs = "softmax_policy_actions"
        C2.net().Flatten([softmax_act_idxs_nested], [softmax_act_idxs], axis=0)

        action_names = C2.NextBlob("action_names")
        parameters.append(action_names)
        workspace.FeedBlob(action_names, np.array(actions))

        # Concat action index tensors to get 2 x n tensor - [[max_q], [softmax]]
        # transpose & flatten to get [a1_maxq, a1_softmax, a2_maxq, a2_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_idxs, to=caffe2_pb2.TensorProto.INT32)
        softmax_act_blob = C2.Cast(softmax_act_idxs, to=caffe2_pb2.TensorProto.INT32)
        C2.net().Append([max_q_act_blob, softmax_act_blob], [max_q_act_blob])
        transposed_action_idxs = C2.Transpose(max_q_act_blob)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        workspace.FeedBlob(OUTPUT_SINGLE_CAT_VALS_NAME, np.zeros(1, dtype=np.int64))
        C2.net().Gather(
            [action_names, flat_transposed_action_idxs], [OUTPUT_SINGLE_CAT_VALS_NAME]
        )

        workspace.FeedBlob(OUTPUT_SINGLE_CAT_LENGTHS_NAME, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [shape_of_num_of_states],
            [OUTPUT_SINGLE_CAT_LENGTHS_NAME],
            value=2,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        workspace.FeedBlob(OUTPUT_SINGLE_CAT_KEYS_NAME, np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1], value=0, dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1], value=1, dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_states, axis=0)
        C2.net().FlattenToVec([output_key_tile], [OUTPUT_SINGLE_CAT_KEYS_NAME])

        workspace.CreateNet(net)
        return DQNPredictor(net, torch_init_net, parameters, int_features)
Example #11
srcimg = skimage.io.imread(IMAGE_LOCATION, as_grey=True)
#print srcimg
#[0,1]
#srcimg = skimage.transform.resize(srcimg, (width, height))
#print srcimg
#img = skimage.img_as_float(srcimg).astype(np.float32)

#[-1,1]
img = srcimg - 127.5
img = img / 127.5
img = skimage.transform.resize(img, (width, height))

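# Add channel and batch dimensions: (H, W) -> (1, H, W) -> (1, 1, H, W).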
img = img[np.newaxis, :, :].astype(np.float32)
img = img[np.newaxis, :, :, :].astype(np.float32)

with open(INIT_NET, "rb") as f:
    init_net = f.read()
with open(PREDICT_NET, "rb") as f:
    predict_net = f.read()

workspace.RunNetOnce(init_net)
workspace.CreateNet(predict_net)
p = workspace.Predictor(init_net, predict_net)
results = p.run([img])
img_out = workspace.FetchBlob(output)
print(type(img_out), img_out.size, img_out.shape)
for i in range(img_out.shape[1]):
    if i % 16 == 0: print("\n")
    print(img_out[0][i], end=" ")
print("\n")
Example #12
    def Skip_test_tanhquantize(self, scale, zp, size, rand_seed):
        np.random.seed(rand_seed)

        workspace.ResetWorkspace()

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "ref"
        pred_net.external_input.append("X")
        pred_net.external_output.append("Y_q")

        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "Tanh", ["X"], ["Y"]
            )
        )

        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
            )
        )

        X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)

        pred_net_onnxified = onnxifi_caffe2_net(
            pred_net,
            {"X": X.shape},
            debug=True,
            adjust_batch=False,
            use_onnx=False,
        )
        num_onnxified_ops = sum(
            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
        )
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.FeedBlob("X", X)
        workspace.CreateNet(pred_net_onnxified)
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchInt8Blob("Y_q")

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.append("X")
        ref_net.external_output.append("Y_q")

        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "TanhQuantFakeFp16NNPI", ["X"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
            )
        )

        workspace.CreateNet(ref_net)
        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchInt8Blob("Y_q")

        if not np.array_equal(Y_ref.data, Y_glow.data) or \
           not Y_ref.scale == Y_glow.scale or \
           not Y_ref.zero_point == Y_glow.zero_point:
            print_test_debug_info(
                "tanhfusion",
                {
                    "scale": scale,
                    "zp": zp,
                    "input": X,
                    "ideal nonquant": np.tanh(X),
                    "Y_glow": Y_glow,
                    "Y_c2": Y_ref,
                }
            )
            assert(0)
Example #13
    def test_batch_matmul(self, M, K, N, rand_seed, trans_a, trans_b,
                          run_ints):
        np.random.seed(rand_seed)
        workspace.ResetWorkspace()
        C = 0  # TODO
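        # With C == 0, batch_dims is empty, so the float path below builds plain
        # (M, K) and (K, N) matrices with no leading batch dimensions.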
        batch_dims = np.random.randint(low=1, high=3, size=C,
                                       dtype=np.int64).tolist()

        if run_ints:
            X = np.random.randint(low=1, high=3,
                                  size=((1, M, K))).astype(np.float32)
        else:
            X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(
                np.float32) - 0.5)
        if trans_a:
            X = X.swapaxes(-1, -2)

        if run_ints:
            Y = np.random.randint(low=1, high=3,
                                  size=((1, K, N))).astype(np.float32)
        else:
            Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(
                np.float32) - 0.5)
        if trans_b:
            Y = Y.swapaxes(-1, -2)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", "Y"])
        pred_net.external_output.append("out")
        pred_net.op.add().CopyFrom(
            core.CreateOperator('BatchMatMul', ['X', 'Y'],
                                'out',
                                trans_a=trans_a,
                                trans_b=trans_b))

        pred_net_ref = core.Net("pred_net_ref")
        pred_net_ref.BatchMatMulFP16Acc16Fake(["X", "Y"], ['out'],
                                              trans_a=trans_a,
                                              trans_b=trans_b)

        print("dims", batch_dims, X.shape, Y.shape)
        pred_net_onnxified = onnxifi_caffe2_net(pred_net, {
            "X": X.shape,
            "Y": Y.shape
        },
                                                debug=True,
                                                adjust_batch=False,
                                                use_onnx=False)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(pred_net_ref)

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        out_glow = workspace.FetchBlob('out')

        # Run caffe2 net
        workspace.RunNet(pred_net_ref)
        out_c2_fakefp16 = workspace.FetchBlob('out')

        diff = np.abs((out_c2_fakefp16 - out_glow) / (out_c2_fakefp16 + 1e-8))
        rowdiff = np.max(diff, axis=1)

        if not np.allclose(out_glow, out_c2_fakefp16):
            print_test_debug_info(
                "bmm", {
                    "seed": rand_seed,
                    "m": M,
                    "k": K,
                    "n": N,
                    "X": X,
                    "Y": Y,
                    "out_glow": out_glow,
                    "out_c2_fakefp16": out_c2_fakefp16,
                    "diff": diff
                })
            assert (0)
Example #14
    def test_slws_fused_8bit_rowwise_acc32_nnpi(self, seed, num_rows,
                                                embedding_dim, batch_size,
                                                max_weight):
        workspace.GlobalInit([
            "caffe2",
            "--glow_global_fp16=0",
            "--glow_global_fused_scale_offset_fp16=0",
            "--glow_global_force_sls_fp16_accum=0",
        ])

        workspace.ResetWorkspace()
        np.random.seed(seed)
        data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
        lengths = np.random.choice(np.arange(1, num_rows),
                                   batch_size).astype(np.int32)

        indices = []
        for length in lengths:
            indices.extend(np.random.choice(np.arange(1, num_rows), length))
        indices = np.asarray(indices).astype(np.int64)

        weights = np.random.uniform(low=0,
                                    high=max_weight,
                                    size=[len(indices)]).astype(np.float32)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                                ["quantized_data"]))
        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            {},
            max_batch_size=batch_size,
            max_seq_size=batch_size * np.max(lengths),
            debug=True,
            adjust_batch=True,
            use_onnx=False,
        )
        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(onnxified_net)
        workspace.CreateNet(ref_net)

        workspace.RunNet(onnxified_net.name)
        Y_glow = workspace.FetchBlob("Y")

        workspace.RunNet(ref_net.name)
        Y_ref = workspace.FetchBlob("Y")

        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
        max_err = np.max(diff, axis=1)
        num_offenders = (max_err > 0).sum()
        if num_offenders > 0:
            print_test_debug_info(
                "test_slws_fused_8bit_rowwise_acc32_nnpi",
                {
                    "indices": indices,
                    "data": data.shape,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_glow": Y_glow,
                    "Y_ref": Y_ref,
                    "diff": diff,
                    "rowwise_diff": np.max(diff, axis=1),
                },
            )
            assert 0
Example #15
    def run_model(self, gpu_devices):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            for param in model.GetParams():
                grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, grad, LR], param)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(gpu_devices),
        )
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        np.random.seed(2603)

        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            print(i, workspace.FetchBlob("gpu_0/fc_w").flatten()[:5])
            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("gpu_0/fc_w")
Example #16
def load_feature_map(params_file, is_train):
    assert params_file, 'FEATURE_MAP_LOADER.MODEL_PARAMS_FILE is not specified.'
    assert cfg.FEATURE_MAP_LOADER.OUT_DIR, 'FEATURE_MAP_LOADER.OUT_DIR is not specified.'
    logger.info('Inferring feature map from %s' % params_file)

    cfg.FEATURE_MAP_LOADER.ENALBE = True

    cfg.GET_TRAIN_LFB = is_train

    timer = Timer()

    test_model = model_builder_video.ModelBuilder(
        train=False,
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        split=cfg.TEST.DATA_TYPE,
    )

    suffix = 'infer_{}'.format('train' if is_train else 'test')

    if cfg.LFB.ENABLED:
        lfb_path = os.path.join(cfg.LFB.LOAD_LFB_PATH,
                                'train_lfb.pkl' if is_train else 'val_lfb.pkl')
        logger.info('Loading LFB from %s' % lfb_path)
        with open(lfb_path, 'rb') as f:
            lfb = pickle.load(f)

        test_model.build_model(
            lfb=lfb,
            suffix=suffix,
            shift=1,
        )

    else:
        test_model.build_model(
            lfb=None,
            suffix=suffix,
            shift=1,
        )

    if cfg.PROF_DAG:
        test_model.net.Proto().type = 'prof_dag'
    else:
        test_model.net.Proto().type = 'dag'

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    total_test_net_iters = misc.get_total_test_iters(test_model)

    test_model.start_data_loader()

    checkpoints.load_model_from_params_file_for_test(test_model, params_file)

    all_features = {}
    for feat_name in cfg.FEATURE_MAP_LOADER.NAME_LIST:
        all_features[feat_name] = []

    all_metadata = []

    all_labels = []
    all_proposals = []
    all_original_boxes = []

    if cfg.FEATURE_MAP_LOADER.TEST_ITERS > 0:
        total_test_net_iters = cfg.FEATURE_MAP_LOADER.TEST_ITERS

    for test_iter in range(total_test_net_iters):

        timer.tic()
        workspace.RunNet(test_model.net.Proto().name)
        timer.toc()

        if test_iter == 0:
            misc.print_net(test_model)
            os.system('nvidia-smi')
        if test_iter % 10 == 0:
            logger.info("Iter {}/{} Time: {}".format(test_iter,
                                                     total_test_net_iters,
                                                     timer.diff))

        if cfg.DATASET == "ava":
            for feat_name in cfg.FEATURE_MAP_LOADER.NAME_LIST:
                all_features[feat_name].append(get_features(feat_name))

            all_metadata.append(get_features('metadata{}'.format(suffix)))

            all_labels.append(get_features('labels{}'.format(suffix)))
            all_proposals.append(get_features('proposals{}'.format(suffix)))
            all_original_boxes.append(
                get_features('original_boxes{}'.format(suffix)))

#         elif cfg.DATASET in ['charades', 'epic']:
#             all_features.append(get_features('pool5'))
        else:
            raise Exception("Dataset {} not recognized.".format(cfg.DATASET))

    lfb = construct_lfb(all_features, all_metadata, all_labels, all_proposals,
                        all_original_boxes, test_model.input_db, is_train)

    write_lfb(lfb, is_train)

    logger.info("Shutting down data loader...")
    test_model.shutdown_data_loader()

    workspace.ResetWorkspace()
    logger.info("Done ResetWorkspace...")

    cfg.GET_TRAIN_LFB = False
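The loop above relies on a get_features helper that is not part of this snippet; a minimal sketch of what it could look like (hypothetical name resolution, single-GPU assumption) is:

def get_features(blob_name):
    # Hypothetical sketch: fetch a blob written by the test net on GPU 0.
    # The helper in the source repository may instead gather across all GPUs.
    return workspace.FetchBlob('gpu_0/{}'.format(blob_name))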
Example #17
def Test(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    total_batch_size = args.batch_size * num_gpus

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type == 1
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = model_builder.create_data_reader(
        test_model,
        name="test_reader",
        input_data=args.test_data,
    )

    if args.num_iter <= 0:
        num_iter = int(number_of_examples / total_batch_size)
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        model_helper.AddVideoInput(
            test_model,
            test_reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=1,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=4,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    data_parallel_model.Parallelize_GPU(
        test_model,
        input_builder_fun=test_input_fn,
        forward_pass_builder_fun=create_model_ops,
        param_update_builder_fun=None,
        devices=gpus,
    )
    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(
            test_model,
            args.load_model_path,
            root_gpu_id=gpus[0]
        )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(test_model)

    # metric counters for classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0

    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)
        for g in test_model._devices:
            # get labels
            label = workspace.FetchBlob(
                "gpu_{}".format(g) + '/label'
            )
            # get predictions
            predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * args.clip_per_video

            for j in range(args.batch_size):
                # get label for one video
                sample_label = label[j * args.clip_per_video]
                # get clip accuracy
                for k in range(args.clip_per_video):
                    c1, _ = metric.accuracy_metric(
                        predicts[j * args.clip_per_video + k, :],
                        label[j * args.clip_per_video + k])
                    clip_acc = clip_acc + c1
                # get all clip predictions for one video
                all_clips = predicts[
                    j * args.clip_per_video:(j + 1) * args.clip_per_video, :]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)
                c1, ck = metric.accuracy_metric(
                    video_pred, sample_label, args.top_k)
                video_top1 = video_top1 + c1
                video_topk = video_topk + ck

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            log.info('Iter {}/{}: clip: {}, top1: {}, top{}: {}'.format(
                i,
                num_iter,
                clip_acc / clip_count,
                video_top1 / video_count,
                args.top_k,
                video_topk / video_count))

    log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
        clip_acc / clip_count,
        video_top1 / video_count,
        args.top_k,
        video_topk / video_count
    ))

    flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0])
    log.info('FLOPs: {}, params: {}'.format(flops, params))
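PredictionAggregation is called above but not defined in this example; a hedged sketch of a clip-to-video aggregation (the mode encoding is an assumption, not the original implementation) is:

def PredictionAggregation(clip_preds, aggregation):
    # clip_preds: (clip_per_video, num_labels) softmax scores for one video.
    # Assumed modes: 0 = average the clip scores, 1 = element-wise maximum.
    if aggregation == 0:
        return np.mean(clip_preds, axis=0)
    elif aggregation == 1:
        return np.max(clip_preds, axis=0)
    raise ValueError("Unknown aggregation mode: {}".format(aggregation))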
Example #18
                       db=os.path.join(data_folder, 'mnist-test-nchw-lmdb'),
                       db_type='lmdb')
softmax = AddModel(test_model, data)

# Deployment model. We simply need the main AddModel part.
deploy_model = model_helper.ModelHelper(name="mnist_deploy",
                                        arg_scope=arg_scope,
                                        init_params=False)
AddModel(deploy_model, "data")

# The parameter initialization network only needs to be run once.
# Now all the parameter blobs are going to be initialized in the workspace.
workspace.RunNetOnce(train_model.param_init_net)

# overwrite=True allows you to run this cell several times and avoid errors
workspace.CreateNet(train_model.net, overwrite=True)

# Set the iterations number and track the accuracy & loss
total_iters = 200
accuracy = np.zeros(total_iters)
loss = np.zeros(total_iters)

print("The blobs in the workspace pre-train: {}".format(workspace.Blobs()))

# Now, we will manually run the network for 200 iterations.
for i in range(total_iters):
    workspace.RunNet(train_model.net)
    accuracy[i] = workspace.blobs['accuracy']
    loss[i] = workspace.blobs['loss']

print("The blobs in the workspace post-train: {}".format(workspace.Blobs()))
Example #19
    def InferTensorRunAndCompare(self, model, expected_uninferred_blobs=None):
        '''
        Runs shape inference, and then the model to check
        that the inferred shapes agree with the actual ones

        'expected_uninferred_blobs' is the list of blobs for which type and
        shape cannot be inferred.
        '''
        (shapes, types) = workspace.InferShapesAndTypes(
            [model.param_init_net, model.net], )

        # .. Create net
        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net, True)
        workspace.RunNet(model.Proto().name)

        # ... and then check that the inferred shapes match the actual ones
        correct_shapes = {}
        correct_types = {}
        for b in workspace.Blobs():
            arr = workspace.FetchBlob(b)
            correct_shapes[b] = arr.shape
            if type(arr) is np.ndarray:
                if arr.dtype == np.dtype('float32'):
                    correct_types[b] = caffe2_pb2.TensorProto.FLOAT
                elif arr.dtype == np.dtype('int32'):
                    correct_types[b] = caffe2_pb2.TensorProto.INT32
                # BYTE
                # STRING
                elif arr.dtype == np.dtype('bool'):
                    correct_types[b] = caffe2_pb2.TensorProto.BOOL
                elif arr.dtype == np.dtype('uint8'):
                    correct_types[b] = caffe2_pb2.TensorProto.UINT8
                elif arr.dtype == np.dtype('int8'):
                    correct_types[b] = caffe2_pb2.TensorProto.INT8
                elif arr.dtype == np.dtype('uint16'):
                    correct_types[b] = caffe2_pb2.TensorProto.UINT16
                elif arr.dtype == np.dtype('int16'):
                    correct_types[b] = caffe2_pb2.TensorProto.INT16
                elif arr.dtype == np.dtype('int64'):
                    correct_types[b] = caffe2_pb2.TensorProto.INT64
                elif arr.dtype == np.dtype('float16'):
                    correct_types[b] = caffe2_pb2.TensorProto.FLOAT16
                elif arr.dtype == np.dtype('float64'):
                    correct_types[b] = caffe2_pb2.TensorProto.DOUBLE
                else:
                    correct_types[b] = "unknown {}".format(arr.dtype)
            else:
                correct_types[b] = str(type(arr))

        if expected_uninferred_blobs is None:
            expected_uninferred_blobs = []
        for b in correct_shapes:
            # skip blobs for which shape couldn't be inferred
            if b in expected_uninferred_blobs:
                continue
            self.assertTrue(
                np.array_equal(
                    np.array(shapes[b]).astype(np.int32),
                    np.array(correct_shapes[b]).astype(np.int32)),
                "Shape {} mismatch: {} vs. correct {}".format(
                    b, shapes[b], correct_shapes[b]))
            self.assertFalse(
                b not in types and b in correct_types,
                "Type for {} not defined".format(b),
            )
            self.assertEqual(
                types[b], correct_types[b],
                "Type {} mismatch: {} vs. {}".format(
                    b,
                    types[b],
                    correct_types[b],
                ))
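A possible usage sketch for the helper above (hypothetical test method; blob names and dimensions are made up for illustration):

    def test_fc_shape_inference(self):
        # Every blob below is produced by a fill op or FC, so its shape and
        # type should be inferable and must match what the run produces.
        from caffe2.python import brew, model_helper  # assumed imports
        model = model_helper.ModelHelper(name="infer_check")
        model.param_init_net.GaussianFill([], "data", shape=[8, 4], mean=0.0, std=1.0)
        brew.fc(model, "data", "fc1", dim_in=4, dim_out=2)
        self.InferTensorRunAndCompare(model)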
Example #20
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            ('id_score_pairs',
             Map(
                 Scalar(np.int32),
                 Map(Scalar(np.int64),
                     Scalar(np.float32),
                     keys_name='ids',
                     values_name='scores'),
             )),
            # additional scalar information
            ('metadata',
             Struct(
                 ('user_id', Scalar(np.int64)),
                 ('user_embed', Scalar((np.float32, 2))),
                 ('query', Scalar(str)),
             )),
        )
        """
        This is what the flattened fields for this schema look like, along
        with their types. Each one of these fields will be stored, read and
        written as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(expected_fields, schema.field_names(),
                     schema.field_types())
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The dataset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2,
             32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        with core.NameScope('init'):
            ds.init_empty(net)

            content_blobs = NewRecord(net, contents)
            FeedRecord(content_blobs, contents)
            writer = ds.writer(init_net=net)
            writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run `read` net multiple times and assert that we are reading the
        entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net, True)

        for entry in entries:
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all operations on the data are expressible as Caffe2 operators,
        we don't need to load the data into Python and can instead iterate
        through the dataset in a single Plan.

        Here we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter but will add it for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
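        # The processing plan added line_no * 1000 to each float key
        # (1000 for the 1st entry, 2000 for the 2nd, 3000 for the 3rd), so
        # subtract those offsets back before comparing against `contents`.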
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        workspace.RunNet(str(read_next_net))
        self.assertEquals(True, workspace.FetchBlob(should_stop))
        """
        8. Random Access a dataset with loop_over = true

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for _ in range(len(entries) * 3):
            workspace.RunNet(str(read_next_net))
            self.assertEquals(False, workspace.FetchBlob(should_stop))
        """
        9. Sort and shuffle a dataset

        This sorts the dataset using the values of a certain column, then
        shuffles within each chunk of size batch_size * shuffle_size before
        shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        Trim a dataset
        """
        trim_net = core.Net('trim_ds')
        ds.trim(trim_net, multiple_of=2)
        workspace.RunNetOnce(trim_net)
        trimmed = FetchRecord(ds.content())
        EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
        actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
        self.assertEquals(EXPECTED_SIZES, actual_sizes)
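_assert_records_equal is used throughout the example above but not shown; a hedged sketch of such a comparison helper (assuming both records share the same schema, so their flattened field blobs line up) is:

def _assert_records_equal(actual, expected):
    for a, e in zip(actual.field_blobs(), expected.field_blobs()):
        a, e = np.asarray(a), np.asarray(e)
        if a.dtype.kind in ('S', 'U', 'O'):
            np.testing.assert_array_equal(a, e)
        else:
            np.testing.assert_allclose(a, e, rtol=1e-4, atol=1e-4)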
Example #21
    def testEqualToCudnn(self):
        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType)):
            T = 8
            batch_size = 4
            input_dim = 8
            hidden_dim = 31

            workspace.FeedBlob(
                "seq_lengths",
                np.array([T] * batch_size, dtype=np.int32)
            )
            workspace.FeedBlob("target", np.zeros(
                [T, batch_size, hidden_dim], dtype=np.float32
            ))
            workspace.FeedBlob("hidden_init", np.zeros(
                [1, batch_size, hidden_dim], dtype=np.float32
            ))
            workspace.FeedBlob("cell_init", np.zeros(
                [1, batch_size, hidden_dim], dtype=np.float32
            ))

            own_model = model_helper.ModelHelper(name="own_lstm")

            input_shape = [T, batch_size, input_dim]
            cudnn_model = model_helper.ModelHelper(name="cudnn_lstm")
            input_blob = cudnn_model.param_init_net.UniformFill(
                [], "input", shape=input_shape)
            workspace.FeedBlob("CUDNN/hidden_init_cudnn", np.zeros(
                [1, batch_size, hidden_dim], dtype=np.float32
            ))
            workspace.FeedBlob("CUDNN/cell_init_cudnn", np.zeros(
                [1, batch_size, hidden_dim], dtype=np.float32
            ))

            cudnn_output, cudnn_last_hidden, cudnn_last_state, param_extract = rnn_cell.cudnn_LSTM(
                model=cudnn_model,
                input_blob=input_blob,
                initial_states=("hidden_init_cudnn", "cell_init_cudnn"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="CUDNN",
                return_params=True,
            )
            cudnn_loss = cudnn_model.AveragedLoss(
                cudnn_model.SquaredL2Distance(
                    [cudnn_output, "target"], "CUDNN/dist"
                ), "CUDNN/loss"
            )

            own_output, own_last_hidden, _, own_last_state, own_params = rnn_cell.LSTM(
                model=own_model,
                input_blob=input_blob,
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="OWN",
                return_params=True,
            )
            own_loss = own_model.AveragedLoss(
                own_model.SquaredL2Distance([own_output, "target"], "OWN/dist"),
                "OWN/loss"
            )

            # Add gradients
            cudnn_model.AddGradientOperators([cudnn_loss])
            own_model.AddGradientOperators([own_loss])

            # Add parameter updates
            LR = cudnn_model.param_init_net.ConstantFill(
                [], shape=[1], value=0.01
            )
            ONE = cudnn_model.param_init_net.ConstantFill(
                [], shape=[1], value=1.0
            )
            for param in cudnn_model.GetParams():
                cudnn_model.WeightedSum(
                    [param, ONE, cudnn_model.param_to_grad[param], LR], param
                )
            for param in own_model.GetParams():
                own_model.WeightedSum(
                    [param, ONE, own_model.param_to_grad[param], LR], param
                )

            # Copy states over
            own_model.net.Copy(own_last_hidden, "hidden_init")
            own_model.net.Copy(own_last_state, "cell_init")
            cudnn_model.net.Copy(cudnn_last_hidden, "CUDNN/hidden_init_cudnn")
            cudnn_model.net.Copy(cudnn_last_state, "CUDNN/cell_init_cudnn")

            workspace.RunNetOnce(cudnn_model.param_init_net)
            workspace.CreateNet(cudnn_model.net)

            ##
            ##  CUDNN LSTM MODEL EXECUTION
            ##
            # Get initial values from CuDNN LSTM so we can feed them
            # to our own.
            (param_extract_net, param_extract_mapping) = param_extract
            workspace.RunNetOnce(param_extract_net)
            cudnn_lstm_params = {
                input_type: {
                    k: workspace.FetchBlob(v[0])
                    for k, v in viewitems(pars)
                }
                for input_type, pars in viewitems(param_extract_mapping)
            }

            # Run the model 3 times, so that some parameter updates are done
            workspace.RunNet(cudnn_model.net.Proto().name, 3)

            ##
            ## OWN LSTM MODEL EXECUTION
            ##
            # Map the cuDNN parameters to our own
            workspace.RunNetOnce(own_model.param_init_net)
            rnn_cell.InitFromLSTMParams(own_params, cudnn_lstm_params)

            # Run the model 3 times, so that some parameter updates are done
            workspace.CreateNet(own_model.net)
            workspace.RunNet(own_model.net.Proto().name, 3)

            ##
            ## COMPARE RESULTS
            ##
            # Then compare that final results after 3 runs are equal
            own_output_data = workspace.FetchBlob(own_output)
            own_last_hidden = workspace.FetchBlob(own_last_hidden)
            own_loss = workspace.FetchBlob(own_loss)

            cudnn_output_data = workspace.FetchBlob(cudnn_output)
            cudnn_last_hidden = workspace.FetchBlob(cudnn_last_hidden)
            cudnn_loss = workspace.FetchBlob(cudnn_loss)

            self.assertTrue(np.allclose(own_output_data, cudnn_output_data))
            self.assertTrue(np.allclose(own_last_hidden, cudnn_last_hidden))
            self.assertTrue(np.allclose(own_loss, cudnn_loss))
Example #22
    def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim,
                                     batch_size, max_weight):
        workspace.ResetWorkspace()
        np.random.seed(seed)
        data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
        data = data * 1e-3

        lengths = np.random.choice(np.arange(1, num_rows),
                                   batch_size).astype(np.int32)
        indices = []
        for length in lengths:
            indices.extend(np.random.choice(np.arange(1, num_rows), length))
        indices = np.asarray(indices).astype(np.int64)

        weights = np.random.uniform(
            low=0, high=max_weight, size=[len(indices)]).astype(
                np.float32) - max_weight / 2.0
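        # The shift above centers the weights, making them uniform in
        # [-max_weight / 2, max_weight / 2].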
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused4BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))

        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused4BitRowwiseQuantized", ["data"],
                                ["quantized_data"]))

        pred_net_onnxified = onnxifi_caffe2_net(pred_net, {},
                                                max_batch_size=batch_size,
                                                max_seq_size=np.max(lengths),
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)

        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)

        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(ref_net)

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob('Y')

        workspace.RunNet(ref_net.name)
        Y_c2 = workspace.FetchBlob('Y')

        if not np.allclose(Y_c2, Y_glow):
            print_test_debug_info(
                "slws_fused_4bit_rowwise", {
                    "seed": seed,
                    "indices": indices,
                    "data": data.shape,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_c2": Y_c2.shape,
                    "Y_glow": Y_glow.shape,
                    "diff": Y_glow - Y_c2,
                    "rowwise_diff": (Y_glow - Y_c2)[:, 0]
                })
            assert (0)
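For reference, the semantics of the SparseLengthsWeightedSum family used above can be written in a few lines of NumPy (a sketch over unquantized float data; the fused operators additionally apply row-wise (de)quantization):

def sls_weighted_sum_reference(data, weights, indices, lengths):
    # One output row per segment: the weighted sum of the gathered data rows.
    out, pos = [], 0
    for seg_len in lengths:
        idx = indices[pos:pos + seg_len]
        w = weights[pos:pos + seg_len]
        out.append((w[:, None] * data[idx]).sum(axis=0))
        pos += seg_len
    return np.asarray(out, dtype=data.dtype)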
Example #23
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"),
                np.array([self.T] * self.batch_per_device, dtype=np.int32))
            model.param_init_net.ConstantFill(
                [],
                "hidden_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim])
            model.param_init_net.ConstantFill(
                [],
                "cell_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim])

            output, _last_hidden, _, _last_state, = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            loss = model.AveragedLoss(
                model.Sub([output, "target"], "dist"),
                "loss",
            )
            loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, param_grad, LR], param)

            assert len(
                model.GetParams()) == len(model.params) // len(model._devices)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices), )

        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(devices)

        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=devices,
            optimize_gradient_memory=True,
            cpu_device=not gpu,
        )

        # Change all initializations to ConstantFill so that
        # everything is deterministic
        for op in model.param_init_net.Proto().op:
            if op.type.endswith('Fill'):
                op.type = 'ConstantFill'

        # Each run has same input, independent of number of gpus
        np.random.seed(20150210)
        for i in range(0, 10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(self.T, self.batch_size,
                                         self.hidden_dim)

            for (j, g) in enumerate(devices):
                st = j * self.batch_per_device
                en = st + self.batch_per_device
                data = full_data[:, st:en, :].astype(np.float32)
                targets = full_target[:, st:en, :].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/target".format(model._device_prefix, g),
                        targets)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("{}_0/partest/i2h_w".format(
            model._device_prefix))
Example #24
    def test_slws_fused_4bit_rowwise_all_same(self, seed):
        np.random.seed(seed)
        workspace.ResetWorkspace()
        n = 1
        m = 2
        data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
        max_segments = 5
        max_segment_length = 100
        num_lengths = np.random.randint(1, max_segments + 1)
        # number of segments to run
        lengths = np.random.randint(0,
                                    max_segment_length + 1,
                                    size=num_lengths).astype(np.int32)
        num_indices = np.sum(lengths)
        indices = np.zeros(num_indices, dtype=np.int64)
        weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)])\
            .astype(np.float32)
        weights = np.ones(len(indices)).astype(np.float32)
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused4BitRowwise",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))
        ref_net = caffe2_pb2.NetDef()
        ref_net.name = "ref"
        ref_net.external_input.extend(
            ["quantized_data", "weights", "indices", "lengths"])
        ref_net.external_output.append("Y")
        ref_net.op.add().CopyFrom(
            core.CreateOperator(
                "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
                ["quantized_data", "weights", "indices", "lengths"],
                ["Y"],
            ))
        workspace.FeedBlob("data", data)
        workspace.RunOperatorOnce(
            core.CreateOperator("FloatToFused4BitRowwiseQuantized", ['data'],
                                ['quantized_data']))
        print("quantized", workspace.FetchBlob("quantized_data"))
        pred_net_onnxified = onnxifi_caffe2_net(pred_net, {},
                                                max_batch_size=max_segments,
                                                max_seq_size=max_segments *
                                                max_segment_length,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)
        workspace.FeedBlob("weights", weights)
        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(ref_net)
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob('Y')
        workspace.RunNet(ref_net.name)
        Y_c2 = workspace.FetchBlob('Y')
        if not np.allclose(Y_c2, Y_glow):
            print_test_debug_info(
                "slws_fused_4bit_rowwise", {
                    "seed": seed,
                    "indices": indices,
                    "data": data,
                    "lengths": lengths,
                    "weights": weights,
                    "Y_c2": Y_c2,
                    "Y_glow": Y_glow,
                    "diff": Y_glow - Y_c2,
                    "rowwise_diff": (Y_glow - Y_c2)[:, 0]
                })
            assert (0)
Example #25
    def run_model(self, V, gpu_devices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec,
                    'gpuvec_{}'.format(num),
                )
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'], ['gpu_vec_gathered_{}'.format(num)])
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
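                # Dense gradients get a plain WeightedSum update; Gather
                # produces a GradientSlice (indices + values), which needs
                # ScatterWeightedSum so only the touched rows are updated.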
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill([],
                                                     "vec_{}".format(num),
                                                     shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_2", shape=[batch_per_device, 16]))
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size)
            full_labels = full_indices[:] % batch_per_device

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(vec, orig_vec)
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format(
                    g, num)) for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.
                    format(len(idx), len(grad_slice)))
Example #26
pred = m.net.Sigmoid(fc_1, "pred")
softmax, loss = m.net.SoftmaxWithLoss([pred, "label"], ["softmax", "loss"])

# print(m.net.Proto())
# print(m.param_init_net.Proto())

# save the model as graph
graph = net_drawer.GetPydotGraph(m.net, rankdir="BT")
graph.write_png("hello.png")

# init, create and run
m.AddGradientOperators([loss])  # add gradient
# print(m.net.Proto())          # observe gradient

workspace.RunNetOnce(m.param_init_net)
workspace.CreateNet(m.net)

for ii in range(100):
    data = np.random.rand(16, 100).astype(np.float32)
    label = (np.random.rand(16) * 10).astype(np.int32)

    workspace.FeedBlob("data", data)
    workspace.FeedBlob("label", label)

    workspace.RunNet(m.name, 10)  # run for 10 times
    # print("Run: ", ii)

# save the model with grad
graph = net_drawer.GetPydotGraph(m.net, rankdir="BT")
graph.write_png("hello_with_grad.png")
Example #27
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
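
A minimal sketch of the epoch-recovery convention assumed above: checkpoints are taken to be named "<file_store_path>/<save_model_name>_<epoch>.mdl" (consistent with the cleanup path in the training loop), so the epoch can be parsed back from the filename suffix. The helper below is illustrative only, not part of the trainer:

    def epoch_from_checkpoint(path):
        # "resnet50_3.mdl" -> "3.mdl" -> 3; any other suffix -> None
        last = path.split('_')[-1]
        return int(last[:-4]) if last.endswith('.mdl') else None

    assert epoch_from_checkpoint("/models/resnet50_3.mdl") == 3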
Example #28
    def testIf(self):
        W_a_values = [2.0, 1.5]
        B_a_values = [0.5]
        W_b_values = [7.0, 3.5]
        B_b_values = [1.5]

        with NetBuilder(_use_control_ops=True) as init_nb:
            W_a = ops.UniformFill([], "W_a", shape=[1, 2], min=-1., max=1.)
            B_a = ops.ConstantFill([], "B_a", shape=[1], value=0.0)
            W_b = ops.UniformFill([], "W_b", shape=[1, 2], min=-1., max=1.)
            B_b = ops.ConstantFill([], "B_b", shape=[1], value=0.0)

            W_gt_a = ops.GivenTensorFill([],
                                         "W_gt_a",
                                         shape=[1, 2],
                                         values=W_a_values)
            B_gt_a = ops.GivenTensorFill([],
                                         "B_gt_a",
                                         shape=[1],
                                         values=B_a_values)
            W_gt_b = ops.GivenTensorFill([],
                                         "W_gt_b",
                                         shape=[1, 2],
                                         values=W_b_values)
            B_gt_b = ops.GivenTensorFill([],
                                         "B_gt_b",
                                         shape=[1],
                                         values=B_b_values)

        params = [W_gt_a, B_gt_a, W_a, B_a, W_gt_b, B_gt_b, W_b, B_b]

        with NetBuilder(_use_control_ops=True,
                        initial_scope=params) as train_nb:
            Y_pred = ops.ConstantFill([], "Y_pred", shape=[1], value=0.0)
            Y_noise = ops.ConstantFill([], "Y_noise", shape=[1], value=0.0)

            switch = ops.UniformFill([],
                                     "switch",
                                     shape=[1],
                                     min=-1.,
                                     max=1.,
                                     run_once=0)
            zero = ops.ConstantFill([], "zero", shape=[1], value=0.0)
            X = ops.GaussianFill([],
                                 "X",
                                 shape=[4096, 2],
                                 mean=0.0,
                                 std=1.0,
                                 run_once=0)
            noise = ops.GaussianFill([],
                                     "noise",
                                     shape=[4096, 1],
                                     mean=0.0,
                                     std=1.0,
                                     run_once=0)

            with ops.IfNet(ops.LT([switch, zero])):
                Y_gt = ops.FC([X, W_gt_a, B_gt_a], "Y_gt")
                ops.Add([Y_gt, noise], Y_noise)
                ops.FC([X, W_a, B_a], Y_pred)
            with ops.Else():
                Y_gt = ops.FC([X, W_gt_b, B_gt_b], "Y_gt")
                ops.Add([Y_gt, noise], Y_noise)
                ops.FC([X, W_b, B_b], Y_pred)

            dist = ops.SquaredL2Distance([Y_noise, Y_pred], "dist")
            loss = dist.AveragedLoss([], ["loss"])

        assert len(init_nb.get()) == 1, "Expected a single init net produced"
        assert len(train_nb.get()) == 1, "Expected a single train net produced"

        train_net = train_nb.get()[0]
        gradient_map = train_net.AddGradientOperators([loss])

        init_net = init_nb.get()[0]
        ITER = init_net.ConstantFill([],
                                     "ITER",
                                     shape=[1],
                                     value=0,
                                     dtype=core.DataType.INT32)
        train_net.Iter(ITER, ITER)
        LR = train_net.LearningRate(ITER,
                                    "LR",
                                    base_lr=-0.1,
                                    policy="step",
                                    stepsize=20,
                                    gamma=0.9)
        ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
        train_net.WeightedSum([W_a, ONE, gradient_map[W_a], LR], W_a)
        train_net.WeightedSum([B_a, ONE, gradient_map[B_a], LR], B_a)
        train_net.WeightedSum([W_b, ONE, gradient_map[W_b], LR], W_b)
        train_net.WeightedSum([B_b, ONE, gradient_map[B_b], LR], B_b)
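        # Note: base_lr is negative above because WeightedSum implements an
        # in-place update param <- 1 * param + LR * grad; with a negative LR
        # this subtracts |LR| * grad, i.e. plain SGD descent.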

        workspace.RunNetOnce(init_net)
        workspace.CreateNet(train_net)
        # print("Before training, W_a is: {}".format(workspace.FetchBlob("W_a")))
        # print("Before training, B_a is: {}".format(workspace.FetchBlob("B_a")))
        # print("Before training, W_b is: {}".format(workspace.FetchBlob("W_b")))
        # print("Before training, B_b is: {}".format(workspace.FetchBlob("B_b")))

        for _epoch in range(1000):
            workspace.RunNet(train_net.Proto().name)

        # print("After training, W_a is: {}".format(workspace.FetchBlob("W_a")))
        # print("After training, B_a is: {}".format(workspace.FetchBlob("B_a")))
        # print("After training, W_b is: {}".format(workspace.FetchBlob("W_b")))
        # print("After training, B_b is: {}".format(workspace.FetchBlob("B_b")))
        # print("Ground truth W_a is: {}".format(workspace.FetchBlob("W_gt_a")))
        # print("Ground truth B_a is: {}".format(workspace.FetchBlob("B_gt_a")))
        # print("Ground truth W_b is: {}".format(workspace.FetchBlob("W_gt_b")))
        # print("Ground truth B_b is: {}".format(workspace.FetchBlob("B_gt_b")))

        values_map = {
            "W_a": W_a_values,
            "B_a": B_a_values,
            "W_b": W_b_values,
            "B_b": B_b_values,
        }

        train_eps = 0.01

        for blob_name, values in values_map.items():
            trained_values = workspace.FetchBlob(blob_name)
            if trained_values.ndim == 2:
                self.assertEqual(trained_values.shape[0], 1)
                trained_values = trained_values[0][:]
            else:
                self.assertEqual(trained_values.ndim, 1)

            self.assertEqual(trained_values.size, len(values))
            for idx in range(len(trained_values)):
                self.assertTrue(
                    abs(trained_values[idx] - values[idx]) < train_eps)
Example #29
    def TrainModel(self):
        log.debug("Training model")

        workspace.RunNetOnce(self.model.param_init_net)

        # As though we predicted the same probability for each character
        smooth_loss = -np.log(1.0 / self.D) * self.seq_length
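        # Illustrative values (not from this run): with a vocabulary of D = 26
        # characters and seq_length = 25, smooth_loss starts at
        # -log(1/26) * 25 ≈ 81.45, the loss of a uniform prediction.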
        last_n_iter = 0
        last_n_loss = 0.0
        num_iter = 0
        N = len(self.text)

        # We split the text into batch_size pieces. Each piece will only be used
        # by its corresponding batch during the training process (a small worked
        # example follows below).
        text_block_positions = np.zeros(self.batch_size, dtype=np.int32)
        text_block_size = N // self.batch_size
        text_block_starts = range(0, N, text_block_size)
        text_block_sizes = [text_block_size] * self.batch_size
        text_block_sizes[self.batch_size - 1] += N % self.batch_size
        assert sum(text_block_sizes) == N
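        # Worked example (illustrative values): with N = 10 and batch_size = 3,
        # text_block_size = 3, text_block_starts = [0, 3, 6, 9] (only the first
        # batch_size entries are used) and text_block_sizes = [3, 3, 4], which
        # sums back to N.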

        # Writing to output states which will be copied to input
        # states within the loop below
        workspace.FeedBlob(
            self.hidden_output,
            np.zeros([1, self.batch_size, self.hidden_size], dtype=np.float32))
        workspace.FeedBlob(
            self.cell_state,
            np.zeros([1, self.batch_size, self.hidden_size], dtype=np.float32))
        workspace.CreateNet(self.prepare_state)

        # We iterate over the text in a loop many times. Each time we pick a
        # seq_length segment and feed it to the LSTM as a sequence.
        last_time = datetime.now()
        progress = 0
        while True:
            workspace.FeedBlob(
                "seq_lengths",
                np.array([self.seq_length] * self.batch_size, dtype=np.int32))
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros([self.seq_length, self.batch_size,
                              self.D]).astype(np.float32)
            target = np.zeros([self.seq_length * self.batch_size
                               ]).astype(np.int32)

            for e in range(self.batch_size):
                for i in range(self.seq_length):
                    pos = text_block_starts[e] + text_block_positions[e]
                    input[i][e][self._idx_at_pos(pos)] = 1
                    target[i * self.batch_size + e] =\
                        self._idx_at_pos((pos + 1) % N)
                    text_block_positions[e] = (text_block_positions[e] +
                                               1) % text_block_sizes[e]
                    progress += 1

            workspace.FeedBlob('input_blob', input)
            workspace.FeedBlob('target', target)

            CreateNetOnce(self.model.net)
            workspace.RunNet(self.model.net.Name())

            num_iter += 1
            last_n_iter += 1

            if num_iter % self.iters_to_report == 0:
                new_time = datetime.now()
                print("Characters Per Second: {}".format(
                    int(progress / (new_time - last_time).total_seconds())))
                print("Iterations Per Second: {}".format(
                    int(self.iters_to_report /
                        (new_time - last_time).total_seconds())))

                last_time = new_time
                progress = 0

                print("{} Iteration {} {}".format('-' * 10, num_iter,
                                                  '-' * 10))

            loss = workspace.FetchBlob(self.loss) * self.seq_length
            smooth_loss = 0.999 * smooth_loss + 0.001 * loss
            last_n_loss += loss

            if num_iter % self.iters_to_report == 0:
                self.GenerateText(500, np.random.choice(self.vocab))

                log.debug("Loss since last report: {}".format(last_n_loss /
                                                              last_n_iter))
                log.debug("Smooth loss: {}".format(smooth_loss))

                last_n_loss = 0.0
                last_n_iter = 0
Example #30
    def from_trainers(cls, trainer, features, actions,
                      normalization_parameters):
        """ Creates DiscreteActionPredictor from a list of action trainers

        :param trainer DiscreteActionTrainer
        :param features list of state feature names
        :param actions list of action names
        """
        int_features = [int(feature) for feature in features]
        inputs = [
            'input/float_features.lengths', 'input/float_features.keys',
            'input/float_features.values'
        ]
        workspace.FeedBlob('input/float_features.lengths',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.keys',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.values',
                           np.zeros(1, dtype=np.float32))
        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        dense_input = net.NextBlob('dense_input')
        workspace.FeedBlob(dense_input, np.zeros(1, dtype=np.float32))
        default_input_value = net.NextBlob('default_input_value')
        workspace.FeedBlob(default_input_value,
                           np.array([MISSING_VALUE], dtype=np.float32))
        net.GivenTensorFill([], [default_input_value],
                            shape=[],
                            values=[MISSING_VALUE])
        net.SparseToDenseMask([
            'input/float_features.keys',
            'input/float_features.values',
            default_input_value,
            'input/float_features.lengths',
        ], [dense_input],
                              mask=int_features)
        for i, feature in enumerate(features):
            net.Slice(
                [dense_input],
                [feature],
                starts=[0, i],
                ends=[-1, (i + 1)],
            )
        normalizer = PreprocessorNet(net, True)
        parameters = list(normalizer.parameters[:])
        parameters.append(default_input_value)
        normalized_input_blobs = []
        zero = "ZERO_from_trainers"
        workspace.FeedBlob(zero, np.array(0))
        parameters.append(zero)
        for feature in features:
            normalized_input_blob, blob_parameters = normalizer.preprocess_blob(
                feature,
                normalization_parameters[feature],
            )
            parameters.extend(blob_parameters)
            normalized_input_blobs.append(normalized_input_blob)

        concatenated_input_blob = "PredictorInput"
        output_dim = "PredictorOutputDim"
        for i, inp in enumerate(normalized_input_blobs):
            logger.info("input# {}: {}".format(i, inp))
        net.Concat(normalized_input_blobs,
                   [concatenated_input_blob, output_dim],
                   axis=1)
        net.NanCheck(concatenated_input_blob, concatenated_input_blob)

        q_values = "q_values"
        workspace.FeedBlob(q_values, np.zeros(1, dtype=np.float32))
        trainer.build_predictor(model, concatenated_input_blob, q_values)
        parameters.extend(model.GetAllParams())

        action_names = net.NextBlob("action_names")
        parameters.append(action_names)
        workspace.FeedBlob(action_names, np.array(actions))
        action_range = net.NextBlob("action_range")
        parameters.append(action_range)
        workspace.FeedBlob(action_range, np.array(list(range(len(actions)))))

        output_shape = net.NextBlob("output_shape")
        workspace.FeedBlob(output_shape, np.zeros(1, dtype=np.int64))
        net.Shape([q_values], [output_shape])
        output_shape_row_count = net.NextBlob("output_shape_row_count")
        net.Slice([output_shape], [output_shape_row_count],
                  starts=[0],
                  ends=[1])

        output_row_shape = net.NextBlob("output_row_shape")
        workspace.FeedBlob(output_row_shape, np.zeros(1, dtype=np.int64))
        net.Slice([q_values], [output_row_shape], starts=[0, 0], ends=[-1, 1])

        output_feature_keys = 'output/string_weighted_multi_categorical_features.keys'
        workspace.FeedBlob(output_feature_keys, np.zeros(1, dtype=np.int64))
        output_feature_keys_matrix = net.NextBlob('output_feature_keys_matrix')
        net.ConstantFill([output_row_shape], [output_feature_keys_matrix],
                         value=0,
                         dtype=caffe2_pb2.TensorProto.INT64)
        net.FlattenToVec(
            [output_feature_keys_matrix],
            [output_feature_keys],
        )

        output_feature_lengths = \
            'output/string_weighted_multi_categorical_features.lengths'
        workspace.FeedBlob(output_feature_lengths, np.zeros(1, dtype=np.int32))
        output_feature_lengths_matrix = net.NextBlob(
            'output_feature_lengths_matrix')
        net.ConstantFill([output_row_shape], [output_feature_lengths_matrix],
                         value=1,
                         dtype=caffe2_pb2.TensorProto.INT32)
        net.FlattenToVec(
            [output_feature_lengths_matrix],
            [output_feature_lengths],
        )

        output_keys = 'output/string_weighted_multi_categorical_features.values.keys'
        workspace.FeedBlob(output_keys, np.array(['a']))
        net.Tile([action_names, output_shape_row_count], [output_keys], axis=1)

        output_lengths_matrix = net.NextBlob('output_lengths_matrix')
        net.ConstantFill([output_row_shape], [output_lengths_matrix],
                         value=len(actions),
                         dtype=caffe2_pb2.TensorProto.INT32)
        output_lengths = \
            'output/string_weighted_multi_categorical_features.values.lengths'
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        net.FlattenToVec(
            [output_lengths_matrix],
            [output_lengths],
        )

        output_values = \
            'output/string_weighted_multi_categorical_features.values.values'
        workspace.FeedBlob(output_values, np.array([1.0]))
        net.FlattenToVec([q_values], [output_values])

        output_blobs = [
            output_feature_keys,
            output_feature_lengths,
            output_keys,
            output_lengths,
            output_values,
        ]
        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(net)
        predictor = cls(net, inputs, output_blobs, parameters,
                        workspace.CurrentWorkspace())
        return predictor
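
For context, the output blobs constructed above encode the per-row q-values as a ragged, string-keyed structure: each input row contributes len(actions) entries, with the action names tiled as keys, a constant per-row length, and the flattened q-values as weights. A minimal NumPy sketch of that layout, assuming two actions and a single input row (the numbers are illustrative, not produced by this predictor):

    import numpy as np

    actions = np.array(['left', 'right'])
    q_values = np.array([[0.2, 0.7]], dtype=np.float32)   # shape: (rows, num_actions)

    values_keys = np.tile(actions, q_values.shape[0])     # ['left', 'right'] per row
    values_lengths = np.full(q_values.shape[0], len(actions), dtype=np.int32)  # [2]
    values_values = q_values.flatten()                    # [0.2, 0.7]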