Code Example #1
def optimize_gradient_memory_resnet(model, loss):
    model.net._net = memonger.share_grad_blobs(
        model.net,
        loss,
        set(model.param_to_grad.values()),
        namescope="",
        share_activations=False)
Code Example #2
def OptimizeGradientMemory(model,
                           input_shapes,
                           excluded_blobs,
                           recycle_activations):
    """
    Optimize memory usage of the backward pass by recycling blobs for gradient
    inputs that have been 'used'.
    input_shapes:  dict of blob name to shape for the inputs of the model.
                   Pass empty dictionary if not known.
    excluded_blobs: list of blobs that cannot be recycled. These are blobs
                   that you will access externally.
    recycle_activations: whether to also recycle forward pass activations
    """
    input_shapes_all_devices = {}
    for b, shp in input_shapes.items():
        for d in model._devices:
            input_shapes_all_devices["gpu_{}/{}".format(d, b)] = shp

    (shapes, types) = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        input_shapes_all_devices,
    )

    for device in model._devices:
        namescope = "gpu_{}/".format(device)
        excluded_blobs_by_device = set([namescope + b for b in excluded_blobs])
        model.net._net = memonger.share_grad_blobs(
            model.net,
            model._losses_by_gpu[device],
            set(model.param_to_grad.values()),
            namescope,
            dont_share_blobs=excluded_blobs_by_device,
            share_activations=recycle_activations,
            blob_shapes=shapes,
        )
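For orientation, here is a hedged sketch of how this helper might be called; the shapes and blob names are illustrative assumptions, not taken from any of the projects listed here.

# Hypothetical invocation (values are illustrative only). Per-device name
# scoping is handled inside the helper, so blob names are passed unscoped.
OptimizeGradientMemory(
    model,
    input_shapes={"data": [32, 3, 224, 224]},  # pass {} if shapes are unknown
    excluded_blobs=["data", "label"],          # blobs read externally; never recycled
    recycle_activations=False,                 # True also recycles forward activations
)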
Code Example #3
File: Multi-GPU_Training.py  Project: Yangqing/caffe2
def optimize_gradient_memory(model, loss):
    model.net._net = memonger.share_grad_blobs(
        model.net,
        loss,
        set(model.param_to_grad.values()),
        namescope="imonaboat",
        share_activations=False,
        )
Code Example #4
File: mode_test.py  Project: Randy-jun/VOC-tools
def OptimizeGradientMemory(model, loss):
    model.net._net = memonger.share_grad_blobs(
        model.net,
        loss,
        set(model.param_to_grad.values()),
        namescope="test",
        share_activations=False,
    )
Code Example #5
File: memonger_test.py  Project: tigerdavid/caffe2
    def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
        m = cnn.CNNModelHelper()
        with core.NameScope("name_x"):
            fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = m.FC(fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")
        input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])

        def count_blobs(proto):
            blob_set = set()
            for op in proto.op:
                for inp in op.input:
                    blob_set.add(inp)
                for outp in op.output:
                    blob_set.add(outp)
            return len(blob_set)

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss1", "name_x/loss2"],
            set(m.param_to_grad.values()),
            "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        print(str(optim_proto))

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
        np.testing.assert_almost_equal(grad, optimized_grad)
Code Example #6
def _OptimizeGradientMemory(model, losses_by_gpu, devices):
    for device in devices:
        namescope = "gpu_{}/".format(device)
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses_by_gpu[device],
            set(model.param_to_grad.values()),
            namescope,
        )
Code Example #7
    def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
        m = cnn.CNNModelHelper()
        with core.NameScope("name_x"):
            fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = m.FC(fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")
        input_to_grad = m.AddGradientOperators(
            ["name_x/loss1", "name_x/loss2"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss1", "name_x/loss2"],
            set(m.param_to_grad.values()),
            "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
            share_activations=True,
            dont_share_blobs=set([
                'name_x/fc6', 'name_x/fc5',
                str(input_to_grad["name_x/fc1_w"])
            ]),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)
        self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
        np.testing.assert_almost_equal(grad, optimized_grad)
Code Example #8
File: memonger_test.py  Project: Ralfhund/caffe2
    def test_rnn(self):
        from caffe2.python import rnn_cell
        T = 5
        model = model_helper.ModelHelper()
        seq_lengths, labels = \
            model.net.AddExternalInputs(
                'seq_lengths', 'labels',
            )
        init_blobs = []
        for i in range(2):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(i),
                "cell_init_{}".format(i)
            )
            init_blobs.extend([hidden_init, cell_init])
        model.param_init_net.ConstantFill([], ["input"], shape=[T, 4, 10])
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob="input",
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=10,
            dim_out=[10, 10],
            scope="lstm1",
            forward_only=False,
            drop_states=True,
            return_last_layer_only=True,
        )
        softmax, loss = model.net.SoftmaxWithLoss(
            [model.Flatten(output), "labels"],
            ['softmax', 'loss'],
        )

        model.AddGradientOperators([loss])
        blobs_before = count_blobs(model.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            model.net,
            ["loss"],
            set(viewvalues(model.param_to_grad)),
            "",
            share_activations=True,
            dont_share_blobs=set(),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Run once to see all blobs are set up correctly
        for init_blob in init_blobs:
            workspace.FeedBlob(init_blob, np.zeros(
                [1, 4, 10], dtype=np.float32
            ))
        workspace.FeedBlob("seq_lengths", np.array([T] * 4, dtype=np.int32))
        workspace.FeedBlob("labels", np.random.rand(T).astype(np.int32))

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)
Code Example #9
File: train.py  Project: ForrestHeiYing/Detectron
def optimize_memory(model):
    """Save GPU memory through blob sharing."""
    for device in range(cfg.NUM_GPUS):
        namescope = 'gpu_{}/'.format(device)
        losses = [namescope + l for l in model.losses]
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses,
            set(model.param_to_grad.values()),
            namescope,
            share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS)
Code Example #10
File: memonger_test.py  Project: Ralfhund/caffe2
    def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")
        input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss1", "name_x/loss2"],
            set(viewvalues(m.param_to_grad)),
            "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
            share_activations=True,
            dont_share_blobs=set(['name_x/fc6', 'name_x/fc5',
                                   str(input_to_grad["name_x/fc1_w"])]),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)
        self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
        np.testing.assert_almost_equal(grad, optimized_grad)
Code Example #11
def _OptimizeGradientMemorySimple(model, losses_by_gpu, devices):
    log.warning("------- DEPRECATED API, please use " +
                "data_parallel_model.OptimizeGradientMemory() ----- ")
    for device in devices:
        namescope = "gpu_{}/".format(device)
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses_by_gpu[device],
            set(model.param_to_grad.values()),
            namescope,
            share_activations=False,
        )
Code Example #12
File: train_net.py  Project: Alphonses/Detectron
def optimize_memory(model):
    """Save GPU memory through blob sharing."""
    for device in range(cfg.NUM_GPUS):
        namescope = 'gpu_{}/'.format(device)
        losses = [namescope + l for l in model.losses]
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses,
            set(model.param_to_grad.values()),
            namescope,
            share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS
        )
Code Example #13
    def test_memonger_mix_cpu_gpu(self):
        '''
        Check that memonger does not make blobs cross CPU/GPU boundary
        '''
        m = model_helper.ModelHelper()
        with core.DeviceScope(
                core.DeviceOption(
                    caffe2_pb2.HIP
                    if workspace.has_hip_support else caffe2_pb2.CUDA, 0)):
            fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2)
            fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu")
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2)
            fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2)
            fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2)
            fc7_cpu.Relu([], fc7_cpu) \
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        m.AddGradientOperators(["loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["loss"],
            set(viewvalues(m.param_to_grad)),
            "",
            share_activations=True,
            dont_share_blobs=set(),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Create set of blobs on CPU side and GPU side and check they don't
        # overlap
        if workspace.has_hip_support:
            device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.HIP: set()}
        else:
            device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()}
        for op in optim_proto.op:
            if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
                dev = op.device_option.device_type
                for b in list(op.input) + list(op.output):
                    device_blobs[dev].add(b)

        device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
            device_blobs[caffe2_pb2.HIP] if workspace.has_hip_support
            else device_blobs[caffe2_pb2.CUDA])
        self.assertEqual(device_crossers, set())
Code Example #14
    def optimize_gradient_memory(model, loss):
        """A naive implementation of memory optimization

        :param model_helper.ModelHelper model: Model to add update parameters operators for.
        :param list loss: A list of losses.
        """
        model.net._net = memonger.share_grad_blobs(
            model.net,
            loss,
            set(model.param_to_grad.values()),
            namescope='',
            share_activations=False,
        )
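For context, a sketch of where such a helper sits in a training script; the model and blob names below are placeholders rather than code from the projects above.

# Hypothetical call order (model construction elided; names are placeholders).
# The helper must run after gradient operators have been added, so the gradient
# blobs exist in the net proto, and before the net is created in the workspace.
train_model.AddGradientOperators([loss])
optimize_gradient_memory(train_model, [loss])
workspace.RunNetOnce(train_model.param_init_net)
workspace.CreateNet(train_model.net)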
Code Example #15
File: memonger_test.py  Project: Ralfhund/caffe2
    def test_memonger_mix_cpu_gpu(self):
        '''
        Check that memonger does not make blobs cross CPU/GPU boundary
        '''
        m = model_helper.ModelHelper()
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2)
            fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu")
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2)
            fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2)
            fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2)
            fc7_cpu.Relu([], fc7_cpu) \
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        m.AddGradientOperators(["loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["loss"],
            set(viewvalues(m.param_to_grad)),
            "",
            share_activations=True,
            dont_share_blobs=set(),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Create set of blobs on CPU side and GPU side and check they don't
        # overlap
        device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()}
        for op in optim_proto.op:
            if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
                dev = op.device_option.device_type
                for b in list(op.input) + list(op.output):
                    device_blobs[dev].add(b)

        device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
            device_blobs[caffe2_pb2.CUDA]
        )
        self.assertEqual(device_crossers, set())
Code Example #16
File: train_wsl.py  Project: zyg11/NA-fWebSOD
def optimize_memory(model):
    """Save GPU memory through blob sharing."""
    for device in range(cfg.NUM_GPUS):
        namescope = 'gpu_{}/'.format(device)
        losses = [namescope + l for l in model.losses]
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses,
            set(model.param_to_grad.values()),
            namescope,
            share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS)
        if cfg.WSL.DEEP_MEM:
            from detectron.utils.wsl_memonger import share_freeze_blobs
            model.net._net = share_freeze_blobs(
                model.net,
                namescope,
            )
Code Example #17
def optimize_memory_cpg(model, device):
    namescope = 'gpu_{}/'.format(device)
    dont_free_blobs = set([
        namescope + cfg.WSL.CPG_PRE_BLOB + '_grad',
        namescope + cfg.WSL.CPG_DATA_BLOB + '_grad'
    ])
    import detectron.utils.cpg_memonger as cpg_memonger
    model.net._net = cpg_memonger.deep_release_blobs_when_used(
        model.net._net, dont_free_blobs)
    return
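    # The function exits here; the alternative sharing strategies retained below
    # for reference are never executed.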

    namescope = 'gpu_{}/'.format(device)
    dont_share_blobs = set([
        namescope + cfg.WSL.CPG_PRE_BLOB + '_grad',
        namescope + cfg.WSL.CPG_DATA_BLOB + '_grad'
    ])
    import detectron.utils.cpg_memonger as cpg_memonger
    cpg_memonger.deep_share_blobs(
        model.net,
        namescope,
        dont_share_blobs=dont_share_blobs,
    )
    return

    optimize_memory_cpg_new(model, device)
    return
    """Save GPU memory through blob sharing."""
    # for device in range(cfg.NUM_GPUS):
    if device >= 0:
        namescope = 'gpu_{}/'.format(device)
        # it seems dont_share_blobs is not working here
        dont_share_blobs = set([
            namescope + cfg.WSL.CPG_PRE_BLOB + '_grad',
            namescope + cfg.WSL.CPG_DATA_BLOB + '_grad'
        ])
        losses = [namescope + l for l in model.losses]
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses,
            set(model.param_to_grad.values()),
            namescope,
            share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS,
            dont_share_blobs=dont_share_blobs,
        )
Code Example #18
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        m = cnn.CNNModelHelper()
        with core.NameScope("name_x"):
            fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5)\
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(m.param_to_grad.values()),
            "name_x/",
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("name_x/loss")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)
Code Example #19
def test_shared_grads(
    with_shapes,
    create_model,
    conv_blob,
    last_out_blob,
    data_blob='gpu_0/data',
    label_blob='gpu_0/label',
    num_labels=1000,
):
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput(data_blob)
        label = model.net.AddExternalInput(label_blob)
        (_softmax, loss) = create_model(
            model,
            data,
            num_input_channels=3,
            num_labels=num_labels,
            label=label,
            is_test=False,
        )

    param_to_grad = model.AddGradientOperators([loss])

    (shapes, types) = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        {
            data_blob: [4, 3, 227, 227],
            label_blob: [4]
        },
    )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],
        set(model.param_to_grad.values()),
        "gpu_0/",
        share_activations=True,
        dont_share_blobs=set([str(param_to_grad[conv_blob])]),
        blob_shapes=shapes if with_shapes else None,
    )
    count_after = count_blobs(optim_proto)

    # Run model and compare results. We check that the loss is same
    # and also that the final gradient (conv1_w_grad is same)
    workspace.RunNetOnce(model.param_init_net)
    data = np.random.rand(4, 3, 227, 227).astype(np.float32)
    label = (np.random.rand(4) * num_labels).astype(np.int32)

    workspace.FeedBlob(data_blob, data)
    workspace.FeedBlob(label_blob, label)

    workspace.RunNetOnce(model.net)
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob(last_out_blob)
    conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])
    workspace.FeedBlob(param_to_grad[conv_blob], np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)
    optim_conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])

    return [(count_after, count_before), (loss1, optimized_loss1),
            (conv1_w_grad, optim_conv1_w_grad)]
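A possible invocation of this parameterized helper, mirroring the ResNet-50 test shown later in this list; the resnet module path is an assumption.

# Hypothetical invocation; the import path is assumed and the blob names are
# taken from the ResNet-50 example below.
from caffe2.python.models import resnet

(counts, losses, grads) = test_shared_grads(
    with_shapes=True,
    create_model=resnet.create_resnet50,
    conv_blob="gpu_0/conv1_w",
    last_out_blob="gpu_0/last_out_L1000",
)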
Code Example #20
File: memonger_test.py  Project: Ralfhund/caffe2
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5)\
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(viewvalues(m.param_to_grad)),
            "name_x/",
            share_activations=False,
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        optim_proto_wacts = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(viewvalues(m.param_to_grad)),
            "name_x/",
            share_activations=True,
            dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]),
        )
        blobs_wact_optim = count_blobs(optim_proto_wacts)
        self.assertLessEqual(blobs_wact_optim, blobs_after)

        # Check that the last activations are not shared
        self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
        self.assertTrue(
            has_blob(optim_proto_wacts, "name_x/fc5"),
            "Dont remap final activation",
        )

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("name_x/loss")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        # Run with the forward optimization
        workspace.RunNetOnce(optim_proto_wacts)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)
Code Example #21
    def test_resnet_shared_grads(self, with_shapes, gc, dc):
        model = cnn.CNNModelHelper(
            order="NCHW",
            name="test",
            cudnn_exhaustive_search=True,
        )
        with core.NameScope("gpu_0"):
            data = model.net.AddExternalInput("gpu_0/data")
            label = model.net.AddExternalInput("gpu_0/label")
            (_softmax, loss) = resnet.create_resnet50(
                model,
                data,
                num_input_channels=3,
                num_labels=1000,
                label=label,
                is_test=False,
            )

        param_to_grad = model.AddGradientOperators([loss])

        (shapes, types) = workspace.InferShapesAndTypes(
            [model.param_init_net, model.net],
            {'gpu_0/data': [4, 3, 227, 227],
             'gpu_0/label': [4]},
        )

        count_before = count_blobs(model.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            model.net,
            ["gpu_0/loss"],
            set(model.param_to_grad.values()),
            "gpu_0/",
            share_activations=True,
            dont_share_blobs=set([str(param_to_grad["gpu_0/conv1_w"])]),
            blob_shapes=shapes if with_shapes else None,
        )
        count_after = count_blobs(optim_proto)
        self.assertTrue(count_after < count_before)

        # Run model and compare results. We check that the loss is same
        # and also that the final gradient (conv1_w_grad is same)
        workspace.RunNetOnce(model.param_init_net)
        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
        label = (np.random.rand(4) * 1000).astype(np.int32)

        workspace.FeedBlob("gpu_0/data", data)
        workspace.FeedBlob("gpu_0/label", label)

        workspace.RunNetOnce(model.net)
        model.net.Proto().type = 'dag'
        model.net.Proto().num_workers = 4
        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
        workspace.FeedBlob(param_to_grad["gpu_0/conv1_w"], np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        optim_conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])

        print("before: {} after: {}".format(count_before, count_after))

        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)
Code Example #22
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5)\
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(m.param_to_grad.values()),
            "name_x/",
            share_activations=False,
        )
        self.assertTrue(memonger.verify_graph_equality(m.Proto(), optim_proto))
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        optim_proto_wacts = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(m.param_to_grad.values()),
            "name_x/",
            share_activations=True,
            dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]),
        )
        self.assertTrue(
            memonger.verify_graph_equality(m.Proto(), optim_proto_wacts))
        blobs_wact_optim = count_blobs(optim_proto_wacts)
        self.assertLessEqual(blobs_wact_optim, blobs_after)

        # Check that the last activations are not shared
        self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
        self.assertTrue(
            has_blob(optim_proto_wacts, "name_x/fc5"),
            "Dont remap final activation",
        )

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("name_x/loss")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        # Run with the forward optimization
        workspace.RunNetOnce(optim_proto_wacts)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)