Example #1
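This test builds two DAG nets that differ only in one output blob name (the first uses "z" for the third FC layer, the second reuses "y") and asserts that memonger.verify_graph_equality reports them as unequal.

All snippets on this page exercise the Caffe2 memonger (memory optimizer). A minimal sketch of the imports they appear to assume is shown below; count_blobs, count_shared_blobs, and has_blob are assumed to be helpers local to the original test module that count blobs in a NetDef and check for a blob's presence.

# Import sketch (assumed) for the snippets on this page.
import copy
import logging
import time

import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import brew, cnn, core, memonger, model_helper, workspace
from caffe2.python.models import resnet
import caffe2.python._import_c_extension as C  # C++ memonger bindings

log = logging.getLogger(__name__)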
    def test_verify_graph_inequality(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m, [fc2, fc3], "out")

        m2 = model_helper.ModelHelper()
        m2.Proto().type = "dag"
        m2.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m2,
                          "data",
                          "x",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m2, [fc2, fc3], "out")

        self.assertFalse(
            memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto()))
Example #2
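A forward-only ResNet-50 is built under the gpu_0 namescope and optimized with memonger.optimize_inference_for_dag; the test checks that the optimized net uses fewer blobs, shares fewer than 7 blobs, stays graph-equal to the original, and produces identical output.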
    def test_resnet_forward_only(self):
        model = cnn.CNNModelHelper(
            order="NCHW",
            name="test",
            cudnn_exhaustive_search=True,
        )
        with core.NameScope("gpu_0"):
                data = model.net.AddExternalInput("gpu_0/data")
                resnet.create_resnet50(
                    model,
                    data,
                    num_input_channels=3,
                    num_labels=1000,
                    is_test=True
                )

        count_before = count_blobs(model.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            model.net, ["gpu_0/data"], "gpu_0/"
        )
        count_after = count_blobs(optim_proto)
        num_shared_blobs = count_shared_blobs(optim_proto)

        # Run model and compare results

        workspace.RunNetOnce(model.param_init_net)
        data = np.random.rand(4, 3, 227, 227).astype(np.float32)

        workspace.FeedBlob("gpu_0/data", data)
        workspace.RunNetOnce(model.net)
        model.net.Proto().type = 'dag'
        model.net.Proto().num_workers = 4
        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        self.assertTrue(memonger.verify_graph_equality(
            model.net.Proto(), optim_proto))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        self.assertTrue(count_after < count_before)
        self.assertTrue(num_shared_blobs < 7)
        np.testing.assert_almost_equal(loss1, optimized_loss1)
Example #3
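A five-layer FC chain with a softmax/cross-entropy head is optimized with memonger.share_grad_blobs, first without and then with activation sharing; in both cases the blob count must drop, the final activation name_x/fc5 must stay unshared, and the loss and the fc1_w gradient must match the unoptimized net.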
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(m.param_to_grad.values()),
            "name_x/",
            share_activations=False,
        )
        self.assertTrue(memonger.verify_graph_equality(m.Proto(), optim_proto))
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        optim_proto_wacts = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(m.param_to_grad.values()),
            "name_x/",
            share_activations=True,
            dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]),
        )
        self.assertTrue(
            memonger.verify_graph_equality(m.Proto(), optim_proto_wacts))
        blobs_wact_optim = count_blobs(optim_proto_wacts)
        self.assertLessEqual(blobs_wact_optim, blobs_after)

        # Check that the last activations are not shared
        self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
        self.assertTrue(
            has_blob(optim_proto_wacts, "name_x/fc5"),
            "Dont remap final activation",
        )

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("name_x/loss")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        # Run with the forward optimization
        workspace.RunNetOnce(optim_proto_wacts)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)
Example #4
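A branching forward DAG (two FC towers summed into fc5sum, plus a second loss head off fc5) is optimized with memonger.optimize_inference_for_dag; the test verifies graph equality, a reduced blob count, and identical values for both losses.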
    def test_forward_optim_tree_harder(self, input_dim, output_dim,
                                       batch_size):
        m = model_helper.ModelHelper()
        m.net.Proto().type = "dag"
        m.net.Proto().num_workers = 4
        m.net.AddExternalInput("label")
        m.net.AddExternalInput("data")

        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)

            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

            # Branch
            fc3b = brew.fc(m,
                           fc2,
                           "fc3b",
                           dim_in=output_dim,
                           dim_out=output_dim)
            fc4b = brew.fc(m,
                           fc3b,
                           "fc4b",
                           dim_in=output_dim,
                           dim_out=output_dim)
            fc5b = brew.fc(m,
                           fc4b,
                           "fc5b",
                           dim_in=output_dim,
                           dim_out=output_dim)

            fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")
            fc5sum.Relu([], "relu1") \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data"], "name_x/")
        self.assertTrue(
            memonger.verify_graph_equality(m.net.Proto(), optim_proto))
        blobs_after = count_blobs(optim_proto)
        print(str(optim_proto))
        self.assertLess(blobs_after, blobs_before)

        # Test networks produce exactly same results
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
Example #5
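Gradients are added for two losses on a chained FC net and shared via memonger.share_grad_blobs with activation sharing; the blobs listed in dont_share_blobs (including name_x/fc6) must survive the rewrite, and the losses and the fc1_w gradient must match the unoptimized run.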
    def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m,
                          "data",
                          "fc1",
                          dim_in=input_dim,
                          dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")
        input_to_grad = m.AddGradientOperators(
            ["name_x/loss1", "name_x/loss2"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss1", "name_x/loss2"],
            set(m.param_to_grad.values()),
            "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
            share_activations=True,
            dont_share_blobs=set([
                'name_x/fc6', 'name_x/fc5',
                str(input_to_grad["name_x/fc1_w"])
            ]),
        )
        self.assertTrue(
            memonger.verify_graph_equality(m.net.Proto(), optim_proto))
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)
        self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

        # Test networks produce exactly same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(low=0, high=output_dim,
                                  size=(batch_size, )).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        optimized_grad = workspace.FetchBlob(str(
            input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
        np.testing.assert_almost_equal(grad, optimized_grad)
Example #6
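share_freeze_blobs is a pure-Python blob-recycling pass: every activation produced between the first Conv op and the StopGradient op is renamed to a blob from a per-namescope _shared_ pool, and Free operators are inserted after each pool blob's last use in the frozen section.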
def share_freeze_blobs(
    net,
    namescope,
):

    log.warning("NOTE: Executing memonger to optimize gradient memory")

    # Normalize the namescope so blobs can be matched by prefix
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    new_net = copy.deepcopy(net)
    activations = []
    external_input = set(new_net.Proto().external_input)
    external_output = set(new_net.Proto().external_output)

    start_idx = -1
    end_idx = -1

    # ops
    for idx, op in enumerate(new_net._net.op):
        # print(op)
        if namescope not in op.input[0]:
            continue
        if op.type == 'Conv' and start_idx < 0:
            start_idx = idx
        if op.type == 'StopGradient':
            end_idx = idx

    # print(namescope, 'start_idx: ', start_idx, ' end_idx: ', end_idx)

    # Hacky way to get activations, think of a better way
    for idx, op in enumerate(new_net._net.op[start_idx:end_idx]):
        if namescope not in op.input[0]:
            continue
        for b in op.output:
            if b not in external_output:
                activations.append(b)

    # print('activations: ', activations)

    used_activations = []
    for a in activations:
        if a in used_activations:
            continue
        share_pool = [
            namescope + '_shared_' + str(i) for i in range(1000, 10000)
        ]
        # print(a)
        first_idx = -1
        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            if a in list(op.input) + list(op.output):
                first_idx = idx
                break

        assert first_idx >= 0, first_idx

        for idx, op in enumerate(new_net._net.op[first_idx:]):
            if namescope not in op.input[0]:
                continue
            for b in list(op.input) + list(op.output):
                if b in share_pool:
                    share_pool.remove(b)

        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            op_input = copy.deepcopy(op.input)
            is_found = False
            for i, b in enumerate(op_input):
                if a == b:
                    op_input[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].input[:]
                new_net._net.op[idx].input.extend(op_input)

            op_output = copy.deepcopy(op.output)
            is_found = False
            for i, b in enumerate(op_output):
                if a == b:
                    op_output[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].output[:]
                new_net._net.op[idx].output.extend(op_output)

        used_activations.append(a)

    assert verify_graph_equality(net.Proto(), new_net.Proto()), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), new_net.Proto()), \
        "Inplace assignments differ in memonger net."

    share_pool = [namescope + '_shared_' + str(i) for i in range(1000, 10000)]
    share_pool_used = {}
    for idx, op in enumerate(new_net._net.op):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool:
                share_pool_used[b] = idx

    for idx, op in enumerate(new_net._net.op[end_idx:]):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool_used.keys():
                share_pool_used.pop(b)

    ops = list(new_net._net.op)
    # Insert Free ops in descending index order so that earlier insertions
    # do not shift the op positions recorded in share_pool_used.
    for inp in sorted(share_pool_used, key=share_pool_used.get, reverse=True):
        ops.insert(share_pool_used[inp] + 1,
                   core.CreateOperator("Free", [inp], [inp]))
    del new_net._net.op[:]
    new_net._net.op.extend(ops)

    return new_net.Proto()
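A minimal usage sketch (hypothetical names): share_freeze_blobs returns a new NetDef, so a caller would swap it back into the model, assuming the net contains a Conv ... StopGradient frozen section under the given namescope.

# Hypothetical usage: recycle the frozen section's activations and
# replace the model's net proto with the optimized copy.
optim_proto = share_freeze_blobs(model.net, "gpu_0")
model.net.Proto().CopyFrom(optim_proto)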
Example #7
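share_freeze_blobs_c2 performs the same activation recycling as share_freeze_blobs above, but delegates the liveness analysis and blob assignment to the C++ backend via C.memonger_compute_blob_recycling_for_dag.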
def share_freeze_blobs_c2(
    net,
    namescope,
):

    log.warning("NOTE: Executing memonger to optimize gradient memory")

    # Normalize the namescope so blobs can be matched by prefix
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    activations = []
    external_input = set(net.Proto().external_input)
    external_output = set(net.Proto().external_output)

    start_idx = -1
    end_idx = -1

    # ops
    for idx, op in enumerate(netproto.op):
        # print(op)
        if namescope not in op.input[0]:
            continue
        if op.type == 'Conv' and start_idx < 0:
            start_idx = idx
        if op.type == 'StopGradient':
            end_idx = idx

    print(namescope, 'start_idx: ', start_idx, ' end_idx: ', end_idx)

    # Hacky way to get activations, think of a better way
    for idx, op in enumerate(netproto.op[start_idx:end_idx]):
        for b in op.output:
            if b not in external_output:
                activations.append(b)

    print('activations: ', activations)

    share_pool = [namescope + '_shared_' + str(i) for i in range(1000, 10000)]
    map_pool = {}

    heads = [namescope + 'data']
    print('heads: ', heads)

    # Remove last activations, as they are usually accessed externally
    activations = set(activations[:-1])
    print('activations: ', activations)

    shared_blobs = activations
    dont_share_blobs = None
    blob_shapes = None
    op_indices = [
        index for index, _ in enumerate(
            netproto.op[start_idx:end_idx + 2], start=start_idx)
    ]

    print(op_indices)

    start_time = time.time()
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(), [str(s).encode('utf-8') for s in heads],
        op_indices, set(str(s).encode('utf-8') for s in shared_blobs),
        namescope.encode('utf-8'),
        set() if dont_share_blobs is None else dont_share_blobs,
        {} if blob_shapes is None else blob_shapes)

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)
    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim
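Unlike the pure-Python share_freeze_blobs above, the recycling decisions here are made in C++: the Python side only selects the op window and candidate activations, serializes the NetDef through C.memonger_compute_blob_recycling_for_dag, and verifies graph equality and in-place assignments on the returned proto.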
Example #8
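End-to-end ResNet-50 training test: gradient (and activation) blobs are shared with memonger.share_grad_blobs, optionally passing shapes from workspace.InferShapesAndTypes, and the loss and the conv1_w gradient are compared against the unoptimized net.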
    def test_resnet_shared_grads(self, with_shapes, gc, dc):
        model = cnn.CNNModelHelper(
            order="NCHW",
            name="test",
            cudnn_exhaustive_search=True,
        )
        with core.NameScope("gpu_0"):
            data = model.net.AddExternalInput("gpu_0/data")
            label = model.net.AddExternalInput("gpu_0/label")
            (_softmax, loss) = resnet.create_resnet50(
                model,
                data,
                num_input_channels=3,
                num_labels=1000,
                label=label,
                is_test=False,
            )

        param_to_grad = model.AddGradientOperators([loss])

        (shapes, types) = workspace.InferShapesAndTypes(
            [model.param_init_net, model.net],
            {'gpu_0/data': [4, 3, 227, 227],
             'gpu_0/label': [4]},
        )

        count_before = count_blobs(model.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            model.net,
            ["gpu_0/loss"],
            set(model.param_to_grad.values()),
            "gpu_0/",
            share_activations=True,
            dont_share_blobs=set([str(param_to_grad["gpu_0/conv1_w"])]),
            blob_shapes=shapes if with_shapes else None,
        )
        self.assertTrue(memonger.verify_graph_equality(model.net.Proto(), optim_proto))
        count_after = count_blobs(optim_proto)
        self.assertTrue(count_after < count_before)

        # Run the model and compare results. We check that the loss is the
        # same and that the final gradient (conv1_w_grad) is the same.
        workspace.RunNetOnce(model.param_init_net)
        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
        label = (np.random.rand(4) * 1000).astype(np.int32)

        workspace.FeedBlob("gpu_0/data", data)
        workspace.FeedBlob("gpu_0/label", label)

        workspace.RunNetOnce(model.net)
        model.net.Proto().type = 'dag'
        model.net.Proto().num_workers = 4
        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
        workspace.FeedBlob(param_to_grad["gpu_0/conv1_w"], np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
        optim_conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])

        print("before: {} after: {}".format(count_before, count_after))

        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)