Beispiel #1
0
    def build_pin_pos(self, params, placedb, data_collections, device):
        """
        @brief build the operator object used to obtain pin locations
        @param params parameters (not read by this method)
        @param placedb placement database; only num_physical_nodes is read here
        @param data_collections a collection of all data and variables required for constructing the ops
        @param device cpu or cuda (not read by this method)
        @return a pin_pos.PinPos instance configured with the "node-by-node" algorithm
        """
        # Why a custom operator instead of plain tensor ops (note kept from
        # the original author, Yibo): a pure-PyTorch formulation -- gather
        # each pin's node coordinate with torch.index_select over
        # pin2node_map, add pin_offset_x / pin_offset_y, then concatenate the
        # x and y halves -- was measured at more than 2s on ISPD2005 bigblue4
        # with 10 CPU threads, whereas the custom CPU version takes around
        # 20ms.
        op_class = pin_pos.PinPos

        # These attributes are forwarded unchanged, under the same keyword
        # names, from data_collections to the operator constructor.
        forwarded_fields = (
            "pin_offset_x",
            "pin_offset_y",
            "pin2node_map",
            "flat_node2pin_map",
            "flat_node2pin_start_map",
        )
        forwarded = {
            field: getattr(data_collections, field)
            for field in forwarded_fields
        }

        return op_class(**forwarded,
                        num_physical_nodes=placedb.num_physical_nodes,
                        algorithm="node-by-node")
Beispiel #2
0
    def test_pin_pos_random(self):
        """
        @brief check the custom PinPos operator against the reference build_pin_pos

        A small fixed netlist (two physical nodes owning five pins) is set up
        by hand. The forward result and the gradient of the summed output with
        respect to the positions are computed with the reference
        implementation and with pin_pos.PinPos, and must agree within 1e-6.
        The CPU path always runs; the CUDA path runs only when
        torch.cuda.device_count() is non-zero.
        """
        num_threads = 1
        pos = np.array([[1, 10], [2, 20], [3, 30]], dtype=np.float32)
        # Pins owned by each node. The rows have different lengths, so keep
        # them in a plain list: building a ragged ndarray implicitly was
        # deprecated in NumPy 1.19 and raises ValueError since NumPy 1.24.
        node2pin_map = [np.array([0, 4]), np.array([1, 2, 3])]
        num_physical_nodes = len(node2pin_map)
        pins_per_node = [len(pins) for pins in node2pin_map]
        num_pins = sum(pins_per_node)

        # inverse map: owning node of every pin
        pin2node_map = np.zeros(num_pins, dtype=np.int32)
        for node_id, pins in enumerate(node2pin_map):
            pin2node_map[pins] = node_id

        pin_offset_x = np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype=pos.dtype)
        pin_offset_y = np.array([0.01, 0.02, 0.03, 0.04, 0.05],
                                dtype=pos.dtype)

        # construct flat_node2pin_map and flat_node2pin_start_map
        # flat nodepin map, length of #pins: per-node pin lists back to back
        flat_node2pin_map = np.concatenate(node2pin_map).astype(np.int32)
        # starting index in nodepin map for each node, length of #nodes+1,
        # the first entry is 0 and the last entry is #pins
        flat_node2pin_start_map = np.zeros(num_physical_nodes + 1,
                                           dtype=np.int32)
        flat_node2pin_start_map[1:] = np.cumsum(pins_per_node)

        print("flat_node2pin_map = ", flat_node2pin_map)
        print("flat_node2pin_start_map = ", flat_node2pin_start_map)

        pos_var = Variable(torch.from_numpy(pos).reshape([-1]),
                           requires_grad=True)

        # reference result and gradient
        golden_value = build_pin_pos(pos_var, torch.from_numpy(pin_offset_x),
                                     torch.from_numpy(pin_offset_y),
                                     torch.from_numpy(pin2node_map),
                                     num_physical_nodes)
        golden_loss = golden_value.sum()
        print("golden_value = ", golden_value)
        print("golden_loss = ", golden_loss)
        golden_loss.backward()
        golden_grad = pos_var.grad.clone()
        print("golden grad = ", golden_grad)
        golden_value = golden_value.detach().numpy()
        golden_grad = golden_grad.detach().numpy()

        # test cpu
        print(pos_var)
        custom = pin_pos.PinPos(
            pin_offset_x=torch.from_numpy(pin_offset_x),
            pin_offset_y=torch.from_numpy(pin_offset_y),
            pin2node_map=torch.from_numpy(pin2node_map),
            flat_node2pin_map=torch.from_numpy(flat_node2pin_map),
            flat_node2pin_start_map=torch.from_numpy(flat_node2pin_start_map),
            num_physical_nodes=num_physical_nodes,
            num_threads=num_threads)
        result = custom.forward(pos_var)
        custom_loss = result.sum()
        print("custom = ", result)
        # backward() accumulates into pos_var.grad, so clear the gradient
        # left over from the reference run first
        pos_var.grad.zero_()
        custom_loss.backward()
        grad = pos_var.grad.clone()
        print("custom_grad = ", grad)

        np.testing.assert_allclose(result.detach().numpy(),
                                   golden_value,
                                   atol=1e-6)
        np.testing.assert_allclose(grad.detach().numpy(),
                                   golden_grad,
                                   atol=1e-6)

        # test gpu
        if torch.cuda.device_count():
            pos_var.grad.zero_()
            custom_cuda = pin_pos.PinPos(
                pin_offset_x=torch.from_numpy(pin_offset_x).cuda(),
                pin_offset_y=torch.from_numpy(pin_offset_y).cuda(),
                pin2node_map=torch.from_numpy(pin2node_map).cuda(),
                flat_node2pin_map=torch.from_numpy(flat_node2pin_map).cuda(),
                flat_node2pin_start_map=torch.from_numpy(
                    flat_node2pin_start_map).cuda(),
                num_physical_nodes=num_physical_nodes,
                num_threads=num_threads)
            # the input is moved to the GPU inside the graph, so the gradient
            # is still read back from the CPU leaf pos_var below
            result_cuda = custom_cuda.forward(pos_var.cuda())
            custom_cuda_loss = result_cuda.sum()
            print("custom_cuda_result = ", result_cuda.data.cpu())
            custom_cuda_loss.backward()
            grad_cuda = pos_var.grad.clone()
            print("custom_grad_cuda = ", grad_cuda.data.cpu())

            np.testing.assert_allclose(result_cuda.detach().cpu().numpy(),
                                       golden_value,
                                       atol=1e-6)
            # the GPU gradient is compared against the CPU custom-op gradient
            np.testing.assert_allclose(grad_cuda.detach().cpu().numpy(),
                                       grad.detach().numpy(),
                                       rtol=1e-6,
                                       atol=1e-6)