Ejemplo n.º 1
0
    def test_cluster_list(self):
        """Exercise ``utils.cluster_list`` with and without a tolerance."""
        values = [1, 2, 3, 4]

        # Without an explicit tolerance every value ends up in its own cluster.
        default_clusters = utils.cluster_list(values)
        assert default_clusters == [[v] for v in values]

        # With tolerance=1 the consecutive integers merge into one cluster.
        merged_clusters = utils.cluster_list(values, tolerance=1)
        assert merged_clusters == [values]

        # A gap larger than the tolerance (2 -> 5) splits the clusters.
        values = [1, 2, 5, 6]
        split_clusters = utils.cluster_list(values, tolerance=1)
        assert split_clusters == [[1, 2], [5, 6]]
Ejemplo n.º 2
0
    def test_pandas(self):
        """Extract the first page's table and check its first two rows.

        The ``x0`` values of the rects on the second page are clustered with
        a tolerance of 3; the first member of each cluster is passed to
        ``extract_table`` as ``v``.
        """
        x0_values = [rect["x0"] for rect in self.pdf.pages[1].rects]
        x0_clusters = cluster_list(x0_values, tolerance=3)

        v_lines = [cluster[0] for cluster in x0_clusters]

        raw_rows = self.pdf.pages[0].extract_table(v=v_lines)
        # Every extracted row goes through fix_row_spaces before comparison.
        parsed = [fix_row_spaces(raw_row) for raw_row in raw_rows]

        expected_header = [
            "NoticeDate",
            "Effective",
            "Received",
            "Company",
            "City",
            "No. Of",
            "Layoff/Closure",
        ]
        assert parsed[0] == expected_header

        expected_first_record = [
            "06/22/2015",
            "03/25/2016",
            "07/01/2015",
            "Maxim Integrated Product",
            "San Jose",
            "150",
            "Closure Permanent",
        ]
        assert parsed[1] == expected_first_record
Ejemplo n.º 3
0
    def get_edge_positions(self, orientation, edge_type=None, min_length=1, tolerance=1):
        """Return the sorted mean position of each cluster of matching edges.

        Edges are selected with ``self.filter_edges``. The coordinate read
        from each edge is ``"x0"`` when ``orientation == "v"`` and ``"top"``
        otherwise. Duplicate coordinates are dropped, the remaining values are
        grouped by ``utils.cluster_list`` using ``tolerance``, and the mean of
        each group is returned in ascending order.
        """
        matching = self.filter_edges(orientation, edge_type=edge_type, min_length=min_length)

        if orientation == "v":
            coord_key = "x0"
        else:
            coord_key = "top"

        distinct_positions = {edge[coord_key] for edge in matching}
        clusters = utils.cluster_list(distinct_positions, tolerance=tolerance)

        means = [sum(cluster) / len(cluster) for cluster in clusters]
        means.sort()
        return means
Ejemplo n.º 4
0
    def get_edge_positions(self,
                           orientation,
                           edge_type=None,
                           min_length=1,
                           tolerance=1):
        """Compute one averaged coordinate per cluster of filtered edges.

        ``self.filter_edges`` picks the edges to consider. For
        ``orientation == "v"`` the ``"x0"`` value of each edge is used; for
        any other orientation the ``"top"`` value is used. Unique values are
        clustered via ``utils.cluster_list`` with the given ``tolerance`` and
        the per-cluster means are returned as a sorted list.
        """
        selected = self.filter_edges(orientation,
                                     edge_type=edge_type,
                                     min_length=min_length)

        # Default to the "top" coordinate; only "v" edges are read by "x0".
        key = "top"
        if orientation == "v":
            key = "x0"

        unique_coords = {edge[key] for edge in selected}
        grouped = utils.cluster_list(unique_coords, tolerance=tolerance)

        return sorted([sum(group) / len(group) for group in grouped])
    def test_pandas(self):
        """Check the first two rows of the table extracted from page one.

        Explicit vertical lines are taken from the second page: its rects'
        ``x0`` values are clustered with a tolerance of 3 and the first value
        of each cluster is used.
        """
        x0_clusters = utils.cluster_list(
            [rect["x0"] for rect in self.pdf.pages[1].rects], tolerance=3)

        table_settings = {
            "vertical_strategy": "explicit",
            "explicit_vertical_lines": [group[0] for group in x0_clusters],
        }

        raw_rows = self.pdf.pages[0].extract_table(table_settings)
        # Each extracted row is passed through fix_row_spaces before checking.
        parsed = [fix_row_spaces(raw_row) for raw_row in raw_rows]

        header_row = [
            "NoticeDate",
            "Effective",
            "Received",
            "Company",
            "City",
            "No. Of",
            "Layoff/Closure",
        ]
        assert parsed[0] == header_row

        first_data_row = [
            "06/22/2015",
            "03/25/2016",
            "07/01/2015",
            "Maxim Integrated Product",
            "San Jose",
            "150",
            "Closure Permanent",
        ]
        assert parsed[1] == first_data_row