Example #1
0
    def identify_num_columns(self):
        """Work out how many text columns the document uses and where they sit.

        First pass (per page): every segment whose font family equals
        ``self.default_font_family`` and whose font size is within 0.1 of
        ``self.default_size`` is folded into a per-page list of
        ``(min_x, max_x)`` spans. A segment whose left edge lies within 10.0
        of an existing span's left edge widens that span; otherwise it starts
        a new span. The same segments also grow ``self.flow_bbox``, which is
        read before it is written here and so must already hold a 4-item box
        when this method is called. The number of spans found on each page is
        tallied and the tally is handed to ``find_most_frequent_item`` to set
        ``self.num_columns`` (presumably the most common span count; the
        helper is defined elsewhere).

        Second pass: the width of the flow box is divided into
        ``self.num_columns`` equal slots. Each page span lying entirely inside
        a slot is merged into an entry of ``self.column_dim`` whose two edges
        are both within 10.0 of it, or appended as a new entry.

        Sets ``self.column_dim``, ``self.num_columns`` and ``self.flow_bbox``;
        returns nothing.
        """
        self.column_dim = []
        spans_per_page = []
        span_count_tally = {}

        for page in self.pages:
            spans = []
            for segment in page.segments:
                is_default_text = (
                    segment.font_family == self.default_font_family
                    and math.fabs(segment.font_size - self.default_size) < 0.1
                )
                if not is_default_text:
                    continue

                seg_box = segment.bbox
                left = seg_box[0]
                right = seg_box[2]

                for idx, span in enumerate(spans):
                    if math.fabs(left - span[0]) < 10.0:
                        # The widened span replaces the old one and moves to
                        # the end of the list; on ties the existing edge wins.
                        spans.pop(idx)
                        spans.append((min(span[0], left), max(span[1], right)))
                        break
                else:
                    spans.append((left, right))

                # Grow the overall flow box so it also covers this segment
                # (on ties the segment's coordinate is the one stored).
                flow = self.flow_bbox
                self.flow_bbox = (
                    min(seg_box[0], flow[0]),
                    min(seg_box[1], flow[1]),
                    max(seg_box[2], flow[2]),
                    max(seg_box[3], flow[3]),
                )

            spans_per_page.append(spans)
            span_count_tally[len(spans)] = span_count_tally.get(len(spans), 0) + 1

        self.num_columns = find_most_frequent_item(span_count_tally)

        for spans in spans_per_page:
            for span in spans:
                merged = False
                for column_index in range(self.num_columns):
                    # Computed inside the loop on purpose: with zero columns
                    # the loop body never runs, so no division takes place.
                    slot_width = (self.flow_bbox[2] - self.flow_bbox[0]) / self.num_columns
                    slot_left = self.flow_bbox[0] + slot_width * column_index
                    slot_right = self.flow_bbox[0] + slot_width * (column_index + 1)

                    if not (span[0] >= slot_left and span[1] <= slot_right):
                        continue

                    for i, known in enumerate(self.column_dim):
                        if (math.fabs(known[0] - span[0]) < 10.0
                                and math.fabs(known[1] - span[1]) < 10.0):
                            self.column_dim[i] = (
                                min(span[0], known[0]),
                                max(span[1], known[1]),
                            )
                            merged = True
                            break

                    if not merged:
                        self.column_dim.append((span[0], span[1]))
Example #2
0
    def find_default_fonts(self):
        """Determine the document's default font family, font type and size.

        Walks every segment of every page and, for segments where
        ``contains_text()`` is true, accumulates two tallies:

        * the entries of ``segment.font_count`` summed per font key, and
        * ``segment.font_count[segment.font]`` summed per ``segment.font_size``.

        Each tally is passed to ``find_most_frequent_item`` (defined
        elsewhere; presumably returns the key with the largest total). The
        winning font key is split on ``","``: the first part becomes
        ``self.default_font_family`` and the second part, when present,
        becomes ``self.default_font_type`` (``"Regular"`` when there is no
        comma). The winning size becomes ``self.default_size``.
        """
        font_totals = {}
        size_totals = {}

        for page in self.pages:
            for segment in page.segments:
                if not segment.contains_text():
                    continue

                for font_key, occurrences in segment.font_count.items():
                    font_totals[font_key] = font_totals.get(font_key, 0) + occurrences

                size = segment.font_size
                size_totals[size] = (
                    size_totals.get(size, 0) + segment.font_count[segment.font]
                )

        name_parts = find_most_frequent_item(font_totals).split(",")
        self.default_font_family = name_parts[0]
        if len(name_parts) == 1:
            # A key without a comma carries no explicit type.
            self.default_font_type = "Regular"
        else:
            self.default_font_type = name_parts[1]
        self.default_size = find_most_frequent_item(size_totals)
Example #3
0
    def find_default_fonts(self):
        """Pick the most used font key and font size as the document defaults.

        For every segment (across all pages) whose ``contains_text()`` is
        true, this adds the segment's ``font_count`` entries into a per-font
        tally and adds ``segment.font_count[segment.font]`` into a tally keyed
        by ``segment.font_size``.

        Both tallies are handed to ``find_most_frequent_item`` (defined
        elsewhere; presumably returns the key with the largest total). The
        chosen font key is split on ``","``; its first part is stored in
        ``self.default_font_family`` and its second part in
        ``self.default_font_type``, falling back to ``"Regular"`` when the key
        has no comma. The chosen size is stored in ``self.default_size``.
        """
        usage_by_font = {}
        usage_by_size = {}

        for page in self.pages:
            for segment in page.segments:
                if not segment.contains_text():
                    continue

                for font_key, occurrences in segment.font_count.items():
                    usage_by_font[font_key] = (
                        usage_by_font.get(font_key, 0) + occurrences
                    )

                size = segment.font_size
                usage_by_size[size] = (
                    usage_by_size.get(size, 0) + segment.font_count[segment.font]
                )

        pieces = find_most_frequent_item(usage_by_font).split(",")
        self.default_font_family = pieces[0]
        if len(pieces) == 1:
            # No comma in the key means no explicit type was given.
            self.default_font_type = "Regular"
        else:
            self.default_font_type = pieces[1]
        self.default_size = find_most_frequent_item(usage_by_size)
Example #4
0
    def identify_num_columns(self):
        """Estimate the column count and the horizontal span of each column.

        Pass 1 (per page): segments whose font family equals
        ``self.default_font_family`` and whose font size differs from
        ``self.default_size`` by less than 0.1 are grouped into
        ``(min_x, max_x)`` spans using ``bbox[0]`` and ``bbox[2]``. A segment
        whose left edge is within 10.0 of an existing span's left edge widens
        that span; any other segment opens a new span. Each such segment also
        enlarges ``self.flow_bbox``, which is read before being assigned here
        and therefore has to be a 4-item box before this method runs. The
        per-page span counts are tallied and ``find_most_frequent_item``
        (defined elsewhere; presumably returns the most common key) chooses
        ``self.num_columns`` from that tally.

        Pass 2: the flow box width is cut into ``self.num_columns`` equal
        slots. A page span that fits wholly inside a slot is either merged
        into an entry of ``self.column_dim`` whose two edges are both within
        10.0 of it, or appended as a new entry.

        Sets ``self.column_dim``, ``self.num_columns`` and ``self.flow_bbox``;
        returns nothing.
        """
        self.column_dim = []
        page_span_lists = []
        pages_by_span_count = {}

        for page in self.pages:
            page_spans = []
            for segment in page.segments:
                uses_default_font = (
                    segment.font_family == self.default_font_family
                    and math.fabs(segment.font_size - self.default_size) < 0.1
                )
                if not uses_default_font:
                    continue

                box = segment.bbox
                seg_left = box[0]
                seg_right = box[2]

                for position, existing in enumerate(page_spans):
                    if math.fabs(seg_left - existing[0]) < 10.0:
                        # Drop the old span and re-add the widened one at the
                        # end; when edges tie, the existing edge is kept.
                        page_spans.pop(position)
                        page_spans.append(
                            (min(existing[0], seg_left),
                             max(existing[1], seg_right))
                        )
                        break
                else:
                    page_spans.append((seg_left, seg_right))

                # Extend the flow box to include this segment (on ties the
                # segment's coordinate is the one stored).
                current = self.flow_bbox
                self.flow_bbox = (
                    min(box[0], current[0]),
                    min(box[1], current[1]),
                    max(box[2], current[2]),
                    max(box[3], current[3]),
                )

            page_span_lists.append(page_spans)
            n_spans = len(page_spans)
            pages_by_span_count[n_spans] = pages_by_span_count.get(n_spans, 0) + 1

        self.num_columns = find_most_frequent_item(pages_by_span_count)

        for page_spans in page_span_lists:
            for candidate in page_spans:
                absorbed = False
                for column_index in range(self.num_columns):
                    # Kept inside the loop so that zero columns means the
                    # division below is never evaluated.
                    slot_width = (self.flow_bbox[2] - self.flow_bbox[0]) / self.num_columns
                    lower = self.flow_bbox[0] + slot_width * column_index
                    upper = self.flow_bbox[0] + slot_width * (column_index + 1)

                    if not (candidate[0] >= lower and candidate[1] <= upper):
                        continue

                    for i, recorded in enumerate(self.column_dim):
                        if (math.fabs(recorded[0] - candidate[0]) < 10.0
                                and math.fabs(recorded[1] - candidate[1]) < 10.0):
                            self.column_dim[i] = (
                                min(candidate[0], recorded[0]),
                                max(candidate[1], recorded[1]),
                            )
                            absorbed = True
                            break

                    if not absorbed:
                        self.column_dim.append((candidate[0], candidate[1]))