def identify_num_columns(self):
    """Estimate the page column count and collect column x-ranges.

    Pass 1 (per page): every segment whose font family equals
    ``self.default_font_family`` and whose font size is within 0.1 of
    ``self.default_size`` is folded into a per-page list of
    ``(min_x, max_x)`` ranges; a segment joins an existing range when its
    left edge (``bbox[0]``) is within 10.0 of that range's left edge.
    ``self.flow_bbox`` is grown to enclose each such segment
    (``bbox`` is treated as ``(min_x, min_y, max_x, max_y)``).

    The number of ranges found on each page is tallied and the tally is
    handed to ``find_most_frequent_item``; its result becomes
    ``self.num_columns`` (presumably the most common per-page count; the
    helper is not visible here).

    Pass 2: the flow box is split into ``self.num_columns`` equal-width
    slots. A per-page range that lies entirely inside one slot is merged
    into a matching entry of ``self.column_dim`` (both edges within 10.0)
    or appended as a new entry.

    Sets ``self.column_dim``, ``self.num_columns`` and updates
    ``self.flow_bbox``; returns ``None``.

    NOTE(review): the block nesting was reconstructed from a source whose
    whitespace had been collapsed. Confirm that (a) the flow-bbox update
    belongs inside the default-font check and (b) the final append belongs
    inside the "fits in slot" branch.
    """
    pages_per_count = {}
    self.column_dim = []
    dims_per_page = []

    for page in self.pages:
        page_dims = []
        for segment in page.segments:
            is_default_text = (
                segment.font_family == self.default_font_family
                and math.fabs(segment.font_size - self.default_size) < 0.1
            )
            if not is_default_text:
                continue

            seg_box = segment.bbox
            seg_left, seg_right = seg_box[0], seg_box[2]

            for idx, known in enumerate(page_dims):
                if math.fabs(seg_left - known[0]) < 10.0:
                    # Replace the matched range with its union with the
                    # segment; the merged range moves to the end of the list.
                    del page_dims[idx]
                    page_dims.append(
                        (min(known[0], seg_left), max(known[1], seg_right))
                    )
                    break
            else:
                # No existing range starts near this segment's left edge.
                page_dims.append((seg_left, seg_right))

            # Grow the overall flow bounding box to include this segment.
            flow = self.flow_bbox
            self.flow_bbox = (
                min(seg_box[0], flow[0]),
                min(seg_box[1], flow[1]),
                max(seg_box[2], flow[2]),
                max(seg_box[3], flow[3]),
            )

        dims_per_page.append(page_dims)
        found = len(page_dims)
        pages_per_count[found] = pages_per_count.get(found, 0) + 1

    self.num_columns = find_most_frequent_item(pages_per_count)

    for page_dims in dims_per_page:
        for dim in page_dims:
            merged = False
            for column_index in range(self.num_columns):
                # Computed inside the loop so that num_columns == 0 never
                # reaches the division.
                flow_left = self.flow_bbox[0]
                slot_width = (self.flow_bbox[2] - flow_left) / self.num_columns
                slot_left = flow_left + slot_width * column_index
                slot_right = flow_left + slot_width * (column_index + 1)
                if not (dim[0] >= slot_left and dim[1] <= slot_right):
                    continue

                for i, known in enumerate(self.column_dim):
                    if (math.fabs(known[0] - dim[0]) < 10.0
                            and math.fabs(known[1] - dim[1]) < 10.0):
                        self.column_dim[i] = (
                            min(dim[0], known[0]),
                            max(dim[1], known[1]),
                        )
                        merged = True
                        break
                if not merged:
                    self.column_dim.append((dim[0], dim[1]))
def find_default_fonts(self):
    """Determine the default font family, font type and font size.

    Visits every segment of every page for which ``contains_text()`` is
    truthy and builds two tallies:

    * per font key: the summed values of ``segment.font_count``;
    * per ``segment.font_size``: the summed count of the segment's own
      font, ``segment.font_count[segment.font]``.

    Each tally is passed to ``find_most_frequent_item``. The font result is
    split on ``","``: the first part becomes ``self.default_font_family``
    and the second part (or ``"Regular"`` when there is no comma) becomes
    ``self.default_font_type``. The size result becomes
    ``self.default_size``. Returns ``None``.

    NOTE(review): nesting was reconstructed from whitespace-collapsed
    source; the size tally is assumed to sit inside the ``contains_text()``
    check, next to (not inside) the font-count loop.
    """
    font_tally = {}
    size_tally = {}

    for page in self.pages:
        for segment in page.segments:
            if not segment.contains_text():
                continue

            for font_key, occurrences in segment.font_count.items():
                font_tally[font_key] = font_tally.get(font_key, 0) + occurrences

            size_key = segment.font_size
            size_tally[size_key] = (
                size_tally.get(size_key, 0) + segment.font_count[segment.font]
            )

    family_and_type = find_most_frequent_item(font_tally).split(",")
    self.default_font_family = family_and_type[0]
    if len(family_and_type) == 1:
        # No explicit type after the family name.
        self.default_font_type = "Regular"
    else:
        self.default_font_type = family_and_type[1]
    self.default_size = find_most_frequent_item(size_tally)
def find_default_fonts(self):
    """Determine the default font family, font type and font size.

    Visits every segment of every page for which ``contains_text()`` is
    truthy and builds two tallies:

    * per font key: the summed values of ``segment.font_count``;
    * per ``segment.font_size``: the summed count of the segment's own
      font, ``segment.font_count[segment.font]``.

    Each tally is passed to ``find_most_frequent_item``. The font result is
    split on ``","``: the first part becomes ``self.default_font_family``
    and the second part (or ``"Regular"`` when there is no comma) becomes
    ``self.default_font_type``. The size result becomes
    ``self.default_size``. Returns ``None``.

    NOTE(review): nesting was reconstructed from whitespace-collapsed
    source; the size tally is assumed to sit inside the ``contains_text()``
    check, next to (not inside) the font-count loop.
    """
    font_tally = {}
    size_tally = {}

    for page in self.pages:
        for segment in page.segments:
            if not segment.contains_text():
                continue

            for font_key, occurrences in segment.font_count.items():
                font_tally[font_key] = font_tally.get(font_key, 0) + occurrences

            size_key = segment.font_size
            size_tally[size_key] = (
                size_tally.get(size_key, 0) + segment.font_count[segment.font]
            )

    family_and_type = find_most_frequent_item(font_tally).split(",")
    self.default_font_family = family_and_type[0]
    if len(family_and_type) == 1:
        # No explicit type after the family name.
        self.default_font_type = "Regular"
    else:
        self.default_font_type = family_and_type[1]
    self.default_size = find_most_frequent_item(size_tally)
def identify_num_columns(self):
    """Estimate the page column count and collect column x-ranges.

    Pass 1 (per page): every segment whose font family equals
    ``self.default_font_family`` and whose font size is within 0.1 of
    ``self.default_size`` is folded into a per-page list of
    ``(min_x, max_x)`` ranges; a segment joins an existing range when its
    left edge (``bbox[0]``) is within 10.0 of that range's left edge.
    ``self.flow_bbox`` is grown to enclose each such segment
    (``bbox`` is treated as ``(min_x, min_y, max_x, max_y)``).

    The number of ranges found on each page is tallied and the tally is
    handed to ``find_most_frequent_item``; its result becomes
    ``self.num_columns`` (presumably the most common per-page count; the
    helper is not visible here).

    Pass 2: the flow box is split into ``self.num_columns`` equal-width
    slots. A per-page range that lies entirely inside one slot is merged
    into a matching entry of ``self.column_dim`` (both edges within 10.0)
    or appended as a new entry.

    Sets ``self.column_dim``, ``self.num_columns`` and updates
    ``self.flow_bbox``; returns ``None``.

    NOTE(review): the block nesting was reconstructed from a source whose
    whitespace had been collapsed. Confirm that (a) the flow-bbox update
    belongs inside the default-font check and (b) the final append belongs
    inside the "fits in slot" branch.
    """
    pages_per_count = {}
    self.column_dim = []
    dims_per_page = []

    for page in self.pages:
        page_dims = []
        for segment in page.segments:
            is_default_text = (
                segment.font_family == self.default_font_family
                and math.fabs(segment.font_size - self.default_size) < 0.1
            )
            if not is_default_text:
                continue

            seg_box = segment.bbox
            seg_left, seg_right = seg_box[0], seg_box[2]

            for idx, known in enumerate(page_dims):
                if math.fabs(seg_left - known[0]) < 10.0:
                    # Replace the matched range with its union with the
                    # segment; the merged range moves to the end of the list.
                    del page_dims[idx]
                    page_dims.append(
                        (min(known[0], seg_left), max(known[1], seg_right))
                    )
                    break
            else:
                # No existing range starts near this segment's left edge.
                page_dims.append((seg_left, seg_right))

            # Grow the overall flow bounding box to include this segment.
            flow = self.flow_bbox
            self.flow_bbox = (
                min(seg_box[0], flow[0]),
                min(seg_box[1], flow[1]),
                max(seg_box[2], flow[2]),
                max(seg_box[3], flow[3]),
            )

        dims_per_page.append(page_dims)
        found = len(page_dims)
        pages_per_count[found] = pages_per_count.get(found, 0) + 1

    self.num_columns = find_most_frequent_item(pages_per_count)

    for page_dims in dims_per_page:
        for dim in page_dims:
            merged = False
            for column_index in range(self.num_columns):
                # Computed inside the loop so that num_columns == 0 never
                # reaches the division.
                flow_left = self.flow_bbox[0]
                slot_width = (self.flow_bbox[2] - flow_left) / self.num_columns
                slot_left = flow_left + slot_width * column_index
                slot_right = flow_left + slot_width * (column_index + 1)
                if not (dim[0] >= slot_left and dim[1] <= slot_right):
                    continue

                for i, known in enumerate(self.column_dim):
                    if (math.fabs(known[0] - dim[0]) < 10.0
                            and math.fabs(known[1] - dim[1]) < 10.0):
                        self.column_dim[i] = (
                            min(dim[0], known[0]),
                            max(dim[1], known[1]),
                        )
                        merged = True
                        break
                if not merged:
                    self.column_dim.append((dim[0], dim[1]))