def get_candidates_and_features_page_num(self, page_num):
    """Collect candidate bounding boxes and their feature matrix for one page.

    Candidates come from two sources: ``self.get_candidates_alignments`` and
    ``self.get_candidates_lines``. Every candidate is paired with its
    alignment-feature row *before* degenerate boxes are dropped, so that row
    ``i`` of the returned feature matrix always describes ``boxes[i]``.

    Boxes are returned alignment candidates first, then line candidates,
    which is the order in which the alignment features are produced.

    Args:
        page_num: Key into ``self.elems`` and ``self.font_stats`` selecting
            the page to process.

    Returns:
        ``(boxes, features)``. ``boxes`` is the list of candidates whose
        extent is strictly positive along both axes. ``features`` is the
        result of concatenating the alignment features and the output of
        ``get_lines_features`` along axis 1. Returns ``([], [])`` when no
        candidate survives the filter.

    Raises:
        ValueError: If the number of alignment-feature rows differs from the
            number of candidate boxes, since the two could not be paired.
    """
    elems = self.elems[page_num]
    font_stat = self.font_stats[page_num]

    lines_bboxes = self.get_candidates_lines(page_num, elems)
    alignments_bboxes, alignment_features = self.get_candidates_alignments(
        page_num, elems)
    self.log.info(
        "Page Num: {}, Line bboxes: {}, Alignment bboxes: {}".format(
            page_num, len(lines_bboxes), len(alignments_bboxes)))

    # Build new lists instead of extending the helper's return value in
    # place, so the caller-visible objects are never mutated.
    candidate_bboxes = list(alignments_bboxes) + list(lines_bboxes)
    candidate_features = list(alignment_features) + list(
        get_alignment_features(lines_bboxes, elems, font_stat))

    # NOTE(review): assumes both helpers yield exactly one feature row per
    # bbox; the axis=1 concatenation below already requires this.
    if len(candidate_bboxes) != len(candidate_features):
        raise ValueError(
            "Got {} alignment feature rows for {} candidate bboxes on "
            "page {}.".format(
                len(candidate_features), len(candidate_bboxes), page_num))

    # Filter out bboxes that are zero width or height (indices 3/5 and 4/6
    # are the opposing edges along each axis), dropping the matching feature
    # row at the same time so boxes and features stay aligned.
    boxes = []
    kept_features = []
    for bbox, feature in zip(candidate_bboxes, candidate_features):
        if bbox[5] - bbox[3] > 0 and bbox[6] - bbox[4] > 0:
            boxes.append(bbox)
            kept_features.append(feature)

    if not boxes:
        return [], []

    lines_features = get_lines_features(boxes, elems)
    features = np.concatenate(
        (np.array(kept_features), np.array(lines_features)), axis=1)
    return boxes, features
def get_candidates_and_features_page_num(self, page_num):
    """Return alignment-based candidate boxes and their features for a page.

    Only ``self.get_candidates_alignments`` is used as a candidate source in
    this variant; line-based candidates are not consulted.

    Args:
        page_num: Key into ``self.elems`` selecting the page to process.

    Returns:
        ``(boxes, features)`` where ``features`` is the alignment features
        joined along axis 1 with the output of ``get_lines_features`` for the
        same boxes. Returns ``([], [])`` and logs a message when the page
        yields no boxes.
    """
    page_elems = self.elems[page_num]
    candidates, align_feats = self.get_candidates_alignments(
        page_num, page_elems)

    if len(candidates) > 0:
        line_feats = get_lines_features(candidates, page_elems)
        # Both feature blocks are stacked side by side, one row per box.
        combined = np.concatenate(
            [np.array(align_feats), np.array(line_feats)], axis=1)
        return candidates, combined

    self.log.info("No boxes were found on page {}.".format(page_num))
    return [], []