def page_extraction(model_dir: str,
                    filenames_to_process: List[str],
                    output_dir: str,
                    draw_extractions: bool = False,
                    config: tf.compat.v1.ConfigProto = None):
    os.makedirs(output_dir, exist_ok=True)
    if draw_extractions:
        drawing_dir = os.path.join(output_dir, 'drawings')
        os.makedirs(drawing_dir, exist_ok=True)

    with tf.compat.v1.Session(config=config):
        # Load the model
        m = LoadedModel(model_dir, predict_mode='filename')

        for filename in tqdm(filenames_to_process, desc='Prediction'):
            # Inference
            prediction = m.predict(filename)
            probs = prediction['probs'][0]
            original_shape = prediction['original_shape']

            probs = probs / np.max(probs)  # Normalize to be in [0, 1]

            # Binarize the predictions
            page_bin = page_post_processing_fn(probs, threshold=-1)

            # Upscale to have a full resolution image (cv2 uses (w, h) and not (h, w) for giving shapes)
            bin_upscaled = cv2.resize(page_bin.astype(np.uint8, copy=False),
                                      tuple(original_shape[::-1]),
                                      interpolation=cv2.INTER_NEAREST)

            # Find the quadrilateral enclosing the page
            pred_page_coords = find_boxes(bin_upscaled.astype(np.uint8, copy=False),
                                          mode='min_rectangle', min_area=0.2, n_max_boxes=1)

            # basename is needed for the XML export below, so define it outside the drawing branch
            basename = os.path.basename(filename).split('.')[0]

            if pred_page_coords is not None:
                # Create the page region
                page_border = PAGE.Border(coords=PAGE.Point.cv2_to_point_list(pred_page_coords[:, None, :]))

                if draw_extractions:
                    # Draw the page box on the original image and export it
                    original_img = imread(filename, pilmode='RGB')
                    cv2.polylines(original_img, [pred_page_coords[:, None, :]], True, (0, 0, 255), thickness=5)
                    imsave(os.path.join(drawing_dir, '{}_boxes.jpg'.format(basename)), original_img)
            else:
                print('No box found in {}'.format(filename))
                page_border = PAGE.Border()

            # Export the page border to a PAGE-XML file
            page_xml = PAGE.Page(image_filename=filename, image_width=original_shape[1],
                                 image_height=original_shape[0], page_border=page_border)
            xml_filename = os.path.join(output_dir, '{}.xml'.format(basename))
            page_xml.write_to_file(xml_filename, creator_name='PageExtractor')
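
# Minimal usage sketch for page_extraction. The model directory and image paths below
# are illustrative placeholders, not files shipped with this repo.
def example_page_extraction():
    from glob import glob
    filenames = glob('images/*.jpg')  # hypothetical input images
    page_extraction(model_dir='page_model/export',  # assumed exported-model location
                    filenames_to_process=filenames,
                    output_dir='processed_pages',
                    draw_extractions=True)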
def extract_lines(npy_filename: str,
                  output_dir: str,
                  original_shape: list,
                  post_process_params: dict,
                  channel_baselines: int = 1,
                  mask_dir: str = None,
                  debug: bool = False):
    """
    From the prediction file (probs, .npy), finds and extracts the lines and exports them
    to PAGE-XML format.

    :param npy_filename: filename of the saved predictions (probs) in range (0, 255)
    :param output_dir: output directory to save the XML files
    :param original_shape: shape of the original input image (to rescale the extracted lines if necessary)
    :param post_process_params: parameters for line detection (sigma, thresholds, ...)
    :param channel_baselines: channel where the baseline class is detected
    :param mask_dir: directory containing masks of the page in order to improve the line extraction
    :param debug: if True, will output the binary image of the extracted lines
    :return: contours of lines (OpenCV format), binary image of lines (lines mask)
    """
    os.makedirs(output_dir, exist_ok=True)

    basename = os.path.basename(npy_filename).split('.')[0]

    pred = np.load(npy_filename) / 255  # type: np.ndarray
    lines_prob = pred[:, :, channel_baselines]

    if mask_dir is not None:
        mask = imread(os.path.join(mask_dir, basename + '.png'), pilmode='L')
        # PIL's resize expects a (width, height) size, so reverse the (h, w) shape
        mask = np.array(PIL.Image.fromarray(mask, mode='L').resize(lines_prob.shape[::-1],
                                                                   resample=PIL.Image.BILINEAR))
        lines_prob[mask == 0] = 0.

    contours, lines_mask = line_extraction_v1(lines_prob, **post_process_params)

    if debug:
        imsave(os.path.join(output_dir, '{}_bin.jpg'.format(basename)), lines_mask)

    ratio = (original_shape[0] / pred.shape[0], original_shape[1] / pred.shape[1])
    xml_filename = os.path.join(output_dir, basename + '.xml')
    PAGE.save_baselines(xml_filename, contours, ratio, predictions_shape=pred.shape[:2])

    return contours, lines_mask
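
# Minimal usage sketch for extract_lines. Paths and post-processing values are
# illustrative assumptions (the thresholds mirror those used by baseline_extraction below).
def example_extract_lines():
    params = {'low_threshold': 0.2, 'high_threshold': 0.4, 'sigma': 1.5}  # assumed values
    contours, lines_mask = extract_lines('predictions/page_0001.npy',  # hypothetical .npy file
                                         output_dir='xml_output',
                                         original_shape=[2000, 1500],  # (height, width) of the source image
                                         post_process_params=params,
                                         debug=True)
    return contours, lines_mask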
def baseline_extraction(model_dir: str,
                        filenames_to_process: List[str],
                        output_dir: str,
                        draw_extractions: bool = False,
                        config: tf.compat.v1.ConfigProto = None) -> None:
    """
    Given a model directory, this function will load the model and apply it to the given files.

    :param model_dir: directory containing the saved model
    :param filenames_to_process: filenames of the images to process
    :param output_dir: output directory to save the predictions (probability images)
    :param draw_extractions: if True, also exports the images with the extracted baselines drawn on them
    :param config: ``ConfigProto`` object for ``tf.Session``
    :return: None
    """
    os.makedirs(output_dir, exist_ok=True)
    if draw_extractions:
        drawing_dir = os.path.join(output_dir, 'drawings')
        os.makedirs(drawing_dir, exist_ok=True)

    with tf.compat.v1.Session(config=config):
        # Load the model
        m = LoadedModel(model_dir, predict_mode='filename_original_shape')

        for filename in tqdm(filenames_to_process, desc='Prediction'):
            # Inference
            prediction = m.predict(filename)
            # Take the first element of the 'probs' dictionary (batch size = 1)
            probs = prediction['probs'][0]
            original_shape = probs.shape

            # The baseline probs are on the second channel
            baseline_probs = probs[:, :, 1]
            contours, _ = line_extraction_v1(baseline_probs,
                                             low_threshold=0.2,
                                             high_threshold=0.4,
                                             sigma=1.5)

            basename = os.path.basename(filename).split('.')[0]

            # With 'filename_original_shape' the predictions already have the original size,
            # so this ratio is (1.0, 1.0) and the coordinates need no rescaling.
            ratio = (original_shape[0] / probs.shape[0], original_shape[1] / probs.shape[1])
            xml_filename = os.path.join(output_dir, basename + '.xml')
            page_object = PAGE.save_baselines(xml_filename, contours, ratio,
                                              predictions_shape=probs.shape[:2])

            # If specified, saves the images with the annotated baselines
            if draw_extractions:
                image = imread(filename)
                page_object.draw_baselines(image, color=(255, 0, 0), thickness=5)
                basename = os.path.basename(filename)
                imsave(os.path.join(drawing_dir, basename), image)
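
# Minimal usage sketch for baseline_extraction (paths are illustrative placeholders).
def example_baseline_extraction():
    from glob import glob
    baseline_extraction(model_dir='baseline_model/export',  # assumed exported-model location
                        filenames_to_process=glob('images/*.jpg'),
                        output_dir='baseline_output',
                        draw_extractions=True)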
def get_original_shape_from_image_file_name(filename):
    """
    Reads the original image size from the corresponding ground truth XML file.

    Note: the path layout is hard-coded. For an image such as
    "../../data/cbad-mask/complex/test/images\\basename.jpg", the size is read from
    "../../data/cbad-mask/complex/test/gt/basename.xml".

    :param filename: filename of the image (the last path separator is a backslash)
    :return: original (h, w) of the image
    """
    # Derive the ground truth XML filename from the image filename
    base_name = filename.split("\\")[-1].split(".")[0] + ".xml"
    dir_name = filename.split("\\")[0].replace('images', 'gt')
    gt_xml_filename = os.path.join(dir_name, base_name)

    gt_page_xml = PAGE.parse_file(gt_xml_filename)
    original_shape = [gt_page_xml.image_height, gt_page_xml.image_width]
    return original_shape
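
# Usage sketch illustrating the hard-coded path mapping (the filename is hypothetical):
# an image at ".../test/images\\page_0001.jpg" resolves to the ground truth XML at
# ".../test/gt/page_0001.xml".
def example_get_original_shape():
    h, w = get_original_shape_from_image_file_name(
        '../../data/cbad-mask/complex/test/images\\page_0001.jpg')
    print('original size: {}x{}'.format(h, w))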
                                          mode='min_rectangle', min_area=0.2, n_max_boxes=1)

            # Draw the page box on the original image and export it.
            # Also add the box coordinates to the txt file.
            original_img = imread(filename, pilmode='RGB')
            if pred_page_coords is not None:
                cv2.polylines(original_img, [pred_page_coords[:, None, :]], True, (0, 0, 255), thickness=5)
                # Write the corner points into a .txt file
                txt_coordinates += '{},{}\n'.format(filename, format_quad_to_string(pred_page_coords))
                # Create the page region
                page_border = PAGE.Border(coords=PAGE.Point.cv2_to_point_list(pred_page_coords[:, None, :]))
            else:
                print('No box found in {}'.format(filename))
                page_border = PAGE.Border()

            basename = os.path.basename(filename).split('.')[0]
            imsave(os.path.join(output_dir, '{}_boxes.jpg'.format(basename)), original_img)

            # Export the page border to a PAGE-XML file
            page_xml = PAGE.Page(image_filename=filename, image_width=original_shape[1],
                                 image_height=original_shape[0], page_border=page_border)
            xml_filename = os.path.join(output_pagexml_dir, '{}.xml'.format(basename))
            page_xml.write_to_file(xml_filename, creator_name='PageExtractor')
imsave("prob_masks/" + name, probs[:, :, 1]) imsave("bin_masks/" + name, binary_map[:, :, 0]) # Draw page box on original image and export it. Add also box coordinates to the txt file original_img = imread(filename, pilmode='RGB') if boxes_resized is not None: for box in boxes_resized: cv2.polylines(original_img, [box[:, None, :]], True, (0, 0, 255), thickness=5) # Write corners points into a .txt file txt_coordinates += '{},{}\n'.format( filename, format_quad_to_string(boxes_resized)) # Create page region mark_border = PAGE.Border( coords=PAGE.Point.cv2_to_point_list(box[:, None, :])) else: print('No box found in {}'.format(filename)) mark_border = PAGE.Border() basename = os.path.basename(filename).split('.')[0] imsave(os.path.join(output_dir, '{}_boxes.jpg'.format(basename)), original_img) # export text_regions = [ PAGE.TextRegion(id='txt-reg-{}'.format(i), coords=PAGE.Point.array_to_point(coords), custom_attribute="structure{type:drop-cap;}") for i, coords in enumerate(boxes_resized) ]
def annotate_one_page(image_filename: str,
                      output_dir: str,
                      size: int = None,
                      draw_baselines: bool = True,
                      draw_lines: bool = False,
                      draw_endpoints: bool = False,
                      baseline_thickness: float = 0.2,
                      diameter_endpoint: int = 20) -> Tuple[str, str]:
    """
    Creates an annotated mask and the corresponding original image and saves them in the
    'labels' and 'images' folders. Also copies the corresponding .xml file into the 'gt' folder.

    :param image_filename: filename of the image to process
    :param output_dir: directory to output the annotated label image
    :param size: size of the resized image (number of pixels)
    :param draw_baselines: draws the baselines
    :param draw_lines: draws the lines' polygons
    :param draw_endpoints: draws the beginning and end points of the baselines
    :param baseline_thickness: thickness of the annotated baseline (percentage of the line's height)
    :param diameter_endpoint: diameter of the annotated start/end points
    :return: (output_image_path, output_label_path)
    """
    page_filename = get_page_filename(image_filename)
    # Parse the XML file and get the TextLines
    page = PAGE.parse_file(page_filename)
    text_lines = [tl for tr in page.text_regions for tl in tr.text_lines]
    img = imread(image_filename, pilmode='RGB')
    # Create an empty mask
    gt = np.zeros_like(img)

    if text_lines:
        if draw_baselines:
            # The thickness should be a fraction of the line height, for example 0.2.
            # First, get the mean line height.
            mean_line_height, _, _ = _compute_statistics_line_height(page)
            absolute_baseline_thickness = int(max(gt.shape[0] * 0.002,
                                                  baseline_thickness * mean_line_height))

            # Draw the baselines
            gt_baselines = np.zeros_like(img[:, :, 0])
            gt_baselines = cv2.polylines(gt_baselines,
                                         [PAGE.Point.list_to_cv2poly(tl.baseline) for tl in text_lines],
                                         isClosed=False, color=255,
                                         thickness=absolute_baseline_thickness)
            gt[:, :, np.argmax(DRAWING_COLOR_BASELINES)] = gt_baselines

        if draw_lines:
            # Draw the polygons of the lines
            gt_lines = np.zeros_like(img[:, :, 0])
            for tl in text_lines:
                gt_lines = cv2.fillPoly(gt_lines,
                                        [PAGE.Point.list_to_cv2poly(tl.coords)],
                                        color=255)
            gt[:, :, np.argmax(DRAWING_COLOR_LINES)] = gt_lines

        if draw_endpoints:
            # Draw the endpoints of the baselines
            gt_points = np.zeros_like(img[:, :, 0])
            for tl in text_lines:
                try:
                    gt_points = cv2.circle(gt_points, (tl.baseline[0].x, tl.baseline[0].y),
                                           radius=int(diameter_endpoint / 2 * (gt_points.shape[0] / TARGET_HEIGHT)),
                                           color=255, thickness=-1)
                    gt_points = cv2.circle(gt_points, (tl.baseline[-1].x, tl.baseline[-1].y),
                                           radius=int(diameter_endpoint / 2 * (gt_points.shape[0] / TARGET_HEIGHT)),
                                           color=255, thickness=-1)
                except IndexError:
                    print('Length of baseline is {}'.format(len(tl.baseline)))
            gt[:, :, np.argmax(DRAWING_COLOR_POINTS)] = gt_points

    # Make the output filenames
    image_label_basename = get_image_label_basename(image_filename)
    output_image_path = os.path.join(output_dir, 'images', '{}.jpg'.format(image_label_basename))
    output_label_path = os.path.join(output_dir, 'labels', '{}.png'.format(image_label_basename))

    # Resize (if necessary) and save the image and the label
    save_and_resize(img, output_image_path, size=size)
    save_and_resize(gt, output_label_path, size=size, nearest=True)

    # Copy the XML file into the 'gt' folder
    shutil.copy(page_filename,
                os.path.join(output_dir, 'gt', '{}.xml'.format(image_label_basename)))

    return os.path.abspath(output_image_path), os.path.abspath(output_label_path)
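
# Minimal usage sketch for annotate_one_page. The output directory layout ('images',
# 'labels', 'gt') is created up front here; paths are illustrative placeholders.
def example_annotate_one_page():
    out_dir = 'training_data'  # assumed output directory
    for sub in ('images', 'labels', 'gt'):
        os.makedirs(os.path.join(out_dir, sub), exist_ok=True)
    image_path, label_path = annotate_one_page('dataset/page_0001.jpg',  # hypothetical image
                                               out_dir,
                                               draw_baselines=True)
    print(image_path, label_path)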
def eval_fn(input_dir: str,
            groudtruth_dir: str,
            output_dir: str = None,
            post_process_params: dict = PP_PARAMS,
            channel_baselines: int = 1,
            jar_tool_path: str = CBAD_JAR,
            masks_dir: str = None) -> dict:
    """
    Evaluates a model against the selected set ('groudtruth_dir' contains the XML files).

    :param input_dir: input directory containing probability maps (.npy)
    :param groudtruth_dir: directory containing the XML ground truths
    :param output_dir: output directory for the results
    :param post_process_params: parameters for the post-processing of the probability maps
    :param channel_baselines: channel of the baseline class
    :param jar_tool_path: path to the cBAD evaluation tool (.jar file)
    :param masks_dir: optional, directory where the binary masks of the pages are stored (.png)
    :return: dict with the averaged precision, recall and F-measure
    """
    if output_dir is None:
        output_dir = input_dir

    # Apply post-processing and find the lines
    for file in tqdm(glob(os.path.join(input_dir, '*.npy'))):
        basename = os.path.basename(file).split('.')[0]
        gt_xml_filename = os.path.join(groudtruth_dir, basename + '.xml')
        gt_page_xml = PAGE.parse_file(gt_xml_filename)

        original_shape = [gt_page_xml.image_height, gt_page_xml.image_width]

        _, _ = extract_lines(file, output_dir, original_shape, post_process_params,
                             channel_baselines=channel_baselines, mask_dir=masks_dir)

    # Create pairs (predicted XML, ground truth XML) to be evaluated
    xml_pred_filenames_list = glob(os.path.join(output_dir, '*.xml'))
    xml_filenames_tuples = list()
    for xml_filename in xml_pred_filenames_list:
        basename = os.path.basename(xml_filename)
        gt_xml_filename = os.path.join(groudtruth_dir, basename)
        xml_filenames_tuples.append((gt_xml_filename, xml_filename))

    gt_pages_list_filename = os.path.join(output_dir, 'gt_pages_simple.lst')
    generated_pages_list_filename = os.path.join(output_dir, 'generated_pages_simple.lst')
    with open(gt_pages_list_filename, 'w') as f:
        f.writelines('\n'.join([s[0] for s in xml_filenames_tuples]))
    with open(generated_pages_list_filename, 'w') as f:
        f.writelines('\n'.join([s[1] for s in xml_filenames_tuples]))

    # Evaluation using the Java tool
    cmd = 'java -jar {} {} {}'.format(jar_tool_path, gt_pages_list_filename,
                                      generated_pages_list_filename)
    result = subprocess.check_output(cmd, shell=True).decode()
    with open(os.path.join(output_dir, 'scores.txt'), 'w') as f:
        f.write(result)
    parse_score_txt(result, os.path.join(output_dir, 'scores.csv'))

    # Parse the scores from the tool's output
    lines = result.splitlines()
    avg_precision = float(next(filter(lambda l: 'Avg (over pages) P value:' in l, lines)).split()[-1])
    avg_recall = float(next(filter(lambda l: 'Avg (over pages) R value:' in l, lines)).split()[-1])
    f_measure = float(next(filter(lambda l: 'Resulting F_1 value:' in l, lines)).split()[-1])

    print('P {}, R {}, F {}'.format(avg_precision, avg_recall, f_measure))

    return {
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'f_measure': f_measure
    }
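
# Minimal usage sketch for eval_fn (directories are illustrative placeholders; the
# cBAD .jar path defaults to CBAD_JAR as defined in this module).
def example_eval_fn():
    scores = eval_fn(input_dir='predictions',            # directory of .npy probability maps
                     groudtruth_dir='ground_truth_xml',  # directory of PAGE-XML ground truth
                     output_dir='evaluation')
    print(scores)  # {'avg_precision': ..., 'avg_recall': ..., 'f_measure': ...}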
async def run(self):
    while True:
        #
        # get item off work queue
        #
        start_wait = time.time()
        g = self.work_queue.pop()
        finish_wait = time.time()
        self.counter += 1
        labels_all, probs_all, filename, original_shape, inference_time_sec, page_number = \
            self.work_queue.ungroup(g)
        basename = os.path.basename(filename).split('.')[0]
        self.feat.start(basename)

        if self.enable_debug:
            # write out an image of the per-pixel labels
            label_viz = np.zeros((labels_all.shape[0], labels_all.shape[1], 3), np.uint8)
            for h in range(0, labels_all.shape[0]):
                for w in range(0, labels_all.shape[1]):
                    c = self.label_val_to_color(labels_all[h, w])
                    label_viz[h, w, 0] = c[0]
                    label_viz[h, w, 1] = c[1]
                    label_viz[h, w, 2] = c[2]
            imsave(os.path.join(self.output_dir, f"{basename}_label_viz.png"), label_viz)

        # what pixel labels do we have?
        hist_label_counts = np.bincount(labels_all.flatten()).tolist()
        while len(hist_label_counts) < max(label_bins) + 1:
            hist_label_counts.append(0)
        # now hist_label_counts contains counts of pixel labels

        self._put_results_log(
            f"processing: file={filename} histogram={hist_label_counts} "
            f"infer_timing={inference_time_sec} original_shape={original_shape}")

        original_img = imread(filename, pilmode='RGB')
        if self.enable_debug:
            original_img_box_viz = np.array(original_img)
            original_img_box_viz_modified = False

        #
        # handle rectangles here!
        #
        for label_slice in label_bins:
            if label_slice == 0:
                continue  # skip background
            color_tuple = self.label_val_to_color(label_slice)

            # area of all the pixel labels for a particular class; might be multiple regions
            area = hist_label_counts[label_slice]
            if area < 500:  # minimum size
                # reject small label areas
                continue

            probs = probs_all[:, :, label_slice]

            # make an image showing the probability map for this label before post-processing
            # (it can include multiple blobs)
            if self.enable_debug:
                prob_img = np.zeros((probs.shape[0], probs.shape[1], 3), np.uint8)
                for h in range(0, probs.shape[0]):
                    for w in range(0, probs.shape[1]):
                        c = probs[h, w] * 255
                        prob_img[h, w, 0] = c
                        prob_img[h, w, 1] = c
                        prob_img[h, w, 2] = c
                imsave(os.path.join(self.output_dir,
                                    f"{basename}_{label_slice}_label_prob.png"),
                       prob_img)

            # Binarize the predictions
            page_bin = self.page_make_binary_mask(probs)

            # Upscale to have a full resolution image
            # (cv2 uses (w, h) and not (h, w) for giving shapes; original_shape is (h, w))
            bin_upscaled = cv2.resize(page_bin.astype(np.uint8, copy=False),
                                      tuple(original_shape[::-1]),
                                      interpolation=cv2.INTER_NEAREST)

            # upscale probs the same way so we can calculate confidence later
            probs_upscaled = cv2.resize(probs.astype(np.float32, casting='same_kind'),
                                        tuple(original_shape[::-1]),
                                        interpolation=cv2.INTER_NEAREST)

            # Find quadrilateral(s) enclosing the label area(s).
            # Allow more than a reasonable number of boxes so we can use spurious boxes as a reject signal.
            pred_region_coords_list = boxes_detection.find_boxes(
                bin_upscaled.astype(np.uint8, copy=False),
                mode='rectangle', min_area=0.001, n_max_boxes=4)

            # coord is [[a,b], [c,b], [c,d], [a,d]] (a path for drawing a polygon, clockwise)
            # origin is upper left [x,y]:
            #   [a,b]     [c,b]
            #        rectangle
            #   [a,d]     [c,d]
            # which means a < c and b < d

            if pred_region_coords_list is not None:
                # Draw the region box on the original image and export it.
                # Also add the box coordinates to the txt file.
                region_count = len(pred_region_coords_list)
                count = 0
                for pred_region_coords in pred_region_coords_list:
                    # cut out the rectangle for the region, based on the original image size
                    a = pred_region_coords[0, 0]
                    b = pred_region_coords[0, 1]
                    c = pred_region_coords[1, 0]
                    d = pred_region_coords[2, 1]
                    probs_rectangle = probs_upscaled[b:d + 1, a:c + 1]  # values are in range [0, 1]
                    overall_confidence = np.sum(probs_rectangle) / ((c - a) * (d - b))
                    aspect_ratio = (c - a) / (d - b)  # w/h
                    page_width_fraction = (c - a) / original_shape[0]
                    page_height_fraction = (d - b) / original_shape[1]
                    normalized_x = a / original_shape[0]
                    normalized_y = b / original_shape[1]
                    region_size = page_width_fraction * page_height_fraction
                    cmts = (f"Prediction {a},{b},{c},{d} confidence={overall_confidence} "
                            f"aspect={aspect_ratio} widthfrac={page_width_fraction} "
                            f"heightfrac={page_height_fraction} normalized_x={normalized_x} "
                            f"normalized_y={normalized_y} dimensions={c - a}x{d - b} "
                            f"spec={basename}_{label_slice}-{count}")
                    self._put_results_log(cmts)

                    img_rectangle = original_img[b:d + 1, a:c + 1]
                    tag_rect_x0 = a
                    tag_rect_y0 = b
                    tag_rect_x1 = c
                    tag_rect_y1 = d
                    if self.enable_debug:
                        # draw the box to visualize the rectangle
                        cv2.polylines(original_img_box_viz,
                                      [pred_region_coords[:, None, :]], True,
                                      (color_tuple[0], color_tuple[1], color_tuple[2]),
                                      thickness=5)
                        original_img_box_viz_modified = True
                        imsave(os.path.join(
                            self.output_dir,
                            f"{basename}_{label_slice}-{count}_{overall_confidence}_rect.jpg"),
                            img_rectangle)

                    # Write corner points into a .txt file
                    # txt_coordinates += '{},{}\n'.format(filename, self.format_quad_to_string(pred_region_coords))

                    # store info on the area for use after all areas in the image are gathered
                    self.feat.put(label_slice, count, region_size, overall_confidence,
                                  aspect_ratio, page_width_fraction, page_height_fraction,
                                  normalized_x, normalized_y,
                                  tag_rect_x0, tag_rect_y0, tag_rect_x1, tag_rect_y1,
                                  img_rectangle, cmts)

                    # Create page region and XML file
                    page_border = PAGE.Border(coords=PAGE.Point.cv2_to_point_list(
                        pred_region_coords[:, None, :]))
                    count += 1
            else:
                # No box found for label
                # page_border = PAGE.Border()
                continue

        if self.enable_debug:
            # boxes for all labels, using mask colors
            if original_img_box_viz_modified:
                imsave(os.path.join(self.output_dir, f"{basename}__boxes.jpg"),
                       original_img_box_viz)

        # finish the image; in non-production mode this saves the feature vector for the post-model
        self.feat.finish()

        page_prediction_msg = ""
        prediction_summary_txt = ""
        if self.production_mode:
            #
            # apply the post-model to determine the page type
            #
            v = np.zeros((1, self.feat.vec_length()))
            v[0] = self.feat.get_post_model_vec()
            y = self.post_model.predict(v)
            page_type = int(y[0])
            page_prediction_msg = f"PagePrediction: {basename} "

            #
            # take actions
            #
            if page_type == 0:
                # other page, skip
                page_prediction_msg += "type=0"
            elif page_type == 1:
                # start page of an article, save info
                page_prediction_msg += "type=1"
                title_info = self.feat.get_label_instance(1, 0)
                title_rect_x0 = 2 * title_info["tag_rect_x0"]
                title_rect_y0 = 2 * title_info["tag_rect_y0"]
                title_rect_x1 = 2 * title_info["tag_rect_x1"]
                title_rect_y1 = 2 * title_info["tag_rect_y1"]
                title_normalized_y = title_info["normalized_y"]

                author_info = self.feat.get_label_instance(2, 0)
                author_rect_x0 = 2 * author_info["tag_rect_x0"]
                author_rect_y0 = 2 * author_info["tag_rect_y0"]
                author_rect_x1 = 2 * author_info["tag_rect_x1"]
                author_rect_y1 = 2 * author_info["tag_rect_y1"]
                author_normalized_y = author_info["normalized_y"]

                acceptable = True
                # qualifications
                # (compare both confidences against the threshold; either could be 0, i.e. missing)
                if title_info["confidence"] < .5 or author_info["confidence"] < .5:
                    msg = " REJECT confidence too low "
                    self._put_results_log(msg)
                    prediction_summary_txt += msg + "\n"
                    acceptable = False
                if title_rect_y0 > author_rect_y0:
                    # unusual, author appears above title
                    msg = " REJECT author appears above title "
                    self._put_results_log(msg)
                    prediction_summary_txt += msg + "\n"
                    acceptable = False
                if title_normalized_y > 0.5 or author_normalized_y > 0.5:
                    msg = " REJECT: title or author appears in lower half of page "
                    self._put_results_log(msg)
                    prediction_summary_txt += msg + "\n"
                    acceptable = False

                title = self.extractor.find_bbox_text(page_number,
                                                      title_rect_x0, title_rect_y0,
                                                      title_rect_x1, title_rect_y1)
                title = self.cleaner.one_line(title)
                authors = self.extractor.find_bbox_text(page_number,
                                                        author_rect_x0, author_rect_y0,
                                                        author_rect_x1, author_rect_y1)
                authors = self.cleaner.cleanAuthors(authors)

                smsg = f"{basename}: page={page_number} TITLE={title} AUTHORS={authors}"
                self._put_results_log(smsg)
                prediction_summary_txt += smsg
                prediction_summary_txt += f"\nTITLE({title_info['comments']})\n"
                prediction_summary_txt += f"AUTHOR({author_info['comments']})\n"

                if acceptable:
                    json_per_image = os.path.join(self.output_dir, f"{basename}.json")
                    json_dict = {}
                    json_dict["page_number"] = page_number
                    json_dict["basename"] = basename
                    json_dict["type"] = "start_article"
                    json_dict["title"] = title
                    json_dict["authors"] = authors
                    json_txt = json.dumps(json_dict)
                    with open(json_per_image, "a") as f:
                        f.write(f"{json_txt}\n")
            elif page_type == 2:
                # references page, save info
                page_prediction_msg += "type=2"
            else:
                # toc page, save info (page_type == 3)
                page_prediction_msg += "type=3"
        else:
            # mode for gathering training data for the post-model
            pass

        finish_post = time.time()
        self._put_results_log(
            f"TIMING: wait={finish_wait - start_wait} post={finish_post - finish_wait} "
            f"{page_prediction_msg}")

        # if debug, emit a txt summary
        if self.enable_debug:
            if len(prediction_summary_txt) > 0:
                debug_per_image = os.path.join(self.output_dir, f"{basename}.txt")
                with open(debug_per_image, "a") as f:
                    f.write(f"{prediction_summary_txt}\n")