def report_extracted(self, input_filename, output_dir, project):
    """Emit one ``extract_image`` signal per extracted ``.pgm`` file.

    The files are located by globbing ``output_prefix(input_filename,
    output_dir) + "*.pgm"``. For each match the signal carries the base
    name and MD5 of the input file, the base name and MD5 of the matched
    file, the page number derived from the matched file's base name via
    ``self.page_num``, and ``project`` unchanged.
    """
    source_name = os.path.basename(input_filename)
    # The input checksum is the same for every page, so compute it once.
    source_md5 = md5sum(input_filename)
    pattern = output_prefix(input_filename, output_dir) + "*.pgm"

    for page_path in glob.glob(pattern):
        page_name = os.path.basename(page_path)
        page_md5 = md5sum(page_path)
        page_number = self.page_num(page_name)
        extract_image.send(
            self,
            input_filename=source_name,
            input_md5=source_md5,
            filename=page_name,
            md5=page_md5,
            page=page_number,
            project=project,
        )
def split(self, output_dir, **kwargs):
    """Cut the table image into one ``.tiff`` file per cell.

    ``kwargs`` are forwarded untouched to ``self.get_boxes``, which yields
    ``(col, row, left, upper, right, lower)`` tuples. Each box is cropped
    from ``self.im``, passed through ``self.transform`` and thresholded,
    then saved into ``output_dir``. A ``split_image`` signal is sent for
    every cell written, carrying the source file's name and MD5, the cell
    file's name and MD5, and the cell's grid position and pixel bounds.
    """
    for col, row, left, upper, right, lower in self.get_boxes(**kwargs):
        cell = self.im.crop((left, upper, right, lower))
        cell = self.transform(cell, col, row)
        # Boost contrast: whitish areas are pushed to pure white.
        cell = threshold(cell, 200)

        cell_name = cell_basename(self.filename, col, row) + ".tiff"
        cell_path = os.path.join(output_dir, cell_name)
        cell.save(cell_path)
        # Checksum the file as written to disk, not the in-memory image.
        cell_md5 = md5sum(cell_path)

        split_image.send(
            self,
            input_filename=self.filename,
            input_md5=self.md5,
            filename=cell_name,
            md5=cell_md5,
            column=col,
            row=row,
            left=left,
            upper=upper,
            right=right,
            lower=lower,
        )
def ocr_image(self, input_filename, num_cols, num_rows, split_dir, ocr_dir, user):
    """Run tesseract on every existing cell image and publish its text.

    For each ``(col, row)`` in the ``num_cols`` x ``num_rows`` grid, the
    cell image ``cell_basename(input_filename, col, row) + '.tiff'`` is
    looked up in ``split_dir``. Cells whose image file does not exist are
    skipped silently. For the others, tesseract is invoked with the image
    path and an output base inside ``ocr_dir``; the text is then read from
    ``<output base>.txt`` and sent with the ``image_text`` signal together
    with the image's file name, its MD5, ``method='ocr'`` and ``user``.

    Raises:
        subprocess.CalledProcessError: if tesseract exits with a non-zero
            status.
    """
    for col in range(num_cols):
        for row in range(num_rows):
            basename = cell_basename(input_filename, col, row)
            img_filename = basename + '.tiff'
            path = os.path.join(split_dir, img_filename)
            if not os.path.exists(path):
                continue

            outbase = os.path.join(ocr_dir, basename)
            img_md5 = md5sum(path)

            # Hand the argument vector to subprocess directly. Formatting
            # the paths into a string and re-splitting it with shlex broke
            # on any path containing whitespace, quotes or backslashes.
            subprocess.check_call(["tesseract", path, outbase])

            output_filename = outbase + '.txt'
            with open(output_filename, 'r') as f:
                text = f.read()

            image_text.send(
                self,
                source_filename=img_filename,
                source_md5=img_md5,
                method='ocr',
                text=text,
                user=user,
            )
def __init__(self, filename):
    """Record the file's checksum and base name, then open it as an image.

    Sets ``self.md5`` to ``md5sum(filename)``, ``self.filename`` to the
    final path component of ``filename``, and ``self._im`` to the result
    of ``Image.open(filename)``.
    """
    # Checksum first, so the file is read by md5sum before Image.open.
    self.md5 = md5sum(filename)
    # Keep only the last path component; the directory part is dropped.
    self.filename = os.path.split(filename)[1]
    self._im = Image.open(filename)