def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via OCR.

    Results are cached by Cache keyed on (data, languages). Returns the
    extracted unicode text, or None when the data is not a readable image.
    Raises IngestorException when TESSDATA_PREFIX is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # BUG FIX: the format spec was a bare "%", which never interpolates
        # the exception into the log message.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_image(img)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    # get_text() returns raw bytes; decode to unicode before caching.
    text = extractor.get_text() or ''
    text = text.decode(encoding="UTF-8")
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def extract_pdf(path, languages=None):
    """Extract content from a PDF file.

    This will convert the whole file to XML using `pdftohtml`, then run
    OCR on individual images within the file.
    """
    temp_dir = make_tempdir()
    try:
        xml_path = os.path.join(temp_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        args = [get_config('PDFTOHTML_BIN'), '-xml', '-hidden', '-q',
                '-nodrm', path, xml_path]
        subprocess.call(args)
        if not os.path.exists(xml_path):
            raise IngestorException("Could not convert PDF to XML: %s" % path)
        with open(xml_path, 'r') as fh:
            xml = string_value(fh.read())
        # Drop the encoding declaration so lxml accepts the unicode string.
        xml = xml.replace('encoding="UTF-8"', '')
        parser = etree.XMLParser(recover=True, remove_comments=True)
        doc = etree.fromstring(xml, parser=parser)
        log.debug("Parsed XML: %r", path)
        pages = [extract_page(path, temp_dir, page, languages)
                 for page in doc.findall('./page')]
        return {'pages': pages}
    finally:
        remove_tempdir(temp_dir)
def ingest(self, meta, local_path):
    """Ingest an HTML document.

    Harvests <title> and the description meta tag into `meta` (only when
    not already set), cleans the DOM, converts it to PDF and extracts the
    PDF alternative. The temporary .htm file is always removed.
    """
    fh, out_path = mkstemp(suffix='.htm')
    os.close(fh)
    with open(local_path, 'rb') as fh:
        doc = html.fromstring(fh.read())
    if not meta.has('title'):
        title = doc.findtext('.//title')
        if title is not None:
            meta.title = title.strip()
    if not meta.has('summary'):
        summary = doc.find('.//meta[@name="description"]')
        if summary is not None and summary.get('content'):
            meta.summary = summary.get('content')
    self.cleaner(doc)
    try:
        # etree.tostring() returns bytes; write in binary mode.
        with open(out_path, 'wb') as fh:
            fh.write(etree.tostring(doc))
        pdf_path = self.generate_pdf_version(out_path)
        if pdf_path is None or not os.path.isfile(pdf_path):
            # BUG FIX: the meta was passed as a second exception argument
            # instead of being interpolated into the message.
            raise IngestorException("Could not convert document: %r" % meta)
        self.extract_pdf_alternative(meta, pdf_path)
    finally:
        if os.path.isfile(out_path):
            os.unlink(out_path)
def ingest_file(source_id, meta, file_path, move=False):
    """Archive a local file and queue it for ingest into the given source.

    Failures are routed to Ingestor.handle_exception rather than raised.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: interpolate the path into the message instead of
            # passing it as a second exception argument.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(source_id, meta.data)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)
def ingest_file(collection_id, meta, file_path, move=False):
    """Archive a local file and queue it for ingest into the collection.

    Failures are routed to Ingestor.handle_exception; the SQLAlchemy
    session is always released.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: interpolate the path into the message instead of
            # passing it as a second exception argument.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(collection_id, meta.to_attr_dict())
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
def handle_html(self, meta, html_path):
    """OK, this is weirder. Converting HTML to PDF via WebKit.

    Runs wkhtmltopdf on `html_path` and extracts the resulting PDF; the
    temporary output file is always removed.
    """
    out_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
        args = [
            wkhtmltopdf,
            '--disable-javascript',
            '--no-outline',
            '--no-images',
            '--quiet',
            html_path,
            out_path
        ]
        subprocess.call(args)
        if not os.path.isfile(out_path):
            # BUG FIX: the meta was passed as a second exception argument
            # instead of being interpolated into the message.
            raise IngestorException("Could not convert document: %r" % meta)
        self.extract_pdf_alternative(meta, out_path)
    finally:
        remove_tempfile(out_path)
def extract_image_data(data, languages=None):
    """Extract text from a binary string of data."""
    prefix = get_config('TESSDATA_PREFIX')
    if prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    # Serve previously-computed OCR results from the cache.
    cached = Cache.get_ocr(data, languages)
    if cached is not None:
        return cached
    image = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    ocr = Tesseract(prefix, lang=languages)
    ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    text = ocr.ocr_image(image)
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def ingest(self, meta, local_path):
    """Convert an image to a PDF (fit to A4, 300dpi) and ingest the result.

    The temporary PDF is always removed.
    """
    # BUG FIX: create the temp file before the try block so that
    # `pdf_path` is always bound when the finally clause runs; previously
    # a failure in mkstemp() raised a NameError from the cleanup.
    fh, pdf_path = mkstemp(suffix='.pdf')
    os.close(fh)
    try:
        meta.title = meta.file_name
        convert = get_config('CONVERT_BIN')
        args = [
            convert, local_path, '-density', '300', '-define',
            'pdf:fit-page=A4', pdf_path
        ]
        subprocess.call(args)
        if pdf_path is None or not os.path.isfile(pdf_path):
            raise IngestorException("Could not convert image: %r" % meta)
        self.store_pdf(meta, pdf_path)
        self.extract_pdf(meta, pdf_path)
    finally:
        if os.path.isfile(pdf_path):
            os.unlink(pdf_path)
def generate_pdf_alternative(self, meta, local_path):
    """Convert LibreOffice-supported documents to PDF."""
    work_dir = six.text_type(mkdtemp())
    instance_dir = six.text_type(mkdtemp())
    try:
        soffice = get_config('SOFFICE_BIN')
        # A private UserInstallation dir lets concurrent soffice runs
        # coexist. NOTE(review): the embedded quotes are passed literally
        # (shell=False) — confirm soffice accepts the quoted form.
        instance_path = u'"-env:UserInstallation=file://%s"' % instance_dir
        args = [
            soffice,
            '--convert-to', 'pdf',
            '--nofirststartwizard',
            instance_path,
            '--norestore',
            '--nologo',
            '--nodefault',
            '--nolockcheck',
            '--invisible',
            '--outdir', work_dir,
            '--headless',
            string_value(local_path),
        ]
        subprocess.call(args, timeout=CONVERT_TIMEOUT)
        # The converter writes a single output file into work_dir;
        # return the first entry found there.
        for name in os.listdir(work_dir):
            return os.path.join(work_dir, name)
        raise IngestorException("Could not convert document: %r" % meta)
    finally:
        shutil.rmtree(instance_dir)
def ingest_file(collection_id, meta, file_path, move=False,
                queue=WORKER_QUEUE, routing_key=WORKER_ROUTING_KEY):
    """Archive a local file and queue it for ingest into the collection.

    The queue and routing key arguments are a workaround to expedite
    user uploads over long-running batch imports. Failures are routed to
    Ingestor.handle_exception; the SQLAlchemy session is always released.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: interpolate the path into the message instead of
            # passing it as a second exception argument.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue, routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
def ingest_url(collection_id, metadata, url):
    """Download a URL to a temporary file and dispatch it for ingest.

    HTTP errors (status >= 400) and any other failure are routed to
    Ingestor.handle_exception; the SQLAlchemy session is always released.
    """
    meta = Metadata(data=metadata)
    # Create the temp file before the try block so `tmp_path` is always
    # bound for the cleanup in the finally clause.
    fh, tmp_path = mkstemp()
    os.close(fh)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # BUG FIX: iter_content() yields bytes; write in binary mode.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        # archive_file(move=True) consumes the file on success; remove
        # any leftover on failure so temp files don't accumulate.
        if os.path.isfile(tmp_path):
            os.unlink(tmp_path)
        db.session.remove()