def processPDFMetadata(uv_task):
    """Run peepdf over the task's PDF and store the cleaned output as an asset.

    The command line is assembled from the 'compass.peepdf.root' and
    'compass.peepdf.batch' config values plus the PDF's path under ANNEX_DIR.
    Blank lines are dropped and terminal colour escape sequences are removed
    from the captured output, which is saved as "<file_name>.peeped" and
    synced with addFile.  On success the task is routed onward with
    'md_file' and 'md_namespace' set, then finished.

    uv_task must provide doc_id, task_path, setStatus(), fail(), routeNext()
    and finish().  Every failure path calls uv_task.fail() and returns None.
    """
    task_tag = "PDF METADATA EXTRACTION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("extracting text from pdf at %s" % uv_task.doc_id)
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF

    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    import os
    import re

    from conf import ANNEX_DIR, getConfig
    from fabric.api import local, settings

    # NOTE(review): warn_only presumably keeps a non-zero peepdf exit from
    # aborting the task; only a None result is treated as failure below.
    with settings(warn_only=True):
        peepdf_raw = local("%s %s -s %s" % (
            getConfig('compass.peepdf.root'),
            os.path.join(ANNEX_DIR, pdf.file_name),
            getConfig('compass.peepdf.batch')), capture=True)

    if peepdf_raw is None:
        print("METADATA COULD NOT BE GENERATED")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    # Compile once instead of once per output line.
    colour_codes = re.compile("\033\\[[0-9;]+m")
    peepdf = [colour_codes.sub("", line)
              for line in peepdf_raw.splitlines() if line != ""]

    # save to asset, next task: compile metadata
    md_name = "%s.peeped" % pdf.file_name
    md_file = pdf.addAsset("\n".join(peepdf), md_name)
    if md_file is None or not pdf.addFile(md_file, None, sync=True):
        print("METADATA COULD NOT BE ADDED")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={
        'md_file' : md_name,
        'md_namespace' : "PDF"
    })
    print("\n\n************** %s [END] ******************\n" % task_tag)

    uv_task.finish()
def OCRPDF(task):
    """Placeholder OCR task: store a page-indexed "doc_ocr.json" asset.

    No OCR is performed yet; every page slot covered by the reader is filled
    with the string "TBD".  When the task carries a split_file attribute the
    page count of that asset is used for the loop, while the result list is
    still sized by the full document's page count.

    task must provide doc_id, task_path, setStatus() and finish().  Error
    paths print a banner and return None without finishing the task.
    """
    task_tag = "PDF OCR-TO-TEXT"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("OCRing text from pdf at %s" % task.doc_id)
    task.setStatus(412)

    from lib.Worker.Models.cp_pdf import CompassPDF
    from vars import ASSET_TAGS

    pdf = CompassPDF(_id=task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        return

    # In this task, we might be asked to extract from a broken-up sub-group
    # of documents.  If so, that should be set in the task's properties.
    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        # Checked before getNumPages() so a missing file takes the error
        # path instead of raising AttributeError.
        print("PDF READER IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        return

    total_pages = pdf_reader.getNumPages()

    if hasattr(task, "split_file"):
        pdf_reader = pdf.loadAsset(task.split_file)
        if pdf_reader is None:
            print("PDF READER IS NONE")
            print("\n\n************** %s [ERROR] ******************\n" % task_tag)
            return

    lower_bound = 0
    upper_bound = lower_bound + pdf_reader.getNumPages()

    texts = [None] * total_pages
    for x in xrange(lower_bound, upper_bound):
        # TODO: OCR the doc
        texts[x] = "TBD"

    asset_path = pdf.addAsset(texts, "doc_ocr.json", as_literal=False,
        description="jsonified texts in document; page-by-page. From OCR",
        tags=[ASSET_TAGS['TXT_OCR']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.addCompletedTask(task.task_path)
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def splitPDFPages(task):
    """Split an oversized PDF into chunk assets and queue follow-up tasks.

    The per-chunk page limit is task.num_pages, defaulting to MAX_PAGES when
    the task does not carry one.  A document within the limit is marked
    complete and a single follow-up task (MIME_TYPE_TASKS['application/pdf'][1])
    is run for it.  A larger document is written out as "doc_split_<n>.pdf"
    assets (n starting at 1); for each asset that is stored, a copy of the
    follow-up task is run with 'split_file' and 'split_index' set.

    task must provide doc_id, queue, task_path, setStatus() and finish().
    Error paths print a banner and return None without finishing the task.
    """
    print("\n\n************** SPLITTING PDF PAGES [START] ******************\n")
    print("splitting pdf at %s into pages" % task.doc_id)
    task.setStatus(412)

    from copy import deepcopy

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG
    from vars import ASSET_TAGS

    pdf = CompassPDF(_id=task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n")
        return

    from cStringIO import StringIO
    from PyPDF2 import PdfFileWriter

    from lib.Worker.Models.uv_task import UnveillanceTask
    from vars import MIME_TYPE_TASKS

    MAX_PAGES = 200

    next_task = {
        'task_path' : MIME_TYPE_TASKS['application/pdf'][1],
        'doc_id' : task.doc_id,
        'queue' : task.queue
    }

    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        print("PDF READER IS NONE")
        print("\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n")
        return

    # get num pages
    total_pages = pdf_reader.getNumPages()
    if not hasattr(task, "num_pages"):
        task.num_pages = MAX_PAGES

    def save_chunk(writer, index):
        # Serialise one chunk, store it as an asset and queue its task.
        buf = StringIO()
        writer.write(buf)
        # Read the bytes before closing; a closed buffer cannot be read.
        chunk_bytes = buf.getvalue()
        buf.close()

        chunk_name = "doc_split_%d.pdf" % index
        if pdf.addAsset(chunk_bytes, chunk_name,
            tags=[ASSET_TAGS['D_S'], ASSET_TAGS['AS_PDF']],
            description="Chunk %d of original document" % index):

            doc_split_task = deepcopy(next_task)
            doc_split_task.update({
                'split_file' : chunk_name,
                'split_index' : index
            })
            UnveillanceTask(inflate=doc_split_task).run()

    if total_pages > task.num_pages:
        print("THIS SHOULD BE SPLIT BEFORE CONTINUING!")

        count = done = 0
        out = PdfFileWriter()

        for x in xrange(0, total_pages):
            page = pdf_reader.getPage(x)

            if x != 0 and x % task.num_pages == 0:
                if DEBUG: print("max reached... let's close this doc (done = %d)" % done)
                print("merging pages %d to %d to PDF" % (count, x))

                count = x
                done += 1
                save_chunk(out, done)
                # Start a fresh writer so chunks do not accumulate pages.
                out = PdfFileWriter()

            out.addPage(page)

        # Flush the pages collected since the last full chunk.
        done += 1
        save_chunk(out, done)
    else:
        pdf.addCompletedTask(task.task_path)
        UnveillanceTask(inflate=deepcopy(next_task)).run()

    task.finish()
    print("\n\n************** SPLITTING PDF PAGES [END] ******************\n")
def extractPDFText(task):
    """Extract page text with the PDF reader and store it as "doc_texts.json".

    Texts are kept in a list with one slot per page of the full document.
    An existing "doc_texts.json" asset is reused as the starting list when it
    can be decoded.  When the task carries split_file, pages are read from
    that asset instead of the main file; when it carries split_index, that
    value is used as the offset at which the pages are written into the list.

    After saving the asset an UnveillanceText record is created and linked to
    the PDF through text_id.  Unless the task has a no_continue attribute,
    'Text.preprocess_nlp.preprocessNLP' is run next on the same queue.

    task must provide doc_id, queue, task_path, setStatus() and finish().
    Error paths print a banner and return None without finishing the task.
    """
    task_tag = "PDF TEXT EXTRACTION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("extracting text from pdf at %s" % task.doc_id)
    task.setStatus(412)

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG
    from vars import ASSET_TAGS

    pdf = CompassPDF(_id=task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
        return

    # In this task, we might be asked to extract from a broken-up sub-group
    # of documents.  If so, that should be set in the task's properties.
    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        # Checked before getNumPages() so a missing file takes the error
        # path instead of raising AttributeError.
        print("PDF READER IS NONE")
        print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
        return

    total_pages = pdf_reader.getNumPages()

    if hasattr(task, "split_file"):
        pdf_reader = pdf.loadAsset(task.split_file)
        if pdf_reader is None:
            print("PDF READER IS NONE")
            print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
            return

    from json import loads

    lower_bound = 0
    t = pdf.getAsset("doc_texts.json")
    if t is None:
        texts = [None] * total_pages
    else:
        try:
            texts = loads(t[0])
        except (TypeError, ValueError):
            # Undecodable or malformed stored texts: start over.
            texts = [None] * total_pages

    if hasattr(task, "split_index"):
        lower_bound = task.split_index

    upper_bound = lower_bound + pdf_reader.getNumPages()

    for x in xrange(lower_bound, upper_bound):
        # The reader numbers its own pages from 0, whatever the offset is.
        texts[x] = pdf_reader.getPage(x - lower_bound).extractText()
        if DEBUG: print("EXTRACTED TEXT from page %d of %d:\n%s" % (x, upper_bound, texts[x]))

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. uncleaned. (Not OCR)",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    from lib.Worker.Models.uv_text import UnveillanceText
    uv_text = UnveillanceText(inflate={
        'media_id' : pdf._id,
        'searchable_text' : texts,
        'file_name' : asset_path
    })

    pdf.text_id = uv_text._id
    pdf.save()
    pdf.addCompletedTask(task.task_path)

    if not hasattr(task, "no_continue"):
        from lib.Worker.Models.uv_task import UnveillanceTask
        next_task = UnveillanceTask(inflate={
            'task_path' : 'Text.preprocess_nlp.preprocessNLP',
            'doc_id' : task.doc_id,
            'queue' : task.queue,
            'text_file' : asset_path
        })
        next_task.run()

    if DEBUG: print("WHERE ARE THE F*****G S TEXTS? %d" % len(pdf.searchable_texts))

    task.finish()
    print("\n\n************** PDF TEXT EXTRACTION [END] ******************\n")
def get_documentcloud_ocr(uv_task):
    """Fetch per-page text for a PDF over HTTP and store it as "doc_texts.json".

    uv_task.documentcloud_auth is required and is placed in front of the host
    in each request URL.  uv_task.documentcloud_id is optional; when absent it
    is guessed from pdf.file_alias with ".pdf" removed.  The part of the id
    before the first '-' and the remainder are used as separate URL segments.

    One request is made per page index in range(pdf.total_pages).  A
    non-200 response or an empty body leaves "" in that page's slot; a 200
    response also creates a 'cp_page_text' UnveillanceELSStub for the page.
    The task is then routed onward with 'text_file' set and finished.

    Every failure path calls uv_task.fail() and returns None.
    """
    task_tag = "PULLING OCR FROM DOCUMENTCLOUD"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("OCRing text via documentcloud from pdf at %s" % uv_task.doc_id)
    uv_task.setStatus(302)

    if not hasattr(uv_task, "documentcloud_auth"):
        error_msg = "DOCUMENTCLOUD AUTH STRING NEEDED"
        print(error_msg)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail(status=412, message=error_msg)
        return

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG

    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        print("PDF READER IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    if not hasattr(uv_task, "documentcloud_id"):
        try:
            uv_task.documentcloud_id = pdf.file_alias.replace(".pdf", "")
            print("DOCUMENTCLOUD ID NOT PASSED. GUESSING AT IT WITH %s" % uv_task.documentcloud_id)
            print("\n\n************** %s [WARN] ******************\n" % task_tag)
        except Exception as e:
            print("COULD NOT GET DOCUMENTCLOUD ID FOR %s" % pdf.file_name)
            print(e)
            print("\n\n************** %s [ERROR] ******************\n" % task_tag)
            # Mark the task failed like the other error paths do.
            uv_task.fail()
            return

    import requests

    from lib.Core.Utils.funcs import generateMD5Hash
    from Models.uv_els_stub import UnveillanceELSStub
    from vars import ASSET_TAGS

    texts = [None] * pdf.total_pages

    count = 0
    id_parts = uv_task.documentcloud_id.split('-')
    req_map = {
        'a' : uv_task.documentcloud_auth,
        's' : id_parts[0],
        'd' : "-".join(id_parts[1:])
    }

    # NOTE(review): the host portion of this URL was garbled in the source
    # ("%(a)[email protected]", which is not a valid format string).
    # "%(a)s@www.documentcloud.org" is assumed here -- confirm the host.
    req_template = "https://%(a)s@www.documentcloud.org/documents/%(s)s/pages/%(d)s-p%(x)d.txt"

    for x in xrange(0, pdf.total_pages):
        req_map['x'] = x
        req = req_template % (req_map)

        if DEBUG: print("trying %s" % req)

        r = requests.get(req)
        if r.status_code != 200:
            print("\n\n************** %s [WARN] ******************\n" % task_tag)
            print("no text at page %d" % x)
        else:
            texts[count] = r.content

            # Constructed for its side effect; the instance is not reused.
            UnveillanceELSStub('cp_page_text', inflate={
                'media_id' : pdf._id,
                'searchable_text' : texts[count],
                'index_in_parent' : count,
                '_id' : generateMD5Hash(content=pdf._id, salt=str(count))
            })

        if texts[count] is None or len(texts[count]) == 0:
            print("\n\n************** %s [WARN] ******************\n" % task_tag)
            print("no text at page %d (%s)" % (x, type(texts[count])))
            texts[count] = ""

        count += 1

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document, from DocumentCloud",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.save()
    del texts

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print("\n\n************** %s [END] ******************\n" % task_tag)

    uv_task.finish()
def OCRPDF(uv_task):
    """OCR each page of a PDF with tesseract and store "doc_texts.json".

    For every page index in range(pdf.total_pages) the page is rendered to a
    temporary "p_image.jpg" under ANNEX_DIR/<pdf.base_path>, tesseract is run
    on that image, and the cleaned output is stored in the page's slot and in
    a 'cp_page_text' UnveillanceELSStub.  The loop sleeps one second between
    pages.  The task is then routed onward with 'text_file' set and finished.

    uv_task must provide doc_id, task_path, setStatus(), fail(), routeNext()
    and finish().  Every failure path calls uv_task.fail() and returns None.
    """
    task_tag = "PDF OCR-TO-TEXT"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("OCRing text from pdf at %s" % uv_task.doc_id)
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF

    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    # In this task, we might be asked to extract from a broken-up sub-group
    # of documents.  If so, that should be set in the task's properties.
    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        print("PDF READER IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    import os
    from fabric.api import settings, local
    from wand.image import Image
    from time import sleep

    from lib.Core.Utils.funcs import cleanLine
    from Models.uv_els_stub import UnveillanceELSStub
    from conf import ANNEX_DIR
    from vars import ASSET_TAGS

    texts = [None] * pdf.total_pages

    count = 0
    tmp_img = os.path.join(ANNEX_DIR, pdf.base_path, "p_image.jpg")

    # Bounded by pdf.total_pages so every index has a slot in texts.
    for x in xrange(0, pdf.total_pages):
        # pdf page to image ("<file>[<n>]" selects a single page)
        page_source = os.path.join(ANNEX_DIR, "%s[%d]" % (pdf.file_name, x))
        with Image(filename=page_source) as p_image:
            p_image.save(filename=tmp_img)

        # image to ocr; point tesseract at the file that was just written
        # rather than relying on the working directory
        with settings(warn_only=True):
            text = cleanLine(local("tesseract %s -" % tmp_img, capture=True))

        texts[count] = text

        # Constructed for its side effect; the instance is not reused.
        UnveillanceELSStub('cp_page_text', inflate={
            'media_id' : pdf._id,
            'searchable_text' : text,
            'index_in_parent' : count
        })

        sleep(1)
        count += 1

    # Nothing is written for a zero-page document.
    if os.path.exists(tmp_img):
        os.remove(tmp_img)

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. unclean. (OCR'd using tesseract)",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.save()
    del texts

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print("\n\n************** %s [END] ******************\n" % task_tag)

    uv_task.finish()
def splitPDFPages(task):
    """Record a PDF's page count and split it into chunks when it is too long.

    pdf.total_pages is set from the reader and saved.  The per-chunk limit is
    task.max_pages, defaulting to MAX_PAGES when the task does not carry one.
    When the document exceeds the limit, pages are collected into successive
    writers and each one is handed to saveSplitDocument(pdf, writer, n) with
    n starting at 1, including the final partial chunk.  The task is then
    marked complete, routed onward and finished.

    task must provide doc_id, task_path, setStatus(), fail(), routeNext()
    and finish().  Every failure path calls task.fail() and returns None.
    """
    task_tag = "SPLITTING PDF PAGES"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("splitting pdf at %s into pages" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG

    pdf = CompassPDF(_id=task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from PyPDF2 import PdfFileWriter

    MAX_PAGES = 75

    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        print("PDF READER IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    # get num pages
    pdf.total_pages = pdf_reader.getNumPages()
    pdf.save()

    if not hasattr(task, "max_pages"):
        task.max_pages = MAX_PAGES

    if pdf.total_pages > task.max_pages:
        print("THIS SHOULD BE SPLIT BEFORE CONTINUING!")

        # chunk_start is the index of the first page in the current writer.
        chunk_start = done = 0
        out = PdfFileWriter()

        for x in xrange(0, pdf.total_pages):
            page = pdf_reader.getPage(x)

            if x != 0 and x % task.max_pages == 0:
                if DEBUG: print("max reached... let's close this doc (done = %d)" % done)
                print("merging pages %d to %d to PDF" % (chunk_start, x))

                chunk_start = x
                done += 1
                saveSplitDocument(pdf, out, done)
                out = PdfFileWriter()

            out.addPage(page)

        # Flush the pages collected since the last full chunk.
        done += 1
        saveSplitDocument(pdf, out, done)

    pdf.addCompletedTask(task.task_path)
    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def extractPDFText(uv_task):
    """Extract text from every page of a PDF and store "doc_texts.json".

    Pages are read from the parts returned by pdf.getParts() when
    pdf.hasParts() is true, otherwise from the main file.  Page texts are
    passed through cleanLine and written into one list sized by
    pdf.total_pages, indexed by a counter that runs across all parts; each
    page also gets a 'cp_page_text' UnveillanceELSStub.  A part whose reader
    raises AttributeError on getNumPages() is reported and skipped.  The task
    is then routed onward with 'text_file' set and finished.

    uv_task must provide doc_id, task_path, setStatus(), fail(), routeNext()
    and finish().
    """
    task_tag = "PDF TEXT EXTRACTION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("extracting text from pdf at %s" % uv_task.doc_id)
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF

    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print("PDF IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    # In this task, we might be asked to extract from a broken-up sub-group
    # of documents.  If so, that should be set in the task's properties.

    # Only what this function uses is imported, so a missing image or shell
    # dependency cannot stop text extraction.
    from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
    from Models.uv_els_stub import UnveillanceELSStub
    from vars import ASSET_TAGS

    texts = [None] * pdf.total_pages

    if pdf.hasParts():
        extractors = pdf.getParts()
    else:
        extractors = [pdf.file_name]

    count = 0
    for part in extractors:
        if part == pdf.file_name:
            pdf_reader = pdf.loadFile(part)
        else:
            pdf_reader = pdf.loadAsset(part)

        try:
            num_pages = pdf_reader.getNumPages()
        except AttributeError as err:
            print(err)
            continue

        for x in xrange(0, num_pages):
            text = cleanLine(pdf_reader.getPage(x).extractText())
            texts[count] = text

            # Constructed for its side effect; the instance is not reused.
            UnveillanceELSStub('cp_page_text', inflate={
                'media_id' : pdf._id,
                'searchable_text' : text,
                'index_in_parent' : count,
                '_id' : generateMD5Hash(content=pdf._id, salt=str(count))
            })

            count += 1

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. unclean.",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.save()
    del texts

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print("\n\n************** %s [END] ******************\n" % task_tag)

    uv_task.finish()