Example #1
0
def processPDFMetadata(uv_task):
	"""Run peepdf on the task's PDF and store the cleaned output as an asset.

	The command line is built from the ``compass.peepdf.root`` and
	``compass.peepdf.batch`` config values plus the PDF's path under
	``ANNEX_DIR``.  Empty lines are dropped and ANSI colour sequences are
	removed from the captured output, which is saved as
	``<file_name>.peeped``.

	uv_task -- task object; ``doc_id`` and ``task_path`` are read from it.
	On success it is routed onward (with ``md_file`` and ``md_namespace``)
	and then finished; on every error path ``uv_task.fail()`` is called.
	Returns None.
	"""
	task_tag = "PDF METADATA EXTRACTION"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("extracting text from pdf at %s" % uv_task.doc_id)
	uv_task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	import os
	import re
	from conf import ANNEX_DIR, getConfig
	from fabric.api import local, settings

	peepdf_cmd = "%s %s -s %s" % (
		getConfig('compass.peepdf.root'),
		os.path.join(ANNEX_DIR, pdf.file_name),
		getConfig('compass.peepdf.batch'))

	with settings(warn_only=True):
		peepdf_raw = local(peepdf_cmd, capture=True)

	# Under warn_only a failing command does not abort: local() still returns
	# the captured output and flags the problem through its `failed`
	# attribute, so testing for None alone never detects a failed run.
	if peepdf_raw is None or getattr(peepdf_raw, "failed", False):
		print("METADATA COULD NOT BE GENERATED")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	# compiled once instead of once per output line
	ansi_colour = re.compile("\033\\[[0-9;]+m")
	peepdf = [ansi_colour.sub("", line) for line in peepdf_raw.splitlines() if line != ""]

	# save to asset, next task: compile metadata
	md_name = "%s.peeped" % pdf.file_name
	md_file = pdf.addAsset("\n".join(peepdf), md_name)
	if md_file is None or not pdf.addFile(md_file, None, sync=True):
		print("METADATA COULD NOT BE ADDED")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={
		'md_file' : md_name,
		'md_namespace' : "PDF"
	})

	print("\n\n************** %s [END] ******************\n" % task_tag)
	uv_task.finish()
Example #2
0
def OCRPDF(task):
	"""Placeholder OCR task: store a page-by-page list of stubs as doc_ocr.json.

	Real OCR is not implemented yet (see the TODO below); every processed
	page slot is filled with the string "TBD".  The result list has one slot
	per page of the full document.  When the task carries a ``split_file``
	attribute, that asset is loaded and its page count bounds the loop.

	task -- task object; ``doc_id`` and ``task_path`` are read from it and
	``finish()`` is called on success.  The error paths only log and return.
	Returns None.
	"""
	task_tag = "PDF OCR-TO-TEXT"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("OCRing text from pdf at %s" % task.doc_id)
	task.setStatus(412)

	from lib.Worker.Models.cp_pdf import CompassPDF
	from vars import ASSET_TAGS

	pdf = CompassPDF(_id=task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		return

	# In this task, we might be asked to extract from a broken-up sub-group
	# of documents.  If so, that should be set in the task's properties.
	pdf_reader = pdf.loadFile(pdf.file_name)

	# The reader must be validated *before* it is asked for its page count;
	# otherwise a failed load raises AttributeError instead of being reported.
	if pdf_reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		return

	total_pages = pdf_reader.getNumPages()

	if hasattr(task, "split_file"):
		pdf_reader = pdf.loadAsset(task.split_file)
		if pdf_reader is None:
			print("PDF READER IS NONE")
			print("\n\n************** %s [ERROR] ******************\n" % task_tag)
			return

	lower_bound = 0
	upper_bound = lower_bound + pdf_reader.getNumPages()
	texts = [None] * total_pages

	for x in xrange(lower_bound, upper_bound):
		# TODO: OCR the doc
		texts[x] = "TBD"

	asset_path = pdf.addAsset(texts, "doc_ocr.json", as_literal=False,
		description="jsonified texts in document; page-by-page.  From OCR",
		tags=[ASSET_TAGS['TXT_OCR']])
	if asset_path is not None:
		pdf.addFile(asset_path, None, sync=True)

	pdf.addCompletedTask(task.task_path)
	task.finish()
	print("\n\n************** %s [END] ******************\n" % task_tag)
def splitPDFPages(task):
	"""Split an over-long PDF into chunk assets and run a follow-up task per chunk.

	If the document has more pages than ``task.num_pages`` (defaulted to 200
	when the task does not set it), the pages are written out in runs of
	``task.num_pages`` as assets named ``doc_split_<n>.pdf``.  For each chunk
	that is stored, a copy of the follow-up task is run with ``split_file``
	and ``split_index`` set.  Otherwise the task is marked complete on the
	PDF and a single follow-up task is run for the whole document.

	task -- task object; ``doc_id`` and ``queue`` are read from it and
	``finish()`` is called at the end.  The error paths only log and return.
	Returns None.
	"""
	print("\n\n************** SPLITTING PDF PAGES [START] ******************\n")
	print("splitting pdf at %s into pages" % task.doc_id)
	task.setStatus(412)

	from copy import deepcopy
	from lib.Worker.Models.cp_pdf import CompassPDF

	from conf import DEBUG
	from vars import ASSET_TAGS

	pdf = CompassPDF(_id=task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n")
		return

	from cStringIO import StringIO
	from PyPDF2 import PdfFileWriter

	from lib.Worker.Models.uv_task import UnveillanceTask
	from vars import MIME_TYPE_TASKS

	MAX_PAGES = 200

	next_task = {
		'task_path' : MIME_TYPE_TASKS['application/pdf'][1],
		'doc_id' : task.doc_id,
		'queue' : task.queue
	}

	pdf_reader = pdf.loadFile(pdf.file_name)
	if pdf_reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n")
		return

	def _save_chunk(writer, chunk_no):
		# Serialise the pages collected in `writer` as asset number `chunk_no`
		# and, if the asset was stored, run the follow-up task for it.
		buf = StringIO()
		writer.write(buf)
		# read the bytes out *before* closing; a closed buffer cannot be read
		chunk_bytes = buf.getvalue()
		buf.close()

		chunk_name = "doc_split_%d.pdf" % chunk_no
		if pdf.addAsset(chunk_bytes, chunk_name,
			tags=[ASSET_TAGS['D_S'], ASSET_TAGS['AS_PDF']],
			description="Chunk %d of original document" % chunk_no):

			doc_split_task = deepcopy(next_task)
			# NOTE(review): split_index is the 1-based chunk number, as before;
			# confirm the consuming task expects that and not a page offset.
			doc_split_task.update({
				'split_file' : chunk_name,
				'split_index' : chunk_no
			})

			UnveillanceTask(inflate=doc_split_task).run()

	# get num pages
	total_pages = pdf_reader.getNumPages()
	if not hasattr(task, "num_pages"):
		task.num_pages = MAX_PAGES

	if total_pages > task.num_pages:
		print("THIS SHOULD BE SPLIT BEFORE CONTINUING!")

		count = done = 0
		out = PdfFileWriter()

		for x in xrange(0, total_pages):
			page = pdf_reader.getPage(x)

			# the limit lives on the task; there is no local `num_pages`
			if x != 0 and x % task.num_pages == 0:
				if DEBUG:
					print("max reached... let's close this doc (done = %d)" % done)
					print("merging pages %d to %d to PDF" % (count, x))

				count = x
				done += 1

				_save_chunk(out, done)
				# start a fresh writer so chunks do not accumulate earlier pages
				out = PdfFileWriter()

			out.addPage(page)

		# the pages after the last boundary still need to be written out
		done += 1
		_save_chunk(out, done)
	else:
		pdf.addCompletedTask(task.task_path)
		new_task = UnveillanceTask(inflate=deepcopy(next_task))
		new_task.run()

	task.finish()
	print("\n\n************** SPLITTING PDF PAGES [END] ******************\n")
def extractPDFText(task):
	"""Extract embedded (non-OCR) text page by page into doc_texts.json.

	If a ``doc_texts.json`` asset already exists it is loaded and extended;
	otherwise a list with one slot per page of the full document is created.
	When the task carries ``split_file`` the pages are read from that asset,
	and, if an earlier asset exists, ``split_index`` is used as the first
	slot to fill.  After the asset is stored an UnveillanceText record is
	created and its id saved on the PDF.  Unless the task has a
	``no_continue`` attribute, the ``Text.preprocess_nlp.preprocessNLP`` task
	is run next.

	task -- task object; ``doc_id``, ``queue`` and ``task_path`` are read from
	it and ``finish()`` is called at the end.  The error paths only log and
	return.  Returns None.
	"""
	task_tag = "PDF TEXT EXTRACTION"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("extracting text from pdf at %s" % task.doc_id)
	task.setStatus(412)

	from lib.Worker.Models.cp_pdf import CompassPDF

	from conf import DEBUG
	from vars import ASSET_TAGS

	pdf = CompassPDF(_id=task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
		return

	# In this task, we might be asked to extract from a broken-up sub-group
	# of documents.  If so, that should be set in the task's properties.
	pdf_reader = pdf.loadFile(pdf.file_name)

	# Validate the reader before using it; checking after getNumPages() would
	# turn a failed load into an AttributeError.
	if pdf_reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
		return

	total_pages = pdf_reader.getNumPages()

	if hasattr(task, "split_file"):
		pdf_reader = pdf.loadAsset(task.split_file)
		if pdf_reader is None:
			print("PDF READER IS NONE")
			print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
			return

	from json import loads
	lower_bound = 0
	t = pdf.getAsset("doc_texts.json")
	if t is None:
		texts = [None] * total_pages
	else:
		try:
			texts = loads(t[0])
		except (TypeError, ValueError):
			# unreadable or malformed earlier result: start over
			texts = [None] * total_pages

		if hasattr(task, "split_index"):
			lower_bound = task.split_index

	upper_bound = lower_bound + pdf_reader.getNumPages()

	for x in xrange(lower_bound, upper_bound):
		# `x` is the slot in `texts`; the reader's own pages start at 0
		texts[x] = pdf_reader.getPage(x - lower_bound).extractText()
		if DEBUG:
			print("EXTRACTED TEXT from page %d of %d:\n%s" % (x, upper_bound, texts[x]))

	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. uncleaned. (Not OCR)", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None:
		pdf.addFile(asset_path, None, sync=True)
		from lib.Worker.Models.uv_text import UnveillanceText
		uv_text = UnveillanceText(inflate={
			'media_id' : pdf._id,
			'searchable_text' : texts,
			'file_name' : asset_path
		})

		pdf.text_id = uv_text._id
		pdf.save()

	pdf.addCompletedTask(task.task_path)

	if not hasattr(task, "no_continue"):
		from lib.Worker.Models.uv_task import UnveillanceTask
		next_task = UnveillanceTask(inflate={
			'task_path' : 'Text.preprocess_nlp.preprocessNLP',
			'doc_id' : task.doc_id,
			'queue' : task.queue,
			'text_file' : asset_path
		})
		next_task.run()

	if DEBUG:
		print("WHERE ARE THE F*****G S TEXTS? %d" % len(pdf.searchable_texts))

	task.finish()
	print("\n\n************** PDF TEXT EXTRACTION [END] ******************\n")
def get_documentcloud_ocr(uv_task):
	"""Fetch per-page text for the task's PDF from DocumentCloud.

	One HTTP GET is issued per page (``pdf.total_pages`` requests).  A page
	whose request does not return 200, or whose body is empty, is stored as
	an empty string.  For every page that did return 200 an
	UnveillanceELSStub of type ``cp_page_text`` is created.  The collected
	list is saved as the ``doc_texts.json`` asset and the task is routed
	onward with ``text_file`` set.

	uv_task -- task object.  ``documentcloud_auth`` is required (the task
	fails with status 412 without it).  ``documentcloud_id`` is optional;
	when absent it is guessed from ``pdf.file_alias`` with ".pdf" removed.
	Returns None.
	"""
	task_tag = "PULLING OCR FROM DOCUMENTCLOUD"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("OCRing text via documentcloud from pdf at %s" % uv_task.doc_id)
	uv_task.setStatus(302)

	if not hasattr(uv_task, "documentcloud_auth"):
		error_msg = "DOCUMENTCLOUD AUTH STRING NEEDED"
		print(error_msg)
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail(status=412, message=error_msg)
		return

	from lib.Worker.Models.cp_pdf import CompassPDF
	from conf import DEBUG

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	pdf_reader = pdf.loadFile(pdf.file_name)
	if pdf_reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	if not hasattr(uv_task, "documentcloud_id"):
		try:
			uv_task.documentcloud_id = pdf.file_alias.replace(".pdf", "")

			print("DOCUMENTCLOUD ID NOT PASSED. GUESSING AT IT WITH %s" % uv_task.documentcloud_id)
			print("\n\n************** %s [WARN] ******************\n" % task_tag)
		except Exception as e:
			print("COULD NOT GET DOCUMENTCLOUD ID FOR %s" % pdf.file_name)
			print(e)
			print("\n\n************** %s [ERROR] ******************\n" % task_tag)
			# mark the task failed like every other error path in this function
			uv_task.fail()
			return

	import requests

	from lib.Core.Utils.funcs import generateMD5Hash
	from Models.uv_els_stub import UnveillanceELSStub
	from vars import ASSET_TAGS

	texts = [None] * pdf.total_pages

	# the id is "<first token>-<rest>"; split it once
	id_parts = uv_task.documentcloud_id.split('-')
	req_map = {
		'a' : uv_task.documentcloud_auth,
		's' : id_parts[0],
		'd' : "-".join(id_parts[1:])
	}

	# The previous literal had been mangled by e-mail obfuscation and its
	# "%(a)[" sequence raised ValueError when formatted.
	# NOTE(review): the host below is reconstructed, not recovered -- confirm
	# it against the DocumentCloud account this deployment uses.
	# NOTE(review): pages are requested as p0..p(n-1); confirm whether
	# DocumentCloud numbers page files from 0 or from 1.
	url_template = "https://%(a)s@www.documentcloud.org/documents/%(s)s/pages/%(d)s-p%(x)d.txt"

	for x in xrange(0, pdf.total_pages):
		req_map['x'] = x
		req = url_template % req_map

		if DEBUG:
			# NOTE(review): this prints the auth string as part of the URL
			print("trying %s" % req)

		r = requests.get(req)
		if r.status_code != 200:
			print("\n\n************** %s [WARN] ******************\n" % task_tag)
			print("no text at page %d" % x)
		else:
			texts[x] = r.content

			els_stub = UnveillanceELSStub('cp_page_text', inflate={
				'media_id' : pdf._id,
				'searchable_text' : texts[x],
				'index_in_parent' : x,
				'_id' : generateMD5Hash(content=pdf._id, salt=str(x))
			})

		if texts[x] is None or len(texts[x]) == 0:
			print("\n\n************** %s [WARN] ******************\n" % task_tag)
			print("no text at page %d (%s)" % (x, type(texts[x])))
			texts[x] = ""

	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document, from DocumentCloud", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None:
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	del texts

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print("\n\n************** %s [END] ******************\n" % task_tag)

	uv_task.finish()
Example #6
0
def OCRPDF(uv_task):
	"""OCR every page of the task's PDF with tesseract into doc_texts.json.

	Each page is rendered to a temporary JPEG (``p_image.jpg`` under the
	PDF's base path in ``ANNEX_DIR``), passed to the ``tesseract`` command
	line tool, and the cleaned output is stored in the page's slot.  An
	UnveillanceELSStub of type ``cp_page_text`` is created per page.  The
	collected list is saved as the ``doc_texts.json`` asset and the task is
	routed onward with ``text_file`` set.

	uv_task -- task object; ``doc_id`` and ``task_path`` are read from it.
	``fail()`` is called on the error paths, ``finish()`` on success.
	Returns None.
	"""
	task_tag = "PDF OCR-TO-TEXT"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("OCRing text from pdf at %s" % uv_task.doc_id)
	# the parameter is `uv_task`; there is no `task` in this scope
	uv_task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	# In this task, we might be asked to extract from a broken-up sub-group
	# of documents.  If so, that should be set in the task's properties.
	pdf_reader = pdf.loadFile(pdf.file_name)
	if pdf_reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	import os
	from fabric.api import settings, local
	from wand.image import Image
	from time import sleep

	from lib.Core.Utils.funcs import cleanLine
	from Models.uv_els_stub import UnveillanceELSStub
	from conf import ANNEX_DIR
	from vars import ASSET_TAGS

	texts = [None] * pdf.total_pages
	tmp_img = os.path.join(ANNEX_DIR, pdf.base_path, "p_image.jpg")

	try:
		# iterate over the same page count that sized `texts`
		for x in xrange(0, pdf.total_pages):
			# pdf page to image
			with Image(filename=os.path.join(ANNEX_DIR, "%s[%d]" % (pdf.file_name, x))) as p_image:
				p_image.save(filename=tmp_img)

				# image to ocr -- point tesseract at the file that was just
				# written rather than at a name relative to the working dir
				with settings(warn_only=True):
					text = cleanLine(local('tesseract "%s" -' % tmp_img, capture=True))
					texts[x] = text

					els_stub = UnveillanceELSStub('cp_page_text', inflate={
						'media_id' : pdf._id,
						'searchable_text' : text,
						'index_in_parent' : x
					})

				sleep(1)
	finally:
		# the image only exists if at least one page was rendered
		if os.path.exists(tmp_img):
			os.remove(tmp_img)

	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. unclean. (OCR'd using tesseract)",
		tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None:
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	del texts

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print("\n\n************** %s [END] ******************\n" % task_tag)

	uv_task.finish()
Example #7
0
def splitPDFPages(task):
	"""Record the PDF's page count and split it into parts if it is too long.

	The page count is stored on the PDF as ``total_pages`` and saved.  When
	it exceeds ``task.max_pages`` (defaulted to 75 if the task does not set
	it), the pages are grouped into consecutive runs of ``task.max_pages``
	and each run is handed to ``saveSplitDocument`` with a 1-based part
	number.  The task is then marked complete, routed onward and finished.

	task -- task object; ``doc_id`` and ``task_path`` are read from it and
	``fail()`` is called on the error paths.  Returns None.
	"""
	task_tag = "SPLITTING PDF PAGES"

	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("splitting pdf at %s into pages" % task.doc_id)
	task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF
	from conf import DEBUG

	pdf = CompassPDF(_id=task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()
		return

	from PyPDF2 import PdfFileWriter
	from lib.Worker.Models.uv_task import UnveillanceTask
	from vars import MIME_TYPE_TASKS

	MAX_PAGES = 75

	reader = pdf.loadFile(pdf.file_name)
	if reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()
		return

	# remember how many pages the document has
	pdf.total_pages = reader.getNumPages()
	pdf.save()

	if not hasattr(task, "max_pages"):
		task.max_pages = MAX_PAGES

	if pdf.total_pages > task.max_pages:
		print("THIS SHOULD BE SPLIT BEFORE CONTINUING!")

		pages_seen = 0
		parts_saved = 0
		writer = PdfFileWriter()

		for page_no in xrange(0, pdf.total_pages):
			current_page = reader.getPage(page_no)
			at_boundary = page_no != 0 and page_no % task.max_pages == 0

			if at_boundary:
				if DEBUG:
					print("max reached... let's close this doc (done = %d)" % parts_saved)
					print("merging pages %d to %d to PDF" % (pages_seen, page_no))

				pages_seen = page_no
				parts_saved += 1

				# hand off the full part, then begin collecting the next one
				saveSplitDocument(pdf, writer, parts_saved)
				writer = PdfFileWriter()

			writer.addPage(current_page)
			pages_seen += 1

		# whatever is left after the last boundary forms the final part
		parts_saved += 1
		saveSplitDocument(pdf, writer, parts_saved)

	pdf.addCompletedTask(task.task_path)
	task.routeNext()
	task.finish()
	print("\n\n************** %s [END] ******************\n" % task_tag)
Example #8
0
def extractPDFText(uv_task):
	"""Extract embedded text from the task's PDF, page by page, into doc_texts.json.

	Text is read either from the PDF's parts (when ``pdf.hasParts()`` is
	true) or from the original file.  Each page's text is passed through
	``cleanLine``, stored in a list sized by ``pdf.total_pages``, and
	recorded as an UnveillanceELSStub of type ``cp_page_text``.  A source
	whose reader has no ``getNumPages`` is logged and skipped.  The list is
	saved as the ``doc_texts.json`` asset and the task is routed onward with
	``text_file`` set.

	uv_task -- task object; ``doc_id`` and ``task_path`` are read from it.
	Returns None.
	"""
	task_tag = "PDF TEXT EXTRACTION"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("extracting text from pdf at %s" % uv_task.doc_id)
	uv_task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	# In this task, we might be asked to extract from a broken-up sub-group
	# of documents.  If so, that should be set in the task's properties.
	import os
	from fabric.api import settings, local
	from wand.image import Image
	from time import sleep

	from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
	from Models.uv_els_stub import UnveillanceELSStub
	from conf import ANNEX_DIR, DEBUG
	from vars import ASSET_TAGS

	texts = [None] * pdf.total_pages

	# read from the split parts when there are any, else from the original
	sources = pdf.getParts() if pdf.hasParts() else [pdf.file_name]

	slot = 0
	for source in sources:
		if source == pdf.file_name:
			reader = pdf.loadFile(source)
		else:
			reader = pdf.loadAsset(source)

		try:
			page_total = reader.getNumPages()
		except AttributeError as err:
			print(err)
			continue

		for page_no in xrange(0, page_total):
			page_text = cleanLine(reader.getPage(page_no).extractText())
			texts[slot] = page_text

			els_stub = UnveillanceELSStub('cp_page_text', inflate={
				'media_id' : pdf._id,
				'searchable_text' : page_text,
				'index_in_parent' : slot,
				'_id' : generateMD5Hash(content=pdf._id, salt=str(slot))
			})

			slot += 1

	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. unclean.", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None:
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	del texts

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print("\n\n************** %s [END] ******************\n" % task_tag)

	uv_task.finish()