Beispiel #1
0
def routeNextTask(task, document, task_extras=None):
	"""Create and run the task that should follow `task` for `document`, if any.

	Nothing happens when `task` carries a truthy `no_continue` attribute.
	Otherwise the next task path is `task.next_task_path` when the task has
	that attribute, else the second entry registered for
	`document.mime_type` in `vars.MIME_TYPE_TASKS`.  When a path is found, an
	UnveillanceTask is inflated with that path, the document's `_id` and the
	task's `queue` (updated with `task_extras` when given) and run.

	Args:
		task: the task that just ran; read for `no_continue`,
			`next_task_path` and `queue`.
		document: read for `mime_type` and `_id`.
		task_extras: optional dict merged over the inflate data.

	Returns:
		None.
	"""
	# An explicit, truthy no_continue flag stops the chain here.
	if getattr(task, 'no_continue', False):
		return

	if hasattr(task, 'next_task_path'):
		next_task_path = task.next_task_path
	else:
		from vars import MIME_TYPE_TASKS

		next_task_path = None
		if document.mime_type in MIME_TYPE_TASKS:
			try:
				next_task_path = MIME_TYPE_TASKS[document.mime_type][1]
			except (IndexError, KeyError, TypeError) as e:
				# The mime type is registered but has no follow-up entry.
				# DEBUG is imported here so the handler cannot itself fail
				# with a NameError when the module does not define it.
				from conf import DEBUG
				if DEBUG: print(e)

	if next_task_path is None:
		return

	# Imported only once we know there is something to run.
	from lib.Worker.Models.uv_task import UnveillanceTask

	inflate = {
		'task_path' : next_task_path,
		'doc_id' : document._id,
		'queue' : task.queue
	}

	if task_extras is not None: inflate.update(task_extras)

	next_task = UnveillanceTask(inflate=inflate)
	next_task.run()
def evaluateFile(task):
	"""Detect a file's mime type and hand it to the first task registered for it.

	Sets the task status to 302, loads the UnveillanceDocument for
	`task.doc_id` and requires `document.getFile(task.file_name)` to be
	truthy.  The mime type reported by `getFileType` for
	`ANNEX_DIR/task.file_name` selects `MIME_TYPE_TASKS[mime_type][0]` as the
	path of a new UnveillanceTask, which is run after this task has been
	recorded as completed on the document.

	Args:
		task: read for `doc_id`, `file_name` and `task_path`; its
			`setStatus`, `fail` and `finish` methods are called.

	Returns:
		None.  The task is failed via `task.fail()` when the document or file
		is missing or no task is registered for the mime type, and finished
		via `task.finish()` otherwise.
	"""
	task_tag = "EVALUATING DOCUMENT (INFORMACAM)"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("image preprocessing at %s" % task.doc_id)
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	document = UnveillanceDocument(_id=task.doc_id)
	if document is None:
		print("DOC IS NONE")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()
		return

	if not document.getFile(task.file_name):
		print("NO FILE CONTENT")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()
		return

	# os is imported locally so the block does not rely on a module-level import.
	import os

	from lib.Worker.Models.uv_task import UnveillanceTask
	from lib.Worker.Utils.funcs import getFileType
	from vars import MIME_TYPE_TASKS
	from conf import ANNEX_DIR

	try:
		mime_type = getFileType(os.path.join(ANNEX_DIR, task.file_name))
		new_task = UnveillanceTask(inflate={
			'task_path' : MIME_TYPE_TASKS[mime_type][0],
			'doc_id' : document._id,
			'file_name' : task.file_name
		})

		document.addCompletedTask(task.task_path)
		new_task.run()
	except (KeyError, IndexError) as e:
		# KeyError: the mime type has no entry in MIME_TYPE_TASKS at all;
		# IndexError: it has an entry with no first task.  Both mean there
		# is nothing to route to, so the task must be failed.
		print("NO NEXT TASK: %s" % e)
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()
		return

	task.finish()
	print("\n\n************** %s [END] ******************\n" % task_tag)
	def startElasticsearch(self, catch=True):
		"""Launch Elasticsearch, then start the daemon, cron jobs and initial tasks.

		The process is started from ELS_ROOT with the settings file under
		CONF_ROOT.  Its stdout is echoed line by line until a line ending in
		"started" is seen (or the stream ends); on that line "True" is written
		to `self.els_status_file` and, when `self.first_use` is set,
		`self.initElasticsearch()` is called.  Afterwards `startDaemon` and
		`self.startCronJobs()` run, and every task described in
		CONF_ROOT/initial_tasks.json is inflated and run.

		Args:
			catch: when truthy, never return; sleep in a loop instead.
		"""
		settings_path = os.path.join(CONF_ROOT, "els.settings.yaml")
		cmd = [ELS_ROOT, '-Des.max-open-files=true', '-Des.config=%s' % settings_path]

		print("elasticsearch running in daemon.")
		print(cmd)

		p = Popen(cmd, stdout=PIPE, close_fds=True)

		# Echo output until the process announces it has started, or until EOF.
		while True:
			data = p.stdout.readline()
			if not data:
				break

			print(data)
			if not re.match(r'.*started$', data):
				continue

			print("STARTED: %s" % data)
			with open(self.els_status_file, 'wb+') as f:
				f.write("True")
			sleep(1)

			if self.first_use:
				self.initElasticsearch()
			break

		p.stdout.close()

		# Runs on every start, not only on first use.
		startDaemon(self.els_log_file, self.els_pid_file)
		self.startCronJobs()

		try:
			with open(os.path.join(CONF_ROOT, "initial_tasks.json"), 'rb') as IT:
				from lib.Worker.Models.uv_task import UnveillanceTask
				initial_tasks = json.loads(IT.read())

			for i_task in initial_tasks:
				task = UnveillanceTask(inflate=i_task)

				# One failing task must not stop the remaining ones.
				try:
					task.run()
				except Exception as e:
					if DEBUG:
						print("TASK ERROR: %s" % e)

		except Exception as e:
			# Missing or unreadable task list is not fatal.
			if DEBUG:
				print("No initial tasks...\n%s" % e)

		if catch:
			while True:
				sleep(1)
Beispiel #4
0
	def do_reindex(self, request):
		"""Queue a task for the document named by `_id` in the request query.

		Without `task_path` or `task_queue` in the query, the document is
		reset and the default evaluateDocument task is used.  Otherwise the
		remaining query entries are merged into the task data: a `task_queue`
		supplies its first element as the task path, while a bare `task_path`
		also sets `no_continue`.

		Returns:
			The result of the task's `emit()` after it has been run, or None
			when the query is unparseable, lacks `_id`, or `self.get` finds no
			document.
		"""
		print("DOING REINDEX")

		query = parseRequestEntity(request.query)
		if query is None or '_id' not in query.keys():
			return None

		record = self.get(_id=query['_id'])
		if record is None:
			return None

		document = UnveillanceDocument(_id=record['_id'])
		inflate = {'doc_id' : document._id, 'queue' : UUID}

		# Whatever is left in the query after this is task configuration.
		del query['_id']

		custom_task = 'task_path' in query.keys() or 'task_queue' in query.keys()
		if not custom_task:
			document.reset()
			inflate['task_path'] = "Documents.evaluate_document.evaluateDocument"
		else:
			inflate.update(query)

			if 'task_queue' in inflate:
				inflate['task_path'] = inflate['task_queue'][0]
			else:
				inflate['no_continue'] = True

		uv_task = UnveillanceTask(inflate=inflate)
		uv_task.run()

		return uv_task.emit()
Beispiel #5
0
	def runTask(self, handler):
		"""Run the task described by the request body and return its emitted form.

		A body containing only `_id` loads that existing UnveillanceTask.  Any
		other body must contain `task_path`; it is then given `queue` = UUID
		and used to inflate a new task.

		Args:
			handler: request handler; `handler.request.body` is parsed with
				`parseRequestEntity`.

		Returns:
			The result of the task's `emit()` after it has been run, or None
			when the body is missing or unparseable, or describes no task.
		"""
		try:
			args = parseRequestEntity(handler.request.body)
		except AttributeError as e:
			if DEBUG: print("No body?\n%s" % e)
			return None

		# parseRequestEntity can yield None (other call sites in this file
		# check for it); without this guard args.keys() would raise.
		if args is None: return None

		uv_task = None
		if len(args.keys()) == 1 and '_id' in args.keys():
			uv_task = UnveillanceTask(_id=args['_id'])
		elif 'task_path' in args.keys():
			# TODO: XXX: IF REFERER IS LOCALHOST ONLY (and other auth TBD)!
			args['queue'] = UUID
			uv_task = UnveillanceTask(inflate=args)

		if uv_task is None: return None

		uv_task.run()
		return uv_task.emit()
def initSource(task):
	"""Register a source's assets and queue the import of its public key.

	Each name in `task.assets` is added to the InformaCamSource for
	`task.doc_id`.  An asset starting with "publicKey" is tagged as a PGP
	key, also added as a file, and prepares a PGP.import_key.importKey task.
	An asset starting with "credentials" is read as JSON from the annex and
	its non-empty `email` / `alias` values are stored on the source.

	The task fails when it has no `assets` attribute or when no public key
	asset was seen; otherwise the key import task is run after a 10 second
	pause and this task is finished.  Returns None.
	"""
	task_tag = "INITING SOURCE"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	task.setStatus(302)

	from lib.Worker.Models.ic_source import InformaCamSource
	from conf import DEBUG
	from vars import ASSET_TAGS

	def give_up(reason):
		# Shared error exit: explain, print the error banner, fail the task.
		print(reason)
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()

	source = InformaCamSource(_id=task.doc_id)
	if source is None:
		return give_up("SOURCE DOCUMENT DOES NOT EXIST")

	if not hasattr(task, "assets"):
		return give_up("NO ASSETS FOR THIS SOURCE")

	import re, json, os
	from conf import ANNEX_DIR

	next_task = None
	for asset in task.assets:
		description = tags = None
		sync = False

		if re.match(r'publicKey', asset):
			# The key is imported by a follow-up task once all assets are in.
			from lib.Worker.Models.uv_task import UnveillanceTask

			description = "Source's public pgp key"
			tags = [ASSET_TAGS['PGP_KEY']]
			sync = True
			next_task = UnveillanceTask(inflate={
				'doc_id' : source._id,
				'task_path' : "PGP.import_key.importKey",
				'queue' : task.queue
			})
		elif re.match(r'credentials', asset):
			credentials_path = os.path.join(ANNEX_DIR, source.base_path, asset)
			with open(credentials_path, 'rb') as C:
				# Parsing is best effort: a bad credentials file is only logged.
				try:
					credentials = json.loads(C.read())
					if DEBUG: print(credentials)

					for field in ('email', 'alias'):
						if field not in credentials.keys():
							continue
						if credentials[field] != "":
							setattr(source, field, credentials[field])

					source.save()
				except Exception as e:
					if DEBUG: print(e)

		asset_path = source.addAsset(None, asset, description=description, tags=tags)
		print("ASSET PATH: %s" % asset_path)

		if sync and asset_path is not None:
			print("ADDING %s AS FILE AS WELL:" % asset_path)
			source.addFile(asset_path, None)

	if next_task is None:
		return give_up("NO PUBLIC KEY FOR SOURCE.")

	source.addCompletedTask(task.task_path)

	import time
	time.sleep(10)

	next_task.run()
	task.finish()
	print("\n\n************** %s [END] ******************\n" % task_tag)
Beispiel #7
0
def unpackJ3MLog(uv_task):
	"""Unpack the assets of a J3M log: route the J3M itself, spawn media documents.

	The InformaCamLog for `uv_task.doc_id` keeps its previous mime type in
	`original_mime_type` and is re-typed as MIME_TYPES['j3mlog'].  Then, for
	each name in `uv_task.assets`:

	* names matching `log.j3m` (optionally `.json`) are added as an asset and
	  passed to `uv_task.routeNext` as `j3m_name`; failures only warn;
	* names ending in `.jpg` or `.mkv` are moved into ANNEX_DIR, wrapped in a
	  new UnveillanceDocument attached to the log, and evaluated by a new
	  evaluateDocument task.

	The task fails when it has no `assets` attribute; otherwise it is
	finished at the end.  Returns None.
	"""
	task_tag = "UNPACKING J3M LOG"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	uv_task.setStatus(302)

	from lib.Worker.Models.ic_j3mlog import InformaCamLog
	from conf import DEBUG

	if not hasattr(uv_task, "assets"):
		print("NO ASSETS FOR THIS J3M LOG")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	j3m_log = InformaCamLog(_id=uv_task.doc_id)
	if j3m_log is None:
		print("J3M LOG DOES NOT EXIST")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		uv_task.fail()
		return

	import re, os
	from fabric.api import local, settings
	from fabric.context_managers import hide

	from lib.Worker.Models.uv_task import UnveillanceTask
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import ANNEX_DIR
	from vars import MIME_TYPES

	# Remember what the log was detected as before re-typing it.
	j3m_log.original_mime_type = j3m_log.mime_type
	j3m_log.mime_type = MIME_TYPES['j3mlog']
	j3m_log.save()

	is_j3m = re.compile(r'log.j3m(?:\.json)?').match
	is_media = re.compile(r'.+\.(?:jpg|mkv)$').match

	for asset in uv_task.assets:
		if is_j3m(asset):
			# The J3M data itself.
			try:
				j3m_name = j3m_log.addAsset(None, asset)
			except Exception as e:
				print("WE COULD NOT ADD ASSET %s?" % asset)
				print(e)
				print("\n\n************** %s [WARN] ******************\n" % task_tag)
				continue

			if j3m_name is None:
				print("COULD NOT ADD J3M.")
				print("\n\n************** %s [WARN] ******************\n" % task_tag)
				continue

			uv_task.routeNext(inflate={'j3m_name' : j3m_name})
			continue

		if not is_media(asset):
			continue

		# A submission: move it into ANNEX_DIR, then create a document for it.
		asset_path = os.path.join(ANNEX_DIR, j3m_log.base_path, asset)
		if DEBUG:
			print("MOVING ASSET FROM %s" % asset_path)

		with settings(hide('everything'), warn_only=True):
			local("mv %s %s" % (asset_path, ANNEX_DIR))

		media = UnveillanceDocument(inflate={
			'file_name' : asset,
			'attached_to' : j3m_log._id
		})

		if not hasattr(j3m_log, "documents"):
			j3m_log.documents = []

		j3m_log.documents.append(media)

		UnveillanceTask(inflate={
			'task_path' : "Documents.evaluate_document.evaluateDocument",
			'doc_id' : media._id,
			'queue' : uv_task.queue,
			'file_name' : asset
		}).run()

	uv_task.finish()
	print("\n\n************** %s [END] ******************\n" % task_tag)
def splitPDFPages(task):
	"""Split an oversized PDF into chunk assets and queue the next task for each.

	The PDF for `task.doc_id` is loaded through CompassPDF.  When it has more
	pages than `task.num_pages` (set to 200 on the task when absent), it is
	written out in consecutive chunks of at most that many pages as assets
	named `doc_split_<n>.pdf` (n counted from 1).  For every chunk that was
	stored, the second task registered for 'application/pdf' in
	MIME_TYPE_TASKS is run with `split_file` and `split_index` (= n) set.
	A PDF within the limit is recorded as completed and passed to that next
	task unsplit.

	Args:
		task: read for `doc_id`, `queue`, `task_path` and optional
			`num_pages`; its `setStatus`, `fail` and `finish` are called.

	Returns:
		None.  The task is failed when the PDF or its reader cannot be
		loaded, and finished otherwise.
	"""
	print("\n\n************** SPLITTING PDF PAGES [START] ******************\n")
	print("splitting pdf at %s into pages" % task.doc_id)
	task.setStatus(412)

	from copy import deepcopy
	from lib.Worker.Models.cp_pdf import CompassPDF

	from conf import DEBUG
	from vars import ASSET_TAGS

	pdf = CompassPDF(_id=task.doc_id)
	if pdf is None:
		print("PDF IS NONE")
		print("\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n")
		task.fail()
		return

	from cStringIO import StringIO
	from PyPDF2 import PdfFileWriter

	from lib.Worker.Models.uv_task import UnveillanceTask
	from vars import MIME_TYPE_TASKS

	MAX_PAGES = 200

	next_task = {
		'task_path' : MIME_TYPE_TASKS['application/pdf'][1],
		'doc_id' : task.doc_id,
		'queue' : task.queue
	}

	pdf_reader = pdf.loadFile(pdf.file_name)
	if pdf_reader is None:
		print("PDF READER IS NONE")
		print("\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n")
		task.fail()
		return

	# get num pages
	total_pages = pdf_reader.getNumPages()
	if not hasattr(task, "num_pages"): task.num_pages = MAX_PAGES

	# A non-positive chunk size cannot drive the loop below.
	chunk_size = task.num_pages
	if chunk_size < 1: chunk_size = MAX_PAGES

	if total_pages > chunk_size:
		print("THIS SHOULD BE SPLIT BEFORE CONTINUING!")

		done = 0
		for start in xrange(0, total_pages, chunk_size):
			# The last chunk may hold fewer than chunk_size pages.
			end = min(start + chunk_size, total_pages)

			if DEBUG:
				print("max reached... let's close this doc (done = %d)" % done)
				print("merging pages %d to %d to PDF" % (start, end))

			# A fresh writer per chunk, so chunks do not accumulate pages.
			out = PdfFileWriter()
			for x in xrange(start, end):
				out.addPage(pdf_reader.getPage(x))

			done += 1

			# Read the buffer before closing it; getvalue() fails afterwards.
			new_pdf = StringIO()
			try:
				out.write(new_pdf)
				chunk_content = new_pdf.getvalue()
			finally:
				new_pdf.close()

			if pdf.addAsset(chunk_content, "doc_split_%d.pdf" % done,
				tags=[ASSET_TAGS['D_S'], ASSET_TAGS['AS_PDF']], description="Chunk %d of original document" % done):

				doc_split_task = deepcopy(next_task)
				doc_split_task.update({
					'split_file' : "doc_split_%d.pdf" % done,
					'split_index' : done
				})

				new_task = UnveillanceTask(inflate=doc_split_task)
				new_task.run()

		# NOTE(review): unlike the unsplit branch, this task is not recorded
		# via pdf.addCompletedTask here - confirm whether that is intended.
	else:
		pdf.addCompletedTask(task.task_path)
		new_task = UnveillanceTask(inflate=deepcopy(next_task))
		new_task.run()

	task.finish()
	print("\n\n************** SPLITTING PDF PAGES [END] ******************\n")
def extractPDFText(task):
	"""Extract the text of each PDF page into the `doc_texts.json` asset.

	The PDF for `task.doc_id` is loaded through CompassPDF.  When the task
	has a `split_file`, pages are read from that asset instead of the whole
	file, and the extracted texts are merged into any existing
	`doc_texts.json` starting at slot `task.split_index`.  The resulting list
	is stored as an asset and file, wrapped in an UnveillanceText whose id is
	saved on the PDF as `text_id`, and, unless the task carries a truthy
	`no_continue`, a Text.preprocess_nlp.preprocessNLP task is run.

	Args:
		task: read for `doc_id`, `queue`, `task_path` and optional
			`split_file`, `split_index`, `no_continue`; its `setStatus`,
			`fail` and `finish` are called.

	Returns:
		None.  The task is failed when the PDF or a reader cannot be loaded,
		and finished otherwise.
	"""
	task_tag = "PDF TEXT EXTRACTION"
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("extracting text from pdf at %s" % task.doc_id)
	task.setStatus(412)

	from lib.Worker.Models.cp_pdf import CompassPDF

	from conf import DEBUG
	from vars import ASSET_TAGS

	def abort(reason):
		# Shared error exit: explain, print the error banner, fail the task.
		print(reason)
		print("\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n")
		task.fail()

	pdf = CompassPDF(_id=task.doc_id)
	if pdf is None:
		return abort("PDF IS NONE")

	# The whole file supplies the total page count; check it before use.
	whole_reader = pdf.loadFile(pdf.file_name)
	if whole_reader is None:
		return abort("PDF READER IS NONE")
	total_pages = whole_reader.getNumPages()

	# We might be asked to extract from a broken-up sub-group of the
	# document; if so, that is set in the task's properties.
	pdf_reader = whole_reader
	if hasattr(task, "split_file"):
		pdf_reader = pdf.loadAsset(task.split_file)
		if pdf_reader is None:
			return abort("PDF READER IS NONE")

	from json import loads
	lower_bound = 0
	texts = None
	t = pdf.getAsset("doc_texts.json")
	if t is not None:
		try:
			texts = loads(t[0])
		except (TypeError, ValueError, IndexError):
			# Unreadable or malformed previous texts: start over.
			texts = None

		# NOTE(review): split_index is used as-is as the first page slot and
		# only when earlier texts exist - confirm the producer passes a page
		# offset rather than a chunk number.
		if hasattr(task, "split_index") : lower_bound = task.split_index

	if not isinstance(texts, list):
		texts = [None] * total_pages

	upper_bound = lower_bound + pdf_reader.getNumPages()

	# Make sure every slot written below exists.
	if len(texts) < upper_bound:
		texts.extend([None] * (upper_bound - len(texts)))

	for x in xrange(lower_bound, upper_bound):
		# Slots are document-wide; page numbers are relative to the reader.
		texts[x] = pdf_reader.getPage(x - lower_bound).extractText()
		if DEBUG: print("EXTRACTED TEXT from page %d of %d:\n%s" % (x, upper_bound, texts[x]))

	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. uncleaned. (Not OCR)", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None:
		pdf.addFile(asset_path, None, sync=True)
		from lib.Worker.Models.uv_text import UnveillanceText
		uv_text = UnveillanceText(inflate={
			'media_id' : pdf._id,
			'searchable_text' : texts,
			'file_name' : asset_path
		})

		pdf.text_id = uv_text._id
		pdf.save()

	pdf.addCompletedTask(task.task_path)

	# Same rule as routeNextTask: only a truthy no_continue stops the chain.
	if not getattr(task, "no_continue", False):
		from lib.Worker.Models.uv_task import UnveillanceTask
		next_task = UnveillanceTask(inflate={
			'task_path' : 'Text.preprocess_nlp.preprocessNLP',
			'doc_id' : task.doc_id,
			'queue' : task.queue,
			'text_file' : asset_path
		})
		next_task.run()

	# searchable_texts is not set anywhere in this function, so only report
	# it when the model actually provides it.
	if DEBUG and hasattr(pdf, "searchable_texts"):
		print("WHERE ARE THE F*****G S TEXTS? %d" % len(pdf.searchable_texts))

	task.finish()
	print("\n\n************** PDF TEXT EXTRACTION [END] ******************\n")