def routeNextTask(task, document, task_extras=None):
    """
    Queue the follow-on task for a document, unless the current task is
    flagged `no_continue`.

    The next task path is taken from `task.next_task_path` when present;
    otherwise it is looked up from the document's mime type in
    MIME_TYPE_TASKS (slot [1] of the entry). If no path can be resolved,
    nothing is queued.

    :param task: the just-completed task (read: no_continue,
        next_task_path, queue)
    :param document: the document being processed (read: mime_type, _id)
    :param task_extras: optional dict merged into the new task's inflate
    :return: None
    """
    # Guard clause: an explicit no_continue flag halts the pipeline here.
    if hasattr(task, 'no_continue') and task.no_continue:
        return

    next_task_path = None
    from lib.Worker.Models.uv_task import UnveillanceTask

    if hasattr(task, 'next_task_path'):
        next_task_path = task.next_task_path
    else:
        from vars import MIME_TYPE_TASKS
        from conf import DEBUG  # BUG FIX: DEBUG was referenced below but never imported (NameError)

        if document.mime_type in MIME_TYPE_TASKS.keys():
            try:
                next_task_path = MIME_TYPE_TASKS[document.mime_type][1]
            except Exception as e:
                # Entry exists but has no "next" slot; treat as no-op.
                if DEBUG:
                    print(e)

    if next_task_path is not None:
        inflate = {
            'task_path': next_task_path,
            'doc_id': document._id,
            'queue': task.queue
        }

        if task_extras is not None:
            inflate.update(task_extras)

        next_task = UnveillanceTask(inflate=inflate)
        next_task.run()
def evaluateFile(task): task_tag = "EVALUATING DOCUMENT (INFORMACAM)" print "\n\n************** %s [START] ******************\n" % task_tag print "image preprocessing at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS document = UnveillanceDocument(_id=task.doc_id) if document is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return if not document.getFile(task.file_name): print "NO FILE CONTENT" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return from lib.Worker.Models.uv_task import UnveillanceTask from lib.Worker.Utils.funcs import getFileType from vars import MIME_TYPE_TASKS from conf import ANNEX_DIR try: mime_type = getFileType(os.path.join(ANNEX_DIR, task.file_name)) new_task = UnveillanceTask(inflate={ 'task_path' : MIME_TYPE_TASKS[mime_type][0], 'doc_id' : document._id, 'file_name' : task.file_name }) document.addCompletedTask(task.task_path) new_task.run() except IndexError as e: print "NO NEXT TASK: %s" % e print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def startElasticsearch(self, catch=True):
    """
    Launch the elasticsearch daemon, wait for it to report startup,
    then run any initial tasks defined in conf/initial_tasks.json.

    :param catch: when True, block forever at the end (sleep loop) so
        the caller process stays alive.
    """
    # Build the els command line; config file lives alongside the app conf.
    cmd = [ELS_ROOT, '-Des.max-open-files=true',
        '-Des.config=%s' % os.path.join(CONF_ROOT, "els.settings.yaml")]

    print "elasticsearch running in daemon."
    print cmd

    p = Popen(cmd, stdout=PIPE, close_fds=True)
    data = p.stdout.readline()

    # Poll stdout until els prints its "...started" line, then mark the
    # status file and (on first use) seed the index before breaking out.
    while data:
        print data

        if re.match(r'.*started$', data):
            print "STARTED: %s" % data

            with open(self.els_status_file, 'wb+') as f:
                f.write("True")

            sleep(1)
            if self.first_use:
                self.initElasticsearch()

            break

        data = p.stdout.readline()

    p.stdout.close()

    #if self.first_use: startDaemon(self.els_log_file, self.els_pid_file)
    self.startCronJobs()

    # Best-effort: run each task listed in initial_tasks.json; a missing
    # or malformed file is not an error (just logged when DEBUG).
    try:
        with open(os.path.join(CONF_ROOT, "initial_tasks.json"), 'rb') as IT:
            from lib.Worker.Models.uv_task import UnveillanceTask

            for i_task in json.loads(IT.read()):
                task = UnveillanceTask(inflate=i_task)

                # One failed task must not abort the others.
                try:
                    task.run()
                except Exception as e:
                    if DEBUG:
                        print "TASK ERROR: %s" % e
    except Exception as e:
        if DEBUG:
            print "No initial tasks...\n%s" % e

    # Keep the process alive; callers rely on this blocking behavior.
    if catch:
        while True:
            sleep(1)
def do_reindex(self, request):
    """
    Re-run processing for a single document, driven by request params.

    Requires an `_id` query parameter. With no `task_path`/`task_queue`
    in the query, the document is reset and the default evaluation task
    is queued; otherwise the query itself describes the task to run
    (a `task_queue` supplies the starting task_path, a bare `task_path`
    runs once with no_continue).

    :return: the emitted task dict, or None on bad input / unknown doc
    """
    print("DOING REINDEX")

    params = parseRequestEntity(request.query)
    if params is None or '_id' not in params:
        return None

    record = self.get(_id=params['_id'])
    if record is None:
        return None

    doc = UnveillanceDocument(_id=record['_id'])
    del params['_id']

    task_spec = {'doc_id': doc._id, 'queue': UUID}

    if 'task_path' in params or 'task_queue' in params:
        task_spec.update(params)
        if 'task_queue' in task_spec:
            # A queue of tasks: start at the first entry.
            task_spec['task_path'] = task_spec['task_queue'][0]
        else:
            # A single explicit task: do not chain onward.
            task_spec['no_continue'] = True
    else:
        # Full re-evaluation from scratch.
        doc.reset()
        task_spec['task_path'] = "Documents.evaluate_document.evaluateDocument"

    job = UnveillanceTask(inflate=task_spec)
    job.run()
    return job.emit()
def runTask(self, handler):
    """
    Run a task described by the request body.

    A body containing only `_id` re-runs that existing task; a body
    with a `task_path` inflates and runs a brand-new task on this
    instance's queue. Anything else is rejected.

    :return: the emitted task dict, or None when no task can be built
    """
    try:
        args = parseRequestEntity(handler.request.body)
    except AttributeError as e:
        if DEBUG:
            print("No body?\n%s" % e)
        return None

    if len(args) == 1 and '_id' in args:
        # Re-run an already-persisted task by id.
        job = UnveillanceTask(_id=args['_id'])
    elif 'task_path' in args:
        # TODO: XXX: IF REFERER IS LOCALHOST ONLY (and other auth TBD)!
        args['queue'] = UUID
        job = UnveillanceTask(inflate=args)
    else:
        return None

    job.run()
    return job.emit()
def initSource(task):
    """
    Initialize an InformaCamSource from its uploaded assets.

    Walks `task.assets`: a `publicKey*` asset is registered and a PGP
    import task is prepared for it; a `credentials*` asset is parsed as
    JSON to populate the source's email/alias. A source with no public
    key fails the task.

    :param task: the running task (read: doc_id, assets, queue, task_path)
    """
    task_tag = "INITING SOURCE"
    print "\n\n************** %s [START] ******************\n" % task_tag
    task.setStatus(302)

    from lib.Worker.Models.ic_source import InformaCamSource
    from conf import DEBUG
    from vars import ASSET_TAGS

    source = InformaCamSource(_id=task.doc_id)
    if source is None:
        print "SOURCE DOCUMENT DOES NOT EXIST"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    if not hasattr(task, "assets"):
        print "NO ASSETS FOR THIS SOURCE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import re, json, os
    from conf import ANNEX_DIR

    next_task = None
    for asset in task.assets:
        description = None
        tags = None
        sync = False

        if re.match(r'publicKey', asset):
            # import key
            description = "Source's public pgp key"
            tags = [ASSET_TAGS['PGP_KEY']]

            # Prepared here, but only run at the end, after the asset has
            # been registered and the source saved.
            from lib.Worker.Models.uv_task import UnveillanceTask
            next_task = UnveillanceTask(inflate={
                'doc_id' : source._id,
                'task_path' : "PGP.import_key.importKey",
                'queue' : task.queue
            })
            sync = True
        elif re.match(r'credentials', asset):
            # parse creds
            with open(os.path.join(ANNEX_DIR, source.base_path, asset), 'rb') as C:
                # Best-effort: malformed credentials are logged and skipped.
                try:
                    credentials = json.loads(C.read())
                    if DEBUG:
                        print credentials

                    # Only copy over non-empty, recognized fields.
                    for field in ['email','alias']:
                        if field in credentials.keys() and credentials[field] != "":
                            setattr(source, field, credentials[field])

                    source.save()
                except Exception as e:
                    if DEBUG:
                        print e
                    pass

        asset_path = source.addAsset(None, asset, description=description, tags=tags)
        print "ASSET PATH: %s" % asset_path

        if asset_path is None:
            continue

        if sync:
            # The public key is also attached as a file on the source.
            print "ADDING %s AS FILE AS WELL:" % asset_path
            source.addFile(asset_path, None)

    if next_task is None:
        # No publicKey asset was seen; the source is unusable.
        print "NO PUBLIC KEY FOR SOURCE."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    source.addCompletedTask(task.task_path)

    # NOTE(review): fixed delay before running the key-import task —
    # presumably to let the preceding writes settle; confirm if needed.
    from time import sleep
    sleep(10)

    next_task.run()
    task.finish()
    print "\n\n************** %s [END] ******************\n" % task_tag
def unpackJ3MLog(uv_task):
    """
    Unpack a J3M log bundle into its j3m metadata and media submissions.

    For each asset on the task: a `log.j3m`/`log.j3m.json` file is added
    as an asset and routed onward via `uv_task.routeNext`; a jpg/mkv file
    is moved into ANNEX_DIR and wrapped in a new UnveillanceDocument with
    its own evaluation task.

    :param uv_task: the running task (read: doc_id, assets, queue)
    """
    task_tag = "UNPACKING J3M LOG"
    print "\n\n************** %s [START] ******************\n" % task_tag
    uv_task.setStatus(302)

    from lib.Worker.Models.ic_j3mlog import InformaCamLog
    from conf import DEBUG

    if not hasattr(uv_task, "assets"):
        print "NO ASSETS FOR THIS J3M LOG"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    j3m_log = InformaCamLog(_id=uv_task.doc_id)
    if j3m_log is None:
        print "J3M LOG DOES NOT EXIST"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    import re, os
    from fabric.api import local, settings
    from fabric.context_managers import hide
    from lib.Worker.Models.uv_task import UnveillanceTask
    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import ANNEX_DIR
    from vars import MIME_TYPES

    # Reclassify the document as a j3m log, keeping the original type.
    j3m_log.original_mime_type = j3m_log.mime_type
    j3m_log.mime_type = MIME_TYPES['j3mlog']
    j3m_log.save()

    for asset in uv_task.assets:
        if re.match(r'log.j3m(?:\.json)?', asset):
            # is the j3m
            try:
                j3m_name = j3m_log.addAsset(None, asset)
            except Exception as e:
                # Failure to add one asset is a warning, not fatal.
                print "WE COULD NOT ADD ASSET %s?" % asset
                print e
                print "\n\n************** %s [WARN] ******************\n" % task_tag
                continue

            if j3m_name is None:
                print "COULD NOT ADD J3M."
                print "\n\n************** %s [WARN] ******************\n" % task_tag
                continue

            # Hand the j3m off to the next task in the chain.
            uv_task.routeNext(inflate={'j3m_name' : j3m_name})
        elif re.match(r'.+\.(?:jpg|mkv)$', asset):
            # is a submission; create it, but move asset over into ANNEX_DIR first
            asset_path = os.path.join(ANNEX_DIR, j3m_log.base_path, asset)
            if DEBUG:
                print "MOVING ASSET FROM %s" % asset_path

            with settings(hide('everything'), warn_only=True):
                local("mv %s %s" % (asset_path, ANNEX_DIR))

            media = UnveillanceDocument(inflate={
                'file_name' : asset,
                'attached_to' : j3m_log._id
            })

            # Track the new submission on the log document.
            if not hasattr(j3m_log, "documents"):
                j3m_log.documents = []
            j3m_log.documents.append(media)

            media_task = UnveillanceTask(inflate={
                'task_path' : "Documents.evaluate_document.evaluateDocument",
                'doc_id' : media._id,
                'queue' : uv_task.queue,
                'file_name' : asset
            })
            media_task.run()

    uv_task.finish()
    print "\n\n************** %s [END] ******************\n" % task_tag
def splitPDFPages(task): print "\n\n************** SPLITTING PDF PAGES [START] ******************\n" print "splitting pdf at %s into pages" % task.doc_id task.setStatus(412) from copy import deepcopy from lib.Worker.Models.cp_pdf import CompassPDF from conf import DEBUG from vars import ASSET_TAGS pdf = CompassPDF(_id=task.doc_id) if pdf is None: print "PDF IS NONE" print "\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n" return from cStringIO import StringIO from PyPDF2 import PdfFileWriter from lib.Worker.Models.uv_task import UnveillanceTask from vars import MIME_TYPE_TASKS MAX_PAGES = 200 next_task = { 'task_path' : MIME_TYPE_TASKS['application/pdf'][1], 'doc_id' : task.doc_id, 'queue' : task.queue } pdf_reader = pdf.loadFile(pdf.file_name) if pdf_reader is None: print "PDF READER IS NONE" print "\n\n************** SPLITTING PDF PAGES [ERROR] ******************\n" return # get num pages total_pages = pdf_reader.getNumPages() if not hasattr(task, "num_pages"): task.num_pages = MAX_PAGES if total_pages > task.num_pages: print "THIS SHOULD BE SPLIT BEFORE CONTINUING!" count = done = 0 out = PdfFileWriter() for x in xrange(0, total_pages): page = pdf_reader.getPage(x) if x != 0 and x % num_pages == 0: if DEBUG: print "max reached... let's close this doc (done = %d)" % done print "merging pages %d to %d to PDF" % (count, x) count = x done += 1 new_pdf = StringIO() out.write(new_pdf) new_pdf.close() if pdf.addAsset(new_pdf.getvalue(), "doc_split_%d.pdf" % done, tags=[ASSET_TAGS['D_S'], ASSET_TAGS['AS_PDF']], description="Chunk %d of original document" % done): doc_split_task = deepcopy(next_task) doc_split_task.update({ 'split_file' : "doc_split_%d.pdf" % done, 'split_index' : done }) new_task = UnveillanceTask(inflate=doc_split_task) new_task.run() else: pdf.addCompletedTask(task.task_path) new_task = UnveillanceTask(inflate=deepcopy(next_task)) new_task.run() task.finish() print "\n\n************** SPLITTING PDF PAGES [END] ******************\n"
def extractPDFText(task): task_tag = "PDF TEXT EXTRACTION" print "\n\n************** %s [START] ******************\n" % task_tag print "extracting text from pdf at %s" % task.doc_id task.setStatus(412) from lib.Worker.Models.cp_pdf import CompassPDF from conf import DEBUG from vars import ASSET_TAGS pdf = CompassPDF(_id=task.doc_id) if pdf is None: print "PDF IS NONE" print "\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n" return """ In this task, we might be asked to extract from a broken-up sub-group of documents. if so, that should be set in the task's properties. """ pdf_reader = pdf.loadFile(pdf.file_name) total_pages = pdf_reader.getNumPages() if hasattr(task, "split_file"): pdf_reader = pdf.loadAsset(task.split_file) if pdf_reader is None: print "PDF READER IS NONE" print "\n\n************** PDF TEXT EXTRACTION [ERROR] ******************\n" return from json import loads lower_bound = 0 t = pdf.getAsset("doc_texts.json") if t is None: texts = [None] * total_pages else: try: texts = loads(t[0]) except TypeError as e: texts = [None] * total_pages if hasattr(task, "split_index") : lower_bound = task.split_index upper_bound = lower_bound + pdf_reader.getNumPages() for x in xrange(lower_bound, upper_bound): texts[x] = pdf_reader.getPage(x).extractText() if DEBUG: print "EXTRACTED TEXT from page %d of %d:\n%s" % (x, upper_bound, texts[x]) asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False, description="jsonified texts in document; page-by-page, segment-by-segment. uncleaned. 
(Not OCR)", tags=[ASSET_TAGS['TXT_JSON']]) if asset_path is not None: pdf.addFile(asset_path, None, sync=True) from lib.Worker.Models.uv_text import UnveillanceText uv_text = UnveillanceText(inflate={ 'media_id' : pdf._id, 'searchable_text' : texts, 'file_name' : asset_path }) pdf.text_id = uv_text._id pdf.save() pdf.addCompletedTask(task.task_path) if not hasattr(task, "no_continue"): from lib.Worker.Models.uv_task import UnveillanceTask next_task = UnveillanceTask(inflate={ 'task_path' : 'Text.preprocess_nlp.preprocessNLP', 'doc_id' : task.doc_id, 'queue' : task.queue, 'text_file' : asset_path }) next_task.run() if DEBUG: print "WHERE ARE THE F*****G S TEXTS? %d" % len(pdf.searchable_texts) task.finish() print "\n\n************** PDF TEXT EXTRACTION [END] ******************\n"