def uploadDocument(uv_task):
    """Upload the task's document to DocumentCloud and store the returned id.

    Loads the UnveillanceDocument identified by ``uv_task.doc_id``, uploads it
    through a CompassDocumentCloudClient built from ``uv_task.auth_string``
    and saves ``upload['id']`` on the document as ``dc_id``.

    The task is failed (``uv_task.fail()``) and the function returns early
    when the document is None, when the task has no ``auth_string``
    attribute, or when the client returns None for the upload.

    Args:
        uv_task: task object; this function reads ``doc_id``, ``auth_string``
            and ``task_path`` and calls ``setStatus``, ``fail`` and ``finish``.
    """
    task_tag = "DOCUMENTCLOUD UPLOAD"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("uploading doc %s to DocumentCloud" % uv_task.doc_id)
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_documentcloud_client import CompassDocumentCloudClient
    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG

    document = UnveillanceDocument(_id=uv_task.doc_id)
    if document is None:
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("Document is None")
        uv_task.fail()
        return

    if not hasattr(uv_task, "auth_string"):
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("DocumentCloud upload needs an auth string")
        uv_task.fail()
        return

    dc_client = CompassDocumentCloudClient(auth_string=uv_task.auth_string)
    upload = dc_client.upload(document)
    if DEBUG:
        print(upload)

    if upload is None:
        # The auth string was present (checked above); what failed here is
        # the upload itself, so say so instead of repeating the auth message.
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("DocumentCloud upload failed for document %s" % uv_task.doc_id)
        uv_task.fail()
        return

    document.dc_id = upload['id']
    document.save()

    document.addCompletedTask(uv_task.task_path)
    uv_task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
    print("Uploaded document %s" % document._id)
def evaluateText(task):
    """Classify a document's text as JSON or plain text and index plain text.

    The text is read from the asset named by ``task.text_file`` when the task
    has that attribute, otherwise from the document's own file.  If it parses
    as JSON the document's ``mime_type`` is set to ``application/json`` and
    saved.  Otherwise the text is cut into pages of ``MAX_LINES_PER_PAGE``
    cleaned lines, stored as the ``doc_texts.json`` asset, and an
    UnveillanceText is created whose ``_id`` is saved on the document as
    ``text_id``.

    The task is failed only when no content could be loaded.  Errors while
    building the page list are reported (when DEBUG is set) and do not stop
    the task from being marked complete.

    Args:
        task: task object; this function reads ``doc_id``, ``task_path`` and
            the optional ``text_file`` and calls ``setStatus``, ``fail``,
            ``finish`` and ``routeNext``.
    """
    task_tag = "TEXT EVALUATION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("evaluating text at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG

    document = UnveillanceDocument(_id=task.doc_id)

    # limited choices: json, pgp, or txt
    if hasattr(task, "text_file"):
        content = document.loadAsset(task.text_file)
    else:
        content = document.loadFile(document.file_name)

    if content is None:
        print("no text to evaluate :(")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    new_mime_type = None
    import json
    try:
        # Only whether the content parses matters; the parsed value is unused.
        json.loads(content)
        new_mime_type = "application/json"
        print("THIS IS JSON")
    except Exception as e:
        print("NOT JSON: %s" % e)

    if new_mime_type is not None:
        document.mime_type = new_mime_type
        document.save()
    else:
        try:
            from lib.Core.Utils.funcs import cleanLine
            from vars import ASSET_TAGS

            # this is arbitrary
            MAX_LINES_PER_PAGE = 80

            cleaned_lines = [cleanLine(line) for line in content.splitlines()]
            txt_json = [
                " ".join(cleaned_lines[start:start + MAX_LINES_PER_PAGE])
                for start in range(0, len(cleaned_lines), MAX_LINES_PER_PAGE)
            ]
            # Slicing never yields a trailing empty page when the line count
            # is an exact multiple of the page size.  A text with no lines
            # still gets a single empty page.
            if not txt_json:
                txt_json.append("")

            document.total_pages = len(txt_json)
            document.save()

            asset_path = document.addAsset(
                txt_json, "doc_texts.json", as_literal=False,
                description="jsonified text of original document, segment by segment",
                tags=[ASSET_TAGS['TXT_JSON']])

            from lib.Worker.Models.uv_text import UnveillanceText
            uv_text = UnveillanceText(inflate={
                'media_id': document._id,
                'searchable_text': txt_json,
                'file_name': asset_path
            })

            document.text_id = uv_text._id
            document.save()
        except Exception as e:
            # Best effort: a failure here must not keep the task from
            # completing, so it is only reported.
            if DEBUG:
                print("ERROR HERE GENERATING DOC TEXTS:")
                print(e)

    document.addCompletedTask(task.task_path)
    task.finish()
    task.routeNext()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def massageJ3M(task):
    """Normalise a media document's J3M metadata and store it as a J3M record.

    Steps, in order:

    1. Load the J3M asset (``task.j3m_name`` if set, else ``j3m.json``) and
       parse it as JSON.  A missing asset fails the task; unparsable JSON
       fails it with ``status=412``.
    2. Copy ``genealogy.dateCreated`` onto the document as ``date_created``.
    3. Add ``public_hash``: the SHA-1 hex digest of
       ``genealogy.createdOnDevice`` followed by all ``genealogy.hashes``.
    4. Inside ``data``: swap the two elements of ``exif.location`` and of
       every ``gps_coords`` found in sensor playback entries, and add a
       ``bt_hash`` (SHA-1 of ``bssid``) to each visible wifi network.
    5. For ``data.userAppendedData``: apply the answer mappings from
       ``forms.json``, extract gzipped base64 audio answers into
       ``audio_<n>.3gp`` assets, and collect the remaining answers as
       searchable text.
    6. Save the result as the ``j3m.json`` asset, create an InformaCamJ3M
       from it and record its ``_id`` on the document as ``j3m_id``.

    Missing keys in steps 2-5 are reported (mostly only when DEBUG is set)
    and skipped rather than failing the task.

    Args:
        task: task object; this function reads ``doc_id``, ``task_path`` and
            the optional ``j3m_name`` and calls ``setStatus``, ``fail``,
            ``routeNext`` and ``finish``.
    """
    task_tag = "MASSAGING J3M"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("massaging j3m at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    # MIME_TYPES is needed by the gzip check on audio answers below.
    from vars import ASSET_TAGS, MIME_TYPES

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    if hasattr(task, "j3m_name"):
        j3m_name = task.j3m_name
    else:
        j3m_name = "j3m.json"

    j3m = media.loadAsset(j3m_name)
    if j3m is None:
        print("J3M IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from json import loads
    try:
        j3m = loads(j3m)
    except Exception as e:
        print("J3M IS INVALID")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail(status=412)
        return

    try:
        media.date_created = j3m['genealogy']['dateCreated']
        media.saveFields("date_created")
    except KeyError as e:
        print("J3M HAS NO DATE CREATED: %s" % e)
        print("\n\n************** %s [WARN] ******************\n" % task_tag)

    from hashlib import sha1
    try:
        j3m['public_hash'] = sha1("".join([
            j3m['genealogy']['createdOnDevice'],
            "".join(j3m['genealogy']['hashes'])
        ])).hexdigest()
    except KeyError as e:
        if DEBUG:
            print("no key %s" % e)

    # Defined before the 'data' check so the final step can always read it.
    searchable_text = []

    if 'data' in j3m:
        data = j3m['data']

        try:
            location = data['exif']['location']
            # Swap the two elements of the stored pair.
            data['exif']['location'] = [location[1], location[0]]
        except KeyError as e:
            if DEBUG:
                print("no key %s" % e)

        if 'sensorCapture' in data:
            for playback in data['sensorCapture']:
                sensor_playback = playback['sensorPlayback']

                if 'gps_coords' in sensor_playback:
                    try:
                        # Strip the surrounding brackets from the string form
                        # and split it into its two components.
                        gps = str(sensor_playback['gps_coords'])[1:-1].split(",")
                        if DEBUG:
                            print("REPLACING %s as geopoint" % gps)
                            print(type(gps))

                        sensor_playback['gps_coords'] = [
                            float(gps[1]), float(gps[0])]
                    except Exception as e:
                        if DEBUG:
                            print(e)

                if 'regionLocationData' in sensor_playback:
                    try:
                        region = sensor_playback['regionLocationData']
                        gps = str(region['gps_coords'])[1:-1].split(",")
                        if DEBUG:
                            print("REPLACING %s as geopoint" % gps)

                        region['gps_coords'] = [float(gps[1]), float(gps[0])]
                    except Exception as e:
                        if DEBUG:
                            print(e)

                if 'visibleWifiNetworks' in sensor_playback:
                    try:
                        for network in sensor_playback['visibleWifiNetworks']:
                            network['bt_hash'] = sha1(
                                network['bssid']).hexdigest()
                    except Exception as e:
                        if DEBUG:
                            print(e)
        elif DEBUG:
            print("no key 'sensorCapture'")

        if 'userAppendedData' in data:
            import os
            import json
            from conf import getConfig
            from lib.Core.Utils.funcs import b64decode
            from lib.Worker.Utils.funcs import getFileType, unGzipBinary

            try:
                forms_path = os.path.join(
                    getConfig('informacam.forms_root'), "forms.json")
                with open(forms_path, 'rb') as F:
                    form_data = json.loads(F.read())['forms']

                for udata in data['userAppendedData']:
                    for aForms in udata['associatedForms']:
                        answer_data = aForms['answerData']
                        # Answer keys still treated as plain text; audio
                        # answers are removed from this list below.
                        st_keys = list(answer_data.keys())

                        for f in form_data:
                            if f['namespace'] != aForms['namespace']:
                                continue

                            try:
                                for mapping in f['mapping']:
                                    try:
                                        group = list(mapping.keys())[0]
                                        key = answer_data[group].split(" ")

                                        # Replace each mapped token of the
                                        # answer with its mapped value.
                                        for m in mapping[group]:
                                            m_key = list(m.keys())[0]
                                            if m_key in key:
                                                key[key.index(m_key)] = m[m_key]

                                        answer_data[group] = " ".join(key)
                                    except KeyError as e:
                                        if DEBUG:
                                            print("no key %s" % e)
                            except KeyError as e:
                                if DEBUG:
                                    print("no key %s" % e)

                            try:
                                # NOTE(review): idx restarts at 0 for every
                                # matching form, so audio asset names can
                                # repeat across forms.
                                idx = 0
                                for audio in f['audio_form_data']:
                                    st_keys = [k for k in st_keys if k != audio]

                                    try:
                                        audio_data = b64decode(answer_data[audio])
                                        if audio_data is None:
                                            if DEBUG:
                                                print("could not unb64 audio")
                                            continue

                                        if getFileType(audio_data, as_buffer=True) != MIME_TYPES['gzip']:
                                            if DEBUG:
                                                print("audio is not gzipped")
                                            continue

                                        audio_f = "audio_%d.3gp" % idx
                                        media.addAsset(
                                            unGzipBinary(audio_data), audio_f,
                                            tags=[ASSET_TAGS['A_3GP']],
                                            description="3gp audio file from form")

                                        # No 3gp -> wav conversion is started
                                        # here; the answer records the wav
                                        # name matching the 3gp asset index.
                                        answer_data[audio] = "audio_%d.wav" % idx
                                        idx += 1
                                    except KeyError as e:
                                        if DEBUG:
                                            print("no key %s" % e)
                            except KeyError as e:
                                if DEBUG:
                                    print("no key %s" % e)

                        searchable_text.extend(
                            answer_data[k] for k in st_keys)

            except KeyError as e:
                if DEBUG:
                    print("no key %s" % e)
            except IOError as e:
                print("\n\n************** %s [WARN] ******************\n" % task_tag)
                if DEBUG:
                    print("no forms to go over: %s" % e)
            except ValueError as e:
                print("\n\n************** %s [WARN] ******************\n" % task_tag)
                if DEBUG:
                    print("for some reason, forms.json is not legible?\n%s" % e)

    if media.addAsset(j3m, "j3m.json", as_literal=False) is False:
        print("J3M COULD NOT BE ADDED")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from lib.Worker.Models.ic_j3m import InformaCamJ3M

    j3m['media_id'] = media._id
    if len(searchable_text) > 0:
        j3m['searchable_text'] = searchable_text

    j3m = InformaCamJ3M(inflate=j3m)
    print("\n\n***NEW J3M CREATED***\n\n")
    j3m.save()

    media.j3m_id = j3m._id
    media.save()
    media.addCompletedTask(task.task_path)

    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def verifySignature(task):
    """Check the J3M's detached signature and record the result on the media.

    Verifies the ``j3m.json`` asset against the ``j3m.sig`` asset with GPG
    (home directory taken from the ``gpg_homedir`` config value).  The media's
    ``j3m_verified`` flag is set to True only when the verification yields a
    fingerprint that equals, ignoring case, ``genealogy.createdOnDevice`` in
    the J3M; otherwise it is saved as False.

    The task is failed when the document is None, when either asset path is
    missing, or when GPG cannot be initialised.

    Args:
        task: task object; this function reads ``doc_id`` and ``task_path``
            and calls ``setStatus``, ``fail``, ``routeNext`` and ``finish``.
    """
    task_tag = "VERIFYING SIGNATURE"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("verifying signature at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    sig = media.getAsset("j3m.sig", return_only="path")
    j3m = media.getAsset("j3m.json", return_only="path")

    if DEBUG:
        print("j3m path: %s, sig path: %s" % (j3m, sig))

    if sig is None or j3m is None:
        err_msg = "NO SIGNATURE or J3M"
        print(err_msg)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail(message=err_msg)
        return

    import gnupg
    from conf import getConfig

    try:
        gpg = gnupg.GPG(homedir=getConfig('gpg_homedir'))
    except Exception as e:
        print("ERROR INITING GPG")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    media.j3m_verified = False
    verified = gpg.verify_file(j3m, sig_file=sig)
    if DEBUG:
        print("verified fingerprint: %s" % verified.fingerprint)

    if verified.fingerprint is not None:
        from json import loads

        try:
            supplied_fingerprint = str(loads(
                media.loadAsset("j3m.json"))['genealogy']['createdOnDevice'])
        except (KeyError, TypeError, ValueError) as e:
            # Without a readable device fingerprint in the J3M there is
            # nothing to compare against: leave the media unverified instead
            # of aborting the task with an unhandled exception.
            supplied_fingerprint = None
            print("J3M HAS NO DEVICE FINGERPRINT: %s" % e)
            print("\n\n************** %s [WARN] ******************\n" % task_tag)

        if (supplied_fingerprint is not None and
                verified.fingerprint.upper() == supplied_fingerprint.upper()):
            if DEBUG:
                print("SIGNATURE VALID for %s" % verified.fingerprint.upper())

            media.j3m_verified = True

    media.save()
    media.addCompletedTask(task.task_path)

    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def getAssets(uv_task):
    """Fetch a document's DocumentCloud manifest and entities as assets.

    Downloads ``documents/<dc_id>.json`` and stores it as the
    ``document_cloud_manifest.json`` asset.  Then downloads
    ``documents/<dc_id>/entities.json``; when present it is stored as
    ``document_cloud_entities.json`` and its ``entities`` are written to the
    document's UnveillanceText (created, and linked through ``text_id``, if
    the document has none yet).

    The function returns early, without calling ``fail()`` or ``finish()``,
    when the document is None, has no ``dc_id``, the task has no
    ``auth_string``, or no manifest is available.  Missing entities only
    produce a warning.

    Args:
        uv_task: task object; this function reads ``doc_id``, ``auth_string``
            and ``task_path`` and calls ``setStatus`` and ``finish``.
    """
    task_tag = "FETCHING DOCUMENTCLOUD ASSETS"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("getting DocumentCloud assets for %s" % uv_task.doc_id)
    uv_task.setStatus(412)

    from lib.Worker.Models.cp_documentcloud_client import CompassDocumentCloudClient
    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    # ASSET_TAGS supplies the tags passed to both addAsset() calls below.
    from vars import ASSET_TAGS

    document = UnveillanceDocument(_id=uv_task.doc_id)
    if document is None:
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("Document is None")
        return

    if not hasattr(document, "dc_id"):
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("Document has not document cloud id!")
        return

    if not hasattr(uv_task, "auth_string"):
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("DocumentCloud upload needs an auth string")
        return

    dc_client = CompassDocumentCloudClient(auth_string=uv_task.auth_string)

    dc_manifest = dc_client.download("documents/%s.json" % document.dc_id)
    if dc_manifest is None:
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        print("No DocumentCloud manifest yet for %s." % document._id)
        return

    document.addAsset(
        dc_manifest, "document_cloud_manifest.json", as_literal=False,
        description="description of document on DocumentCloud",
        tags=[ASSET_TAGS['DOC_CLOUD_MANIFEST'], ASSET_TAGS['DOC_CLOUD_DOC']])

    dc_entities = dc_client.download(
        "documents/%s/entities.json" % document.dc_id)
    if dc_entities is None:
        print("\n\n************** %s [WARN] ******************\n" % task_tag)
        print("No DocumentCloud entiteis yet for %s." % document._id)
    else:
        entity_asset = document.addAsset(
            dc_entities, "document_cloud_entities.json", as_literal=False,
            description="entites pulled from DocumentCloud",
            tags=[ASSET_TAGS['DOC_CLOUD_ENTITIES'], ASSET_TAGS['DOC_CLOUD_DOC']])

        from lib.Worker.Models.uv_text import UnveillanceText

        if not hasattr(document, "text_id"):
            # No text record yet: create one and link it to the document.
            text = UnveillanceText(inflate={
                'file_name': entity_asset,
                'entities': dc_entities['entities'],
                'media_id': document._id
            })

            document.text_id = text._id
            document.save()
        else:
            text = UnveillanceText(_id=document.text_id)
            text.entities = dc_entities['entities']
            text.save()

    document.addCompletedTask(uv_task.task_path)
    uv_task.finish()
def verifyVisualContent(task):
    """Hash the media file and compare it with the hashes listed in its J3M.

    Reads ``genealogy.hashes`` from the ``j3m.json`` asset, computes a hash of
    the media file with an external tool (``JavaMediaHasher.jar`` for images,
    ``ffmpeg -f md5`` for video) and saves ``media_verified`` as True when the
    computed hash equals one of the supplied hashes, False otherwise.

    The task is failed when the document or its J3M is None, or when the
    media's mime type is neither image nor video.  When the J3M carries no
    hashes the task is finished without routing to the next task.

    Args:
        task: task object; this function reads ``doc_id`` and ``task_path``
            and calls ``setStatus``, ``fail``, ``routeNext`` and ``finish``.
    """
    task_tag = "VERIFYING VISUAL CONTENT"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("verifying visual content at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    j3m = media.loadAsset("j3m.json")
    if j3m is None:
        print("NO J3M AT ALL")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    import os
    from json import loads
    from subprocess import Popen, PIPE

    from conf import ANNEX_DIR, getConfig
    from vars import MIME_TYPES

    try:
        supplied_hashes = loads(j3m)['genealogy']['hashes']
    except KeyError as e:
        print("NO HASHES")
        print("\n\n************** %s [WARNING] ******************\n" % task_tag)
        task.finish()
        return

    media.media_verified = False
    media_path = os.path.join(ANNEX_DIR, media.file_name)

    if media.mime_type == MIME_TYPES['image']:
        cmd = ["java", "-jar",
               os.path.join(getConfig('jpeg_tools_dir'), "JavaMediaHasher.jar"),
               media_path]
    elif media.mime_type == MIME_TYPES['video']:
        cmd = ["ffmpeg", "-y", "-i", media_path,
               "-vcodec", "copy", "-an", "-f", "md5", "-"]
    else:
        # No hashing tool is defined for other mime types; without this
        # branch `cmd` would be unbound below.
        print("NO HASHER FOR MIME TYPE %s" % media.mime_type)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    p = Popen(cmd, stdout=PIPE, close_fds=True)
    # communicate() drains stdout and waits for the child to exit.  Only the
    # first line of the output carries the hash.
    output = p.communicate()[0]
    verified_hash = output.split("\n", 1)[0].strip().replace("MD5=", "")

    # An empty hash means the tool printed nothing usable; it must never
    # count as a match.
    if verified_hash and isinstance(supplied_hashes, list):
        if any(supplied == verified_hash for supplied in supplied_hashes):
            media.media_verified = True

    media.save()
    media.addCompletedTask(task.task_path)

    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)