def evaluateDocument(uv_task): task_tag = "DOCUMENT EVALUATION" print "\n\n************** %s [START] ******************\n" % task_tag uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG, UUID document = None if hasattr(uv_task, "doc_id"): if DEBUG: print "GETTING A DOCUMENT FROM ID: %s" % uv_task.doc_id document = UnveillanceDocument(_id=uv_task.doc_id) else: if DEBUG: print "INFLATING NEW DOCUMENT WITH FILE NAME: %s" % uv_task.file_name document = UnveillanceDocument(inflate={'file_name' : uv_task.file_name}) if document is None: print "\n\n************** %s [INVALID] ******************\n" % task_tag print "DOCUMENT INVALID (is None)" uv_task.fail(message="DOCUMUENT INVALID (is none)") return from lib.Worker.Models.uv_task import UnveillanceTask from vars import MIME_TYPE_TASKS, MIME_TYPES document.addCompletedTask(uv_task.task_path) uv_task.put_next(uv_task.task_path) mime_type = document.query_mime_type() print "\n\n************** %s [INFO] ******************\n" % task_tag print "MIME TYPE: %s" % mime_type if mime_type in MIME_TYPE_TASKS.keys(): if DEBUG: print "mime type (%s) usable..." % mime_type print MIME_TYPE_TASKS[mime_type] uv_task.put_next(MIME_TYPE_TASKS[mime_type]) else: uv_task.fail(status=412, message="document mime type (%s) not important" % mime_type) print "\n\n************** %s [ERROR] ******************\n" % task_tag return inflate = {'doc_id' : document._id} if mime_type == MIME_TYPES['symlink']: inflate['attempt_sync'] = True uv_task.routeNext(inflate=inflate) uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def sanitizeWebUpload(uv_task): task_tag = "SANITIZE WEB UPLOAD" print "\n\n************** %s [START] ******************\n" % task_tag print "image preprocessing at %s" % uv_task.doc_id uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS media = UnveillanceDocument(_id=uv_task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return media.from_web_upload = True media.notarizedSave(['from_web_upload']) # TODO: delete all assets except for j3ms and sigs uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def basicTokenizer(task): task_tag = "NLP ADDRESS PARSER" print "\n\n************** %s [START] ******************\n" % task_tag print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id task.setStatus(412) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS doc = UnveillanceDocument(_id=task.doc_id) if doc is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return txt = None if hasattr(task, "txt_file"): txt = doc.loadFile(task.txt_file) else: import os try: txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name'] txt = doc.loadFile(os.path.join(doc.base_path, txt_path)) except Exception as e: if DEBUG: print e if txt is None: print "TEXT FILE IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return
def audioConvert(task): task_tag = "CONVERTING SOME AUDIO" print "\n\n************** %s [START] ******************\n" % task_tag print "image preprocessing at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS media = UnveillanceDocument(_id=task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return audio = media.getAsset(task.src_file, return_only="path") if audio is None: print "SOURCE FILE IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return from subprocess import Popen cmd = ["ffmpeg", "-y", "-i", audio, "-vn", "-acodec", "mp2", "-ar", "22050", "-f", task.formats[1], audio.replace(".%s" % task.formats[0], ".%s" % task.formats[1])] p = Popen(cmd) p.wait() task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def pullFromAnnex(uv_task): task_tag = "PULL FROM ANNEX" print "\n\n************** %s [START] ******************\n" % task_tag print "pulling file from document %s from annex" % uv_task.doc_id print uv_task.emit() uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG, BASE_DIR, getConfig document = UnveillanceDocument(_id=uv_task.doc_id) if document is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return if not document.getFile(document.file_name): print "NO FILE CONTENT" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return if hasattr(uv_task, "atttempt_sync") and uv_task.attempt_sync: print "SHOULD ATTEMPT SYNC AGAIN." from fabric.api import settings, local with settings(warn_only=True): local("%s %s %s" % (getConfig('python_home'), os.path.join(BASE_DIR, "sync_file.py"), document.file_name)) uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def parse_zipped_j3m(uv_task):
	"""Unpack a gzipped j3m asset into raw JSON and queue the j3m task chain.

	Loads uv_task.j3m_name (default: <base_path>/j3m_raw.gz), un-gzips it if
	needed, verifies the result sniffs as JSON, stores it as "j3m_raw.json",
	then queues j3mify -> massage -> signature/visual-content verification.
	"""
	task_tag = "PARSING ZIPPED J3M"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "parsing zipped j3m asset at %s" % uv_task.doc_id
	uv_task.setStatus(302)

	import os
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	from vars import ASSET_TAGS

	media = UnveillanceDocument(_id=uv_task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	from conf import ANNEX_DIR

	# the task may name the asset explicitly; otherwise fall back to the default
	if hasattr(uv_task, "j3m_name"):
		j3m_name = uv_task.j3m_name
	else:
		j3m_name = os.path.join(media.base_path, "j3m_raw.gz")

	if not media.getFile(j3m_name):
		print "NO J3M.GZ at %s" % j3m_name
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	from cStringIO import StringIO
	from lib.Worker.Utils.funcs import getFileType, unGzipBinary
	from vars import MIME_TYPES

	j3m = media.loadFile(j3m_name)

	# sniff the buffer: only decompress when it actually looks gzipped
	j3m_type = getFileType(j3m, as_buffer=True)
	if j3m_type == MIME_TYPES['gzip']:
		j3m = unGzipBinary(j3m)

	# after optional decompression the payload must sniff as JSON
	if j3m is None or getFileType(j3m, as_buffer=True) != MIME_TYPES['json']:
		print "THIS IS NOT A J3M (type %s)" % j3m_type
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(status=412)
		return

	asset_path = "j3m_raw.json"
	media.addAsset(j3m, asset_path, as_literal=False)

	uv_task.put_next([
		"J3M.j3mify.j3mify",
		"J3M.massage_j3m.massageJ3M",
		"PGP.verify_signature.verifySignature",
		"J3M.verify_visual_content.verifyVisualContent"
	])

	uv_task.routeNext(inflate={'j3m_name' : asset_path})
	uv_task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag
def get_video_hash(uv_task): task_tag = "VIDEO HASHER" print "\n\n************** %s [START] ******************\n" % task_tag print "getting video hash for doc at %s" % uv_task.doc_id uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS video = UnveillanceDocument(_id=uv_task.doc_id) if video is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return if not video.get_video_hash(): print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return video.addCompletedTask(uv_task.task_path) uv_task.routeNext() uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def get_vector(uv_task): task_tag = "IMAGE: GETTING VECTOR" print "\n\n************** %s [START] ******************\n" % task_tag uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from vars import ASSET_TAGS from conf import ANNEX_DIR, DEBUG import os, pypuzzle image = UnveillanceDocument(_id=uv_task.doc_id) hi_res = image.getAssetsByTagName(ASSET_TAGS['HIGH']) if hi_res is None: error_msg = "Could not find the hi-res clone" print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return hi_res = os.path.join(ANNEX_DIR, image.base_path, hi_res[0]['file_name']) puzz = pypuzzle.Puzzle() if DEBUG: print "generate puzzle vector from %s" % hi_res try: cvec = puzz.get_cvec_from_file(hi_res) except Exception as e: error_msg = "Could not get image vector because %s" % e print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return if not image.addAsset(cvec, "image_cvec.json", as_literal=False, tags=[ASSET_TAGS['IMAGE_CVEC']]): error_msg = "could not save cvec asset!" print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return print "\n\n************** %s [END] ******************\n" % task_tag uv_task.finish()
def evaluateFile(task): task_tag = "EVALUATING DOCUMENT (INFORMACAM)" print "\n\n************** %s [START] ******************\n" % task_tag print "image preprocessing at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS document = UnveillanceDocument(_id=task.doc_id) if document is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return if not document.getFile(task.file_name): print "NO FILE CONTENT" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return from lib.Worker.Models.uv_task import UnveillanceTask from lib.Worker.Utils.funcs import getFileType from vars import MIME_TYPE_TASKS from conf import ANNEX_DIR try: mime_type = getFileType(os.path.join(ANNEX_DIR, task.file_name)) new_task = UnveillanceTask(inflate={ 'task_path' : MIME_TYPE_TASKS[mime_type][0], 'doc_id' : document._id, 'file_name' : task.file_name }) document.addCompletedTask(task.task_path) new_task.run() except IndexError as e: print "NO NEXT TASK: %s" % e print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def __init__(self, inflate=None, _id=None, auto_pull=False): emit_sentinels = [ EmitSentinel("config", "dict", None), EmitSentinel("service", "Api", None), EmitSentinel("usable", "bool", None)] TwitterClient.__init__(self) if inflate is not None: if 'screen_name' not in inflate.keys(): return try: lookup = self.lookup_user(screen_name=inflate['screen_name']).AsDict() print lookup except Exception as e: if DEBUG: print "COULD NOT LOOKUP TWIITERER:" print e, type(e) return if 'file_name' not in inflate.keys(): inflate['file_name'] = "%s.json" % inflate['screen_name'] with open(os.path.join(ANNEX_DIR, inflate['file_name']), 'wb+') as F: F.write(json.dumps(lookup)) for i in ['id', 'profile_image_url', 'entities', 'friends_count', 'followers_count', 'listed_count', 'created_at', 'time_zone']: try: inflate[i] = lookup[i] print "ADDING %s: %s" % (i, inflate[i]) except Exception as e: print "COULD NOT GET KEY: %s" % i pass inflate['_id'] = generateMD5Hash(content=inflate['id']) if 'created_at' in inflate.keys(): from time import mktime from dateutil.parser import parse inflate['created_at_ts'] = mktime(parse(inflate['created_at']).timetuple()) UnveillanceDocument.__init__(self, inflate=inflate, _id=_id, emit_sentinels=emit_sentinels) if auto_pull: self.pull_avitar()
def __init__(self, _id=None, inflate=None):
	"""Inflate the document, then re-brand its mime type for email handling."""
	# sentinel fields are stripped from emitted representations
	sentinel_specs = [
		("user_source", "DLTwitterer", "_id"),
		("user_target", "DLTwitterer", "_id"),
		("service", "Api", None),
		("config", "dict", None),
		("usable", "bool", None)]
	emit_sentinels = [EmitSentinel(*spec) for spec in sentinel_specs]

	UnveillanceDocument.__init__(self, _id=_id, inflate=inflate,
		emit_sentinels=emit_sentinels)

	if inflate is not None:
		# remember what the document originally claimed to be
		self.original_mime_type = self.mime_type
		self.mime_type = "foxydoxxing/email"
		self.save()

	TwitterClient.__init__(self)
def register_upload_attempt(_id):
	"""Increment the persisted upload-attempt counter on a document.

	Returns True on success, False when the document could not be loaded
	or its metadata could not be updated.
	"""
	from Utils.funcs import printAsLog
	from lib.Worker.Models.uv_document import UnveillanceDocument

	try:
		doc = UnveillanceDocument(_id=_id)

		# BUGFIX: the increment branch previously did "upload_attempts += 1"
		# on an uninitialized local (UnboundLocalError, silently swallowed
		# below); base the increment on the stored value instead.
		upload_attempts = doc.getFileMetadata('upload_attempts')
		if upload_attempts is None:
			upload_attempts = 1
		else:
			upload_attempts += 1

		doc.set_file_metadata('upload_attempts', upload_attempts)
	except Exception as e:
		printAsLog(e, as_error=True)
		return False

	return True
def extractNEREntities(task):
	"""Run Stanford NER over a document's page texts and store the entities.

	Expects a "doc_texts.json" asset (a list of page strings, possibly with
	None placeholders). On success, saves "stanford-ner_entities.json",
	syncs it as a file, records the task as completed, and routes onward.
	"""
	task_tag = "NER ENTITY EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from json import loads

	try:
		texts = loads(doc.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	import ner, os
	from conf import getConfig
	from lib.Core.Utils.funcs import cleanLine

	# the NER tagger is expected to be running as a local socket service
	st = ner.SocketNER(host='localhost', port=getConfig("nlp_server.port"))
	entities = {}

	for i, page in enumerate(texts):
		# None entries are placeholder pages with no extracted text
		if page is None:
			continue

		lemmas = st.get_entities(cleanLine(page))
		if len(lemmas.keys()) == 0:
			continue

		# fold this page's lemmas into the running entity map, keyed by type
		# (updateEntities is a sibling helper defined elsewhere in this module)
		for lemma_type in lemmas.keys():
			entities = updateEntities(entities, lemmas[lemma_type], lemma_type, i)

		#if DEBUG and i > 25: break

	if len(entities.keys()) > 0:
		ner_entity_path = doc.addAsset(entities, "stanford-ner_entities.json",
			as_literal=False,
			description="Entities as per Stanford-NER Tagger (via NLTK)",
			tags=[ASSET_TAGS['STANFORD_NER_ENTITIES'], ASSET_TAGS['CP_ENTITIES']])

		if ner_entity_path is not None:
			doc.addFile(ner_entity_path, None, sync=True)

	doc.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()
def do_reindex(self, request): print "DOING REINDEX" query = parseRequestEntity(request.query) if query is None: return None if '_id' not in query.keys(): return None document = self.get(_id=query['_id']) if document is None: return None document = UnveillanceDocument(_id=document['_id']) inflate={ 'doc_id' : document._id, 'queue' : UUID } del query['_id'] if 'task_path' not in query.keys() and 'task_queue' not in query.keys(): document.reset() inflate.update({ 'task_path' : "Documents.evaluate_document.evaluateDocument" }) else: inflate.update(query) if 'task_queue' in inflate.keys(): inflate.update({ 'task_path' : inflate['task_queue'][0], 'task_queue' : inflate['task_queue'] }) else: inflate.update({ 'no_continue' : True }) uv_task = UnveillanceTask(inflate=inflate) uv_task.run() return uv_task.emit()
def __init__(self, _id=None, inflate=None):
	"""Inflate as an UnveillanceDocument, tag as a foxydoxxing email, wire up Twitter."""
	# sentinel fields are excluded when the document is emitted
	emit_sentinels = map(lambda spec: EmitSentinel(*spec), [
		("user_source", "DLTwitterer", "_id"),
		("user_target", "DLTwitterer", "_id"),
		("service", "Api", None),
		("config", "dict", None),
		("usable", "bool", None)])

	UnveillanceDocument.__init__(self, _id=_id, inflate=inflate,
		emit_sentinels=emit_sentinels)

	if inflate is not None:
		# preserve the original mime type before overriding it
		self.original_mime_type = self.mime_type
		self.mime_type = "foxydoxxing/email"
		self.save()

	TwitterClient.__init__(self)
def get_image_vector(uv_task): task_tag = "AVI: GETTING IMAGE VECTOR" print "\n\n************** %s [START] ******************\n" % task_tag uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import ANNEX_DIR import pypuzzle image = UnveillanceDocument(_id=uv_task.doc_id) puzz = pypuzzle.Puzzle() try: cvec = puzz.get_cvec_from_file(os.path.join(ANNEX_DIR, image.file_name)) except Exception as e: error_msg = "Could not get image vector because %s" % e print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return from vars import ASSET_TAGS if not image.addAsset(cvec, "image_cvec.json", as_literal=False, tags=[ASSET_TAGS['IMAGE_CVEC']]): error_msg = "could not save cvec asset!" print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return print "\n\n************** %s [END] ******************\n" % task_tag uv_task.finish()
def update_similar_media(uv_task): task_tag = "LOCATING SIMILAR MEDIA" print "\n\n************** %s [START] ******************\n" % task_tag print "similar images for doc at %s" % uv_task.doc_id uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument media = UnveillanceDocument(_id=uv_task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return media.update_similar_media() media.addCompletedTask(uv_task.task_path) uv_task.routeNext() uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def preprocessNLP(task): task_tag = "TEXT NLP PREPROCESSING" print "\n\n************** %s [START] ******************\n" % task_tag print "nlp preprocessing text at %s" % task.doc_id task.setStatus(302) import re from json import loads from lib.Worker.Models.uv_document import UnveillanceDocument from lib.Core.Utils.funcs import cleanAndSplitLine from conf import DEBUG from vars import ASSET_TAGS document = UnveillanceDocument(_id=task.doc_id) if document is None: print "DOC IS NONE" task.fail() return # 1. get all the words (bag of words) try: texts = loads(document.loadAsset("doc_texts.json")) except Exception as e: print "ERROR GETTING DOC-TEXTS: %s" % e print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return word_groups = [cleanAndSplitLine(text) for text in texts if text is not None] word_groups = [wg for wg in word_groups if len(wg) > 0] bag_of_words = sum(word_groups, []) document.addAsset(bag_of_words, "bag_of_words.txt", as_literal=False, description="bag of words", tags=ASSET_TAGS['BOW']) # 2. get keywords, weighted and parsable by gensim once_words = set(word for word in set(bag_of_words) if bag_of_words.count(word) == 1) key_words = [word for word in bag_of_words if word not in once_words] if len(key_words) > 0: document.addAsset(key_words, "key_words_gensim.txt", as_literal=False, description="keywords, as list, and parsable by gensim", tags=ASSET_TAGS['KW']) document.addCompletedTask(task.task_path) task.routeNext() task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def uploadDocument(uv_task): task_tag = "DOCUMENTCLOUD UPLOAD" print "\n\n************** %s [START] ******************\n" % task_tag print "uploading doc %s to DocumentCloud" % uv_task.doc_id uv_task.setStatus(302) from lib.Worker.Models.cp_documentcloud_client import CompassDocumentCloudClient from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG document = UnveillanceDocument(_id=uv_task.doc_id) if document is None: print "\n\n************** %s [ERROR] ******************\n" % task_tag print "Document is None" uv_task.fail() return if not hasattr(uv_task, "auth_string"): print "\n\n************** %s [ERROR] ******************\n" % task_tag print "DocumentCloud upload needs an auth string" uv_task.fail() return dc_client = CompassDocumentCloudClient(auth_string=uv_task.auth_string) upload = dc_client.upload(document) if DEBUG: print upload if upload is None: print "\n\n************** %s [ERROR] ******************\n" % task_tag print "DocumentCloud upload needs an auth string" uv_task.fail() return document.dc_id = upload['id'] document.save() document.addCompletedTask(uv_task.task_path) uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag print "Uploaded document %s" % document._id
def decrypt(uv_task):
	"""Decrypt a PGP blob attached to a document using the configured secret key.

	Reads the passphrase and keyring location via getSecrets(), shells out
	to gpg, saves the plaintext as <pgp_file>.decrypted (or uv_task.save_as),
	and — when the task has no explicit successor — queues follow-on tasks
	based on the decrypted file's mime type.
	"""
	task_tag = "DECRYPTING"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "decrypting pgp blob for %s" % uv_task.doc_id
	uv_task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	media = UnveillanceDocument(_id=uv_task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	if not media.getFile(uv_task.pgp_file):
		print "NO PGP FILE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	from conf import getSecrets

	gpg_pwd = getSecrets("gpg_pwd")
	if gpg_pwd is None:
		err_msg = "NO PASSPHRASE TO DECRYPT"
		print err_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(message=err_msg)
		return

	gpg_dir = getSecrets("gpg_dir")

	# save as task.pgp_file.decrypted or whatever
	import os
	from fabric.api import local, settings
	from fabric.context_managers import hide
	from conf import ANNEX_DIR, DEBUG

	if not hasattr(uv_task, "save_as"):
		save_as = "%s.decrypted" % uv_task.pgp_file
	else:
		save_as = uv_task.save_as

	print "\n\n************** %s [INFO] ******************\n" % task_tag
	print "SAVING DECRYPTED ASSET TO %s IF SUCCESSFUL" % save_as

	# NOTE(review): the passphrase is interpolated into the shell command
	# line, where it is visible to anything that can read the process
	# list — confirm this is acceptable for this deployment.
	with settings(hide("everything"), warn_only=True):
		d_cmd = "gpg --yes --no-tty --homedir=%s --passphrase %s --output %s --decrypt %s" % (
			gpg_dir, gpg_pwd,
			os.path.join(ANNEX_DIR, save_as),
			os.path.join(ANNEX_DIR, uv_task.pgp_file),
		)
		decrypted = local(d_cmd)
		print decrypted.return_code

	# drop the passphrase from scope as soon as gpg has run
	del gpg_pwd

	# NOTE(review): only gpg exit code 2 is treated as failure; any other
	# non-zero code falls through as success — confirm that is intended.
	if decrypted.return_code == 2:
		err_msg = "could not successfully decrypt %s" % uv_task.pgp_file
		print err_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(status=412, message=err_msg)
		return

	media.addCompletedTask(uv_task.task_path)

	if uv_task.get_next() is None:
		# route according to mime type
		# get mime type of decrypted
		from vars import MIME_TYPE_TASKS
		from lib.Worker.Utils.funcs import getFileType

		mime_type = getFileType(os.path.join(ANNEX_DIR, save_as))

		# usable: json (a j3m), zip (a source or a log->batch)
		if mime_type in MIME_TYPE_TASKS.keys():
			print "mime type (%s) usable..." % mime_type
			try:
				uv_task.put_next(MIME_TYPE_TASKS[mime_type])
			except Exception as e:
				print e

	uv_task.routeNext(inflate={"file_name": save_as})
	uv_task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag
def j3mify(uv_task):
	"""Split a raw j3m asset into its signature and j3m payload assets.

	Loads uv_task.j3m_name from the document, json-decodes it (twice, if the
	payload itself arrived as a JSON-encoded string), stores the signature
	as "j3m.sig" and the body as "j3m.json", then re-routes with the new
	asset name attached to the task.
	"""
	task_tag = "J3MIFYING"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "j3mifying asset at %s" % uv_task.doc_id
	uv_task.setStatus(302)

	import os
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	from vars import ASSET_TAGS

	media = UnveillanceDocument(_id=uv_task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	j3m = media.loadAsset(uv_task.j3m_name)
	if j3m is None:
		error_message = "J3M IS NONE"
		print error_message
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(message=error_message)
		return

	import json
	print "JSSON HERE:"

	# first decode attempt; failures are tolerated because the payload may
	# already be decoded, or may need the second pass below
	try:
		print type(j3m)
		j3m = json.loads(j3m)
	except Exception as e:
		print "\n\n************** J3MIFYING [WARN] ******************\n"
		print e
		print "json load once fail. trying again"
		print j3m

	# some uploads arrive double-encoded: the first loads() yields a string
	# rather than a dict, in which case decode once more
	if type(j3m) in [str, unicode]:
		try:
			j3m = json.loads(j3m)
		except Exception as e:
			print "\n\n************** J3MIFYING [WARN] ******************\n"
			print e
			print "json loads twice fail."
			print type(j3m)

	try:
		j3m_sig = j3m['signature']
	except Exception as e:
		print "NO SIGNATURE TO EXTRACT"
		print "\n\n************** J3MIFYING [ERROR] ******************\n"
		uv_task.fail(status=412, message="No Signature in J3M.")
		return

	# persist the signature and the j3m body as separate assets; the body
	# is additionally synced as a file
	media.addAsset(j3m_sig, "j3m.sig", tags=[ASSET_TAGS['SIG']],
		description="The j3m's signature")

	media.addFile(
		media.addAsset(j3m['j3m'], "j3m.json", tags=[ASSET_TAGS['J3M']],
			description="The j3m itself.", as_literal=False),
		None, sync=True)

	media.addCompletedTask(uv_task.task_path)

	# downstream tasks read the decoded body under its new asset name
	uv_task.j3m_name = "j3m.json"
	uv_task.save()
	uv_task.routeNext()
	uv_task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag
def compare_avis(uv_task): task_tag = "CLUSTER: COMPARING 2 AVIS" print "\n\n************** %s [START] ******************\n" % task_tag uv_task.setStatus(302) if not hasattr(uv_task, 'avis') or len(uv_task.avis != 2): error_msg = "Cannot compare anything." print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg, status=412) return from lib.Worker.Models.uv_document import UnveillanceDocument try: avis = map(lambda a: UnveillanceDocument(_id=a), uv_task.avis) except Exception as e: error_msg = "could not load up avis as UnveillanceDocuments: %s" % e print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg, status=412) return from conf import ANNEX_DIR from vars import ASSET_TAGS from json import loads import pypuzzle puzz = pypuzzle.Puzzle() try: compare_avi = puzz.get_distance_from_cvec( *(map(lambda a: loads(a.loadAsset("image_cvec.json")), avis))) except Exception as e: error_msg = "could not get one or more image vectors because %s" % e print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg, status=412) return if type(compare_avi) not in [int, float]: error_msg = "non-numerical result for comparaison." print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg, status=412) return c_map = { 'avis': map(lambda a: { 'file_name': a.file_name, '_id': a._id }, avis), 'compared': compare_avi } if not uv_task.addAsset(c_map, "compare_avi_output.json", as_literal=False, tags=[ASSET_TAGS['C_RES']]): error_msg = "could not save result asset to this task." print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return print "\n\n************** %s [END] ******************\n" % task_tag uv_task.finish()
def createGensimObjects(task):
	"""Extract LSI topics from a document's page texts via gensim.

	Loads "doc_texts.json", builds a bag-of-words corpus against a
	pre-trained wikipedia dictionary, applies log-entropy and tf-idf
	transformations (training and caching those models on first use), fits
	an LSI model, and stores topics plus per-document topic weights as a
	"<file_name>_topics.json" asset.
	"""
	task_tag = "GENSIM TOPIC EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "USING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from json import loads

	try:
		texts = loads(doc.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	if len(texts) == 0:
		print "THERE ARE NO TEXTS HERE ANYWAY!"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	import logging, os, bz2
	from json import loads
	from gensim import corpora
	from lib.Core.Utils.funcs import cleanLine
	from conf import getConfig, ANNEX_DIR

	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

	# the wiki-trained dictionary and corpus must be prepared out-of-band
	# under compass.gensim.training_data
	try:
		wiki_dictionary = corpora.Dictionary.load_from_text(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_wordids.txt'))
		wiki_corpus = corpora.MmCorpus(bz2.BZ2File(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.mm.bz2')))
	except Exception as e:
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)
		print error_msg
		print e
		task.fail(message=error_msg)
		return

	from gensim import models

	# log-entropy model: load the cached one, or train and cache it now
	wiki_log_entropy_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_log_entropy.model')
	if not os.path.exists(wiki_log_entropy_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared log entropy model. going to generate this here, now. might take a minute..."
		logent_transformation = models.LogEntropyModel(wiki_corpus, id2word=wiki_dictionary)
		logent_transformation.save(wiki_log_entropy_file)
	else:
		logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)

	tokenize_function = corpora.wikicorpus.tokenize

	# NOTE(review): unlike the NER task, None pages are not skipped here, so
	# cleanLine(page) may receive None — confirm cleanLine tolerates that.
	doc_corpus = [wiki_dictionary.doc2bow(tokenize_function(cleanLine(page).lower())) for page in texts]
	doc_corpus = logent_transformation[doc_corpus]

	# tf-idf model: same load-or-train-and-cache approach
	wiki_tfidf_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.tfidf_model')
	if not os.path.exists(wiki_tfidf_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared tfidf model. going to generate this here, now. might take a minute..."
		wiki_tfidf = models.TfidfModel(wiki_corpus)
		wiki_tfidf.save(wiki_tfidf_file)
	else:
		wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)

	doc_tfidf = wiki_tfidf[doc_corpus]

	num_topics = 35
	lsi = models.LsiModel(corpus=doc_tfidf, id2word=wiki_dictionary, num_topics=num_topics)

	# parse gensim's "weight*term + ..." topic strings into [weight, term] pairs
	topics = []
	t_lambda = lambda x : [float(x[0]), x[1]]
	for t_group in [t.split("+") for t in [str(topic) for topic in lsi.print_topics(num_topics)]]:
		topics.append([t_lambda(t.strip().replace('\"','').split("*")) for t in t_group])

	lsi_topics = {
		"topics" : topics,
		"doc_comprehension" : []
	}

	doc_lsi = lsi[doc_tfidf]

	# per-document topic weights
	for d in doc_lsi:
		lsi_topics['doc_comprehension'].append(d)

	topic_path = doc.addAsset(lsi_topics, "%s_topics.json" % doc.file_name,
		as_literal=False, description="Gensim Topics dump (from LSI Model)",
		tags=[ASSET_TAGS["GM_TOPICS"]])

	doc.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()
def evaluate_JSON_media(uv_task): task_tag = "EVALUATE JSON MEDIA" print "\n\n************** %s [START] ******************\n" % task_tag uv_task.setStatus(302) import os from json import loads from lib.Worker.Models.uv_document import UnveillanceDocument from conf import ANNEX_DIR, DEBUG doc = UnveillanceDocument(_id=uv_task.doc_id) try: if DEBUG: print doc.emit() except Exception as e: print e content = None try: content = loads(doc.loadFile(doc.file_name)) except Exception as e: error_msg = "could not load content at all: %s" % e print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return # match keys to object mention_set = ["mimeType", "headers", "parts", "body", "filename"] if len(set(content.keys()).intersection(mention_set)) == len( content.keys()): from lib.Worker.Models.dl_FD_mention import FoxyDoxxingMention doc = FoxyDoxxingMention(inflate=doc.emit()) else: error_msg = "document not really usable." print error_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(message=error_msg) return doc.addCompletedTask(uv_task.task_path) from vars import MIME_TYPE_TASKS uv_task.task_queue = [uv_task.task_path] try: uv_task.task_queue.extend(MIME_TYPE_TASKS[doc.mime_type]) uv_task.routeNext() except Exception as e: error_msg = "cannot get task queue for mime type %s: %s" % ( doc.mime_type, e) print error_msg print "\n\n************** %s [WARN] ******************\n" % task_tag print "\n\n************** %s [END] ******************\n" % task_tag uv_task.finish()
def addressParser(task):
    """Extract street addresses from a document's text with EveryBlock's
    (ebcode) address regex and store the matches as an "addresses.json" asset.
    """
    task_tag = "NLP ADDRESS PARSER"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "EXTRACTING ADDRESSES FROM TEXT DOCUMENT at %s" % task.doc_id
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    # prefer an explicitly-supplied text file; otherwise fall back to the
    # TXT_JSON asset generated earlier in the pipeline
    txt = None
    if hasattr(task, "txt_file"):
        txt = doc.loadFile(task.txt_file)
    else:
        import os
        try:
            txt_path = doc.getAssetsByTagName(ASSET_TAGS["TXT_JSON"])[0]["file_name"]
            txt = doc.loadFile(os.path.join(doc.base_path, txt_path))
        except Exception as e:
            if DEBUG:
                print e

    if txt is None:
        print "TEXT FILE IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import re

    # script from https://code.google.com/p/ebcode/ -> ebdata.tar.gz -> ebdata/nlp/addresses.py
    # Regex notes:
    #   * This is *not* a case-insensitive regex, because we assume
    #     capitalized words are special (street names).
    #   * All data matched by capturing parentheses is concatenated together, so
    #     if you don't want to include something in the resulting string, don't
    #     capture it.

    # STREET_NAME is a fragment of a regular expression that is used in several
    # places in our "real" regular expression (ADDRESSES_RE) below. The one tricky
    # thing about it is that it includes a "CAPTURE_START" placeholder instead of
    # a capturing opening parenthesis. This lets us create two versions of the
    # regex -- STREET_NAME_CAPTURE and STREET_NAME_NOCAPTURE.
    STREET_NAME = r"""
    # Here, we define some common false positives and tell the regex to ignore them.
    (?!
        [Aa][Ss][Ss][Oo][Cc][Ii][Aa][Tt][Ee][Dd]\ [Pp][Rr][Ee][Ss][Ss] # associated press
        |
        [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\ [Oo][Ff] # university of
    )
    # DIRECTION
    %(CAPTURE_START)s
        (?:
            [NSEWnsew]\.?
            |
            (?:
                [Nn][Oo][Rr][Tt][Hh]
                |
                [Ss][Oo][Uu][Tt][Hh]
                |
                [Ee][Aa][Ss][Tt]
                |
                [Ww][Ee][Ss][Tt]
                |
                [Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt]
                |
                [Ee][Aa][Ss][Tt][Ww][Ee][Ss][Tt]
                |
                [Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt]
                |
                [Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]
            )
            |
            (?:
                N\.?W | S\.?W | N\.?E | S\.?E
            )\.?
        )
        \ + # space (but not newline)
    )?
    (?:
        # STREET NAME
        %(CAPTURE_START)s
            # Numbered street names with a suffix ("3rd", "4th").
            \d+(?:st|ST|nd|ND|rd|RD|th|TH|d|D)
            |
            # Or, numbered street names without a suffix ("3", "4")
            # but with a street type.
            \d+
            (?=
                \ +
                (?:Ave|Avenue|Blvd|Boulevard|Bvd|Cir|Circle|Court|Ct|Dr|Drive|
                Lane|Ln|Parkway|Pkwy|Place|Plaza|Pl|Plz|Point|Pt|Pts|Rd|Rte|
                Sq|Sqs|Street|Streets|St|Sts|Terrace|Ter|Terr|Trl|Way|Wy
                )
                \b
            )
            |
            # Or, street names that don't start with numbers.
            (?:
                # Optional prefixes --
                # "St", as in "St Louis"
                # "Dr. Martin", as in "Dr. Martin Luther King"
                (?:
                    [Ss][Tt]\.?
                    |
                    [Dd][Rr]\.?\ [Mm][Aa][Rr][Tt][Ii][Nn]
                )
                \ +
            )?
            (?:
                Mass\.(?=\ +[Aa]ve) # Special case: "Mass." abbr. for "Massachussetts Ave."
                                    # Needs to be special-cased because of the period.
                |
                (?:Avenue|Ave\.?)\ +[A-Z] # Special case: "Avenue X"
                |
                [A-Z][a-z][A-Za-z]* # One initial-capped word
                |
                [A-Z]\b # Single-letter street name (e.g., K St. in DC)
                (?!\.\w) # Avoid '20 U.S.A.'
            )
        )
        (?:
            # Here, we list the options with street suffixes first, so that
            # the suffix abbreviations are treated as the last part of the
            # street name, to avoid overeagerly capturing "123 Main St. The".
            %(CAPTURE_START)s
                \ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                |
                \ +[A-Z][a-z][A-Za-z]*\ (?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                |
                (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){2}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                |
                (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){3}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                |
                (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){4}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                |
                (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){5}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                |
                (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){1,5}
            )?
            # OPTIONAL POST-DIR
            (?:
                # Standard post-dir format
                %(CAPTURE_START)s
                    ,?\s(?:N\.?E|S\.?E|N\.?W|S\.?W|N|S|E|W)\.?
                )
                # Avoid greedily capturing more letters, like
                # '123 Main St, New England' to '123 Main St, N'
                (?![A-Za-z])
                |
                # Or, a special-case for DC quadrants, to find stuff like:
                # "600 H Street in NE Washington"
                # "600 H Street in the NE quadrant"
                # "600 H Street in northeast DC"
                # Note that this is NOT captured, so that it's excluded from
                # the final output.
                ,?
                \s
                in
                %(CAPTURE_START)s
                    \s
                )
                (?:
                    (?:the|far)
                    \s
                )?
                %(CAPTURE_START)s
                    (?:NE|SE|NW|SW|[Nn]ortheast|[Ss]outheast|[Nn]orthwest|[Ss]outhwest)
                    (?=
                        \s
                        (?:quadrant|D\.?C\.?|Washington)
                    )
                )
            )?
        )?
    )
    """
    STREET_NAME_CAPTURE = STREET_NAME % {"CAPTURE_START": "("}
    STREET_NAME_NOCAPTURE = STREET_NAME % {"CAPTURE_START": "(?:"}

    ADDRESSES_RE = re.compile(r"""(?x)
        (?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection.
        \b
        # Ignore things that look like dates -- e.g., "21 May 2009".
        # This is a problem e.g. in cases where there's a May Street.
        (?!
            \d+\s+
            (?:January|February|March|April|May|June|July|August|September|October|November|December)
            ,?\s+
            \d\d\d\d
        )
        # Ignore intersections that are prefixed by "University of", like
        # "University of Texas at Austin". This is a common false positive.
        (?<!
            [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s
        )
        (?:
            # SEGMENT ("FOO BETWEEN BAR AND BAZ")
            (?:
                %(STREET_NAME_CAPTURE)s (,?\ + between \ +) %(STREET_NAME_CAPTURE)s (,?\ + and \ +) %(STREET_NAME_CAPTURE)s
                |
                %(STREET_NAME_CAPTURE)s (,?\ + from \ +) %(STREET_NAME_CAPTURE)s (,?\ + to \ +) %(STREET_NAME_CAPTURE)s
            )
            |
            # BLOCK/ADDRESS
            (?:
                (
                    (?:
                        (?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ]
                        (?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )?
                        [Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff]
                        |
                        \d+\ *-\ *\d+
                        |
                        \d+
                    )
                    \ +
                )
                %(STREET_NAME_CAPTURE)s
                # ignore the intersection in parenthesis so that it's not picked
                # up as a separate location. We do this by consuming the string
                # but *not* capturing it.
                (?:
                    \ +
                    \(?
                    between
                    \ +
                    %(STREET_NAME_NOCAPTURE)s
                    \ +
                    and
                    \ +
                    %(STREET_NAME_NOCAPTURE)s
                    \)?
                )?
            )
            |
            # INTERSECTION
            (?:
                # Common intersection prefixes. They're included here so that the
                # regex doesn't include them as part of the street name.
                (?:
                    (?:
                        [Nn]ear
                        | [Aa]t
                        | [Oo]n
                        | [Tt]o
                        | [Aa]round
                        | [Ii]ntersection\ of
                        | [Cc]orner\ of
                        | [Aa]rea\ of
                        | [Aa]reas?\ surrounding
                        | vicinity\ of
                        | ran\ down
                        | running\ down
                        | crossed
                    )
                    \ +
                )?
                \b
                (?:%(STREET_NAME_CAPTURE)s)
                (\ +)
                (
                    (?:
                        [Aa][Nn][Dd]
                        | [Aa][Tt]
                        | [Nn][Ee][Aa][Rr]
                        | &
                        | [Aa][Rr][Oo][Uu][Nn][Dd]
                        | [Tt][Oo][Ww][Aa][Rr][Dd][Ss]?
                        | [Oo][Ff][Ff]
                        | (?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff]
                        | (?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt]
                    )
                    \ +
                )
                (?:%(STREET_NAME_CAPTURE)s)
            )
        )
        # OPTIONAL CITY SUFFIX
        (?:
            (?:
                ,?\s+in
                |
                ,
            )
            \s+
            # CITY NAME
            (
                [A-Z][a-z][A-Za-z]* # One initial-capped word
                (?:
                    ,?\ Jr\.?,?
                    |
                    \ [A-Z][a-z][A-Za-z]*
                    |
                    -[A-Za-z]+ # Hyphenated words (e.g. "Croton-on-Hudson" in NY)
                ){0,4} # Initial-capped words
            )
        )?
        """ % {"STREET_NAME_CAPTURE": STREET_NAME_CAPTURE, "STREET_NAME_NOCAPTURE": STREET_NAME_NOCAPTURE}
    )

    # NOTE(review): parse_addresses is not defined or imported in this view --
    # presumably provided elsewhere in this module; confirm.
    addresses = parse_addresses(txt, ADDRESSES_RE)
    if addresses is None:
        print "COULD NOT EXTRACT ADDRESSES."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    asset_path = doc.addAsset(
        addresses,
        "addresses.json",
        as_literal=False,
        description="addresses output from Everyblock address extractor",
        tags=[ASSET_TAGS["ADDRESSES_NLP"], ASSET_TAGS["CP_ENTITIES"]],
    )

    if asset_path is None or not doc.addFile(asset_path, None, sync=True):
        print "COULD NOT SAVE ASSET."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
def evaluateText(task): task_tag = "TEXT EVALUATION" print "\n\n************** %s [START] ******************\n" % task_tag print "evaluating text at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import MIME_TYPE_TASKS document = UnveillanceDocument(_id=task.doc_id) """ limited choices: json, pgp, or txt """ if hasattr(task, "text_file"): content = document.loadAsset(task.text_file) else: content = document.loadFile(document.file_name) if content is None: print "no text to evaluate :(" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return new_mime_type = None import json try: json_txt = json.loads(content) new_mime_type = "application/json" print "THIS IS JSON" except Exception as e: print "NOT JSON: %s" % e task_path = None if new_mime_type is not None: document.mime_type = new_mime_type document.save() if document.mime_type in MIME_TYPE_TASKS.keys(): task_path = MIME_TYPE_TASKS[document.mime_type][0] else: try: from lib.Core.Utils.funcs import cleanLine from vars import ASSET_TAGS txt_json = [] txt_pages = [] line_count = 0 # this is arbitrary MAX_LINES_PER_PAGE = 80 for line in content.splitlines(): txt_pages.append(cleanLine(line)) line_count += 1 if line_count == MAX_LINES_PER_PAGE: txt_json.append(" ".join(txt_pages)) txt_pages = [] line_count = 0 txt_json.append(" ".join(txt_pages)) document.total_pages = len(txt_json) document.save() asset_path = document.addAsset(txt_json, "doc_texts.json", as_literal=False, description="jsonified text of original document, segment by segment", tags=[ASSET_TAGS['TXT_JSON']]) from lib.Worker.Models.uv_text import UnveillanceText uv_text = UnveillanceText(inflate={ 'media_id' : document._id, 'searchable_text' : txt_json, 'file_name' : asset_path }) document.text_id = uv_text._id document.save() except Exception as e: if DEBUG: print "ERROR HERE GENERATING DOC TEXTS:" print e 
document.addCompletedTask(task.task_path) task.finish() task.routeNext() print "\n\n************** %s [END] ******************\n" % task_tag
def massageJ3M(task): task_tag = "MASSAGING J3M" print "\n\n************** %s [START] ******************\n" % task_tag print "massaging j3m at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS media = UnveillanceDocument(_id=task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return if hasattr(task, "j3m_name"): j3m_name = task.j3m_name else: j3m_name = "j3m.json" j3m = media.loadAsset(j3m_name) if j3m is None: print "J3M IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return from json import loads try: j3m = loads(j3m) except Exception as e: print "J3M IS INVALID" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail(status=412) return try: media.date_created = j3m['genealogy']['dateCreated'] media.saveFields("date_created") except KeyError as e: print "J3M HAS NO DATE CREATED: %s" % e print "\n\n************** %s [WARN] ******************\n" % task_tag from hashlib import sha1 try: j3m['public_hash'] = sha1("".join( [j3m['genealogy']['createdOnDevice'], "".join(j3m['genealogy']['hashes'])])).hexdigest() except KeyError as e: if DEBUG: print "no key %s" % e pass if 'data' in j3m.keys(): try: location = j3m['data']['exif']['location'] j3m['data']['exif'].update({ 'location' : [location[1], location[0]] }) except KeyError as e: if DEBUG: print "no key %s" % e pass try: if type(j3m['data']['sensorCapture']) is list: pass except KeyError as e: if DEBUG: print "no key %s" % e pass if 'sensorCapture' in j3m['data'].keys(): for playback in j3m['data']['sensorCapture']: if 'gps_coords' in playback['sensorPlayback'].keys(): try: gps = str(playback['sensorPlayback']['gps_coords'])[1:-1].split(",") if DEBUG: print "REPLACING %s as geopoint" % gps print type(gps) playback['sensorPlayback'].update({ 'gps_coords' : [float(gps[1]), 
float(gps[0])] }) except Exception as e: if DEBUG: print e pass if 'regionLocationData' in playback['sensorPlayback'].keys(): try: gps = str(playback['sensorPlayback']['regionLocationData']['gps_coords']) gps = gps[1:-1].split(",") if DEBUG: print "REPLACING %s as geopoint" % gps playback['sensorPlayback']['regionLocationData'].update({ 'gps_coords' : [float(gps[1]), float(gps[0])] }) except Exception as e: if DEBUG: print e pass if 'visibleWifiNetworks' in playback['sensorPlayback'].keys(): try: for i,b in enumerate(playback['sensorPlayback']['visibleWifiNetworks']): playback['sensorPlayback']['visibleWifiNetworks'][i].update({ 'bt_hash' : sha1(b['bssid']).hexdigest() }) except Exception as e: if DEBUG: print e pass import os, json from conf import getConfig from lib.Core.Utils.funcs import b64decode from lib.Worker.Utils.funcs import getFileType, unGzipBinary searchable_text = [] if 'userAppendedData' in j3m['data'].keys(): try: with open(os.path.join(getConfig('informacam.forms_root'), "forms.json"), 'rb') as F: form_data = json.loads(F.read())['forms'] for udata in j3m['data']['userAppendedData']: for aForms in udata['associatedForms']: st_keys = aForms['answerData'].keys() for f in form_data: if f['namespace'] == aForms['namespace']: try: for mapping in f['mapping']: try: group = mapping.keys()[0] key = aForms['answerData'][group].split(" ") for m in mapping[group]: if m.keys()[0] in key: key[key.index(m.keys()[0])] = m[m.keys()[0]] aForms['answerData'][group] = " ".join(key) except KeyError as e: if DEBUG: print "no key %s" % e pass except KeyError as e: if DEBUG: print "no key %s" % e pass try: idx = 0 for audio in f['audio_form_data']: try: while audio in st_keys: st_keys.remove(audio) except Exception as e: pass try: audio_data = b64decode( aForms['answerData'][audio]) if audio_data is None: if DEBUG: print "could not unb64 audio" continue if getFileType(audio_data, as_buffer=True) != MIME_TYPES['gzip']: if DEBUG: print "audio is not gzipped" continue 
audio_f = "audio_%d.3gp" % idx idx += 1 media.addAsset(unGzipBinary(audio_data), audio_f, tags=[ASSET_TAGS['A_3GP']], description="3gp audio file from form") ''' new_task=UnveillanceTask(inflate={ 'task_path' : "Media.convert.audioConvert", 'doc_id' : media._id, 'formats' : ["3gp", "wav"], 'src_file' : "audio_%d.3gp" % idx, 'queue' : task.queue }) new_task.run() ''' aForms['answerData'][audio] = "audio_%d.wav" except KeyError as e: if DEBUG: print "no key %s" % e pass except KeyError as e: if DEBUG: print "no key %s" % e pass if len(st_keys) > 0: for key in st_keys: searchable_text.append(aForms['answerData'][key]) except KeyError as e: if DEBUG: print "no key %s" % e pass except IOError as e: print "\n\n************** %s [WARN] ******************\n" % task_tag if DEBUG: print "no forms to go over: %s" % e except ValueError as e: print "\n\n************** %s [WARN] ******************\n" % task_tag if DEBUG: print "for some reason, forms.json is not legible?\n%s" % e if media.addAsset(j3m, "j3m.json", as_literal=False) is False: print "J3M COULD NOT BE ADDED" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return from lib.Worker.Models.ic_j3m import InformaCamJ3M j3m['media_id'] = media._id if len(searchable_text) > 0: j3m['searchable_text'] = searchable_text j3m = InformaCamJ3M(inflate=j3m) print "\n\n***NEW J3M CREATED***\n\n" j3m.save() media.j3m_id = j3m._id media.save() media.addCompletedTask(task.task_path) task.routeNext() task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def getAssets(uv_task): task_tag = "FETCHING DOCUMENTCLOUD ASSETS" print "\n\n************** %s [START] ******************\n" % task_tag print "getting DocumentCloud assets for %s" % uv_task.doc_id uv_task.setStatus(412) from lib.Worker.Models.cp_documentcloud_client import CompassDocumentCloudClient from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG document = UnveillanceDocument(_id=uv_task.doc_id) if document is None: print "\n\n************** %s [ERROR] ******************\n" % task_tag print "Document is None" return if not hasattr(document, "dc_id"): print "\n\n************** %s [ERROR] ******************\n" % task_tag print "Document has not document cloud id!" return if not hasattr(uv_task, "auth_string"): print "\n\n************** %s [ERROR] ******************\n" % task_tag print "DocumentCloud upload needs an auth string" return dc_client = CompassDocumentCloudClient(auth_string=uv_task.auth_string) dc_manifest = dc_client.download("documents/%s.json" % document.dc_id) if dc_manifest is None: print "\n\n************** %s [ERROR] ******************\n" % task_tag print "No DocumentCloud manifest yet for %s." % document._id return document.addAsset(dc_manifest, "document_cloud_manifest.json", as_literal=False, description="description of document on DocumentCloud", tags=[ASSET_TAGS['DOC_CLOUD_MANIFEST'], ASSET_TAGS['DOC_CLOUD_DOC']]) dc_entities = dc_client.download("documents/%s/entities.json" % document.dc_id) if dc_entities is None: print "\n\n************** %s [WARN] ******************\n" % task_tag print "No DocumentCloud entiteis yet for %s." 
% document._id else: entity_asset = document.addAsset(dc_entities, "document_cloud_entities.json", as_literal=False, description="entites pulled from DocumentCloud", tags=[ASSET_TAGS['DOC_CLOUD_ENTITIES'], ASSET_TAGS['DOC_CLOUD_DOC']]) from lib.Worker.Models.uv_text import UnveillanceText if not hasattr(document, "text_id"): text = UnveillanceText(inflate={ 'file_name' : entity_asset, 'entities' : dc_entities['entities'], 'media_id' : document._id }) document.text_id = text._id document.save() else: text = UnveillanceText(_id=document.text_id) text.entities = dc_entities['entities'] text.save() document.addCompletedTask(uv_task.task_path) uv_task.finish()
def basicTokenizer(task): task_tag = "NLP TOKENIZER" print "\n\n************** %s [START] ******************\n" % task_tag print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id task.setStatus(412) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS doc = UnveillanceDocument(_id=task.doc_id) if doc is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return txt = None from json import loads if hasattr(task, "txt_file"): txt = loads(doc.loadFile(task.txt_file)) else: import os try: txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name'] txt = loads(doc.loadFile(os.path.join(doc.base_path, txt_path))) except Exception as e: if DEBUG: print e if txt is None: print "TEXT FILE IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return from lib.Worker.Models.cp_nlp_server import CompassNLPServer nlp_server = CompassNLPServer() tokenized = nlp_server.sendNLPRequest({ 'method' : 'tokenize', 'txt' : txt }) if tokenized is None: print "COULD NOT TOKENIZE." print "\n\n************** %s [ERROR] ******************\n" % task_tag return if DEBUG: print "here is res" print type(tokenized) asset_path = doc.addAsset(tokenized, "core_nlp_tokenized.json", as_literal=False, description="tokenized output from Stanford Core NLP", tags=[ASSET_TAGS['TOKENS_NLP']]) if asset_path is None or not doc.addFile(asset_path, None, sync=True): print "COULD NOT SAVE ASSET." print "\n\n************** %s [ERROR] ******************\n" % task_tag return doc.addCompletedTask(task.task_path) task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def verifySignature(task): task_tag = "VERIFYING SIGNATURE" print "\n\n************** %s [START] ******************\n" % task_tag print "image preprocessing at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS media = UnveillanceDocument(_id=task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return sig = media.getAsset("j3m.sig", return_only="path") j3m = media.getAsset("j3m.json", return_only="path") if DEBUG: print "j3m path: %s, sig path: %s" % (j3m, sig) if sig is None or j3m is None: err_msg = "NO SIGNATURE or J3M" print err_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail(message=err_msg) return import gnupg from conf import getConfig try: gpg = gnupg.GPG(homedir=getConfig('gpg_homedir')) except Exception as e: print "ERROR INITING GPG" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return media.j3m_verified = False verified = gpg.verify_file(j3m, sig_file=sig) if DEBUG: print "verified fingerprint: %s" % verified.fingerprint if verified.fingerprint is not None: from json import loads supplied_fingerprint = str(loads( media.loadAsset("j3m.json"))['genealogy']['createdOnDevice']) if verified.fingerprint.upper() == supplied_fingerprint.upper(): if DEBUG: print "SIGNATURE VALID for %s" % verified.fingerprint.upper() media.j3m_verified = True media.save() media.addCompletedTask(task.task_path) task.routeNext() task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def evaluateTextFile(task): task_tag = "EVALUATING TEXT FILE" print "\n\n************** %s [START] ******************\n" % task_tag print "evaluating text file at %s" % task.doc_id task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS media = UnveillanceDocument(_id=task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return if not media.queryFile(media.file_name): print "NO DOCUMENT CONTENT" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return content = media.loadFile(media.file_name) if content is None: print "NO DOCUMENT CONTENT" print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail() return from lib.Core.Utils.funcs import b64decode un_b64 = b64decode(content) # We have removed base 64-ing from the log files... if un_b64 is None: un_b64 = content if un_b64 is not None: from lib.Worker.Utils.funcs import getFileType from vars import MIME_TYPES, MIME_TYPE_MAP un_b64_mime_type = getFileType(un_b64, as_buffer=True) if DEBUG: print "MIME TYPE: %s" % un_b64_mime_type if un_b64_mime_type not in [MIME_TYPES['pgp'], MIME_TYPES['wildcard']]: err_msg = "MIME TYPE NOT USABLE" print err_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag task.fail(status=412, message=err_msg) return media.addAsset(un_b64, "%s.pgp" % media.file_name, description="un-b64'ed pgp asset") media.addCompletedTask(task.task_path) message_sentinel = "-----BEGIN PGP MESSAGE-----" if un_b64[0:len(message_sentinel)] == message_sentinel: task.put_next("PGP.decrypt.decrypt") task.routeNext(inflate={ 'pgp_file' : ".data/%s/%s.pgp" % (media._id, media.file_name) }) task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def unzipAndEvaluateArchive(uv_task): task_tag = "UNZIPPING FILE" print "\n\n************** %s [START] ******************\n" % task_tag print "unzipping and evaluating %s" % uv_task.doc_id uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS media = UnveillanceDocument(_id=uv_task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return if hasattr(uv_task, "file_name"): zip = uv_task.file_name else: zip = media.file_name if DEBUG: print "Zip file here: %s" % zip if zip is None or not media.getFile(zip): print "THERE IS NO ZIP HERE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return import os from time import sleep from fabric.api import * from fabric.context_managers import hide from conf import ANNEX_DIR with settings(warn_only=True): this_dir = os.getcwd() os.chdir(ANNEX_DIR) local("unzip -o %s -d %s" % (zip, media.base_path)) sleep(2) try: unzipped_files = local("ls %s" % media.base_path, capture=True).splitlines() except Exception as e: print e err_msg = "Could not find any unzipped files in %s" % media.base_path print err_msg print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail(status=412, message=err_msg) return os.chdir(this_dir) if DEBUG: print "UNZIPPED FILES: \n%s" % unzipped_files ZIPPED_ASSET_EXPECTED_NAMES = { 'source' : [ r"publicKey", r"baseImage_\d", r"credentials" ], 'j3mlog' : [ r"log.j3m(?:\.json)?", r".+\.(?:jpg|mkv)$" ] } assets = [] import re for facet, names in ZIPPED_ASSET_EXPECTED_NAMES.iteritems(): for file in unzipped_files: matches = [n for n in names if re.match(n, file) is not None] if len(matches) > 0: assets.append(file) if uv_task.get_next() is None: if facet == "source": uv_task.put_next([ "Source.init_source.initSource" ]) elif facet == "j3mlog": uv_task.put_next([ "Log.unpack_j3mlog.unpackJ3MLog", 
"J3M.j3mify.j3mify", "J3M.massage_j3m.massageJ3M", "PGP.verify_signature.verifySignature", "J3M.verify_visual_content.verifyVisualContent" ]) media.addCompletedTask(uv_task.task_path) if uv_task.get_next() is None: print "NO DECERNABLE TASK PATH" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return ''' could be either a source or a j3mlog at this point. ''' uv_task.routeNext(inflate={'assets' : assets}) uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def locate_j3m(uv_task): task_tag = "PULLING J3M" print "\n\n************** %s [START] ******************\n" % task_tag print "pulling j3m at %s" % uv_task.doc_id uv_task.setStatus(302) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG, ANNEX_DIR from vars import ASSET_TAGS media = UnveillanceDocument(_id=uv_task.doc_id) if media is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag uv_task.fail() return from lib.Worker.Utils.funcs import getFileType from vars import MIME_TYPES, MIME_TYPE_MAP ic_j3m_txt = media.loadAsset("j3m_raw.txt") ic_j3m_txt_mime_type = getFileType(ic_j3m_txt, as_buffer=True, force_json=True) inflate = {} print "J3M MIME TYPE SNIFFED: %s" % ic_j3m_txt_mime_type if ic_j3m_txt_mime_type != MIME_TYPES['json']: import os from lib.Core.Utils.funcs import b64decode un_b64 = b64decode(ic_j3m_txt) if un_b64 is not None: un_b64_mime_type = getFileType(un_b64, as_buffer=True) if un_b64_mime_type in [MIME_TYPES['pgp'], MIME_TYPES['gzip']]: if DEBUG: print "MIME TYPE: %s" % un_b64_mime_type asset_path = "j3m_raw.%s" % MIME_TYPE_MAP[un_b64_mime_type] media.addAsset(un_b64, asset_path) if DEBUG: print "\n\nPGP KEY FILE PATH: %s\n\n" % asset_path gz = media.addAsset(None, "j3m_raw.gz", tags=[ASSET_TAGS['OB_M']], description="j3m data extracted from obscura marker") if un_b64_mime_type == MIME_TYPES['pgp']: uv_task.put_next([ "PGP.decrypt.decrypt", "J3M.j3mify.parse_zipped_j3m" ]) inflate.update({ 'pgp_file' : os.path.join(media.base_path, asset_path), 'save_as' : gz }) was_encrypted = True elif un_b64_mime_type in MIME_TYPES['gzip']: uv_task.put_next("J3M.j3mify.parse_zipped_j3m") else: import os from fabric.api import settings, local with settings(warn_only=True): src_j3m = os.path.join(ANNEX_DIR, media.base_path, "j3m_raw.txt") dest_j3m = os.path.join(ANNEX_DIR, media.base_path, "j3m_raw.json") local("mv %s %s" % (src_j3m, dest_j3m)) print "PUTTING J3M FROM HERE!!!! WAS JSON! 
(%s -> %s)" % (src_j3m, dest_j3m) media.addAsset(None, "j3m_raw.json") uv_task.put_next([ "J3M.j3mify.j3mify", "PGP.verify_signature.verifySignature", "J3M.massage_j3m.massageJ3M", "J3M.verify_visual_content.verifyVisualContent", "J3M.notarize.notarize_media" ]) inflate.update({'j3m_name' : "j3m_raw.json"}) media.addCompletedTask(uv_task.task_path) uv_task.routeNext(inflate=inflate) uv_task.finish() print "\n\n************** %s [END] ******************\n" % task_tag