def __init__(self, inflate=None, _id=None): if inflate is not None: if "task_path" in inflate.keys() and "task_path" == "Github.gist.run_gist": if "gist_id" not in args.keys(): return inflate["_id"] = generateMD5Hash(content=inflate["gist_id"], salt=time()) elif "_id" not in inflate.keys(): inflate["_id"] = generateMD5Hash() inflate["uv_doc_type"] = UV_DOC_TYPE["TASK"] inflate["status"] = 201 super(UnveillanceTask, self).__init__( _id=_id, inflate=inflate, emit_sentinels=[ EmitSentinel("ctx", "Worker", None), EmitSentinel("log_file", "str", None), EmitSentinel("task_channel", "UnveillanceTaskChannel", None), ], ) self.pid_file = os.path.join(ANNEX_DIR, self.base_path, "pid.txt") if not hasattr(self, "log_file"): self.log_file = os.path.join(ANNEX_DIR, self.base_path, "log.txt") else: if DEBUG: print "INHERITED A LOG FILE: %s" % self.log_file
def pull_avitar(self):
    """
    Download this user's avatar ("avitar") image into the annex and store a
    pypuzzle comparison vector for it as a second asset.

    Returns True on success, False when the asset slot cannot be created or
    the image vector cannot be computed.
    """
    print self.emit()
    t = time()
    # Reserve an asset slot named after the hash of the avatar URL plus a
    # timestamp, so re-pulls do not overwrite earlier captures.
    avi = self.addAsset(None, "%s_%d.png" % (generateMD5Hash(content=self.profile_image_url), t),
        description="user's avitar at %d" % t, tags=[ASSET_TAGS['FD_AVI']])
    if avi is None:
        return False
    # Fetch the image straight into the annex; warn_only means a failed wget
    # does not raise, it just leaves a missing/empty file for the next step.
    with settings(warn_only=True):
        local("wget -O %s %s" % (os.path.join(ANNEX_DIR, avi), self.profile_image_url))
    import pypuzzle
    puzz = pypuzzle.Puzzle()
    try:
        # Perceptual-comparison vector, used elsewhere for image matching.
        cvec = puzz.get_cvec_from_file(os.path.join(ANNEX_DIR, avi))
        self.addAsset(cvec, "avitar_image_cvec_%d.json" % t, as_literal=False,
            tags=[ASSET_TAGS['IMAGE_CVEC']])
        return True
    except Exception as e:
        if DEBUG:
            print "Could not get image vector because %s" % e
    return False
def decryptUserData(ciphertext, password, iv=None, p_salt=None): try: ciphertext_json = json.loads(b64decode(ciphertext)) ciphertext = ciphertext_json['data'].decode('hex') except Exception as e: if DEBUG: print e return None if p_salt is not None: password = password + p_salt try: if iv is None: iv = ciphertext_json['iv'].decode('hex') else: try: from conf import IV except ImportError as e: if DEBUG: print e return None iv = IV.decode('hex') except Exception as e: if DEBUG: print e return None aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv) user_data = json.loads(unpad(aes.decrypt(ciphertext))) if user_data['username']: return user_data return None
def createNewUser(username, password, as_admin=False): try: IV = getConfig('encryption.iv') SALT = getConfig('encryption.salt') USER_SALT = getConfig('encyption.user_salt') except Exception as e: if DEBUG: print e return None try: user_data = copy.deepcopy(USER_CREDENTIAL_PACK) user_data['username'] = username if as_admin: user_data['admin'] = True user_data['annex_key_sent'] = False if DEBUG: print "creating %s as admin!" % username user_root = "%s.txt" % generateMD5Hash(content=username, salt=USER_SALT) if os.path.exists(os.path.join(USER_ROOT, user_root)): if DEBUG: print "user already exists NOPE!" return False print user_data with open(os.path.join(USER_ROOT, user_root), 'wb+') as user: user.write(encryptUserData(user_data, password, p_salt=SALT, iv=IV)) return True except Exception as e: print e return False
def screenshot_tweet(uv_task):
    """
    Task: render and store a screenshot of the tweet behind uv_task.doc_id.

    Fails the task with 404 when the mention cannot be loaded, 412 when it
    has no url or the capture fails; otherwise marks the task complete and
    routes the next task in the queue.
    """
    task_tag = "TWEETER: SCREENSHOTTING TWEET"
    print "\n\n************** %s [START] ******************\n" % task_tag
    uv_task.setStatus(302)
    from lib.Worker.Models.dl_FD_mention import FoxyDoxxingMention
    try:
        mention = FoxyDoxxingMention(_id=uv_task.doc_id)
    except Exception as e:
        error_msg = "Cannot load mention: %s" % e
        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail(status=404, message=error_msg)
        return
    if not hasattr(mention, 'url'):
        error_msg = "no url for this tweet"
        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail(status=412, message=error_msg)
        return
    import os
    from lib.Core.Utils.funcs import generateMD5Hash
    from lib.Worker.Models.fd_screencapper import FoxyDoxxingScreenCapper
    from conf import DEBUG, ANNEX_DIR
    from vars import ASSET_TAGS
    # Reserve the asset slot first; the capper writes directly to its path.
    asset_path = mention.addAsset(None, "cap_%s.png" % generateMD5Hash(content=mention.url),
        description="Screen Capture from %s" % mention.url, tags=[ASSET_TAGS['FD_CAP']])
    if DEBUG:
        print "SAVING SCREENCAP TO:"
        print asset_path
    cap = FoxyDoxxingScreenCapper(mention.url, asset_path)
    if not cap.success:
        error_msg = "Trouble screenshotting"
        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail(status=412, message=error_msg)
        return
    mention.addCompletedTask(uv_task.task_path)
    uv_task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
def loginUser(self, username, password, handler):
    """
    Authenticate a user by decrypting their credential file and, on success,
    set secure session cookies on the tornado handler.

    Returns the decrypted user_data dict, or None on any failure.
    """
    try:
        SALT = getConfig('encryption.salt')
        # NOTE(review): 'encyption' (sic) matches the key used when the user
        # was created — do not "fix" the typo here alone.
        USER_SALT = getConfig('encyption.user_salt')
    except Exception as e:
        if DEBUG:
            print e
        return None
    from base64 import b64encode
    # Import path differs depending on deployment layout.
    try:
        from Utils.funcs import decryptUserData
    except Exception as e:
        if DEBUG:
            print e
        from lib.Frontend.Utils.funcs import decryptUserData
    try:
        # Credential files are named by the salted hash of the username.
        user_root = "%s.txt" % generateMD5Hash(content=username, salt=USER_SALT)
        with open(os.path.join(USER_ROOT, user_root), 'rb') as UD:
            user_data = decryptUserData(UD.read(), password, p_salt=SALT)
        if user_data is None:
            return None
        try:
            if user_data['admin']:
                # Admin status is surfaced only via the cookie; the flag is
                # stripped from the returned payload.
                del user_data['admin']
                handler.set_secure_cookie(UnveillanceCookie.ADMIN, "true", path="/", expires_days=1)
                if not self.do_get_drive_status():
                    self.initDriveClient()
                # Lazily record whether the annex key has been delivered.
                if "annex_key_sent" not in user_data.keys() or not user_data['annex_key_sent']:
                    if self.drive_client.checkAnnexKeyStatus():
                        user_data['annex_key_sent'] = True
        except KeyError as e:
            if DEBUG:
                print e
            pass
        handler.set_secure_cookie(UnveillanceCookie.USER, b64encode(json.dumps(user_data)), path="/", expires_days=1)
        return user_data
    except Exception as e:
        if DEBUG:
            print e
    return None
def __init__(self, _id=None, inflate=None):
    """
    J3M document wrapper: new instances receive a deterministic _id derived
    from the parent media_id (salted with the j3m mime type), plus the
    farm/doc-type/mime bookkeeping fields, before the base class inflates.
    """
    if inflate is not None:
        from lib.Core.Utils.funcs import generateMD5Hash
        from conf import UUID
        from vars import UV_DOC_TYPE, MIME_TYPES
        inflate.update({
            '_id': generateMD5Hash(content=inflate['media_id'], salt=MIME_TYPES['j3m']),
            'farm': UUID,
            'uv_doc_type': UV_DOC_TYPE['DOC'],
            'mime_type': MIME_TYPES['j3m'],
        })
    super(InformaCamJ3M, self).__init__("ic_j3m", _id=_id, inflate=inflate)
def routeNext(self, inflate=None):
    """
    Pop the next task path off this task's queue and launch it, carrying
    forward the persisted keys from this task. When the queue is exhausted,
    re-fire any registered recurring tasks instead.
    """
    if DEBUG:
        print "ROUTING NEXT TASK FROM QUEUE\nCLONING SOME VARS FROM SELF:\n%s" % self.emit()
    # A task can flag itself terminal; nothing downstream may run.
    if hasattr(self, "no_continue"):
        if DEBUG:
            print "NO CONTINUE FLAG DETECTED. NO ROUTING POSSIBLE."
        self.signal_terminate()
        return
    next_task_path = self.get_next()
    if next_task_path is None:
        if DEBUG:
            print "TASK QUEUE EXHAUSTED. NO ROUTING POSSIBLE."
        self.signal_terminate()
        # A recurring task is re-run only if its stored _id still equals the
        # hash of (task_path, salt, persist * persist_until) — i.e. the
        # schedule it was registered with (see set_recurring) is unchanged.
        if hasattr(self, "recurring"):
            for r in self.recurring:
                try:
                    r = UnveillanceTask(_id=r)
                    salt = "%s%s" % (
                        "" if r.salt is None else getattr(self, r.salt),
                        str(r.persist * r.persist_until),
                    )
                    if generateMD5Hash(content="%s_persist" % r.task_path, salt=salt) == r._id:
                        r.run()
                except Exception as e:
                    if DEBUG:
                        print e, type(e)
        return
    if inflate is None:
        inflate = {}
    # Clone the whitelisted keys (plus any task-specific extras) into the
    # next task so pipeline state flows downstream.
    persist_keys = TASK_PERSIST_KEYS
    if hasattr(self, "persist_keys"):
        persist_keys += self.persist_keys
    for a in persist_keys:
        if hasattr(self, a):
            inflate[a] = getattr(self, a)
    inflate["task_path"] = next_task_path
    next_task = UnveillanceTask(inflate=inflate)
    next_task.run()
def logoutUser(self, credentials, handler): handler.clear_cookie(UnveillanceCookie.USER) handler.clear_cookie(UnveillanceCookie.ADMIN) try: username = credentials['username'] except KeyError as e: return None try: password = credentials['password'] except KeyError as e: return True try: IV = getConfig('encryption.iv') SALT = getConfig('encryption.salt') USER_SALT = getConfig('encyption.user_salt') except Exception as e: if DEBUG: print e return None try: from Utils.funcs import decryptUserData, encryptUserData except Exception as e: if DEBUG: print e from lib.Frontend.Utils.funcs import decryptUserData, encryptUserData user_root = "%s.txt" % generateMD5Hash(content=username, salt=USER_SALT) with open(os.path.join(USER_ROOT, user_root), 'rb') as UD: user_data = decryptUserData(UD.read, password, p_salt=SALT) if user_data is None: return None new_data = copy.deepcopy(user_data) new_data['saved_searches'] = credentials['save_data'][ 'saved_searches'] try: new_data['annex_key_sent'] = credentials['save_data'][ 'annex_key_sent'] except KeyError as e: pass with open(os.path.join(USER_ROOT, user_root), 'wb+') as UD: UD.write(encryptUserData(new_data, password, iv=IV, p_salt=SALT)) return True return None
def encryptUserData(plaintext, password, iv=None, p_salt=None):
    """
    AES-CBC-encrypt a JSON-serializable payload under a password-derived key.

    Key = MD5(password + p_salt). A fresh random IV is generated unless one
    is supplied (hex-encoded). Returns a b64-encoded JSON envelope
    {'iv': hex, 'data': hex} that decryptUserData can open.
    """
    if p_salt is not None:
        password = password + p_salt
    if iv is None:
        iv = generateSecureRandom()
    else:
        iv = iv.decode('hex')
    # NOTE(review): an MD5-derived key and the unconditional debug print of
    # the envelope below are weak practice for credential storage — flagged,
    # not changed here.
    aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv)
    ciphertext = {
        'iv' : iv.encode('hex'),
        'data' : aes.encrypt(pad(json.dumps(plaintext))).encode('hex')
    }
    print ciphertext
    return b64encode(json.dumps(ciphertext))
def encryptUserData(plaintext, password, iv=None, p_salt=None): if p_salt is not None: password = password + p_salt if iv is None: iv = generateSecureRandom() else: iv = iv.decode('hex') aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv) ciphertext = { 'iv': iv.encode('hex'), 'data': aes.encrypt(pad(json.dumps(plaintext))).encode('hex') } print ciphertext return b64encode(json.dumps(ciphertext))
def __init__(self, inflate=None, _id=None, auto_pull=False):
    """
    Twitter-user document: on creation, looks the user up via the API,
    caches the raw lookup JSON in the annex, copies select profile fields
    into the document, and mints an _id from the numeric twitter id.

    auto_pull: when True, immediately fetch the user's avatar as well.
    """
    emit_sentinels = [
        EmitSentinel("config", "dict", None),
        EmitSentinel("service", "Api", None),
        EmitSentinel("usable", "bool", None)]
    TwitterClient.__init__(self)
    if inflate is not None:
        # A screen_name is the minimum needed to look anyone up.
        if 'screen_name' not in inflate.keys():
            return
        try:
            lookup = self.lookup_user(screen_name=inflate['screen_name']).AsDict()
            print lookup
        except Exception as e:
            if DEBUG:
                print "COULD NOT LOOKUP TWIITERER:"
                print e, type(e)
            return
        if 'file_name' not in inflate.keys():
            inflate['file_name'] = "%s.json" % inflate['screen_name']
        # Cache the raw API response alongside the annex documents.
        with open(os.path.join(ANNEX_DIR, inflate['file_name']), 'wb+') as F:
            F.write(json.dumps(lookup))
        # Copy the profile fields we index on; missing keys are skipped.
        for i in ['id', 'profile_image_url', 'entities', 'friends_count',
                'followers_count', 'listed_count', 'created_at', 'time_zone']:
            try:
                inflate[i] = lookup[i]
                print "ADDING %s: %s" % (i, inflate[i])
            except Exception as e:
                print "COULD NOT GET KEY: %s" % i
                pass
        inflate['_id'] = generateMD5Hash(content=inflate['id'])
        if 'created_at' in inflate.keys():
            from time import mktime
            from dateutil.parser import parse
            # Store a sortable unix timestamp next to the string date.
            inflate['created_at_ts'] = mktime(parse(inflate['created_at']).timetuple())
    UnveillanceDocument.__init__(self, inflate=inflate, _id=_id, emit_sentinels=emit_sentinels)
    if auto_pull:
        self.pull_avitar()
def set_recurring(self, task_path, persist_period, persist_until, inflate=None, salt=None): # persist period in minutes if DEBUG: print "SETTING A RECURRING TASK UNTIL %d TIME, PERIOD %d" % (persist_until, persist_period) # check for reasonability max_time = mktime((date.today() + timedelta(1)).timetuple()) if not (persist_until <= max_time and persist_until > time()): if DEBUG: print "TIME NOT REASONABLE." return if inflate is None: inflate = {} persist_keys = TASK_PERSIST_KEYS if hasattr(self, "persist_keys"): persist_keys += self.persist_keys for a in persist_keys: if a == "recurring": continue if hasattr(self, a): inflate[a] = getattr(self, a) inflate.update( { "task_path": task_path, "persist": persist_period, "persist_until": persist_until, "salt": None if not hasattr(self, salt) else salt, } ) inflate["_id"] = generateMD5Hash( content="%s_persist" % task_path, salt="%s%s" % ("" if inflate["salt"] is None else getattr(self, salt), str(persist_period * persist_until)), ) if not hasattr(self, "recurring"): self.recurring = [] self.recurring.append(UnveillanceTask(inflate=inflate)._id) self.recurring = list(set(self.recurring)) self.save()
def loginUser(self, username, password, handler):
    """
    Authenticate a user by decrypting their credential file and, on success,
    set secure session cookies on the tornado handler.

    Returns the decrypted user_data dict, or None on any failure.
    """
    try:
        SALT = getConfig('encryption.salt')
        # NOTE(review): 'encyption' (sic) matches the key used when the user
        # was created — do not "fix" the typo here alone.
        USER_SALT = getConfig('encyption.user_salt')
    except Exception as e:
        if DEBUG:
            print e
        return None
    from base64 import b64encode
    # Import path differs depending on deployment layout.
    try:
        from Utils.funcs import decryptUserData
    except Exception as e:
        if DEBUG:
            print e
        from lib.Frontend.Utils.funcs import decryptUserData
    try:
        # Credential files are named by the salted hash of the username.
        user_root = "%s.txt" % generateMD5Hash(content=username, salt=USER_SALT)
        with open(os.path.join(USER_ROOT, user_root), 'rb') as UD:
            user_data = decryptUserData(UD.read(), password, p_salt=SALT)
        if user_data is None:
            return None
        try:
            if user_data['admin']:
                # Admin status is surfaced only via the cookie; the flag is
                # stripped from the returned payload.
                del user_data['admin']
                handler.set_secure_cookie(UnveillanceCookie.ADMIN, "true", path="/", expires_days=1)
                if not self.do_get_drive_status():
                    self.initDriveClient()
                # Lazily record whether the annex key has been delivered.
                if "annex_key_sent" not in user_data.keys() or not user_data['annex_key_sent']:
                    if self.drive_client.checkAnnexKeyStatus():
                        user_data['annex_key_sent'] = True
        except KeyError as e:
            if DEBUG:
                print e
            pass
        handler.set_secure_cookie(UnveillanceCookie.USER, b64encode(json.dumps(user_data)), path="/", expires_days=1)
        return user_data
    except Exception as e:
        if DEBUG:
            print e
    return None
def logoutUser(self, credentials, handler): handler.clear_cookie(UnveillanceCookie.USER) handler.clear_cookie(UnveillanceCookie.ADMIN) try: username = credentials['username'] except KeyError as e: return None try: password = credentials['password'] except KeyError as e: return True try: IV = getConfig('encryption.iv') SALT = getConfig('encryption.salt') USER_SALT = getConfig('encyption.user_salt') except Exception as e: if DEBUG: print e return None try: from Utils.funcs import decryptUserData, encryptUserData except Exception as e: if DEBUG: print e from lib.Frontend.Utils.funcs import decryptUserData, encryptUserData user_root = "%s.txt" % generateMD5Hash(content=username,salt=USER_SALT) with open(os.path.join(USER_ROOT, user_root), 'rb') as UD: user_data = decryptUserData(UD.read, password, p_salt=SALT) if user_data is None: return None new_data = copy.deepcopy(user_data) new_data['saved_searches'] = credentials['save_data']['saved_searches'] try: new_data['annex_key_sent'] = credentials['save_data']['annex_key_sent'] except KeyError as e: pass with open(os.path.join(USER_ROOT, user_root), 'wb+') as UD: UD.write(encryptUserData(new_data, password, iv=IV, p_salt=SALT)) return True return None
def __init__(self, _id=None, inflate=None, emit_sentinels=None): EMIT_SENTINELS = [EmitSentinel("documents", "UnveillanceDocument", "_id")] if inflate is not None: from lib.Core.Utils.funcs import generateMD5Hash try: inflate['_id'] = generateMD5Hash(content="".join(inflate['documents'])) print "NEW BATCH WITH ID %s" % inflate['_id'] print inflate['documents'] except Exception as e: print "ERROR WITH NEW BATCH ID GEN:" print e if emit_sentinels is None: if type(emit_sentinels) is not list: emit_sentinels = [emit_sentinels] EMIT_SENTINELS.extend(emit_sentinels) super(UnveillanceBatch, self).__init__(_id=_id, inflate=inflate, emit_sentinels=EMIT_SENTINELS)
def __init__(self, els_doc_root, emit_sentinels=None, _id=None, inflate=None):
    """
    Base ELS document constructor: start from the module-wide sentinels,
    merge any caller-supplied ones, then either inflate-and-save a brand new
    record or fetch an existing one by id.
    """
    self.emit_sentinels = deepcopy(EMIT_SENTINELS)
    self.els_doc_root = els_doc_root
    if emit_sentinels is not None:
        extras = emit_sentinels if type(emit_sentinels) is list else [emit_sentinels]
        self.emit_sentinels.extend(extras)
    if inflate is not None:
        from lib.Core.Utils.funcs import generateMD5Hash
        # Creation time in milliseconds; it doubles as the id salt.
        inflate['date_added'] = time() * 1000
        if '_id' not in inflate.keys():
            inflate['_id'] = generateMD5Hash(salt=inflate['date_added'])
        self.inflate(inflate)
        self.save(create=True)
    elif _id is not None:
        self.getObject(_id, els_doc_root)
def __init__(self, _id=None, inflate=None, emit_sentinels=None):
    """
    Text-stub document: wraps an existing file on disk by symlinking it into
    the annex under a media_id-prefixed name, then registers the document
    with a deterministic _id derived from the parent media_id.
    """
    if emit_sentinels is None:
        emit_sentinels = []
    if type(emit_sentinels) is not list:
        emit_sentinels = [emit_sentinels]
    # els_doc_root is runtime-only state; keep it out of the emit.
    emit_sentinels.append(EmitSentinel("els_doc_root", "str", None))
    if inflate is not None:
        import os
        from fabric.api import settings, local
        from lib.Core.Utils.funcs import generateMD5Hash
        from conf import UUID, ANNEX_DIR
        from vars import UV_DOC_TYPE, MIME_TYPES
        inflate['_id'] = generateMD5Hash(content=inflate['media_id'], salt=MIME_TYPES['txt_stub'])
        inflate['farm'] = UUID
        inflate['uv_doc_type'] = UV_DOC_TYPE['DOC']
        inflate['mime_type'] = MIME_TYPES['txt_stub']
        inflate['els_doc_root'] = "uv_text_stub"
        # Symlink the source file into the annex rather than copying it;
        # warn_only so an already-existing link does not abort construction.
        this_dir = os.getcwd()
        os.chdir(ANNEX_DIR)
        file_name = "%s_%s" % (inflate['media_id'], inflate['file_name'].split("/")[-1])
        with settings(warn_only=True):
            ln = local("ln -s %s %s" % (inflate['file_name'], file_name), capture=True)
            if DEBUG:
                print ln
        os.chdir(this_dir)
        inflate['file_name'] = file_name
    super(UnveillanceText, self).__init__(_id=_id, inflate=inflate, emit_sentinels=emit_sentinels)
def __init__(self, _id=None, inflate=None): if inflate is not None: for must in ['documents', 'task_path']: if must not in inflate.keys(): raise Exception("No documents and/or task_path") if DEBUG: print "INFLATING CLUSTER" from copy import deepcopy from lib.Core.Utils.funcs import generateMD5Hash from conf import UUID ids = deepcopy(inflate['documents']) if "query" in inflate.keys(): ids += inflate['query'] inflate.update({ '_id' : generateMD5Hash(content="".join(ids), salt=inflate['task_path']), 'queue' : UUID, 'uv_cluster' : True }) super(UnveillanceCluster, self).__init__(_id=_id, inflate=inflate)
def extractPDFText(uv_task):
    """
    Task: extract per-page text from a PDF (or its pre-split parts), index
    each page as an ELS stub, and save the full page list as a JSON asset.
    """
    task_tag = "PDF TEXT EXTRACTION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "extracting text from pdf at %s" % uv_task.doc_id
    uv_task.setStatus(302)
    from lib.Worker.Models.cp_pdf import CompassPDF
    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return
    """
    In this task, we might be asked to extract from a broken-up sub-group of documents.
    if so, that should be set in the task's properties.
    """
    import os
    from fabric.api import settings, local
    from wand.image import Image
    from time import sleep
    from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
    from Models.uv_els_stub import UnveillanceELSStub
    from conf import ANNEX_DIR, DEBUG
    from vars import ASSET_TAGS
    # Pre-size the page-text list; pages that fail extraction stay None.
    texts = [None] * pdf.total_pages
    if pdf.hasParts():
        extractors = pdf.getParts()
    else:
        extractors = [pdf.file_name]
    count = 0
    for e in extractors:
        if e == pdf.file_name:
            pdf_reader = pdf.loadFile(e)
        else:
            pdf_reader = pdf.loadAsset(e)
        try:
            num_pages = pdf_reader.getNumPages()
        except AttributeError as e:
            # Unreadable part (loader returned None); skip it.
            print e
            continue
        for x in xrange(0, num_pages):
            text = cleanLine(pdf_reader.getPage(x).extractText())
            texts[count] = text
            # The stub is never referenced again: its constructor persists it
            # (UnveillanceELSStub.__init__ saves on inflate).
            els_stub = UnveillanceELSStub('cp_page_text', inflate={
                'media_id' : pdf._id,
                'searchable_text' : text,
                'index_in_parent' : count,
                '_id' : generateMD5Hash(content=pdf._id, salt=str(count)) })
            count += 1
    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. unclean.",
        tags=[ASSET_TAGS['TXT_JSON']])
    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)
    pdf.save()
    del texts
    pdf.addCompletedTask(uv_task.task_path)
    # Downstream tasks receive the extracted-text asset path.
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
def uploadToAnnex(self, netcat_stub): use_git_annex = False this_dir = os.getcwd() os.chdir(ANNEX_DIR) if type(netcat_stub['file']) in [str, unicode]: if GIT_ANNEX is not None: use_git_annex = True if DEBUG: print "GIT ANNEX ATTACHED TO INSTANCE." if use_git_annex: with settings(warn_only=True): # has this stub been uploaded? is_absorbed = local( "%s metadata \"%s\" --json --get=uv_uploaded" % (GIT_ANNEX, netcat_stub['save_as']), capture=True) if DEBUG: print "%s absorbed? (uv_uploaded = %s type = %s)" % ( netcat_stub['save_as'], is_absorbed, type(is_absorbed)) if is_absorbed == "" or "False": is_absorbed = False elif is_absorbed == "True": is_absorbed = True else: is_absorbed = False else: is_absorbed = False else: is_absorbed = False if is_absorbed: if DEBUG: print "%s IS absorbed (uv_uploaded = %s)" % ( netcat_stub['save_as'], is_absorbed) os.chdir(this_dir) return None new_hash = self.get_new_hash(netcat_stub['file']) possible_duplicate = self.checkForDuplicate(new_hash) if possible_duplicate is not None: if DEBUG: print "Document already exists in Annex and will not be uploaded! 
Here it is:" print possible_duplicate p = UnveillanceFabricProcess(register_upload_attempt, {'_id': possible_duplicate['_id']}) p.join() os.chdir(this_dir) self.netcat_queue.remove(netcat_stub) possible_duplicate = self.checkForDuplicate( possible_duplicate['_id']) possible_duplicate.update({ 'uploaded': False, 'duplicate_attempt': True }) return possible_duplicate with settings(warn_only=True): new_save_as = generateMD5Hash(content=new_hash, salt=local("whoami", capture=True)) if type(netcat_stub['file']) in [str, unicode]: new_file = netcat_stub['file'].replace(netcat_stub['save_as'], new_save_as) with settings(warn_only=True): local("mv \"%s\" %s" % (netcat_stub['file'], new_file)) if use_git_annex: local("%s metadata %s --json --set=uv_file_alias=\"%s\"" % (GIT_ANNEX, new_file, netcat_stub['save_as'])) netcat_stub['file'] = new_file netcat_stub['alias'] = netcat_stub['save_as'] netcat_stub['save_as'] = new_save_as success_tag = False # look up to see if this file is already in the annex with settings(warn_only=True): if type(netcat_stub['file']) in [str, unicode] and use_git_annex: local("%s add %s" % (GIT_ANNEX, netcat_stub['save_as'])) p = UnveillanceFabricProcess(netcat, netcat_stub) p.join() if p.error is None and p.output is not None: success_tag = True if DEBUG: print "NETCAT RESULT: (type=%s, success=%s)" % (type( p.output), success_tag) print "NETCAT ERROR (none is good!): (type=%s)" % type(p.error) if p.output is not None and DEBUG: for o in p.output: print "\n%s\n" % o if p.error is not None and DEBUG: print "ERROR:" print p.error if type(netcat_stub['file']) in [str, unicode] and use_git_annex: local("%s metadata \"%s\" --json --set=uv_uploaded=%s" % (GIT_ANNEX, netcat_stub['save_as'], str(success_tag))) self.netcat_queue.remove(netcat_stub) os.chdir(this_dir) return {'uploaded': success_tag, '_id': new_hash}
def get_documentcloud_ocr(uv_task):
    """
    Task: pull per-page OCR text for a PDF from DocumentCloud, index each
    page as an ELS stub, and save the page list as a JSON asset.

    Requires uv_task.documentcloud_auth; when uv_task.documentcloud_id is
    absent, it is guessed from the PDF's file alias.
    """
    task_tag = "PULLING OCR FROM DOCUMENTCLOUD"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "OCRing text via documentcloud from pdf at %s" % uv_task.doc_id
    uv_task.setStatus(302)
    if not hasattr(uv_task, "documentcloud_auth"):
        error_msg = "DOCUMENTCLOUD AUTH STRING NEEDED"
        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail(status=412, message=error_msg)
        return
    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG
    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return
    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        print "PDF READER IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return
    if not hasattr(uv_task, "documentcloud_id"):
        try:
            # Fall back on the upload alias minus its extension.
            uv_task.documentcloud_id = pdf.file_alias.replace(".pdf", "")
            print "DOCUMENTCLOUD ID NOT PASSED. \nGUESSING AT IT WITH %s" % uv_task.documentcloud_id
            print "\n\n************** %s [WARN] ******************\n" % task_tag
        except Exception as e:
            print "COULD NOT GET DOCUMENTCLOUD ID FOR %s" % pdf.file_name
            print e
            print "\n\n************** %s [ERROR] ******************\n" % task_tag
            return
    import os, requests
    from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
    from Models.uv_els_stub import UnveillanceELSStub
    from conf import ANNEX_DIR
    from vars import ASSET_TAGS
    # Pre-size the page-text list; pages with no OCR become "".
    texts = [None] * pdf.total_pages
    count = 0
    # id format "<numeric-id>-<slug...>": first segment is the id ('s'),
    # the remainder the document slug ('d'); 'a' is the auth string.
    req_map = {
        'a' : uv_task.documentcloud_auth,
        's' : uv_task.documentcloud_id.split('-')[0],
        'd' : "-".join(uv_task.documentcloud_id.split('-')[1:]) }
    for x in xrange(0, pdf.total_pages):
        req_map['x'] = x
        req = "https://%(a)[email protected]/documents/%(s)s/pages/%(d)s-p%(x)d.txt" % (req_map)
        if DEBUG:
            print "trying %s" % req
        r = requests.get(req)
        if r.status_code != 200:
            print "\n\n************** %s [WARN] ******************\n" % task_tag
            print "no text at page %d" % x
        else:
            texts[count] = r.content
            # The stub's constructor persists it (saves on inflate).
            els_stub = UnveillanceELSStub('cp_page_text', inflate={
                'media_id' : pdf._id,
                'searchable_text' : texts[count],
                'index_in_parent' : count,
                '_id' : generateMD5Hash(content=pdf._id, salt=str(count)) })
        if texts[count] is None or len(texts[count]) == 0:
            print "\n\n************** %s [WARN] ******************\n" % task_tag
            print "no text at page %d (%s)" % (x, type(texts[count]))
            texts[count] = ""
        count += 1
    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document, from DocumentCloud",
        tags=[ASSET_TAGS['TXT_JSON']])
    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)
    pdf.save()
    del texts
    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
def uploadToAnnex(self, netcat_stub): use_git_annex = False this_dir = os.getcwd() os.chdir(ANNEX_DIR) if type(netcat_stub['file']) in [str, unicode]: if GIT_ANNEX is not None: use_git_annex = True if DEBUG: print "GIT ANNEX ATTACHED TO INSTANCE." if use_git_annex: with settings(warn_only=True): # has this stub been uploaded? is_absorbed = local("%s metadata \"%s\" --json --get=uv_uploaded" % ( GIT_ANNEX, netcat_stub['save_as']), capture=True) if DEBUG: print "%s absorbed? (uv_uploaded = %s type = %s)" % ( netcat_stub['save_as'], is_absorbed, type(is_absorbed)) if is_absorbed == "" or "False": is_absorbed = False elif is_absorbed == "True": is_absorbed = True else: is_absorbed = False else: is_absorbed = False else: is_absorbed = False if is_absorbed: if DEBUG: print "%s IS absorbed (uv_uploaded = %s)" % ( netcat_stub['save_as'], is_absorbed) os.chdir(this_dir) return None new_hash = self.get_new_hash(netcat_stub['file']) possible_duplicate = self.checkForDuplicate(new_hash) if possible_duplicate is not None: if DEBUG: print "Document already exists in Annex and will not be uploaded! 
Here it is:" print possible_duplicate p = UnveillanceFabricProcess(register_upload_attempt, {'_id' : possible_duplicate['_id'] }) p.join() os.chdir(this_dir) self.netcat_queue.remove(netcat_stub) possible_duplicate = self.checkForDuplicate(possible_duplicate['_id']) possible_duplicate.update({ 'uploaded' : False, 'duplicate_attempt' : True }) return possible_duplicate with settings(warn_only=True): new_save_as = generateMD5Hash(content=new_hash, salt=local("whoami", capture=True)) if type(netcat_stub['file']) in [str, unicode]: new_file = netcat_stub['file'].replace(netcat_stub['save_as'], new_save_as) with settings(warn_only=True): local("mv \"%s\" %s" % (netcat_stub['file'], new_file)) if use_git_annex: local("%s metadata %s --json --set=uv_file_alias=\"%s\"" % (GIT_ANNEX, new_file, netcat_stub['save_as'])) netcat_stub['file'] = new_file netcat_stub['alias'] = netcat_stub['save_as'] netcat_stub['save_as'] = new_save_as success_tag = False # look up to see if this file is already in the annex with settings(warn_only=True): if type(netcat_stub['file']) in [str, unicode] and use_git_annex: local("%s add %s" % (GIT_ANNEX, netcat_stub['save_as'])) p = UnveillanceFabricProcess(netcat, netcat_stub) p.join() if p.error is None and p.output is not None: success_tag = True if DEBUG: print "NETCAT RESULT: (type=%s, success=%s)" % (type(p.output), success_tag) print "NETCAT ERROR (none is good!): (type=%s)" % type(p.error) if p.output is not None and DEBUG: for o in p.output: print "\n%s\n" % o if p.error is not None and DEBUG: print "ERROR:" print p.error if type(netcat_stub['file']) in [str, unicode] and use_git_annex: local("%s metadata \"%s\" --json --set=uv_uploaded=%s" % ( GIT_ANNEX, netcat_stub['save_as'], str(success_tag))) self.netcat_queue.remove(netcat_stub) os.chdir(this_dir) return { 'uploaded' : success_tag, '_id' : new_hash }
def getFileNameHash(self, name_base):
    """
    Return the salted MD5 digest used as this document's on-disk file name.
    The DOC_SALT keeps stored names stable yet non-guessable.
    """
    from conf import DOC_SALT
    hashed_name = generateMD5Hash(content=name_base, salt=DOC_SALT)
    return hashed_name