Example #1
0
    def __init__(self, inflate=None, _id=None):
        if inflate is not None:
            if "task_path" in inflate.keys() and "task_path" == "Github.gist.run_gist":
                if "gist_id" not in args.keys():
                    return

                inflate["_id"] = generateMD5Hash(content=inflate["gist_id"], salt=time())

            elif "_id" not in inflate.keys():
                inflate["_id"] = generateMD5Hash()

            inflate["uv_doc_type"] = UV_DOC_TYPE["TASK"]
            inflate["status"] = 201

        super(UnveillanceTask, self).__init__(
            _id=_id,
            inflate=inflate,
            emit_sentinels=[
                EmitSentinel("ctx", "Worker", None),
                EmitSentinel("log_file", "str", None),
                EmitSentinel("task_channel", "UnveillanceTaskChannel", None),
            ],
        )

        self.pid_file = os.path.join(ANNEX_DIR, self.base_path, "pid.txt")

        if not hasattr(self, "log_file"):
            self.log_file = os.path.join(ANNEX_DIR, self.base_path, "log.txt")
        else:
            if DEBUG:
                print "INHERITED A LOG FILE: %s" % self.log_file
Example #2
0
	def pull_avitar(self):
		"""Download this user's avatar image and store its perceptual hash.

		Registers a PNG asset for profile_image_url, downloads it with
		wget, then computes a libpuzzle cvec for the image and saves it
		as a JSON asset so similar avatars can be matched later.

		Returns True on success; False if the asset slot could not be
		created or the image vector could not be computed.
		"""
		print self.emit()

		# time() is a float; "%d" below truncates it to whole seconds
		t = time()
		# register the asset slot first; the file itself is written by wget
		avi = self.addAsset(None, "%s_%d.png" % (generateMD5Hash(content=self.profile_image_url), t),
			description="user's avitar at %d" % t, tags=[ASSET_TAGS['FD_AVI']])

		if avi is None:
			return False

		# best-effort download; warn_only keeps a failed wget from raising
		with settings(warn_only=True):
			local("wget -O %s %s" % (os.path.join(ANNEX_DIR, avi), self.profile_image_url))

		import pypuzzle
		puzz = pypuzzle.Puzzle()

		try:
			# perceptual-hash vector of the downloaded image
			cvec = puzz.get_cvec_from_file(os.path.join(ANNEX_DIR, avi))
			self.addAsset(cvec, "avitar_image_cvec_%d.json" % t, as_literal=False, tags=[ASSET_TAGS['IMAGE_CVEC']])
			return True
		except Exception as e:
			if DEBUG:
				print "Could not get image vector because %s" % e

		return False
Example #3
0
def decryptUserData(ciphertext, password, iv=None, p_salt=None):
	"""Decrypt a base64/JSON-wrapped AES-CBC blob of user credentials.

	`ciphertext` is b64(JSON) with hex-encoded 'data' and 'iv' fields --
	the format produced by encryptUserData. `p_salt`, when given, is
	appended to the password before it is MD5-hashed into the AES key.

	Returns the decoded user dict when it contains a truthy 'username',
	otherwise None (including on any parse/decrypt error).
	"""
	try:
		ciphertext_json = json.loads(b64decode(ciphertext))
		ciphertext = ciphertext_json['data'].decode('hex')
	except Exception as e:
		if DEBUG: print e
		return None
	
	if p_salt is not None:
		password = password + p_salt
	
	try:
		# NOTE(review): when a non-None `iv` is passed, its value is
		# ignored and conf.IV is used instead -- the argument acts only
		# as a flag selecting the configured IV. Confirm this is intended.
		if iv is None: iv = ciphertext_json['iv'].decode('hex')
		else: 
			try:
				from conf import IV
			except ImportError as e:
				if DEBUG: print e
				return None

			iv = IV.decode('hex')
	except Exception as e:
		if DEBUG: print e
		return None
	
	# key = MD5(password [+ salt]); CBC mode with the recovered IV
	aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv)
	user_data = json.loads(unpad(aes.decrypt(ciphertext)))
	
	if user_data['username']: return user_data
	return None
Example #4
0
def createNewUser(username, password, as_admin=False):
	"""Create an encrypted on-disk credential file for a new user.

	The file name is MD5(username, salted with the configured user salt);
	its contents are the USER_CREDENTIAL_PACK template, encrypted with
	the user's password via encryptUserData.

	Returns True on success, False if the user already exists (or on an
	error during creation), None when the encryption config is missing.
	"""
	try:
		IV = getConfig('encryption.iv')
		SALT = getConfig('encryption.salt')
		# NOTE(review): 'encyption' (sic) matches the config key used
		# throughout this codebase; fixing the spelling here alone would
		# break the lookup.
		USER_SALT = getConfig('encyption.user_salt')
	except Exception as e:
		if DEBUG: print e
		return None
		
	try:
		user_data = copy.deepcopy(USER_CREDENTIAL_PACK)
		user_data['username'] = username
		if as_admin:
			user_data['admin'] = True
			user_data['annex_key_sent'] = False
			if DEBUG: print "creating %s as admin!" % username
		
		# deterministic salted file name: same username -> same file
		user_root = "%s.txt" % generateMD5Hash(content=username, salt=USER_SALT)
		if os.path.exists(os.path.join(USER_ROOT, user_root)):
			if DEBUG: print "user already exists NOPE!"
			return False
		
		print user_data
		
		with open(os.path.join(USER_ROOT, user_root), 'wb+') as user:
			user.write(encryptUserData(user_data, password, p_salt=SALT, iv=IV))
			return True

	except Exception as e: print e		
	return False
Example #5
0
def createNewUser(username, password, as_admin=False):
    """Create an encrypted on-disk credential file for a new user.

    The file name is MD5(username, salted with the configured user salt);
    its contents are the USER_CREDENTIAL_PACK template, encrypted with
    the user's password via encryptUserData.

    Returns True on success, False if the user already exists (or on an
    error during creation), None when the encryption config is missing.
    """
    try:
        IV = getConfig('encryption.iv')
        SALT = getConfig('encryption.salt')
        # NOTE(review): 'encyption' (sic) matches the config key used
        # throughout this codebase; fixing the spelling here alone would
        # break the lookup.
        USER_SALT = getConfig('encyption.user_salt')
    except Exception as e:
        if DEBUG: print e
        return None

    try:
        user_data = copy.deepcopy(USER_CREDENTIAL_PACK)
        user_data['username'] = username
        if as_admin:
            user_data['admin'] = True
            user_data['annex_key_sent'] = False
            if DEBUG: print "creating %s as admin!" % username

        # deterministic salted file name: same username -> same file
        user_root = "%s.txt" % generateMD5Hash(content=username,
                                               salt=USER_SALT)
        if os.path.exists(os.path.join(USER_ROOT, user_root)):
            if DEBUG: print "user already exists NOPE!"
            return False

        print user_data

        with open(os.path.join(USER_ROOT, user_root), 'wb+') as user:
            user.write(encryptUserData(user_data, password, p_salt=SALT,
                                       iv=IV))
            return True

    except Exception as e:
        print e
    return False
Example #6
0
def decryptUserData(ciphertext, password, iv=None, p_salt=None):
    """Decrypt a base64/JSON-wrapped AES-CBC blob of user credentials.

    `ciphertext` is b64(JSON) with hex-encoded 'data' and 'iv' fields --
    the format produced by encryptUserData. `p_salt`, when given, is
    appended to the password before it is MD5-hashed into the AES key.

    Returns the decoded user dict when it contains a truthy 'username',
    otherwise None (including on any parse/decrypt error).
    """
    try:
        ciphertext_json = json.loads(b64decode(ciphertext))
        ciphertext = ciphertext_json['data'].decode('hex')
    except Exception as e:
        if DEBUG: print e
        return None

    if p_salt is not None:
        password = password + p_salt

    try:
        # NOTE(review): when a non-None `iv` is passed, its value is
        # ignored and conf.IV is used instead -- the argument acts only
        # as a flag selecting the configured IV. Confirm this is intended.
        if iv is None: iv = ciphertext_json['iv'].decode('hex')
        else:
            try:
                from conf import IV
            except ImportError as e:
                if DEBUG: print e
                return None

            iv = IV.decode('hex')
    except Exception as e:
        if DEBUG: print e
        return None

    # key = MD5(password [+ salt]); CBC mode with the recovered IV
    aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv)
    user_data = json.loads(unpad(aes.decrypt(ciphertext)))

    if user_data['username']: return user_data
    return None
Example #7
0
def screenshot_tweet(uv_task):
	"""Task: capture a screenshot of the tweet referenced by uv_task.doc_id.

	Loads the FoxyDoxxingMention document, screenshots its URL into a new
	PNG asset via FoxyDoxxingScreenCapper, then marks the task complete
	and routes the next task in the queue. Fails the task with 404 (no
	mention) or 412 (no url / capture failure) on error.
	"""
	task_tag = "TWEETER: SCREENSHOTTING TWEET"

	print "\n\n************** %s [START] ******************\n" % task_tag
	# 302: task in progress
	uv_task.setStatus(302)

	from lib.Worker.Models.dl_FD_mention import FoxyDoxxingMention

	try:
		mention = FoxyDoxxingMention(_id=uv_task.doc_id)
	except Exception as e:
		error_msg = "Cannot load mention: %s" % e

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag

		uv_task.fail(status=404, message=error_msg)
		return

	if not hasattr(mention, 'url'):
		error_msg = "no url for this tweet"

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(status=412, message=error_msg)
		return

	import os
	
	from lib.Core.Utils.funcs import generateMD5Hash
	from lib.Worker.Models.fd_screencapper import FoxyDoxxingScreenCapper

	from conf import DEBUG, ANNEX_DIR	
	from vars import ASSET_TAGS

	# register the asset slot; the capture itself writes the file below
	asset_path = mention.addAsset(None, "cap_%s.png" % generateMD5Hash(content=mention.url),
		description="Screen Capture from %s" % mention.url, tags=[ASSET_TAGS['FD_CAP']])

	if DEBUG:
		print "SAVING SCREENCAP TO:"
		print asset_path

	cap = FoxyDoxxingScreenCapper(mention.url, asset_path)
	if not cap.success:
		error_msg = "Trouble screenshotting"

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag

		uv_task.fail(status=412, message=error_msg)
		return

	# record completion on the mention, then hand off to the next task
	mention.addCompletedTask(uv_task.task_path)
	uv_task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	uv_task.finish()
Example #8
0
    def loginUser(self, username, password, handler):
        """Authenticate `username` and set session cookies on `handler`.

        Decrypts the user's credential file (named by the salted MD5 of
        the username) with the supplied password. Admin users additionally
        get the ADMIN cookie and a drive-client annex-key check. Returns
        the user dict on success, None on any failure.
        """
        try:
            SALT = getConfig('encryption.salt')
            # NOTE(review): 'encyption' (sic) matches the config key used
            # throughout this codebase.
            USER_SALT = getConfig('encyption.user_salt')
        except Exception as e:
            if DEBUG: print e
            return None

        from base64 import b64encode
        try:
            # prefer the local package layout; fall back to the full path
            from Utils.funcs import decryptUserData
        except Exception as e:
            if DEBUG: print e
            from lib.Frontend.Utils.funcs import decryptUserData

        try:
            user_root = "%s.txt" % generateMD5Hash(content=username,
                                                   salt=USER_SALT)
            with open(os.path.join(USER_ROOT, user_root), 'rb') as UD:
                user_data = decryptUserData(UD.read(), password, p_salt=SALT)
                # wrong password / corrupt file decrypts to None
                if user_data is None: return None

                try:
                    if user_data['admin']:
                        # admin flag is consumed here, not sent to client
                        del user_data['admin']
                        handler.set_secure_cookie(UnveillanceCookie.ADMIN,
                                                  "true",
                                                  path="/",
                                                  expires_days=1)

                        if not self.do_get_drive_status():
                            self.initDriveClient()
                            if "annex_key_sent" not in user_data.keys(
                            ) or not user_data['annex_key_sent']:
                                if self.drive_client.checkAnnexKeyStatus():
                                    user_data['annex_key_sent'] = True

                except KeyError as e:
                    # non-admin users have no 'admin' key; that's fine
                    if DEBUG: print e
                    pass

                handler.set_secure_cookie(UnveillanceCookie.USER,
                                          b64encode(json.dumps(user_data)),
                                          path="/",
                                          expires_days=1)

                return user_data

        except Exception as e:
            if DEBUG: print e

        return None
Example #9
0
	def __init__(self, _id=None, inflate=None):
		"""Create or load an InformaCam J3M document.

		New documents get a deterministic _id (media_id salted with the
		j3m mime type), the local farm UUID, and DOC/j3m type stamps
		before delegating to the parent constructor.
		"""
		if inflate is not None:
			from lib.Core.Utils.funcs import generateMD5Hash
			from conf import UUID
			from vars import UV_DOC_TYPE, MIME_TYPES

			j3m_mime = MIME_TYPES['j3m']
			inflate.update({
				'_id' : generateMD5Hash(content=inflate['media_id'], salt=j3m_mime),
				'farm' : UUID,
				'uv_doc_type' : UV_DOC_TYPE['DOC'],
				'mime_type' : j3m_mime
			})

		super(InformaCamJ3M, self).__init__("ic_j3m", _id=_id, inflate=inflate)
Example #10
0
    def routeNext(self, inflate=None):
        """Launch the next task in this task's queue, if any.

        Clones the persistable keys from self into the next task's
        inflate dict. When no next task exists (or no_continue is set),
        signals termination; on queue exhaustion, additionally re-runs
        any registered recurring tasks whose stored _id still matches
        the hash of their schedule (see set_recurring).
        """
        if DEBUG:
            print "ROUTING NEXT TASK FROM QUEUE\nCLONING SOME VARS FROM SELF:\n%s" % self.emit()

        if hasattr(self, "no_continue"):
            if DEBUG:
                print "NO CONTINUE FLAG DETECTED.  NO ROUTING POSSIBLE."

            self.signal_terminate()
            return

        next_task_path = self.get_next()
        if next_task_path is None:
            if DEBUG:
                print "TASK QUEUE EXHAUSTED. NO ROUTING POSSIBLE."

            self.signal_terminate()

            if hasattr(self, "recurring"):
                for r in self.recurring:
                    try:
                        r = UnveillanceTask(_id=r)

                        # rebuild the schedule hash the recurring _id was
                        # minted from; only re-run if it still matches
                        salt = "%s%s" % (
                            "" if r.salt is None else getattr(self, r.salt),
                            str(r.persist * r.persist_until),
                        )
                        if generateMD5Hash(content="%s_persist" % r.task_path, salt=salt) == r._id:
                            r.run()
                    except Exception as e:
                        if DEBUG:
                            print e, type(e)

            return

        if inflate is None:
            inflate = {}

        # copy the standard persist keys, plus any instance-specific ones
        persist_keys = TASK_PERSIST_KEYS

        if hasattr(self, "persist_keys"):
            persist_keys += self.persist_keys

        for a in persist_keys:
            if hasattr(self, a):
                inflate[a] = getattr(self, a)

        inflate["task_path"] = next_task_path
        next_task = UnveillanceTask(inflate=inflate)
        next_task.run()
Example #11
0
    def logoutUser(self, credentials, handler):
        handler.clear_cookie(UnveillanceCookie.USER)
        handler.clear_cookie(UnveillanceCookie.ADMIN)

        try:
            username = credentials['username']
        except KeyError as e:
            return None

        try:
            password = credentials['password']
        except KeyError as e:
            return True

        try:
            IV = getConfig('encryption.iv')
            SALT = getConfig('encryption.salt')
            USER_SALT = getConfig('encyption.user_salt')
        except Exception as e:
            if DEBUG: print e
            return None

        try:
            from Utils.funcs import decryptUserData, encryptUserData
        except Exception as e:
            if DEBUG: print e
            from lib.Frontend.Utils.funcs import decryptUserData, encryptUserData

        user_root = "%s.txt" % generateMD5Hash(content=username,
                                               salt=USER_SALT)
        with open(os.path.join(USER_ROOT, user_root), 'rb') as UD:
            user_data = decryptUserData(UD.read, password, p_salt=SALT)

            if user_data is None: return None

            new_data = copy.deepcopy(user_data)
            new_data['saved_searches'] = credentials['save_data'][
                'saved_searches']
            try:
                new_data['annex_key_sent'] = credentials['save_data'][
                    'annex_key_sent']
            except KeyError as e:
                pass

        with open(os.path.join(USER_ROOT, user_root), 'wb+') as UD:
            UD.write(encryptUserData(new_data, password, iv=IV, p_salt=SALT))
            return True

        return None
Example #12
0
def encryptUserData(plaintext, password, iv=None, p_salt=None):
	"""Encrypt a JSON-serializable object with AES-CBC for on-disk storage.

	Key is MD5(password [+ p_salt]). If `iv` is None, a fresh random IV
	is generated; otherwise the hex-encoded `iv` is decoded and used.
	Returns b64(JSON) with hex 'iv' and 'data' fields -- the format that
	decryptUserData expects.
	"""
	if p_salt is not None:
		password = password + p_salt
	
	if iv is None: iv = generateSecureRandom()
	else: iv = iv.decode('hex')
	
	aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv)
	ciphertext = {
		'iv' : iv.encode('hex'), 
		'data' : aes.encrypt(pad(json.dumps(plaintext))).encode('hex')
	}
	
	# NOTE(review): printing ciphertext unconditionally leaks material
	# into logs; consider gating this on DEBUG.
	print ciphertext
	return b64encode(json.dumps(ciphertext))
Example #13
0
def encryptUserData(plaintext, password, iv=None, p_salt=None):
    """Encrypt a JSON-serializable object with AES-CBC for on-disk storage.

    Key is MD5(password [+ p_salt]). If `iv` is None, a fresh random IV
    is generated; otherwise the hex-encoded `iv` is decoded and used.
    Returns b64(JSON) with hex 'iv' and 'data' fields -- the format that
    decryptUserData expects.
    """
    if p_salt is not None:
        password = password + p_salt

    if iv is None: iv = generateSecureRandom()
    else: iv = iv.decode('hex')

    aes = AES.new(generateMD5Hash(content=password), AES.MODE_CBC, iv)
    ciphertext = {
        'iv': iv.encode('hex'),
        'data': aes.encrypt(pad(json.dumps(plaintext))).encode('hex')
    }

    # NOTE(review): printing ciphertext unconditionally leaks material
    # into logs; consider gating this on DEBUG.
    print ciphertext
    return b64encode(json.dumps(ciphertext))
Example #14
0
	def __init__(self, inflate=None, _id=None, auto_pull=False):
		"""Create or load a Twitter user document.

		When `inflate` is given, the user is looked up by screen_name via
		the Twitter API; selected profile fields are copied in, the raw
		lookup is cached as a JSON file in the annex, and a deterministic
		_id is derived from the numeric twitter id. With `auto_pull`, the
		user's avatar is fetched immediately after construction.
		"""
		emit_sentinels = [
			EmitSentinel("config", "dict", None), 
			EmitSentinel("service", "Api", None),
			EmitSentinel("usable", "bool", None)]

		TwitterClient.__init__(self)

		if inflate is not None:
			if 'screen_name' not in inflate.keys():
				return

			try:
				lookup = self.lookup_user(screen_name=inflate['screen_name']).AsDict()
				print lookup
			except Exception as e:
				if DEBUG:
					print "COULD NOT LOOKUP TWIITERER:"
					print e, type(e)

				return

			# cache the raw API response alongside the annex assets
			if 'file_name' not in inflate.keys():
				inflate['file_name'] = "%s.json" % inflate['screen_name']
				with open(os.path.join(ANNEX_DIR, inflate['file_name']), 'wb+') as F:
					F.write(json.dumps(lookup))

			# copy over the indexed profile fields; missing keys are skipped
			for i in ['id', 'profile_image_url', 'entities', 'friends_count', 'followers_count', 'listed_count', 'created_at', 'time_zone']:
				try:
					inflate[i] = lookup[i]
					print "ADDING %s: %s" % (i, inflate[i])
				except Exception as e:
					print "COULD NOT GET KEY: %s" % i
					pass
			
			# deterministic id derived from the numeric twitter id
			inflate['_id'] = generateMD5Hash(content=inflate['id'])
			
			if 'created_at' in inflate.keys():
				from time import mktime
				from dateutil.parser import parse

				# epoch-seconds form of created_at, for range queries
				inflate['created_at_ts'] = mktime(parse(inflate['created_at']).timetuple())
		
		UnveillanceDocument.__init__(self, inflate=inflate, _id=_id, emit_sentinels=emit_sentinels)
		
		if auto_pull:
			self.pull_avitar()
Example #15
0
    def set_recurring(self, task_path, persist_period, persist_until, inflate=None, salt=None):
        # persist period in minutes
        if DEBUG:
            print "SETTING A RECURRING TASK UNTIL %d TIME, PERIOD %d" % (persist_until, persist_period)

            # check for reasonability
        max_time = mktime((date.today() + timedelta(1)).timetuple())
        if not (persist_until <= max_time and persist_until > time()):
            if DEBUG:
                print "TIME NOT REASONABLE."
                return

        if inflate is None:
            inflate = {}

        persist_keys = TASK_PERSIST_KEYS
        if hasattr(self, "persist_keys"):
            persist_keys += self.persist_keys

        for a in persist_keys:
            if a == "recurring":
                continue

            if hasattr(self, a):
                inflate[a] = getattr(self, a)

        inflate.update(
            {
                "task_path": task_path,
                "persist": persist_period,
                "persist_until": persist_until,
                "salt": None if not hasattr(self, salt) else salt,
            }
        )

        inflate["_id"] = generateMD5Hash(
            content="%s_persist" % task_path,
            salt="%s%s" % ("" if inflate["salt"] is None else getattr(self, salt), str(persist_period * persist_until)),
        )

        if not hasattr(self, "recurring"):
            self.recurring = []

        self.recurring.append(UnveillanceTask(inflate=inflate)._id)
        self.recurring = list(set(self.recurring))
        self.save()
Example #16
0
	def loginUser(self, username, password, handler):
		"""Authenticate `username` and set session cookies on `handler`.

		Decrypts the user's credential file (named by the salted MD5 of
		the username) with the supplied password. Admin users additionally
		get the ADMIN cookie and a drive-client annex-key check. Returns
		the user dict on success, None on any failure.
		"""
		try:
			SALT = getConfig('encryption.salt')
			# NOTE(review): 'encyption' (sic) matches the config key used
			# throughout this codebase.
			USER_SALT = getConfig('encyption.user_salt')
		except Exception as e:
			if DEBUG: print e
			return None		
		
		from base64 import b64encode
		try:
			# prefer the local package layout; fall back to the full path
			from Utils.funcs import decryptUserData
		except Exception as e:
			if DEBUG: print e
			from lib.Frontend.Utils.funcs import decryptUserData

		try:
			user_root = "%s.txt" % generateMD5Hash(content=username, salt=USER_SALT)
			with open(os.path.join(USER_ROOT, user_root), 'rb') as UD:
				user_data = decryptUserData(UD.read(), password, p_salt=SALT)
				# wrong password / corrupt file decrypts to None
				if user_data is None: return None
				
				try:
					if user_data['admin']:
						# admin flag is consumed here, not sent to client
						del user_data['admin']
						handler.set_secure_cookie(UnveillanceCookie.ADMIN, 
							"true", path="/", expires_days=1)
							
						if not self.do_get_drive_status():
							self.initDriveClient()
							if "annex_key_sent" not in user_data.keys() or not user_data['annex_key_sent']:							
								if self.drive_client.checkAnnexKeyStatus():
									user_data['annex_key_sent'] = True

				except KeyError as e: 
					# non-admin users have no 'admin' key; that's fine
					if DEBUG: print e
					pass
				
				handler.set_secure_cookie(UnveillanceCookie.USER, 
					b64encode(json.dumps(user_data)), path="/", expires_days=1)
				
				return user_data
		
		except Exception as e:
			if DEBUG: print e		
		
		return None
Example #17
0
	def logoutUser(self, credentials, handler):
		handler.clear_cookie(UnveillanceCookie.USER)
		handler.clear_cookie(UnveillanceCookie.ADMIN)
		
		try:
			username = credentials['username']
		except KeyError as e: return None
		
		try:
			password = credentials['password']
		except KeyError as e: return True
		
		try:
			IV = getConfig('encryption.iv')
			SALT = getConfig('encryption.salt')
			USER_SALT = getConfig('encyption.user_salt')
		except Exception as e:
			if DEBUG: print e
			return None		
		
		try:
			from Utils.funcs import decryptUserData, encryptUserData
		except Exception as e:
			if DEBUG: print e
			from lib.Frontend.Utils.funcs import decryptUserData, encryptUserData
				
		user_root = "%s.txt" % generateMD5Hash(content=username,salt=USER_SALT)
		with open(os.path.join(USER_ROOT, user_root), 'rb') as UD:
			user_data = decryptUserData(UD.read, password, p_salt=SALT)
			
			if user_data is None: return None
			
			new_data = copy.deepcopy(user_data)
			new_data['saved_searches'] = credentials['save_data']['saved_searches']
			try:
				new_data['annex_key_sent'] = credentials['save_data']['annex_key_sent']
			except KeyError as e:
				pass
		
		with open(os.path.join(USER_ROOT, user_root), 'wb+') as UD:
			UD.write(encryptUserData(new_data, password, iv=IV, p_salt=SALT))
			return True
		
		return None
Example #18
0
	def __init__(self, _id=None, inflate=None, emit_sentinels=None):
		EMIT_SENTINELS = [EmitSentinel("documents", "UnveillanceDocument", "_id")]
		
		if inflate is not None:
			from lib.Core.Utils.funcs import generateMD5Hash
			try:
				inflate['_id'] = generateMD5Hash(content="".join(inflate['documents']))
				print "NEW BATCH WITH ID %s" % inflate['_id']
				print inflate['documents']
			except Exception as e:
				print "ERROR WITH NEW BATCH ID GEN:"
				print e
		
		if emit_sentinels is None:
			if type(emit_sentinels) is not list:
				emit_sentinels = [emit_sentinels]

			EMIT_SENTINELS.extend(emit_sentinels)
		
		super(UnveillanceBatch, self).__init__(_id=_id, 
			inflate=inflate, emit_sentinels=EMIT_SENTINELS)
Example #19
0
	def __init__(self, els_doc_root, emit_sentinels=None, _id=None, inflate=None):		
		"""Create or load an elasticsearch-backed stub under `els_doc_root`.

		When `inflate` is given, a new object is stamped with date_added
		(ms epoch) and, if absent, a time-salted _id, then saved.
		Otherwise, when `_id` is given, an existing object is fetched.
		"""
		self.emit_sentinels = deepcopy(EMIT_SENTINELS)
		self.els_doc_root = els_doc_root
		
		if emit_sentinels is not None:
			if type(emit_sentinels) is not list:
				emit_sentinels = [emit_sentinels]
			
			self.emit_sentinels.extend(emit_sentinels)
			
		if inflate is not None:
			from lib.Core.Utils.funcs import generateMD5Hash

			# ms-resolution creation timestamp
			inflate['date_added'] = time() * 1000

			if '_id' not in inflate.keys():
				inflate['_id'] = generateMD5Hash(salt=inflate['date_added'])

			self.inflate(inflate)
			self.save(create=True)
		
		elif _id is not None: self.getObject(_id, els_doc_root)
Example #20
0
	def __init__(self, _id=None, inflate=None, emit_sentinels=None):
		"""Create or load a text-stub document.

		New documents get a deterministic _id (media_id salted with the
		txt_stub mime type), DOC/txt_stub type stamps, and their source
		file is symlinked into ANNEX_DIR under a media_id-prefixed name.
		"""
		if emit_sentinels is None: emit_sentinels = []
		if type(emit_sentinels) is not list: emit_sentinels = [emit_sentinels]
		emit_sentinels.append(EmitSentinel("els_doc_root", "str", None))

		if inflate is not None:
			import os
			from fabric.api import settings, local
			
			from lib.Core.Utils.funcs import generateMD5Hash
			from conf import UUID, ANNEX_DIR
			from vars import UV_DOC_TYPE, MIME_TYPES
			
			inflate['_id'] = generateMD5Hash(content=inflate['media_id'], 
				salt=MIME_TYPES['txt_stub'])
			
			inflate['farm'] = UUID
			inflate['uv_doc_type'] = UV_DOC_TYPE['DOC']
			inflate['mime_type'] = MIME_TYPES['txt_stub']
			inflate['els_doc_root'] = "uv_text_stub"
			
			# symlink the original file into the annex under a unique,
			# media_id-prefixed name; warn_only tolerates ln failures
			this_dir = os.getcwd()
			os.chdir(ANNEX_DIR)
			
			file_name = "%s_%s" % (inflate['media_id'],
				inflate['file_name'].split("/")[-1])
			
			with settings(warn_only=True):
				ln = local("ln -s %s %s" % (inflate['file_name'], file_name),
					capture=True)
				
				if DEBUG: print ln
			
			os.chdir(this_dir)
			# from here on, the document refers to the symlinked name
			inflate['file_name'] = file_name

		super(UnveillanceText, self).__init__(_id=_id, 
			inflate=inflate, emit_sentinels=emit_sentinels)
Example #21
0
	def __init__(self, _id=None, inflate=None):	
		if inflate is not None:
			for must in ['documents', 'task_path']:
				if must not in inflate.keys():
					raise Exception("No documents and/or task_path")

			if DEBUG: print "INFLATING CLUSTER"
			
			from copy import deepcopy
			from lib.Core.Utils.funcs import generateMD5Hash
			from conf import UUID

			ids = deepcopy(inflate['documents'])
			if "query" in inflate.keys():
				ids += inflate['query']

			inflate.update({
				'_id' : generateMD5Hash(content="".join(ids), salt=inflate['task_path']),
				'queue' : UUID,
				'uv_cluster' : True
			})
			
		super(UnveillanceCluster, self).__init__(_id=_id, inflate=inflate)
Example #22
0
def extractPDFText(uv_task):	
	"""Task: extract per-page text from the PDF behind uv_task.doc_id.

	Iterates the PDF (or its broken-up parts), cleans each page's text,
	writes one UnveillanceELSStub per page for search, and stores the
	full page-by-page text list as a JSON asset before routing the next
	task.
	"""
	task_tag = "PDF TEXT EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "extracting text from pdf at %s" % uv_task.doc_id
	# 302: task in progress
	uv_task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print "PDF IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	"""
		In this task, we might be asked to extract from a broken-up sub-group of documents.
		if so, that should be set in the task's properties.
		
	"""
	import os
	from fabric.api import settings, local
	from wand.image import Image
	from time import sleep

	from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
	from Models.uv_els_stub import UnveillanceELSStub
	from conf import ANNEX_DIR, DEBUG
	from vars import ASSET_TAGS

	# one slot per page across all parts; filled by absolute page index
	texts = [None] * pdf.total_pages
	
	if pdf.hasParts():
		extractors = pdf.getParts()
	else:
		extractors = [pdf.file_name]
	
	count = 0
	for e in extractors:
		# the main file loads differently from a sub-part asset
		if e == pdf.file_name:
			pdf_reader = pdf.loadFile(e)
		else:
			pdf_reader = pdf.loadAsset(e)
		try:
			num_pages = pdf_reader.getNumPages()
		except AttributeError as e:
			print e
			continue

		for x in xrange(0, num_pages):
			text = cleanLine(pdf_reader.getPage(x).extractText())
			texts[count] = text

			# per-page search stub; _id is deterministic on (doc, page)
			els_stub = UnveillanceELSStub('cp_page_text', inflate={
				'media_id' : pdf._id,
				'searchable_text' : text,
				'index_in_parent' : count,
				'_id' : generateMD5Hash(content=pdf._id, salt=str(count))
			})

			count += 1
	
	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. unclean.", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None: 
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	del texts

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print "\n\n************** %s [END] ******************\n" % task_tag

	uv_task.finish()
    def uploadToAnnex(self, netcat_stub):
        use_git_annex = False
        this_dir = os.getcwd()
        os.chdir(ANNEX_DIR)

        if type(netcat_stub['file']) in [str, unicode]:
            if GIT_ANNEX is not None:
                use_git_annex = True
                if DEBUG:
                    print "GIT ANNEX ATTACHED TO INSTANCE."

            if use_git_annex:
                with settings(warn_only=True):
                    # has this stub been uploaded?
                    is_absorbed = local(
                        "%s metadata \"%s\" --json --get=uv_uploaded" %
                        (GIT_ANNEX, netcat_stub['save_as']),
                        capture=True)

                    if DEBUG:
                        print "%s absorbed? (uv_uploaded = %s type = %s)" % (
                            netcat_stub['save_as'], is_absorbed,
                            type(is_absorbed))

                    if is_absorbed == "" or "False":
                        is_absorbed = False
                    elif is_absorbed == "True":
                        is_absorbed = True
                    else:
                        is_absorbed = False
            else:
                is_absorbed = False
        else:
            is_absorbed = False

        if is_absorbed:
            if DEBUG:
                print "%s IS absorbed (uv_uploaded = %s)" % (
                    netcat_stub['save_as'], is_absorbed)

            os.chdir(this_dir)
            return None

        new_hash = self.get_new_hash(netcat_stub['file'])

        possible_duplicate = self.checkForDuplicate(new_hash)
        if possible_duplicate is not None:

            if DEBUG:
                print "Document already exists in Annex and will not be uploaded!  Here it is:"
                print possible_duplicate

            p = UnveillanceFabricProcess(register_upload_attempt,
                                         {'_id': possible_duplicate['_id']})
            p.join()

            os.chdir(this_dir)
            self.netcat_queue.remove(netcat_stub)

            possible_duplicate = self.checkForDuplicate(
                possible_duplicate['_id'])
            possible_duplicate.update({
                'uploaded': False,
                'duplicate_attempt': True
            })
            return possible_duplicate

        with settings(warn_only=True):
            new_save_as = generateMD5Hash(content=new_hash,
                                          salt=local("whoami", capture=True))

        if type(netcat_stub['file']) in [str, unicode]:
            new_file = netcat_stub['file'].replace(netcat_stub['save_as'],
                                                   new_save_as)

            with settings(warn_only=True):
                local("mv \"%s\" %s" % (netcat_stub['file'], new_file))

                if use_git_annex:
                    local("%s metadata %s --json --set=uv_file_alias=\"%s\"" %
                          (GIT_ANNEX, new_file, netcat_stub['save_as']))

            netcat_stub['file'] = new_file

        netcat_stub['alias'] = netcat_stub['save_as']
        netcat_stub['save_as'] = new_save_as
        success_tag = False

        # look up to see if this file is already in the annex

        with settings(warn_only=True):
            if type(netcat_stub['file']) in [str, unicode] and use_git_annex:
                local("%s add %s" % (GIT_ANNEX, netcat_stub['save_as']))

            p = UnveillanceFabricProcess(netcat, netcat_stub)
            p.join()

            if p.error is None and p.output is not None:
                success_tag = True

            if DEBUG:
                print "NETCAT RESULT: (type=%s, success=%s)" % (type(
                    p.output), success_tag)
                print "NETCAT ERROR (none is good!): (type=%s)" % type(p.error)

            if p.output is not None and DEBUG:
                for o in p.output:
                    print "\n%s\n" % o

            if p.error is not None and DEBUG:
                print "ERROR:"
                print p.error

            if type(netcat_stub['file']) in [str, unicode] and use_git_annex:
                local("%s metadata \"%s\" --json --set=uv_uploaded=%s" %
                      (GIT_ANNEX, netcat_stub['save_as'], str(success_tag)))

            self.netcat_queue.remove(netcat_stub)

        os.chdir(this_dir)
        return {'uploaded': success_tag, '_id': new_hash}
def get_documentcloud_ocr(uv_task):
	"""Pull per-page OCR'd text for a PDF from DocumentCloud.

	Requires `uv_task.documentcloud_auth` (fails the task with status 412
	if missing) and `uv_task.doc_id` pointing at a CompassPDF document.
	If `uv_task.documentcloud_id` is not set, it is guessed from the PDF's
	file alias.  Each page's text is fetched over HTTPS, indexed into ELS,
	and the collected page list is saved as the "doc_texts.json" asset.
	"""
	task_tag = "PULLING OCR FROM DOCUMENTCLOUD"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "OCRing text via documentcloud from pdf at %s" % uv_task.doc_id
	# presumably 302 marks the task "in progress" -- confirm against status map
	uv_task.setStatus(302)

	# the auth string is mandatory; bail with 412 (Precondition Failed) otherwise
	if not hasattr(uv_task, "documentcloud_auth"):
		error_msg = "DOCUMENTCLOUD AUTH STRING NEEDED"
		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(status=412, message=error_msg)
		return 

	from lib.Worker.Models.cp_pdf import CompassPDF
	from conf import DEBUG

	# load the target PDF document from the annex
	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print "PDF IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	pdf_reader = pdf.loadFile(pdf.file_name)
	if pdf_reader is None:
		print "PDF READER IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	# no explicit DocumentCloud id passed: assume the file alias is "<dc_id>.pdf"
	if not hasattr(uv_task, "documentcloud_id"):
		try:
			uv_task.documentcloud_id = pdf.file_alias.replace(".pdf", "")

			print "DOCUMENTCLOUD ID NOT PASSED. GUESSING AT IT WITH %s" % uv_task.documentcloud_id
			print "\n\n************** %s [WARN] ******************\n" % task_tag
		except Exception as e:
			# NOTE(review): this error path returns without calling uv_task.fail()
			# -- the task is left in its 302 status; confirm that is intended
			print "COULD NOT GET DOCUMENTCLOUD ID FOR %s" % pdf.file_name
			print e
			print "\n\n************** %s [ERROR] ******************\n" % task_tag
			return

	import os, requests
	
	from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
	from Models.uv_els_stub import UnveillanceELSStub
	from conf import ANNEX_DIR
	from vars import ASSET_TAGS

	# one slot per page; pages with no retrievable text end up as ""
	texts = [None] * pdf.total_pages
	count = 0
	# a: auth string, s: leading segment of the dc id, d: remainder of the slug
	req_map = {
		'a' : uv_task.documentcloud_auth,
		's' : uv_task.documentcloud_id.split('-')[0],
		'd' : "-".join(uv_task.documentcloud_id.split('-')[1:])
	}

	for x in xrange(0, pdf.total_pages):
		req_map['x'] = x
		# NOTE(review): the host segment looks garbled ("[email protected]");
		# presumably it should read "%(a)[email protected]" -- confirm
		# against the original source before relying on this URL
		req = "https://%(a)[email protected]/documents/%(s)s/pages/%(d)s-p%(x)d.txt" % (req_map)
	
		if DEBUG:
			print "trying %s" % req

		r = requests.get(req)
		if r.status_code != 200:
			print "\n\n************** %s [WARN] ******************\n" % task_tag
			print "no text at page %d" % x
		else:
			texts[count] = r.content

			# index this page's text in ELS; presumably the constructor
			# persists the stub (the instance is never used afterwards) -- confirm
			els_stub = UnveillanceELSStub('cp_page_text', inflate={
				'media_id' : pdf._id,
				'searchable_text' : texts[count],
				'index_in_parent' : count,
				'_id' : generateMD5Hash(content=pdf._id, salt=str(count))
			})

		# normalize missing/empty pages to an empty string so the json is dense
		if texts[count] is None or len(texts[count]) == 0:
			print "\n\n************** %s [WARN] ******************\n" % task_tag
			print "no text at page %d (%s)" % (x, type(texts[count]))
			texts[count] = ""

		count += 1

	# store the full page-text list as a json asset on the document
	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document, from DocumentCloud", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None: 
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	# free the page list before routing on
	del texts

	pdf.addCompletedTask(uv_task.task_path)
	# NOTE(review): routeNext receives text_file even when asset_path is None -- confirm
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print "\n\n************** %s [END] ******************\n" % task_tag

	uv_task.finish()
	def uploadToAnnex(self, netcat_stub):
		use_git_annex = False
		this_dir = os.getcwd()
		os.chdir(ANNEX_DIR)

		if type(netcat_stub['file']) in [str, unicode]:
			if GIT_ANNEX is not None:
				use_git_annex = True
				if DEBUG:
					print "GIT ANNEX ATTACHED TO INSTANCE."

			if use_git_annex:
				with settings(warn_only=True):
					# has this stub been uploaded?
					is_absorbed = local("%s metadata \"%s\" --json --get=uv_uploaded" % (
						GIT_ANNEX, netcat_stub['save_as']), capture=True)

					if DEBUG: print "%s absorbed? (uv_uploaded = %s type = %s)" % (
						netcat_stub['save_as'], is_absorbed, type(is_absorbed))

					if is_absorbed == "" or "False":
						is_absorbed = False
					elif is_absorbed == "True":
						is_absorbed = True
					else:
						is_absorbed = False
			else:
				is_absorbed = False
		else:
			is_absorbed = False

		if is_absorbed:
			if DEBUG: print "%s IS absorbed (uv_uploaded = %s)" % (
				netcat_stub['save_as'], is_absorbed)
			
			os.chdir(this_dir)
			return None

		new_hash = self.get_new_hash(netcat_stub['file'])

		possible_duplicate = self.checkForDuplicate(new_hash)
		if possible_duplicate is not None:

			if DEBUG: 
				print "Document already exists in Annex and will not be uploaded!  Here it is:"
				print possible_duplicate

			p = UnveillanceFabricProcess(register_upload_attempt, {'_id' : possible_duplicate['_id'] })
			p.join()

			os.chdir(this_dir)
			self.netcat_queue.remove(netcat_stub)

			possible_duplicate = self.checkForDuplicate(possible_duplicate['_id'])
			possible_duplicate.update({
				'uploaded' : False,
				'duplicate_attempt' : True
			})
			return possible_duplicate
		
		with settings(warn_only=True):
			new_save_as = generateMD5Hash(content=new_hash, salt=local("whoami", capture=True))
		
		if type(netcat_stub['file']) in [str, unicode]:
			new_file = netcat_stub['file'].replace(netcat_stub['save_as'], new_save_as)

			with settings(warn_only=True):
				local("mv \"%s\" %s" % (netcat_stub['file'], new_file))

				if use_git_annex:
					local("%s metadata %s --json --set=uv_file_alias=\"%s\"" % (GIT_ANNEX, new_file, netcat_stub['save_as']))

			netcat_stub['file'] = new_file

		netcat_stub['alias'] = netcat_stub['save_as']
		netcat_stub['save_as'] = new_save_as
		success_tag = False

		# look up to see if this file is already in the annex

		with settings(warn_only=True):
			if type(netcat_stub['file']) in [str, unicode] and use_git_annex:
				local("%s add %s" % (GIT_ANNEX, netcat_stub['save_as']))

			p = UnveillanceFabricProcess(netcat, netcat_stub)
			p.join()

			if p.error is None and p.output is not None:
				success_tag = True

			if DEBUG:
				print "NETCAT RESULT: (type=%s, success=%s)" % (type(p.output), success_tag)
				print "NETCAT ERROR (none is good!): (type=%s)" % type(p.error)

			if p.output is not None and DEBUG:
				for o in p.output:
					print "\n%s\n" % o

			if p.error is not None and DEBUG:
				print "ERROR:"
				print p.error

			if type(netcat_stub['file']) in [str, unicode] and use_git_annex:
				local("%s metadata \"%s\" --json --set=uv_uploaded=%s" % (
					GIT_ANNEX, netcat_stub['save_as'], str(success_tag)))

			self.netcat_queue.remove(netcat_stub)

		os.chdir(this_dir)
		return { 'uploaded' : success_tag, '_id' : new_hash } 
Example #26
0
	def getFileNameHash(self, name_base):
		"""Hash a base file name with the instance-wide document salt."""
		from conf import DOC_SALT

		hashed_name = generateMD5Hash(content=name_base, salt=DOC_SALT)
		return hashed_name