Ejemplo n.º 1
0
def evaluateText(task):
	"""Work out whether a document's text is JSON or plain text and index it.

	The text is read from the asset named by ``task.text_file`` when the task
	has that attribute, otherwise from the document's own file
	(``document.file_name``).  If nothing can be loaded the task is failed and
	the function returns early.

	* If ``json.loads`` accepts the content, the document's ``mime_type`` is
	  set to ``application/json`` and the document is saved.
	* Otherwise the content is treated as plain text: each line is passed
	  through ``cleanLine``, lines are grouped into pages of at most
	  ``MAX_LINES_PER_PAGE`` lines (joined with single spaces), the page count
	  is stored in ``document.total_pages``, the pages are attached as the
	  ``doc_texts.json`` asset, and an ``UnveillanceText`` built from them is
	  linked through ``document.text_id``.  Any exception raised in this branch
	  is caught (and printed only when ``DEBUG`` is set); the task is still
	  completed afterwards.

	Finally the task is recorded as completed on the document, finished, and
	routed onward via ``task.routeNext()``.

	:param task: task object exposing ``doc_id``, ``task_path``, optionally
		``text_file``, and the ``setStatus``/``fail``/``finish``/``routeNext``
		methods used below.
	:returns: None
	"""
	task_tag = "TEXT EVALUATION"
	# Single-argument parenthesised prints emit the same output under the
	# Python 2 print statement and the Python 3 print function.
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("evaluating text at %s" % task.doc_id)
	task.setStatus(302)

	import json

	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG

	document = UnveillanceDocument(_id=task.doc_id)

	# limited choices: json, pgp, or txt
	if hasattr(task, "text_file"):
		content = document.loadAsset(task.text_file)
	else:
		content = document.loadFile(document.file_name)

	if content is None:
		print("no text to evaluate :(")
		print("\n\n************** %s [ERROR] ******************\n" % task_tag)
		task.fail()
		return

	# The content counts as JSON exactly when the parser accepts it; the
	# parsed value itself is not needed.
	try:
		json.loads(content)
	except Exception as e:
		is_json = False
		print("NOT JSON: %s" % e)
	else:
		is_json = True
		print("THIS IS JSON")

	if is_json:
		document.mime_type = "application/json"
		document.save()
	else:
		try:
			from lib.Core.Utils.funcs import cleanLine
			from vars import ASSET_TAGS

			# this is arbitrary
			MAX_LINES_PER_PAGE = 80

			cleaned_lines = [cleanLine(line) for line in content.splitlines()]

			# Slice the cleaned lines into pages.  Slicing (rather than
			# flushing a running buffer after the loop) avoids appending an
			# empty trailing page when the line count is an exact multiple
			# of MAX_LINES_PER_PAGE.  Text with no lines at all still yields
			# one empty page.
			txt_json = [
				" ".join(cleaned_lines[start:start + MAX_LINES_PER_PAGE])
				for start in range(0, len(cleaned_lines), MAX_LINES_PER_PAGE)
			] or [""]

			document.total_pages = len(txt_json)
			document.save()

			asset_path = document.addAsset(txt_json, "doc_texts.json", as_literal=False,
				description="jsonified text of original document, segment by segment",
				tags=[ASSET_TAGS['TXT_JSON']])

			from lib.Worker.Models.uv_text import UnveillanceText
			uv_text = UnveillanceText(inflate={
				'media_id' : document._id,
				'searchable_text' : txt_json,
				'file_name' : asset_path
			})

			document.text_id = uv_text._id
			document.save()
		except Exception as e:
			# Best effort: a failure while building the text index does not
			# fail the task.  NOTE(review): the error is invisible unless
			# DEBUG is on -- confirm that is intended.
			if DEBUG:
				print("ERROR HERE GENERATING DOC TEXTS:")
				print(e)

	document.addCompletedTask(task.task_path)
	task.finish()
	task.routeNext()
	print("\n\n************** %s [END] ******************\n" % task_tag)
Ejemplo n.º 2
0
def getAssets(uv_task):
	"""Pull a document's DocumentCloud manifest and entities and attach them.

	Downloads ``documents/<dc_id>.json`` through a
	``CompassDocumentCloudClient`` authenticated with ``uv_task.auth_string``
	and stores it on the document as ``document_cloud_manifest.json``.  It then
	downloads ``documents/<dc_id>/entities.json``; when that is available it
	is stored as ``document_cloud_entities.json`` and its ``'entities'`` value
	is written to the document's ``UnveillanceText`` (a new one is built and
	linked through ``document.text_id`` if the document has none yet).
	Missing entities only produce a warning.

	The function returns early, after printing an error banner, when the
	document is None, has no ``dc_id``, the task has no ``auth_string``, or no
	manifest could be downloaded.  NOTE(review): those early exits return
	without calling ``uv_task.fail()`` or ``uv_task.finish()`` -- confirm
	whether the task should be marked as failed there.

	:param uv_task: task object exposing ``doc_id``, ``task_path``,
		``auth_string`` and the ``setStatus``/``finish`` methods used below.
	:returns: None
	"""
	task_tag = "FETCHING DOCUMENTCLOUD ASSETS"
	# Single-argument parenthesised prints emit the same output under the
	# Python 2 print statement and the Python 3 print function.
	print("\n\n************** %s [START] ******************\n" % task_tag)
	print("getting DocumentCloud assets for %s" % uv_task.doc_id)
	uv_task.setStatus(412)

	from lib.Worker.Models.cp_documentcloud_client import CompassDocumentCloudClient
	from lib.Worker.Models.uv_document import UnveillanceDocument
	# ASSET_TAGS is used for every addAsset call below but was never brought
	# into scope inside this function.
	# NOTE(review): imported from `vars` on the assumption that it defines
	# the DOC_CLOUD_* tags -- confirm.
	from vars import ASSET_TAGS

	def report(level, message):
		# Print the status banner for this task followed by the detail line.
		print("\n\n************** %s [%s] ******************\n" % (task_tag, level))
		print(message)

	document = UnveillanceDocument(_id=uv_task.doc_id)

	if document is None:
		report("ERROR", "Document is None")
		return

	if not hasattr(document, "dc_id"):
		report("ERROR", "Document has not document cloud id!")
		return

	if not hasattr(uv_task, "auth_string"):
		report("ERROR", "DocumentCloud upload needs an auth string")
		return

	dc_client = CompassDocumentCloudClient(auth_string=uv_task.auth_string)

	dc_manifest = dc_client.download("documents/%s.json" % document.dc_id)
	if dc_manifest is None:
		report("ERROR", "No DocumentCloud manifest yet for %s." % document._id)
		return

	document.addAsset(dc_manifest, "document_cloud_manifest.json", as_literal=False,
		description="description of document on DocumentCloud",
		tags=[ASSET_TAGS['DOC_CLOUD_MANIFEST'], ASSET_TAGS['DOC_CLOUD_DOC']])

	dc_entities = dc_client.download("documents/%s/entities.json" % document.dc_id)
	if dc_entities is None:
		# Entities are optional: warn and carry on to complete the task.
		report("WARN", "No DocumentCloud entiteis yet for %s." % document._id)
	else:
		entity_asset = document.addAsset(dc_entities, "document_cloud_entities.json",
			as_literal=False, description="entites pulled from DocumentCloud",
			tags=[ASSET_TAGS['DOC_CLOUD_ENTITIES'], ASSET_TAGS['DOC_CLOUD_DOC']])

		from lib.Worker.Models.uv_text import UnveillanceText

		if not hasattr(document, "text_id"):
			# No text object linked yet: build one around the entities and
			# link it to the document.
			text = UnveillanceText(inflate={
				'file_name' : entity_asset,
				'entities' : dc_entities['entities'],
				'media_id' : document._id
			})

			document.text_id = text._id
			document.save()
		else:
			# Update the entities on the text object already linked.
			text = UnveillanceText(_id=document.text_id)
			text.entities = dc_entities['entities']
			text.save()

	document.addCompletedTask(uv_task.task_path)
	uv_task.finish()