def extractNEREntities(task):
	task_tag = "NER ENTITY EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from json import loads

	try:
		texts = loads(doc.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	import ner, os
	from conf import getConfig
	from lib.Core.Utils.funcs import cleanLine

	# connect to the Stanford NER tagger already listening on the configured nlp_server.port
	st = ner.SocketNER(host='localhost', port=getConfig("nlp_server.port"))
	entities = {}

	for i, page in enumerate(texts):
		if page is None: continue

		lemmas = st.get_entities(cleanLine(page))
		if len(lemmas.keys()) == 0: continue

		for lemma_type in lemmas.keys():
			entities = updateEntities(entities, lemmas[lemma_type], lemma_type, i)

		#if DEBUG and i > 25: break
		
	if len(entities.keys()) > 0:
		ner_entity_path = doc.addAsset(entities, "stanford-ner_entities.json", as_literal=False,
			description="Entities as per Stanford-NER Tagger (via NLTK)",
			tags=[ASSET_TAGS['STANFORD_NER_ENTITIES'], ASSET_TAGS['CP_ENTITIES']])

		if ner_entity_path is not None:
			doc.addFile(ner_entity_path, None, sync=True)

	doc.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()
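# A hypothetical stand-in illustrating the contract of the updateEntities() helper
# called above (its real implementation is not shown here): it is assumed to merge
# one page's tagged terms into an { entity_type : { term : [page indexes] } } map.
def updateEntitiesSketch(entities, terms, lemma_type, page_index):
	if lemma_type not in entities.keys():
		entities[lemma_type] = {}

	for term in terms:
		if term not in entities[lemma_type].keys():
			entities[lemma_type][term] = []

		if page_index not in entities[lemma_type][term]:
			entities[lemma_type][term].append(page_index)

	return entities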
def evaluateText(task):
	task_tag = "TEXT EVALUATION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "evaluating text at %s" % task.doc_id
	task.setStatus(302)
	
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	from vars import MIME_TYPE_TASKS
	
	document = UnveillanceDocument(_id=task.doc_id)
	"""
		limited choices: json, pgp, or txt.
		JSON is detected below by attempting json.loads(); pgp and plain text
		fall through to the line-chunking else branch.
	"""

	if hasattr(task, "text_file"):
		content = document.loadAsset(task.text_file)
	else:
		content = document.loadFile(document.file_name)	
	
	if content is None:
		print "no text to evaluate :("
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	new_mime_type = None
	import json
	try:
		json_txt = json.loads(content)
		new_mime_type = "application/json"
		
		print "THIS IS JSON"
	except Exception as e:
		print "NOT JSON: %s" % e
	
	task_path = None	
	if new_mime_type is not None:
		document.mime_type = new_mime_type
		document.save()
		
		if document.mime_type in MIME_TYPE_TASKS.keys():
			task_path = MIME_TYPE_TASKS[document.mime_type][0]
	else:
		try:
			from lib.Core.Utils.funcs import cleanLine
			from vars import ASSET_TAGS
			
			txt_json = []
			txt_pages = []
			line_count = 0
			
			# this is arbitrary
			MAX_LINES_PER_PAGE = 80
			
			for line in content.splitlines():
				txt_pages.append(cleanLine(line))
				line_count += 1
				
				if line_count == MAX_LINES_PER_PAGE:
					txt_json.append(" ".join(txt_pages))
					txt_pages = []
					line_count = 0

			txt_json.append(" ".join(txt_pages))

			document.total_pages = len(txt_json)
			document.save()
						
			asset_path = document.addAsset(txt_json, "doc_texts.json", as_literal=False,
				description="jsonified text of original document, segment by segment",
				tags=[ASSET_TAGS['TXT_JSON']])

			from lib.Worker.Models.uv_text import UnveillanceText
			uv_text = UnveillanceText(inflate={
				'media_id' : document._id,
				'searchable_text' : txt_json,
				'file_name' : asset_path
			})
			
			document.text_id = uv_text._id
			document.save()
		except Exception as e: 
			if DEBUG:
				print "ERROR HERE GENERATING DOC TEXTS:"
				print e
	
	document.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()
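# A hypothetical stand-in illustrating the contract of cleanLine() from
# lib.Core.Utils.funcs (imported throughout this module but not shown): the call
# sites only require a string in, and a printable, whitespace-normalized string out.
def cleanLineSketch(line):
	import re

	# drop non-printable characters, then collapse runs of whitespace (assumptions)
	line = ''.join([c for c in line if ord(c) >= 32])
	return re.sub(r'\s+', ' ', line).strip()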
def createGensimObjects(task):
	task_tag = "GENSIM TOPIC EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "USING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from json import loads

	try:
		texts = loads(doc.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	if len(texts) == 0:
		print "THERE ARE NO TEXTS HERE ANYWAY!"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return 

	import logging, os, bz2
	from json import loads
	from gensim import corpora

	from lib.Core.Utils.funcs import cleanLine
	from conf import getConfig, ANNEX_DIR

	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
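	# NOTE: the wiki_en_* files loaded below are assumed to be the artifacts produced
	# by gensim's wikipedia preprocessing (gensim.scripts.make_wiki), staged in the
	# directory named by the compass.gensim.training_data config key.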

	try:
		wiki_dictionary = corpora.Dictionary.load_from_text(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_wordids.txt'))
		wiki_corpus = corpora.MmCorpus(bz2.BZ2File(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.mm.bz2')))
	except Exception as e:
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)

		print error_msg
		print e
		
		task.fail(message=error_msg)
		return

	from gensim import models

	wiki_log_entropy_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_log_entropy.model')
	if not os.path.exists(wiki_log_entropy_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared log entropy model.  going to generate this here, now.  might take a minute..."
		
		logent_transformation = models.LogEntropyModel(wiki_corpus, id2word=wiki_dictionary)
		logent_transformation.save(wiki_log_entropy_file)
	else:
		logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)

	tokenize_function = corpora.wikicorpus.tokenize

	doc_corpus = [wiki_dictionary.doc2bow(tokenize_function(cleanLine(page).lower())) for page in texts]
	doc_corpus = logent_transformation[doc_corpus]

	wiki_tfidf_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.tfidf_model')
	if not os.path.exists(wiki_tfidf_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared tfidf model.  going to generate this here, now.  might take a minute..."
		
		wiki_tfidf = models.TfidfModel(wiki_corpus)
		wiki_tfidf.save(wiki_tfidf_file)
	else:
		wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)

	doc_tfidf = wiki_tfidf[doc_corpus]

	num_topics = 35
	lsi = models.LsiModel(corpus=doc_tfidf, id2word=wiki_dictionary, num_topics=num_topics)

	# parse the topic strings emitted by lsi.print_topics() back into [weight, term] pairs
	topics = []
	t_lambda = lambda x : [float(x[0]), x[1]]
	for t_group in [t.split("+") for t in [str(topic) for topic in lsi.print_topics(num_topics)]]:
		topics.append([t_lambda(t.strip().replace('\"','').split("*")) for t in t_group])

	lsi_topics = {
		"topics" : topics,
		"doc_comprehension" : []
	}

	# indexing the LSI model with the document's tfidf corpus yields, per page, a list
	# of (topic_id, weight) tuples describing how strongly each topic applies
	doc_lsi = lsi[doc_tfidf]

	for d in doc_lsi:
		lsi_topics['doc_comprehension'].append(d)

	topic_path = doc.addAsset(lsi_topics, "%s_topics.json" % doc.file_name, as_literal=False,
		description="Gensim Topics dump (from LSI Model)", tags=[ASSET_TAGS["GM_TOPICS"]])

	doc.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()
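# A minimal sketch (not part of the pipeline) of reading the topics asset written by
# createGensimObjects() back off a document and printing the strongest term per topic.
# It assumes the same UnveillanceDocument.loadAsset() call and JSON layout used above.
def printTopTopicTermsSketch(doc_id):
	from json import loads
	from lib.Worker.Models.uv_document import UnveillanceDocument

	doc = UnveillanceDocument(_id=doc_id)
	lsi_topics = loads(doc.loadAsset("%s_topics.json" % doc.file_name))

	for i, topic in enumerate(lsi_topics['topics']):
		# each topic is a list of [weight, term] pairs, as built above
		weight, term = sorted(topic, key=lambda t: abs(t[0]), reverse=True)[0]
		print "topic %d: %s (%f)" % (i, term, weight)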
def mapSimilaritiesGensim(uv_task):
	task_tag = "CLUSTER: GENSIM SIMILARITIES"
	print "\n\n************** %s [START] ******************\n" % task_tag

	uv_task.setStatus(302)

	for required in ["documents", "query"]:
		if not hasattr(uv_task, required):
			print "Cluster unavailable."
			print "\n\n************** %s [ERROR] ******************\n" % task_tag
			uv_task.fail()
			return

	import json, re, os, logging, bz2
	from gensim import corpora, models

	from lib.Worker.Models.uv_document import UnveillanceDocument
	from lib.Core.Utils.funcs import cleanLine
	from conf import DEBUG, ANNEX_DIR, getConfig
	from vars import ASSET_TAGS

	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

	try:
		wiki_dictionary = corpora.Dictionary.load_from_text(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_wordids.txt'))
		wiki_corpus = corpora.MmCorpus(bz2.BZ2File(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.mm.bz2')))
	except Exception as e:
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)

		print error_msg
		print e
		
		uv_task.fail(message=error_msg)
		return

	wiki_log_entropy_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_log_entropy.model')
	if not os.path.exists(wiki_log_entropy_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared log entropy model.  going to generate this here, now.  might take a minute..."
		
		logent_transformation = models.LogEntropyModel(wiki_corpus, id2word=wiki_dictionary)
		logent_transformation.save(wiki_log_entropy_file)
	else:
		logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)
		
	tokenize_function = corpora.wikicorpus.tokenize

	cluster_corpus = []
	document_map = {
		'query' : uv_task.query,
		'map' : [],
		'topics' : []
	}

	query_rx = re.compile(r'.*(%s).*' % "|".join(uv_task.query))
	for doc_idx, document in enumerate([UnveillanceDocument(_id=d) for d in uv_task.documents]):

		doc_valid = True
		for required in ['_id']:
			
			if required not in document.emit().keys():				
				doc_valid = False
				break

		if not doc_valid:
			error_msg = "Document is invalid"
			print "\n\n************** %s [WARN] ******************\n" % task_tag
			uv_task.communicate(message=error_msg)
			print error_msg

			continue


		uv_task.communicate(message="Processing %s (%d out of %d)" % (
			document._id if not hasattr(document, "file_alias") else document.file_alias, doc_idx + 1, len(uv_task.documents)))
		concerned_pages = []

		try:
			page_map = json.loads(document.loadAsset("page_map.json"))['uv_page_map']
		except Exception as e:
			print "\n\n************** %s [WARN] ******************\n" % task_tag
			print e
			continue

		for page in page_map:
			if len([p for p in page['map'] if re.match(query_rx, p['word'])]) > 0:
				concerned_pages.append(page['index'])

		if len(concerned_pages) > 0:
			concerned_pages = list(set(concerned_pages))

			doc_map = {
				'_id' : document._id,
				'pages' : [{ 'index_in_parent' : i } for i in concerned_pages]
			}

			try:
				entity_map = json.loads(document.loadAsset("stanford-ner_entities.json"))['uv_page_map']
			except Exception as e:
				print "\n\n************** %s [WARN] ******************\n" % task_tag
				print e
				entity_map = None

			if entity_map is not None:
				for s in doc_map['pages']:
					try:
						s['entities'] = list(set(filter(
							lambda e: s['index_in_parent'] in e['pages'], entity_map)))

					except Exception as e: pass

			try:
				texts = json.loads(document.loadAsset("doc_texts.json"))
			except Exception as e:
				print "\n\n************** %s [WARN] ******************\n" % task_tag
				print e
				texts = None

			if texts is not None:
				# topic modeling the page
				for page in concerned_pages:
					try:
						cluster_corpus.append(wiki_dictionary.doc2bow(tokenize_function(cleanLine(texts[page]))))
					except Exception as e:
						print "\n\n************** %s [WARN] ******************\n" % task_tag
						print e
						continue

					for s in doc_map['pages']:
						try:
							if s['index_in_parent'] == page:
								s['index_in_corpus']  = len(cluster_corpus) - 1
								break

						except Exception as e: pass

			document_map['map'].append(doc_map)

	if len(document_map['map']) == 0:
		error_msg = "no document groups created"
		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(message=error_msg)
		return

	# make a corpus out of the concerned pages
	if len(cluster_corpus) > 0:
		uv_task.communicate(message="Building topic model...")
		cluster_corpus = logent_transformation[cluster_corpus]

		wiki_tfidf_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.tfidf_model')
		if not os.path.exists(wiki_tfidf_file):
			print "\n\n************** %s [WARN] ******************\n" % task_tag
			print "no pre-prepared tfidf model.  going to generate this here, now.  might take a minute..."
			
			wiki_tfidf = models.TfidfModel(wiki_corpus)
			wiki_tfidf.save(wiki_tfidf_file)
		else:
			wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)

		cluster_tfidf = wiki_tfidf[cluster_corpus]
		
		num_topics = 35

		lsi = models.LsiModel(corpus=cluster_tfidf, id2word=wiki_dictionary, num_topics=num_topics)
		cluster_lsi = lsi[cluster_tfidf]

		# for all of the cluster_lsi objects, each document (a page within a doc, actually) will be rated according to its topic set
		for i, topics in enumerate(cluster_lsi):
			page_item_index = -1

			for doc_map in document_map['map']:
				for p, page_item in enumerate(doc_map['pages']):
					try:
						if page_item['index_in_corpus'] == i:
							page_item_index = p
							page_item['topic_comprehension'] = topics
							del page_item['index_in_corpus']

							break

					except Exception as e:
						continue

				if page_item_index != -1:
					break

		t_lambda = lambda x : [float(x[0]), x[1]]
		try:
			for t_group in [t.split("+") for t in [str(topic) for topic in lsi.print_topics(num_topics)]]:
				document_map['topics'].append([t_lambda(t.strip().replace('\"', '').split("*")) for t in t_group])
		except Exception as e:
			error_msg = "could not create topic list: %s." % e
			print error_msg
			print "\n\n************** %s [ERROR] ******************\n" % task_tag
			uv_task.fail(message=error_msg)
			return


		if DEBUG:
			print document_map['topics']

	# save massaged data to task output
	if not uv_task.addAsset(document_map, "gensim_similarity_output.json", 
		as_literal=False, tags=[ASSET_TAGS['C_RES']]):
		
		error_msg = "could not save result asset to this task."
		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(message=error_msg)
		return

	print "\n\n************** %s [END] ******************\n" % task_tag
	uv_task.finish()
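# A hedged illustration (not used by the pipeline) of the page filter in
# mapSimilaritiesGensim(): the query terms are OR'd into one grouped regex, and a
# word is a hit if any term appears anywhere in it. The sample words are made up.
def queryFilterExample():
	import re

	query = ["redact", "classif"]
	query_rx = re.compile(r'.*(%s).*' % "|".join(query))

	words = ["classified", "public", "redaction"]
	return [w for w in words if re.match(query_rx, w)]	# ['classified', 'redaction']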
def OCRPDF(uv_task):	
	task_tag = "PDF OCR-TO-TEXT"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "OCRing text from pdf at %s" % uv_task.doc_id
	uv_task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF
	from conf import DEBUG

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print "PDF IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	"""
		In this task, we might be asked to extract from a broken-up sub-group of documents.
		if so, that should be set in the task's properties.
		
	"""
	pdf_reader = pdf.loadFile(pdf.file_name)
	if pdf_reader is None:
		print "PDF READER IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	import os
	from fabric.api import settings, local
	from wand.image import Image
	from time import sleep

	from lib.Core.Utils.funcs import cleanLine
	from Models.uv_els_stub import UnveillanceELSStub
	from conf import ANNEX_DIR
	from vars import ASSET_TAGS

	texts = [None] * pdf.total_pages
	count = 0
	tmp_img = os.path.join(ANNEX_DIR, pdf.base_path, "p_image.jpg")

	for x in xrange(0, pdf.total_pages):
		# pdf page to image
		with Image(filename=os.path.join(ANNEX_DIR, "%s[%d]" % (pdf.file_name, x))) as p_image:
			p_image.save(filename=tmp_img)
			
			# image to ocr
			with settings(warn_only=True):
				text = cleanLine(local("tesseract %s -" % tmp_img, capture=True))
				texts[count] = text

				els_stub = UnveillanceELSStub('cp_page_text', inflate={
					'media_id' : pdf._id,
					'searchable_text' : text,
					'index_in_parent' : count
				})

			sleep(1)

		count += 1

	os.remove(tmp_img)

	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. unclean. (OCR'd using tesseract)",
		tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None: 
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	del texts

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print "\n\n************** %s [END] ******************\n" % task_tag

	uv_task.finish()
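# A hedged sanity-check helper (not part of the task): render a single pdf page with
# wand and OCR it with tesseract, mirroring the per-page loop above. The output image
# path is an arbitrary assumption.
def ocrSinglePageSketch(pdf_path, page_index, tmp_img="/tmp/p_image.jpg"):
	from fabric.api import settings, local
	from wand.image import Image

	# ImageMagick-style "file.pdf[N]" syntax selects a single page
	with Image(filename="%s[%d]" % (pdf_path, page_index)) as p_image:
		p_image.save(filename=tmp_img)

	with settings(warn_only=True):
		# "-" sends tesseract's output to stdout, as in the task above
		return local("tesseract %s -" % tmp_img, capture=True)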
def extractPDFText(uv_task):	
	task_tag = "PDF TEXT EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "extracting text from pdf at %s" % uv_task.doc_id
	uv_task.setStatus(302)

	from lib.Worker.Models.cp_pdf import CompassPDF

	pdf = CompassPDF(_id=uv_task.doc_id)
	if pdf is None:
		print "PDF IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	"""
		In this task, we might be asked to extract from a broken-up sub-group of documents.
		if so, that should be set in the task's properties.
		
	"""
	import os
	from fabric.api import settings, local
	from wand.image import Image
	from time import sleep

	from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
	from Models.uv_els_stub import UnveillanceELSStub
	from conf import ANNEX_DIR, DEBUG
	from vars import ASSET_TAGS

	texts = [None] * pdf.total_pages
	
	if pdf.hasParts():
		extractors = pdf.getParts()
	else:
		extractors = [pdf.file_name]
	
	count = 0
	for e in extractors:
		if e == pdf.file_name:
			pdf_reader = pdf.loadFile(e)
		else:
			pdf_reader = pdf.loadAsset(e)
		try:
			num_pages = pdf_reader.getNumPages()
		except AttributeError as e:
			print e
			continue

		for x in xrange(0, num_pages):
			text = cleanLine(pdf_reader.getPage(x).extractText())
			texts[count] = text

			els_stub = UnveillanceELSStub('cp_page_text', inflate={
				'media_id' : pdf._id,
				'searchable_text' : text,
				'index_in_parent' : count,
				'_id' : generateMD5Hash(content=pdf._id, salt=str(count))
			})

			count += 1
	
	asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
		description="jsonified texts in document; page-by-page, segment-by-segment. unclean.", tags=[ASSET_TAGS['TXT_JSON']])

	if asset_path is not None: 
		pdf.addFile(asset_path, None, sync=True)
		pdf.save()

	del texts

	pdf.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate={ 'text_file' : asset_path })
	print "\n\n************** %s [END] ******************\n" % task_tag

	uv_task.finish()
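# A hypothetical stand-in for generateMD5Hash() from lib.Core.Utils.funcs (imported
# above but not shown); the call site only needs a deterministic id derived from the
# document id and the page index. A salted md5 hex digest is assumed here.
def generateMD5HashSketch(content=None, salt=None):
	from hashlib import md5

	return md5("%s%s" % (content, "" if salt is None else salt)).hexdigest()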