def extractNEREntities(task):
    task_tag = "NER ENTITY EXTRACTION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    from json import loads
    try:
        texts = loads(doc.loadAsset("doc_texts.json"))
    except Exception as e:
        print "ERROR GETTING DOC-TEXTS: %s" % e
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import ner, os
    from conf import getConfig
    from lib.Core.Utils.funcs import cleanLine

    st = ner.SocketNER(host='localhost', port=getConfig("nlp_server.port"))

    entities = {}
    for i, page in enumerate(texts):
        if page is None:
            continue

        lemmas = st.get_entities(cleanLine(page))
        if len(lemmas.keys()) == 0:
            continue

        for lemma_type in lemmas.keys():
            entities = updateEntities(entities, lemmas[lemma_type], lemma_type, i)

        #if DEBUG and i > 25: break

    if len(entities.keys()) > 0:
        ner_entity_path = doc.addAsset(entities, "stanford-ner_entities.json",
            as_literal=False,
            description="Entities as per Stanford-NER Tagger (via NLTK)",
            tags=[ASSET_TAGS['STANFORD_NER_ENTITIES'], ASSET_TAGS['CP_ENTITIES']])

        if ner_entity_path is not None:
            doc.addFile(ner_entity_path, None, sync=True)

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
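# `updateEntities` is defined elsewhere and not shown in this section. A minimal
# sketch of what it is assumed to do, inferred from the call above and from how
# "stanford-ner_entities.json" is consumed later (each record keeps its NER type
# and the pages it appears on); an illustration, not the actual implementation:
def updateEntitiesSketch(entities, lemmas, lemma_type, page_index):
    for lemma in lemmas:
        if lemma not in entities:
            entities[lemma] = { 'type' : lemma_type, 'pages' : [] }
        if page_index not in entities[lemma]['pages']:
            entities[lemma]['pages'].append(page_index)
    return entities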
def evaluateText(task):
    task_tag = "TEXT EVALUATION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "evaluating text at %s" % task.doc_id
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    from vars import MIME_TYPE_TASKS

    document = UnveillanceDocument(_id=task.doc_id)

    """ limited choices: json, pgp, or txt """
    if hasattr(task, "text_file"):
        content = document.loadAsset(task.text_file)
    else:
        content = document.loadFile(document.file_name)

    if content is None:
        print "no text to evaluate :("
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    new_mime_type = None

    import json
    try:
        json_txt = json.loads(content)
        new_mime_type = "application/json"
        print "THIS IS JSON"
    except Exception as e:
        print "NOT JSON: %s" % e

    task_path = None
    if new_mime_type is not None:
        document.mime_type = new_mime_type
        document.save()

        if document.mime_type in MIME_TYPE_TASKS.keys():
            task_path = MIME_TYPE_TASKS[document.mime_type][0]
    else:
        try:
            from lib.Core.Utils.funcs import cleanLine
            from vars import ASSET_TAGS

            txt_json = []
            txt_pages = []
            line_count = 0

            # this is arbitrary
            MAX_LINES_PER_PAGE = 80

            for line in content.splitlines():
                txt_pages.append(cleanLine(line))
                line_count += 1

                if line_count == MAX_LINES_PER_PAGE:
                    txt_json.append(" ".join(txt_pages))
                    txt_pages = []
                    line_count = 0

            # flush the final, partially-filled page (if any), without
            # appending an empty trailing page
            if len(txt_pages) > 0:
                txt_json.append(" ".join(txt_pages))

            document.total_pages = len(txt_json)
            document.save()

            asset_path = document.addAsset(txt_json, "doc_texts.json",
                as_literal=False,
                description="jsonified text of original document, segment by segment",
                tags=[ASSET_TAGS['TXT_JSON']])

            from lib.Worker.Models.uv_text import UnveillanceText
            uv_text = UnveillanceText(inflate={
                'media_id' : document._id,
                'searchable_text' : txt_json,
                'file_name' : asset_path
            })

            document.text_id = uv_text._id
            document.save()
        except Exception as e:
            if DEBUG:
                print "ERROR HERE GENERATING DOC TEXTS:"
                print e

    document.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
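# A standalone illustration of the paging scheme above: plain text is chunked
# into "pages" of up to MAX_LINES_PER_PAGE lines, each page flattened to a
# single space-joined string (cleanLine is omitted here for brevity):
def pageTextSketch(content, max_lines_per_page=80):
    lines = content.splitlines()
    return [" ".join(lines[i:i + max_lines_per_page])
        for i in xrange(0, len(lines), max_lines_per_page)]

# e.g. a 200-line document yields 3 pages (80 + 80 + 40 lines), which is
# what document.total_pages reflects after evaluateText runs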
def createGensimObjects(task):
    task_tag = "GENSIM TOPIC EXTRACTION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "USING TEXT DOCUMENT at %s" % task.doc_id
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    from json import loads
    try:
        texts = loads(doc.loadAsset("doc_texts.json"))
    except Exception as e:
        print "ERROR GETTING DOC-TEXTS: %s" % e
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    if len(texts) == 0:
        print "THERE ARE NO TEXTS HERE ANYWAY!"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import logging, os, bz2
    from gensim import corpora

    from lib.Core.Utils.funcs import cleanLine
    from conf import getConfig, ANNEX_DIR

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    try:
        wiki_dictionary = corpora.Dictionary.load_from_text(os.path.join(
            getConfig('compass.gensim.training_data'), 'wiki_en_wordids.txt'))
        wiki_corpus = corpora.MmCorpus(bz2.BZ2File(os.path.join(
            getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.mm.bz2')))
    except Exception as e:
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)
        print error_msg
        print e
        task.fail(message=error_msg)
        return

    from gensim import models

    wiki_log_entropy_file = os.path.join(getConfig('compass.gensim.training_data'),
        'wiki_en_log_entropy.model')

    if not os.path.exists(wiki_log_entropy_file):
        print "\n\n************** %s [WARN] ******************\n" % task_tag
        print "no pre-prepared log entropy model. going to generate this here, now. might take a minute..."

        logent_transformation = models.LogEntropyModel(wiki_corpus, id2word=wiki_dictionary)
        logent_transformation.save(wiki_log_entropy_file)
    else:
        logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)

    tokenize_function = corpora.wikicorpus.tokenize

    doc_corpus = [wiki_dictionary.doc2bow(tokenize_function(cleanLine(page).lower())) for page in texts]
    doc_corpus = logent_transformation[doc_corpus]

    wiki_tfidf_file = os.path.join(getConfig('compass.gensim.training_data'),
        'wiki_en_tfidf.tfidf_model')

    if not os.path.exists(wiki_tfidf_file):
        print "\n\n************** %s [WARN] ******************\n" % task_tag
        print "no pre-prepared tfidf model. going to generate this here, now. might take a minute..."

        wiki_tfidf = models.TfidfModel(wiki_corpus)
        wiki_tfidf.save(wiki_tfidf_file)
    else:
        wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)

    doc_tfidf = wiki_tfidf[doc_corpus]

    num_topics = 35
    lsi = models.LsiModel(corpus=doc_tfidf, id2word=wiki_dictionary, num_topics=num_topics)

    topics = []
    t_lambda = lambda x : [float(x[0]), x[1]]

    for t_group in [t.split("+") for t in [str(topic) for topic in lsi.print_topics(num_topics)]]:
        topics.append([t_lambda(t.strip().replace('\"','').split("*")) for t in t_group])

    lsi_topics = {
        "topics" : topics,
        "doc_comprehension" : []
    }

    doc_lsi = lsi[doc_tfidf]
    for d in doc_lsi:
        lsi_topics['doc_comprehension'].append(d)

    topic_path = doc.addAsset(lsi_topics, "%s_topics.json" % doc.file_name,
        as_literal=False,
        description="Gensim Topics dump (from LSI Model)",
        tags=[ASSET_TAGS["GM_TOPICS"]])

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
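# Worked example of the topic-string parsing above: LsiModel.print_topics yields
# topic summaries like '0.123*"treaty" + -0.045*"border"' (token quoting varies
# by gensim version, which is why the '"' characters are stripped). The
# split/strip/float chain turns each term into a [weight, token] pair:
example_topic = '0.123*"treaty" + -0.045*"border"'
parsed = [[float(w), token] for w, token in
    (term.strip().replace('"', '').split("*") for term in example_topic.split("+"))]
# parsed == [[0.123, 'treaty'], [-0.045, 'border']]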
def mapSimilaritiesGensim(uv_task):
    task_tag = "CLUSTER: GENSIM SIMILARITIES"
    print "\n\n************** %s [START] ******************\n" % task_tag
    uv_task.setStatus(302)

    for required in ["documents", "query"]:
        if not hasattr(uv_task, required):
            print "Cluster unavailable."
            print "\n\n************** %s [ERROR] ******************\n" % task_tag
            uv_task.fail()
            return

    import json, re, os, logging, bz2
    from gensim import corpora, models

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from lib.Core.Utils.funcs import cleanLine
    from conf import DEBUG, ANNEX_DIR, getConfig
    from vars import ASSET_TAGS

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    try:
        wiki_dictionary = corpora.Dictionary.load_from_text(os.path.join(
            getConfig('compass.gensim.training_data'), 'wiki_en_wordids.txt'))
        wiki_corpus = corpora.MmCorpus(bz2.BZ2File(os.path.join(
            getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.mm.bz2')))
    except Exception as e:
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)
        print error_msg
        print e
        uv_task.fail(message=error_msg)
        return

    wiki_log_entropy_file = os.path.join(getConfig('compass.gensim.training_data'),
        'wiki_en_log_entropy.model')

    if not os.path.exists(wiki_log_entropy_file):
        print "\n\n************** %s [WARN] ******************\n" % task_tag
        print "no pre-prepared log entropy model. going to generate this here, now. might take a minute..."

        logent_transformation = models.LogEntropyModel(wiki_corpus, id2word=wiki_dictionary)
        logent_transformation.save(wiki_log_entropy_file)
    else:
        logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)

    tokenize_function = corpora.wikicorpus.tokenize

    cluster_corpus = []
    document_map = {
        'query' : uv_task.query,
        'map' : [],
        'topics' : []
    }

    query_rx = re.compile(r'.*%s.*' % "|".join(uv_task.query))

    for doc_idx, document in enumerate([UnveillanceDocument(_id=d) for d in uv_task.documents]):
        doc_valid = True
        for required in ['_id']:
            if required not in document.emit().keys():
                doc_valid = False
                break

        if not doc_valid:
            error_msg = "Document is invalid"
            print "\n\n************** %s [WARN] ******************\n" % task_tag
            uv_task.communicate(message=error_msg)
            print error_msg
            continue

        uv_task.communicate(message="Processing %s (%d out of %d)" % (
            document._id if not hasattr(document, "file_alias") else document.file_alias,
            doc_idx + 1, len(uv_task.documents)))

        concerned_pages = []

        try:
            page_map = json.loads(document.loadAsset("page_map.json"))['uv_page_map']
        except Exception as e:
            print "\n\n************** %s [WARN] ******************\n" % task_tag
            print e
            continue

        for page in page_map:
            if len([p for p in page['map'] if re.match(query_rx, p['word'])]) > 0:
                concerned_pages.append(page['index'])

        if len(concerned_pages) > 0:
            concerned_pages = list(set(concerned_pages))
            doc_map = {
                '_id' : document._id,
                'pages' : [{ 'index_in_parent' : i } for i in concerned_pages]
            }

            try:
                entity_map = json.loads(document.loadAsset("stanford-ner_entities.json"))['uv_page_map']
            except Exception as e:
                print "\n\n************** %s [WARN] ******************\n" % task_tag
                print e
                entity_map = None

            if entity_map is not None:
                for s in doc_map['pages']:
                    try:
                        s['entities'] = list(set(filter(
                            lambda e: s['index_in_parent'] in e['pages'], entity_map)))
                    except Exception as e:
                        pass

            try:
                texts = json.loads(document.loadAsset("doc_texts.json"))
            except Exception as e:
                print "\n\n************** %s [WARN] ******************\n" % task_tag
                print e
                texts = None

            if texts is not None:
                # topic modeling the page
                for page in concerned_pages:
                    try:
                        cluster_corpus.append(wiki_dictionary.doc2bow(
                            tokenize_function(cleanLine(texts[page]))))
                    except Exception as e:
                        print "\n\n************** %s [WARN] ******************\n" % task_tag
                        print e
                        continue

                    for s in doc_map['pages']:
                        try:
                            if s['index_in_parent'] == page:
                                s['index_in_corpus'] = len(cluster_corpus) - 1
                                break
                        except Exception as e:
                            pass

            document_map['map'].append(doc_map)

    if len(document_map['map']) == 0:
        error_msg = "no document groups created"
        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail(message=error_msg)
        return

    # make a corpus out of the concerned pages
    if len(cluster_corpus) > 0:
        uv_task.communicate(message="Building topic model...")
        cluster_corpus = logent_transformation[cluster_corpus]

        wiki_tfidf_file = os.path.join(getConfig('compass.gensim.training_data'),
            'wiki_en_tfidf.tfidf_model')

        if not os.path.exists(wiki_tfidf_file):
            print "\n\n************** %s [WARN] ******************\n" % task_tag
            print "no pre-prepared tfidf model. going to generate this here, now. might take a minute..."

            wiki_tfidf = models.TfidfModel(wiki_corpus)
            wiki_tfidf.save(wiki_tfidf_file)
        else:
            wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)

        cluster_tfidf = wiki_tfidf[cluster_corpus]

        num_topics = 35
        lsi = models.LsiModel(corpus=cluster_tfidf, id2word=wiki_dictionary, num_topics=num_topics)
        cluster_lsi = lsi[cluster_tfidf]

        # for all of the cluster_lsi objects, each document (a page within a doc,
        # actually) will be rated according to its topic set
        for i, topics in enumerate(cluster_lsi):
            page_item_index = -1

            for doc_map in document_map['map']:
                for p, page_item in enumerate(doc_map['pages']):
                    try:
                        if page_item['index_in_corpus'] == i:
                            page_item_index = p
                            page_item['topic_comprehension'] = topics
                            del page_item['index_in_corpus']
                            break
                    except Exception as e:
                        continue

                if page_item_index != -1:
                    break

        t_lambda = lambda x : [float(x[0]), x[1]]

        try:
            for t_group in [t.split("+") for t in [str(topic) for topic in lsi.print_topics(num_topics)]]:
                document_map['topics'].append([t_lambda(t.strip().replace('\"', '').split("*")) for t in t_group])
        except Exception as e:
            error_msg = "could not create topic list: %s." % e
            print error_msg
            print "\n\n************** %s [ERROR] ******************\n" % task_tag
            uv_task.fail(message=error_msg)
            return

        if DEBUG:
            print document_map['topics']

    # save massaged data to task output
    if not uv_task.addAsset(document_map, "gensim_similarity_output.json",
        as_literal=False, tags=[ASSET_TAGS['C_RES']]):
        error_msg = "could not save result asset to this task."
        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail(message=error_msg)
        return

    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
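# A hedged sketch of the "page_map.json" asset shape the clustering pass above
# assumes, reconstructed only from the lookups it performs (page['index'],
# page['map'], p['word']); the producer of that asset is not shown here:
PAGE_MAP_SKETCH = {
    'uv_page_map' : [{
        'index' : 0,    # page index within the parent document
        'map' : [       # per-word entries; any other keys are omitted
            { 'word' : 'asylum' },
            { 'word' : 'border' }
        ]
    }]
}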
def OCRPDF(uv_task):
    task_tag = "PDF OCR-TO-TEXT"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "OCRing text from pdf at %s" % uv_task.doc_id
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG

    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    """
    In this task, we might be asked to extract from a broken-up sub-group of
    documents. If so, that should be set in the task's properties.
    """
    pdf_reader = pdf.loadFile(pdf.file_name)
    if pdf_reader is None:
        print "PDF READER IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    import os
    from fabric.api import settings, local
    from wand.image import Image
    from time import sleep

    from lib.Core.Utils.funcs import cleanLine
    from Models.uv_els_stub import UnveillanceELSStub
    from conf import ANNEX_DIR
    from vars import ASSET_TAGS

    texts = [None] * pdf.total_pages
    count = 0
    tmp_img = os.path.join(ANNEX_DIR, pdf.base_path, "p_image.jpg")

    for x in xrange(0, pdf.total_pages):
        # pdf page to image
        with Image(filename=os.path.join(ANNEX_DIR, "%s[%d]" % (pdf.file_name, x))) as p_image:
            p_image.save(filename=tmp_img)

        # image to ocr; point tesseract at the image's full path so the call
        # does not depend on the worker's current directory
        with settings(warn_only=True):
            text = cleanLine(local("tesseract %s -" % tmp_img, capture=True))

        texts[count] = text
        els_stub = UnveillanceELSStub('cp_page_text', inflate={
            'media_id' : pdf._id,
            'searchable_text' : text,
            'index_in_parent' : count
        })

        sleep(1)
        count += 1

    os.remove(tmp_img)

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. unclean. (OCR'd using tesseract)",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.save()
    del texts

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
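# For reference: "tesseract <image> -" asks tesseract (3.03+) to write the
# recognized text to stdout, which fabric's local(..., capture=True) returns as
# a string. A minimal subprocess-based equivalent of that OCR step (hypothetical
# helper, not part of this module):
import subprocess

def ocrImageSketch(image_path):
    # "stdout" as the output base also routes text to standard output
    return subprocess.check_output(["tesseract", image_path, "stdout"])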
def extractPDFText(uv_task):
    task_tag = "PDF TEXT EXTRACTION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "extracting text from pdf at %s" % uv_task.doc_id
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF

    pdf = CompassPDF(_id=uv_task.doc_id)
    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    """
    In this task, we might be asked to extract from a broken-up sub-group of
    documents. If so, that should be set in the task's properties.
    """
    import os
    from fabric.api import settings, local
    from wand.image import Image
    from time import sleep

    from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
    from Models.uv_els_stub import UnveillanceELSStub
    from conf import ANNEX_DIR, DEBUG
    from vars import ASSET_TAGS

    texts = [None] * pdf.total_pages

    if pdf.hasParts():
        extractors = pdf.getParts()
    else:
        extractors = [pdf.file_name]

    count = 0
    # named "extractor" so the loop variable is not shadowed by the
    # exception variable in the handler below
    for extractor in extractors:
        if extractor == pdf.file_name:
            pdf_reader = pdf.loadFile(extractor)
        else:
            pdf_reader = pdf.loadAsset(extractor)

        try:
            num_pages = pdf_reader.getNumPages()
        except AttributeError as e:
            print e
            continue

        for x in xrange(0, num_pages):
            text = cleanLine(pdf_reader.getPage(x).extractText())
            texts[count] = text

            els_stub = UnveillanceELSStub('cp_page_text', inflate={
                'media_id' : pdf._id,
                'searchable_text' : text,
                'index_in_parent' : count,
                '_id' : generateMD5Hash(content=pdf._id, salt=str(count))
            })

            count += 1

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. unclean.",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.save()
    del texts

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })
    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
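# generateMD5Hash comes from lib.Core.Utils.funcs (not shown in this section).
# A plausible sketch of the deterministic id scheme used above -- an assumption,
# not the actual implementation:
import hashlib

def generateMD5HashSketch(content, salt=""):
    # hypothetical: hash of salt + content, hex-encoded
    return hashlib.md5(str(salt) + str(content)).hexdigest()

# a stable _id per (document, page index) can let re-runs of the extraction
# overwrite existing 'cp_page_text' stubs rather than duplicate them, assuming
# the ELS stub upserts by _id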