def basicTokenizer(task):
	task_tag = "NLP ADDRESS PARSER"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(412)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return

	txt = None
	if hasattr(task, "txt_file"):
		txt = doc.loadFile(task.txt_file)
	else:
		import os
		try:
			txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
			txt = doc.loadFile(os.path.join(doc.base_path, txt_path))
		except Exception as e:
			if DEBUG: print e
	
	if txt is None:
		print "TEXT FILE IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
Example #2
def get_vector(uv_task):
	task_tag = "IMAGE: GETTING VECTOR"

	print "\n\n************** %s [START] ******************\n" % task_tag
	uv_task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument
	from vars import ASSET_TAGS
	from conf import ANNEX_DIR, DEBUG
	
	import os, pypuzzle

	image = UnveillanceDocument(_id=uv_task.doc_id)
	hi_res = image.getAssetsByTagName(ASSET_TAGS['HIGH'])
	
	if hi_res is None or len(hi_res) == 0:
		error_msg = "Could not find the hi-res clone"

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(message=error_msg)
		return

	hi_res = os.path.join(ANNEX_DIR, image.base_path, hi_res[0]['file_name'])
	puzz = pypuzzle.Puzzle()

	if DEBUG:
		print "generate puzzle vector from %s" % hi_res

	try:
		cvec = puzz.get_cvec_from_file(hi_res)
	except Exception as e:
		error_msg = "Could not get image vector because %s" % e

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(message=error_msg)
		return

	if not image.addAsset(cvec, "image_cvec.json", as_literal=False, tags=[ASSET_TAGS['IMAGE_CVEC']]):
		error_msg = "could not save cvec asset!"

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(message=error_msg)
		return

	print "\n\n************** %s [END] ******************\n" % task_tag
	uv_task.finish()
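
The stored cvec is the piece that later similarity lookups would build on: two images can be compared by the distance between their vectors. A minimal sketch of such a comparison, assuming pypuzzle's get_distance_from_cvec binding and libpuzzle's customary similarity cutoff of 0.6 (both assumptions, neither shown in this codebase):

import pypuzzle

puzz = pypuzzle.Puzzle()

# Vectors as produced by get_cvec_from_file() in the task above.
cvec_a = puzz.get_cvec_from_file("image_a.jpg")
cvec_b = puzz.get_cvec_from_file("image_b.jpg")

# Lower distance means more similar images.
distance = puzz.get_distance_from_cvec(cvec_a, cvec_b)
if distance < 0.6:
	print "likely duplicates (distance: %f)" % distance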
Example #3
def basicTokenizer(task):
	task_tag = "NLP TOKENIZER"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(412)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return

	txt = None
	
	from json import loads
	if hasattr(task, "txt_file"):
		txt = loads(doc.loadFile(task.txt_file))
	else:
		import os
		try:
			txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
			txt = loads(doc.loadFile(os.path.join(doc.base_path, txt_path)))
		except Exception as e:
			if DEBUG: print e
	
	if txt is None:
		print "TEXT FILE IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
		
	from lib.Worker.Models.cp_nlp_server import CompassNLPServer
	nlp_server = CompassNLPServer()
	tokenized = nlp_server.sendNLPRequest({
		'method' : 'tokenize',
		'txt' : txt
	})
	
	if tokenized is None:
		print "COULD NOT TOKENIZE."
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
	
	if DEBUG:
		print "tokenized result type:"
		print type(tokenized)
		
	asset_path = doc.addAsset(tokenized, "core_nlp_tokenized.json", as_literal=False,
		description="tokenized output from Stanford Core NLP",
		tags=[ASSET_TAGS['TOKENS_NLP']])

	if asset_path is None or not doc.addFile(asset_path, None, sync=True): 
		print "COULD NOT SAVE ASSET."
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
	
	doc.addCompletedTask(task.task_path)
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag
Example #4
def addressParser(task):
    task_tag = "NLP ADDRESS PARSER"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "EXTRACTING ADDRESSES FROM TEXT DOCUMENT at %s" % task.doc_id
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    txt = None
    if hasattr(task, "txt_file"):
        txt = doc.loadFile(task.txt_file)
    else:
        import os

        try:
            txt_path = doc.getAssetsByTagName(ASSET_TAGS["TXT_JSON"])[0]["file_name"]
            txt = doc.loadFile(os.path.join(doc.base_path, txt_path))
        except Exception as e:
            if DEBUG:
                print e

    if txt is None:
        print "TEXT FILE IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import re

    # script from https://code.google.com/p/ebcode/ -> ebdata.tar.gz -> ebdata/nlp/addresses.py

    # Regex notes:
    #   * This is *not* a case-insensitive regex, because we assume
    #     capitalized words are special (street names).
    #   * All data matched by capturing parentheses is concatenated together, so
    #     if you don't want to include something in the resulting string, don't
    #     capture it.

    # STREET_NAME is a fragment of a regular expression that is used in several
    # places in our "real" regular expression (ADDRESSES_RE) below. The one tricky
    # thing about it is that it includes a "CAPTURE_START" placeholder instead of
    # a capturing opening parenthesis. This lets us create two versions of the
    # regex -- STREET_NAME_CAPTURE and STREET_NAME_NOCAPTURE.

    STREET_NAME = r"""
		# Here, we define some common false positives and tell the regex to ignore them.
		(?!
			[Aa][Ss][Ss][Oo][Cc][Ii][Aa][Tt][Ee][Dd]\ [Pp][Rr][Ee][Ss][Ss] # associated press
			|
			[Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\ [Oo][Ff]             # university of
		)
		# DIRECTION
		%(CAPTURE_START)s
			(?:
				[NSEWnsew]\.?
				|
				(?:
					[Nn][Oo][Rr][Tt][Hh] |
					[Ss][Oo][Uu][Tt][Hh] |
					[Ee][Aa][Ss][Tt] |
					[Ww][Ee][Ss][Tt] |
					[Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt] |
					[Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt] |
					[Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt] |
					[Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]
				)
				|
				(?:
					N\.?W | S\.?W | N\.?E | S\.?E
				)\.?
			)
			\ +                                        # space (but not newline)
		)?
		(?:
			# STREET NAME
			%(CAPTURE_START)s
				# Numbered street names with a suffix ("3rd", "4th").
				\d+(?:st|ST|nd|ND|rd|RD|th|TH|d|D)

				|

				# Or, numbered street names without a suffix ("3", "4")
				# but with a street type.
				\d+
				(?=
					\ +
					(?:Ave|Avenue|Blvd|Boulevard|Bvd|Cir|Circle|Court|Ct|Dr|Drive|
					   Lane|Ln|Parkway|Pkwy|Place|Plaza|Pl|Plz|Point|Pt|Pts|Rd|Rte|
					   Sq|Sqs|Street|Streets|St|Sts|Terrace|Ter|Terr|Trl|Way|Wy
					)
					\b
				)

				|

				# Or, street names that don't start with numbers.
				(?:
					# Optional prefixes --
					# "St", as in "St Louis"
					# "Dr. Martin", as in "Dr. Martin Luther King"
					(?:
						[Ss][Tt]\.?
						|
						[Dd][Rr]\.?\ [Mm][Aa][Rr][Tt][Ii][Nn]
					)
					\ +
				)?
				(?:
					Mass\.(?=\ +[Aa]ve)  # Special case: "Mass." abbr. for "Massachusetts Ave."
										 # Needs to be special-cased because of the period.
					|
					(?:Avenue|Ave\.?)\ +[A-Z]       # Special case: "Avenue X"
					|
					[A-Z][a-z][A-Za-z]*  # One initial-capped word
					|
					[A-Z]\b              # Single-letter street name (e.g., K St. in DC)
					(?!\.\w)             # Avoid '20 U.S.A.'
				)
			)
			(?:
				# Here, we list the options with street suffixes first, so that
				# the suffix abbreviations are treated as the last part of the
				# street name, to avoid overeagerly capturing "123 Main St. The".
				%(CAPTURE_START)s
					\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					\ +[A-Z][a-z][A-Za-z]*\ (?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){2}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){3}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){4}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){5}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){1,5}
				)?
				# OPTIONAL POST-DIR
				(?:
					# Standard post-dir format
					%(CAPTURE_START)s
						,?\s(?:N\.?E|S\.?E|N\.?W|S\.?W|N|S|E|W)\.?
					)
					# Avoid greedily capturing more letters, like
					# '123 Main St, New England' to '123 Main St, N'
					(?![A-Za-z])

					|

					# Or, a special-case for DC quadrants, to find stuff like:
					# "600 H Street in NE Washington"
					# "600 H Street in the NE quadrant"
					# "600 H Street in northeast DC"

					# Note that this is NOT captured, so that it's excluded from
					# the final output.
					,?
					\s in
					%(CAPTURE_START)s
						\s
					)
					(?:
						(?:the|far) \s
					)?

					%(CAPTURE_START)s
						(?:NE|SE|NW|SW|[Nn]ortheast|[Ss]outheast|[Nn]orthwest|[Ss]outhwest)
						(?=
							\s (?:quadrant|D\.?C\.?|Washington)
						)
					)
				)?
			)?
		)
	"""

    STREET_NAME_CAPTURE = STREET_NAME % {"CAPTURE_START": "("}

    STREET_NAME_NOCAPTURE = STREET_NAME % {"CAPTURE_START": "(?:"}
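
    # The CAPTURE_START placeholder at work, in miniature: the same template
    # expands into either a capturing or a non-capturing group, e.g.
    #   r"%(CAPTURE_START)s\d+)" % {"CAPTURE_START": "("}    yields  r"(\d+)"
    #   r"%(CAPTURE_START)s\d+)" % {"CAPTURE_START": "(?:"}  yields  r"(?:\d+)"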

    ADDRESSES_RE = re.compile(
        r"""(?x)
		(?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection.
		\b

		# Ignore things that look like dates -- e.g., "21 May 2009".
		# This is a problem e.g. in cases where there's a May Street.
		(?!
			\d+\s+
			(?:January|February|March|April|May|June|July|August|September|October|November|December)
			,?\s+
			\d\d\d\d
		)

		# Ignore intersections that are prefixed by "University of", like
		# "University of Texas at Austin". This is a common false positive.
		(?<!
			[Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s
		)

		(?:
			# SEGMENT ("FOO BETWEEN BAR AND BAZ")
			(?:
				%(STREET_NAME_CAPTURE)s (,?\ + between \ +) %(STREET_NAME_CAPTURE)s (,?\ + and \ +) %(STREET_NAME_CAPTURE)s
				|
				%(STREET_NAME_CAPTURE)s (,?\ + from \ +) %(STREET_NAME_CAPTURE)s (,?\ + to \ +) %(STREET_NAME_CAPTURE)s
			)

			|

			# BLOCK/ADDRESS
			(?:
				(
					(?:
						(?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ]
							(?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )?
						[Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff]
						|
						\d+\ *-\ *\d+
						|
						\d+
					)
					\ +
				)
				%(STREET_NAME_CAPTURE)s

				# ignore the intersection in parenthesis so that it's not picked
				# up as a separate location. We do this by consuming the string
				# but *not* capturing it.
				(?:
					\ +
					\(?
					between
					\ +
					%(STREET_NAME_NOCAPTURE)s
					\ +
					and
					\ +
					%(STREET_NAME_NOCAPTURE)s
					\)?
				)?
			)

			|

			# INTERSECTION
			(?:
				# Common intersection prefixes. They're included here so that the
				# regex doesn't include them as part of the street name.
				(?:
					(?:
						[Nn]ear |
						[Aa]t |
						[Oo]n |
						[Tt]o |
						[Aa]round |
						[Ii]ntersection\ of |
						[Cc]orner\ of |
						[Aa]rea\ of |
						[Aa]reas?\ surrounding |
						vicinity\ of |
						ran\ down |
						running\ down |
						crossed
					)
					\ +
				)?
				\b
				(?:%(STREET_NAME_CAPTURE)s)
				(\ +)
				(
					(?:
						[Aa][Nn][Dd] |
						[Aa][Tt] |
						[Nn][Ee][Aa][Rr] |
						& |
						[Aa][Rr][Oo][Uu][Nn][Dd] |
						[Tt][Oo][Ww][Aa][Rr][Dd][Ss]? |
						[Oo][Ff][Ff] |
						(?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff] |
						(?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt]
					)
					\ +
				)
				(?:%(STREET_NAME_CAPTURE)s)
			)
		)

		# OPTIONAL CITY SUFFIX
		(?:
			(?:
				,?\s+in |
				,
			)
			\s+

			# CITY NAME
			(
				[A-Z][a-z][A-Za-z]*                   # One initial-capped word
				(?:
					,?\ Jr\.?,?
					|
					\ [A-Z][a-z][A-Za-z]*
					|
					-[A-Za-z]+                        # Hyphenated words (e.g. "Croton-on-Hudson" in NY)
				){0,4}  # Initial-capped words
			)
		)?
		"""
        % {"STREET_NAME_CAPTURE": STREET_NAME_CAPTURE, "STREET_NAME_NOCAPTURE": STREET_NAME_NOCAPTURE}
    )
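
    # parse_addresses is not defined in this snippet; it belongs with the
    # ebdata script cited above. A minimal sketch of what it plausibly does,
    # assuming every capturing group holds an address fragment and the final
    # group holds the optional city (an illustration, not the verbatim ebdata
    # implementation):
    def parse_addresses(text, address_re):
        results = []
        for match in address_re.finditer(text):
            groups = match.groups()
            # Per the regex notes above, captured fragments concatenate into
            # the address string; the last group is the optional city suffix.
            address = "".join([g for g in groups[:-1] if g is not None]).strip()
            if address:
                results.append((address, groups[-1]))
        return results if len(results) > 0 else None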

    addresses = parse_addresses(txt, ADDRESSES_RE)

    if addresses is None:
        print "COULD NOT EXTRACT ADDRESSES."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    asset_path = doc.addAsset(
        addresses,
        "addresses.json",
        as_literal=False,
        description="addresses output from Everyblock address extractor",
        tags=[ASSET_TAGS["ADDRESSES_NLP"], ASSET_TAGS["CP_ENTITIES"]],
    )

    if asset_path is None or not doc.addFile(asset_path, None, sync=True):
        print "COULD NOT SAVE ASSET."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
Example #5
def generatePageMap(uv_task):
	task_tag = "PAGE MAPPER"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "MAPPING PAGES FROM TEXT DOCUMENT at %s" % uv_task.doc_id
	uv_task.setStatus(302)
	
	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=uv_task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	import os, json

	# initialized up front so the None checks below hold even if loading fails
	pages = None
	bow = None

	try:
		page_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
		pages = json.loads(doc.loadFile(os.path.join(doc.base_path, page_path)))
	except Exception as e:
		if DEBUG: print e
	
	try:
		bow_path = doc.getAssetsByTagName(ASSET_TAGS['BOW'])[0]['file_name']
		bow = json.loads(doc.loadFile(os.path.join(doc.base_path, bow_path)))
	except Exception as e:
		if DEBUG: print e
	
	if pages is None or bow is None:
		print "NO PAGES OR BAG OF WORDS"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return
	
	# with unique words in bag that are not stopwords
	# for each page, word count of each	
	from numpy import intersect1d, setdiff1d
	from conf import getConfig
	
	if hasattr(uv_task, "stopwords"):
		stopwords = uv_task.stopwords
	else:	
		stopwords = os.path.join(getConfig('nlp_server.path'), "stopwords.json")
	
	try:
		with open(stopwords, 'rb') as S:
			if hasattr(uv_task, "stopwords_lang"):
				lang = uv_task.stopwords_lang
			else:
				lang = "english"
			
			stopwords = json.loads(S.read())[lang]
				
	except Exception as e:
		print "NO STOPWORDS...\n%s" % e
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		stopwords = []	# fall back to no stopwords rather than the unread file path
	
	page_map = []
	
	print "STOPWORDS: (len %d)\nTOP:\n%s\n" % (len(stopwords), stopwords[:10])
	print "BAG OF WORDS: (len %d)\nTOP:\n%s\n" % (len(bow), bow[:10])
	
	use_words = [w for w in setdiff1d(bow, stopwords).tolist() if len(w) > 1]	
	print "SIFTING BAG OF WORDS (old len: %d, new len: %d)" % (len(bow), len(use_words))
	
	global_info = {}

	for i, p in enumerate(pages):
		if p is None: continue
		
		page_bow = p.lower().split(" ")
		words = intersect1d(use_words, page_bow).tolist()
		if len(words) == 0: continue
		
		word_map = []
		frequency_max = 0

		for word in words:
			word_info = { 'word' : word, 'count' : page_bow.count(word) }

			word_map.append(word_info)
			if word_info['count'] > frequency_max: frequency_max = word_info['count']

			if word not in global_info:
				global_info[word] = 0

			global_info[word] += word_info['count']

		page_map.append({ 'index' : i, 'map' : word_map, 'frequency_max' : frequency_max })
	
	if len(page_map) > 0:
		global_info['uv_page_map'] = page_map
		asset_path = doc.addAsset(global_info, "page_map.json", as_literal=False,
			description="word frequencies, page-by-page", tags=[ASSET_TAGS['PAGE_MAP']])
				
		if asset_path is None or not doc.addFile(asset_path, None, sync=True):
			print "COULD NOT SAVE ASSET."
			print "\n\n************** %s [ERROR] ******************\n" % task_tag
			uv_task.fail()
			return
	
	doc.addCompletedTask(uv_task.task_path)
	uv_task.routeNext()
	
	print "\n\n************** %s [END] ******************\n" % task_tag
	uv_task.finish()
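
The numpy set operations above do the real work: setdiff1d sifts stopwords out of the bag of words, and intersect1d keeps only the sifted words that actually occur on a given page. A toy run of the same idea, with hypothetical word lists standing in for a real document:

from numpy import intersect1d, setdiff1d

bow = ["the", "report", "classified", "report", "embassy"]
stopwords = ["the", "a", "of"]

# Unique bag-of-words entries that are not stopwords.
use_words = setdiff1d(bow, stopwords).tolist()	# ['classified', 'embassy', 'report']

page_text = "the classified report from the embassy"
page_bow = page_text.lower().split(" ")

# Sifted words that occur on this page, with per-page counts.
words = intersect1d(use_words, page_bow).tolist()
word_map = [{ 'word' : w, 'count' : page_bow.count(w) } for w in words]
print word_map	# [{'word': 'classified', 'count': 1}, ...]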