def basicTokenizer(task):
	task_tag = "NLP ADDRESS PARSER"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(412)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return

	txt = None
	if hasattr(task, "txt_file"):
		txt = doc.loadFile(task.txt_file)
	else:
		import os
		try:
			txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
			txt = doc.loadFile(os.path.join(doc.base_path, txt_path))
		except Exception as e:
			if DEBUG: print e
	
	if txt is None:
		print "TEXT FILE IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
Example #2
def get_vector(uv_task):
	task_tag = "IMAGE: GETTING VECTOR"

	print "\n\n************** %s [START] ******************\n" % task_tag
	uv_task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument
	from vars import ASSET_TAGS
	from conf import ANNEX_DIR, DEBUG
	
	import os, pypuzzle

	image = UnveillanceDocument(_id=uv_task.doc_id)
	hi_res = image.getAssetsByTagName(ASSET_TAGS['HIGH'])
	
	if hi_res is None or len(hi_res) == 0:
		error_msg = "Could not find the hi-res clone"

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(message=error_msg)
		return

	hi_res = os.path.join(ANNEX_DIR, image.base_path, hi_res[0]['file_name'])
	puzz = pypuzzle.Puzzle()

	if DEBUG:
		print "generate puzzle vector from %s" % hi_res

	try:
		cvec = puzz.get_cvec_from_file(hi_res)
	except Exception as e:
		error_msg = "Could not get image vector because %s" % e

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(message=error_msg)
		return

	if not image.addAsset(cvec, "image_cvec.json", as_literal=False, tags=[ASSET_TAGS['IMAGE_CVEC']]):
		error_msg = "could not save cvec asset!"

		print error_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		
		uv_task.fail(message=error_msg)
		return

	print "\n\n************** %s [END] ******************\n" % task_tag
	uv_task.finish()
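
The stored cvec is the piece that later similarity lookups would build on: two images can be compared by the distance between their vectors. A minimal sketch of such a comparison, assuming pypuzzle's get_distance_from_cvec binding and libpuzzle's customary similarity cutoff of 0.6 (both assumptions, neither shown in this codebase):

import pypuzzle

puzz = pypuzzle.Puzzle()

# Vectors as produced by get_cvec_from_file() in the task above.
cvec_a = puzz.get_cvec_from_file("image_a.jpg")
cvec_b = puzz.get_cvec_from_file("image_b.jpg")

# Lower distance means more similar images.
distance = puzz.get_distance_from_cvec(cvec_a, cvec_b)
if distance < 0.6:
	print "likely duplicates (distance: %f)" % distance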
Example #3
def basicTokenizer(task):
	task_tag = "NLP TOKENIZER"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(412)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return

	txt = None
	
	from json import loads
	if hasattr(task, "txt_file"):
		txt = loads(doc.loadFile(task.txt_file))
	else:
		import os
		try:
			txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
			txt = loads(doc.loadFile(os.path.join(doc.base_path, txt_path)))
		except Exception as e:
			if DEBUG: print e
	
	if txt is None:
		print "TEXT FILE IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
		
	from lib.Worker.Models.cp_nlp_server import CompassNLPServer
	nlp_server = CompassNLPServer()
	tokenized = nlp_server.sendNLPRequest({
		'method' : 'tokenize',
		'txt' : txt
	})
	
	if tokenized is None:
		print "COULD NOT TOKENIZE."
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
	
	if DEBUG:
		print "tokenized result type:"
		print type(tokenized)
		
	asset_path = doc.addAsset(tokenized, "core_nlp_tokenized.json", as_literal=False,
		description="tokenized output from Stanford Core NLP",
		tags=[ASSET_TAGS['TOKENS_NLP']])

	if asset_path is None or not doc.addFile(asset_path, None, sync=True): 
		print "COULD NOT SAVE ASSET."
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
	
	doc.addCompletedTask(task.task_path)
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag
Example #4
def addressParser(task):
    task_tag = "NLP ADDRESS PARSER"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "EXTRACTING ADDRESSES FROM TEXT DOCUMENT at %s" % task.doc_id
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    txt = None
    if hasattr(task, "txt_file"):
        txt = doc.loadFile(task.txt_file)
    else:
        import os

        try:
            txt_path = doc.getAssetsByTagName(ASSET_TAGS["TXT_JSON"])[0]["file_name"]
            txt = doc.loadFile(os.path.join(doc.base_path, txt_path))
        except Exception as e:
            if DEBUG:
                print e

    if txt is None:
        print "TEXT FILE IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import re

    # script from https://code.google.com/p/ebcode/ -> ebdata.tar.gz -> ebdata/nlp/addresses.py

    # Regex notes:
    #   * This is *not* a case-insensitive regex, because we assume
    #     capitalized words are special (street names).
    #   * All data matched by capturing parentheses is concatenated together, so
    #     if you don't want to include something in the resulting string, don't
    #     capture it.

    # STREET_NAME is a fragment of a regular expression that is used in several
    # places in our "real" regular expression (ADDRESSES_RE) below. The one tricky
    # thing about it is that it includes a "CAPTURE_START" placeholder instead of
    # a capturing opening parenthesis. This lets us create two versions of the
    # regex -- STREET_NAME_CAPTURE and STREET_NAME_NOCAPTURE.

    STREET_NAME = r"""
		# Here, we define some common false positives and tell the regex to ignore them.
		(?!
			[Aa][Ss][Ss][Oo][Cc][Ii][Aa][Tt][Ee][Dd]\ [Pp][Rr][Ee][Ss][Ss] # associated press
			|
			[Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\ [Oo][Ff]             # university of
		)
		# DIRECTION
		%(CAPTURE_START)s
			(?:
				[NSEWnsew]\.?
				|
				(?:
					[Nn][Oo][Rr][Tt][Hh] |
					[Ss][Oo][Uu][Tt][Hh] |
					[Ee][Aa][Ss][Tt] |
					[Ww][Ee][Ss][Tt] |
					[Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt] |
					[Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt] |
					[Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt] |
					[Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]
				)
				|
				(?:
					N\.?W | S\.?W | N\.?E | S\.?E
				)\.?
			)
			\ +                                        # space (but not newline)
		)?
		(?:
			# STREET NAME
			%(CAPTURE_START)s
				# Numbered street names with a suffix ("3rd", "4th").
				\d+(?:st|ST|nd|ND|rd|RD|th|TH|d|D)

				|

				# Or, numbered street names without a suffix ("3", "4")
				# but with a street type.
				\d+
				(?=
					\ +
					(?:Ave|Avenue|Blvd|Boulevard|Bvd|Cir|Circle|Court|Ct|Dr|Drive|
					   Lane|Ln|Parkway|Pkwy|Place|Plaza|Pl|Plz|Point|Pt|Pts|Rd|Rte|
					   Sq|Sqs|Street|Streets|St|Sts|Terrace|Ter|Terr|Trl|Way|Wy
					)
					\b
				)

				|

				# Or, street names that don't start with numbers.
				(?:
					# Optional prefixes --
					# "St", as in "St Louis"
					# "Dr. Martin", as in "Dr. Martin Luther King"
					(?:
						[Ss][Tt]\.?
						|
						[Dd][Rr]\.?\ [Mm][Aa][Rr][Tt][Ii][Nn]
					)
					\ +
				)?
				(?:
					Mass\.(?=\ +[Aa]ve)  # Special case: "Mass." abbr. for "Massachusetts Ave."
										 # Needs to be special-cased because of the period.
					|
					(?:Avenue|Ave\.?)\ +[A-Z]       # Special case: "Avenue X"
					|
					[A-Z][a-z][A-Za-z]*  # One initial-capped word
					|
					[A-Z]\b              # Single-letter street name (e.g., K St. in DC)
					(?!\.\w)             # Avoid '20 U.S.A.'
				)
			)
			(?:
				# Here, we list the options with street suffixes first, so that
				# the suffix abbreviations are treated as the last part of the
				# street name, to avoid overeagerly capturing "123 Main St. The".
				%(CAPTURE_START)s
					\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					\ +[A-Z][a-z][A-Za-z]*\ (?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){2}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){3}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){4}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){5}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
					|
					(?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){1,5}
				)?
				# OPTIONAL POST-DIR
				(?:
					# Standard post-dir format
					%(CAPTURE_START)s
						,?\s(?:N\.?E|S\.?E|N\.?W|S\.?W|N|S|E|W)\.?
					)
					# Avoid greedily capturing more letters, like
					# '123 Main St, New England' to '123 Main St, N'
					(?![A-Za-z])

					|

					# Or, a special-case for DC quadrants, to find stuff like:
					# "600 H Street in NE Washington"
					# "600 H Street in the NE quadrant"
					# "600 H Street in northeast DC"

					# Note that this is NOT captured, so that it's excluded from
					# the final output.
					,?
					\s in
					%(CAPTURE_START)s
						\s
					)
					(?:
						(?:the|far) \s
					)?

					%(CAPTURE_START)s
						(?:NE|SE|NW|SW|[Nn]ortheast|[Ss]outheast|[Nn]orthwest|[Ss]outhwest)
						(?=
							\s (?:quadrant|D\.?C\.?|Washington)
						)
					)
				)?
			)?
		)
	"""

    STREET_NAME_CAPTURE = STREET_NAME % {"CAPTURE_START": "("}

    STREET_NAME_NOCAPTURE = STREET_NAME % {"CAPTURE_START": "(?:"}
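
    # The CAPTURE_START placeholder at work, in miniature: the same template
    # expands into either a capturing or a non-capturing group, e.g.
    #   r"%(CAPTURE_START)s\d+)" % {"CAPTURE_START": "("}    yields  r"(\d+)"
    #   r"%(CAPTURE_START)s\d+)" % {"CAPTURE_START": "(?:"}  yields  r"(?:\d+)"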

    ADDRESSES_RE = re.compile(
        r"""(?x)
		(?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection.
		\b

		# Ignore things that look like dates -- e.g., "21 May 2009".
		# This is a problem e.g. in cases where there's a May Street.
		(?!
			\d+\s+
			(?:January|February|March|April|May|June|July|August|September|October|November|December)
			,?\s+
			\d\d\d\d
		)

		# Ignore intersections that are prefixed by "University of", like
		# "University of Texas at Austin". This is a common false positive.
		(?<!
			[Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s
		)

		(?:
			# SEGMENT ("FOO BETWEEN BAR AND BAZ")
			(?:
				%(STREET_NAME_CAPTURE)s (,?\ + between \ +) %(STREET_NAME_CAPTURE)s (,?\ + and \ +) %(STREET_NAME_CAPTURE)s
				|
				%(STREET_NAME_CAPTURE)s (,?\ + from \ +) %(STREET_NAME_CAPTURE)s (,?\ + to \ +) %(STREET_NAME_CAPTURE)s
			)

			|

			# BLOCK/ADDRESS
			(?:
				(
					(?:
						(?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ]
							(?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )?
						[Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff]
						|
						\d+\ *-\ *\d+
						|
						\d+
					)
					\ +
				)
				%(STREET_NAME_CAPTURE)s

				# ignore the intersection in parenthesis so that it's not picked
				# up as a separate location. We do this by consuming the string
				# but *not* capturing it.
				(?:
					\ +
					\(?
					between
					\ +
					%(STREET_NAME_NOCAPTURE)s
					\ +
					and
					\ +
					%(STREET_NAME_NOCAPTURE)s
					\)?
				)?
			)

			|

			# INTERSECTION
			(?:
				# Common intersection prefixes. They're included here so that the
				# regex doesn't include them as part of the street name.
				(?:
					(?:
						[Nn]ear |
						[Aa]t |
						[Oo]n |
						[Tt]o |
						[Aa]round |
						[Ii]ntersection\ of |
						[Cc]orner\ of |
						[Aa]rea\ of |
						[Aa]reas?\ surrounding |
						vicinity\ of |
						ran\ down |
						running\ down |
						crossed
					)
					\ +
				)?
				\b
				(?:%(STREET_NAME_CAPTURE)s)
				(\ +)
				(
					(?:
						[Aa][Nn][Dd] |
						[Aa][Tt] |
						[Nn][Ee][Aa][Rr] |
						& |
						[Aa][Rr][Oo][Uu][Nn][Dd] |
						[Tt][Oo][Ww][Aa][Rr][Dd][Ss]? |
						[Oo][Ff][Ff] |
						(?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff] |
						(?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt]
					)
					\ +
				)
				(?:%(STREET_NAME_CAPTURE)s)
			)
		)

		# OPTIONAL CITY SUFFIX
		(?:
			(?:
				,?\s+in |
				,
			)
			\s+

			# CITY NAME
			(
				[A-Z][a-z][A-Za-z]*                   # One initial-capped word
				(?:
					,?\ Jr\.?,?
					|
					\ [A-Z][a-z][A-Za-z]*
					|
					-[A-Za-z]+                        # Hyphenated words (e.g. "Croton-on-Hudson" in NY)
				){0,4}  # Initial-capped words
			)
		)?
		"""
        % {"STREET_NAME_CAPTURE": STREET_NAME_CAPTURE, "STREET_NAME_NOCAPTURE": STREET_NAME_NOCAPTURE}
    )
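
    # parse_addresses is not defined in this snippet; it belongs with the
    # ebdata script cited above. A minimal sketch of what it plausibly does,
    # assuming every capturing group holds an address fragment and the final
    # group holds the optional city (an illustration, not the verbatim ebdata
    # implementation):
    def parse_addresses(text, address_re):
        results = []
        for match in address_re.finditer(text):
            groups = match.groups()
            # Per the regex notes above, captured fragments concatenate into
            # the address string; the last group is the optional city suffix.
            address = "".join([g for g in groups[:-1] if g is not None]).strip()
            if address:
                results.append((address, groups[-1]))
        return results if len(results) > 0 else None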

    addresses = parse_addresses(txt, ADDRESSES_RE)

    if addresses is None:
        print "COULD NOT EXTRACT ADDRESSES."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    asset_path = doc.addAsset(
        addresses,
        "addresses.json",
        as_literal=False,
        description="addresses output from Everyblock address extractor",
        tags=[ASSET_TAGS["ADDRESSES_NLP"], ASSET_TAGS["CP_ENTITIES"]],
    )

    if asset_path is None or not doc.addFile(asset_path, None, sync=True):
        print "COULD NOT SAVE ASSET."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
Example #5
def generatePageMap(uv_task):
	task_tag = "PAGE MAPPER"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "MAPPING PAGES FROM TEXT DOCUMENT at %s" % uv_task.doc_id
	uv_task.setStatus(302)
	
	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=uv_task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	import os, json

	# initialized up front so the None checks below hold even if loading fails
	pages = None
	bow = None

	try:
		page_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
		pages = json.loads(doc.loadFile(os.path.join(doc.base_path, page_path)))
	except Exception as e:
		if DEBUG: print e
	
	try:
		bow_path = doc.getAssetsByTagName(ASSET_TAGS['BOW'])[0]['file_name']
		bow = json.loads(doc.loadFile(os.path.join(doc.base_path, bow_path)))
	except Exception as e:
		if DEBUG: print e
	
	if pages is None or bow is None:
		print "NO PAGES OR BAG OF WORDS"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return
	
	# with unique words in bag that are not stopwords
	# for each page, word count of each	
	from numpy import intersect1d, setdiff1d
	from conf import getConfig
	
	if hasattr(uv_task, "stopwords"):
		stopwords = uv_task.stopwords
	else:	
		stopwords = os.path.join(getConfig('nlp_server.path'), "stopwords.json")
	
	try:
		with open(stopwords, 'rb') as S:
			if hasattr(uv_task, "stopwords_lang"):
				lang = uv_task.stopwords_lang
			else:
				lang = "english"
			
			stopwords = json.loads(S.read())[lang]
				
	except Exception as e:
		print "NO STOPWORDS...\n%s" % e
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		stopwords = []	# fall back to no stopwords rather than the unread file path
	
	page_map = []
	
	print "STOPWORDS: (len %d)\nTOP:\n%s\n" % (len(stopwords), stopwords[:10])
	print "BAG OF WORDS: (len %d)\nTOP:\n%s\n" % (len(bow), bow[:10])
	
	use_words = [w for w in setdiff1d(bow, stopwords).tolist() if len(w) > 1]	
	print "SIFTING BAG OF WORDS (old len: %d, new len: %d)" % (len(bow), len(use_words))
	
	global_info = {}

	for i, p in enumerate(pages):
		if p is None: continue
		
		page_bow = p.lower().split(" ")
		words = intersect1d(use_words, page_bow).tolist()
		if len(words) == 0: continue
		
		word_map = []
		frequency_max = 0

		for word in words:
			word_info = { 'word' : word, 'count' : page_bow.count(word) }

			word_map.append(word_info)
			if word_info['count'] > frequency_max: frequency_max = word_info['count']

			if word not in global_info:
				global_info[word] = 0

			global_info[word] += word_info['count']

		page_map.append({ 'index' : i, 'map' : word_map, 'frequency_max' : frequency_max })
	
	if len(page_map) > 0:
		global_info['uv_page_map'] = page_map
		asset_path = doc.addAsset(global_info, "page_map.json", as_literal=False,
			description="word frequencies, page-by-page", tags=[ASSET_TAGS['PAGE_MAP']])
				
		if asset_path is None or not doc.addFile(asset_path, None, sync=True):
			print "COULD NOT SAVE ASSET."
			print "\n\n************** %s [ERROR] ******************\n" % task_tag
			uv_task.fail()
			return
	
	doc.addCompletedTask(uv_task.task_path)
	uv_task.routeNext()
	
	print "\n\n************** %s [END] ******************\n" % task_tag
	uv_task.finish()
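
The numpy set operations above do the real work: setdiff1d sifts stopwords out of the bag of words, and intersect1d keeps only the sifted words that actually occur on a given page. A toy run of the same idea, with hypothetical word lists standing in for a real document:

from numpy import intersect1d, setdiff1d

bow = ["the", "report", "classified", "report", "embassy"]
stopwords = ["the", "a", "of"]

# Unique bag-of-words entries that are not stopwords.
use_words = setdiff1d(bow, stopwords).tolist()	# ['classified', 'embassy', 'report']

page_text = "the classified report from the embassy"
page_bow = page_text.lower().split(" ")

# Sifted words that occur on this page, with per-page counts.
words = intersect1d(use_words, page_bow).tolist()
word_map = [{ 'word' : w, 'count' : page_bow.count(w) } for w in words]
print word_map	# [{'word': 'classified', 'count': 1}, ...]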