def process_doc(doc, fields=DOCS_FIELDS):
    # field extraction
    output = {
        'metadata': [filter_for_postgres(f.transform(doc)) for f in fields],
        'matches': [],
        'submitter_matches': []
    }

    # entity extraction
    if 'views' in doc and doc['views']:
        for view in doc['views']:
            if 'extracted' in view and view['extracted'] == True:
                for entity_id in match(view['text']).keys():
                    # hack to deal with documents whose scrapes failed but still got extracted
                    object_id = doc['object_id'] if 'object_id' in doc else view['file'].split('/')[-1].split('.')[0]
                    output['matches'].append([doc['document_id'], object_id, view['type'], 'view', entity_id])

    if 'attachments' in doc and doc['attachments']:
        for attachment in doc['attachments']:
            if 'views' in attachment and attachment['views']:
                for view in attachment['views']:
                    if 'extracted' in view and view['extracted'] == True:
                        for entity_id in match(view['text']).keys():
                            output['matches'].append([doc['document_id'], attachment['object_id'], view['type'], 'attachment', entity_id])

    # submitter matches
    for entity_id in match('\n'.join([output['metadata'][7], output['metadata'][8]])).keys():
        output['submitter_matches'].append([doc['document_id'], entity_id])

    return output
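# --- Hedged sketch, not part of the pipeline above ---
# Everything in this file assumes match() returns a dict mapping entity_id -> the
# collection of strings that matched that entity (the callers iterate .keys() and
# listify the values). fake_match is a hypothetical stand-in used only to show that
# shape; the real matcher is the match()/matching.match() called above.
def fake_match(text, multiple=False):
    return {'entity-123': set(['Acme Corp'])} if 'Acme Corp' in text else {}

if __name__ == '__main__':
    hits = fake_match('Comment submitted on behalf of Acme Corp')
    for entity_id in hits.keys():
        print('%s -> %s' % (entity_id, sorted(hits[entity_id])))  # entity-123 -> ['Acme Corp']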
def process_doc(doc):
    # entity extraction
    for view in doc.views:
        if view.extracted == 'yes':
            view_matches = match(get_text(view), multiple=True)
            view.entities = list(view_matches.keys()) if view_matches else []

    for attachment in doc.attachments:
        for view in attachment.views:
            if view.extracted == 'yes':
                view_matches = match(get_text(view), multiple=True)
                view.entities = list(view_matches.keys()) if view_matches else []

    # submitter matches
    # check if there's submitter stuff in the title
    title_match = NAME_FINDER.match(doc.title)

    # next check details, which is where most title stuff lives
    details = doc.details

    # stick "XXXX" between tokens because it doesn't occur in entity names
    submitter_matches = match(' XXXX '.join([
        # organization
        details.get('Organization_Name', ''),

        # submitter name
        ' '.join(filter(bool, [details.get('First_Name', ''), details.get('Last_Name', '')])),

        # submitter representative
        details.get('Submitter_s_Representative', ''),

        # title_match if we found one
        title_match.groupdict()['name'] if title_match else '',

        # just examine the whole title if it's from SEC or CFTC; the title is basically always submitter info
        doc.title if doc.source == 'sec_cftc' and doc.type in ('public_submission', 'other') else ''
    ]))

    doc.submitter_entities = list(submitter_matches.keys()) if submitter_matches else []

    doc.entities_last_extracted = datetime.datetime.now()

    doc.save()

    return True
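# --- Hedged sketch of the ' XXXX ' separator used above (field values are made up) ---
# Joining the submitter fields with a token that never occurs in entity names keeps the
# matcher from seeing a phrase that spans a field boundary.
fields = ['Sierra', 'Club Pacific', 'John Smith']
print(' XXXX '.join([f for f in fields if f]))
# -> 'Sierra XXXX Club Pacific XXXX John Smith'
# with a plain space or newline join, a phrase matcher could read "Sierra Club" across
# the boundary and report an organization that neither field actually names.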
def handle(self, *args, **options):
    reader = csv.reader(open(args[0], 'r'))
    writer = csv.writer(open(args[1], 'w'))

    for (id, text) in reader:
        for entity_id in match(text, multiple=True).keys():
            writer.writerow([id, entity_id])
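# --- Hypothetical usage of the management command above ---
# The input CSV is assumed to have exactly two columns (an id and the text to match);
# the output gets one row per (id, entity_id) pair. The command name is a placeholder.
#
#   $ python manage.py match_csv input.csv output.csv
#
#   input.csv:   doc-1,"Comment submitted on behalf of Acme Corp"
#   output.csv:  doc-1,entity-123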
def contextualize_text(request, pg_id=None):
    # api key test
    if not (hasattr(request, 'apikey') and request.apikey.status == 'A'):
        return HttpResponse("Authorization Required", status=401)

    text = strip_tags(request.REQUEST.get('text', '').strip())
    full_text = str(filter(lambda x: x in string.printable, text))

    if not request.GET.get('multiple', False):
        matches = matching.match(full_text)
        out = {'entities': []}
        for match in matches:
            entity_data = get_entity_data(match)
            if not entity_data:
                continue
            out['entities'].append({
                'matched_text': list(matches[match]),
                'entity_data': entity_data
            })
    else:
        matches = matching.match(full_text, multiple=True)
        items = {}
        for match in matches:
            entity_data = get_entity_data(match)
            if not entity_data:
                continue
            fs = frozenset(matches[match])
            if fs in items:
                items[fs]['entity_data'].append(entity_data)
            else:
                items[fs] = {
                    'matched_text': list(matches[match]),
                    'entity_data': [entity_data]
                }
        out = {'entities': items.values()}

    if 'callback' in request.GET:
        return HttpResponse('%s(%s)' % (request.GET['callback'], json.dumps(out)), 'text/javascript')
    else:
        return HttpResponse(json.dumps(out), mimetype="application/json")
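# --- Sketch of the JSON this view appears to return (values are invented) ---
# default mode: one entry per matched entity
#   {"entities": [{"matched_text": ["Acme Corp"], "entity_data": {...}}]}
# multiple=1: entities whose matched text spans are identical get grouped, so
# entity_data becomes a list
#   {"entities": [{"matched_text": ["Acme"], "entity_data": [{...}, {...}]}]}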
def sender_info(request):
    # api key test
    if not (hasattr(request, 'apikey') and request.apikey.status == 'A'):
        return HttpResponse("Authorization Required", status=401)

    name = request.REQUEST.get('name', '').strip()
    email = request.REQUEST.get('email', '').strip()

    organization = None
    out = {}

    organization = ''
    org_info = None
    if email:
        parts = email.split("@")
        if len(parts) > 1:
            domain = parts[1]

            # if it's a US TLD, just use the actual purchased domain name (not subdomains) for a match
            if re.match(r'.*\.(com|net|org)$', domain):
                domain = '.'.join(domain.split('.')[-2:])

            orgs = lookup_domain(domain)
            if len(orgs) > 1:
                orgs = sorted(orgs, key=lambda org: Levenshtein.ratio(domain, org['name'].lower()), reverse=True)

            if orgs:
                organization = orgs[0]['name']
                matches = matching.match(organization)
                if matches:
                    org_info = get_entity_data(matches.keys()[0])

    results = None
    lat, lon = (None, None)
    geoip = ip_lookup(request.META['REMOTE_ADDR'])
    if geoip is not None:
        lat, lon = geoip

    if not lat or not lon:
        # hard-code DC's info for now so that it still works, since our API can't deal with not having geo data
        lat = '38.895112'
        lon = '-77.036366'

    if name and ' ' in name:
        results = api._get_url_json('contributions/contributor_geo.json', parse_json=True, query=name, lat=lat, lon=lon)

    sender_info = []
    if results:
        for result in results:
            loc = result['contributor_location'].split(', ')
            if len(loc) > 1:
                city = loc[0].split('-')[0]
                state = loc[1].replace(' MSA', '').split('-')[0]
            else:
                sloc = result['contributor_location'].split(' ')
                state = sloc[0]
                city = string.capwords(' '.join(sloc[1:]))

            sender_info.append({
                'name': standardize_name(result['contributor_name'], "individual"),
                'city': city,
                'state': state,
                'total': float(result['amount_total']),
                'dem_total': float(result['amount_democrat']),
                'rep_total': float(result['amount_republican']),
                'other_total': float(result['amount_total']) - (float(result['amount_democrat']) + float(result['amount_republican'])),
                'count': result['count'],
                'url': base64.b64encode(urllib.urlencode({'contributor_ft': name, 'msa_ft': result['contributor_location']}))
            })

    out = {
        'name': name,
        'email': email,
        'sender_info': sender_info,
        'url': base64.b64encode(urllib.urlencode({'contributor_ft': name})),
        'organization': organization,
        'org_info': org_info
    }

    if 'callback' in request.GET:
        return HttpResponse('%s(%s)' % (request.GET['callback'], json.dumps(out)), 'text/javascript')
    else:
        return HttpResponse(json.dumps(out), mimetype="application/json")
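# --- Standalone illustration of the email-domain normalization above ---
# For .com/.net/.org addresses only the registered domain is kept before lookup_domain()
# is called; normalize_domain() is a hypothetical helper, not part of the view.
import re

def normalize_domain(email):
    domain = email.split('@')[1]
    if re.match(r'.*\.(com|net|org)$', domain):
        domain = '.'.join(domain.split('.')[-2:])
    return domain

print(normalize_domain('jane@mail.corp.example.com'))  # -> 'example.com'
print(normalize_domain('jane@agency.example.gov'))     # -> 'agency.example.gov' (other TLDs untouched)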