Example #1
def process_doc(doc, fields=DOCS_FIELDS):
    # field extraction
    output = {
        'metadata': [filter_for_postgres(f.transform(doc)) for f in fields],
        'matches': [],
        'submitter_matches': []
    }
    
    # entity extraction
    if doc.get('views'):
        for view in doc['views']:
            if view.get('extracted') is True:
                for entity_id in match(view['text']).keys():
                    # hack to deal with documents whose scrapes failed but still got extracted
                    object_id = doc['object_id'] if 'object_id' in doc else view['file'].split('/')[-1].split('.')[0]
                    output['matches'].append([doc['document_id'], object_id, view['type'], 'view', entity_id])
    if doc.get('attachments'):
        for attachment in doc['attachments']:
            if attachment.get('views'):
                for view in attachment['views']:
                    if view.get('extracted') is True:
                        for entity_id in match(view['text']).keys():
                            output['matches'].append([doc['document_id'], attachment['object_id'], view['type'], 'attachment', entity_id])
    
    # submitter matches: metadata[7] and metadata[8] are assumed to hold the
    # submitter fields (their positions are fixed by the order of DOCS_FIELDS)
    for entity_id in match('\n'.join([output['metadata'][7], output['metadata'][8]])).keys():
        output['submitter_matches'].append([doc['document_id'], entity_id])
    
    return output
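
All of these snippets lean on a match() helper whose implementation isn't shown. The call sites pin down its contract, though: it takes text (plus an optional multiple flag), returns a mapping of entity id to the set of strings that matched, and returns something falsy when nothing matched. A minimal stand-in consistent with that usage, useful for exercising the functions here — ENTITY_NAMES, the substring test, and the single-match semantics are all assumptions, not the real matcher:

def match(text, multiple=False):
    # hypothetical stub: returns {entity_id: set(matched_strings)},
    # or {} when nothing matches
    results = {}
    for entity_id, names in ENTITY_NAMES.items():  # assumed lookup table
        hits = set(name for name in names if name in text)
        if hits:
            results[entity_id] = hits
            if not multiple:
                # single-match mode: stop at the first matching entity (assumed)
                break
    return results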
Example #2
def process_doc(doc):
    # entity extraction
    for view in doc.views:
        if view.extracted == 'yes':
            view_matches = match(get_text(view), multiple=True)
            view.entities = list(view_matches.keys()) if view_matches else []

    for attachment in doc.attachments:
        for view in attachment.views:
            if view.extracted == 'yes':
                view_matches = match(get_text(view), multiple=True)
                view.entities = list(view_matches.keys()) if view_matches else []

    # submitter matches
    #   check if there's submitter stuff in the title
    title_match = NAME_FINDER.match(doc.title)

    #   next check details, which is where most submitter stuff lives
    details = doc.details
    #   stick "XXXX" between tokens because it doesn't occur in entity names
    submitter_matches = match(' XXXX '.join([
        # organization
        details.get('Organization_Name', ''),

        # submitter name
        ' '.join(filter(bool, [details.get('First_Name', ''),
                               details.get('Last_Name', '')])),

        # submitter representative
        details.get('Submitter_s_Representative', ''),

        # title_match if we found one
        title_match.groupdict()['name'] if title_match else '',

        # just examine the whole title if it's from SEC or CFTC; the title is basically always submitter info
        doc.title if doc.source == 'sec_cftc'
        and doc.type in ('public_submission', 'other') else ''
    ]))
    doc.submitter_entities = list(submitter_matches.keys()) if submitter_matches else []

    doc.entities_last_extracted = datetime.datetime.now()

    doc.save()

    return True
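
Why the ' XXXX ' separator? Joining the fields with plain whitespace can splice the tail of one field onto the head of the next and produce a phantom entity; since the marker token never occurs in entity names, it breaks any match that would otherwise span a field boundary. A made-up illustration:

# two unrelated fields, joined naively, invite a false 'General Electric' hit
' '.join(['Attorney General', 'Electric Boat'])
#   -> 'Attorney General Electric Boat'
' XXXX '.join(['Attorney General', 'Electric Boat'])
#   -> 'Attorney General XXXX Electric Boat'  (no cross-field span survives)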
Example #3
    def handle(self, *args, **options):
        # args[0]: input CSV of (id, text) rows; args[1]: output CSV of (id, entity_id) rows
        with open(args[0], 'r') as infile, open(args[1], 'w') as outfile:
            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            for (doc_id, text) in reader:
                # match() can return a falsy value when nothing is found
                for entity_id in (match(text, multiple=True) or {}).keys():
                    writer.writerow([doc_id, entity_id])
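
The handle signature marks this as a Django management command, so it would be invoked with the input and output paths as positional arguments; the command name here is hypothetical:

python manage.py match_entities input.csv output.csv

Each input row is (id, text); one output row is written per (id, entity_id) pair found in that text.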
Example #5
def contextualize_text(request, pg_id=None):
    # api key test
    if not (hasattr(request, 'apikey') and request.apikey.status == 'A'):
        return HttpResponse("Authorization Required", status=401)

    text = strip_tags(request.REQUEST.get('text', '').strip())

    # keep only printable characters; ''.join works on both Python 2 and 3,
    # where str(filter(...)) does not
    full_text = ''.join(filter(lambda x: x in string.printable, text))

    if not request.GET.get('multiple', False):
        matches = matching.match(full_text)
        out = {'entities': []}
        for entity_id in matches:
            entity_data = get_entity_data(entity_id)
            if not entity_data:
                continue
            out['entities'].append({
                'matched_text': list(matches[entity_id]),
                'entity_data': entity_data
            })
    else:
        matches = matching.match(full_text, multiple=True)
        items = {}
        for entity_id in matches:
            entity_data = get_entity_data(entity_id)
            if not entity_data:
                continue

            # group entities that matched on exactly the same set of strings
            fs = frozenset(matches[entity_id])
            if fs in items:
                items[fs]['entity_data'].append(entity_data)
            else:
                items[fs] = {
                    'matched_text': list(matches[entity_id]),
                    'entity_data': [entity_data]
                }
        out = {'entities': list(items.values())}

    if 'callback' in request.GET:
        return HttpResponse(
            '%s(%s)' % (request.GET['callback'], json.dumps(out)),
            'text/javascript')
    else:
        return HttpResponse(json.dumps(out), mimetype="application/json")
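
A sketch of the endpoint's behavior, with a hypothetical URL and illustrative values. With a callback parameter the JSON is wrapped for JSONP; without one, the same object comes back as plain application/json:

GET /contextualize_text/?text=...&callback=showEntities

showEntities({"entities": [{"matched_text": ["Acme Corp"],
                            "entity_data": {...}}]})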
Example #8
def sender_info(request):
    # api key test
    if not (hasattr(request, 'apikey') and request.apikey.status == 'A'):
        return HttpResponse("Authorization Required", status=401)
    
    name = request.REQUEST.get('name', '').strip()
    email = request.REQUEST.get('email', '').strip()

    organization = ''
    org_info = None
    if email:
        parts = email.split("@")
        if len(parts) > 1:
            domain = parts[1]
            
            # for common commercial TLDs, match on the registrable domain rather than any subdomain
            if re.match(r'.*\.(com|net|org)$', domain):
                domain = '.'.join(domain.split('.')[-2:])
            
            orgs = lookup_domain(domain)
            
            if len(orgs) > 1:
                # prefer the org whose name most closely resembles the domain
                orgs = sorted(orgs, key=lambda org: Levenshtein.ratio(domain, org['name'].lower()), reverse=True)
            
            if orgs:
                organization = orgs[0]['name']
                matches = matching.match(organization)
                if matches:
                    # next(iter(...)) rather than .keys()[0], which is Python 2-only
                    org_info = get_entity_data(next(iter(matches)))
    
    results = None
    
    lat, lon = (None, None)
    geoip = ip_lookup(request.META['REMOTE_ADDR'])
    if geoip is not None:
        lat, lon = geoip
    
    if not lat or not lon:
        # fall back to Washington, DC's coordinates for now: the API
        # can't handle requests without geo data
        lat = '38.895112'
        lon = '-77.036366'
    
    if name and ' ' in name:
        # only query when the name looks like a full name (first + last)
        results = api._get_url_json('contributions/contributor_geo.json',
                                    parse_json=True, query=name, lat=lat, lon=lon)
    
    sender_info = []
    if results:
        for result in results:
            # contributor_location arrives in two shapes:
            #   'City[-City...], ST[-ST...] MSA'  or  'ST City Words'
            loc = result['contributor_location'].split(', ')
            if len(loc) > 1:
                city = loc[0].split('-')[0]
                state = loc[1].replace(' MSA', '').split('-')[0]
            else:
                sloc = result['contributor_location'].split(' ')
                state = sloc[0]
                city = string.capwords(' '.join(sloc[1:]))
            sender_info.append({
                'name': standardize_name(result['contributor_name'], "individual"),
                'city': city,
                'state': state,
                'total': float(result['amount_total']),
                'dem_total': float(result['amount_democrat']),
                'rep_total': float(result['amount_republican']),
                'other_total': float(result['amount_total']) - (float(result['amount_democrat']) + float(result['amount_republican'])),
                'count': result['count'],
                'url': base64.b64encode(urllib.urlencode({'contributor_ft': name, 'msa_ft': result['contributor_location']}))
            })
    
    out = {
        'name': name,
        'email': email,
        'sender_info': sender_info,
        'url': base64.b64encode(urllib.urlencode({'contributor_ft': name})),
        'organization': organization,
        'org_info': org_info
    }
    
    if 'callback' in request.GET:
        return HttpResponse('%s(%s)' % (request.GET['callback'], json.dumps(out)), 'text/javascript')
    else:
        return HttpResponse(json.dumps(out), mimetype="application/json")
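
The domain normalization in sender_info collapses subdomains only for .com/.net/.org addresses and passes everything else through unchanged. A quick, self-contained check of that behavior:

import re

def registrable_domain(domain):
    # same regex and trimming logic as sender_info above
    if re.match(r'.*\.(com|net|org)$', domain):
        domain = '.'.join(domain.split('.')[-2:])
    return domain

registrable_domain('mail.example.com')    # -> 'example.com'
registrable_domain('lists.example.edu')   # -> 'lists.example.edu' (left as-is)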