def parse_individual(emitter, node):
    """Convert one individual record into a Person plus related entities.

    Reads name parts, title, designation, aliases, addresses, identity
    documents, nationalities and birth details from ``node`` and emits one
    ``Passport`` per non-empty document, then the person, then the sanction
    returned by ``parse_common``.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param node: XML element describing the individual.
    """
    person = emitter.make('Person')
    sanction = parse_common(emitter, person, node)
    person.add('title', values(node.find('./TITLE')))

    first = node.findtext('./FIRST_NAME')
    second = node.findtext('./SECOND_NAME')
    third = node.findtext('./THIRD_NAME')
    person.add('name', jointext(first, second, third))
    person.add('firstName', first)
    person.add('secondName', second)
    # The third name part is stored as the middle name.
    person.add('middleName', third)
    person.add('position', values(node.find('./DESIGNATION')))

    for alias_node in node.findall('./INDIVIDUAL_ALIAS'):
        parse_alias(person, alias_node)

    for address_node in node.findall('./INDIVIDUAL_ADDRESS'):
        parse_address(person, address_node)

    for doc_node in node.findall('./INDIVIDUAL_DOCUMENT'):
        passport = emitter.make('Passport')
        number = doc_node.findtext('./NUMBER')
        issued = doc_node.findtext('./DATE_OF_ISSUE')
        kind = doc_node.findtext('./TYPE_OF_DOCUMENT')
        # Skip placeholder documents that carry none of the key fields.
        if all(part is None for part in (number, issued, kind)):
            continue
        passport.make_id(person.id, number, issued, kind)
        passport.add('holder', person)
        passport.add('passportNumber', number)
        passport.add('startDate', issued)
        passport.add('type', kind)
        passport.add('type', doc_node.findtext('./TYPE_OF_DOCUMENT2'))
        passport.add('summary', doc_node.findtext('./NOTE'))
        # Two different element names are checked for the issuing country.
        issuer = (doc_node.findtext('./COUNTRY_OF_ISSUE')
                  or doc_node.findtext('./ISSUING_COUNTRY'))
        passport.add('country', normalize_country(issuer))
        emitter.emit(passport)

    for nationality in node.findall('./NATIONALITY/VALUE'):
        person.add('nationality', normalize_country(nationality.text))

    for birth in node.findall('./INDIVIDUAL_DATE_OF_BIRTH'):
        # Fall back to the bare year when no full date is given.
        person.add('birthDate',
                   birth.findtext('./DATE') or birth.findtext('./YEAR'))

    for birth_place in node.findall('./INDIVIDUAL_PLACE_OF_BIRTH'):
        birth_country = birth_place.findtext('./COUNTRY')
        person.add('country', normalize_country(birth_country))
        person.add('birthPlace', jointext(
            birth_place.findtext('./CITY'),
            birth_place.findtext('./STATE_PROVINCE'),
            birth_country,
            sep=', ',
        ))

    emitter.emit(person)
    emitter.emit(sanction)
def parse(context, data):
    """Emit one Person per office holder listed on a country page.

    Looks for the ``div`` with id ``countryOutput`` in the HTML response
    obtained through ``context.http.rehash(data)`` and walks its ``li``
    items. Items lacking either a ``title`` span or a ``cos_name`` span are
    skipped. Returns early, emitting nothing, when the container is absent.

    :param context: crawler context providing ``http`` access.
    :param data: dict with at least the ``url`` and ``country`` keys.
    """
    emitter = EntityEmitter(context)
    source_url = data.get('url')
    country = normalize_country(data.get('country'))

    with context.http.rehash(data) as res:
        output = res.html.find('.//div[@id="countryOutput"]')
        if output is None:
            return

        for item in output.findall('.//li'):
            position = element_text(item.find('.//span[@class="title"]'))
            if position is None:
                continue
            holder = element_text(item.find('.//span[@class="cos_name"]'))
            if holder is None:
                continue

            person = emitter.make('Person')
            person.make_id(country, holder, position)
            person.add('name', holder)
            person.add('country', country)
            person.add('position', position)
            person.add('sourceUrl', source_url)
            emitter.emit(person)
def parse(context, data):
    """Emit a LegalEntity and Sanction for each row of the debarment table.

    Only tables whose ``summary`` attribute contains ``List of Debarred``
    are read, and only rows with exactly six ``td`` cells. The cells are
    used positionally: name(s), address, country, start date, end date and
    program. Rows for which ``clean_name`` yields no name are logged and
    skipped.

    :param context: crawler context providing ``http`` and ``log``.
    :param data: dict describing the fetched page; ``url`` is recorded as
        the sanction's source URL.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for table in res.html.findall('.//table'):
            if 'List of Debarred' not in table.get('summary', ''):
                continue
            for row in table.findall('.//tr'):
                tds = row.findall('./td')
                if len(tds) != 6:
                    continue
                cells = [clean_value(td) for td in tds]
                entity = emitter.make('LegalEntity')
                entity.make_id(*cells)
                names = clean_name(cells[0])
                if not names:
                    context.log.warning("No name: %r", cells)
                    continue
                entity.add('name', names[0])
                entity.add('address', cells[1])
                entity.add('country', normalize_country(cells[2]))
                # Any further names found in the first cell become aliases.
                for alias in names[1:]:
                    entity.add('alias', alias)

                sanction = emitter.make('Sanction')
                sanction.make_id('Sanction', entity.id)
                # Link the sanction to its target; without this the emitted
                # sanction does not reference any entity.
                sanction.add('entity', entity)
                # Label kept verbatim so already-emitted data stays stable.
                sanction.add('authority', 'World Bank Debarrment')
                sanction.add('program', cells[5])
                sanction.add('startDate', clean_date(cells[3]))
                sanction.add('endDate', clean_date(cells[4]))
                sanction.add('sourceUrl', data.get('url'))
                emitter.emit(entity)
                emitter.emit(sanction)
def parse(context, data):
    """Emit a LegalEntity and Sanction for one debarred-supplier record.

    ``data`` is read through the keys ``SUPP_NAME``, ``SUPP_ID``,
    ``DEBAR_REASON``, ``COUNTRY_NAME``, ``SUPP_CITY``, ``SUPP_ADDR``,
    ``DEBAR_FROM_DATE`` and ``DEBAR_TO_DATE``. When ``clean_name`` yields no
    usable name the record is logged and skipped instead of raising
    ``IndexError``. ``emitter.finalize()`` is called in every case.

    :param context: crawler context providing ``log``.
    :param data: dict holding one supplier record.
    """
    emitter = EntityEmitter(context)

    name = data.get('SUPP_NAME')
    ent_id = data.get('SUPP_ID')
    reason = data.get('DEBAR_REASON')
    country = data.get('COUNTRY_NAME')
    city = data.get('SUPP_CITY')
    address = data.get('SUPP_ADDR')
    start_date = data.get('DEBAR_FROM_DATE')
    end_date = data.get('DEBAR_TO_DATE')

    entity = emitter.make('LegalEntity')
    entity.make_id(name, ent_id, country)
    names = clean_name(name)
    if not names:
        # Nothing to name the entity with: skip the record.
        context.log.warning("No name: %r", data)
        emitter.finalize()
        return

    entity.add('name', names[0])
    entity.add('address', address)
    entity.add('address', city)
    entity.add('country', normalize_country(country))
    # Any further names produced by clean_name become aliases.
    for alias in names[1:]:
        entity.add('alias', alias)

    sanction = emitter.make('Sanction')
    sanction.make_id('Sanction', entity.id)
    # Link the sanction to its target; without this the emitted sanction
    # does not reference any entity.
    sanction.add('entity', entity)
    # Label kept verbatim so already-emitted data stays stable.
    sanction.add('authority', 'World Bank Debarrment')
    sanction.add('program', reason)
    sanction.add('startDate', clean_date(start_date))
    sanction.add('endDate', clean_date(end_date))
    sanction.add('sourceUrl', SOURCE)

    emitter.emit(entity)
    emitter.emit(sanction)
    emitter.finalize()
def parse_address(entity, addr):
    """Add a one-line address and its country to ``entity``.

    Does nothing when the address element contains no text at all.
    Otherwise the note, street, city, state/province and country are joined
    with ``', '`` and stored as ``address``; the country is additionally
    stored as ``country`` after passing through ``normalize_country``.

    :param entity: entity receiving the ``address`` and ``country`` values.
    :param addr: address element; must support ``xpath`` and ``findtext``.
    """
    if not addr.xpath('string()').strip():
        return

    country = addr.findtext('./COUNTRY')
    parts = (
        addr.findtext('./NOTE'),
        addr.findtext('./STREET'),
        addr.findtext('./CITY'),
        addr.findtext('./STATE_PROVINCE'),
        country,
    )
    entity.add('address', jointext(*parts, sep=', '))
    entity.add('country', normalize_country(country))
def officer(context, data):
    """Fetch one disqualified officer and emit a Person with Sanctions.

    Requests ``API_URL % officer_id`` with ``AUTH`` and returns without
    emitting anything unless the response status is 200. One ``Sanction``
    is emitted per entry in ``disqualifications``; the address attached to
    each entry is added to the person. The person is emitted last.

    :param context: crawler context providing ``http``.
    :param data: dict carrying the ``officer_id`` to look up.
    """
    emitter = EntityEmitter(context)
    officer_id = data.get('officer_id')

    with context.http.get(API_URL % officer_id, auth=AUTH) as res:
        if res.status_code != 200:
            return
        record = res.json

        person = emitter.make('Person')
        person.make_id(officer_id)
        self_link = record.get('links', {}).get('self', '/')
        person.add('sourceUrl', urljoin(WEB_URL, self_link))

        surname = record.pop('surname', None)
        forename = record.pop('forename', None)
        other_forenames = record.pop('other_forenames', None)
        person.add('lastName', surname)
        person.add('firstName', forename)
        person.add('middleName', other_forenames)
        person.add('name', jointext(forename, other_forenames, surname))
        person.add('title', record.pop('title', None))
        person.add('nationality',
                   normalize_country(record.pop('nationality', None)))
        person.add('birthDate', record.pop('date_of_birth', None))

        for disqual in record.pop('disqualifications', []):
            case = disqual.get('case_identifier')
            sanction = emitter.make('Sanction')
            sanction.make_id(person.id, case)
            sanction.add('entity', person)
            sanction.add('authority', 'UK Companies House')
            sanction.add('program', case)
            sanction.add('startDate',
                         disqual.pop('disqualified_from', None))
            sanction.add('endDate',
                         disqual.pop('disqualified_until', None))
            emitter.emit(sanction)

            # Each disqualification carries its own address for the person.
            addr = disqual.pop('address', {})
            town = jointext(addr.get('locality'), addr.get('postal_code'))
            street = jointext(addr.get('address_line_1'),
                              addr.get('premises'))
            person.add('address', jointext(
                street,
                addr.get('address_line_2'),
                town,
                addr.get('region'),
                sep=', ',
            ))

        emitter.emit(person)
def parse_entry(emitter, entry):
    """Emit a Person for one member entry of the listing page.

    The member's link supplies the name (expected as ``Last, First``) and
    the profile URL; the text after the last ``=`` in that URL is used as
    the identifier. The entry must contain exactly two ``infos`` spans:
    the first is stored as ``summary``, the second as ``nationality``.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param entry: HTML element for one member.
    :raises ValueError: if the URL has no ``=``, the name has no ``', '``
        or the number of ``infos`` spans is not two.
    """
    anchor = entry.find('.//a')
    profile_url = urljoin(URL, anchor.get('href'))
    _, member_id = profile_url.rsplit('=', 1)

    person = emitter.make('Person')
    person.make_id(member_id)
    person.add('name', anchor.text)
    person.add('sourceUrl', profile_url)

    family_name, given_name = anchor.text.split(', ', 1)
    person.add('lastName', family_name)
    person.add('firstName', given_name)
    person.add('position', entry.findtext('.//span[@class="fonction"]'))

    role_span, country_span = entry.findall('.//span[@class="infos"]')
    person.add('summary', role_span.text_content().strip())
    person.add('nationality',
               normalize_country(country_span.text_content().strip()))
    person.add('keywords', ['PEP', 'PACE'])
    emitter.emit(person)
def parse_entry(emitter, node):
    """Emit the entity and sanction described by one list entry.

    An entry with an ``Entity`` element becomes a ``LegalEntity``; any
    other entry becomes a ``Person``. Given name, last name, date of birth
    and aliases are added when present. The entity and its sanction are
    always emitted, including when the entry has no ``Aliases`` element
    (previously such entries were dropped by an early return).

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param node: XML element for one entry.
    """
    # ids are per country and entry type (individual/entity)
    country = node.findtext('./Country')
    if country is not None and ' / ' in country:
        # Keep only the part before the first ' / ' separator; this also
        # tolerates values containing more than one separator.
        country = country.split(' / ', 1)[0]
    country_code = normalize_country(country)
    entity_name = node.findtext('./Entity')
    item = node.findtext('.//Item')

    entity = emitter.make('LegalEntity')
    if entity_name is None:
        entity = emitter.make('Person')
    entity.make_id(country, entity_name, item)
    entity.add('name', entity_name)
    entity.add('country', country_code)

    sanction = emitter.make('Sanction')
    sanction.make_id(entity.id)
    sanction.add('entity', entity)
    sanction.add('authority', 'Canadian international sanctions')
    sanction.add('program', node.findtext('.//Schedule'))

    given_name = node.findtext('.//GivenName')
    entity.add('firstName', given_name, quiet=True)
    last_name = node.findtext('.//LastName')
    entity.add('lastName', last_name, quiet=True)
    entity.add('name', jointext(given_name, last_name))

    dob = node.findtext('.//DateOfBirth')
    if dob is not None:
        # Reverse the '/'-separated parts and re-join them with '-'.
        dob = '-'.join(reversed(dob.split('/')))
        entity.add('birthDate', dob, quiet=True)

    aliases = node.findtext('.//Aliases')
    if aliases is not None:
        for alias in aliases.split(', '):
            entity.add('alias', collapse_spaces(alias))

    emitter.emit(entity)
    emitter.emit(sanction)
def parse_node(emitter, node):
    """Emit a Person for one MEP plus party and group memberships.

    The person is emitted first. A national party organization and
    membership follow unless the party name is ``Independent``; the
    political group organization (country ``eu``) and membership are
    always emitted.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param node: XML element describing one MEP.
    """
    mep_id = node.findtext('.//id')
    full_name = node.findtext('.//fullName')

    person = emitter.make("Person")
    person.make_id(mep_id)
    person.add("name", full_name)
    person.add("sourceUrl",
               'http://www.europarl.europa.eu/meps/en/%s' % mep_id)
    first_name, last_name = split_name(full_name)
    person.add("firstName", first_name)
    person.add("lastName", last_name)
    country = normalize_country(node.findtext('.//country'))
    person.add("nationality", country)
    person.add("keywords", ['PEP', 'MEP'])
    emitter.emit(person)

    def emit_affiliation(kind, org_name, org_country):
        # Emit the organization, then the membership tying it to the person.
        org = emitter.make('Organization')
        org.make_id(kind, org_name)
        org.add('name', org_name)
        org.add('country', org_country)
        emitter.emit(org)
        membership = emitter.make('Membership')
        membership.make_id(person.id, org.id)
        membership.add('member', person)
        membership.add('organization', org)
        emitter.emit(membership)

    party_name = node.findtext('.//nationalPoliticalGroup')
    if party_name not in ['Independent']:
        emit_affiliation('nationalPoliticalGroup', party_name, country)

    emit_affiliation('politicalGroup',
                     node.findtext('.//politicalGroup'),
                     'eu')
def parse_reference(emitter, reference, rows):
    """Merge all rows sharing one reference into an entity and sanction.

    Every row contributes a name or alias, address, notes, program,
    nationality, birth details, status and a modification date. A row of
    type ``Individual`` switches the entity schema to ``Person``. The
    entity and sanction are emitted once, after all rows are consumed.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param reference: identifier shared by ``rows``; used as the entity id.
    :param rows: iterable of dicts; consumed destructively via ``pop``.
    """
    authority = 'Australian Department of Foreign Affairs and Trade Consolidated Sanctions'  # noqa
    # control_date is an integer day count; the "- 2" offset below maps it
    # onto a calendar date relative to 1900-01-01 (spreadsheet-style serial
    # date, presumably - confirm against the source file).
    base_ordinal = datetime(1900, 1, 1).toordinal()

    entity = emitter.make('LegalEntity')
    entity.make_id(reference)
    entity.add('sourceUrl', URL)

    sanction = emitter.make('Sanction')
    sanction.make_id(entity.id)
    sanction.add('authority', authority)
    sanction.add('entity', entity)

    for row in rows:
        if row.pop('type') == 'Individual':
            entity.schema = model.get('Person')

        label = row.pop('name_of_individual_or_entity', None)
        name_prop = 'alias' if row.pop('name_type') == 'aka' else 'name'
        entity.add(name_prop, label)

        entity.add('address', row.pop('address'))
        entity.add('notes', row.pop('additional_information'))
        sanction.add('program', row.pop('committees'))
        entity.add('nationality',
                   normalize_country(row.pop('citizenship')),
                   quiet=True)
        entity.add('birthDate', row.pop('date_of_birth'), quiet=True)
        entity.add('birthPlace', row.pop('place_of_birth'), quiet=True)
        entity.add('status', row.pop('listing_information'), quiet=True)

        serial = int(row.pop('control_date'))
        modified = datetime.fromordinal(base_ordinal + serial - 2).date()
        sanction.add('modifiedAt', modified)
        entity.add('modifiedAt', modified)

    emitter.emit(entity)
    emitter.emit(sanction)
def parse_row(emitter, row):
    """Emit a LegalEntity and its Sanction for one row of the list.

    The entity id is derived from the effective date and name; the
    sanction id from the entity id and the ``FR_Citation`` value. The
    entity is emitted before the sanction is built.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param row: mapping with the row's column values.
    """
    name = row.get('Name')
    effective_date = row.get('Effective_Date')
    citation = row.get('FR_Citation')

    entity = emitter.make('LegalEntity')
    entity.make_id(effective_date, name)
    entity.add('name', name)
    entity.add('notes', row.get('Action'))
    entity.add('country', normalize_country(row.get('Country')))
    address_parts = (
        row.get('Street_Address'),
        row.get('Postal_Code'),
        row.get('City'),
        row.get('State'),
    )
    entity.add('address', jointext(*address_parts, sep=', '))
    emitter.emit(entity)

    sanction = emitter.make('Sanction')
    sanction.make_id(entity.id, citation)
    sanction.add('entity', entity)
    sanction.add('program', citation)
    sanction.add('authority', 'US Bureau of Industry and Security')
    sanction.add('country', 'us')
    sanction.add('startDate', effective_date)
    emitter.emit(sanction)
def parse_entry(emitter, group, rows):
    """Merge all rows of one group into a single entity and sanction.

    The first row's joined name becomes the entity name; names from later
    rows become aliases. A row whose ``Group Type`` is ``Individual``
    switches the entity schema to ``Person``. The entity and sanction are
    emitted once, after all rows are consumed.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param group: group identifier; used as the entity id.
    :param rows: iterable of dicts; consumed destructively via ``pop``.
    :raises KeyError: if a row lacks a column that is popped without a
        default (e.g. ``Name 1`` or ``Country of Birth``).
    """
    entity = emitter.make('LegalEntity')
    entity.make_id(group)
    sanction = emitter.make('Sanction')
    sanction.make_id(entity.id, 'Sanction')
    sanction.add('entity', entity)
    sanction.add('authority', 'HM Treasury Financial sanctions targets')
    sanction.add('country', 'gb')

    for row in rows:
        if row.pop('Group Type') == 'Individual':
            entity.schema = model.get('Person')
        row.pop('Alias Type', None)

        # Columns 'Name 1' through 'Name 6'; the first is stored as the
        # first name and the sixth as the last name.
        name_parts = [row.pop('Name %d' % index) for index in range(1, 7)]
        entity.add('firstName', name_parts[0], quiet=True)
        entity.add('lastName', name_parts[-1], quiet=True)
        name = jointext(*name_parts)
        if not entity.has('name'):
            entity.add('name', name)
        else:
            entity.add('alias', name)

        entity.add('title', row.pop('Title'), quiet=True)
        sanction.add('program', row.pop('Regime'))
        last_updated = parse_date(row.pop('Last Updated'))
        sanction.add('modifiedAt', last_updated)
        sanction.add('startDate', parse_date(row.pop('Listed On')))
        entity.add('modifiedAt', last_updated)
        entity.add('position', row.pop('Position'), quiet=True)
        entity.add('notes', row.pop('Other Information'), quiet=True)
        entity.add('birthDate', parse_date(row.pop('DOB')), quiet=True)

        nationality = normalize_country(row.pop('Nationality', None))
        entity.add('nationality', nationality, quiet=True)

        country = row.pop('Country', None)
        entity.add('country', normalize_country(country))

        # Columns 'Address 1' through 'Address 6', then postcode, country.
        address_parts = [row.pop('Address %d' % index, None)
                         for index in range(1, 7)]
        address_parts.append(row.pop('Post/Zip Code', None))
        address_parts.append(country)
        entity.add('address', jointext(*address_parts), quiet=True)

        passport = row.pop('Passport Details', None)
        entity.add('passportNumber', passport, quiet=True)

        national_id = row.pop('NI Number', None)
        entity.add('nationalId', national_id, quiet=True)

        # Each country of birth is recorded as a country of the entity.
        for birth_country in split_items(row.pop('Country of Birth')):
            entity.add('country', normalize_country(birth_country))

        for town in split_items(row.pop('Town of Birth', None)):
            entity.add('birthPlace', town)

    emitter.emit(entity)
    emitter.emit(sanction)
def parse_entry(emitter, entry):
    """Emit the entity, passports and sanction for one list entry.

    An entry with ``type-entry`` equal to ``2`` becomes a ``Person``; any
    other entry a ``LegalEntity``. Each ``aka-list`` element supplies name
    parts and is stored as ``name`` when its type is ``N``, as
    ``weakAlias`` when its quality is ``2``, and as ``alias`` otherwise.
    One ``Passport`` is emitted per ``document-list`` element, then the
    entity, then the sanction.

    :param emitter: factory/sink used to ``make`` and ``emit`` entities.
    :param entry: XML element for one entry.
    """
    schema = 'Person' if entry.findtext('./type-entry') == '2' else None
    entity = emitter.make('LegalEntity')
    if schema is not None:
        entity = emitter.make(schema)
    entity.make_id(entry.findtext('number-entry'))

    sanction = emitter.make('Sanction')
    sanction.make_id('Sanction', entity.id)
    sanction.add('entity', entity)
    sanction.add('authority',
                 'State Financial Monitoring Service of Ukraine')
    source_url = 'http://www.sdfm.gov.ua/articles.php?cat_id=87&lang=en'
    sanction.add('sourceUrl', source_url)
    sanction.add('program', entry.findtext('./program-entry'))
    listed = entry.findtext('./date-entry')
    if listed:
        # The listing date is given as a compact YYYYMMDD string.
        sanction.add('startDate',
                     datetime.strptime(listed, '%Y%m%d').date())

    for aka in entry.findall('./aka-list'):
        first = aka.findtext('./aka-name1')
        second = aka.findtext('./aka-name2')
        third = aka.findtext('./aka-name3')
        last = aka.findtext('./aka-name4')
        entity.add('firstName', first, quiet=True)
        entity.add('secondName', second, quiet=True)
        entity.add('middleName', third, quiet=True)
        entity.add('lastName', last, quiet=True)

        joined = jointext(first, second, third, last)
        if aka.findtext('type-aka') == 'N':
            entity.add('name', joined)
        elif aka.findtext('./quality-aka') == '2':
            entity.add('weakAlias', joined)
        else:
            entity.add('alias', joined)

    for title in entry.findall('./title-list'):
        entity.add('title', title.text, quiet=True)

    for document in entry.findall('./document-list'):
        reg = document.findtext('./document-reg')
        number = document.findtext('./document-id')
        issuer = normalize_country(document.findtext('./document-country'))
        passport = emitter.make('Passport')
        passport.make_id('Passport', entity.id, reg, number, issuer)
        passport.add('holder', entity)
        passport.add('passportNumber', number)
        passport.add('summary', reg)
        passport.add('country', issuer)
        emitter.emit(passport)

    for id_number in entry.findall('./id-number-list'):
        entity.add('idNumber', id_number.text)

    for address in entry.findall('./address-list'):
        entity.add('address', address.findtext('./address'))

    for birth_place in entry.findall('./place-of-birth-list'):
        entity.add('birthPlace', birth_place.text, quiet=True)

    for birth_date in entry.findall('./date-of-birth-list'):
        entity.add('birthDate', parse_date(birth_date.text), quiet=True)

    for nationality in entry.findall('./nationality-list'):
        entity.add('nationality',
                   normalize_country(nationality.text),
                   quiet=True)

    emitter.emit(entity)
    emitter.emit(sanction)