def parse_case(emit, case):
    url = case.get('url')
    name = combine_name(*reversed(case.get('name').split(', ')))
    updated = dateutil_parse(case.get('last_updated'))
    record = {
        'uid': make_id('interpol', url.split('/')[-1]),
        'source_url': url,
        'name': name,
        'summary': case.get('reason'),
        'updated_at': updated.date().isoformat(),
        'place_of_birth': case.get('place_of_birth'),
        'gender': case.get('sex', '').lower(),
        'first_name': case.get('forename'),
        'last_name': case.get('present_family_name'),
        'nationality': normalize_country(case.get('nationality')),
        'identities': [],
        'addresses': [],
        'other_names': []
    }
    record.update(SOURCE)
    # Dates of birth appear either as a bare year or as day/month/year;
    # try the year-only format first, then fall back to the full date.
    birth = case.get('date_of_birth').split(' ')[0]
    try:
        dt = datetime.strptime(birth, '%Y').date().isoformat()
        record['date_of_birth'] = dt
    except Exception:
        try:
            dt = datetime.strptime(birth, '%d/%m/%Y').date().isoformat()
            record['date_of_birth'] = dt
        except Exception as ex:
            log.exception(ex)
    emit.entity(record)
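# The parsers in this listing rely on a few shared helpers (`make_id`,
# `combine_name`, `normalize_country`, `parse_date`, ...) whose definitions are
# not included here. The sketches below are illustrative assumptions inferred
# from how the helpers are called, not the project's actual implementations.
from hashlib import sha1


def make_id(*parts):
    # Hypothetical: build a stable identifier by hashing the non-empty parts.
    parts = ['%s' % p for p in parts if p is not None]
    return sha1(':'.join(parts).encode('utf-8')).hexdigest()[:16]


def combine_name(*parts):
    # Hypothetical: join non-empty name fragments into a single spaced string.
    return ' '.join(p.strip() for p in parts if p is not None and p.strip())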
def parse_common(node, type_):
    program_ref = '%s (%s)' % (node.findtext('./UN_LIST_TYPE').strip(),
                               node.findtext('./REFERENCE_NUMBER').strip())
    record = {
        'uid': make_id('un', 'sc', node.findtext('./DATAID')),
        'type': type_,
        'program': program_ref,
        'summary': node.findtext('./COMMENTS1'),
        'name': combine_name(node.findtext('./FIRST_NAME'),
                             node.findtext('./SECOND_NAME'),
                             node.findtext('./THIRD_NAME')),
        'function': node.findtext('./DESIGNATION/VALUE'),
        'updated_at': node.findtext('./LISTED_ON'),
        'nationality': normalize_country(node.findtext('./NATIONALITY/VALUE')),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(BASE)
    orig = node.findtext('./NAME_ORIGINAL_SCRIPT')
    if orig is not None:
        record['name'] = orig
    last_updated = node.findtext('./LAST_DAY_UPDATED/VALUE')
    if last_updated is not None:
        record['updated_at'] = last_updated
    if ':' in record['updated_at']:
        record['updated_at'] = record['updated_at'].rsplit('-', 1)[0]
    # print etree.tostring(node, pretty_print=True)
    return record
def wbdeb_parse(emit, html_file):
    doc = html.parse(html_file)
    for table in doc.findall('//table'):
        if 'List of Debarred' not in table.get('summary', ''):
            continue
        rows = table.findall('.//tr')
        log.info('%s: %s rows', table.get('summary'), len(rows))
        for row in rows:
            tds = row.findall('./td')
            if len(tds) != 6:
                continue
            values = [clean_value(td) for td in tds]
            # Derive a stable row identifier by hashing all cell values.
            uid = sha1()
            for value in values:
                uid.update(value.encode('utf-8'))
            uid = uid.hexdigest()[:10]
            names = clean_name(values[0])
            if not len(names):
                log.warning("No name: %r", values)
                continue
            record = {
                'uid': make_id('wb', 'debarred', uid),
                'name': values[0],
                'nationality': normalize_country(values[2]),
                'program': values[5],
                'addresses': [{
                    'text': values[1],
                    'country': normalize_country(values[2])
                }],
                'other_names': [],
                'updated_at': dateutil_parse(values[3]).date().isoformat()
            }
            for name in names[1:]:
                record['other_names'].append({'other_name': name})
            record.update(SOURCE)
            emit.entity(record)
def parse_row(emit, row):
    record = SOURCE.copy()
    record.update({
        'uid': make_id('us', 'bis', row.get('Effective_Date'), row.get('Name')),
        'name': row.get('Name'),
        'program': row.get('FR_Citation'),
        'summary': row.get('Action'),
        'updated_at': row.get('Last Update'),
        'nationality': normalize_country(row.get('Country')),
        'addresses': [{
            'address1': row.get('Street_Address'),
            'postal_code': row.get('Postal_Code'),
            'region': row.get('State'),
            'city': row.get('City'),
            'country': normalize_country(row.get('Country'))
        }]
    })
    emit.entity(record)
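# `normalize_country` is used by nearly every parser here to map free-text
# country values onto a canonical form. A minimal sketch, assuming a simple
# alias table plus whitespace/case normalisation; the real helper may well
# resolve names to ISO codes instead. The alias entries are illustrative only.
COUNTRY_ALIASES = {
    'united states of america': 'United States',
    'russian federation': 'Russia',
}


def normalize_country(name):
    # Tolerate missing values, then normalise whitespace and apply aliases.
    if name is None:
        return None
    name = ' '.join(name.split())
    if not name:
        return None
    return COUNTRY_ALIASES.get(name.lower(), name)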
def everypolitician_parse(emit, json_file):
    with open(json_file, 'r') as fh:
        data = json.load(fh)
    for politician in data.get('politicians'):
        # from pprint import pprint
        # pprint(politician)
        # TODO: add politician
        country = normalize_country(politician.get('country_code'))
        entity = {
            'uid': make_id('evpo', politician.get('id').split('-')[-1]),
            'name': politician.get('name'),
            'type': 'individual',
            'addresses': [{'country': country}],
            'updated_at': parse_ts(politician.get('legislature_lastmod')),
            'source_url': politician.get('source_url')
        }
        entity.update(PUBLISHER)
        emit.entity(entity)
def parse_entry(emit, doc, target, sanctions, places):
    node = target.find('./individual')
    if node is None:
        node = target.find('./entity')
    record = {
        'uid': make_id('ch', 'seco', target.get('ssid')),
        'type': node.tag,
        'updated_at': doc.getroot().get('date'),
        'program': sanctions.get(target.get('sanctions-set-id')),
        'function': node.findtext('./other-information'),
        'summary': node.findtext('./justification'),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(PUBLISHER)
    for ident in node.findall('./identity'):
        parse_identity(record, ident, places)
    # from pprint import pprint
    # pprint(record)
    emit.entity(record)
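# Each source module merges a constant block of source metadata into every
# record via `record.update(SOURCE)`, `record.update(BASE)` or
# `record.update(PUBLISHER)`. The Ukrainian SDFM parser further down inlines
# these fields directly, which suggests their shape. The values below are
# deliberate placeholders, not the project's real constants:
PUBLISHER = {
    'publisher': 'Example Publishing Authority',      # hypothetical value
    'publisher_url': 'https://example.org/',          # hypothetical value
    'source_id': 'XX-EXAMPLE-SANC',                   # hypothetical value
    'source_url': 'https://example.org/sanctions',    # hypothetical value
}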
def parse_common(node, type_):
    program_ref = '%s (%s)' % (node.findtext('./UN_LIST_TYPE').strip(),
                               node.findtext('./REFERENCE_NUMBER').strip())
    record = {
        'uid': make_id('un', 'sc', node.findtext('./DATAID')),
        'type': type_,
        'program': program_ref,
        'summary': node.findtext('./COMMENTS1'),
        'name': combine_name(node.findtext('./FIRST_NAME'),
                             node.findtext('./LAST_NAME')),
        'function': node.findtext('./DESIGNATION/VALUE'),
        'updated_at': node.findtext('./LISTED_ON'),
        'nationality': normalize_country(node.findtext('./NATIONALITY/VALUE')),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(BASE)
    orig = node.findtext('./NAME_ORIGINAL_SCRIPT')
    if orig is not None:
        record['name'] = orig
    last_updated = node.findtext('./LAST_DAY_UPDATED/VALUE')
    if last_updated is not None:
        record['updated_at'] = last_updated
    if ':' in record['updated_at']:
        record['updated_at'] = record['updated_at'].rsplit('-', 1)[0]
    # print etree.tostring(node, pretty_print=True)
    return record
def worldleaders_parse(emit, json_file):
    with open(json_file, 'r') as fh:
        data = json.load(fh)
    for leader in data.get('leaders'):
        if not len(leader.get('name')):
            log.warning('No name on entity: %(title)s (%(country)s)', leader)
            continue
        country = normalize_country(leader.get('country'))
        summary = leader.get('title')
        if leader.get('component'):
            summary = '%s (%s)' % (summary, leader.get('component'))
        entity = {
            'uid': make_id('us', 'cia', 'worldleaders', country,
                           leader.get('name')),
            'name': leader.get('name'),
            'type': 'individual',
            'summary': summary,
            'addresses': [{'country': country}],
            'updated_at': parse_date(leader.get('last_update')),
            'source_url': leader.get('url')
        }
        entity.update(PUBLISHER)
        emit.entity(entity)
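# `parse_date` and `parse_ts` normalise assorted date and timestamp values to
# ISO dates. A minimal sketch using python-dateutil, assuming missing or
# unparseable values should yield None rather than raise; `parse_ts` assumes a
# UNIX timestamp, which is a guess based on the EveryPolitician field it reads.
from datetime import datetime

from dateutil.parser import parse as dateutil_parse


def parse_date(text):
    # Return an ISO 8601 date string, or None if the value cannot be parsed.
    if text is None or not text.strip():
        return None
    try:
        return dateutil_parse(text).date().isoformat()
    except (ValueError, OverflowError):
        return None


def parse_ts(ts):
    # Hypothetical: interpret a UNIX timestamp as an ISO date.
    if ts is None:
        return None
    return datetime.utcfromtimestamp(int(ts)).date().isoformat()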
def parse_entry(emit, record, entry):
    uid = entry.findtext('uid')
    record.update({
        'uid': make_id('us', 'ofac', uid),
        'type': 'individual',
        'program': entry.findtext('./programList/program'),
        'summary': entry.findtext('./remarks'),
        'first_name': entry.findtext('./firstName'),
        'last_name': entry.findtext('./lastName'),
        'name': combine_name(entry.findtext('./firstName'),
                             entry.findtext('./lastName'))
    })
    is_entity = entry.findtext('./sdnType') != 'Individual'
    if is_entity:
        record['type'] = 'entity'
        record.pop('last_name', None)

    record['other_names'] = []
    for aka in entry.findall('./akaList/aka'):
        data = {
            'type': aka.findtext('./type'),
            'quality': aka.findtext('./category'),
            'first_name': aka.findtext('./firstName'),
            'last_name': aka.findtext('./lastName'),
            'other_name': combine_name(aka.findtext('./firstName'),
                                       aka.findtext('./lastName'))
        }
        if is_entity:
            data.pop('last_name', None)
        record['other_names'].append(data)

    record['identities'] = []
    for ident in entry.findall('./idList/id'):
        data = {
            'type': ident.findtext('./idType'),
            'number': ident.findtext('./idNumber'),
            'country': normalize_country(ident.findtext('./idCountry'))
        }
        record['identities'].append(data)

    record['addresses'] = []
    for address in entry.findall('./addressList/address'):
        data = {
            'address1': address.findtext('./address1'),
            'address2': address.findtext('./address2'),
            'city': address.findtext('./city'),
            'country': normalize_country(address.findtext('./country'))
        }
        record['addresses'].append(data)

    for pob in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        if pob.findtext('./mainEntry') == 'true':
            record['place_of_birth'] = pob.findtext('./placeOfBirth')

    for dob in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        if dob.findtext('./mainEntry') == 'true':
            dt = dob.findtext('./dateOfBirth')
            record['date_of_birth'] = parse_date(dt)

    # print etree.tostring(entry, pretty_print=True)
    if is_entity:
        record.pop('last_name', None)
    emit.entity(record)
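# Every parser receives an `emit` object and hands finished records to
# `emit.entity(record)`. A bare-bones stand-in, assuming the real emitter
# validates and persists records; handy for exercising a single parser in
# isolation. This is an illustrative assumption, not the project's emitter.
class ListEmitter(object):
    # Hypothetical emitter that simply collects records in memory.
    def __init__(self):
        self.entities = []

    def entity(self, record):
        self.entities.append(record)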
def parse_entry(emit, record, entry):
    uid = entry.get('Id')
    record.update({
        'uid': make_id('eu', 'eeas', uid),
        'type': 'individual',
        'updated_at': entry.get('reg_date'),
        'source_url': entry.get('pdf_link'),
        'program': entry.get('programme'),
        'summary': entry.get('remark')
    })
    is_entity = entry.get('Type') != 'P'
    if is_entity:
        record['type'] = 'entity'

    record['other_names'] = []
    for aka in entry.findall('./NAME'):
        data = {
            'first_name': aka.findtext('./FIRSTNAME'),
            'last_name': aka.findtext('./LASTNAME'),
            'middle_name': aka.findtext('./MIDDLENAME'),
            'other_name': aka.findtext('./WHOLENAME')
        }
        funct = aka.findtext('./FUNCTION')
        if funct and len(funct) > len(record.get('function', '')):
            record['function'] = funct
        gender = aka.findtext('./GENDER')
        if gender == 'M':
            data['gender'] = 'male'
        if gender == 'F':
            data['gender'] = 'female'
        # The first NAME node becomes the primary name; the rest are aliases.
        if 'name' not in record:
            record['name'] = data.pop('other_name')
            record.update(data)
        else:
            record['other_names'].append(data)

    record['identities'] = []
    for passport in entry.findall('./PASSPORT'):
        data = {
            'type': 'Passport',
            'number': passport.findtext('./NUMBER'),
            'country': normalize_country(passport.findtext('./COUNTRY'))
        }
        record['identities'].append(data)

    record['addresses'] = []
    for address in entry.findall('./ADDRESS'):
        data = {
            'address1': address.findtext('./STREET'),
            'address2': address.findtext('./NUMBER'),
            'city': address.findtext('./CITY'),
            'postal_code': address.findtext('./ZIPCODE'),
            'country': normalize_country(address.findtext('./COUNTRY'))
        }
        record['addresses'].append(data)

    for birth in entry.findall('./BIRTH'):
        place = birth.findtext('./PLACE')
        if place and len(place) > len(record.get('place_of_birth', '')):
            record['place_of_birth'] = place
        date_ = parse_date(birth.findtext('./DATE'))
        if date_ and len(date_) > len(record.get('date_of_birth', '')):
            record['date_of_birth'] = date_
        country = normalize_country(birth.findtext('./COUNTRY'))
        if country and len(country) > len(record.get('country_of_birth', '')):
            record['country_of_birth'] = country

    # print etree.tostring(entry, pretty_print=True)
    emit.entity(record)
def parse_entry(emit, group, rows):
    record = SOURCE.copy()
    record.update({
        'uid': make_id('gb', 'hmt', group),
        'identities': [],
        'addresses': [],
        'other_names': []
    })
    for row in rows:
        record.update({
            'type': row.pop('Group Type').lower(),
            'date_of_birth': parse_date(row.pop('DOB')),
            'place_of_birth': row.pop('Town of Birth'),
            'country_of_birth': normalize_country(row.pop('Country of Birth')),
            'nationality': normalize_country(row.get('Nationality')),
            'program': row.pop('Regime'),
            'summary': row.pop('Other Information'),
            'updated_at': parse_date(row.pop('Last Updated')),
            'function': row.pop('Position')
        })
        names = {
            'first_name': row.get('Name 1'),
            'second_name': row.get('Name 2'),
            'middle_name': row.get('Name 3'),
            'last_name': row.get('Name 6')
        }
        name = [row.pop('Title'), row.pop('Name 1'), row.pop('Name 2'),
                row.pop('Name 3'), row.pop('Name 4'), row.pop('Name 5'),
                row.pop('Name 6')]
        name = combine_name(*name)
        # The first row in a group supplies the primary name; later rows are
        # recorded as aliases.
        if 'name' not in record:
            record['name'] = name
            record.update(names)
        else:
            names['other_name'] = name
            names['type'] = row.pop('Alias Type')
            record['other_names'].append(names)
        addr = [row.pop('Address 1'), row.pop('Address 2'), row.pop('Address 3'),
                row.pop('Address 4'), row.pop('Address 5'), row.pop('Address 6')]
        addr = combine_name(*addr)
        if len(addr):
            record['addresses'].append({
                'text': addr,
                'postal_code': row.pop('Post/Zip Code')
            })
        if row.get('Passport Details'):
            record['identities'].append({
                'type': 'Passport',
                'number': row.pop('Passport Details'),
                'country': normalize_country(row.get('Nationality'))
            })
        if row.get('NI Number'):
            record['identities'].append({
                'type': 'NI',
                'number': row.pop('NI Number'),
                'country': normalize_country(row.get('Country'))
            })
        # from pprint import pprint
        # pprint(row)
    emit.entity(record)
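# The HMT parser above is driven per group: all CSV rows sharing a group
# identifier are collected and handed to `parse_entry` together. A hypothetical
# driver sketch, assuming a csv.DictReader over the consolidated list and a
# group id column; the column name and file handling are assumptions.
import csv
from collections import defaultdict


def hmt_parse(emit, csv_file):
    groups = defaultdict(list)
    with open(csv_file, 'r') as fh:
        for row in csv.DictReader(fh):
            # 'Group ID' is an assumed column name for the group identifier.
            groups[row.get('Group ID')].append(row)
    for group, rows in groups.items():
        parse_entry(emit, group, rows)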
def parse_entry(emit, entry):
    uid = entry.findtext('number-entry')
    record = {
        'uid': make_id('ua', 'sdfm', uid),
        'type': 'individual',
        'publisher': 'State Financial Monitoring Service of Ukraine',
        'publisher_url': 'http://www.sdfm.gov.ua/',
        'source_id': 'UA-SDFM-SANC',
        'source_url': 'http://www.sdfm.gov.ua/articles.php?cat_id=87&lang=en',
        'program': entry.findtext('./program-entry'),
        'summary': entry.findtext('./comments')
    }
    date_entry = entry.findtext('./date-entry')
    if date_entry:
        date_entry = datetime.strptime(date_entry, '%Y%m%d')
        record['updated_at'] = date_entry.date().isoformat()

    is_entity = entry.findtext('./type-entry') != '1'
    if is_entity:
        record['type'] = 'entity'

    record['other_names'] = []
    for aka in entry.findall('./aka-list'):
        data = get_names(aka)
        if aka.findtext('type-aka') == 'N':
            # The 'N' entry carries the primary name; everything else is an alias.
            record['name'] = data.pop('other_name', None)
            record.update(data)
        else:
            data['type'] = aka.findtext('./type-aka')
            data['quality'] = aka.findtext('./quality-aka')
            if data['quality'] == '1':
                data['quality'] = 'strong'
            if data['quality'] == '2':
                data['quality'] = 'weak'
            # if is_entity:
            #     data.pop('last_name', None)
            record['other_names'].append(data)

    record['identities'] = []
    for doc in entry.findall('./document-list'):
        data = {
            'text': doc.findtext('./document-reg'),
            'number': doc.findtext('./document-id'),
            'country': normalize_country(doc.findtext('./document-country'))
        }
        record['identities'].append(data)
    for doc in entry.findall('./id-number-list'):
        data = {'text': doc.text.strip()}
        record['identities'].append(data)

    record['addresses'] = []
    for address in entry.findall('./address-list'):
        data = {'text': address.findtext('./address')}
        record['addresses'].append(data)

    # FIXME: handle multiple
    for pob in entry.findall('./place-of-birth-list'):
        if 'place_of_birth' in record:
            log.debug('Multiple places of birth: %(name)s', record)
        record['place_of_birth'] = pob.text.strip()

    # FIXME: handle multiple
    for dob in entry.findall('./date-of-birth-list'):
        if 'date_of_birth' in record:
            log.debug('Multiple dates of birth: %(name)s', record)
        record['date_of_birth'] = parse_date(dob.text)

    # print etree.tostring(entry, pretty_print=True)
    emit.entity(record)