def scrape_person(context, doc, url):
    """Build an 'eu-whoiswho' entity from a person page and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        doc: parsed HTML tree of the person page.
        url: address of the page, stored on the entity.
    """
    breadcrumb = doc.find(
        './/span[@itemtype="http://data-vocabulary.org/Breadcrumb"]')
    # Keep only the breadcrumb items that carry non-blank text.
    crumb_texts = (crumb.text_content() for crumb in breadcrumb)
    hierarchy = [text for text in crumb_texts if text and text.strip()]
    # The first crumb is the institution, the last one the person's name.
    hierarchy = hierarchy[1:-1]

    name = doc.find('.//h3[@itemprop="name"]').text_content()
    title = doc.findtext('.//td[@itemprop="jobTitle"]')

    entity = Entity.create('eu-whoiswho', make_id(name, title))
    entity.name = name
    entity.url = url
    entity.function = title

    address = entity.create_address()
    address.street = doc.findtext('.//span[@itemprop="streetAddress"]')
    address.postal_code = doc.findtext('.//span[@itemprop="postalCode"]')
    address.text = doc.findtext('.//span[@itemprop="addressLocality"]')
    # NOTE: the page also exposes itemprop="telephone"; it is not captured.

    # The second remaining breadcrumb level, when present, is the program.
    if len(hierarchy) > 1:
        entity.program = hierarchy[1]

    context.emit(data=entity.to_dict())
def scrape_entity(context, data):
    """Build an 'everypolitician' individual from one row and emit it.

    Args:
        context: crawler context; ``context.log`` and ``context.emit`` are
            used.
        data: mapping with the keys ``"row"`` (the person record),
            ``"legislature"`` and ``"country"``.

    Rows without an ``id`` are logged and skipped, because the id is what
    identifies the entity.

    Raises:
        KeyError: if the row's gender is not a key of ``GENDERS``.
    """
    row = data.get("row")
    legislature = data.get("legislature")
    country = data.get("country")

    if row.get('id') is None:
        context.log.warning("No ID for entry: %r", row)
        # Without an ID there is nothing to key the entity on.
        return

    entity = Entity.create('everypolitician', row.get('id'))
    entity.type = entity.TYPE_INDIVIDUAL
    entity.updated_at = parse_ts(legislature.get('lastmod'))
    entity.name = row.get('name')
    entity.function = row.get('group')
    entity.program = legislature.get('name')
    entity.gender = GENDERS[row.get('gender')]

    nationality = entity.create_nationality()
    nationality.country = country.get('name')
    nationality.country_code = country.get('code')

    # The sort name is recorded as an alias only when it differs.
    if row.get('name') != row.get('sort_name'):
        alias = entity.create_alias()
        alias.name = row.get('sort_name')

    # TODO: email
    # TODO: socialmedia
    # TODO: photograph
    context.emit(data=entity.to_dict())
def parse_entry(context, entry):
    """Convert one 'eu-eeas-sanctions' XML entry into an entity and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        entry: XML element whose attributes and NAME / PASSPORT / ADDRESS /
            BIRTH / CITIZEN children are read.

    Raises:
        KeyError: if the entry type or a gender value is missing from the
            ``ENTITY_TYPES`` / ``GENDERS`` lookup tables.
    """
    type_ = ENTITY_TYPES[entry.get('Type')]
    entity = Entity.create('eu-eeas-sanctions', entry.get('Id'))
    entity.type = type_
    entity.updated_at = entry.get('reg_date')
    entity.url = entry.get('pdf_link')
    entity.program = entry.get('programme')
    entity.summary = entry.get('remark')

    for name_node in entry.findall('./NAME'):
        # While the entity has no name yet, the name is written onto the
        # entity itself; every later name becomes an alias.
        target = entity if entity.name is None else entity.create_alias()
        target.title = name_node.findtext('./TITLE')
        target.name = name_node.findtext('./WHOLENAME')
        target.first_name = name_node.findtext('./FIRSTNAME')
        target.second_name = name_node.findtext('./MIDDLENAME')
        target.last_name = name_node.findtext('./LASTNAME')

        # Function and gender are only filled while still unset.
        if entity.function is None:
            entity.function = name_node.findtext('./FUNCTION')
        if entity.gender is None:
            entity.gender = GENDERS[name_node.findtext('./GENDER')]

    for passport_node in entry.findall('./PASSPORT'):
        identifier = entity.create_identifier()
        identifier.type = Identifier.TYPE_PASSPORT
        identifier.number = passport_node.findtext('./NUMBER')
        identifier.country = passport_node.findtext('./COUNTRY')

    for address_node in entry.findall('./ADDRESS'):
        address = entity.create_address()
        address.street = address_node.findtext('./STREET')
        address.street_2 = address_node.findtext('./NUMBER')
        address.city = address_node.findtext('./CITY')
        address.postal_code = address_node.findtext('./ZIPCODE')
        address.country = address_node.findtext('./COUNTRY')

    for birth_node in entry.findall('./BIRTH'):
        place = stringify(birth_node.findtext('./PLACE'))
        birth_country = stringify(birth_node.findtext('./COUNTRY'))
        # A birth place is recorded when either part is available.
        if not (place is None and birth_country is None):
            birth_place = entity.create_birth_place()
            birth_place.place = place
            birth_place.country = birth_country

        date_ = stringify(parse_date(birth_node.findtext('./DATE')))
        if date_ is not None:
            birth_date = entity.create_birth_date()
            birth_date.date = date_

    for citizen_node in entry.findall('./CITIZEN/COUNTRY'):
        nationality = entity.create_nationality()
        nationality.country = citizen_node.text

    context.emit(data=entity.to_dict())
def handle_organisation(context, data):
    """Build a 'kg-fiu-national' organisation from a table row and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        data: sequence of cell values, matched positionally against the
            column header defined below.
    """
    header = ["No", "Name", "Reason for inclusion", "Category of entity",
              "Date of inclusion"]
    record = dict(zip(header, data))

    entity_id = make_id(record["Name"], record["Reason for inclusion"])
    entity = Entity.create("kg-fiu-national", entity_id)
    entity.type = entity.TYPE_ENTITY

    # A comma-separated name cell holds the primary name, then aliases.
    raw_name = record["Name"]
    names = raw_name.split(",") if "," in raw_name else [raw_name]
    entity.name = names[0]
    for alias in names[1:]:
        entity.create_alias(alias)

    entity.program = record["Category of entity"]
    entity.summary = record["Reason for inclusion"]
    entity.listed_at = record["Date of inclusion"]

    context.emit(data=entity.to_dict())
def handle_individual(context, data):
    """Build a 'kg-fiu-national' individual from a table row and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        data: sequence of cell values, matched positionally against the
            column header defined below.
    """
    header = ["No", "Last Name", "Name", "Middle Name", "Date of birth",
              "Place of birth", "Reason for inclusion", "Category of entity",
              "Date of inclusion"]
    data = dict(zip(header, data))

    entity_id = make_id(data["Last Name"], data["Middle Name"],
                        data["Name"], data["Reason for inclusion"])
    entity = Entity.create("kg-fiu-national", entity_id)
    entity.type = entity.TYPE_INDIVIDUAL
    entity.last_name = data["Last Name"]
    entity.first_name = data["Name"]
    entity.second_name = data["Middle Name"]

    birth_date = entity.create_birth_date()
    birth_date.date = data["Date of birth"]

    # The place of birth belongs on a birth-place record; it was previously
    # attached to a second birth-date record by mistake.
    birth_place = entity.create_birth_place()
    birth_place.place = data["Place of birth"]

    entity.program = data["Category of entity"]
    entity.summary = data["Reason for inclusion"]
    entity.listed_at = data["Date of inclusion"]

    context.emit(data=entity.to_dict())
def parse_entry(context, entry):
    """Convert one 'ua-sdfm-blacklist' XML entry into an entity and emit it.

    Args:
        context: crawler context; passed to ``parse_date`` and used for
            ``context.emit``.
        entry: XML element holding the entry's fields and ``*-list``
            children.

    Raises:
        KeyError: if the entry type or an alias quality is missing from the
            ``ENTITY_TYPES`` / ``ALIAS_QUALITY`` lookup tables.
        ValueError: if ``date-entry`` is present but not in ``%Y%m%d`` form.
    """
    uid = entry.findtext('number-entry')
    entity = Entity.create('ua-sdfm-blacklist', uid)
    entity.type = ENTITY_TYPES[entry.findtext('./type-entry')]
    entity.program = entry.findtext('./program-entry')
    entity.summary = entry.findtext('./comments')
    entity.url = 'http://www.sdfm.gov.ua/articles.php?cat_id=87&lang=en'

    listed_on = entry.findtext('./date-entry')
    if listed_on:
        parsed = datetime.strptime(listed_on, '%Y%m%d')
        entity.updated_at = parsed.date().isoformat()

    for aka_node in entry.findall('./aka-list'):
        if aka_node.findtext('type-aka') == 'N':
            # Type 'N' name parts are written onto the entity itself.
            target = entity
        else:
            target = entity.create_alias()
            target.type = aka_node.findtext('./category-aka')
            target.description = aka_node.findtext('./type-aka')
            target.quality = ALIAS_QUALITY[aka_node.findtext('./quality-aka')]
        target.first_name = aka_node.findtext('./aka-name1')
        target.second_name = aka_node.findtext('./aka-name2')
        target.third_name = aka_node.findtext('./aka-name3')
        target.last_name = aka_node.findtext('./aka-name4')

    # With several title nodes, the last one wins.
    for title_node in entry.findall('./title-list'):
        entity.title = title_node.text

    for document in entry.findall('./document-list'):
        identifier = entity.create_identifier()
        identifier.type = Identifier.TYPE_PASSPORT
        identifier.description = document.findtext('./document-reg')
        identifier.number = document.findtext('./document-id')
        identifier.country = document.findtext('./document-country')

    for id_node in entry.findall('./id-number-list'):
        identifier = entity.create_identifier()
        identifier.type = Identifier.TYPE_NATIONALID
        identifier.description = id_node.text

    for address_node in entry.findall('./address-list'):
        address = entity.create_address()
        address.text = address_node.findtext('./address')

    for pob_node in entry.findall('./place-of-birth-list'):
        birth_place = entity.create_birth_place()
        birth_place.place = pob_node.text

    for dob_node in entry.findall('./date-of-birth-list'):
        birth_date = entity.create_birth_date()
        birth_date.date = parse_date(context, dob_node.text)

    for nat_node in entry.findall('./nationality-list'):
        nationality = entity.create_nationality()
        nationality.country = nat_node.text

    context.emit(data=entity.to_dict())
def parse_entry(context, entry, url, updated_at):
    """Convert one 'us-ofac' XML entry into an entity and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        entry: XML element of the list entry.
        url: source URL, combined with the entry uid to form the entity id.
        updated_at: value stored as the entity's update timestamp.

    Entries whose ``sdnType`` maps to ``None`` in ``ENTITY_TYPES`` are
    skipped.

    Raises:
        KeyError: if ``sdnType`` or an alias category is missing from the
            ``ENTITY_TYPES`` / ``ALIAS_QUALITY`` lookup tables.
    """
    uid = entry.findtext('uid')
    type_ = ENTITY_TYPES[entry.findtext('./sdnType')]
    if type_ is None:
        return

    entity = Entity.create('us-ofac', make_id(url, uid))
    entity.type = type_
    entity.updated_at = updated_at
    entity.program = '; '.join(
        program.text for program in entry.findall('./programList/program'))
    entity.summary = entry.findtext('./remarks')
    entity.function = entry.findtext('./title')
    entity.first_name = entry.findtext('./firstName')
    entity.last_name = entry.findtext('./lastName')

    for aka_node in entry.findall('./akaList/aka'):
        alias = entity.create_alias()
        alias.first_name = aka_node.findtext('./firstName')
        alias.last_name = aka_node.findtext('./lastName')
        alias.type = aka_node.findtext('./type')
        alias.quality = ALIAS_QUALITY[aka_node.findtext('./category')]

    for id_node in entry.findall('./idList/id'):
        # Unknown id types fall back to TYPE_OTHER; types explicitly mapped
        # to None are dropped.
        id_type = ID_TYPES.get(id_node.findtext('./idType'),
                               Identifier.TYPE_OTHER)
        if id_type is None:
            continue
        identifier = entity.create_identifier()
        identifier.type = id_type
        identifier.number = id_node.findtext('./idNumber')
        identifier.country = id_node.findtext('./idCountry')
        identifier.description = id_node.findtext('./idType')

    for address_node in entry.findall('./addressList/address'):
        address = entity.create_address()
        address.street = address_node.findtext('./address1')
        address.street_2 = address_node.findtext('./address2')
        address.city = address_node.findtext('./city')
        address.country = address_node.findtext('./country')

    for pob_node in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        birth_place = entity.create_birth_place()
        birth_place.place = pob_node.findtext('./placeOfBirth')
        # Only the main entry counts as a strong match.
        is_main = pob_node.findtext('./mainEntry') == 'true'
        birth_place.quality = BirthPlace.QUALITY_WEAK
        if is_main:
            birth_place.quality = BirthPlace.QUALITY_STRONG

    for dob_node in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        birth_date = entity.create_birth_date()
        birth_date.date = stringify(
            parse_date(dob_node.findtext('./dateOfBirth')))
        is_main = dob_node.findtext('./mainEntry') == 'true'
        birth_date.quality = BirthDate.QUALITY_WEAK
        if is_main:
            birth_date.quality = BirthDate.QUALITY_STRONG

    context.emit(data=entity.to_dict())
def parse_common(node):
    """Create an 'un-sc-sanctions' entity with the shared fields filled in.

    Args:
        node: XML element of the list entry.

    Returns:
        The newly created entity; it is not emitted here.

    Raises:
        AttributeError: if ``UN_LIST_TYPE`` or ``REFERENCE_NUMBER`` is
            absent, since their text is stripped unconditionally.
    """
    entity = Entity.create('un-sc-sanctions', node.findtext('./DATAID'))

    list_type = node.findtext('./UN_LIST_TYPE').strip()
    reference = node.findtext('./REFERENCE_NUMBER').strip()
    entity.program = '%s (%s)' % (list_type, reference)

    entity.summary = node.findtext('./COMMENTS1')
    entity.function = node.findtext('./DESIGNATION/VALUE')
    entity.listed_at = node.findtext('./LISTED_ON')
    entity.updated_at = node.findtext('./LAST_DAY_UPDATED/VALUE')
    entity.name = node.findtext('./NAME_ORIGINAL_SCRIPT')
    entity.first_name = node.findtext('./FIRST_NAME')
    entity.second_name = node.findtext('./SECOND_NAME')
    entity.third_name = node.findtext('./THIRD_NAME')
    return entity
def parse_entry(context, node):
    """Convert one 'eu-meps' XML node into an individual and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        node: XML element describing one member.
    """
    entity = Entity.create('eu-meps', node.findtext('.//id'))
    entity.type = Entity.TYPE_INDIVIDUAL
    entity.name = node.findtext('.//fullName')

    first_name, last_name = split_name(entity.name)
    entity.first_name = first_name
    entity.last_name = last_name

    # Summary reads "<political group> (<national political group>)";
    # a missing part becomes an empty string.
    national_group = node.findtext('.//nationalPoliticalGroup') or ''
    political_group = node.findtext('.//politicalGroup') or ''
    entity.summary = '%s (%s)' % (political_group, national_group)

    nationality = entity.create_nationality()
    nationality.country = node.findtext('.//country')

    context.emit(data=entity.to_dict())
def scrape_case(context, data):
    """Fetch a notice page and emit it as an 'interpol-red-notices' entity.

    Args:
        context: crawler context; ``context.http`` and ``context.emit`` are
            used.
        data: mapping holding the page address under ``'url'``.

    Pages with no name, or with the name 'Identity unknown', are skipped.

    Raises:
        KeyError: if the 'sex' value is not a key of ``SEXES``.
        ValueError: if a detail row does not hold exactly two cells.
    """
    url = data.get('url')
    res = context.http.get(url)
    doc = res.html

    name = element_text(doc.find('.//div[@class="nom_fugitif_wanted"]'))
    if name is None or name == 'Identity unknown':
        return

    uid = make_id(url)
    entity = Entity.create('interpol-red-notices', uid)
    entity.url = url
    entity.type = entity.TYPE_INDIVIDUAL
    entity.name = name
    entity.program = element_text(doc.find('.//span[@class="nom_fugitif_wanted_small"]'))  # noqa

    # "LAST, First" names additionally get a "First LAST" alias.
    if ', ' in name:
        last, first = name.split(', ', 1)
        alias = entity.create_alias()
        alias.name = ' '.join((first, last))

    for row in doc.findall('.//div[@class="bloc_detail"]//tr'):
        title, value = row.findall('./td')
        # The slugified label of the first cell selects the target field.
        field = slugify(element_text(title), sep='_')
        value = element_text(value)
        if value is None:
            continue

        if field == 'charges':
            entity.summary = value
        elif field == 'present_family_name':
            entity.last_name = value
        elif field == 'forename':
            entity.first_name = value
        elif field == 'nationality':
            for country in value.split(', '):
                nationality = entity.create_nationality()
                nationality.country = country
        elif field == 'sex':
            entity.gender = SEXES[value]
        elif field == 'date_of_birth':
            birth_date = entity.create_birth_date()
            # Drop any parenthesised suffix that follows the date.
            birth_date.date = value.split('(')[0]
        elif field == 'place_of_birth':
            birth_place = entity.create_birth_place()
            # Fixed: the place was previously written to ``.date``.
            birth_place.place = value

    context.emit(data=entity.to_dict())
def parse_entry(context, entry):
    """Convert one member listing element into a 'coe_assembly' entity.

    Args:
        context: crawler context; ``context.params`` supplies the base
            ``'url'`` and ``context.emit`` receives the result.
        entry: HTML element holding the member's link and info spans.

    Raises:
        ValueError: if the resolved URL has no ``=``, the link text has no
            ``', '`` separator, or there are not exactly two "infos" spans.
    """
    link = entry.find('.//a')
    base_url = context.params.get('url')
    url = urljoin(base_url, link.get('href'))
    # The member id is whatever follows the last '=' of the URL.
    _, member_id = url.rsplit('=', 1)

    entity = Entity.create('coe_assembly', member_id)
    entity.type = Entity.TYPE_INDIVIDUAL
    entity.url = url

    # Link text is split as "<last name>, <first name>".
    last_name, first_name = link.text.split(', ', 1)
    entity.last_name = last_name
    entity.first_name = first_name
    entity.function = entry.findtext('.//span[@class="fonction"]')

    role_node, country_node = entry.findall('.//span[@class="infos"]')
    entity.summary = role_node.text_content().strip()
    nationality = entity.create_nationality()
    nationality.country = country_node.text_content().strip()

    context.emit(data=entity.to_dict())
def parse_row(context, data):
    """Build a 'us-bis-denied' entity from one table row and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        data: mapping holding the record under ``'row'``.
    """
    row = data.get('row')
    effective_date = row.get('Effective_Date')
    name = row.get('Name')

    entity = Entity.create('us-bis-denied', make_id(effective_date, name))
    entity.type = Entity.TYPE_ENTITY
    entity.name = row.get('Name')
    entity.updated_at = row.get('Effective_Date')
    entity.program = row.get('FR_Citation')
    entity.summary = row.get('Action')

    # Address attribute -> source column, applied in this order.
    address_columns = (
        ('street', 'Street_Address'),
        ('postal_code', 'Postal_Code'),
        ('region', 'State'),
        ('city', 'City'),
        ('country', 'Country'),
    )
    address = entity.create_address()
    for attribute, column in address_columns:
        setattr(address, attribute, row.get(column))

    context.emit(data=entity.to_dict())
def parse_entry(context, entry):
    """Fetch a president's page and emit a 'worldpresidentsdb' entity.

    Args:
        context: crawler context; ``context.http`` and ``context.emit`` are
            used.
        entry: link element whose ``href`` is appended to the site root.

    Raises:
        ValueError: if the heading holds no space, or a birth date does not
            split into three '-'-separated parts.
    """
    url = entry.get('href')
    res = context.http.get('https://www.worldpresidentsdb.com/' + url)
    doc = res.html
    content = doc.find('.//main/div')

    entity = Entity.create('worldpresidentsdb', make_id(url))
    entity.type = Entity.TYPE_INDIVIDUAL
    entity.function = 'President'
    entity.url = url

    # The heading is split at the first space into first and last name.
    first_name, last_name = content.find('h1').text.split(' ', 1)
    entity.first_name = first_name
    entity.last_name = last_name

    # Labels whose value is the text trailing the paragraph's first child.
    tail_labels = ('Date of Birth:', 'Birth Place:', 'Political Party:',
                   'Other Political Titles:')

    for paragraph in content.findall('.//p'):
        label_node = paragraph.find('.//b')
        if label_node is None:
            continue
        label = label_node.text

        if label == 'Country:':
            nationality = entity.create_nationality()
            nationality.country = paragraph.find('a').text
            continue
        if label not in tail_labels:
            continue

        value = paragraph[0].tail.strip()
        if label == 'Date of Birth:':
            # Reorder the month-day-year parts into year-month-day.
            month, day, year = value.split('-', 2)
            birth_date = entity.create_birth_date()
            birth_date.date = '-'.join((year, month, day))
            birth_date.quality = 'strong'
        elif label == 'Birth Place:':
            birth_place = entity.create_birth_place()
            birth_place.place = value
        elif label == 'Political Party:':
            entity.program = value
        else:
            # Only 'Other Political Titles:' remains at this point.
            entity.summary = value

    context.emit(data=entity.to_dict())
def parse_entity(context, url, country, component, row, updated_at):
    """Build a 'us-cia-world-leaders' individual from a row and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        url: source page address, stored on the entity.
        country: country name; used as program, nationality and id part.
        component: not used by this function.
        row: HTML element holding the "title" and "cos_name" spans.
        updated_at: value stored as the entity's update timestamp.

    Rows that lack either the title or the name are skipped.
    """
    function = element_text(row.find('.//span[@class="title"]'))
    if function is None:
        return

    name = element_text(row.find('.//span[@class="cos_name"]'))
    if name is None:
        return

    entity = Entity.create('us-cia-world-leaders',
                           make_id(country, name, function))
    entity.name = name
    entity.type = entity.TYPE_INDIVIDUAL
    entity.function = function
    entity.program = country
    entity.url = url
    entity.updated_at = updated_at

    nationality = entity.create_nationality()
    nationality.country = country

    context.emit(data=entity.to_dict())
def parse(context, data):
    """Emit one 'zz-wb-debarred' entity per row of the debarment tables.

    Args:
        context: crawler context; ``context.params``, ``context.http``,
            ``context.log`` and ``context.emit`` are used.
        data: crawler payload handed to ``context.http.rehash`` to obtain
            the fetched page.

    Only tables whose ``summary`` attribute contains 'List of Debarred' are
    read, and only rows with exactly six cells.
    """
    url = context.params.get('url')
    res = context.http.rehash(data)
    doc = res.html

    for table in doc.findall('.//table'):
        if 'List of Debarred' not in table.get('summary', ''):
            continue

        for row in table.findall('.//tr'):
            cells = row.findall('./td')
            if len(cells) != 6:
                continue
            values = [clean_value(cell) for cell in cells]

            # The id is a shortened SHA-1 over all cell values of the row.
            digest = sha1()
            for value in values:
                digest.update(value.encode('utf-8'))
            uid = digest.hexdigest()[:10]

            names = clean_name(values[0])
            if len(names) == 0:
                context.log.warning("No name: %r", values)
                continue

            entity = Entity.create('zz-wb-debarred', uid)
            entity.program = values[5]
            entity.name = names[0]
            entity.updated_at = dateutil_parse(values[3]).date().isoformat()
            entity.url = url
            # Every name after the first is kept as an alias.
            for extra_name in names[1:]:
                entity.create_alias(name=extra_name)

            nationality = entity.create_nationality()
            nationality.country = values[2]

            address = entity.create_address()
            address.text = values[1]
            address.country = values[2]

            context.emit(data=entity.to_dict())
def parse_entry(context, data):
    """Build an 'au-dfat-sanctions' entity from grouped rows and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        data: mapping holding the grouped records under ``'rows'``; the
            first row is the primary record, the others supply aliases.
    """
    rows = data.get('rows')
    primary = rows[0]

    is_individual = slugify(primary.get('type', '')) == 'individual'
    type_ = Entity.TYPE_INDIVIDUAL if is_individual else Entity.TYPE_ENTITY

    entity = Entity.create('au-dfat-sanctions', primary.get('reference'))
    entity.type = type_
    entity.url = 'http://dfat.gov.au/international-relations/security/sanctions/Pages/sanctions.aspx'  # noqa
    entity.name = primary.get('name_of_individual_or_entity', '')
    entity.program = primary.get('committees', '')
    entity.summary = primary.get('additional_information', '')

    # Float cells are treated as NaN placeholders and skipped.
    citizenship = primary.get('citizenship', '')
    if not isinstance(citizenship, float):
        nationality = entity.create_nationality()
        nationality.country = citizenship

    address = entity.create_address()
    address.text = primary.get('address', '')

    date_of_birth = primary.get('date_of_birth', '')
    if not isinstance(date_of_birth, float):
        birth_date = entity.create_birth_date()
        birth_date.date = date_of_birth

    place_of_birth = primary.get('place_of_birth', '')
    if not isinstance(place_of_birth, float):
        birth_place = entity.create_birth_place()
        birth_place.place = place_of_birth

    for extra_row in rows[1:]:
        alias = entity.create_alias()
        alias.name = extra_row.get('name_of_individual_or_entity', '')

    context.emit(data=entity.to_dict())
def parse_entry(context, data):
    """Build an 'au-dfat-sanctions' entity from grouped rows and emit it.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        data: mapping holding the grouped records under ``'rows'``; the
            first row is the primary record, the others supply aliases.

    Raises:
        KeyError: if a row lacks one of the column headers read below.
    """
    rows = data.get('rows')
    primary = rows[0]

    is_individual = primary['Type'] == 'Individual'
    type_ = Entity.TYPE_INDIVIDUAL if is_individual else Entity.TYPE_ENTITY

    entity = Entity.create('au-dfat-sanctions', primary.get('Reference'))
    entity.type = type_
    entity.url = 'http://dfat.gov.au/international-relations/security/sanctions/Pages/sanctions.aspx'  # noqa
    entity.name = primary['Name of Individual or Entity']
    entity.program = primary['Committees']
    entity.summary = primary['Additional Information']

    # Float cells are treated as NaN placeholders and skipped.
    citizenship = primary['Citizenship']
    if not isinstance(citizenship, float):
        nationality = entity.create_nationality()
        nationality.country = citizenship

    address = entity.create_address()
    address.text = primary['Address']

    date_of_birth = primary['Date of Birth']
    if not isinstance(date_of_birth, float):
        birth_date = entity.create_birth_date()
        birth_date.date = date_of_birth

    place_of_birth = primary['Place of Birth']
    if not isinstance(place_of_birth, float):
        birth_place = entity.create_birth_place()
        birth_place.place = place_of_birth

    for extra_row in rows[1:]:
        alias = entity.create_alias()
        alias.name = extra_row['Name of Individual or Entity']

    context.emit(data=entity.to_dict())
def parse_entry(context, target, updated_at, sanctions, places):
    """Convert one sanctions target into a 'ch-seco-sanctions' entity.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        target: XML element of the target; its ``individual`` or ``entity``
            child carries the details.
        updated_at: value stored as the entity's update timestamp.
        sanctions: mapping from sanctions-set id to the program value.
        places: passed through unchanged to ``parse_identity``.

    Targets with neither an ``individual`` nor an ``entity`` child are
    skipped.
    """
    node, type_ = target.find('./individual'), Entity.TYPE_INDIVIDUAL
    if node is None:
        node, type_ = target.find('./entity'), Entity.TYPE_ENTITY
    if node is None:
        # TODO: build out support for './object' targets.
        return

    entity = Entity.create('ch-seco-sanctions', target.get('ssid'))
    entity.type = type_
    entity.updated_at = updated_at
    entity.program = sanctions.get(target.get('sanctions-set-id'))
    entity.function = node.findtext('./other-information')
    entity.summary = node.findtext('./justification')

    for identity_node in node.findall('./identity'):
        parse_identity(entity, identity_node, places)

    context.emit(data=entity.to_dict())
def crawl_officer(context, data):
    """Fetch a disqualified officer and emit a 'gb-coh-disqualified' entity.

    Args:
        context: crawler context; ``context.http`` and ``context.emit`` are
            used.
        data: mapping holding the officer identifier under ``'officer_id'``.

    Both the 'natural' and the 'corporate' endpoint are queried; one entity
    is emitted for every request that answers with status 200.
    """
    officer_id = data.get('officer_id')

    for type_ in ('natural', 'corporate'):
        res = context.http.get(API_URL % (type_, officer_id), auth=AUTH)
        if res.status_code != 200:
            continue

        # TODO: check if this existed
        entity = Entity.create('gb-coh-disqualified', officer_id)
        payload = res.json
        entity.title = payload.get('title')
        entity.first_name = payload.get('forename')
        entity.second_name = payload.get('other_forenames')
        entity.last_name = payload.get('surname')
        entity.summary = payload.get('kind')

        # A missing 'self' link resolves to the site root.
        self_link = payload.get('links', {}).get('self', '/')
        entity.url = urljoin(WEB_URL, self_link)

        if payload.get('date_of_birth'):
            birth_date = entity.create_birth_date()
            birth_date.date = payload.get('date_of_birth')

        if payload.get('nationality'):
            nationality = entity.create_nationality()
            nationality.country = payload.get('nationality')

        for disqualification in payload.get('disqualifications', []):
            # With several disqualifications the last case identifier wins.
            entity.program = disqualification.get('case_identifier')
            addr = disqualification.get('address')
            address = entity.create_address()
            address.street = addr.get('address_line_1')
            address.street_2 = addr.get('address_line_2')
            address.city = addr.get('locality')
            address.region = addr.get('region')
            address.postal_code = addr.get('postal_code')

        context.emit(data=entity.to_dict())
def parse_entry(context, data):
    """Merge the rows of one group into a 'gb-hmt-sanctions' entity.

    Args:
        context: crawler context; only ``context.emit`` is used here.
        data: mapping with the group id under ``'group'`` and that group's
            records under ``'rows'``. Rows are consumed destructively
            (columns are popped as they are read).

    ``fresh_value(seen, row, key)`` gates most fields; presumably it reports
    whether the row's value for ``key`` has not been seen before in this
    group -- confirm against its definition.

    Raises:
        KeyError: if 'Group Type' is not a key of ``ENTITY_TYPES`` or an
            expected column is missing from a row.
    """
    group = data.get('group')
    rows = data.get('rows')
    seen = defaultdict(set)
    entity = Entity.create('gb-hmt-sanctions', group)

    for row in rows:
        entity.type = ENTITY_TYPES[row.pop('Group Type')]
        names = (row.pop('Name 1'), row.pop('Name 2'), row.pop('Name 3'),
                 row.pop('Name 4'), row.pop('Name 5'), row.pop('Name 6'))
        names = [n for n in names if n is not None]
        row['_name'] = ' '.join(names)

        if fresh_value(seen, row, '_name'):
            # The first name goes onto the entity, later ones become aliases.
            name = entity
            if entity.name is not None:
                name = entity.create_alias()
                name.type = row.get('Alias Type')
            name.title = row.get('Title')
            # The last populated name column is the last name; the remaining
            # parts fill first, second and (joined) third name in order.
            name.last_name = names.pop()
            if len(names):
                name.first_name = names.pop(0)
            if len(names):
                name.second_name = names.pop(0)
            if len(names):
                name.third_name = ' '.join(names)

        if row.get('Regime'):
            entity.program = row.pop('Regime')
        if row.get('Position'):
            entity.function = row.pop('Position')
        if row.get('Other Information'):
            entity.summary = row.pop('Other Information')
        if row.get('Last Updated'):
            entity.updated_at = row.pop('Last Updated')

        if fresh_value(seen, row, 'DOB'):
            dob_text = row.get('DOB')
            # A blank DOB only skips the birth date. This used to `continue`,
            # which also dropped the row's birth place, address, identifiers
            # and nationality.
            if dob_text is not None and len(dob_text.strip()):
                dob = parse_date(dob_text)
                if dob is None and '/' in dob_text:
                    # Fall back to the part after the last '/'.
                    _, dob = dob_text.rsplit('/', 1)
                birth_date = entity.create_birth_date()
                birth_date.date = stringify(dob)

        if fresh_value(seen, row, 'Town of Birth') or \
                fresh_value(seen, row, 'Country of Birth'):
            birth_place = entity.create_birth_place()
            birth_place.place = row.pop('Town of Birth')
            birth_place.country = row.pop('Country of Birth')

        addr = [
            row.pop('Address 1'), row.pop('Address 2'), row.pop('Address 3'),
            row.pop('Address 4'), row.pop('Address 5'), row.pop('Address 6')
        ]
        # NOTE(review): the post code is appended twice to the dedup key;
        # the second one may have been meant to be 'Country' -- confirm.
        addr_ids = addr + [row.get('Post/Zip Code'), row.get('Post/Zip Code')]
        row['_addr'] = ' '.join([a for a in addr_ids if a is not None])
        if fresh_value(seen, row, '_addr'):
            address = entity.create_address()
            address.country = row.pop('Country')
            address.postal_code = row.pop('Post/Zip Code')
            address.text = ', '.join([a for a in addr if a is not None])

        if fresh_value(seen, row, 'Passport Details'):
            identifier = entity.create_identifier()
            identifier.type = Identifier.TYPE_PASSPORT
            identifier.number = row.pop('Passport Details')
            identifier.country = row.get('Nationality')

        if fresh_value(seen, row, 'NI Number'):
            identifier = entity.create_identifier()
            identifier.type = Identifier.TYPE_NATIONALID
            identifier.number = row.pop('NI Number')
            identifier.country = row.get('Nationality')

        if fresh_value(seen, row, 'Nationality'):
            text = row.pop('Nationality')
            # One nationality per ')'-separated chunk. str.split always
            # yields at least one string, so the old "no match" fallback
            # could never run and has been removed.
            for part in text.split(')'):
                nationality = entity.create_nationality()
                nationality.country = part

    context.emit(data=entity.to_dict())