def crawl_company(context: Context) -> None:
    """Crawl the companies feed and emit each record as a sanctioned Organization.

    Fix: the original popped ``ogrn`` twice — once inside the slug fallback
    branch and once for the ``ogrnCode`` property — so whichever pop ran
    second always received ``None`` and the value was lost from that use.
    It is now popped exactly once and shared by both uses.
    """
    data = json_resource(context, context.dataset.data.url, "company")
    for row in data["data"]:
        row = clean_row(row)
        company_id = row.pop("company_id")
        name_en = row.pop("name_en", None)
        name = row.pop("name", None) or name_en
        # Pop the registration number once, up front (see docstring).
        ogrn = row.pop("ogrn", None)
        entity = context.make("Organization")
        entity.id = context.make_slug("company", company_id, name)
        if entity.id is None:
            # No usable name: fall back to the OGRN for a stable ID.
            entity.id = context.make_slug(
                "company",
                company_id,
                ogrn,
                strict=False,
            )
        entity.add("name", name)
        entity.add("name", name_en)
        entity.add("name", row.pop("name_uk", None))
        entity.add("name", row.pop("name_ru", None))
        entity.add("innCode", row.pop("inn", None))
        # Only Company (not generic Organization) carries ogrnCode.
        entity.add_cast("Company", "ogrnCode", ogrn)
        country = row.pop("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))
        context.emit(entity, target=True)
        # Consume the logo field so it does not show up as unhandled data.
        row.pop("logo_en", None)
def parse_entry(context: Context, entry, parties):
    """Attach a Sanction to a previously-parsed party and emit both."""
    profile_slug = context.make_slug(entry.get("ProfileID"))
    party = parties[profile_slug]
    sanction = h.make_sanction(context, party, key=entry.get("ID"))
    sanction.add("program", ref_value("List", entry.get("ListID")))
    # Each entry event contributes a creation date and legal-basis reason.
    for event in entry.findall("./EntryEvent"):
        event_date = parse_date(event.find("./Date"))
        party.add("createdAt", event_date)
        sanction.add("summary", event.findtext("./Comment"))
        sanction.add("reason", ref_value("LegalBasis", event.get("LegalBasisID")))
    party.add("topics", "sanction")
    sanction.add("listingDate", party.get("createdAt"))
    sanction.add("startDate", party.get("modifiedAt"))
    # Sanction measures add further summaries and program designations.
    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        sanction.add("program", ref_value("SanctionsType", measure.get("SanctionsTypeID")))
    context.emit(sanction)
    context.emit(party, target=True)
def parse_party(context: Context, distinct_party, locations, documents):
    """Build and emit the entity for one DistinctParty profile.

    Returns the created entity, or None when the party type is unmapped.
    """
    profile = distinct_party.find("Profile")
    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    party_type = ref_value("PartyType", sub_type.get("PartyTypeID"))
    # Prefer a schema mapped from the party type; fall back to one mapped
    # from the sub-type value.
    schema = TYPES.get(party_type, TYPES.get(sub_type.get("Value")))
    if schema is None:
        context.log.error("Unknown party type", value=party_type)
        return
    party = context.make(schema)
    party.id = context.make_slug(profile.get("ID"))
    party.add("notes", h.clean_note(distinct_party.findtext("Comment")))
    party.add("sourceUrl", URL % profile.get("ID"))
    for identity in profile.findall("./Identity"):
        # Map name-part group IDs to their semantic part type for aliases.
        name_parts = {}
        for group in identity.findall(".//NamePartGroup"):
            part_type = ref_value("NamePartType", group.get("NamePartTypeID"))
            name_parts[group.get("ID")] = part_type
        for alias in identity.findall("./Alias"):
            parse_alias(party, name_parts, alias)
        for regdoc in documents.get(identity.get("ID"), []):
            parse_registration_doc(context, party, regdoc)
    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)
    context.emit(party, target=True)
    return party
def crawl(context: Context):
    """Fetch the debarment API and emit each supplier as a LegalEntity.

    Fixes: the record loop reused ``data`` as its loop variable, shadowing
    the response payload, and the alias loop reused ``name``; both now use
    distinct names. Records whose name cleans down to nothing are skipped
    with a warning instead of raising IndexError on ``names[0]``.
    """
    url = context.dataset.data.url
    headers = {"apikey": context.dataset.data.api_key}
    data = context.fetch_json(url, headers=headers)
    # TODO write this out to a source.json
    for record in data["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        name = record.get("SUPP_NAME")
        ent_id = record.get("SUPP_ID")
        entity.id = context.make_slug(ent_id)
        names = clean_name(name)
        if not names:
            # Nothing usable left after cleaning; don't crash on names[0].
            context.log.warning("Empty name", name=name, id=ent_id)
            continue
        entity.add("name", names[0])
        entity.add("topics", "debarment")
        entity.add("country", record.get("COUNTRY_NAME"))
        # Any additional cleaned names are treated as aliases.
        for alias in names[1:]:
            entity.add("alias", alias)
        address = h.make_address(
            context,
            street=record.get("SUPP_ADDR"),
            city=record.get("SUPP_CITY"),
            country=record.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)
        sanction = h.make_sanction(context, entity)
        sanction.add("program", record.get("DEBAR_REASON"))
        sanction.add("startDate", h.parse_date(record.get("DEBAR_FROM_DATE"), FORMATS))
        sanction.add("endDate", h.parse_date(record.get("DEBAR_TO_DATE"), FORMATS))
        context.emit(entity, target=True)
        context.emit(sanction)
def crawl_person(context: Context) -> None:
    """Crawl the persons feed and emit each record as a sanctioned Person.

    Fix: birthplace fields were only popped when the English value was not
    "N/A", which left stale ``city_bd_*`` keys in the row and discarded the
    Russian/Ukrainian values whenever the English one was a placeholder.
    Each field is now popped unconditionally and "N/A" placeholders are
    skipped individually. ``country`` is popped (not just read) to match
    the sibling company crawler.
    """
    data = json_resource(context, context.dataset.data.url, "person")
    for row in data["data"]:
        row = clean_row(row)
        person_id = row.pop("person_id")
        name_en = row.pop("name_en", None)
        name_ru = row.pop("name_ru", None)
        name_uk = row.pop("name_uk", None)
        # Prefer the English name, then Russian, then Ukrainian.
        name = name_en or name_ru or name_uk
        entity = context.make("Person")
        entity.id = context.make_slug("person", person_id, name)
        entity.add("name", name)
        entity.add("alias", name_ru)
        entity.add("alias", name_uk)
        entity.add("birthDate", parse_date(row.pop("date_bd", None)))
        url = "https://sanctions.nazk.gov.ua/sanction-person/%s/"
        entity.add("sourceUrl", url % person_id)
        for field in ("city_bd_en", "city_bd_ru", "city_bd_uk"):
            value = row.pop(field, None)
            if value and value != "N/A":
                entity.add("birthPlace", value)
        entity.add("position", row.pop("position_en", None))
        entity.add("position", row.pop("position_ru", None))
        entity.add("position", row.pop("position_uk", None))
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))
        country = row.pop("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        context.emit(entity, target=True)
def parse_row(context: Context, row):
    """Emit a LegalEntity plus its sanction from one tabular source row."""
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(row.get("Effective_Date"), row.get("Name"))
    entity.add("name", row.get("Name"))
    entity.add("notes", row.get("Action"))
    entity.add("country", row.get("Country"))
    entity.add("modifiedAt", row.get("Last_Update"))
    addr = h.make_address(
        context,
        street=row.get("Street_Address"),
        postal_code=row.get("Postal_Code"),
        city=row.get("City"),
        region=row.get("State"),
        country=row.get("Country"),
    )
    h.apply_address(context, entity, addr)
    context.emit(entity, target=True)
    # The Federal Register citation doubles as the sanction key and program.
    fr_citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=fr_citation)
    sanction.add("program", fr_citation)
    sanction.add("startDate", h.parse_date(row.get("Effective_Date"), FORMATS))
    sanction.add("endDate", h.parse_date(row.get("Expiration_Date"), FORMATS))
    context.emit(sanction)
def crawl_common(context: Context, data: Dict[str, str], part: str, schema: str):
    """Populate the fields shared by all record kinds and emit the sanction.

    Returns the created entity so the caller can add kind-specific fields.
    """
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")
    entity.add("notes", h.clean_note(data.pop("COMMENTS1")))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))
    h.apply_name(
        entity,
        name1=data.pop("FIRST_NAME", None),
        name2=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )
    sanction = h.make_sanction(context, entity)
    submitted = parse_date(data.pop("SUBMITTED_ON", None))
    listed = parse_date(data.pop("LISTED_ON"))
    updated = parse_date(data.pop("LAST_DAY_UPDATED"))
    # Earliest known date wins for createdAt; fall back along the chain.
    entity.add("createdAt", submitted or listed or updated)
    entity.add("modifiedAt", updated)
    sanction.add("listingDate", submitted or listed)
    sanction.add("startDate", listed)
    sanction.add("program", data.pop("UN_LIST_TYPE"))
    sanction.add("program", data.pop("LIST_TYPE"))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
def crawl(context: Context):
    """Scrape an HTML table of sanctioned people/entities and emit them.

    Column headers are Polish-language (slugified below); sub-section rows
    switch the schema used for subsequent rows.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())
    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for row in table.findall(".//tr"):
            cells = [
                collapse_spaces(c.text_content()) for c in row.findall("./td")
            ]
            # First row of each table carries the column headers.
            if headers is None:
                headers = [slugify(c, sep="_") for c in cells]
                continue
            # Single-cell rows are section headings naming the entity type.
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue
            row = dict(zip(headers, cells))
            entity = context.make(schema)
            name = row.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            # Aliases follow the primary name in parentheses.
            names = name.split("(")
            entity.add("name", names[0])
            for alias in names[1:]:
                entity.add("alias", alias.split(")")[0])
            notes = row.pop("uzasadnienie_wpisu_na_liste")
            entity.add("notes", notes)
            details = row.pop("dane_identyfikacyjne_osoby_podmiotu")
            # Peel known labelled fragments off the END of the details
            # string, one label at a time (order of CHOPSKA matters).
            for (chop, prop) in CHOPSKA:
                parts = details.rsplit(chop, 1)
                details = parts[0]
                if len(parts) > 1:
                    if prop == "address":
                        addr = h.make_address(context, full=parts[1])
                        h.apply_address(context, entity, addr)
                    else:
                        entity.add(prop, parts[1])
            if len(details.strip()):
                # Any remainder is mapped via a hand-maintained lookup.
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)
            sanction = h.make_sanction(context, entity)
            provisions = row.pop("zastosowane_srodki_sankcyjne")
            sanction.add("provisions", provisions)
            start_date = row.pop("data_umieszczenia_na_liscie")
            # Strip the Polish date suffix (" r.") before parsing.
            start_date = start_date.replace(" r.", "")
            sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"]))
            h.audit_data(row)
            context.emit(entity, target=True)
            context.emit(sanction)
def parse_person(context: Context, data, country, lastmod):
    """Build a PEP Person entity from one person record.

    Returns the person's entity ID, or None when the record has no usable
    name or falls outside the cutoff. NOTE(review): keys look Popolo-style
    (other_names, links, identifiers, contact_details) — confirm upstream.
    """
    person_id = data.pop("id", None)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("nationality", country)
    name = data.get("name")
    # Skip records whose name is absent or a literal "unknown" placeholder.
    if name is None or name.lower().strip() in ("unknown", ):
        return
    person.add("modifiedAt", lastmod.date())
    person.add("name", data.pop("name", None))
    person.add("alias", data.pop("sort_name", None))
    for other in data.pop("other_names", []):
        person.add("alias", other.get("name"))
    person.add("gender", data.pop("gender", None))
    person.add("title", data.pop("honorific_prefix", None))
    person.add("title", data.pop("honorific_suffix", None))
    person.add("firstName", data.pop("given_name", None))
    person.add("lastName", data.pop("family_name", None))
    person.add("fatherName", data.pop("patronymic_name", None))
    person.add("birthDate", data.pop("birth_date", None))
    person.add("deathDate", data.pop("death_date", None))
    person.add("email", h.clean_emails(data.pop("email", None)))
    person.add("notes", data.pop("summary", None))
    person.add("topics", "role.pep")
    for link in data.pop("links", []):
        url = link.get("url")
        # Only generic web-presence links are kept as websites.
        if link.get("note") in ("website", "blog", "twitter", "facebook"):
            person.add("website", url)
        # elif "Wikipedia (" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # elif "wikipedia" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # else:
        #     person.log.info("Unknown URL", url=url, note=link.get("note"))
    for ident in data.pop("identifiers", []):
        identifier = ident.get("identifier")
        scheme = ident.get("scheme")
        # Only accept well-formed Wikidata QIDs.
        if scheme == "wikidata" and identifier.startswith("Q"):
            person.add("wikidataId", identifier)
    for contact_detail in data.pop("contact_details", []):
        value = contact_detail.get("value")
        if "email" == contact_detail.get("type"):
            person.add("email", h.clean_emails(value))
        if "phone" == contact_detail.get("type"):
            person.add("phone", h.clean_phones(value))
    # Drop people excluded by the dataset's cutoff rules (helper defined
    # elsewhere; presumably date-based — confirm).
    if check_person_cutoff(person):
        return
    # data.pop("image", None)
    # data.pop("images", None)
    # if len(data):
    #     pprint(data)
    context.emit(person, target=True)
    # entities[person_id] = person.id
    return person.id
def parse_reference(context: Context, reference: int, rows):
    """Merge all rows sharing one reference number into a single entity.

    Pass 1 determines the schema (must be unanimous across rows); pass 2
    collects names to find the primary one for the ID; pass 3 applies the
    remaining fields and builds the sanction.
    """
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    # All rows of one reference must agree on the entity schema.
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        # The row mapped to the plain "name" property defines the ID.
        if name_prop == "name":
            primary_name = name
    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)
    primary_name = None
    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        # Citizenship values are enumerated "a) ... b) ..." in one cell.
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
            # TODO: consider parsing if it's not a datetime?
        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_node(context: Context, node):
    """Emit a Person for one MEP plus memberships in their national party
    and European political group.

    NOTE(review): nesting of the membership emission under the id check is
    reconstructed from collapsed source — confirm against the original file.
    """
    mep_id = node.findtext(".//id")
    person = context.make("Person")
    person.id = context.make_slug(mep_id)
    url = "http://www.europarl.europa.eu/meps/en/%s" % mep_id
    person.add("sourceUrl", url)
    name = node.findtext(".//fullName")
    person.add("name", name)
    first_name, last_name = split_name(name)
    person.add("firstName", first_name)
    person.add("lastName", last_name)
    person.add("nationality", node.findtext(".//country"))
    person.add("topics", "role.pep")
    context.emit(person, target=True)
    party_name = node.findtext(".//nationalPoliticalGroup")
    # Independents have no national party to link.
    if party_name not in ["Independent"]:
        party = context.make("Organization")
        party.id = context.make_slug("npg", party_name)
        if party.id is not None:
            party.add("name", party_name)
            party.add("country", node.findtext(".//country"))
            context.emit(party)
            membership = context.make("Membership")
            membership.id = context.make_id(person.id, party.id)
            membership.add("member", person)
            membership.add("organization", party)
            context.emit(membership)
    group_name = node.findtext(".//politicalGroup")
    group = context.make("Organization")
    group.id = context.make_slug("pg", group_name)
    if group.id is not None:
        group.add("name", group_name)
        # European political groups are EU-level bodies.
        group.add("country", "eu")
        context.emit(group)
        membership = context.make("Membership")
        membership.id = context.make_id(person.id, group.id)
        membership.add("member", person)
        membership.add("organization", group)
        context.emit(membership)
def crawl(context: Context):
    """Crawl a JSON list of US bank enforcement actions, emitting the bank,
    the sanctioned party (company or individual) and the sanction."""
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        data = json.load(fh)
    for record in data:
        bank = context.make("Company")
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        # Records lacking both charter number and name yield no slug.
        if bank.id is not None:
            context.emit(bank)
        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        # The enforcement target is either a company or a person.
        if company_name:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        else:
            entity = context.make("Person")
            entity.id = context.make_id(charter_no, bank_name, first_name, last_name)
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")
        addr = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        # Consume the abbreviation so audit_data-style checks stay clean.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)
        sanction = h.make_sanction(context, entity)
        sanction.add("startDate", record.pop("CompleteDate", None))
        sanction.add("endDate", record.pop("TerminationDate", None))
        sanction.add("program", record.pop("EnforcementTypeDescription", None))
        sanction.add("authorityId", record.pop("DocketNumber", None))
        # context.pprint(record)
        context.emit(entity, target=True)
        context.emit(sanction)
def crawl(context: Context):
    """Scrape a legal-provisions HTML page listing sanctioned persons.

    Person details (nationality, DOB, passport, ...) are embedded in
    bracketed terms after the name and parsed heuristically.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    for node in doc.findall(".//td[@class='tailSTxt']"):
        # Only the section numbered "2." contains the person listings.
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            # Everything before the first parenthesis is the name block;
            # "s/o" (son of) and "@" (alias) separate multiple names.
            name, _ = text.split("(", 1)
            names = multi_split(name, ["s/o", "@"])
            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")
            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)
            for match in IN_BRACKETS.findall(text):
                # match = match.replace("\xa0", "")
                # 1) explicit lookup table takes precedence...
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                # 2) ...then the "<country> citizen" pattern...
                if match.endswith("citizen"):
                    nat = match.replace("citizen", "")
                    entity.add("nationality", nat)
                    continue
                # 3) ...then date-of-birth and passport prefixes.
                if match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                context.log.warn("Unparsed bracket term", term=match)
            context.emit(entity, target=True)
            context.emit(sanction)
def parse_entry(context: Context, node: _Element):
    """Emit a LegalEntity or Person (plus sanction) from one XML entry.

    Fix: ``country, _ = program.split("/")`` raised ValueError whenever the
    Country value contained more than one slash; the split is now bounded
    to the first slash with ``maxsplit=1``.
    """
    entity_name = node.findtext("./Entity")
    dob = node.findtext("./DateOfBirth")
    schedule = node.findtext("./Schedule")
    if schedule == "N/A":
        schedule = ""
    program = node.findtext("./Country")
    item = node.findtext("./Item")
    if entity_name is not None:
        entity = context.make("LegalEntity")
        # Multiple renderings of the entity name are separated by "/".
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        last_name = node.findtext("./LastName")
        entity_name = h.make_name(given_name=given_name, last_name=last_name)
        entity.add("name", entity_name)
        entity.add("birthDate", dob)
    country = program
    if program is not None and "/" in program:
        # Only the part before the first slash is the country.
        country, _ = program.split("/", 1)
    entity.add("country", country)
    entity.id = context.make_slug(
        schedule,
        item,
        entity.first("country"),
        entity_name,
        strict=False,
    )
    sanction = h.make_sanction(context, entity)
    sanction.add("program", program)
    sanction.add("reason", schedule)
    sanction.add("authorityId", item)
    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            name = collapse_spaces(name)
            entity.add("alias", name)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    """Read the wallet JSON dump and emit one CryptoWallet per record."""
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        payload = json.load(fh)
    for record in payload.get("result", []):
        wallet = context.make("CryptoWallet", target=True)
        # The address doubles as the identifier and the public key.
        wallet.id = context.make_slug(record.get("address"))
        wallet.add("publicKey", record.pop("address"))
        wallet.add("topics", "crime.theft")
        wallet.add("createdAt", record.pop("createdAt"))
        wallet.add("modifiedAt", record.pop("updatedAt"))
        wallet.add("alias", record.pop("family"))
        wallet.add("balance", format_number(record.pop("balance")))
        wallet.add("amountUsd", format_number(record.pop("balanceUSD")))
        wallet.add("currency", record.pop("blockchain"))
        # Transactions are intentionally not modelled.
        h.audit_data(record, ignore=["transactions"])
        context.emit(wallet)
def crawl_physical(context: Context) -> None:
    """Crawl the feed of sanctioned individuals and emit Person entities."""
    rows = json_resource(context, PHYSICAL_URL, "physical")
    for record in rows:
        person = context.make("Person")
        person.id = context.make_slug(record.pop("ukaz_id"), record.pop("index"))
        person.add("name", record.pop("name_ukr", None))
        person.add("name", record.pop("name_original", None))
        # Alternative names come as one delimited string.
        for alt_name in multi_split(record.pop("name_alternative", None), [";", "/"]):
            person.add("alias", alt_name)
        person.add("notes", record.pop("additional", None))
        for nationality in multi_split(record.pop("citizenship", None), [", "]):
            person.add("nationality", nationality)
        person.add("birthDate", record.pop("birthdate", None))
        person.add("birthPlace", record.pop("birthplace", None))
        person.add("position", record.pop("occupation", None))
        handle_address(context, person, record.pop("livingplace", None))
        handle_sanction(context, person, record)
        context.emit(person, target=True)
def crawl_legal(context: Context) -> None:
    """Crawl the feed of sanctioned legal entities and emit Organizations."""
    rows = json_resource(context, LEGAL_URL, "legal")
    for record in rows:
        org = context.make("Organization")
        org.id = context.make_slug(record.pop("ukaz_id"), record.pop("index"))
        org.add("name", record.pop("name_ukr", None))
        org.add("name", record.pop("name_original", None))
        for alt_name in multi_split(record.pop("name_alternative", None), [";", "/"]):
            org.add("alias", alt_name)
        org.add("notes", record.pop("additional", None))
        # Identifier values embed their own field labels; strip them out.
        tax_number = (record.pop("ipn", "") or "").replace("ІПН", "")
        org.add("taxNumber", tax_number)
        reg_number = (record.pop("odrn_edrpou", "") or "").replace("ОДРН", "")
        org.add("registrationNumber", reg_number)
        handle_address(context, org, record.pop("place", None))
        handle_address(context, org, record.pop("place_alternative", None))
        handle_sanction(context, org, record)
        context.emit(org, target=True)
def crawl(context: Context):
    """Crawl an Atom-style XML feed and emit each entry as an Organization.

    Fix: ``findtext("./summary")`` can return ``None`` for an entry with no
    summary element; the original compared it against "N/A" and then called
    ``.split`` on it, raising AttributeError. The None case is now guarded.
    """
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for node in doc.findall("./entry"):
        entity = context.make("Organization")
        name = node.findtext("./title")
        entity.id = context.make_slug(node.findtext("./id"), name)
        entity.add("name", name)
        link = node.find("./link").get("href")
        entity.add("sourceUrl", link)
        # The summary holds a comma-separated alias list, or "N/A".
        aliases = node.findtext("./summary")
        if aliases is not None and aliases != "N/A":
            entity.add("alias", aliases.split(", "))
        entity.add("notes", node.findtext("./content"))
        entity.add("createdAt", node.findtext("./published"))
        entity.add("modifiedAt", node.findtext("./updated"))
        entity.add("topics", "crime.terror")
        context.emit(entity, target=True)
def parse_common(context: Context, entity, node):
    """Apply the fields shared by all node kinds; returns the sanction
    so the caller can extend it."""
    entity.id = context.make_slug(node.findtext("./DATAID"))
    h.apply_name(
        entity,
        given_name=node.findtext("./FIRST_NAME"),
        second_name=node.findtext("./SECOND_NAME"),
        name3=node.findtext("./THIRD_NAME"),
        name4=node.findtext("./FOURTH_NAME"),
        quiet=True,
    )
    entity.add("alias", node.findtext("./NAME_ORIGINAL_SCRIPT"))
    entity.add("notes", h.clean_note(node.findtext("./COMMENTS1")))
    entity.add("topics", "sanction")
    sanction = h.make_sanction(context, entity)
    # The listing date serves as creation, listing and start date alike.
    listed_on = node.findtext("./LISTED_ON")
    entity.add("createdAt", listed_on)
    sanction.add("listingDate", listed_on)
    sanction.add("startDate", listed_on)
    updated = values(node.find("./LAST_DAY_UPDATED"))
    sanction.add("modifiedAt", updated)
    entity.add("modifiedAt", updated)
    sanction.add("program", node.findtext("./UN_LIST_TYPE"))
    sanction.add("unscId", node.findtext("./REFERENCE_NUMBER"))
    return sanction
def crawl_country(context: Context, params, path, country):
    """Crawl one country page from the CMS API and emit its office-holders.

    Fix: ``blocks.get("free_form_content", []).get("content")`` used a list
    as the default and then called ``.get`` on it, which raises
    AttributeError whenever the key is missing; the default is now a dict.
    """
    source_url = UI_URL % path
    context.log.debug("Crawling country: %s" % country)
    res = context.fetch_json(DATA_URL % path, params=params)
    data = res.get("result", {}).get("data", {}).get("page", {})
    blocks = data.get("acf", {}).get("blocks", [{}])[0]
    content = blocks.get("free_form_content", {}).get("content")
    doc = html.fromstring(content)
    function = None
    for i, el in enumerate(doc.getchildren()):
        text = el.text_content().strip()
        if el.tag == "h2":
            continue
        if el.tag == "h3":
            # h3 headings carry the position held by the names below them.
            function = text
            continue
        if i == 0 and el.tag == "p":
            # this paragraph at the start is a note, not a person
            continue
        name = text.replace("(Acting)", "")
        if is_empty(name):
            continue
        context.log.debug(
            "Person",
            country=country,
            name=name,
            function=function,
            url=source_url,
        )
        person = context.make("Person")
        person.id = context.make_slug(country, name, function)
        person.add("name", name)
        person.add("country", country)
        person.add("position", function)
        person.add("sourceUrl", source_url)
        person.add("topics", "role.pep")
        context.emit(person, target=True)
def crawl_entity(context: Context, data):
    """Emit one sanctioned entity from a registry record."""
    nature = data.pop("Nature")
    schema = SCHEMATA.get(nature)
    if schema is None:
        context.log.error("Unknown entity type", nature=nature)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(data.pop("IdRegistre"))
    sanction = h.make_sanction(context, entity)
    # Each detail record carries a typed field with one or more values.
    for detail in data.pop("RegistreDetail"):
        field = detail.pop("TypeChamp")
        for value in detail.pop("Valeur"):
            apply_prop(context, entity, sanction, field, value)
    surname = data.pop("Nom")
    # Combine the surname with any first name the details already set.
    h.apply_name(
        entity,
        first_name=entity.first("firstName", quiet=True),
        tail_name=surname,
        quiet=True,
    )
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
def parse_entry(context: Context, target, programs, places, updated_at):
    """Parse one sanctions target (entity, individual or object/vessel)
    into an entity plus sanction and emit both, along with any relations."""
    entity = context.make("LegalEntity")
    node = target.find("./entity")
    if node is None:
        node = target.find("./individual")
        entity = context.make("Person")
    if node is None:
        node = target.find("./object")
        object_type = node.get("object-type")
        # Only vessels are expected among "object" targets.
        if object_type != "vessel":
            context.log.warning("Unknown target type", target=target, object_type=object_type)
        entity = context.make("Vessel")
    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)
    for other in node.findall("./other-information"):
        value = other.text.strip()
        # Vessel IMO numbers arrive as "IMO: <num>" free-text entries.
        if entity.schema.is_a("Vessel") and value.lower().startswith("imo"):
            _, imo_num = value.split(":", 1)
            entity.add("imoNumber", imo_num)
        else:
            entity.add("notes", h.clean_note(value))
    sanction = h.make_sanction(context, entity)
    # Modification records double as listing/start dates and as bounds
    # for createdAt/modifiedAt.
    dates = set()
    for mod in target.findall("./modification"):
        dates.add(mod.get("publication-date"))
        sanction.add("listingDate", mod.get("publication-date"))
        sanction.add("startDate", mod.get("effective-date"))
    dates_ = [d for d in dates if d is not None]
    if len(dates_):
        entity.add("createdAt", min(dates_))
        entity.add("modifiedAt", max(dates_))
    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))
    for justification in node.findall("./justification"):
        # TODO: should this go into sanction:reason?
        entity.add("notes", h.clean_note(justification.text))
    for relation in node.findall("./relation"):
        rel_type = relation.get("relation-type")
        target_id = context.make_slug(relation.get("target-id"))
        res = context.lookup("relations", rel_type)
        if res is None:
            context.log.warn(
                "Unknown relationship type",
                type=rel_type,
                source=entity,
                target=target_id,
            )
            continue
        rel = context.make(res.schema)
        rel.id = context.make_slug(relation.get("ssid"))
        rel.add(res.source, entity.id)
        rel.add(res.target, target_id)
        rel.add(res.text, rel_type)
        # rel_target = context.make(rel.schema.get(res.target).range)
        # rel_target.id = target_id
        # context.emit(rel_target)
        # Upgrade the entity schema to whatever the relation requires of
        # its source endpoint.
        entity.add_schema(rel.schema.get(res.source).range)
        context.emit(rel)
    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_item(context: Context, listing):
    """Crawl one disqualified-officer listing: fetch the officer's detail
    record, then emit the person, each disqualification as a sanction, and
    the companies they were disqualified in relation to."""
    links = listing.get("links", {})
    url = urljoin(API_URL, links.get("self"))
    data = http_get(context, url, cache_days=14)
    person = context.make("Person")
    # The officer ID is the last path segment of the API URL.
    _, officer_id = url.rsplit("/", 1)
    person.id = context.make_slug(officer_id)
    person.add("name", listing.get("title"))
    person.add("notes", listing.get("description"))
    person.add("topics", "crime")
    source_url = urljoin(WEB_URL, links.get("self"))
    person.add("sourceUrl", source_url)
    last_name = data.pop("surname", None)
    person.add("lastName", last_name)
    forename = data.pop("forename", None)
    person.add("firstName", forename)
    other_forenames = data.pop("other_forenames", None)
    person.add("middleName", other_forenames)
    person.add("title", data.pop("title", None))
    nationality = data.pop("nationality", None)
    if nationality is not None:
        # Multiple nationalities arrive comma-separated.
        person.add("nationality", nationality.split(","))
    person.add("birthDate", data.pop("date_of_birth", None))
    person.add("topics", "crime")
    address = listing.get("address", {})
    address = h.make_address(
        context,
        full=listing.get("address_snippet"),
        street=address.get("address_line_1"),
        street2=address.get("premises"),
        city=address.get("locality"),
        postal_code=address.get("postal_code"),
        region=address.get("region"),
        # country_code=person.first("nationality"),
    )
    h.apply_address(context, person, address)
    for disqual in data.pop("disqualifications", []):
        case_id = disqual.get("case_identifier")
        sanction = h.make_sanction(context, person, key=case_id)
        sanction.add("recordId", case_id)
        sanction.add("startDate", disqual.get("disqualified_from"))
        sanction.add("endDate", disqual.get("disqualified_until"))
        sanction.add("listingDate", disqual.get("undertaken_on"))
        # Reasons are a mapping of slug-like keys to slug-like values.
        for key, value in disqual.get("reason", {}).items():
            value = value.replace("-", " ")
            reason = f"{key}: {value}"
            sanction.add("reason", reason)
        sanction.add("country", "gb")
        context.emit(sanction)
        # The disqualification's own address is applied to the companies.
        address = disqual.get("address", {})
        address = h.make_address(
            context,
            full=listing.get("address_snippet"),
            street=address.get("address_line_1"),
            street2=address.get("premises"),
            city=address.get("locality"),
            postal_code=address.get("postal_code"),
            region=address.get("region"),
            # country_code=person.first("nationality"),
        )
        for company_name in disqual.get("company_names", []):
            company = context.make("Company")
            company.id = context.make_slug("named", company_name)
            company.add("name", company_name)
            company.add("jurisdiction", "gb")
            context.emit(company)
            h.apply_address(context, company, address)
            directorship = context.make("Directorship")
            directorship.id = context.make_id(person.id, company.id)
            directorship.add("director", person)
            directorship.add("organization", company)
            context.emit(directorship)
    context.emit(person, target=True)
def company_id(context: Context, id: str):
    """Return the dataset-scoped slug ID for a company record."""
    slug = context.make_slug("company", id)
    return slug
def short_id(context: Context, id: str) -> str:
    """Abbreviate an entity ID by shortening the person/company slug prefixes
    to "p"/"c" respectively."""
    shortened = id.replace(context.make_slug("person"), "p")
    shortened = shortened.replace(context.make_slug("company"), "c")
    return shortened
def crawl_person(context: Context, data: Dict[str, Any]):
    """Convert one upstream person record into a Person entity and emit it.

    Consumes ``data`` destructively via ``pop`` so that any keys left over
    could later be reported by the (currently disabled) audit calls. Also
    emits related persons and the relationship entities linking them.

    :param context: Crawler context used to make, look up and emit entities.
    :param data: One person record (JSON dict) from the source API.
    """
    is_pep = data.pop("is_pep", False)
    entity = context.make("Person", target=is_pep)
    wikidata_id = clean_wdid(data.pop("wikidata_id", None))
    entity.id = person_id(context, data.pop("id"), wikidata_id)
    entity.add("sourceUrl", data.pop("url_en", None))
    # Russian-language URL is intentionally discarded.
    data.pop("url_ru", None)
    entity.add("modifiedAt", data.pop("last_change", None))
    entity.add("wikidataId", wikidata_id)
    entity.add("name", data.pop("full_name_en", None))
    entity.add("name", data.pop("full_name_ru", None))
    entity.add("alias", data.pop("inversed_full_name_en", None))
    entity.add("alias", data.pop("inversed_full_name_ru", None))
    entity.add("alias", data.pop("also_known_as_en", None))
    entity.add("alias", data.pop("also_known_as_ru", None))
    entity.add("alias", split_names(data.pop("names", [])))
    entity.add("birthDate", parse_date(data.pop("date_of_birth", None)))
    entity.add("deathDate", parse_date(data.pop("termination_date_human", None)))
    entity.add("birthPlace", data.pop("city_of_birth_ru", None))
    entity.add("birthPlace", data.pop("city_of_birth_en", None))
    entity.add("innCode", data.pop("inn", None))
    entity.add("firstName", data.pop("first_name_en", None))
    entity.add("firstName", data.pop("first_name_ru", None))
    entity.add("fatherName", data.pop("patronymic_en", None))
    entity.add("fatherName", data.pop("patronymic_ru", None))
    entity.add("lastName", data.pop("last_name_en", None))
    entity.add("lastName", data.pop("last_name_ru", None))
    # Last known job: combine workplace and title for each language variant.
    for suffix in ("", "_en", "_ru"):
        role = data.pop(f"last_job_title{suffix}", None)
        org = data.pop(f"last_workplace{suffix}", None)
        if org is None or not len(org.strip()):
            continue
        position = org
        if role is not None and len(role.strip()):
            position = f"{org} ({role})"
        entity.add("position", position)
    # Country links: map the relationship type to an entity property (if any)
    # via the "country_links" lookup.
    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru")
        # print(country_name)
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warn(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        if res.prop is not None:
            entity.add(res.prop, country_name)
        # h.audit_data(country_data)
    # Person-to-person relationships: emit the other person plus a link
    # entity whose schema/props come from the "person_relations" lookup.
    for rel_data in data.pop("related_persons", []):
        other_pep = rel_data.pop("is_pep", False)
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id"))
        other = context.make("Person", target=other_pep)
        other.id = person_id(context, rel_data.pop("person_id"), other_wdid)
        other.add("name", rel_data.pop("person_en", None))
        other.add("name", rel_data.pop("person_ru", None))
        other.add("wikidataId", other_wdid)
        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.warn(
                "Unknown person/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other,
            )
            continue
        # print("LINK", (entity.id, other.id))
        # Sort the endpoint IDs so the relation ID is direction-independent.
        id_a, id_b = sorted((entity.id, other.id))
        rel = context.make(res.schema)
        id_a_short = short_id(context, id_a)
        id_b_short = short_id(context, id_b)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, id_a)
        rel.add(res.to_prop, id_b)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed")))
        rel.add("startDate", parse_date(rel_data.pop("date_established")))
        rel.add("endDate", parse_date(rel_data.pop("date_finished")))
        # h.audit_data(rel_data)
        context.emit(other, target=other_pep)
        context.emit(rel)
    data.pop("type_of_official_ru", None)
    person_type = data.pop("type_of_official_en", None)
    person_topic = context.lookup_value("person_type", person_type)
    if person_topic is None:
        context.log.warn("Unknown type of official", type=person_type)
    entity.add("topics", person_topic)
    if is_pep:
        entity.add("topics", "role.pep")
    entity.add("status", person_type)
    # Remaining fields are deliberately discarded.
    data.pop("died", None)
    data.pop("tags", None)
    data.pop("reason_of_termination_en", None)
    data.pop("reason_of_termination_ru", None)
    # TODO: store images
    data.pop("photo", None)
    data.pop("related_companies", None)
    data.pop("declarations", None)
    # h.audit_data(data)
    context.emit(entity, target=is_pep)
def parse_relation(context: Context, el, parties): type_id = el.get("RelationTypeID") type_ = ref_value("RelationType", el.get("RelationTypeID")) from_id = context.make_slug(el.get("From-ProfileID")) from_party = parties.get(from_id) if from_party is None: context.log.warn("Missing relation 'from' party", entity_id=from_id, type=type_) return to_id = context.make_slug(el.get("To-ProfileID")) to_party = parties.get(to_id) if to_party is None: context.log.warn("Missing relation 'to' party", entity_id=to_id, type=type_) return # if type_id == "15003": # print( # "REL", # from_party.caption, # from_party.schema.name, # type_, # to_party.caption, # to_party.schema.name, # ) relation = lookup("relations", type_id) if relation is None: context.log.warn( "Unknown relation type", type_id=type_id, type_value=type_, from_party=from_party, to_party=to_party, ) return entity = context.make(relation.schema) from_range = entity.schema.get(relation.from_prop).range to_range = entity.schema.get(relation.to_prop).range # HACK: Looks like OFAC just has some link in a direction that makes no # semantic sense, so we're flipping them here. if disjoint_schema(from_party, from_range) or disjoint_schema(to_party, to_range): # context.log.warning( # "Flipped relation", # from_party=from_party, # to_party=to_party, # schema=entity.schema, # type=type_, # ) from_party, to_party = to_party, from_party add_schema(from_party, from_range) add_schema(to_party, to_range) context.emit(from_party, target=True) context.emit(to_party, target=True) entity.id = context.make_id("Relation", from_party.id, to_party.id, el.get("ID")) entity.add(relation.from_prop, from_party) entity.add(relation.to_prop, to_party) entity.add(relation.description_prop, type_) entity.add("summary", el.findtext("./Comment")) context.emit(entity) context.log.debug("Relation", from_=from_party, type=type_, to=to_party)
def crawl_person(context: Context, name, url):
    """Crawl one Committee of the Regions member page into a Person entity.

    Parses the member detail HTML, mapping each labelled row of the details
    list onto entity properties, collecting postal address parts, and
    emitting political-group memberships.

    :param context: Crawler context used to make and emit entities.
    :param name: Member name as "Last, First" from the listing page.
    :param url: Member detail page URL.
    """
    context.log.debug("Crawling member", name=name, url=url)
    doc = context.fetch_html(url)
    _, person_id = url.rsplit("/", 1)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("sourceUrl", url)
    person.add("name", name)
    person.add("topics", "role.pep")
    last_name, first_name = name.split(", ", 1)
    person.add("firstName", first_name)
    person.add("lastName", last_name)
    address = {}
    details = doc.find('.//div[@class="regular-details"]')
    for row in details.findall('.//ul[@class="no-bullet"]/li'):
        children = row.getchildren()
        title = children[0]
        title_text = collapse_spaces(stringify(title.text_content()))
        # Rows without a text label are identified by their CSS class.
        title_text = title_text or title.get("class")
        value = collapse_spaces(title.tail)
        if title_text in ("Full name:", "Address:", "Declaration of interests"):
            # ignore these.
            continue
        if title_text == "Emails:":
            emails = [e.text for e in row.findall(".//a")]
            person.add("email", emails)
            continue
        if "glyphicon-phone" in title_text:
            person.add("phone", value.split(","))
            continue
        if "fa-fax" in title_text:
            # TODO: yeah, no
            # person.add("phone", value)
            continue
        if title_text in ("Web sites:", "list-inline"):
            sites = [e.get("href") for e in row.findall(".//a")]
            person.add("website", sites)
            continue
        if title_text == "Represented Country:":
            person.add("country", value)
            continue
        if title_text == "Languages:":
            # TODO: missing in FtM
            # person.add("languages", value.split(','))
            continue
        if "Regions since:" in title_text:
            date = h.parse_date(value, FORMATS)
            person.add("createdAt", date)
            continue
        if "Date of birth:" in title_text:
            person.add("birthDate", h.parse_date(value, FORMATS))
            continue
        if "Commissions:" in title_text:
            for com in row.findall(".//li"):
                text = collapse_spaces(com.text_content())
                sep = "Mandate - "
                if sep in text:
                    _, text = text.split(sep, 1)
                person.add("sector", text)
            continue
        if "Areas of interest:" in title_text:
            for area in row.findall(".//li"):
                person.add("keywords", area.text_content())
            continue
        if title.tag == "i" and value is None:
            person.add("position", title_text)
            continue
        # FIX: was `title_text in ("Country:")` — a string membership test
        # (the parentheses don't make a tuple), which matched any substring
        # of "Country:". Use equality instead.
        if title_text == "Country:":
            person.add("country", value)
        if title_text in ("Street:", "Postal code:", "City:", "Country:"):
            address[title_text.replace(":", "")] = value
            continue
        if title_text == "Political group:":
            group = context.make("Organization")
            group.add("name", value)
            # Slug from the abbreviation in parentheses when present.
            slug = value
            if "(" in slug:
                _, slug = slug.rsplit("(", 1)
            slug = slugify(slug, sep="-")
            group.id = f"eu-cor-group-{slug}"
            context.emit(group)
            member = context.make("Membership")
            member.id = context.make_id("Membership", person.id, group.id)
            member.add("member", person)
            member.add("organization", group)
            context.emit(member)
            continue
    # FIX: the postal-code key was misspelled "Posal code", so the postal
    # code collected above was always dropped.
    address = h.make_address(
        context,
        street=address.get("Street"),
        city=address.get("City"),
        postal_code=address.get("Postal code"),
        country=address.get("Country"),
    )
    h.apply_address(context, person, address)
    context.emit(person, target=True)
def crawl(context: Context): for page in count(1): url = str(context.dataset.data.url) url = url.replace("pPageNumber=1", "pPageNumber=%s" % page) headers = { "Accept": "application/json", "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals", } res = context.http.get(url, headers=headers) ids = [] for row in res.json(): for field, value in list(row.items()): if value == "N/A": row[field] = "" row_id = row.pop("id") ids.append(row_id) entity_type = row.pop("entity") schema = context.lookup_value("types", entity_type) if schema is None: context.log.warning("Unknown entity type", entity=entity_type) continue entity = context.make(schema) entity.id = context.make_slug(row_id) entity.add("name", row.pop("firmName")) entity.add("topics", "debarment") entity.add("alias", row.pop("additionalName")) entity.add("notes", row.pop("title")) entity.add("notes", row.pop("additionalTitle")) entity.add("country", parse_countries(row.pop("country"))) nat = "nationality" if schema == "Company": nat = "jurisdiction" entity.add(nat, parse_countries(row.pop("nationality"))) affiliated = row.pop("affiliatedWithEntityId") if len(affiliated): link = context.make("UnknownLink") link.id = context.make_id(row_id, affiliated) link.add("subject", entity.id) link.add("object", context.make_slug(affiliated)) context.emit(link) sanction = h.make_sanction(context, entity) sanction.add("status", row.pop("statusName")) sanction.add("reason", row.pop("grounds")) sanction.add("authority", row.pop("source")) sanction.add("authority", row.pop("idBinstSource")) sanction.add("program", row.pop("idBinstType")) sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS)) sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS)) # context.pprint(row) context.emit(sanction) context.emit(entity, target=True) if min(ids) == 1: return
def crawl(context: Context):
    """Crawl the Russian-language UN sanctions HTML table.

    Each table row holds a free-text body in which labelled segments
    (gender, addresses, passports, etc.) are peeled off right-to-left with
    ``maybe_rsplit``; whatever remains at the end is treated as the list of
    names. Entities start as "Thing" and are upgraded via ``add_cast`` as
    person/company-specific fields are found.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    tables = doc.findall(".//table")
    assert len(tables) == 1
    rows = tables[0].findall(".//tr")
    # Skip the two header rows.
    for row in rows[2:]:
        cells = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
        index = cells[0]
        body = cells[1]
        decision = cells[2]
        un_id = cells[3]
        listing_date = cells[4]
        entity = context.make("Thing")
        entity.id = context.make_slug(index, un_id)
        entity.add("notes", cells[5])
        sanction = h.make_sanction(context, entity)
        sanction.add("listingDate", clean_date(listing_date))
        sanction.add("program", decision)
        sanction.add("recordId", un_id)
        # Peel labelled segments off the end of the body, one label at a time.
        body, gender = maybe_rsplit(body, "пол:")
        entity.add_cast("Person", "gender", h.clean_gender(gender))
        body, gender = maybe_rsplit(body, "Пол:")
        entity.add_cast("Person", "gender", h.clean_gender(gender))
        body, location = maybe_rsplit(body, "местонахождение:")
        entity.add_cast("LegalEntity", "country", location)
        # IMO numbers are parsed out but not currently stored.
        body, imo_num = maybe_rsplit(body, "Присвоенный ИМО номер компании:")
        body, imo_num = maybe_rsplit(body, "Номер ИМО:")
        body, emails = maybe_rsplit(body, "Адрес эл. почты:")
        for email in letter_split(emails):
            entity.add_cast("LegalEntity", "email", email)
        # Fax numbers are parsed out but not currently stored.
        body, fax = maybe_rsplit(body, "Номер факса:")
        body, fax = maybe_rsplit(body, "Факс:")
        body, phones = maybe_rsplit(body, "Номера телефонов:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, phones = maybe_rsplit(body, "Тел.:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, swift = maybe_rsplit(body, "СВИФТ-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)
        body, swift = maybe_rsplit(body, "СВИФТ/БИК-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)
        body, other_info = maybe_rsplit(body, "Прочая информация:")
        entity.add_cast("Thing", "notes", other_info)
        body, listing_date = maybe_rsplit(body, "Дата внесения в перечень:")
        body, addresses = maybe_rsplit(body, "Адрес:")
        for address in letter_split(addresses):
            country = address
            if ", " in country:
                # FIX: rsplit returns a list — take the trailing segment as
                # the country name; previously the whole list was assigned
                # and passed on to the country cleaner.
                country = address.rsplit(", ", 1)[-1]
            code = registry.country.clean(country, fuzzy=True)
            obj = h.make_address(context, full=address, country_code=code)
            h.apply_address(context, entity, obj)
            entity.add("country", code)
        body, national_ids = maybe_rsplit(
            body, "Национальный идентификационный номер:"
        )
        for national_id in letter_split(national_ids):
            entity.add_cast("LegalEntity", "idNumber", national_id)
        body, passport_nos = maybe_rsplit(body, "Паспорт №:")
        for passport_no in letter_split(passport_nos):
            entity.add_cast("Person", "passportNumber", passport_no)
        body, citizenship = maybe_rsplit(body, "Гражданство:")
        entity.add_cast("Person", "nationality", citizenship)
        aka = "На основании менее достоверных источников также известен как:"
        body, aka = maybe_rsplit(body, aka)
        entity.add("alias", letter_split(aka))
        strong_aka = "На основании достоверных источников также известен как:"
        body, strong_aka = maybe_rsplit(body, strong_aka)
        entity.add("alias", letter_split(strong_aka))
        # R.I.C. number is parsed out but not currently stored.
        body, rik_no = maybe_rsplit(body, "Р.И.К.:")
        body, birth_place = maybe_rsplit(body, "Место рождения:")
        entity.add_cast("Person", "birthPlace", birth_place)
        body, birth_dates = maybe_rsplit(body, "Дата рождения:")
        for birth_date in letter_split(birth_dates):
            entity.add_cast("Person", "birthDate", clean_date(birth_date))
        body, position = maybe_rsplit(body, "Должность:")
        entity.add_cast("Person", "position", position)
        body, job = maybe_rsplit(body, "Обращение:")
        entity.add_cast("Person", "position", job)
        body, aliases = maybe_rsplit(body, "Другие названия:")
        entity.add("alias", letter_split(aliases))
        body, aliases = maybe_rsplit(body, "Вымышленные названия:")
        entity.add("alias", letter_split(aliases))
        # Whatever is left of the body is the comma-separated list of names.
        names = body.split(", ")
        entity.add("name", names)
        # context.pprint(names)
        if entity.schema.name == "Thing":
            # No person/company-specific fields were found; default to a
            # generic legal entity.
            entity.schema = model.get("LegalEntity")
        context.emit(entity, target=True)
        context.emit(sanction)