def parse_reference(context, reference, rows):
    """Merge all rows sharing ``reference`` into one entity and one sanction.

    The entity starts as a ``LegalEntity`` and is switched to ``Person`` as
    soon as a row is typed "Individual". Rows are consumed with ``pop``, so
    the dicts in ``rows`` are mutated. The entity (as a target) and the
    sanction are emitted once, after every row has been applied.
    """
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    # entity.add("sourceUrl", context.dataset.url)
    sanction = h.make_sanction(context, entity)

    for record in rows:
        if record.pop("type") == "Individual":
            entity.schema = model.get("Person")

        # Rows flagged "aka" hold an alias; every other name type is a name.
        label = record.pop("name_of_individual_or_entity", None)
        name_prop = "alias" if record.pop("name_type") == "aka" else "name"
        entity.add(name_prop, label)

        location = h.make_address(context, full=record.pop("address"))
        h.apply_address(context, entity, location)
        sanction.add("program", record.pop("committees"))

        nationalities = multi_split(
            record.pop("citizenship"),
            ["a)", "b)", "c)", "d)"],
        )
        entity.add("nationality", nationalities, quiet=True)
        birth_dates = clean_date(record.pop("date_of_birth"))
        entity.add("birthDate", birth_dates, quiet=True)
        entity.add("birthPlace", record.pop("place_of_birth"), quiet=True)
        entity.add("notes", record.pop("additional_information"))
        entity.add("notes", record.pop("listing_information"), quiet=True)

        # The control date is recorded on both objects and in the entity
        # context as an ISO string.
        updated = record.pop("control_date")
        sanction.add("modifiedAt", updated)
        entity.add("modifiedAt", updated)
        entity.context["updated_at"] = updated.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    """Fetch the source HTML page and emit one entity + sanction per table row.

    Every ``<table>`` in the document must have "Vendor name" and "From"
    among its header cells. Rows whose first cell is empty are skipped.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        headers = [td.text_content() for td in table.findall("./thead/tr/td")]
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers

        for tr in table.findall("./tbody/tr"):
            cells = [td.text_content() for td in tr.findall("./td")]
            if not len(cells[0]):
                continue

            # The ID is derived from the full row content.
            entity = context.make("LegalEntity")
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            country_code = entity.first("country")
            location = h.make_address(
                context,
                full=cells[2],
                country_code=country_code,
            )
            h.apply_address(context, entity, location)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))

            context.emit(sanction)
            context.emit(entity, target=True)
def crawl(context: Context):
    """Parse the "sanctioned-table" HTML table into ``Person`` entities.

    The first ``<tr>`` supplies the (slugified) column names from its
    ``<th>`` cells; each later row becomes one person plus a sanction.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('//div[@class="sanctioned-table"]/table')
    table_rows = table.findall(".//tr")
    if not table_rows:
        return

    headers = [slugify(th.text) for th in table_rows[0].findall("./th")]
    for tr in table_rows[1:]:
        values = [collapse_spaces(td.text) for td in tr.findall("./td")]
        data = dict(zip(headers, values))

        full_name = data["ad-soyad-ata-adi"]
        entity = context.make("Person")
        entity.id = context.make_id(data["id"], full_name)
        entity.add("name", full_name)
        entity.add("idNumber", data["id"])
        entity.add("birthDate", parse_date(data["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        location = h.make_address(context, full=data["malumat"])
        h.apply_address(context, entity, location)

        sanction = h.make_sanction(context, entity)
        context.emit(sanction)
        context.emit(entity, target=True)
def parse_row(context: Context, row):
    """Turn one source row into a ``LegalEntity`` and its sanction.

    The entity is emitted (as a target) before the sanction is built; the
    sanction is keyed on the row's ``FR_Citation`` value and emitted last.
    """
    name = row.get("Name")
    country = row.get("Country")
    effective = row.get("Effective_Date")

    entity = context.make("LegalEntity")
    entity.id = context.make_slug(effective, name)
    entity.add("name", name)
    entity.add("notes", row.get("Action"))
    entity.add("country", country)
    entity.add("modifiedAt", row.get("Last_Update"))

    location = h.make_address(
        context,
        street=row.get("Street_Address"),
        postal_code=row.get("Postal_Code"),
        city=row.get("City"),
        region=row.get("State"),
        country=country,
    )
    h.apply_address(context, entity, location)
    context.emit(entity, target=True)

    fr_citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=fr_citation)
    sanction.add("program", fr_citation)
    sanction.add("startDate", h.parse_date(effective, FORMATS))
    sanction.add("endDate", h.parse_date(row.get("Expiration_Date"), FORMATS))
    # pprint(row)
    context.emit(sanction)
def crawl(context: Context):
    """Fetch the JSON listing and emit one debarred entity per supplier record.

    The request carries the dataset's API key in an ``apikey`` header.
    ``clean_name`` yields a list of names: the first becomes the entity
    name, the remainder are stored as aliases.
    """
    url = context.dataset.data.url
    request_headers = {"apikey": context.dataset.data.api_key}
    payload = context.fetch_json(url, headers=request_headers)
    # TODO write this out to a source.json
    for record in payload["response"]["ZPROCSUPP"]:
        # context.pprint(record)
        entity = context.make("LegalEntity")
        raw_name = record.get("SUPP_NAME")
        supplier_id = record.get("SUPP_ID")
        entity.id = context.make_slug(supplier_id)

        names = clean_name(raw_name)
        primary, aliases = names[0], names[1:]
        entity.add("name", primary)
        entity.add("topics", "debarment")
        entity.add("country", record.get("COUNTRY_NAME"))
        for alias in aliases:
            entity.add("alias", alias)

        location = h.make_address(
            context,
            street=record.get("SUPP_ADDR"),
            city=record.get("SUPP_CITY"),
            country=record.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, location)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", record.get("DEBAR_REASON"))
        debarred_from = h.parse_date(record.get("DEBAR_FROM_DATE"), FORMATS)
        sanction.add("startDate", debarred_from)
        debarred_to = h.parse_date(record.get("DEBAR_TO_DATE"), FORMATS)
        sanction.add("endDate", debarred_to)

        context.emit(entity, target=True)
        context.emit(sanction)
def apply_prop(context, entity, sanction, field, value):
    """Apply one typed detail record to ``entity`` or ``sanction``.

    ``field`` names the record type and ``value`` is a dict whose relevant
    keys are consumed with ``pop``. Field names not handled here are
    silently ignored.
    """
    # Record types copied verbatim onto the entity:
    # field -> (entity property, key inside ``value``).
    entity_copies = {
        "ALIAS": ("alias", "Alias"),
        "PRENOM": ("firstName", "Prenom"),
        "NATIONALITE": ("nationality", "Pays"),
        "TITRE": ("position", "Titre"),
        "SITE_INTERNET": ("website", "SiteInternet"),
        "TELEPHONE": ("phone", "Telephone"),
        "COURRIEL": ("email", "Courriel"),
        "NUMERO_OMI": ("imoNumber", "NumeroOMI"),
        "PASSEPORT": ("passportNumber", "NumeroPasseport"),
        "AUTRE_IDENTITE": ("idNumber", "NumeroCarte"),
    }
    # Record types copied verbatim onto the sanction.
    sanction_copies = {
        "REFERENCE_UE": ("program", "ReferenceUe"),
        "REFERENCE_ONU": ("program", "ReferenceOnu"),
        "FONDEMENT_JURIDIQUE": ("reason", "FondementJuridiqueLabel"),
        "MOTIFS": ("reason", "Motifs"),
    }

    if field in entity_copies:
        prop, key = entity_copies[field]
        entity.add(prop, value.pop(key))
        return

    if field in sanction_copies:
        prop, key = sanction_copies[field]
        sanction.add(prop, value.pop(key))
        return

    if field == "SEXE":
        entity.add("gender", h.clean_gender(value.pop("Sexe")))
        return

    if field == "DATE_DE_NAISSANCE":
        year = value.pop("Annee")
        month = value.pop("Mois")
        day = value.pop("Jour")
        entity.add("birthDate", parse_parts(year, month, day))
        return

    if field in ("ADRESSE_PM", "ADRESSE_PP"):
        full = value.pop("Adresse")
        country = value.pop("Pays")
        location = h.make_address(context, full=full, country=country)
        h.apply_address(context, entity, location)
        return

    if field == "LIEU_DE_NAISSANCE":
        entity.add("birthPlace", value.pop("Lieu"))
        entity.add("country", value.pop("Pays"))
        return

    if field == "IDENTIFICATION":
        comment = value.pop("Commentaire")
        content = value.pop("Identification")
        # The comment text selects the target schema/property via a lookup.
        match = context.lookup("identification", comment)
        if match is None:
            context.log.warning(
                "Unknown Identification type",
                comment=comment,
                content=content,
            )
            return
        target_schema = match.schema or entity.schema
        entity.add_cast(target_schema, match.prop, content)
        if match.prop == "notes":
            entity.add(match.prop, comment)
def crawl(context: Context):
    """Parse the tables inside the "editor-content" div into sanctioned entities.

    Within each table the first row provides the column names, a row with a
    single cell selects the schema (via ``TYPES``) for the rows that follow,
    and every other row describes one entity and its sanction.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]
            if headers is None:
                headers = [slugify(text, sep="_") for text in cells]
                continue
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue

            record = dict(zip(headers, cells))
            entity = context.make(schema)
            full_name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(full_name)

            # Text before the first "(" is the name; each later
            # parenthesised chunk is an alias.
            primary, *bracketed = full_name.split("(")
            entity.add("name", primary)
            for chunk in bracketed:
                entity.add("alias", chunk.split(")")[0])

            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))

            # Peel labelled trailing sections off the details text, one
            # marker at a time, in the order given by CHOPSKA.
            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            for marker, target in CHOPSKA:
                details, *tail = details.rsplit(marker, 1)
                if not tail:
                    continue
                if target == "address":
                    location = h.make_address(context, full=tail[0])
                    h.apply_address(context, entity, location)
                else:
                    entity.add(target, tail[0])

            # Whatever text is left over must be covered by a lookup.
            if len(details.strip()):
                match = context.lookup("details", details)
                if match is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for key, val in match.props.items():
                        entity.add(key, val)

            sanction = h.make_sanction(context, entity)
            sanction.add("provisions", record.pop("zastosowane_srodki_sankcyjne"))

            listed = record.pop("data_umieszczenia_na_liscie").replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed, ["%d.%m.%Y"]))

            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
def parse_reference(context: Context, reference: int, rows):
    """Merge all rows sharing ``reference`` into one entity and one sanction.

    The rows are walked three times:

    1. resolve the schema from each row's ``type`` (all rows must agree);
    2. add every name under the property its ``name_type`` maps to, and
       remember the last primary ("name") value for the entity ID;
    3. apply the remaining per-row details to the entity and sanction.

    Rows are consumed with ``pop``, so the dicts in ``rows`` are mutated.
    If a type or name type has no lookup mapping, a warning is logged and
    the function returns without emitting anything.
    """
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            # One cell may hold several addresses separated by SPLITS markers.
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)

        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))

        # A datetime here is treated as the listing date; any other value
        # is kept as summary text on the sanction.
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
        # TODO: consider parsing if it's not a datetime?

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_address(context: Context, data):
    """Build an address from a dict, consuming the address keys with ``pop``.

    Missing keys are passed on as ``None``. Returns whatever
    ``h.make_address`` returns.
    """
    remarks = data.pop("NOTE", None)
    street = data.pop("STREET", None)
    city = data.pop("CITY", None)
    region = data.pop("STATE_PROVINCE", None)
    postal_code = data.pop("ZIP_CODE", None)
    country = data.pop("COUNTRY", None)
    return h.make_address(
        context,
        remarks=remarks,
        street=street,
        city=city,
        region=region,
        postal_code=postal_code,
        country=country,
    )
def parse_address(context: Context, node):
    """Build an address from the child elements of an XML ``node``.

    Each field is read with ``findtext``, so absent children are passed on
    as ``None``. Returns whatever ``h.make_address`` returns.
    """
    remarks = node.findtext("./NOTE")
    street = node.findtext("./STREET")
    city = node.findtext("./CITY")
    region = node.findtext("./STATE_PROVINCE")
    postal_code = node.findtext("./ZIP_CODE")
    country = node.findtext("./COUNTRY")
    return h.make_address(
        context,
        remarks=remarks,
        street=street,
        city=city,
        region=region,
        postal_code=postal_code,
        country=country,
    )
def crawl(context: Context):
    """Parse the "viewcontainer" HTML table into debarred entities.

    The first ``<tr>`` supplies the column names from its ``<th>`` cells.
    A registration number embedded in the name (introduced by one of the
    ``REG_NRS`` markers) is split off into its own property.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    table_rows = table.findall(".//tr")
    if not table_rows:
        return

    headers = [
        slugify(th.text_content(), "_") for th in table_rows[0].findall("./th")
    ]
    for tr in table_rows[1:]:
        values = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]
        cells = dict(zip(headers, values))
        # Drop the column keyed by None, if any.
        cells.pop(None, None)

        full_name = name = cells.pop("name")
        registration_number = None
        for marker in REG_NRS:
            if marker not in name:
                continue
            name, reg_tail = name.split(marker, 1)
            registration_number = reg_tail.replace(")", "")

        country = (
            cells.pop("nationality")
            .replace("Non ADB Member Country", "")
            .replace("Rep. of", "")
        )

        entity = context.make("LegalEntity")
        # The ID uses the unsplit name so it does not depend on REG_NRS.
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", cells.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("grounds"))
        sanction.add("program", cells.pop("sanction_type"))

        # The cell is expected to read "<start> | <end>".
        date_range = cells.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            starts, ends = date_range.split("|")
            sanction.add("startDate", h.parse_date(starts.strip(), FORMATS))
            sanction.add("endDate", h.parse_date(ends.strip(), FORMATS))

        location = h.make_address(
            context,
            full=cells.pop("address"),
            country=country,
        )
        h.apply_address(context, entity, location)

        context.emit(entity, target=True)
        context.emit(sanction)
def handle_address(context, entity, text):
    """Attach ``text`` to ``entity`` as an address, plus a country if one is found.

    The part of ``text`` before the first comma (or all of it when there is
    no comma) is tried as a country name. Does nothing when ``text`` is
    ``None``.
    """
    if text is None:
        return

    country_guess = text.partition(",")[0]
    code = registry.country.clean(country_guess, fuzzy=True)
    if code is not None:
        entity.add("country", code)

    location = h.make_address(context, full=text, country_code=code)
    h.apply_address(context, entity, location)
def crawl(context: Context):
    """Load the JSON source and emit banks, subjects and their sanctions.

    For each record a bank ``Company`` is emitted (when an ID could be
    built), followed by the subject of the action: a ``Company`` if
    ``CompanyName`` is set, otherwise a ``Person``.
    """
    source_path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(source_path, JSON, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        records = json.load(fh)

    for record in records:
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")

        bank = context.make("Company")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        if bank.id is not None:
            context.emit(bank)

        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if not company_name:
            entity = context.make("Person")
            entity.id = context.make_id(charter_no, bank_name, first_name, last_name)
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        else:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        location = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        # Popped and discarded; the value is not used.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, location)

        sanction = h.make_sanction(context, entity)
        sanction.add("startDate", record.pop("CompleteDate", None))
        sanction.add("endDate", record.pop("TerminationDate", None))
        sanction.add("program", record.pop("EnforcementTypeDescription", None))
        sanction.add("authorityId", record.pop("DocketNumber", None))
        # context.pprint(record)
        context.emit(entity, target=True)
        context.emit(sanction)
def parse_address(context: Context, el):
    """Build an address from the attributes of element ``el``.

    A ``countryDescription`` of "UNKNOWN" is treated as no country name;
    the country code comes from ``parse_country(el)`` either way.
    """
    country_name = el.get("countryDescription")
    country = None if country_name == "UNKNOWN" else country_name
    # context.log.info("Addrr", el=el)
    street = el.get("street")
    po_box = el.get("poBox")
    city = el.get("city")
    place = el.get("place")
    postal_code = el.get("zipCode")
    region = el.get("region")
    return h.make_address(
        context,
        street=street,
        po_box=po_box,
        city=city,
        place=place,
        postal_code=postal_code,
        region=region,
        country=country,
        country_code=parse_country(el),
    )
def load_locations(context: Context, doc):
    """Index every ``Location`` element in ``doc`` as an address entity.

    Returns a dict mapping the location's ``ID`` attribute to its address.
    Only addresses that received an ID are emitted and included.
    """
    by_id = {}
    for loc in doc.findall("./Locations/Location"):
        loc_id = loc.get("ID")

        # Country names come from area codes and from explicit country refs.
        countries = set()
        countries.update(
            ref_get("AreaCode", area.get("AreaCodeID")).get("Description")
            for area in loc.findall("./LocationAreaCode")
        )
        countries.update(
            ref_get("Country", ref.get("CountryID")).get("Value")
            for ref in loc.findall("./LocationCountry")
        )
        if len(countries) > 1:
            context.log.warn("Multiple countries", countries=countries)

        parts = {
            ref_value("LocPartType", part.get("LocPartTypeID")): part.findtext(
                "./LocationPartValue/Value"
            )
            for part in loc.findall("./LocationPart")
        }

        country = first(countries)
        unknown = parts.get("Unknown")
        # An untyped part that parses as a country overrides the country.
        if registry.country.clean(unknown, fuzzy=True):
            country = unknown

        if country == "undetermined":
            country = None
            unknown = None

        address = h.make_address(
            context,
            full=unknown,
            street=parts.get("ADDRESS1"),
            street2=parts.get("ADDRESS2"),
            street3=parts.get("ADDRESS3"),
            city=parts.get("CITY"),
            postal_code=parts.get("POSTAL CODE"),
            region=parts.get("REGION"),
            state=parts.get("STATE/PROVINCE"),
            country=country,
        )
        if address.id is None:
            continue
        context.emit(address)
        by_id[loc_id] = address
    return by_id
def parse_russian_persons(context, entity, text):
    """Split a comma-separated person description into entity properties.

    Segments are peeled off the right-hand end of ``text`` one at a time
    and classified:

    * a date in ``%d.%m.%Y г.р.`` format becomes ``birthDate``;
    * a segment starting with "(" becomes an ``alias``;
    * anything else is treated as an address with country code "ru".

    What remains once no comma is left is handed to ``parse_name``.
    """
    while "," in text:
        text, section = text.rsplit(",", 1)
        fragment = section.strip()
        if not fragment:
            continue

        date = parse_format(fragment, "%d.%m.%Y г.р.")
        if date.text is not None:
            entity.add("birthDate", date)
            continue

        if fragment.startswith("("):
            # Strip the opening parenthesis as well as the closing ones;
            # previously the alias kept its leading "(".
            alias = fragment.lstrip("(").replace(")", "").strip()
            entity.add("alias", alias)
            continue

        obj = h.make_address(context, full=fragment, country_code="ru")
        h.apply_address(context, entity, obj)
    parse_name(entity, text)
def compose_address(context: Context, entity, place, el):
    """Merge ``place`` with the fields parsed from ``el`` and build an address.

    Values parsed from ``el`` override those in ``place``. The merged
    country is also added to ``entity``. Returns the result of
    ``h.make_address``.
    """
    merged = dict(place)
    merged.update(parse_address(el))
    country = merged.get("country")
    entity.add("country", country)

    box = merged.get("p-o-box")
    po_box = box if box is None else f"P.O. Box {box}"

    return h.make_address(
        context,
        remarks=merged.get("remarks"),
        summary=merged.get("co"),
        street=merged.get("address-details"),
        city=merged.get("location"),
        po_box=po_box,
        postal_code=merged.get("zip-code"),
        region=merged.get("area"),
        country=merged.get("country"),
    )
def crawl(context: Context):
    """Parse the article's HTML table into debarred companies.

    Column names come from the ``<td>`` cells of the first row; the two
    columns before the last one are renamed "from" and "to". Rows without
    a ``prohibited_practice`` value are skipped.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find(".//article//table")
    table_rows = table.findall(".//tr")
    if not table_rows:
        return

    slugs = [slugify(td.text_content(), "_") for td in table_rows[0].findall("./td")]
    headers = slugs[:-2] + ["from", "to"] + slugs[-1:]

    for tr in table_rows[1:]:
        values = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]
        cells = dict(zip(headers, values))
        if "prohibited_practice" not in cells:
            continue

        name = cells.pop("firm_name")
        nationality = cells.pop("nationality")

        entity = context.make("Company")
        entity.id = context.make_id(name, nationality)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", nationality)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("prohibited_practice"))
        sanction.add("startDate", h.parse_date(cells.pop("from"), FORMATS))
        sanction.add("endDate", h.parse_date(cells.pop("to"), FORMATS))

        location = h.make_address(
            context,
            full=cells.pop("address"),
            country=nationality,
        )
        h.apply_address(context, entity, location)

        context.emit(entity, target=True)
        context.emit(sanction)
def crawl_person(context: Context, name, url):
    """Fetch one member's profile page and emit the person plus memberships.

    ``name`` is expected in "Last, First" form; ``url`` is the profile URL,
    whose last path segment becomes the person's ID. Each ``<li>`` in the
    "regular-details" block is dispatched on the text (or, failing that,
    the CSS class) of its first child element.

    Raises:
        ValueError: if ``name`` does not contain ", ".
    """
    context.log.debug("Crawling member", name=name, url=url)
    doc = context.fetch_html(url)
    _, person_id = url.rsplit("/", 1)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("sourceUrl", url)
    person.add("name", name)
    person.add("topics", "role.pep")
    last_name, first_name = name.split(", ", 1)
    person.add("firstName", first_name)
    person.add("lastName", last_name)

    # Address fields are spread over several list items; collect them here
    # and build the address after the loop.
    address_parts = {}
    details = doc.find('.//div[@class="regular-details"]')
    for row in details.findall('.//ul[@class="no-bullet"]/li'):
        children = row.getchildren()
        title = children[0]
        title_text = collapse_spaces(stringify(title.text_content()))
        # Icon-only titles have no text; fall back to the CSS class.
        title_text = title_text or title.get("class")
        value = collapse_spaces(title.tail)
        if title_text in ("Full name:", "Address:", "Declaration of interests"):
            # ignore these.
            continue
        if title_text == "Emails:":
            emails = [e.text for e in row.findall(".//a")]
            person.add("email", emails)
            continue
        if "glyphicon-phone" in title_text:
            person.add("phone", value.split(","))
            continue
        if "fa-fax" in title_text:
            # TODO: yeah, no
            # person.add("phone", value)
            continue
        if title_text in ("Web sites:", "list-inline"):
            sites = [e.get("href") for e in row.findall(".//a")]
            person.add("website", sites)
            continue
        if title_text == "Represented Country:":
            person.add("country", value)
            continue
        if title_text == "Languages:":
            # TODO: missing in FtM
            # person.add("languages", value.split(','))
            continue
        if "Regions since:" in title_text:
            date = h.parse_date(value, FORMATS)
            person.add("createdAt", date)
            continue
        if "Date of birth:" in title_text:
            person.add("birthDate", h.parse_date(value, FORMATS))
            continue
        if "Commissions:" in title_text:
            for com in row.findall(".//li"):
                text = collapse_spaces(com.text_content())
                sep = "Mandate - "
                if sep in text:
                    _, text = text.split(sep, 1)
                    person.add("sector", text)
            continue
        if "Areas of interest:" in title_text:
            for area in row.findall(".//li"):
                person.add("keywords", area.text_content())
            continue
        if title.tag == "i" and value is None:
            person.add("position", title_text)
            continue
        # Exact match: the previous `in ("Country:")` was a substring test
        # against a plain string, not membership in a tuple.
        if title_text == "Country:":
            person.add("country", value)
        if title_text in ("Street:", "Postal code:", "City:", "Country:"):
            address_parts[title_text.replace(":", "")] = value
            continue
        if title_text == "Political group:":
            group = context.make("Organization")
            group.add("name", value)
            # Prefer the abbreviation in trailing parentheses for the ID.
            slug = value
            if "(" in slug:
                _, slug = slug.rsplit("(", 1)
            slug = slugify(slug, sep="-")
            group.id = f"eu-cor-group-{slug}"
            context.emit(group)

            member = context.make("Membership")
            member.id = context.make_id("Membership", person.id, group.id)
            member.add("member", person)
            member.add("organization", group)
            context.emit(member)
            continue

    address = h.make_address(
        context,
        street=address_parts.get("Street"),
        city=address_parts.get("City"),
        # Key was misspelled "Posal code", so the postal code was never set.
        postal_code=address_parts.get("Postal code"),
        country=address_parts.get("Country"),
    )
    h.apply_address(context, person, address)
    context.emit(person, target=True)
def crawl_item(context: Context, listing):
    """Fetch one disqualified officer and emit the person and related records.

    ``listing`` is a search-result dict. Its ``links.self`` path is
    resolved against ``API_URL`` to fetch the full record and against
    ``WEB_URL`` to build the public source URL. For every disqualification
    a sanction is emitted, and for every company named in it a ``Company``
    and a ``Directorship`` linking it to the person.
    """
    links = listing.get("links", {})
    url = urljoin(API_URL, links.get("self"))
    data = http_get(context, url, cache_days=14)
    person = context.make("Person")
    _, officer_id = url.rsplit("/", 1)
    person.id = context.make_slug(officer_id)

    person.add("name", listing.get("title"))
    person.add("notes", listing.get("description"))
    person.add("topics", "crime")
    source_url = urljoin(WEB_URL, links.get("self"))
    person.add("sourceUrl", source_url)

    last_name = data.pop("surname", None)
    person.add("lastName", last_name)
    forename = data.pop("forename", None)
    person.add("firstName", forename)
    other_forenames = data.pop("other_forenames", None)
    person.add("middleName", other_forenames)
    person.add("title", data.pop("title", None))

    nationality = data.pop("nationality", None)
    if nationality is not None:
        person.add("nationality", nationality.split(","))
    person.add("birthDate", data.pop("date_of_birth", None))

    listing_address = listing.get("address", {})
    person_address = h.make_address(
        context,
        full=listing.get("address_snippet"),
        street=listing_address.get("address_line_1"),
        street2=listing_address.get("premises"),
        city=listing_address.get("locality"),
        postal_code=listing_address.get("postal_code"),
        region=listing_address.get("region"),
        # country_code=person.first("nationality"),
    )
    h.apply_address(context, person, person_address)

    for disqual in data.pop("disqualifications", []):
        case_id = disqual.get("case_identifier")
        sanction = h.make_sanction(context, person, key=case_id)
        sanction.add("recordId", case_id)
        sanction.add("startDate", disqual.get("disqualified_from"))
        sanction.add("endDate", disqual.get("disqualified_until"))
        sanction.add("listingDate", disqual.get("undertaken_on"))
        for key, value in disqual.get("reason", {}).items():
            value = value.replace("-", " ")
            reason = f"{key}: {value}"
            sanction.add("reason", reason)
        sanction.add("country", "gb")
        context.emit(sanction)

        disqual_address = disqual.get("address", {})
        # NOTE(review): `full` still comes from the person's listing
        # snippet, not from the disqualification address — confirm intent.
        company_address = h.make_address(
            context,
            full=listing.get("address_snippet"),
            street=disqual_address.get("address_line_1"),
            street2=disqual_address.get("premises"),
            city=disqual_address.get("locality"),
            postal_code=disqual_address.get("postal_code"),
            region=disqual_address.get("region"),
            # country_code=person.first("nationality"),
        )

        for company_name in disqual.get("company_names", []):
            company = context.make("Company")
            company.id = context.make_slug("named", company_name)
            company.add("name", company_name)
            company.add("jurisdiction", "gb")
            # Apply the address before emitting, so the emitted company
            # includes it (previously the emit came first).
            h.apply_address(context, company, company_address)
            context.emit(company)

            directorship = context.make("Directorship")
            directorship.id = context.make_id(person.id, company.id)
            directorship.add("director", person)
            directorship.add("organization", company)
            context.emit(directorship)

    context.emit(person, target=True)
def parse_entry(context: Context, entry):
    """Convert one XML ``entry`` element into an entity, passports and a sanction.

    The entity is a ``Person`` when ``type-entry`` is "2" and a
    ``LegalEntity`` otherwise. Each ``document-list`` child is emitted as
    a separate ``Passport`` linked to the entity.
    """
    entity = context.make("LegalEntity")
    if entry.findtext("./type-entry") == "2":
        entity = context.make("Person")
    entity.id = context.make_slug(entry.findtext("number-entry"))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))

    # The entry date (YYYYMMDD) serves as creation, listing and start date.
    raw_listed = entry.findtext("./date-entry")
    if raw_listed:
        listed_on = datetime.strptime(raw_listed, "%Y%m%d")
        entity.add("createdAt", listed_on.date())
        sanction.add("listingDate", listed_on.date())
        sanction.add("startDate", listed_on.date())

    for aka in entry.findall("./aka-list"):
        is_alias = aka.findtext("type-aka") != "N"
        is_weak = aka.findtext("./quality-aka") == "2"
        h.apply_name(
            entity,
            name1=aka.findtext("./aka-name1"),
            name2=aka.findtext("./aka-name2"),
            name3=aka.findtext("./aka-name3"),
            tail_name=aka.findtext("./aka-name4"),
            alias=is_alias,
            is_weak=is_weak,
            quiet=True,
        )

    for title_el in entry.findall("./title-list"):
        entity.add("title", title_el.text, quiet=True)

    for document in entry.findall("./document-list"):
        reg = document.findtext("./document-reg")
        number = document.findtext("./document-id")
        country = document.findtext("./document-country")

        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number, country)
        passport.add("holder", entity)
        passport.add("passportNumber", number)
        passport.add("summary", reg)
        passport.add("country", country)
        context.emit(passport)

    for id_el in entry.findall("./id-number-list"):
        entity.add("idNumber", id_el.text)

    for address_el in entry.findall("./address-list"):
        location = h.make_address(context, full=address_el.findtext("./address"))
        h.apply_address(context, entity, location)

    for pob in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob.text)

    for dob in entry.findall("./date-of-birth-list"):
        entity.add_cast("Person", "birthDate", parse_date(dob.text))

    for nat in entry.findall("./nationality-list"):
        for raw_country in multi_split(nat.text, [";", ","]):
            entity.add("nationality", remove_bracketed(raw_country), quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_row(context: Context, row):
    """Convert one source row into an entity, a sanction and related entities.

    Columns are consumed from ``row`` with ``pop``; ``h.audit_data`` is
    called near the end with whatever is left over. The schema comes from
    ``TYPES`` keyed on ``GroupTypeDescription``.

    Returns early, without emitting the entity or its sanction, when the
    group type, alias type or alias quality is not recognised. A ship owner
    and its ownership link, if present, have already been emitted by the
    time the alias checks run.
    """
    group_type = row.pop("GroupTypeDescription")
    schema = TYPES.get(group_type)
    if schema is None:
        context.log.error("Unknown group type", group_type=group_type)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(row.pop("GroupID"))
    sanction = h.make_sanction(context, entity)
    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    listed_date = h.parse_date(row.pop("DateListed"), FORMATS)
    sanction.add("listingDate", listed_date)
    designated_date = h.parse_date(row.pop("DateDesignated"), FORMATS)
    sanction.add("startDate", designated_date)

    # Prefer the listing date for createdAt; use the designation date only
    # if the listing date did not produce a value.
    entity.add("createdAt", listed_date)
    if not entity.has("createdAt"):
        entity.add("createdAt", designated_date)

    sanction.add("authorityId", row.pop("UKSanctionsListRef", None))
    sanction.add("unscId", row.pop("UNRef", None))
    sanction.add("status", row.pop("GroupStatus", None))
    sanction.add("reason", row.pop("UKStatementOfReasons", None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    sanction.add("modifiedAt", last_updated)
    entity.add("modifiedAt", last_updated)

    # TODO: derive topics and schema from this??
    entity_type = row.pop("Entity_Type", None)
    entity.add_cast("LegalEntity", "legalForm", entity_type)

    reg_number = row.pop("Entity_BusinessRegNumber", None)
    entity.add_cast("LegalEntity", "registrationNumber", reg_number)

    # Ship length is popped and discarded.
    row.pop("Ship_Length", None)
    entity.add_cast("Vessel", "flag", row.pop("Ship_Flag", None))
    flags = split_new(row.pop("Ship_PreviousFlags", None))
    entity.add_cast("Vessel", "pastFlags", flags)
    entity.add_cast("Vessel", "type", row.pop("Ship_Type", None))
    entity.add_cast("Vessel", "tonnage", row.pop("Ship_Tonnage", None))
    entity.add_cast("Vessel", "buildDate", row.pop("Ship_YearBuilt", None))
    entity.add_cast("Vessel", "imoNumber", row.pop("Ship_IMONumber", None))

    # The current ship owner becomes its own entity, linked by an Ownership.
    ship_owner = row.pop("Ship_CurrentOwners", None)
    if ship_owner is not None:
        owner = context.make("LegalEntity")
        owner.id = context.make_slug("named", ship_owner)
        owner.add("name", ship_owner)
        context.emit(owner)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", owner.id)
        ownership.add("owner", owner)
        ownership.add("asset", entity)
        context.emit(ownership)

    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)

    title = split_items(row.pop("Title", None))
    entity.add("title", title, quiet=True)

    pobs = split_items(row.pop("Individual_TownOfBirth", None))
    entity.add_cast("Person", "birthPlace", pobs)

    dob = h.parse_date(row.pop("Individual_DateOfBirth", None), FORMATS)
    entity.add_cast("Person", "birthDate", dob)

    cob = parse_countries(row.pop("Individual_CountryOfBirth", None))
    entity.add_cast("Person", "country", cob)

    nationalities = parse_countries(row.pop("Individual_Nationality", None))
    entity.add_cast("Person", "nationality", nationalities)

    positions = split_items(row.pop("Individual_Position", None))
    entity.add_cast("Person", "position", positions)

    entity.add_cast("Person", "gender", row.pop("Individual_Gender", None))

    # The alias type picks the target name property; unknown types abort.
    name_type = row.pop("AliasType", None)
    name_prop = NAME_TYPES.get(name_type)
    if name_prop is None:
        context.log.warning("Unknown name type", type=name_type)
        return
    # The alias quality maps to a weak/strong flag; unknown values abort.
    name_quality = row.pop("AliasQuality", None)
    is_weak = WEAK_QUALITY.get(name_quality)
    if is_weak is None:
        context.log.warning("Unknown name quality", quality=name_quality)
        return

    h.apply_name(
        entity,
        name1=row.pop("name1", None),
        name2=row.pop("name2", None),
        name3=row.pop("name3", None),
        name4=row.pop("name4", None),
        name5=row.pop("name5", None),
        tail_name=row.pop("Name6", None),
        name_prop=name_prop,
        is_weak=is_weak,
        quiet=True,
    )
    entity.add("alias", row.pop("NameNonLatinScript", None))

    # The six address columns are joined into one full-address string.
    full_address = join_text(
        row.pop("Address1", None),
        row.pop("Address2", None),
        row.pop("Address3", None),
        row.pop("Address4", None),
        row.pop("Address5", None),
        row.pop("Address6", None),
        sep=", ",
    )

    address = h.make_address(
        context,
        full=full_address,
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    passport_number = row.pop("Individual_PassportNumber", None)
    passport_numbers = split_items(passport_number)
    entity.add_cast("Person", "passportNumber", passport_numbers)
    # Popped so the column is consumed; the value is currently unused.
    passport_detail = row.pop("Individual_PassportDetails", None)
    # passport_details = split_items(passport_detail)
    # TODO: where do I stuff this?

    ni_number = row.pop("Individual_NINumber", None)
    ni_numbers = split_items(ni_number)
    entity.add_cast("Person", "idNumber", ni_numbers)
    # Popped so the column is consumed; the value is currently unused.
    ni_detail = row.pop("Individual_NIDetails", None)
    # ni_details = split_items(ni_detail)
    # TODO: where do I stuff this?

    for phone in split_new(row.pop("PhoneNumber", None)):
        entity.add_cast("LegalEntity", "phone", phone)

    for email in split_new(row.pop("EmailAddress", None)):
        entity.add_cast("LegalEntity", "email", email)

    for website in split_new(row.pop("Website", None)):
        entity.add_cast("LegalEntity", "website", website)

    # Parent companies own this entity.
    for name in parse_companies(context, row.pop("Entity_ParentCompany", None)):
        parent = context.make("Organization")
        parent.id = context.make_slug("named", name)
        parent.add("name", name)
        context.emit(parent)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", parent.id)
        ownership.add("owner", parent)
        ownership.add("asset", entity)
        context.emit(ownership)

    # Subsidiaries are owned by this entity.
    for name in parse_companies(context, row.pop("Entity_Subsidiaries", None)):
        subsidiary = context.make("Company")
        subsidiary.id = context.make_slug("named", name)
        subsidiary.add("name", name)
        context.emit(subsidiary)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", subsidiary.id)
        ownership.add("owner", entity)
        ownership.add("asset", subsidiary)
        context.emit(ownership)

    # Any value other than "A" is only logged; processing continues.
    grp_status = row.pop("GrpStatus", None)
    if grp_status != "A":
        context.log.warning("Unknown GrpStatus", value=grp_status)

    entity.add("notes", h.clean_note(row.pop("OtherInformation", None)))
    h.audit_data(row, ignore=["NonLatinScriptLanguage", "NonLatinScriptType"])

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    """Parse the source's single HTML table and emit an entity and sanction per row.

    Every data row has a free-text ``body`` cell in which labelled fields
    (gender, address, phone numbers, aliases, ...) accompany the name. The
    labels are peeled off one at a time with ``maybe_rsplit``. The order of
    these calls is significant because each call works on what the previous
    one left in ``body`` (presumably the text before the label -- confirm
    against ``maybe_rsplit``). Whatever remains at the end is split on
    ``", "`` and stored as the entity's names.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    tables = doc.findall(".//table")
    assert len(tables) == 1
    rows = tables[0].findall(".//tr")
    # The first two <tr> elements are skipped (presumably header rows -- confirm).
    for row in rows[2:]:
        cells = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
        index = cells[0]
        body = cells[1]
        decision = cells[2]
        un_id = cells[3]
        listing_date = cells[4]

        # Start from the most generic schema; add_cast() calls below may
        # specialise it depending on which fields are present.
        entity = context.make("Thing")
        entity.id = context.make_slug(index, un_id)
        entity.add("notes", cells[5])

        sanction = h.make_sanction(context, entity)
        sanction.add("listingDate", clean_date(listing_date))
        sanction.add("program", decision)
        sanction.add("recordId", un_id)

        body, gender = maybe_rsplit(body, "пол:")
        entity.add_cast("Person", "gender", h.clean_gender(gender))
        body, gender = maybe_rsplit(body, "Пол:")
        entity.add_cast("Person", "gender", h.clean_gender(gender))
        body, location = maybe_rsplit(body, "местонахождение:")
        entity.add_cast("LegalEntity", "country", location)

        # IMO numbers are split off the body but not stored.
        body, _ = maybe_rsplit(body, "Присвоенный ИМО номер компании:")
        body, _ = maybe_rsplit(body, "Номер ИМО:")

        body, emails = maybe_rsplit(body, "Адрес эл. почты:")
        for email in letter_split(emails):
            entity.add_cast("LegalEntity", "email", email)

        # Fax numbers are split off the body but not stored.
        body, _ = maybe_rsplit(body, "Номер факса:")
        body, _ = maybe_rsplit(body, "Факс:")

        body, phones = maybe_rsplit(body, "Номера телефонов:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, phones = maybe_rsplit(body, "Тел.:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)

        body, swift = maybe_rsplit(body, "СВИФТ-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)
        body, swift = maybe_rsplit(body, "СВИФТ/БИК-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)

        body, other_info = maybe_rsplit(body, "Прочая информация:")
        entity.add_cast("Thing", "notes", other_info)

        # The in-body listing date is removed only; the sanction already
        # received the date from the dedicated table cell above.
        body, _ = maybe_rsplit(body, "Дата внесения в перечень:")

        body, addresses = maybe_rsplit(body, "Адрес:")
        for address in letter_split(addresses):
            country = address
            if ", " in country:
                # rsplit() returns a [head, tail] list; only the trailing
                # component is the country candidate. Passing the whole list
                # to the country cleaner (as before) is not a country lookup.
                country = address.rsplit(", ", 1)[-1]
            code = registry.country.clean(country, fuzzy=True)
            obj = h.make_address(context, full=address, country_code=code)
            h.apply_address(context, entity, obj)
            entity.add("country", code)

        body, national_ids = maybe_rsplit(
            body, "Национальный идентификационный номер:"
        )
        for national_id in letter_split(national_ids):
            entity.add_cast("LegalEntity", "idNumber", national_id)

        body, passport_nos = maybe_rsplit(body, "Паспорт №:")
        for passport_no in letter_split(passport_nos):
            entity.add_cast("Person", "passportNumber", passport_no)

        body, citizenship = maybe_rsplit(body, "Гражданство:")
        entity.add_cast("Person", "nationality", citizenship)

        aka = "На основании менее достоверных источников также известен как:"
        body, aka = maybe_rsplit(body, aka)
        entity.add("alias", letter_split(aka))
        strong_aka = "На основании достоверных источников также известен как:"
        body, strong_aka = maybe_rsplit(body, strong_aka)
        entity.add("alias", letter_split(strong_aka))

        # This field is split off the body but not stored.
        body, _ = maybe_rsplit(body, "Р.И.К.:")

        body, birth_place = maybe_rsplit(body, "Место рождения:")
        entity.add_cast("Person", "birthPlace", birth_place)
        body, birth_dates = maybe_rsplit(body, "Дата рождения:")
        for birth_date in letter_split(birth_dates):
            entity.add_cast("Person", "birthDate", clean_date(birth_date))

        body, position = maybe_rsplit(body, "Должность:")
        entity.add_cast("Person", "position", position)
        body, job = maybe_rsplit(body, "Обращение:")
        entity.add_cast("Person", "position", job)

        body, aliases = maybe_rsplit(body, "Другие названия:")
        entity.add("alias", letter_split(aliases))
        body, aliases = maybe_rsplit(body, "Вымышленные названия:")
        entity.add("alias", letter_split(aliases))

        # What is left of the body after all labels were removed is the name list.
        names = body.split(", ")
        entity.add("name", names)

        # Entities still typed "Thing" at this point are emitted as LegalEntity.
        if entity.schema.name == "Thing":
            entity.schema = model.get("LegalEntity")
        context.emit(entity, target=True)
        context.emit(sanction)
def emit_row(context: Context, sheet: str, section: str, row: Dict[str, List[str]]):
    """Turn one spreadsheet row into an entity and its sanction, then emit both.

    The entity schema is looked up from ``section``; rows in a section without
    a configured schema are logged and dropped. Rows for which no ID can be
    derived from the English and Japanese names are dropped silently. Columns
    are popped from ``row`` as they are consumed.
    """
    schema = context.lookup_value("schema", section)
    if schema is None:
        context.log.warning("No schema for section", section=section, sheet=sheet)
        return

    entity = context.make(schema)
    entity.id = context.make_id(*row.get("name_english"), *row.get("name_japanese"))
    if entity.id is None:
        return

    entity.add("name", parse_names(row.pop("name_english")))
    # The Japanese name is the primary name only when no English name was set.
    japanese_prop = "alias" if entity.has("name") else "name"
    entity.add(japanese_prop, parse_names(row.pop("name_japanese")))

    # (property, column) pairs whose values go through parse_names().
    parsed_name_columns = (
        ("alias", "alias"),
        ("alias", "known_alias"),
        ("weakAlias", "weak_alias"),
        ("weakAlias", "nickname"),
        ("previousName", "past_alias"),
        ("previousName", "old_name"),
    )
    for prop, column in parsed_name_columns:
        entity.add(prop, parse_names(row.pop(column, [])))

    entity.add_cast("Person", "position", row.pop("position", []))
    born = parse_date(row.pop("birth_date", []))
    entity.add_cast("Person", "birthDate", born)
    entity.add_cast("Person", "birthPlace", row.pop("birth_place", []))
    entity.add_cast("Person", "passportNumber", row.pop("passport_number", []))

    # (property, column) pairs copied over verbatim.
    plain_columns = (
        ("idNumber", "id_number"),
        ("idNumber", "identification_number"),
        ("notes", "other_information"),
        ("notes", "details"),
        ("phone", "phone"),
        ("phone", "fax"),
    )
    for prop, column in plain_columns:
        entity.add(prop, row.pop(column, []))

    for column in ("address", "where"):
        for full_text in row.pop(column, []):
            located = h.make_address(context, full=full_text)
            h.apply_address(context, entity, located)

    # Titles only fit the Person schema; for anything else keep them as notes.
    title = row.pop("title", [])
    title_prop = "title" if entity.schema.is_a("Person") else "notes"
    entity.add(title_prop, title)

    for column in ("citizenship", "activity_area"):
        entity.add("country", row.pop(column, []))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", section)
    for column in ("root_nomination", "reason_res1483"):
        sanction.add("reason", row.pop(column, None))
    sanction.add("recordId", row.pop("notification_number", None))
    for column in ("notification_date", "designated_date"):
        sanction.add("startDate", parse_date(row.pop(column, [])))
    sanction.add("listingDate", parse_date(row.pop("publication_date", [])))

    # Consumed but intentionally not stored.
    row.pop("designated_un", None)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_row(context, row):
    """Convert one flat source row into an entity plus sanction and emit them.

    The schema is "Person" for individuals, "Vessel" when a vessel type is
    present, and otherwise looked up from the organisation type (defaulting to
    "Organization"). Rows with an unrecognised group type are logged and
    skipped. Columns are popped from ``row`` as they are handled; anything
    left over at the end is pretty-printed.
    """
    group_type = row.pop("GroupTypeDescription")
    org_type = row.pop("OrgType", None)
    if group_type == "Individual":
        base_schema = "Person"
    elif row.get("TypeOfVessel") is not None:
        base_schema = "Vessel"
    elif group_type == "Entity":
        base_schema = context.lookup_value("org_type", org_type, "Organization")
    else:
        context.log.error("Unknown entity type", group_type=group_type)
        return

    entity = context.make(base_schema)
    entity.id = context.make_slug(row.pop("GroupID"))
    if org_type is not None:
        entity.add_cast("LegalEntity", "legalForm", split_items(org_type))

    sanction = h.make_sanction(context, entity)
    for notes_column in ("OtherInformation", "FurtherIdentifiyingInformation"):
        entity.add("notes", row.pop(notes_column, None), quiet=True)

    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    sanction.add("startDate", h.parse_date(row.pop("DateListed"), FORMATS))
    sanction.add("recordId", row.pop("FCOId", None))
    sanction.add("status", row.pop("GroupStatus", None))
    sanction.add("reason", row.pop("UKStatementOfReasons", None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    if last_updated is not None:
        sanction.add("modifiedAt", last_updated)
        sanction.context["updated_at"] = last_updated
        entity.add("modifiedAt", last_updated)
        entity.context["updated_at"] = last_updated

    # DoB is sometimes a year only, so the combined column is discarded and
    # the date is rebuilt from its separate parts.
    row.pop("DateOfBirth", None)
    year_of_birth = row.pop("YearOfBirth", 0)
    month_of_birth = row.pop("MonthOfBirth", 0)
    day_of_birth = row.pop("DayOfBirth", 0)
    entity.add_cast(
        "Person",
        "birthDate",
        parse_parts(year_of_birth, month_of_birth, day_of_birth),
    )

    entity.add_cast("Person", "gender", h.clean_gender(row.pop("Gender", None)))
    national_id = row.pop("NationalIdNumber", None)
    entity.add_cast("LegalEntity", "idNumber", split_items(national_id))
    passport_details = row.pop("PassportDetails", None)
    entity.add_cast("Person", "passportNumber", split_items(passport_details))

    # (column, schema, property) for the vessel-related columns.
    vessel_columns = (
        ("FlagOfVessel", "Vessel", "flag"),
        ("PreviousFlags", "Vessel", "pastFlags"),
        ("YearBuilt", "Vehicle", "buildDate"),
        ("TypeOfVessel", "Vehicle", "type"),
        ("IMONumber", "Vessel", "imoNumber"),
        ("TonnageOfVessel", "Vessel", "tonnage"),
    )
    for column, cast_schema, prop in vessel_columns:
        entity.add_cast(cast_schema, prop, row.pop(column, None))
    row.pop("LengthOfVessel", None)

    entity.add("title", split_items(row.pop("NameTitle", None)), quiet=True)
    name_part_columns = (
        ("firstName", "name1"),
        ("secondName", "name2"),
        ("middleName", "name3"),
        ("middleName", "name4"),
        ("middleName", "name5"),
    )
    for prop, column in name_part_columns:
        entity.add(prop, row.pop(column, None), quiet=True)
    name6 = row.pop("Name6", None)
    entity.add("lastName", name6, quiet=True)

    # Fall back to the sixth name part when no full name is given.
    full_name = row.pop("FullName", name6)
    row.pop("AliasTypeName")
    name_prop = "alias" if row.pop("AliasType") == "AKA" else "name"
    entity.add(name_prop, full_name)

    entity.add("nationality", parse_countries(row.pop("Nationality", None)), quiet=True)
    entity.add("position", split_items(row.pop("Position", None)), quiet=True)
    entity.add("country", parse_countries(row.pop("CountryOfBirth", None)), quiet=True)
    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)
    entity.add("birthPlace", split_items(row.pop("TownOfBirth", None)), quiet=True)

    address = h.make_address(
        context,
        full=row.pop("FullAddress", None),
        street=row.pop("address1", None),
        street2=row.pop("address2", None),
        street3=row.pop("address3", None),
        city=row.pop("address4", None),
        place=row.pop("address5", None),
        region=row.pop("address6", None),
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    entity.add_cast(
        "LegalEntity", "registrationNumber", row.pop("BusinessRegNumber", None)
    )
    phones = h.clean_phones(split_items(row.pop("PhoneNumber", None), comma=True))
    entity.add_cast("LegalEntity", "phone", phones)
    websites = split_items(row.pop("Website", None), comma=True)
    entity.add_cast("LegalEntity", "website", websites)
    emails = h.clean_emails(split_items(row.pop("EmailAddress", None), comma=True))
    entity.add_cast("LegalEntity", "email", emails)

    # TODO: graph (the first three columns describe relations to other entities)
    unused_columns = (
        "Subsidiaries",
        "ParentCompany",
        "CurrentOwners",
        "DateListedDay",
        "DateListedMonth",
        "DateListedYear",
        "LastUpdatedDay",
        "LastUpdatedMonth",
        "LastUpdatedYear",
        "GrpStatus",
        "ID",
        "DateOfBirthId",
    )
    for column in unused_columns:
        row.pop(column, None)

    # Surface any column this parser does not know about yet.
    if len(row) > 0:
        pprint(row)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
def parse_entry(context, entry):
    """Build an entity, its sanction and any passports from one XML entry.

    Entries whose ``type-entry`` is ``"2"`` become "Person" entities, all
    others "LegalEntity". One "Passport" entity is emitted per
    ``document-list`` child; the main entity and the sanction are emitted
    last.
    """
    entity = context.make("LegalEntity")
    if entry.findtext("./type-entry") == "2":
        entity = context.make("Person")
    entity.id = context.make_slug(entry.findtext("number-entry"))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))

    raw_listed = entry.findtext("./date-entry")
    if raw_listed:
        listed = datetime.strptime(raw_listed, "%Y%m%d")
        entity.context["created_at"] = listed.isoformat()
        sanction.add("startDate", listed.date())

    # (property, child path) for the four name components of an aka node.
    name_components = (
        ("firstName", "./aka-name1"),
        ("secondName", "./aka-name2"),
        ("middleName", "./aka-name3"),
        ("lastName", "./aka-name4"),
    )
    for aka in entry.findall("./aka-list"):
        parts = []
        for prop, child_path in name_components:
            part = aka.findtext(child_path)
            entity.add(prop, part, quiet=True)
            parts.append(part)
        name = jointext(*parts)

        # "N" marks the primary name; otherwise quality "2" marks a weak alias.
        if aka.findtext("type-aka") == "N":
            entity.add("name", name)
        elif aka.findtext("./quality-aka") == "2":
            entity.add("weakAlias", name)
        else:
            entity.add("alias", name)

    for title_node in entry.findall("./title-list"):
        entity.add("title", title_node.text, quiet=True)

    for document in entry.findall("./document-list"):
        reg = document.findtext("./document-reg")
        number = document.findtext("./document-id")
        country = document.findtext("./document-country")
        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number, country)
        passport.add("holder", entity)
        passport.add("passportNumber", number)
        passport.add("summary", reg)
        passport.add("country", country)
        context.emit(passport)

    for id_node in entry.findall("./id-number-list"):
        entity.add("idNumber", id_node.text)

    for address_node in entry.findall("./address-list"):
        located = h.make_address(context, full=address_node.findtext("./address"))
        h.apply_address(context, entity, located)

    for birth_place in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", birth_place.text)

    for birth_date in entry.findall("./date-of-birth-list"):
        entity.add_cast("Person", "birthDate", parse_date(birth_date.text))

    for nationality in entry.findall("./nationality-list"):
        for raw_country in multi_split(nationality.text, [";", ","]):
            entity.add("nationality", remove_bracketed(raw_country), quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
def crawl_organizations(context: Context):
    """Read the organisations workbook and emit entities, sanctions and links.

    Each spreadsheet record becomes an "Organization" with a sanction. Records
    may reference each other through ``linked_to_internal_seq_id`` (a
    ``;``-separated list of sequence ids); those references are collected
    while reading and emitted as "UnknownLink" entities once every record's
    entity id is known. Columns are popped from the record as they are used,
    and anything left over is pretty-printed.
    """
    path = context.fetch_resource("organizations.xlsx", ORG_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)

    seq_ids = {}  # internal sequence id -> entity id
    links = []  # (larger seq id, smaller seq id) pairs, resolved after the loop
    # (column-name prefix, property) for the open-ended numbered columns.
    prefixed_columns = (
        ("organization_name_", "alias"),
        ("telephone", "phone"),
        ("website", "website"),
    )

    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        name_en = record.pop("organization_name_english", None)
        name_he = record.pop("organization_name_hebrew", None)
        entity = context.make("Organization")
        entity.id = context.make_id(name_en, name_he)
        if entity.id is None:
            continue
        if seq_id is not None:
            seq_ids[seq_id] = entity.id

        entity.add("name", name_en)
        entity.add("name", name_he)
        entity.add("topics", "crime.terror")
        entity.add("notes", lang_pick(record, "comments"))
        entity.add("notes", record.pop("column_42", None))
        entity.add("email", record.pop("email", None))
        entity.add("country", record.pop("country_hebrew", None))
        entity.add("country", record.pop("country_english", None))
        entity.add("registrationNumber", record.pop("corporation_id", None))
        entity.add("legalForm", lang_pick(record, "corporation_type"))
        entity.add("jurisdiction", lang_pick(record, "location_of_formation"))
        date = parse_date(record.pop("date_of_corporation", None))
        entity.add("incorporationDate", date)

        # Iterate over a snapshot of the keys because matching columns are
        # popped from the record inside the loop.
        for field in list(record.keys()):
            for prefix, prop in prefixed_columns:
                if field.startswith(prefix):
                    entity.add(prop, record.pop(field, None))
        entity.add("phone", record.pop("column_70", None))
        entity.add("website", record.pop("column_73", None))

        sanction = h.make_sanction(context, entity)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("seq_num_in_other_countries", None))
        sanction.add("program", record.pop("designation_type", None))
        sanction.add("reason", lang_pick(record, "designation_justification"))
        sanction.add("authority", lang_pick(record, "designated_by"))
        sanction.add("publisher", record.pop("public_records_references", None))

        # Consumed but not stored (lang_pick presumably pops the per-language
        # columns from the record -- confirm).
        lang_pick(record, "designated_by_abroad")
        record.pop("date_designated_in_other_countries", None)

        # A missing or empty cell must not produce a link: "".split(";")
        # yields [""], and max()/min() of a str and None raises TypeError.
        linked = record.pop("linked_to_internal_seq_id", None) or ""
        for link in linked.split(";"):
            if not link or seq_id is None:
                continue
            # Order the pair so that A->B and B->A produce the same tuple.
            links.append((max(link, seq_id), min(link, seq_id)))

        street = lang_pick(record, "street")
        city = lang_pick(record, "city_village")
        if street or city:
            address = h.make_address(
                context, street=street, city=city, country_code=entity.first("country")
            )
            h.apply_address(context, entity, address)

        for field in (
            "date_of_temporary_designation",
            "date_of_permenant_designation",
            "date_designation_in_west_bank",
        ):
            sanction.add("startDate", parse_date(record.pop(field, None)))

        context.emit(entity, target=True)
        context.emit(sanction)

        # Surface any column this crawler does not know about yet.
        if len(record):
            context.pprint(record)

    for subject_seq, object_seq in links:
        subject_id = seq_ids.get(subject_seq)
        object_id = seq_ids.get(object_seq)
        # Skip references to sequence ids that never produced an entity.
        if subject_id is None or object_id is None:
            continue
        link = context.make("UnknownLink")
        link.id = context.make_id(subject_id, object_id)
        link.add("subject", subject_id)
        link.add("object", object_id)
        context.emit(link)
def parse_result(context, result):
    """Map one API result dict onto an entity and a "Sanction", then emit both.

    The schema comes from a lookup on the result's ``type``; unknown types are
    logged as errors and skipped. Keys are popped from ``result`` as they are
    handled, and whatever remains afterwards is pretty-printed.
    """
    result_type = result.pop("type", None)
    schema = context.lookup_value("type", result_type)
    if schema is None:
        context.log.error("Unknown result type", type=result_type)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))
    # When an entity number is present, the ID is replaced by one built via SDN.
    sdn_number = result.pop("entity_number", None)
    if sdn_number is not None:
        assert int(sdn_number)
        entity.id = SDN.make_slug(sdn_number)

    entity.add("name", result.pop("name", None))
    for alt_name in ensure_list(result.pop("alt_names", "")):
        entity.add("alias", alt_name.split("; "))
    entity.add("notes", result.pop("remarks", None))
    entity.add("country", result.pop("country", None))

    if entity.schema.is_a("Person"):
        entity.add("position", result.pop("title", None))
        entity.add("nationality", result.pop("nationalities", None))
        entity.add("nationality", result.pop("citizenships", None))
        for raw_dob in result.pop("dates_of_birth", []):
            entity.add("birthDate", h.parse_date(raw_dob, FORMATS))
        entity.add("birthPlace", result.pop("places_of_birth", None))
    elif entity.schema.is_a("Vessel"):
        entity.add("flag", result.pop("vessel_flag", None))
        entity.add("callSign", result.pop("call_sign", None))
        entity.add("type", result.pop("vessel_type", None))
        entity.add(
            "grossRegisteredTonnage",
            result.pop("gross_registered_tonnage", None),
        )
        entity.add("tonnage", result.pop("gross_tonnage", None))
        # TODO: make adjacent owner entity
        result.pop("vessel_owner", None)

    # The person-only fields must be absent or empty by now: the Person branch
    # has already consumed them, other schemata are not expected to carry them.
    assert result.pop("title", None) is None
    for person_key in (
        "nationalities",
        "citizenships",
        "dates_of_birth",
        "places_of_birth",
    ):
        assert not len(result.pop(person_key, []))

    for raw_address in result.pop("addresses", []):
        located = h.make_address(
            context,
            street=raw_address.get("address"),
            city=raw_address.get("city"),
            postal_code=raw_address.get("postal_code"),
            region=raw_address.get("state"),
            country=raw_address.get("country"),
        )
        h.apply_address(context, entity, located)

    for ident in result.pop("ids", []):
        country = ident.pop("country")
        entity.add("country", country)
        feature_type = ident.pop("type")
        feature_value = ident.pop("number")
        issued = ident.pop("issue_date", None)
        expires = ident.pop("expiration_date", None)
        h.apply_feature(
            context,
            entity,
            feature_type,
            feature_value,
            country=country,
            date_formats=FORMATS,
            start_date=issued,
            end_date=expires,
        )

    sanction = context.make("Sanction")
    sanction.id = context.make_id(entity.id, "Sanction")
    sanction.add("entity", entity)
    sanction.add("program", result.pop("programs", []))
    sanction.add("status", result.pop("license_policy", []))
    sanction.add("reason", result.pop("license_requirement", []))
    sanction.add("reason", result.pop("federal_register_notice", None))
    sanction.add("startDate", result.pop("start_date", None))
    sanction.add("endDate", result.pop("end_date", None))
    sanction.add("country", "us")
    sanction.add("authority", result.pop("source", None))

    # TODO: deref
    information_url = result.pop("source_information_url")
    sanction.add("sourceUrl", deref_url(context, information_url))
    result.pop("source_list_url")
    # TODO: what is this?
    result.pop("standard_order", None)

    context.emit(sanction)
    context.emit(entity, target=True)

    # Surface any key this parser does not know about yet.
    if len(result) > 0:
        context.pprint(result)
def crawl_row(context: Context, data: Dict[str, str]):
    """Turn one source record into a sanctioned entity and emit it.

    The record may use either of two column-naming styles (e.g. ``COMMENTS``
    vs. ``Comments``); both spellings are consumed for every field. The entity
    starts as "LegalEntity" and person-specific values are added through
    ``add_cast``. Columns are popped from ``data`` as they are handled and the
    remainder is passed to ``h.audit_data``.

    Raises:
        AssertionError: if no entity ID could be derived from the record.
    """
    entity = context.make("LegalEntity")

    # Either ID column may be present. Both pops need a default: a nested
    # pop without one is evaluated eagerly and raises KeyError whenever only
    # "INDIVIDUAL_Id" is supplied.
    ind_id = data.pop("INDIVIDUAL_Id", None)
    alt_id = data.pop("IndividualID", None)
    if ind_id is None:
        ind_id = alt_id
    entity.id = context.make_slug(ind_id)
    assert entity.id, data

    for column in ("COMMENTS", "Comments", "NOTE", "NOTE1", "NOTE2", "NOTE3"):
        entity.add("notes", h.clean_note(data.pop(column, None)))

    # (property, column) pairs that are only valid on the Person schema.
    person_columns = (
        ("nationality", "NATIONALITY"),
        ("nationality", "Nationality"),
        ("title", "TITLE"),
        ("title", "Title"),
        ("position", "DESIGNATION"),
        ("position", "Designation"),
        ("birthPlace", "PLACEOFBIRTH"),
        ("birthPlace", "IndividualPlaceOfBirth"),
        ("birthPlace", "CITY_OF_BIRTH"),
        ("birthDate", "YEAR"),
        ("gender", "GENDER"),
    )
    for prop, column in person_columns:
        entity.add_cast("Person", prop, data.pop(column, None))
    for column in ("DATE", "DATE_OF_BIRTH", "IndividualDateOfBirth"):
        entity.add_cast("Person", "birthDate", parse_date(data.pop(column, None)))

    # Birth city and state are consumed but not stored.
    data.pop("BIRTHPLACE_x0020_CITY", None)
    data.pop("BIRTHPLACE_x0020_STATE_PROVINCE", None)
    entity.add("country", data.pop("BIRTHPLACE_x0020_COUNTRY", None))
    entity.add("country", data.pop("COUNTRY_OF_BIRTH", None))
    entity.add_cast("Person", "birthPlace", data.pop("BIRTHPLACE_x0020_NOTE", None))

    h.apply_name(
        entity,
        full=data.pop("FullName", None),
        given_name=data.pop("FIRST_NAME", None),
        second_name=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    # Original-script names containing "?" are dropped (presumably because
    # they are encoding-damaged -- confirm).
    alias = data.pop("NAME_ORIGINAL_SCRIPT", None)
    if alias is not None and "?" not in alias:
        entity.add("alias", alias)
    entity.add("alias", data.pop("SORT_KEY", None))
    data.pop("IndividualAlias", None)

    entity.add_cast("Person", "passportNumber", data.pop("PASSPORT", None))
    entity.add_cast("Person", "passportNumber", data.pop("IndividualDocument", None))
    # Issue date and city are consumed but not stored.
    data.pop("DATE_OF_ISSUE", None)
    data.pop("CITY_OF_ISSUE", None)
    entity.add("country", data.pop("COUNTRY_OF_ISSUE", None))
    entity.add_cast("Person", "idNumber", data.pop("IDNUMBER", None))

    address = h.make_address(
        context,
        full=data.pop("IndividualAddress", None),
        street=data.pop("STREET", None),
        city=data.pop("CITY", None),
        region=data.pop("STATE_PROVINCE", None),
        postal_code=data.pop("ZIP_CODE", None),
        country=data.pop("COUNTRY", None),
    )
    h.apply_address(context, entity, address)

    sanction = h.make_sanction(context, entity)
    inserted_at = parse_date(data.pop("DateInserted", None))
    listed_on = data.pop("ListedON", data.pop("ListedOn", None))
    listed_at = parse_date(listed_on)
    # Each date falls back to the other when its preferred source is empty.
    entity.add("createdAt", inserted_at or listed_at)
    sanction.add("listingDate", listed_at or inserted_at)
    sanction.add("startDate", data.pop("FROM_YEAR", None))
    sanction.add("endDate", data.pop("TO_YEAR", None))
    sanction.add("program", data.pop("UN_LIST_TYPE", None))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER", None))
    sanction.add("unscId", data.pop("ReferenceNumber", None))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))

    entity.add("topics", "sanction")
    h.audit_data(data, ignore=["VERSIONNUM", "TYPE_OF_DATE", "ApplicationStatus"])
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_company(context: Context, data: Dict[str, Any]):
    """Emit an organisation and its person relations from one company record.

    The entity starts as "Organization" and is switched to "Company" when a
    person relation places it in the "asset" role of an "Organization"-typed
    lookup result. Related countries are added as properties chosen by the
    ``country_links`` lookup; related persons become relation entities chosen
    by the ``person_relations`` lookup. Keys are popped from ``data`` as they
    are handled and the remainder is passed to ``h.audit_data``.
    """
    entity = context.make("Organization")
    entity.id = company_id(context, data.pop("id"))
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)
    entity.add("name", data.pop("name_en", None))
    entity.add("name", data.pop("name_ru", None))
    entity.add("name", data.pop("name_suggest_output_ru", None))
    entity.add("alias", data.pop("also_known_as", None))
    entity.add("alias", data.pop("short_name_en", None))
    entity.add("alias", data.pop("short_name_ru", None))
    entity.add("incorporationDate", parse_date(data.pop("founded", None)))
    entity.add("dissolutionDate", parse_date(data.pop("closed", None)))
    entity.add("status", data.pop("status_en", data.pop("status_ru", None)))
    entity.add("status", data.pop("status", None))
    entity.add_cast("Company", "ogrnCode", data.pop("ogrn_code", None))
    entity.add("registrationNumber", data.pop("edrpou", None))

    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        # Prefer the English country name, fall back to the Russian one.
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru")
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warning(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        # A lookup result without a property means the link is known but ignored.
        if res.prop is not None:
            entity.add(res.prop, country_name)

    for rel_data in data.pop("related_persons", []):
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id"))
        other_id = person_id(context, rel_data.pop("person_id"), other_wdid)
        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.info(
                "Unknown company/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other_id,
            )
            continue
        # A lookup result without a schema means the relation is known but ignored.
        if res.schema is None:
            continue
        if res.schema == "Organization" and res.from_prop == "asset":
            entity.schema = model.get("Company")

        rel = context.make(res.schema)
        id_a_short = short_id(context, entity.id)
        id_b_short = short_id(context, other_id)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, entity.id)
        rel.add(res.to_prop, other_id)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed")))
        rel.add("startDate", parse_date(rel_data.pop("date_established")))
        rel.add("endDate", parse_date(rel_data.pop("date_finished")))
        context.emit(rel)

    # TODO: emit company/company relations. Intended to mirror the
    # related_persons handling, resolving the other side with company_id()
    # and the relation type through a "company_relations" lookup. Until then
    # the key is only consumed so that the audit below does not report it.
    data.pop("related_companies", None)

    address = h.make_address(
        context,
        street=data.pop("street", None),
        city=data.pop("city", None),
    )
    h.apply_address(context, entity, address)

    if data.pop("state_company", False):
        entity.add("topics", "gov.soe")

    ignore = [
        "wiki",
        "bank_name",
        "other_founders",
        "other_owners",
        "other_managers",
        "other_recipient",
    ]
    h.audit_data(data, ignore=ignore)
    context.emit(entity)