def crawl(context: Context):
    """Parse the source HTML tables and emit an entity and sanction per row.

    The page is fetched as ``source.html`` and exported as a resource. Every
    table inside ``div.editor-content`` is then walked row by row:

    * the first row of a table supplies the column headers (slugified),
    * a row with a single cell selects the schema (via ``TYPES``) for the
      rows that follow it,
    * any other row is treated as one listed subject.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="utf-8") as fh:
        markup = fh.read()
    doc = html.fromstring(markup)

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]

            # First row of each table: column names.
            if headers is None:
                headers = [slugify(cell, sep="_") for cell in cells]
                continue

            # Single-cell row: section heading naming the entity type.
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue

            record = dict(zip(headers, cells))
            entity = context.make(schema)

            name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            # Text before the first "(" is the name; each following
            # bracketed chunk (up to its ")") is recorded as an alias.
            primary, *bracketed = name.split("(")
            entity.add("name", primary)
            for chunk in bracketed:
                entity.add("alias", chunk.split(")")[0])

            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))

            # Peel known markers off the end of the details text, one
            # marker at a time, in the order given by CHOPSKA.
            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            for chop, prop in CHOPSKA:
                pieces = details.rsplit(chop, 1)
                details = pieces[0]
                if len(pieces) == 1:
                    continue
                tail = pieces[1]
                if prop == "address":
                    addr = h.make_address(context, full=tail)
                    h.apply_address(context, entity, addr)
                else:
                    entity.add(prop, tail)

            # Whatever is left over must be covered by the "details" lookup.
            if details.strip():
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            sanction.add("provisions", record.pop("zastosowane_srodki_sankcyjne"))

            listed_on = record.pop("data_umieszczenia_na_liscie").replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed_on, ["%d.%m.%Y"]))

            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
def crawl(context: Context):
    """Parse the numbered list in the source HTML and emit persons.

    The page is fetched as ``source.html`` and exported as a resource. Only
    ``td.tailSTxt`` cells whose text starts with ``"2."`` are processed; each
    table row inside such a cell yields one ``Person`` and one sanction.
    Bracketed terms in the row text (matched by ``IN_BRACKETS``) are mapped
    to properties through the ``props`` lookup or the citizen / date of
    birth / passport prefixes handled below.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            # The name is everything before the first bracket. Taking
            # element [0] (instead of unpacking two values) also accepts
            # entries that carry no bracketed details at all.
            name = text.split("(", 1)[0]
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                # match = match.replace("\xa0", "")
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                if match.endswith("citizen"):
                    nat = match.replace("citizen", "")
                    entity.add("nationality", nat)
                    continue
                if match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                # `warning` rather than the `warn` alias, matching the
                # other crawlers in this file.
                context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)
def parse_entry(context: Context, target, programs, places, updated_at):
    """Convert one ``target`` XML element into an entity, sanction and links.

    Args:
        context: crawler context used to make, look up and emit entities.
        target: XML element with an ``entity``, ``individual`` or ``object``
            child plus ``modification`` children.
        programs: mapping from ``sanctions-set-id`` to the program value
            stored on the sanction.
        places: passed through unchanged to ``parse_identity``.
        updated_at: accepted for interface compatibility; not used here.
    """
    entity = context.make("LegalEntity")
    node = target.find("./entity")
    if node is None:
        node = target.find("./individual")
        entity = context.make("Person")
    if node is None:
        node = target.find("./object")
        if node is None:
            # None of the three known child elements exists; report it and
            # skip the target instead of failing on `node.get` below.
            context.log.warning(
                "Unknown target type", target=target, object_type=None
            )
            return
        object_type = node.get("object-type")
        if object_type != "vessel":
            context.log.warning(
                "Unknown target type", target=target, object_type=object_type
            )
        entity = context.make("Vessel")

    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)
    for other in node.findall("./other-information"):
        if other.text is None:
            # Element without text content: nothing to record.
            continue
        value = other.text.strip()
        if entity.schema.is_a("Vessel") and value.lower().startswith("imo"):
            _, imo_num = value.split(":", 1)
            entity.add("imoNumber", imo_num)
        else:
            entity.add("notes", h.clean_note(value))

    sanction = h.make_sanction(context, entity)
    dates = set()
    for mod in target.findall("./modification"):
        dates.add(mod.get("publication-date"))
        sanction.add("listingDate", mod.get("publication-date"))
        sanction.add("startDate", mod.get("effective-date"))
    # Earliest / latest publication date become created / modified stamps.
    dates_ = [d for d in dates if d is not None]
    if dates_:
        entity.add("createdAt", min(dates_))
        entity.add("modifiedAt", max(dates_))

    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))

    for justification in node.findall("./justification"):
        # TODO: should this go into sanction:reason?
        entity.add("notes", h.clean_note(justification.text))

    for relation in node.findall("./relation"):
        rel_type = relation.get("relation-type")
        target_id = context.make_slug(relation.get("target-id"))
        res = context.lookup("relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown relationship type",
                type=rel_type,
                source=entity,
                target=target_id,
            )
            continue

        rel = context.make(res.schema)
        rel.id = context.make_slug(relation.get("ssid"))
        rel.add(res.source, entity.id)
        rel.add(res.target, target_id)
        rel.add(res.text, rel_type)

        # rel_target = context.make(rel.schema.get(res.target).range)
        # rel_target.id = target_id
        # context.emit(rel_target)

        # Narrow the entity schema to what the relation's source end needs.
        entity.add_schema(rel.schema.get(res.source).range)
        context.emit(rel)

    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_person(context: Context, data: Dict[str, Any]):
    """Emit a ``Person`` plus its related persons and relations from ``data``.

    Args:
        context: crawler context used to make, look up and emit entities.
        data: one person record. It is consumed destructively: every key
            handled here is popped from the dict.
    """
    is_pep = data.pop("is_pep", False)
    entity = context.make("Person", target=is_pep)
    wikidata_id = clean_wdid(data.pop("wikidata_id", None))
    entity.id = person_id(context, data.pop("id"), wikidata_id)
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)
    entity.add("modifiedAt", data.pop("last_change", None))
    entity.add("wikidataId", wikidata_id)
    entity.add("name", data.pop("full_name_en", None))
    entity.add("name", data.pop("full_name_ru", None))
    entity.add("alias", data.pop("inversed_full_name_en", None))
    entity.add("alias", data.pop("inversed_full_name_ru", None))
    entity.add("alias", data.pop("also_known_as_en", None))
    entity.add("alias", data.pop("also_known_as_ru", None))
    entity.add("alias", split_names(data.pop("names", [])))
    entity.add("birthDate", parse_date(data.pop("date_of_birth", None)))
    entity.add("deathDate", parse_date(data.pop("termination_date_human", None)))
    entity.add("birthPlace", data.pop("city_of_birth_ru", None))
    entity.add("birthPlace", data.pop("city_of_birth_en", None))
    entity.add("innCode", data.pop("inn", None))
    entity.add("firstName", data.pop("first_name_en", None))
    entity.add("firstName", data.pop("first_name_ru", None))
    entity.add("fatherName", data.pop("patronymic_en", None))
    entity.add("fatherName", data.pop("patronymic_ru", None))
    entity.add("lastName", data.pop("last_name_en", None))
    entity.add("lastName", data.pop("last_name_ru", None))

    # The last job is given in up to three variants; each non-blank
    # workplace becomes a position, with the title appended in brackets.
    for suffix in ("", "_en", "_ru"):
        role = data.pop(f"last_job_title{suffix}", None)
        org = data.pop(f"last_workplace{suffix}", None)
        if org is None or not org.strip():
            continue
        position = org
        if role is not None and role.strip():
            position = f"{org} ({role})"
        entity.add("position", position)

    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru")
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warning(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        if res.prop is not None:
            entity.add(res.prop, country_name)
        # h.audit_data(country_data)

    for rel_data in data.pop("related_persons", []):
        other_pep = rel_data.pop("is_pep", False)
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id"))
        other = context.make("Person", target=other_pep)
        other.id = person_id(context, rel_data.pop("person_id"), other_wdid)
        other.add("name", rel_data.pop("person_en", None))
        other.add("name", rel_data.pop("person_ru", None))
        other.add("wikidataId", other_wdid)

        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown person/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other,
            )
            continue

        # Sort the two IDs so the relation gets the same slug whichever
        # side of the pair is being crawled.
        id_a, id_b = sorted((entity.id, other.id))
        rel = context.make(res.schema)
        id_a_short = short_id(context, id_a)
        id_b_short = short_id(context, id_b)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, id_a)
        rel.add(res.to_prop, id_b)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed")))
        rel.add("startDate", parse_date(rel_data.pop("date_established")))
        rel.add("endDate", parse_date(rel_data.pop("date_finished")))

        # h.audit_data(rel_data)
        context.emit(other, target=other_pep)
        context.emit(rel)

    data.pop("type_of_official_ru", None)
    person_type = data.pop("type_of_official_en", None)
    person_topic = context.lookup_value("person_type", person_type)
    if person_topic is None:
        context.log.warning("Unknown type of official", type=person_type)
    entity.add("topics", person_topic)
    if is_pep:
        entity.add("topics", "role.pep")
    entity.add("status", person_type)

    # Keys that are deliberately dropped without being mapped.
    data.pop("died", None)
    data.pop("tags", None)
    data.pop("reason_of_termination_en", None)
    data.pop("reason_of_termination_ru", None)
    # TODO: store images
    data.pop("photo", None)
    data.pop("related_companies", None)
    data.pop("declarations", None)
    # h.audit_data(data)
    context.emit(entity, target=is_pep)
def crawl_company(context: Context, data: Dict[str, Any]):
    """Emit an organization and its person relations from ``data``.

    Args:
        context: crawler context used to make, look up and emit entities.
        data: one company record. It is consumed destructively; keys left
            over at the end (other than the ignored ones) are passed to
            ``h.audit_data``.
    """
    entity = context.make("Organization")
    entity.id = company_id(context, data.pop("id"))
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)
    entity.add("name", data.pop("name_en", None))
    entity.add("name", data.pop("name_ru", None))
    entity.add("name", data.pop("name_suggest_output_ru", None))
    entity.add("alias", data.pop("also_known_as", None))
    entity.add("alias", data.pop("short_name_en", None))
    entity.add("alias", data.pop("short_name_ru", None))
    entity.add("incorporationDate", parse_date(data.pop("founded", None)))
    entity.add("dissolutionDate", parse_date(data.pop("closed", None)))
    # Prefer the English status and fall back to the Russian one. Both keys
    # are always popped. Using `or` also covers a present-but-empty
    # `status_en`, which a nested pop default would not.
    status_ru = data.pop("status_ru", None)
    status_en = data.pop("status_en", None)
    entity.add("status", status_en or status_ru)
    entity.add("status", data.pop("status", None))
    entity.add_cast("Company", "ogrnCode", data.pop("ogrn_code", None))
    entity.add("registrationNumber", data.pop("edrpou", None))

    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru")
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warning(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        if res.prop is not None:
            entity.add(res.prop, country_name)
        # h.audit_data(country_data)

    for rel_data in data.pop("related_persons", []):
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id"))
        other_id = person_id(context, rel_data.pop("person_id"), other_wdid)

        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.info(
                "Unknown company/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other_id,
            )
            continue

        # A lookup result without a schema means: known type, emit nothing.
        if res.schema is None:
            continue

        if res.schema == "Organization" and res.from_prop == "asset":
            entity.schema = model.get("Company")

        rel = context.make(res.schema)
        id_a_short = short_id(context, entity.id)
        id_b_short = short_id(context, other_id)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, entity.id)
        rel.add(res.to_prop, other_id)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed")))
        rel.add("startDate", parse_date(rel_data.pop("date_established")))
        rel.add("endDate", parse_date(rel_data.pop("date_finished")))
        context.emit(rel)

    # TODO: emit company/company relations. The intended handling mirrors
    # the related_persons loop above, using company_id() for the other end
    # and a "company_relations" lookup. Until then the key is only popped,
    # presumably so that h.audit_data() below does not flag it.
    data.pop("related_companies", None)

    address = h.make_address(
        context,
        street=data.pop("street", None),
        city=data.pop("city", None),
    )
    h.apply_address(context, entity, address)

    if data.pop("state_company", False):
        entity.add("topics", "gov.soe")

    ignore = [
        "wiki",
        "bank_name",
        "other_founders",
        "other_owners",
        "other_managers",
        "other_recipient",
    ]
    h.audit_data(data, ignore=ignore)
    context.emit(entity)
# field name -> (entity property, key popped from the value dict)
_ENTITY_FIELDS = {
    "ALIAS": ("alias", "Alias"),
    "SEXE": ("gender", "Sexe"),
    "PRENOM": ("firstName", "Prenom"),
    "NATIONALITE": ("nationality", "Pays"),
    "TITRE": ("position", "Titre"),
    "SITE_INTERNET": ("website", "SiteInternet"),
    "TELEPHONE": ("phone", "Telephone"),
    "COURRIEL": ("email", "Courriel"),
    "NUMERO_OMI": ("imoNumber", "NumeroOMI"),
    "PASSEPORT": ("passportNumber", "NumeroPasseport"),
    "AUTRE_IDENTITE": ("idNumber", "NumeroCarte"),
}

# field name -> (sanction property, key popped from the value dict)
_SANCTION_FIELDS = {
    "REFERENCE_UE": ("authorityId", "ReferenceUe"),
    "REFERENCE_ONU": ("unscId", "ReferenceOnu"),
    # TODO: derive target countries?
    "FONDEMENT_JURIDIQUE": ("program", "FondementJuridiqueLabel"),
}


def apply_prop(context: Context, entity, sanction, field, value):
    """Apply one typed detail record to the entity or its sanction.

    ``field`` names the kind of detail and ``value`` is the dict holding its
    payload; the keys that are consumed are popped from ``value``. A field
    that is not recognised is logged as a warning and otherwise ignored.
    """
    # One-to-one mappings onto the entity.
    if field in _ENTITY_FIELDS:
        prop, key = _ENTITY_FIELDS[field]
        entity.add(prop, value.pop(key))
        return

    # One-to-one mappings onto the sanction.
    if field in _SANCTION_FIELDS:
        prop, key = _SANCTION_FIELDS[field]
        sanction.add(prop, value.pop(key))
        return

    if field == "DATE_DE_NAISSANCE":
        year = value.pop("Annee")
        month = value.pop("Mois")
        day = value.pop("Jour")
        entity.add("birthDate", parse_parts(year, month, day))
        return

    if field in ("ADRESSE_PM", "ADRESSE_PP"):
        full = value.pop("Adresse")
        country = value.pop("Pays")
        address = h.make_address(context, full=full, country=country)
        h.apply_address(context, entity, address)
        return

    if field == "LIEU_DE_NAISSANCE":
        entity.add("birthPlace", value.pop("Lieu"))
        entity.add("country", value.pop("Pays"))
        return

    if field == "IDENTIFICATION":
        comment = value.pop("Commentaire")
        content = value.pop("Identification")
        result = context.lookup("identification", comment)
        if result is None:
            context.log.warning(
                "Unknown Identification type",
                comment=comment,
                content=content,
            )
            return
        if result.prop is None:
            return
        # The lookup may override the schema the property is cast to.
        schema = result.schema or entity.schema
        entity.add_cast(schema, result.prop, content)
        if result.prop == "notes":
            # For notes, the comment text is recorded alongside the content.
            entity.add(result.prop, h.clean_note(comment))
        return

    if field == "MOTIFS":
        motifs = value.pop("Motifs")
        sanction.add("reason", motifs)
        entity.add("notes", motifs)
        return

    context.log.warning("Unknown field", field=field, value=value)