def crawl_common(context: Context, data: Dict[str, str], part: str, schema: str):
    """Build the shared part of a listed entity and emit its sanction.

    An entity of ``schema`` is created and filled with notes, an alias and
    name parts taken from ``data``; every key that is read is removed from
    ``data`` with ``pop``. A sanction is then created for the entity, filled
    with dates, programs and identifiers, and emitted.

    The entity itself is returned rather than emitted, so the caller can
    keep adding to it.
    """
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")

    comments = data.pop("COMMENTS1")
    entity.add("notes", h.clean_note(comments))
    extra_note = data.pop("NOTE", None)
    entity.add("notes", h.clean_note(extra_note))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    # FIRST_NAME..FOURTH_NAME map onto the name1..name4 keyword arguments.
    name_columns = ("FIRST_NAME", "SECOND_NAME", "THIRD_NAME", "FOURTH_NAME")
    name_parts = {
        f"name{position}": data.pop(column, None)
        for position, column in enumerate(name_columns, start=1)
    }
    h.apply_name(entity, quiet=True, **name_parts)

    sanction = h.make_sanction(context, entity)
    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))
    # Submission date wins over listing date; modification date is the
    # last resort for the entity's creation date only.
    listing_date = submitted_on or listed_on

    entity.add("createdAt", listing_date or modified_at)
    entity.add("modifiedAt", modified_at)
    sanction.add("listingDate", listing_date)
    sanction.add("startDate", listed_on)
    for column in ("UN_LIST_TYPE", "LIST_TYPE"):
        sanction.add("program", data.pop(column))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
def parse_party(context: Context, distinct_party, locations, documents):
    """Turn one distinct-party element into an entity and emit it.

    The schema is looked up in ``TYPES`` by the party type value, falling
    back to the party sub-type value. If neither is known an error is
    logged and nothing is emitted (the function returns ``None``).
    Otherwise the emitted party entity is returned.
    """
    profile = distinct_party.find("Profile")
    profile_id = profile.get("ID")

    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    sub_type_schema = TYPES.get(sub_type.get("Value"))
    type_ = ref_value("PartyType", sub_type.get("PartyTypeID"))
    # The party type takes precedence; the sub-type is only the default.
    schema = TYPES.get(type_, sub_type_schema)
    if schema is None:
        context.log.error("Unknown party type", value=type_)
        return

    party = context.make(schema)
    party.id = context.make_slug(profile_id)
    party.add("notes", h.clean_note(distinct_party.findtext("Comment")))
    party.add("sourceUrl", URL % profile_id)

    for identity in profile.findall("./Identity"):
        # Map each name-part group ID to the value of its name-part type.
        parts = {
            group.get("ID"): ref_value("NamePartType", group.get("NamePartTypeID"))
            for group in identity.findall(".//NamePartGroup")
        }

        for alias in identity.findall("./Alias"):
            parse_alias(party, parts, alias)

        identity_docs = documents.get(identity.get("ID"), [])
        for regdoc in identity_docs:
            parse_registration_doc(context, party, regdoc)

    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)

    context.emit(party, target=True)
    return party
def parse_reference(context: Context, reference: int, rows):
    """Merge all rows sharing one reference number into a single entity.

    ``rows`` is iterated three times (schema, names, details), so it must be
    a re-iterable collection of dict-like rows; the keys that are read are
    removed from each row. All rows must resolve to the same schema.

    If an entity type or name type cannot be resolved, a warning is logged
    and the function returns without emitting anything.
    """
    # Pass 1: every row must agree on the schema.
    schemata = set()
    for row in rows:
        raw_type = row.pop("type")
        resolved = context.lookup_value("type", raw_type)
        if resolved is None:
            context.log.warning("Unknown entity type", type=raw_type)
            return
        schemata.add(resolved)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Pass 2: names. The last name mapped to the "name" property is used
    # to build the entity ID.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    # Pass 3: addresses, personal details and sanction details.
    citizenship_markers = ["a)", "b)", "c)", "d)"]
    for row in rows:
        raw_address = row.pop("address")
        if raw_address is not None:
            for fragment in multi_split(raw_address, SPLITS):
                address = h.make_address(context, full=fragment)
                h.apply_address(context, entity, address)

        sanction.add("program", row.pop("committees"))

        nationalities = multi_split(row.pop("citizenship"), citizenship_markers)
        entity.add("nationality", nationalities, quiet=True)
        birth_dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", birth_dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))

        listing_info = row.pop("listing_information")
        if not isinstance(listing_info, datetime):
            # TODO: consider parsing if it's not a datetime?
            sanction.add("summary", listing_info)
        else:
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def make_entity(context: Context, el, schema, entity_id):
    """Create a sanctioned entity from ``el`` and emit its sanction.

    The entity gets the given ID, the text of ``./note`` as a note and the
    "sanction" topic. A sanction carrying the text of ``./correction`` as
    its summary is emitted. The entity is returned without being emitted.
    """
    entity = context.make(schema, target=True)
    entity.id = entity_id

    note_text = el.findtext("./note")
    entity.add("notes", h.clean_note(note_text))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    correction_text = el.findtext("./correction")
    sanction.add("summary", correction_text)
    context.emit(sanction)

    return entity
def parse_foreign_persons(context: Context, entity, text):
    """Peel comma-separated trailing fragments off ``text`` onto ``entity``.

    Working from the right, each non-empty fragment after a comma is
    recorded as a birth date when ``parse_format`` yields a value with a
    non-``None`` ``text`` for it, and as a note otherwise. Whatever remains
    once no comma is left is handed to ``parse_name``.
    """
    while "," in text:
        text, _, tail = text.rpartition(",")
        fragment = tail.strip()
        if not fragment:
            continue

        parsed = parse_format(fragment, "%d.%m.%Y г. р.")
        if parsed.text is None:
            entity.add("notes", h.clean_note(fragment))
        else:
            entity.add("birthDate", parsed)

    parse_name(entity, text)
def crawl_physical(context: Context) -> None:
    """Emit one ``Person`` entity per row of the physical-persons resource.

    Rows are loaded with ``json_resource`` from ``PHYSICAL_URL``. The keys
    read here are removed from each row before the remainder is passed to
    ``handle_sanction``.
    """
    for row in json_resource(context, PHYSICAL_URL, "physical"):
        person = context.make("Person")
        decree_id = row.pop("ukaz_id")
        position_index = row.pop("index")
        person.id = context.make_slug("person", decree_id, position_index)

        for name_key in ("name_ukr", "name_original"):
            person.add("name", row.pop(name_key, None))

        alternative_names = row.pop("name_alternative", None)
        for alias in multi_split(alternative_names, [";", "/"]):
            person.add("alias", alias)

        person.add("notes", h.clean_note(row.pop("additional", None)))

        citizenships = row.pop("citizenship", None)
        for country in multi_split(citizenships, [", "]):
            person.add("nationality", country)

        person.add("birthDate", row.pop("birthdate", None))
        person.add("birthPlace", row.pop("birthplace", None))

        occupation = row.pop("occupation", None)
        person.add("position", remove_control_chars(occupation))

        handle_address(context, person, row.pop("livingplace", None))
        handle_sanction(context, person, row)
        context.emit(person, target=True)
def parse_common(context: Context, entity, node):
    """Fill ``entity`` from the common child elements of ``node``.

    Sets the entity ID, name parts, alias, notes and topic, then creates a
    sanction for it and copies the listing date, update date, program and
    reference number onto it. The sanction is returned; neither the entity
    nor the sanction is emitted here.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))

    name_parts = {
        "given_name": node.findtext("./FIRST_NAME"),
        "second_name": node.findtext("./SECOND_NAME"),
        "name3": node.findtext("./THIRD_NAME"),
        "name4": node.findtext("./FOURTH_NAME"),
    }
    h.apply_name(entity, quiet=True, **name_parts)

    entity.add("alias", node.findtext("./NAME_ORIGINAL_SCRIPT"))
    entity.add("notes", h.clean_note(node.findtext("./COMMENTS1")))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)

    # The same listing date feeds three properties.
    listed_on = node.findtext("./LISTED_ON")
    entity.add("createdAt", listed_on)
    sanction.add("listingDate", listed_on)
    sanction.add("startDate", listed_on)

    # values() is invoked once per target, as the result may not be reusable.
    updated_node = node.find("./LAST_DAY_UPDATED")
    sanction.add("modifiedAt", values(updated_node))
    entity.add("modifiedAt", values(updated_node))

    sanction.add("program", node.findtext("./UN_LIST_TYPE"))
    sanction.add("unscId", node.findtext("./REFERENCE_NUMBER"))
    return sanction
def parse_entry(context: Context, target, programs, places, updated_at):
    """Parse one ``target`` element into an entity, sanction and relations.

    The target's payload is the first of ``./entity`` (LegalEntity),
    ``./individual`` (Person) or ``./object`` (Vessel) that is present. If
    none is present a warning is logged and the target is skipped.

    Args:
        context: crawler context used to make, look up and emit entities.
        target: XML element describing the sanctions target.
        programs: mapping from ``sanctions-set-id`` to a program value.
        places: passed through unchanged to ``parse_identity``.
        updated_at: accepted for interface compatibility; not used here.
    """
    entity = context.make("LegalEntity")
    node = target.find("./entity")
    if node is None:
        node = target.find("./individual")
        entity = context.make("Person")
    if node is None:
        node = target.find("./object")
        if node is None:
            # Nothing recognisable to parse; previously this crashed with
            # an AttributeError on ``node.get``.
            context.log.warning(
                "Unknown target type", target=target, object_type=None
            )
            return
        object_type = node.get("object-type")
        if object_type != "vessel":
            context.log.warning(
                "Unknown target type", target=target, object_type=object_type
            )
        entity = context.make("Vessel")

    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)

    for other in node.findall("./other-information"):
        if other.text is None:
            # Empty element: there is no text to record.
            continue
        value = other.text.strip()
        is_imo = entity.schema.is_a("Vessel") and value.lower().startswith("imo")
        # partition() never raises; ``sep`` is empty when there is no colon,
        # in which case the text is kept as a plain note.
        _, sep, imo_num = value.partition(":")
        if is_imo and sep:
            entity.add("imoNumber", imo_num)
        else:
            entity.add("notes", h.clean_note(value))

    sanction = h.make_sanction(context, entity)
    publication_dates = set()
    for mod in target.findall("./modification"):
        published = mod.get("publication-date")
        if published is not None:
            publication_dates.add(published)
        sanction.add("listingDate", published)
        sanction.add("startDate", mod.get("effective-date"))

    # Earliest and latest publication dates bound the entity's lifetime.
    if publication_dates:
        entity.add("createdAt", min(publication_dates))
        entity.add("modifiedAt", max(publication_dates))

    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))

    for justification in node.findall("./justification"):
        # TODO: should this go into sanction:reason?
        entity.add("notes", h.clean_note(justification.text))

    for relation in node.findall("./relation"):
        rel_type = relation.get("relation-type")
        target_id = context.make_slug(relation.get("target-id"))
        res = context.lookup("relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown relationship type",
                type=rel_type,
                source=entity,
                target=target_id,
            )
            continue

        rel = context.make(res.schema)
        rel.id = context.make_slug(relation.get("ssid"))
        rel.add(res.source, entity.id)
        rel.add(res.target, target_id)
        rel.add(res.text, rel_type)

        # The related target is referenced by ID only; no stub entity is
        # emitted for it here.
        entity.add_schema(rel.schema.get(res.source).range)
        context.emit(rel)

    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_organizations(context: Context):
    """Emit organizations, their sanctions and the links between them.

    Each record of the spreadsheet fetched from ``ORG_URL`` becomes an
    ``Organization`` plus a sanction. Records whose names yield no ID are
    skipped. Cross references between records (by internal sequence ID)
    are collected during the pass and emitted afterwards as
    ``UnknownLink`` entities, once every sequence ID has been seen.
    """
    path = context.fetch_resource("organizations.xlsx", ORG_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)
    seq_ids = {}
    links = []
    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        name_en = record.pop("organization_name_english", None)
        name_he = record.pop("organization_name_hebrew", None)
        entity = context.make("Organization")
        entity.id = context.make_id(name_en, name_he)
        if entity.id is None:
            continue
        if seq_id is not None:
            seq_ids[seq_id] = entity.id
        entity.add("name", name_en)
        entity.add("name", name_he)
        entity.add("topics", "crime.terror")
        entity.add("notes", h.clean_note(lang_pick(record, "comments")))
        entity.add("notes", h.clean_note(record.pop("column_42", None)))
        entity.add("email", record.pop("email", None))
        entity.add("country", record.pop("country_hebrew", None))
        entity.add("country", record.pop("country_english", None))
        entity.add("registrationNumber", record.pop("corporation_id", None))
        entity.add("legalForm", lang_pick(record, "corporation_type"))
        entity.add("jurisdiction", lang_pick(record, "location_of_formation"))
        date = parse_date(record.pop("date_of_corporation", None))
        entity.add("incorporationDate", date)

        # Remaining name/phone/website columns share a prefix. Iterate over
        # a snapshot of the keys because the record is mutated in the loop.
        for field in list(record.keys()):
            if field.startswith("organization_name_"):
                entity.add("alias", record.pop(field, None))
            if field.startswith("telephone"):
                entity.add("phone", record.pop(field, None))
            if field.startswith("website"):
                entity.add("website", record.pop(field, None))
        entity.add("phone", record.pop("column_70", None))
        entity.add("website", record.pop("column_73", None))

        sanction = h.make_sanction(context, entity)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("seq_num_in_other_countries", None))
        sanction.add("program", record.pop("designation_type", None))
        sanction.add("reason", lang_pick(record, "designation_justification"))
        sanction.add("authority", lang_pick(record, "designated_by"))
        sanction.add("publisher", record.pop("public_records_references", None))

        # Result deliberately discarded.
        # NOTE(review): presumably called only so lang_pick consumes these
        # columns before the audit below - confirm against lang_pick.
        lang_pick(record, "designated_by_abroad")
        record.pop("date_designated_in_other_countries", None)

        # A link needs both ends: skip when the cell is missing/None or the
        # record has no sequence ID (max/min cannot order a str and None).
        linked = record.pop("linked_to_internal_seq_id", None)
        if linked is not None and seq_id is not None:
            for link in linked.split(";"):
                if not link:
                    continue
                # Order the pair so A->B and B->A collapse to one tuple.
                links.append((max(link, seq_id), min(link, seq_id)))

        street = lang_pick(record, "street")
        city = lang_pick(record, "city_village")
        if street or city:
            address = h.make_address(
                context, street=street, city=city, country_code=entity.first("country")
            )
            h.apply_address(context, entity, address)

        for field in (
            "date_of_temporary_designation",
            "date_of_permenant_designation",
            "date_designation_in_west_bank",
        ):
            parse_interval(sanction, record.pop(field, None))

        context.emit(entity, target=True)
        context.emit(sanction)
        h.audit_data(record)

    # dict.fromkeys drops duplicate pairs while keeping first-seen order.
    for subject_seq, object_seq in dict.fromkeys(links):
        subject_id = seq_ids.get(subject_seq)
        object_id = seq_ids.get(object_seq)
        if subject_id is None or object_id is None:
            continue
        link = context.make("UnknownLink")
        link.id = context.make_id(subject_id, object_id)
        link.add("subject", subject_id)
        link.add("object", object_id)
        context.emit(link)
def emit_row(context: Context, sheet: str, section: str, row: Dict[str, List[str]]):
    """Emit the entity and sanction described by one spreadsheet row.

    The schema is looked up from ``section``; an unknown section is logged
    and skipped. Rows whose English and Japanese names yield no ID are
    skipped silently. Keys are removed from ``row`` as they are read.
    """
    schema = context.lookup_value("schema", section)
    if schema is None:
        context.log.warning("No schema for section", section=section, sheet=sheet)
        return

    entity = context.make(schema)
    name_english = row.pop("name_english")
    name_japanese = row.pop("name_japanese")
    entity.id = context.make_id(*name_english, *name_japanese)
    if entity.id is None:
        return

    entity.add("name", parse_names(name_english))
    # Japanese names are the primary name only when no English one stuck.
    japanese_prop = "alias" if entity.has("name") else "name"
    entity.add(japanese_prop, parse_names(name_japanese))

    name_columns = (
        ("alias", "alias"),
        ("alias", "known_alias"),
        ("weakAlias", "weak_alias"),
        ("weakAlias", "nickname"),
        ("previousName", "past_alias"),
        ("previousName", "old_name"),
    )
    for prop, column in name_columns:
        entity.add(prop, parse_names(row.pop(column, [])))

    entity.add_cast("Person", "position", row.pop("position", []))
    birth_date = parse_date(row.pop("birth_date", []))
    entity.add_cast("Person", "birthDate", birth_date)
    entity.add_cast("Person", "birthPlace", row.pop("birth_place", []))
    entity.add_cast("Person", "passportNumber", row.pop("passport_number", []))

    for column in ("id_number", "identification_number"):
        entity.add("idNumber", row.pop(column, []))
    for column in ("other_information", "details"):
        entity.add("notes", h.clean_note(row.pop(column, None)))
    for column in ("phone", "fax"):
        entity.add("phone", row.pop(column, []))

    for column in ("address", "where"):
        for address_full in row.pop(column, []):
            address = h.make_address(context, full=address_full)
            h.apply_address(context, entity, address)

    # Titles only make sense on people; keep them as notes otherwise.
    title = row.pop("title", [])
    title_prop = "title" if entity.schema.is_a("Person") else "notes"
    entity.add(title_prop, title)

    for column in ("citizenship", "activity_area"):
        entity.add("country", row.pop(column, []))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", section)
    for column in ("root_nomination", "reason_res1483"):
        sanction.add("reason", row.pop(column, None))
    sanction.add("authorityId", row.pop("notification_number", None))
    sanction.add("unscId", row.pop("designated_un", None))
    for column in ("notification_date", "designated_date"):
        sanction.add("startDate", parse_date(row.pop(column, [])))
    sanction.add("listingDate", parse_date(row.pop("publication_date", [])))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def apply_prop(context: Context, entity, sanction, field, value):
    """Apply one source field to ``entity`` or ``sanction``.

    ``field`` names the kind of detail and ``value`` is a dict-like object
    holding its parts; the parts that are used are removed from ``value``.
    Fields that map one-to-one onto a property are table-driven, the rest
    are handled individually. Unknown fields are logged as a warning.
    """
    # field -> (entity property, key inside ``value``)
    entity_props = {
        "ALIAS": ("alias", "Alias"),
        "SEXE": ("gender", "Sexe"),
        "PRENOM": ("firstName", "Prenom"),
        "NATIONALITE": ("nationality", "Pays"),
        "TITRE": ("position", "Titre"),
        "SITE_INTERNET": ("website", "SiteInternet"),
        "TELEPHONE": ("phone", "Telephone"),
        "COURRIEL": ("email", "Courriel"),
        "NUMERO_OMI": ("imoNumber", "NumeroOMI"),
        "PASSEPORT": ("passportNumber", "NumeroPasseport"),
        "AUTRE_IDENTITE": ("idNumber", "NumeroCarte"),
    }
    # field -> (sanction property, key inside ``value``)
    # TODO: derive target countries? (applies to FONDEMENT_JURIDIQUE)
    sanction_props = {
        "REFERENCE_UE": ("authorityId", "ReferenceUe"),
        "REFERENCE_ONU": ("unscId", "ReferenceOnu"),
        "FONDEMENT_JURIDIQUE": ("program", "FondementJuridiqueLabel"),
    }

    if field in entity_props:
        prop, key = entity_props[field]
        entity.add(prop, value.pop(key))
        return

    if field in sanction_props:
        prop, key = sanction_props[field]
        sanction.add(prop, value.pop(key))
        return

    if field == "DATE_DE_NAISSANCE":
        year = value.pop("Annee")
        month = value.pop("Mois")
        day = value.pop("Jour")
        entity.add("birthDate", parse_parts(year, month, day))
        return

    if field in ("ADRESSE_PM", "ADRESSE_PP"):
        full = value.pop("Adresse")
        country = value.pop("Pays")
        address = h.make_address(context, full=full, country=country)
        h.apply_address(context, entity, address)
        return

    if field == "LIEU_DE_NAISSANCE":
        entity.add("birthPlace", value.pop("Lieu"))
        entity.add("country", value.pop("Pays"))
        return

    if field == "IDENTIFICATION":
        comment = value.pop("Commentaire")
        content = value.pop("Identification")
        result = context.lookup("identification", comment)
        if result is None:
            context.log.warning(
                "Unknown Identification type",
                comment=comment,
                content=content,
            )
            return
        if result.prop is None:
            # Known identification type that is deliberately not mapped.
            return
        schema = result.schema or entity.schema
        entity.add_cast(schema, result.prop, content)
        if result.prop == "notes":
            # Keep the comment alongside the content it describes.
            entity.add(result.prop, h.clean_note(comment))
        return

    if field == "MOTIFS":
        motifs = value.pop("Motifs")
        sanction.add("reason", motifs)
        entity.add("notes", motifs)
        return

    context.log.warning("Unknown field", field=field, value=value)
def parse_entry(context: Context, entry: Element):
    """Parse one sanctions entry element and emit the resulting entities.

    Emits the sanctioned entity itself plus one ``Passport`` or
    ``Identification`` entity per ``./identification`` child. Entries whose
    subject type is missing or cannot be mapped to a schema are logged and
    skipped.

    Args:
        context: crawler context used to make, look up and emit entities.
        entry: XML element for a single sanctions entry.
    """
    subject_type = entry.find("./subjectType")
    if subject_type is None:
        # Without a subject type there is no way to pick a schema.
        context.log.warning("Unknown subject type", type=subject_type)
        return
    schema = context.lookup_value(
        "subject_type",
        subject_type.get("code"),
        dataset="eu_fsf",
    )
    if schema is None:
        context.log.warning("Unknown subject type", type=subject_type)
        return

    entity = context.make(schema)
    eu_ref = entry.get("euReferenceNumber")
    if eu_ref is not None:
        entity.id = context.make_slug(eu_ref, dataset="eu_fsf")
    else:
        entity.id = context.make_slug("logical", entry.get("logicalId"))
    entity.add("notes", h.clean_note(entry.findtext("./remark")))
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)

    for name in entry.findall("./nameAlias"):
        is_weak = not as_bool(name.get("strong"))
        h.apply_name(
            entity,
            full=name.get("wholeName"),
            first_name=name.get("firstName"),
            middle_name=name.get("middleName"),
            last_name=name.get("lastName"),
            is_weak=is_weak,
            quiet=True,
        )
        entity.add("title", name.get("title"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        entity.add("gender", name.get("gender"), quiet=True)

    for node in entry.findall("./identification"):
        type_code = node.get("identificationTypeCode")
        doc_schema = "Passport" if type_code == "passport" else "Identification"
        passport = context.make(doc_schema)
        passport.id = context.make_id("ID", entity.id, node.get("logicalId"))
        passport.add("holder", entity)
        passport.add("authority", node.get("issuedBy"))
        passport.add("type", node.get("identificationTypeDescription"))
        passport.add("number", node.get("number"))
        passport.add("number", node.get("latinNumber"))
        passport.add("startDate", node.get("issueDate"))
        passport.add("country", parse_country(node))
        passport.add("country", node.get("countryDescription"))
        for remark in node.findall("./remark"):
            passport.add("summary", remark.text)
        context.emit(passport)

    for node in entry.findall("./address"):
        address = parse_address(context, node)
        h.apply_address(context, entity, address)

        # Iterate the element directly: getchildren() is deprecated in lxml
        # and no longer exists in the standard library ElementTree.
        for child in node:
            # Exact comparison; ``in ("regulationSummary")`` was a substring
            # test against a plain string, not tuple membership.
            if child.tag == "regulationSummary":
                continue
            elif child.tag == "remark":
                entity.add("notes", child.text)
            elif child.tag == "contactInfo":
                prop = context.lookup_value(
                    "contact_info",
                    child.get("key"),
                    dataset="eu_fsf",
                )
                if prop is None:
                    context.log.warning("Unknown contact info", node=child)
                else:
                    entity.add(prop, child.get("value"))
            else:
                context.log.warning("Unknown address component", node=child)

    for birth in entry.findall("./birthdate"):
        partial_birth = parse_parts(
            birth.get("year"), birth.get("month"), birth.get("day")
        )
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthDate", partial_birth)
        address = parse_address(context, birth)
        if address is not None:
            entity.add("birthPlace", address.get("full"))
            entity.add("country", address.get("country"))

    for node in entry.findall("./citizenship"):
        entity.add("nationality", parse_country(node), quiet=True)
        entity.add("nationality", node.get("countryDescription"), quiet=True)

    context.emit(entity, target=True)
def crawl_row(context: Context, data: Dict[str, str]):
    """Emit a sanctioned entity and its sanction from one source row.

    The row may use either of two column naming styles (for example
    ``INDIVIDUAL_Id`` or ``IndividualID``); both are consumed. Columns are
    removed from ``data`` as they are read so that ``h.audit_data`` can
    report anything left over.

    Raises:
        AssertionError: if no usable ID could be derived for the row.
    """
    entity = context.make("LegalEntity")
    # Pop both ID columns independently. The previous nested form evaluated
    # ``data.pop("IndividualID")`` eagerly and raised KeyError whenever that
    # column was absent, even if "INDIVIDUAL_Id" was present.
    fallback_id = data.pop("IndividualID", None)
    ind_id = data.pop("INDIVIDUAL_Id", fallback_id)
    entity.id = context.make_slug(ind_id)
    assert entity.id, data

    for column in ("COMMENTS", "Comments", "NOTE", "NOTE1", "NOTE2", "NOTE3"):
        entity.add("notes", h.clean_note(data.pop(column, None)))

    person_columns = (
        ("nationality", "NATIONALITY"),
        ("nationality", "Nationality"),
        ("title", "TITLE"),
        ("title", "Title"),
        ("position", "DESIGNATION"),
        ("position", "Designation"),
        ("birthPlace", "PLACEOFBIRTH"),
        ("birthPlace", "IndividualPlaceOfBirth"),
        ("birthPlace", "CITY_OF_BIRTH"),
        ("birthDate", "YEAR"),
        ("gender", "GENDER"),
    )
    for prop, column in person_columns:
        entity.add_cast("Person", prop, data.pop(column, None))

    for column in ("DATE", "DATE_OF_BIRTH", "IndividualDateOfBirth"):
        birth_date = parse_date(data.pop(column, None))
        entity.add_cast("Person", "birthDate", birth_date)

    # Consumed but not stored.
    data.pop("BIRTHPLACE_x0020_CITY", None)
    data.pop("BIRTHPLACE_x0020_STATE_PROVINCE", None)
    entity.add("country", data.pop("BIRTHPLACE_x0020_COUNTRY", None))
    entity.add("country", data.pop("COUNTRY_OF_BIRTH", None))
    entity.add_cast("Person", "birthPlace", data.pop("BIRTHPLACE_x0020_NOTE", None))

    h.apply_name(
        entity,
        full=data.pop("FullName", None),
        given_name=data.pop("FIRST_NAME", None),
        second_name=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    # Original-script names containing "?" are dropped.
    # NOTE(review): presumably "?" marks characters lost in transcoding -
    # confirm against the source data.
    alias = data.pop("NAME_ORIGINAL_SCRIPT", None)
    if alias is not None and "?" not in alias:
        entity.add("alias", alias)
    entity.add("alias", data.pop("SORT_KEY", None))
    data.pop("IndividualAlias", None)

    entity.add_cast("Person", "passportNumber", data.pop("PASSPORT", None))
    entity.add_cast("Person", "passportNumber", data.pop("IndividualDocument", None))
    data.pop("DATE_OF_ISSUE", None)
    data.pop("CITY_OF_ISSUE", None)
    entity.add("country", data.pop("COUNTRY_OF_ISSUE", None))
    entity.add_cast("Person", "idNumber", data.pop("IDNUMBER", None))

    address = h.make_address(
        context,
        full=data.pop("IndividualAddress", None),
        street=data.pop("STREET", None),
        city=data.pop("CITY", None),
        region=data.pop("STATE_PROVINCE", None),
        postal_code=data.pop("ZIP_CODE", None),
        country=data.pop("COUNTRY", None),
    )
    h.apply_address(context, entity, address)

    sanction = h.make_sanction(context, entity)
    inserted_at = parse_date(data.pop("DateInserted", None))
    # Both spellings are consumed; "ListedON" wins when present.
    listed_on_fallback = data.pop("ListedOn", None)
    listed_on = data.pop("ListedON", listed_on_fallback)
    listed_at = parse_date(listed_on)
    entity.add("createdAt", inserted_at or listed_at)
    sanction.add("listingDate", listed_at or inserted_at)
    sanction.add("startDate", data.pop("FROM_YEAR", None))
    sanction.add("endDate", data.pop("TO_YEAR", None))
    sanction.add("program", data.pop("UN_LIST_TYPE", None))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER", None))
    sanction.add("unscId", data.pop("ReferenceNumber", None))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))

    entity.add("topics", "sanction")
    h.audit_data(data, ignore=["VERSIONNUM", "TYPE_OF_DATE", "ApplicationStatus"])
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_row(context: Context, row):
    """Emit the entity, sanction and related entities for one source row.

    The schema comes from ``TYPES`` keyed by the row's group type; an
    unknown group type is logged as an error and the row is skipped. Ship
    owners, parent companies and subsidiaries named in the row are emitted
    as separate entities together with ``Ownership`` links.

    If the alias type or alias quality is not recognised, a warning is
    logged and the function returns early; related entities emitted before
    that point stay emitted, but the main entity and sanction are not.
    Columns are removed from ``row`` as they are read, and leftovers are
    reported through ``h.audit_data``.
    """
    group_type = row.pop("GroupTypeDescription")
    schema = TYPES.get(group_type)
    if schema is None:
        context.log.error("Unknown group type", group_type=group_type)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(row.pop("GroupID"))

    def emit_named(named_schema, name):
        # Create and emit a stub entity identified only by its name.
        named = context.make(named_schema)
        named.id = context.make_slug("named", name)
        named.add("name", name)
        context.emit(named)
        return named

    def emit_ownership(owner, asset, other):
        # The ID always starts with the main entity's ID, whichever side
        # of the relationship it is on.
        link = context.make("Ownership")
        link.id = context.make_id(entity.id, "owns", other.id)
        link.add("owner", owner)
        link.add("asset", asset)
        context.emit(link)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    listed_date = h.parse_date(row.pop("DateListed"), FORMATS)
    sanction.add("listingDate", listed_date)
    designated_date = h.parse_date(row.pop("DateDesignated"), FORMATS)
    sanction.add("startDate", designated_date)

    # Prefer the listing date; fall back to the designation date.
    entity.add("createdAt", listed_date)
    if not entity.has("createdAt"):
        entity.add("createdAt", designated_date)

    sanction_columns = (
        ("authorityId", "UKSanctionsListRef"),
        ("unscId", "UNRef"),
        ("status", "GroupStatus"),
        ("reason", "UKStatementOfReasons"),
    )
    for prop, column in sanction_columns:
        sanction.add(prop, row.pop(column, None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    sanction.add("modifiedAt", last_updated)
    entity.add("modifiedAt", last_updated)

    # TODO: derive topics and schema from this??
    entity.add_cast("LegalEntity", "legalForm", row.pop("Entity_Type", None))
    entity.add_cast(
        "LegalEntity",
        "registrationNumber",
        row.pop("Entity_BusinessRegNumber", None),
    )

    row.pop("Ship_Length", None)
    entity.add_cast("Vessel", "flag", row.pop("Ship_Flag", None))
    past_flags = split_new(row.pop("Ship_PreviousFlags", None))
    entity.add_cast("Vessel", "pastFlags", past_flags)
    vessel_columns = (
        ("type", "Ship_Type"),
        ("tonnage", "Ship_Tonnage"),
        ("buildDate", "Ship_YearBuilt"),
        ("imoNumber", "Ship_IMONumber"),
    )
    for prop, column in vessel_columns:
        entity.add_cast("Vessel", prop, row.pop(column, None))

    ship_owner = row.pop("Ship_CurrentOwners", None)
    if ship_owner is not None:
        owner = emit_named("LegalEntity", ship_owner)
        emit_ownership(owner, entity, owner)

    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)

    entity.add("title", split_items(row.pop("Title", None)), quiet=True)

    birth_places = split_items(row.pop("Individual_TownOfBirth", None))
    entity.add_cast("Person", "birthPlace", birth_places)
    birth_date = h.parse_date(row.pop("Individual_DateOfBirth", None), FORMATS)
    entity.add_cast("Person", "birthDate", birth_date)
    birth_countries = parse_countries(row.pop("Individual_CountryOfBirth", None))
    entity.add_cast("Person", "country", birth_countries)
    nationalities = parse_countries(row.pop("Individual_Nationality", None))
    entity.add_cast("Person", "nationality", nationalities)
    positions = split_items(row.pop("Individual_Position", None))
    entity.add_cast("Person", "position", positions)
    entity.add_cast("Person", "gender", row.pop("Individual_Gender", None))

    name_type = row.pop("AliasType", None)
    name_prop = NAME_TYPES.get(name_type)
    if name_prop is None:
        context.log.warning("Unknown name type", type=name_type)
        return
    name_quality = row.pop("AliasQuality", None)
    is_weak = WEAK_QUALITY.get(name_quality)
    if is_weak is None:
        context.log.warning("Unknown name quality", quality=name_quality)
        return

    # Columns name1..name5 map onto the keyword arguments of the same name.
    numbered_names = {
        f"name{number}": row.pop(f"name{number}", None) for number in range(1, 6)
    }
    h.apply_name(
        entity,
        tail_name=row.pop("Name6", None),
        name_prop=name_prop,
        is_weak=is_weak,
        quiet=True,
        **numbered_names,
    )
    entity.add("alias", row.pop("NameNonLatinScript", None))

    address_lines = [row.pop(f"Address{number}", None) for number in range(1, 7)]
    full_address = join_text(*address_lines, sep=", ")
    address = h.make_address(
        context,
        full=full_address,
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    passport_numbers = split_items(row.pop("Individual_PassportNumber", None))
    entity.add_cast("Person", "passportNumber", passport_numbers)
    # TODO: where do I stuff this? (consumed so the audit stays quiet)
    row.pop("Individual_PassportDetails", None)

    ni_numbers = split_items(row.pop("Individual_NINumber", None))
    entity.add_cast("Person", "idNumber", ni_numbers)
    # TODO: where do I stuff this? (consumed so the audit stays quiet)
    row.pop("Individual_NIDetails", None)

    contact_columns = (
        ("phone", "PhoneNumber"),
        ("email", "EmailAddress"),
        ("website", "Website"),
    )
    for prop, column in contact_columns:
        for item in split_new(row.pop(column, None)):
            entity.add_cast("LegalEntity", prop, item)

    for name in parse_companies(context, row.pop("Entity_ParentCompany", None)):
        parent = emit_named("Organization", name)
        emit_ownership(parent, entity, parent)

    for name in parse_companies(context, row.pop("Entity_Subsidiaries", None)):
        subsidiary = emit_named("Company", name)
        emit_ownership(entity, subsidiary, subsidiary)

    grp_status = row.pop("GrpStatus", None)
    if grp_status != "A":
        context.log.warning("Unknown GrpStatus", value=grp_status)

    entity.add("notes", h.clean_note(row.pop("OtherInformation", None)))
    h.audit_data(row, ignore=["NonLatinScriptLanguage", "NonLatinScriptType"])

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    """Crawl the single HTML table of listed persons and organisations.

    Each table row after the first two becomes one entity plus a sanction.
    The second cell holds a free-text body in which labelled segments
    follow the names; the segments are stripped off from the right, one
    label at a time, and whatever remains is treated as a comma-separated
    list of names. The order of the ``maybe_rsplit`` calls therefore
    matters and must not be changed casually.

    Entities start as ``Thing``; any that are still ``Thing`` once every
    property has been added are set to ``LegalEntity`` before emission.

    Raises:
        AssertionError: if the page does not contain exactly one table.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    # NOTE(review): opened with the platform default text encoding; confirm
    # that this matches the encoding of the downloaded page.
    with open(path, "r") as fh:
        doc = html.parse(fh)

    tables = doc.findall(".//table")
    assert len(tables) == 1
    rows = tables[0].findall(".//tr")
    # The first two rows are skipped (presumably headers - TODO confirm).
    for row in rows[2:]:
        cells = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
        index = cells[0]
        body = cells[1]
        decision = cells[2]
        un_id = cells[3]
        listing_date = cells[4]

        entity = context.make("Thing")
        entity.id = context.make_slug(index, un_id)
        entity.add("notes", h.clean_note(cells[5]))

        sanction = h.make_sanction(context, entity)
        sanction.add("listingDate", clean_date(listing_date))
        sanction.add("program", decision)
        sanction.add("recordId", un_id)

        body, gender = maybe_rsplit(body, "пол:")
        entity.add_cast("Person", "gender", gender)
        body, gender = maybe_rsplit(body, "Пол:")
        entity.add_cast("Person", "gender", gender)
        body, location = maybe_rsplit(body, "местонахождение:")
        entity.add_cast("LegalEntity", "country", location)

        # Segments assigned to ``_`` are removed from the body but not stored.
        body, _ = maybe_rsplit(body, "Присвоенный ИМО номер компании:")
        body, _ = maybe_rsplit(body, "Номер ИМО:")

        body, emails = maybe_rsplit(body, "Адрес эл. почты:")
        for email in letter_split(emails):
            entity.add_cast("LegalEntity", "email", email)

        body, _ = maybe_rsplit(body, "Номер факса:")
        body, _ = maybe_rsplit(body, "Факс:")

        body, phones = maybe_rsplit(body, "Номера телефонов:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, phones = maybe_rsplit(body, "Тел.:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)

        body, swift = maybe_rsplit(body, "СВИФТ-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)
        body, swift = maybe_rsplit(body, "СВИФТ/БИК-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)

        body, other_info = maybe_rsplit(body, "Прочая информация:")
        entity.add_cast("Thing", "notes", other_info)

        # The listing date comes from its own table cell (used above); the
        # copy inside the body is only stripped.
        body, _ = maybe_rsplit(body, "Дата внесения в перечень:")

        body, addresses = maybe_rsplit(body, "Адрес:")
        for address in letter_split(addresses):
            # The country is the last ", "-separated component, or the whole
            # address when there is no separator. The previous code passed
            # the list returned by rsplit() to the country cleaner.
            country = address.rsplit(", ", 1)[-1]
            code = registry.country.clean(country, fuzzy=True)
            obj = h.make_address(context, full=address, country_code=code)
            h.apply_address(context, entity, obj)
            entity.add("country", code)

        body, national_ids = maybe_rsplit(
            body, "Национальный идентификационный номер:"
        )
        for national_id in letter_split(national_ids):
            entity.add_cast("LegalEntity", "idNumber", national_id)

        body, passport_nos = maybe_rsplit(body, "Паспорт №:")
        for passport_no in letter_split(passport_nos):
            entity.add_cast("Person", "passportNumber", passport_no)

        body, citizenship = maybe_rsplit(body, "Гражданство:")
        entity.add_cast("Person", "nationality", citizenship)

        aka = "На основании менее достоверных источников также известен как:"
        body, aka = maybe_rsplit(body, aka)
        entity.add("alias", letter_split(aka))

        strong_aka = "На основании достоверных источников также известен как:"
        body, strong_aka = maybe_rsplit(body, strong_aka)
        entity.add("alias", letter_split(strong_aka))

        body, _ = maybe_rsplit(body, "Р.И.К.:")

        body, birth_place = maybe_rsplit(body, "Место рождения:")
        entity.add_cast("Person", "birthPlace", birth_place)

        body, birth_dates = maybe_rsplit(body, "Дата рождения:")
        for birth_date in letter_split(birth_dates):
            entity.add_cast("Person", "birthDate", clean_date(birth_date))

        body, position = maybe_rsplit(body, "Должность:")
        entity.add_cast("Person", "position", position)

        body, job = maybe_rsplit(body, "Обращение:")
        entity.add_cast("Person", "position", job)

        body, aliases = maybe_rsplit(body, "Другие названия:")
        entity.add("alias", letter_split(aliases))

        body, aliases = maybe_rsplit(body, "Вымышленные названия:")
        entity.add("alias", letter_split(aliases))

        # What is left after stripping every labelled segment is the names.
        names = body.split(", ")
        entity.add("name", names)

        # Still "Thing" here means no add_cast call changed the schema.
        if entity.schema.name == "Thing":
            entity.schema = model.get("LegalEntity")
        context.emit(entity, target=True)
        context.emit(sanction)