def parse_reference(context: Context, reference: int, rows):
    """Build one sanctioned entity (plus its Sanction) from all rows that share
    a reference number, then emit both.

    Returns early (emitting nothing) if any row has an unknown entity type or
    name type, so partial entities are never emitted.
    """
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    # All rows belonging to one reference must agree on a single schema.
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Collect all names; the one mapped to "name" becomes part of the slug.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name
    entity.id = context.make_slug(reference, primary_name)

    sanction = h.make_sanction(context, entity)
    # NOTE: the original reset primary_name to None here; that assignment was
    # dead code (the variable is never read again) and has been removed.
    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
        # TODO: consider parsing if it's not a datetime?
        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_legislature(context: Context, country, legislature):
    """Fetch a legislature's Popolo JSON payload and process its persons,
    organizations and memberships."""
    last_modified = datetime.utcfromtimestamp(int(legislature.get("lastmod")))
    # this isn't being updated, hence long interval:
    data = context.fetch_json(legislature.get("popolo_url"), cache_days=30)

    persons: Dict[str, Optional[str]] = {}
    for person_data in data.pop("persons", []):
        person_key = person_data.get("id")
        persons[person_key] = parse_person(context, person_data, country, last_modified)

    organizations: Dict[str, Optional[str]] = {}
    for org_data in data.pop("organizations", []):
        raw_org_id = org_data.pop("id", None)
        org_id = context.lookup_value("org_id", raw_org_id, raw_org_id)
        if org_id is None:
            continue
        organizations[org_id] = org_data.pop("name", org_data.pop("sort_name", None))

    events = {event.get("id"): event for event in data.pop("events", [])}
    for membership in data.pop("memberships", []):
        parse_membership(context, membership, persons, organizations, events)
def crawl(context: Context):
    """Download the source XLS workbook and walk every sheet: locate header
    rows (identified by a known Japanese column title), then emit each
    subsequent data row via emit_row()."""
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)
    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        # Row 0 carries the section title, possibly split over several cells.
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]
            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    values = []
                    for value in multi_split(stringify(cell), SPLITS):
                        # skip empty cells and the "unknown" marker
                        # (the original also re-checked `value is not None`
                        # here, which was dead code after this guard):
                        if value is None or value == "不明":
                            continue
                        values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)
            if not row or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning("Unknown column title", column=cell, sheet=sheet.name)
                    headers.append(header)
def crawl(context: Context):
    """Page through the IADB sanctioned firms/individuals API, emitting each
    record as an entity with an attached Sanction, until the last page."""
    for page in count(1):
        url = str(context.dataset.data.url)
        url = url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        headers = {
            "Accept": "application/json",
            "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
        }
        res = context.http.get(url, headers=headers)
        ids = []
        for row in res.json():
            # Normalise "N/A" placeholders to empty strings before popping.
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""
            row_id = row.pop("id")
            ids.append(row_id)
            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue
            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))
            # Companies have a jurisdiction rather than a nationality.
            nat = "jurisdiction" if schema == "Company" else "nationality"
            entity.add(nat, parse_countries(row.pop("nationality")))
            affiliated = row.pop("affiliatedWithEntityId")
            if len(affiliated):
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)
            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS))
            sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS))
            context.emit(sanction)
            context.emit(entity, target=True)
        # Stop when the lowest id has been seen, or on an empty page —
        # min() on an empty list would raise ValueError.
        if not ids or min(ids) == 1:
            return
def emit_row(context: Context, sheet: str, section: str, row: Dict[str, List[str]]):
    """Emit one entity (and its Sanction) from a parsed spreadsheet row.

    The section title determines the schema; names, aliases, person details,
    addresses and sanction metadata are mapped from the row's columns.
    """
    schema = context.lookup_value("schema", section)
    if schema is None:
        context.log.warning("No schema for section", section=section, sheet=sheet)
        return
    entity = context.make(schema)
    # Default to empty lists: row.get() without a default returns None for a
    # missing column, and unpacking None with * raises TypeError.
    entity.id = context.make_id(*row.get("name_english", []), *row.get("name_japanese", []))
    if entity.id is None:
        return
    entity.add("name", parse_names(row.pop("name_english", [])))
    if not entity.has("name"):
        entity.add("name", parse_names(row.pop("name_japanese", [])))
    else:
        entity.add("alias", parse_names(row.pop("name_japanese", [])))
    entity.add("alias", parse_names(row.pop("alias", [])))
    entity.add("alias", parse_names(row.pop("known_alias", [])))
    entity.add("weakAlias", parse_names(row.pop("weak_alias", [])))
    entity.add("weakAlias", parse_names(row.pop("nickname", [])))
    entity.add("previousName", parse_names(row.pop("past_alias", [])))
    entity.add("previousName", parse_names(row.pop("old_name", [])))
    entity.add_cast("Person", "position", row.pop("position", []))
    birth_date = parse_date(row.pop("birth_date", []))
    entity.add_cast("Person", "birthDate", birth_date)
    entity.add_cast("Person", "birthPlace", row.pop("birth_place", []))
    entity.add_cast("Person", "passportNumber", row.pop("passport_number", []))
    entity.add("idNumber", row.pop("id_number", []))
    entity.add("idNumber", row.pop("identification_number", []))
    entity.add("notes", row.pop("other_information", []))
    entity.add("notes", row.pop("details", []))
    entity.add("phone", row.pop("phone", []))
    entity.add("phone", row.pop("fax", []))
    for address_full in row.pop("address", []):
        address = h.make_address(context, full=address_full)
        h.apply_address(context, entity, address)
    for address_full in row.pop("where", []):
        address = h.make_address(context, full=address_full)
        h.apply_address(context, entity, address)
    # "title" is a Person property; store it as a note for other schemata.
    title = row.pop("title", [])
    if entity.schema.is_a("Person"):
        entity.add("title", title)
    else:
        entity.add("notes", title)
    entity.add("country", row.pop("citizenship", []))
    entity.add("country", row.pop("activity_area", []))
    sanction = h.make_sanction(context, entity)
    sanction.add("program", section)
    sanction.add("reason", row.pop("root_nomination", None))
    sanction.add("reason", row.pop("reason_res1483", None))
    sanction.add("recordId", row.pop("notification_number", None))
    sanction.add("startDate", parse_date(row.pop("notification_date", [])))
    sanction.add("startDate", parse_date(row.pop("designated_date", [])))
    sanction.add("listingDate", parse_date(row.pop("publication_date", [])))
    row.pop("designated_un", None)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_persons(context: Context):
    """Crawl the "personas" records, emitting one Person per record along with
    any identity documents attached to it."""
    for data in fetch(context, "personas"):
        entity = crawl_common(context, data, "personas", "Person")
        entity.add("title", values(data.pop("TITLE", None)))
        entity.add("nationality", values(data.pop("NATIONALITY", None)))
        entity.add("position", values(data.pop("DESIGNATION", None)))
        entity.add("gender", data.pop("GENDER", None))
        entity.add("birthDate", data.pop("DATE_OF_BIRTH", None))
        entity.add("birthDate", data.pop("YEAR", None))
        entity.add("birthPlace", data.pop("CITY_OF_BIRTH", None))
        entity.add("country", data.pop("COUNTRY_OF_BIRTH", None))

        # Structured birth-date sub-records:
        for birth in data.pop("INDIVIDUAL_DATE_OF_BIRTH", []):
            entity.add("birthDate", parse_date(birth.pop("DATE", None)))
            entity.add("birthDate", parse_date(birth.pop("TYPE_OF_DATE", None)))
            entity.add("birthDate", birth.pop("YEAR", None))
            entity.add("birthDate", birth.pop("FROM_YEAR", None))
            entity.add("birthDate", birth.pop("TO_YEAR", None))
            h.audit_data(birth, ignore=["NOTE"])

        # Identity documents become their own emitted entities:
        for document in data.pop("INDIVIDUAL_DOCUMENT", []):
            doc_type = document.pop("TYPE_OF_DOCUMENT", None)
            doc_number = document.pop("NUMBER", None)
            doc_schema = context.lookup_value("doc_types", doc_type)
            if doc_schema is None:
                context.log.warning("Unknown document type", type=doc_type)
                continue
            passport = context.make(doc_schema)
            passport.id = context.make_id("ID", entity.id, doc_number)
            passport.add("holder", entity)
            passport.add("type", doc_type)
            passport.add("number", doc_number)
            passport.add("type", document.pop("TYPE_OF_DOCUMENT2", None))
            passport.add("startDate", parse_date(document.pop("DATE_OF_ISSUE", None)))
            passport.add("country", document.pop("ISSUING_COUNTRY", None))
            passport.add("country", document.pop("COUNTRY_OF_ISSUE", None))
            passport.add("summary", document.pop("NOTE", None))
            context.emit(passport)
            h.audit_data(document, ignore=["CITY_OF_ISSUE"])

        for addr in data.pop("INDIVIDUAL_ADDRESS", []):
            h.apply_address(context, entity, parse_address(context, addr))

        for addr in data.pop("INDIVIDUAL_PLACE_OF_BIRTH", []):
            birth_address = parse_address(context, addr)
            if birth_address is not None:
                entity.add("birthPlace", birth_address.get("full"))
                entity.add("country", birth_address.get("country"))

        for alias in data.pop("INDIVIDUAL_ALIAS", []):
            entity.add("birthDate", alias.pop("DATE_OF_BIRTH", None))
            entity.add("birthDate", alias.pop("YEAR", None))
            entity.add("birthPlace", alias.pop("CITY_OF_BIRTH", None))
            entity.add("country", alias.pop("COUNTRY_OF_BIRTH", None))
            parse_alias(entity, alias)

        h.audit_data(data, ["VERSIONNUM"])
        context.emit(entity, target=True)
def crawl_person(context: Context, data: Dict[str, Any]):
    """Build a Person entity from one API record — names, birth data, jobs,
    country links and person-to-person relationships — and emit it together
    with any related persons and relationship entities.

    Args:
        context: Crawler context used for lookups, logging and emitting.
        data: Mutable record dict; keys are consumed one by one via pop().
    """
    is_pep = data.pop("is_pep", False)
    entity = context.make("Person", target=is_pep)
    wikidata_id = clean_wdid(data.pop("wikidata_id", None))
    entity.id = person_id(context, data.pop("id"), wikidata_id)
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)  # only the English URL is retained
    entity.add("modifiedAt", data.pop("last_change", None))
    entity.add("wikidataId", wikidata_id)
    entity.add("name", data.pop("full_name_en", None))
    entity.add("name", data.pop("full_name_ru", None))
    entity.add("alias", data.pop("inversed_full_name_en", None))
    entity.add("alias", data.pop("inversed_full_name_ru", None))
    entity.add("alias", data.pop("also_known_as_en", None))
    entity.add("alias", data.pop("also_known_as_ru", None))
    entity.add("alias", split_names(data.pop("names", [])))
    entity.add("birthDate", parse_date(data.pop("date_of_birth", None)))
    entity.add("deathDate", parse_date(data.pop("termination_date_human", None)))
    entity.add("birthPlace", data.pop("city_of_birth_ru", None))
    entity.add("birthPlace", data.pop("city_of_birth_en", None))
    entity.add("innCode", data.pop("inn", None))
    entity.add("firstName", data.pop("first_name_en", None))
    entity.add("firstName", data.pop("first_name_ru", None))
    entity.add("fatherName", data.pop("patronymic_en", None))
    entity.add("fatherName", data.pop("patronymic_ru", None))
    entity.add("lastName", data.pop("last_name_en", None))
    entity.add("lastName", data.pop("last_name_ru", None))
    # Combine workplace and (optional) role into one "Org (Role)" position
    # string, for each language variant of the field pair.
    for suffix in ("", "_en", "_ru"):
        role = data.pop(f"last_job_title{suffix}", None)
        org = data.pop(f"last_workplace{suffix}", None)
        if org is None or not len(org.strip()):
            continue
        position = org
        if role is not None and len(role.strip()):
            position = f"{org} ({role})"
        entity.add("position", position)
    # Map country relationships onto entity properties via the lookup table;
    # entries with no mapped property are looked up but not stored.
    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru")
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warn(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        if res.prop is not None:
            entity.add(res.prop, country_name)
        # h.audit_data(country_data)
    # Emit one relationship entity (plus the other person) per related person.
    for rel_data in data.pop("related_persons", []):
        other_pep = rel_data.pop("is_pep", False)
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id"))
        other = context.make("Person", target=other_pep)
        other.id = person_id(context, rel_data.pop("person_id"), other_wdid)
        other.add("name", rel_data.pop("person_en", None))
        other.add("name", rel_data.pop("person_ru", None))
        other.add("wikidataId", other_wdid)
        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.warn(
                "Unknown person/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other,
            )
            continue
        # Sort the pair of ids so the relationship id is stable no matter
        # which direction the relation was reported in.
        id_a, id_b = sorted((entity.id, other.id))
        rel = context.make(res.schema)
        id_a_short = short_id(context, id_a)
        id_b_short = short_id(context, id_b)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, id_a)
        rel.add(res.to_prop, id_b)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed")))
        rel.add("startDate", parse_date(rel_data.pop("date_established")))
        rel.add("endDate", parse_date(rel_data.pop("date_finished")))
        # h.audit_data(rel_data)
        context.emit(other, target=other_pep)
        context.emit(rel)
    data.pop("type_of_official_ru", None)
    person_type = data.pop("type_of_official_en", None)
    person_topic = context.lookup_value("person_type", person_type)
    if person_topic is None:
        # Unknown type is logged but processing continues (topic stays unset).
        context.log.warn("Unknown type of official", type=person_type)
    entity.add("topics", person_topic)
    if is_pep:
        entity.add("topics", "role.pep")
    entity.add("status", person_type)
    # Remaining fields are intentionally discarded:
    data.pop("died", None)
    data.pop("tags", None)
    data.pop("reason_of_termination_en", None)
    data.pop("reason_of_termination_ru", None)
    # TODO: store images
    data.pop("photo", None)
    data.pop("related_companies", None)
    data.pop("declarations", None)
    # h.audit_data(data)
    context.emit(entity, target=is_pep)
def parse_entry(context: Context, entry: Element):
    """Parse one EU FSF sanction entry element into an entity with names,
    identification documents, addresses, birth data and citizenships."""
    subject_type = entry.find("./subjectType")
    schema = context.lookup_value(
        "subject_type",
        subject_type.get("code"),
        dataset="eu_fsf",
    )
    if schema is None:
        context.log.warning("Unknown subject type", type=subject_type)
        return
    entity = context.make(schema)
    eu_ref = entry.get("euReferenceNumber")
    if eu_ref is not None:
        entity.id = context.make_slug(eu_ref, dataset="eu_fsf")
    else:
        entity.id = context.make_slug("logical", entry.get("logicalId"))
    entity.add("notes", h.clean_note(entry.findtext("./remark")))
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)

    for name in entry.findall("./nameAlias"):
        is_weak = not as_bool(name.get("strong"))
        h.apply_name(
            entity,
            full=name.get("wholeName"),
            first_name=name.get("firstName"),
            middle_name=name.get("middleName"),
            last_name=name.get("lastName"),
            is_weak=is_weak,
            quiet=True,
        )
        entity.add("title", name.get("title"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        entity.add("gender", name.get("gender"), quiet=True)

    for node in entry.findall("./identification"):
        # Renamed from `type` to avoid shadowing the builtin.
        type_ = node.get("identificationTypeCode")
        doc_schema = "Passport" if type_ == "passport" else "Identification"
        passport = context.make(doc_schema)
        passport.id = context.make_id("ID", entity.id, node.get("logicalId"))
        passport.add("holder", entity)
        passport.add("authority", node.get("issuedBy"))
        passport.add("type", node.get("identificationTypeDescription"))
        passport.add("number", node.get("number"))
        passport.add("number", node.get("latinNumber"))
        # FIX: issueDate was previously added twice; once is enough.
        passport.add("startDate", node.get("issueDate"))
        passport.add("country", parse_country(node))
        passport.add("country", node.get("countryDescription"))
        for remark in node.findall("./remark"):
            passport.add("summary", remark.text)
        context.emit(passport)

    for node in entry.findall("./address"):
        address = parse_address(context, node)
        h.apply_address(context, entity, address)
        for child in node.getchildren():
            # FIX: `child.tag in ("regulationSummary")` was a substring test
            # against a plain string (missing comma made it not a tuple), so
            # any tag that is a substring, e.g. "regulation", matched too.
            if child.tag == "regulationSummary":
                continue
            elif child.tag == "remark":
                entity.add("notes", child.text)
            elif child.tag == "contactInfo":
                prop = context.lookup_value(
                    "contact_info",
                    child.get("key"),
                    dataset="eu_fsf",
                )
                if prop is None:
                    context.log.warning("Unknown contact info", node=child)
                else:
                    entity.add(prop, child.get("value"))
            else:
                context.log.warning("Unknown address component", node=child)

    for birth in entry.findall("./birthdate"):
        partialBirth = parse_parts(birth.get("year"), birth.get("month"), birth.get("day"))
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthDate", partialBirth)
        address = parse_address(context, birth)
        if address is not None:
            entity.add("birthPlace", address.get("full"))
            entity.add("country", address.get("country"))

    for node in entry.findall("./citizenship"):
        entity.add("nationality", parse_country(node), quiet=True)
        entity.add("nationality", node.get("countryDescription"), quiet=True)

    context.emit(entity, target=True)
def parse_result(context: Context, result):
    """Convert one consolidated screening-list record into an entity plus its
    Sanction, then emit both and audit leftover fields."""
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))
    entity_number = result.pop("entity_number", None)
    if entity_number is not None:
        assert int(entity_number)
        # SDN-numbered entries get a slug in the us_ofac_sdn namespace.
        entity.id = context.make_slug(entity_number, dataset="us_ofac_sdn")
    name = result.pop("name", None)
    if name is not None:
        # FIX: guard against a missing name — calling .replace() on None
        # raised AttributeError.
        name = name.replace("and any successor, sub-unit, or subsidiary thereof", "")
    entity.add("name", name)
    for alias in ensure_list(result.pop("alt_names", "")):
        entity.add("alias", alias.split("; "))
    entity.add("notes", result.pop("remarks", None))
    entity.add("country", result.pop("country", None))
    if entity.schema.is_a("Person"):
        entity.add("position", result.pop("title", None))
        entity.add("nationality", result.pop("nationalities", None))
        entity.add("nationality", result.pop("citizenships", None))
        for dob in result.pop("dates_of_birth", []):
            entity.add("birthDate", h.parse_date(dob, FORMATS))
        entity.add("birthPlace", result.pop("places_of_birth", None))
    elif entity.schema.is_a("Vessel"):
        entity.add("flag", result.pop("vessel_flag", None))
        entity.add("callSign", result.pop("call_sign", None))
        entity.add("type", result.pop("vessel_type", None))
        grt = result.pop("gross_registered_tonnage", None)
        entity.add("grossRegisteredTonnage", grt)
        gt = result.pop("gross_tonnage", None)
        entity.add("tonnage", gt)
        # TODO: make adjacent owner entity
        result.pop("vessel_owner", None)
    # Person-specific fields must be absent (or already consumed) by now.
    assert result.pop("title", None) is None
    assert not len(result.pop("nationalities", []))
    assert not len(result.pop("citizenships", []))
    assert not len(result.pop("dates_of_birth", []))
    assert not len(result.pop("places_of_birth", []))
    for address in result.pop("addresses", []):
        obj = h.make_address(
            context,
            street=address.get("address"),
            city=address.get("city"),
            postal_code=address.get("postal_code"),
            region=address.get("state"),
            country=address.get("country"),
        )
        h.apply_address(context, entity, obj)
    for ident in result.pop("ids", []):
        country = ident.pop("country")
        entity.add("country", country)
        h.apply_feature(
            context,
            entity,
            ident.pop("type"),
            ident.pop("number"),
            country=country,
            date_formats=FORMATS,
            start_date=ident.pop("issue_date", None),
            end_date=ident.pop("expiration_date", None),
        )
    sanction = context.make("Sanction")
    sanction.id = context.make_id(entity.id, "Sanction")
    sanction.add("entity", entity)
    sanction.add("program", result.pop("programs", []))
    sanction.add("provisions", result.pop("license_policy", []))
    sanction.add("reason", result.pop("license_requirement", []))
    sanction.add("authorityId", result.pop("federal_register_notice", None))
    sanction.add("startDate", result.pop("start_date", None))
    sanction.add("endDate", result.pop("end_date", None))
    sanction.add("country", "us")
    sanction.add("authority", result.pop("source", None))
    # TODO: deref
    source_url = deref_url(context, result.pop("source_information_url"))
    sanction.add("sourceUrl", source_url)
    result.pop("source_list_url")
    context.emit(sanction)
    context.emit(entity, target=True)
    h.audit_data(result, ignore=["standard_order"])