def crawl(context: Context):
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        data = json.load(fh)

    for record in data:
        bank = context.make("Company")
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        if bank.id is not None:
            context.emit(bank)

        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if company_name:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        else:
            entity = context.make("Person")
            entity.id = context.make_id(charter_no, bank_name, first_name, last_name)
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        addr = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        sanction.add("startDate", record.pop("CompleteDate", None))
        sanction.add("endDate", record.pop("TerminationDate", None))
        sanction.add("program", record.pop("EnforcementTypeDescription", None))
        sanction.add("authorityId", record.pop("DocketNumber", None))
        # context.pprint(record)
        context.emit(entity, target=True)
        context.emit(sanction)

def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(el.text) for el in row.findall("./th")]
            continue
        cells = [collapse_spaces(el.text) for el in row.findall("./td")]
        data = {hdr: c for hdr, c in zip(headers, cells)}

        entity = context.make("Person")
        entity.id = context.make_id(data["id"], data["ad-soyad-ata-adi"])
        entity.add("name", data["ad-soyad-ata-adi"])
        entity.add("idNumber", data["id"])
        entity.add("birthDate", parse_date(data["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        addr = h.make_address(context, full=data["malumat"])
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        context.emit(sanction)
        context.emit(entity, target=True)

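# Several crawlers above and below call a module-level parse_date() that is not
# shown in this excerpt. A minimal, hypothetical sketch follows: a thin wrapper
# around h.parse_date with the date formats the source is assumed to use — the
# FORMATS list here is an assumption, not the real one.
FORMATS = ["%d.%m.%Y", "%d/%m/%Y", "%Y-%m-%d"]


def parse_date(text):
    # Delegate to the shared helper with this crawler's format list.
    return h.parse_date(text, FORMATS)
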
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        headers = table.findall("./thead/tr/td")
        headers = [el.text_content() for el in headers]
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers

        for row in table.findall("./tbody/tr"):
            cells = [el.text_content() for el in row.findall("./td")]
            if len(cells[0]) == 0:
                continue

            entity = context.make("LegalEntity")
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            cc = entity.first("country")
            address = h.make_address(context, full=cells[2], country_code=cc)
            h.apply_address(context, entity, address)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))
            context.emit(sanction)
            context.emit(entity, target=True)

def parse_party(context: Context, distinct_party, locations, documents):
    profile = distinct_party.find("Profile")
    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    schema = TYPES.get(sub_type.get("Value"))
    type_ = ref_value("PartyType", sub_type.get("PartyTypeID"))
    schema = TYPES.get(type_, schema)
    if schema is None:
        context.log.error("Unknown party type", value=type_)
        return
    party = context.make(schema)
    party.id = context.make_slug(profile.get("ID"))
    party.add("notes", h.clean_note(distinct_party.findtext("Comment")))
    party.add("sourceUrl", URL % profile.get("ID"))

    for identity in profile.findall("./Identity"):
        parts = {}
        for group in identity.findall(".//NamePartGroup"):
            type_id = group.get("NamePartTypeID")
            parts[group.get("ID")] = ref_value("NamePartType", type_id)

        for alias in identity.findall("./Alias"):
            parse_alias(party, parts, alias)

        for regdoc in documents.get(identity.get("ID"), []):
            parse_registration_doc(context, party, regdoc)

    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)

    context.emit(party, target=True)
    # pprint(party.to_dict())
    # context.log.info("[%s] %s" % (party.schema.name, party.caption))
    return party

def crawl_person(context: Context) -> None:
    data = json_resource(context, context.dataset.data.url, "person")
    for row in data["data"]:
        row = clean_row(row)
        person_id = row.pop("person_id")
        name_en = row.pop("name_en", None)
        name_ru = row.pop("name_ru", None)
        name_uk = row.pop("name_uk", None)
        name = name_en or name_ru or name_uk

        entity = context.make("Person")
        entity.id = context.make_slug("person", person_id, name)
        entity.add("name", name)
        entity.add("alias", name_ru)
        entity.add("alias", name_uk)
        entity.add("birthDate", parse_date(row.pop("date_bd", None)))
        url = "https://sanctions.nazk.gov.ua/sanction-person/%s/"
        entity.add("sourceUrl", url % person_id)

        if row.get("city_bd_en") != "N/A":
            entity.add("birthPlace", row.pop("city_bd_en", None))
            entity.add("birthPlace", row.pop("city_bd_ru", None))
            entity.add("birthPlace", row.pop("city_bd_uk", None))

        entity.add("position", row.pop("position_en", None))
        entity.add("position", row.pop("position_ru", None))
        entity.add("position", row.pop("position_uk", None))
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))

        country = row.get("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        context.emit(entity, target=True)

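# crawl_person and crawl_company above rely on json_resource() and clean_row()
# helpers that are not part of this excerpt. The following is a minimal,
# hypothetical sketch of what they might do — fetch the URL, register it as a
# named source artifact, parse the JSON, and strip empty values from each row.
# The exact behaviour of the real helpers may differ.
import json
from typing import Any, Dict


def json_resource(context: Context, url: str, name: str):
    # Download the document, export it as "<name>.json" and return parsed JSON.
    path = context.fetch_resource("%s.json" % name, url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        return json.load(fh)


def clean_row(row: Dict[str, Any]) -> Dict[str, Any]:
    # Strip whitespace and drop empty values so later pops yield clean data.
    data = {}
    for key, value in row.items():
        if isinstance(value, str):
            value = value.strip()
        if value in (None, ""):
            continue
        data[key] = value
    return data
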
def crawl(context: Context):
    url = context.dataset.data.url
    headers = {"apikey": context.dataset.data.api_key}
    response = context.fetch_json(url, headers=headers)
    # TODO write this out to a source.json
    for data in response["response"]["ZPROCSUPP"]:
        # context.pprint(data)
        entity = context.make("LegalEntity")
        name = data.get("SUPP_NAME")
        ent_id = data.get("SUPP_ID")
        entity.id = context.make_slug(ent_id)
        names = clean_name(name)
        entity.add("name", names[0])
        entity.add("topics", "debarment")
        entity.add("country", data.get("COUNTRY_NAME"))
        for name in names[1:]:
            entity.add("alias", name)

        address = h.make_address(
            context,
            street=data.get("SUPP_ADDR"),
            city=data.get("SUPP_CITY"),
            country=data.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", data.get("DEBAR_REASON"))
        sanction.add("startDate", h.parse_date(data.get("DEBAR_FROM_DATE"), FORMATS))
        sanction.add("endDate", h.parse_date(data.get("DEBAR_TO_DATE"), FORMATS))
        context.emit(entity, target=True)
        context.emit(sanction)

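# clean_name() is used above to split SUPP_NAME into a primary name plus
# aliases but is not defined in this excerpt. A hypothetical sketch follows,
# assuming alternate names are wrapped in parentheses; the real helper may
# split on different markers.
def clean_name(name):
    name = collapse_spaces(name)
    parts = [p.strip(" )") for p in name.split("(")]
    # First element is the primary name, the rest are treated as aliases.
    return [p for p in parts if len(p)]
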
def crawl_company(context: Context) -> None:
    data = json_resource(context, context.dataset.data.url, "company")
    for row in data["data"]:
        row = clean_row(row)
        # context.pprint(row)
        company_id = row.pop("company_id")
        name_en = row.pop("name_en", None)
        name = row.pop("name", None) or name_en

        entity = context.make("Organization")
        entity.id = context.make_slug("company", company_id, name)
        if entity.id is None:
            entity.id = context.make_slug(
                "company",
                company_id,
                row.pop("ogrn", None),
                strict=False,
            )
        entity.add("name", name)
        entity.add("name", name_en)
        entity.add("name", row.pop("name_uk", None))
        entity.add("name", row.pop("name_ru", None))
        entity.add("innCode", row.pop("inn", None))
        entity.add_cast("Company", "ogrnCode", row.pop("ogrn", None))

        country = row.pop("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))

        context.emit(entity, target=True)
        row.pop("logo_en", None)

def parse_row(context: Context, row):
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(row.get("Effective_Date"), row.get("Name"))
    entity.add("name", row.get("Name"))
    entity.add("notes", row.get("Action"))
    entity.add("country", row.get("Country"))
    entity.add("modifiedAt", row.get("Last_Update"))

    address = h.make_address(
        context,
        street=row.get("Street_Address"),
        postal_code=row.get("Postal_Code"),
        city=row.get("City"),
        region=row.get("State"),
        country=row.get("Country"),
    )
    h.apply_address(context, entity, address)
    context.emit(entity, target=True)

    citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=citation)
    sanction.add("program", citation)
    sanction.add("startDate", h.parse_date(row.get("Effective_Date"), FORMATS))
    sanction.add("endDate", h.parse_date(row.get("Expiration_Date"), FORMATS))
    # pprint(row)
    context.emit(sanction)

def crawl_common(context: Context, data: Dict[str, str], part: str, schema: str):
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")
    entity.add("notes", h.clean_note(data.pop("COMMENTS1")))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))
    h.apply_name(
        entity,
        name1=data.pop("FIRST_NAME", None),
        name2=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    sanction = h.make_sanction(context, entity)
    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))
    entity.add("createdAt", submitted_on or listed_on or modified_at)
    entity.add("modifiedAt", modified_at)

    sanction.add("listingDate", submitted_on or listed_on)
    sanction.add("startDate", listed_on)
    sanction.add("program", data.pop("UN_LIST_TYPE"))
    sanction.add("program", data.pop("LIST_TYPE"))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity

def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for sec_id, (section, schema) in SECTIONS.items():
        el = doc.find(".//div[@id='%s']" % sec_id)
        for item in el.findall(".//li"):
            text = item.text_content().strip()
            index, text = text.split(".", 1)
            text = text.strip()
            if text.endswith(";"):
                text = text.rstrip(";")

            entity = context.make(schema)
            entity.id = context.make_id(text)
            sanction = h.make_sanction(context, entity)
            sanction.add("program", section)
            sanction.add("recordId", index)

            if sec_id == "russianUL":
                parse_russian_orgs(context, entity, text)
            if sec_id == "russianFL":
                parse_russian_persons(context, entity, text)
            if sec_id == "foreignUL":
                parse_foreign_orgs(context, entity, text)
            if sec_id == "foreignFL":
                parse_foreign_persons(context, entity, text)

            if entity.has("name"):
                context.emit(entity, target=True)
                context.emit(sanction)

def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for row in table.findall(".//tr"):
            cells = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
            if headers is None:
                headers = [slugify(c, sep="_") for c in cells]
                continue
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue
            row = dict(zip(headers, cells))

            entity = context.make(schema)
            name = row.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            names = name.split("(")
            entity.add("name", names[0])
            for alias in names[1:]:
                entity.add("alias", alias.split(")")[0])
            notes = row.pop("uzasadnienie_wpisu_na_liste")
            entity.add("notes", notes)

            details = row.pop("dane_identyfikacyjne_osoby_podmiotu")
            for chop, prop in CHOPSKA:
                parts = details.rsplit(chop, 1)
                details = parts[0]
                if len(parts) > 1:
                    if prop == "address":
                        addr = h.make_address(context, full=parts[1])
                        h.apply_address(context, entity, addr)
                    else:
                        entity.add(prop, parts[1])
            if len(details.strip()):
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            provisions = row.pop("zastosowane_srodki_sankcyjne")
            sanction.add("provisions", provisions)

            start_date = row.pop("data_umieszczenia_na_liscie")
            start_date = start_date.replace(" r.", "")
            sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"]))

            h.audit_data(row)
            context.emit(entity, target=True)
            context.emit(sanction)

def parse_person(context: Context, data, country, lastmod):
    person_id = data.pop("id", None)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("nationality", country)
    name = data.get("name")
    if name is None or name.lower().strip() in ("unknown",):
        return
    person.add("modifiedAt", lastmod.date())
    person.add("name", data.pop("name", None))
    person.add("alias", data.pop("sort_name", None))
    for other in data.pop("other_names", []):
        person.add("alias", other.get("name"))
    person.add("gender", data.pop("gender", None))
    person.add("title", data.pop("honorific_prefix", None))
    person.add("title", data.pop("honorific_suffix", None))
    person.add("firstName", data.pop("given_name", None))
    person.add("lastName", data.pop("family_name", None))
    person.add("fatherName", data.pop("patronymic_name", None))
    person.add("birthDate", data.pop("birth_date", None))
    person.add("deathDate", data.pop("death_date", None))
    person.add("email", h.clean_emails(data.pop("email", None)))
    person.add("notes", data.pop("summary", None))
    person.add("topics", "role.pep")

    for link in data.pop("links", []):
        url = link.get("url")
        if link.get("note") in ("website", "blog", "twitter", "facebook"):
            person.add("website", url)
        # elif "Wikipedia (" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # elif "wikipedia" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # else:
        #     person.log.info("Unknown URL", url=url, note=link.get("note"))

    for ident in data.pop("identifiers", []):
        identifier = ident.get("identifier")
        scheme = ident.get("scheme")
        if scheme == "wikidata" and identifier.startswith("Q"):
            person.add("wikidataId", identifier)

    for contact_detail in data.pop("contact_details", []):
        value = contact_detail.get("value")
        if "email" == contact_detail.get("type"):
            person.add("email", h.clean_emails(value))
        if "phone" == contact_detail.get("type"):
            person.add("phone", h.clean_phones(value))

    if check_person_cutoff(person):
        return

    # data.pop("image", None)
    # data.pop("images", None)
    # if len(data):
    #     pprint(data)
    context.emit(person, target=True)
    # entities[person_id] = person.id
    return person.id

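# check_person_cutoff() is called above but not defined in this excerpt. PEP
# crawlers often skip people whose death date falls before some cutoff; the
# following is a hypothetical sketch of that idea — the property checked and
# the cutoff value are assumptions, not the real logic.
def check_person_cutoff(person) -> bool:
    death_dates = person.get("deathDate")
    # Treat anyone with only old death dates as out of scope for the dataset.
    return bool(death_dates) and max(death_dates) < "2000-01-01"
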
def parse_individual(context: Context, node):
    person = context.make("Person")
    sanction = parse_common(context, person, node)
    person.add("title", values(node.find("./TITLE")))
    person.add("position", values(node.find("./DESIGNATION")))

    for alias in node.findall("./INDIVIDUAL_ALIAS"):
        parse_alias(person, alias)

    for addr in node.findall("./INDIVIDUAL_ADDRESS"):
        h.apply_address(context, person, parse_address(context, addr))

    for doc in node.findall("./INDIVIDUAL_DOCUMENT"):
        passport = context.make("Passport")
        number = doc.findtext("./NUMBER")
        date = doc.findtext("./DATE_OF_ISSUE")
        type_ = doc.findtext("./TYPE_OF_DOCUMENT")
        if number is None and date is None and type_ is None:
            continue
        passport.id = context.make_id(person.id, number, date, type_)
        passport.add("holder", person)
        passport.add("passportNumber", number)
        passport.add("startDate", date)
        passport.add("type", type_)
        passport.add("type", doc.findtext("./TYPE_OF_DOCUMENT2"))
        passport.add("summary", doc.findtext("./NOTE"))
        country = doc.findtext("./COUNTRY_OF_ISSUE")
        country = country or doc.findtext("./ISSUING_COUNTRY")
        passport.add("country", country)
        context.emit(passport)

    for nat in node.findall("./NATIONALITY/VALUE"):
        person.add("nationality", nat.text)

    for dob in node.findall("./INDIVIDUAL_DATE_OF_BIRTH"):
        date = dob.findtext("./DATE") or dob.findtext("./YEAR")
        person.add("birthDate", date)

    for pob in node.findall("./INDIVIDUAL_PLACE_OF_BIRTH"):
        address = parse_address(context, pob)
        if address is not None:
            person.add("birthPlace", address.get("full"))
            person.add("country", address.get("country"))

    context.emit(person, target=True)
    context.emit(sanction)

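# values() is used above on TITLE and DESIGNATION elements but is not defined
# in this excerpt. A minimal sketch, assuming it collects the text of VALUE
# children under an optional element; the real helper may differ.
def values(node):
    if node is None:
        return []
    return [v.text for v in node.findall("./VALUE")]
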
def parse_entry(context: Context, node: _Element):
    entity_name = node.findtext("./Entity")
    dob = node.findtext("./DateOfBirth")
    schedule = node.findtext("./Schedule")
    if schedule == "N/A":
        schedule = ""
    program = node.findtext("./Country")
    item = node.findtext("./Item")

    if entity_name is not None:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        last_name = node.findtext("./LastName")
        entity_name = h.make_name(given_name=given_name, last_name=last_name)
        entity.add("name", entity_name)
        entity.add("birthDate", dob)

    country = program
    if program is not None and "/" in program:
        country, _ = program.split("/")
    entity.add("country", country)
    entity.id = context.make_slug(
        schedule,
        item,
        entity.first("country"),
        entity_name,
        strict=False,
    )

    sanction = h.make_sanction(context, entity)
    sanction.add("program", program)
    sanction.add("reason", schedule)
    sanction.add("authorityId", item)

    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            name = collapse_spaces(name)
            entity.add("alias", name)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)

def parse_reference(context: Context, reference: int, rows):
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))

        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
            # TODO: consider parsing if it's not a datetime?

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)

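# clean_date() is called above on date_of_birth cells but is not shown in this
# excerpt. The source mixes datetime cells and free-text date strings, so a
# hypothetical sketch would handle both; the format list is an assumption.
from datetime import datetime


def clean_date(value):
    if value is None:
        return []
    if isinstance(value, datetime):
        return [value.date().isoformat()]
    return h.parse_date(str(value), ["%d/%m/%Y", "%d.%m.%Y"])
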
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(c.text_content(), "_") for c in row.findall("./th")]
            continue
        cells = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
        cells = dict(zip(headers, cells))
        cells.pop(None, None)

        full_name = name = cells.pop("name")
        registration_number = None
        for splitter in REG_NRS:
            if splitter in name:
                name, registration_number = name.split(splitter, 1)
                registration_number = registration_number.replace(")", "")

        country = cells.pop("nationality")
        country = country.replace("Non ADB Member Country", "")
        country = country.replace("Rep. of", "")

        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", cells.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("grounds"))
        sanction.add("program", cells.pop("sanction_type"))
        date_range = cells.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_date, end_date = date_range.split("|")
            sanction.add("startDate", h.parse_date(start_date.strip(), FORMATS))
            sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS))

        address = h.make_address(context, full=cells.pop("address"), country=country)
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)

def crawl_row(context: Context, row: Dict[str, str]):
    qid = row.get("qid", "").strip()
    if not len(qid):
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return
    entity = context.make("Person")
    entity.id = qid
    entity.add("topics", "role.oligarch")
    context.emit(entity, target=True)

def make_entity(context: Context, el, schema, entity_id):
    entity = context.make(schema, target=True)
    entity.id = entity_id
    entity.add("notes", h.clean_note(el.findtext("./note")))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    sanction.add("summary", el.findtext("./correction"))
    context.emit(sanction)
    return entity

def crawl_node(context: Context, node):
    mep_id = node.findtext(".//id")
    person = context.make("Person")
    person.id = context.make_slug(mep_id)
    url = "http://www.europarl.europa.eu/meps/en/%s" % mep_id
    person.add("sourceUrl", url)
    name = node.findtext(".//fullName")
    person.add("name", name)
    first_name, last_name = split_name(name)
    person.add("firstName", first_name)
    person.add("lastName", last_name)
    person.add("nationality", node.findtext(".//country"))
    person.add("topics", "role.pep")
    context.emit(person, target=True)

    party_name = node.findtext(".//nationalPoliticalGroup")
    if party_name not in ["Independent"]:
        party = context.make("Organization")
        party.id = context.make_slug("npg", party_name)
        if party.id is not None:
            party.add("name", party_name)
            party.add("country", node.findtext(".//country"))
            context.emit(party)

            membership = context.make("Membership")
            membership.id = context.make_id(person.id, party.id)
            membership.add("member", person)
            membership.add("organization", party)
            context.emit(membership)

    group_name = node.findtext(".//politicalGroup")
    group = context.make("Organization")
    group.id = context.make_slug("pg", group_name)
    if group.id is not None:
        group.add("name", group_name)
        group.add("country", "eu")
        context.emit(group)

        membership = context.make("Membership")
        membership.id = context.make_id(person.id, group.id)
        membership.add("member", person)
        membership.add("organization", group)
        context.emit(membership)

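# split_name() is used above to break an MEP's full name into first and last
# name but is not defined in this excerpt. A hypothetical sketch follows,
# assuming the family name is the upper-cased part of the full name, which is
# how the European Parliament commonly renders it.
def split_name(name):
    last_name_parts = [p for p in name.split(" ") if p.isupper()]
    last_name = " ".join(last_name_parts)
    first_name = name.replace(last_name, "").strip()
    return first_name, last_name
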
def salvage_entity(context: Context, entry):
    texts = [t.text for t in entry.findall("./remark")]
    assert len(texts) == 2, texts
    name, details = texts
    name = name.split("(", 1)[0]

    entity = context.make("LegalEntity")
    entity.id = context.make_id(name)
    entity.add("name", name)
    entity.add("notes", details)
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)
    context.emit(entity, target=True)

def parse_entity(context: Context, node):
    entity = context.make("LegalEntity")
    sanction = parse_common(context, entity, node)

    for alias in node.findall("./ENTITY_ALIAS"):
        parse_alias(entity, alias)

    for addr in node.findall("./ENTITY_ADDRESS"):
        h.apply_address(context, entity, parse_address(context, addr))

    context.emit(entity, target=True)
    context.emit(sanction)

def crawl_row(context: Context, row):
    entity = context.make("Person")
    tag = row.pop("Tag")
    name_en = row.pop("Name eng")
    dob = row.pop("DOB")
    entity.id = context.make_id(name_en, tag, dob)
    entity.add("name", name_en)
    entity.add("alias", row.get("Name cyrillic"))
    entity.add("birthDate", parse_date(dob))
    entity.add("notes", collapse_spaces(row.get("Description")))
    entity.add("position", tag.split("\n"))
    entity.add("gender", row.get("Gender"))
    context.emit(entity, target=True)

def crawl_row(context: Context, row: Dict[str, str]):
    qid = row.get("qid", "").strip()
    if not len(qid):
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return
    schema = row.get("schema") or "Person"
    entity = context.make(schema)
    entity.id = qid
    topics = [t.strip() for t in row.get("topics", "").split(";")]
    topics = [t for t in topics if len(t)]
    entity.add("topics", topics)
    context.emit(entity, target=True)

def parse_identity(context: Context, entity, node, places):
    for name in node.findall(".//name"):
        parse_name(entity, name)

    for address in node.findall(".//address"):
        place = places.get(address.get("place-id"))
        address = compose_address(context, entity, place, address)
        h.apply_address(context, entity, address)

    for bday in node.findall(".//day-month-year"):
        bval = parse_parts(bday.get("year"), bday.get("month"), bday.get("day"))
        if entity.schema.is_a("Person"):
            entity.add("birthDate", bval)
        else:
            entity.add("incorporationDate", bval)

    for nationality in node.findall(".//nationality"):
        country = nationality.find("./country")
        if country is not None:
            entity.add("nationality", country.get("iso-code"))
            entity.add("nationality", country.text)

    for bplace in node.findall(".//place-of-birth"):
        place = places.get(bplace.get("place-id"))
        address = compose_address(context, entity, place, bplace)
        entity.add("birthPlace", address.get("full"))

    for doc in node.findall(".//identification-document"):
        country = doc.find("./issuer")
        type_ = doc.get("document-type")
        number = doc.findtext("./number")
        entity.add("nationality", country.text, quiet=True)
        schema = "Identification"
        if type_ == "id-card":
            entity.add("idNumber", number)
        if type_ in ("passport", "diplomatic-passport"):
            entity.add("idNumber", number)
            schema = "Passport"
        passport = context.make(schema)
        passport.id = context.make_id(entity.id, type_, doc.get("ssid"))
        passport.add("holder", entity)
        passport.add("country", country.text)
        passport.add("number", number)
        passport.add("type", type_)
        passport.add("summary", doc.findtext("./remark"))
        passport.add("startDate", doc.findtext("./date-of-issue"))
        passport.add("endDate", doc.findtext("./expiry-date"))
        context.emit(passport)

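# parse_parts() is used above to build a birth date from separate year/month/
# day attributes but is not defined in this excerpt. A minimal, hypothetical
# sketch: assemble a (possibly partial) ISO date string, padding month and day;
# the padding and partial-date behaviour are assumptions.
def parse_parts(year, month, day):
    if year is None:
        return None
    parts = [year]
    if month:
        parts.append(month.zfill(2))
        if day:
            parts.append(day.zfill(2))
    return "-".join(parts)
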
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            name, _ = text.split("(", 1)
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                # match = match.replace("\xa0", "")
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                if match.endswith("citizen"):
                    nat = match.replace("citizen", "")
                    entity.add("nationality", nat)
                    continue
                if match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)

def crawl(context: Context):
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        data = json.load(fh)

    for entry in data.get("result", []):
        wallet = context.make("CryptoWallet", target=True)
        wallet.id = context.make_slug(entry.get("address"))
        wallet.add("publicKey", entry.pop("address"))
        wallet.add("topics", "crime.theft")
        wallet.add("createdAt", entry.pop("createdAt"))
        wallet.add("modifiedAt", entry.pop("updatedAt"))
        wallet.add("alias", entry.pop("family"))
        wallet.add("balance", format_number(entry.pop("balance")))
        wallet.add("amountUsd", format_number(entry.pop("balanceUSD")))
        wallet.add("currency", entry.pop("blockchain"))
        h.audit_data(entry, ignore=["transactions"])
        context.emit(wallet)

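# format_number() is applied to the balance fields above but is not defined in
# this excerpt. A hypothetical sketch, assuming it normalises numeric values to
# a plain decimal string and passes anything unparseable through unchanged.
def format_number(value):
    if value is None:
        return None
    try:
        return "%.2f" % float(value)
    except (TypeError, ValueError):
        return str(value)
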
def parse_person(context: Context, node):
    entity = context.make("Person")
    h.apply_name(
        entity,
        given_name=node.findtext("./Name"),
        patronymic=node.findtext("./Patronomic"),
        last_name=node.findtext("./Surname"),
    )
    entity.id = context.make_id(
        node.tag,
        node.findtext("./Number"),
        node.findtext("./Name"),
        node.findtext("./Patronomic"),
        node.findtext("./Surname"),
    )
    entity.add("birthDate", h.parse_date(node.findtext("./DataBirth"), FORMATS))
    entity.add("birthPlace", node.findtext("./PlaceBirth"))
    parse_common(context, node, entity)

def parse_membership(context: Context, data, persons, organizations, events):
    person_id = persons.get(data.pop("person_id", None))
    org_name = organizations.get(data.pop("organization_id", None))
    if person_id and org_name:
        period_id = data.get("legislative_period_id")
        period = events.get(period_id, {})
        comment = data.pop("role", None)
        comment = comment or period.get("name")
        starts = [data.get("start_date"), period.get("start_date")]
        ends = [data.get("end_date"), period.get("end_date")]
        # for source in data.get("sources", []):
        #     membership.add("sourceUrl", source.get("url"))
        position = post_summary(org_name, comment, starts, ends, [])

        person = context.make("Person")
        person.id = person_id
        person.add("position", position)
        context.emit(person, target=True)

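# post_summary() is called above to turn the organization, role and date ranges
# into a single position string, but its implementation is not shown here. The
# sketch below is hypothetical; the exact label format is an assumption.
def post_summary(org_name, comment, starts, ends, dates):
    position = org_name
    if comment:
        position = "%s (%s)" % (position, comment)
    start = min((d for d in starts if d), default=None)
    end = max((d for d in ends if d), default=None)
    if start or end:
        position = "%s, %s-%s" % (position, start or "", end or "")
    return position
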
def crawl(context: Context):
    path = context.fetch_resource("source.csv", context.dataset.data.url)
    context.export_resource(path, CSV, title=context.SOURCE_TITLE)

    prev_country = None
    with open(path, "r") as fh:
        for row in csv.DictReader(fh):
            country = row.get("catalog")
            if country != prev_country:
                context.log.info("Crawl country", country=country)
                prev_country = country

            entity = context.make("Person")
            qid: Optional[str] = row.get("personID")
            if qid is None or not is_qid(qid):
                continue
            entity.id = qid
            entity.add("name", row.get("person"))
            entity.add("topics", "role.pep")
            entity.add("country", country)
            context.emit(entity, target=True)

def crawl_physical(context: Context) -> None:
    data = json_resource(context, PHYSICAL_URL, "physical")
    for row in data:
        entity = context.make("Person")
        entity.id = context.make_slug(row.pop("ukaz_id"), row.pop("index"))
        entity.add("name", row.pop("name_ukr", None))
        entity.add("name", row.pop("name_original", None))
        for alias in multi_split(row.pop("name_alternative", None), [";", "/"]):
            entity.add("alias", alias)
        entity.add("notes", row.pop("additional", None))
        for country in multi_split(row.pop("citizenship", None), [", "]):
            entity.add("nationality", country)
        entity.add("birthDate", row.pop("birthdate", None))
        entity.add("birthPlace", row.pop("birthplace", None))
        entity.add("position", row.pop("occupation", None))
        handle_address(context, entity, row.pop("livingplace", None))
        handle_sanction(context, entity, row)
        context.emit(entity, target=True)

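# handle_address() is used above but not defined in this excerpt. A minimal
# sketch, assuming it simply wraps the address helpers used by the other
# crawlers; the real helper may do additional splitting or cleanup.
def handle_address(context: Context, entity, text):
    if text is None:
        return
    address = h.make_address(context, full=text)
    h.apply_address(context, entity, address)
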