def parse_party(context: Context, distinct_party, locations, documents):
    profile = distinct_party.find("Profile")
    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    schema = TYPES.get(sub_type.get("Value"))
    type_ = ref_value("PartyType", sub_type.get("PartyTypeID"))
    schema = TYPES.get(type_, schema)
    if schema is None:
        context.log.error("Unknown party type", value=type_)
        return

    party = context.make(schema)
    party.id = context.make_slug(profile.get("ID"))
    party.add("notes", h.clean_note(distinct_party.findtext("Comment")))
    party.add("sourceUrl", URL % profile.get("ID"))

    for identity in profile.findall("./Identity"):
        parts = {}
        for group in identity.findall(".//NamePartGroup"):
            type_id = group.get("NamePartTypeID")
            parts[group.get("ID")] = ref_value("NamePartType", type_id)

        for alias in identity.findall("./Alias"):
            parse_alias(party, parts, alias)

        for regdoc in documents.get(identity.get("ID"), []):
            parse_registration_doc(context, party, regdoc)

    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)

    context.emit(party, target=True)
    # pprint(party.to_dict())
    # context.log.info("[%s] %s" % (party.schema.name, party.caption))
    return party
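# `ref_get` and `ref_value` above resolve IDs against the reference value
# sets shipped with the source XML. A minimal sketch of what they might look
# like, assuming a module-level `REFERENCES` dict keyed by set name — both
# the structure and the dict are assumptions, not the actual helpers:
from typing import Dict

REFERENCES: Dict[str, Dict[str, Dict[str, str]]] = {}


def ref_get(type_: str, ref_id: str) -> Dict[str, str]:
    """Look up a reference row (e.g. a PartySubType) by its ID."""
    return REFERENCES[type_][ref_id]


def ref_value(type_: str, ref_id: str) -> str:
    """Shortcut to the human-readable value of a reference row."""
    return ref_get(type_, ref_id).get("Value")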
def parse_entry(context: Context, entry, parties):
    party_id = context.make_slug(entry.get("ProfileID"))
    party = parties[party_id]

    sanction = h.make_sanction(context, party, key=entry.get("ID"))
    sanction.add("program", ref_value("List", entry.get("ListID")))

    for event in entry.findall("./EntryEvent"):
        date = parse_date(event.find("./Date"))
        party.add("createdAt", date)
        sanction.add("summary", event.findtext("./Comment"))
        basis = ref_value("LegalBasis", event.get("LegalBasisID"))
        sanction.add("reason", basis)

    party.add("topics", "sanction")
    sanction.add("listingDate", party.get("createdAt"))
    sanction.add("startDate", party.get("modifiedAt"))

    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        type_id = measure.get("SanctionsTypeID")
        sanction.add("program", ref_value("SanctionsType", type_id))

    context.emit(sanction)
    context.emit(party, target=True)
def crawl(context: Context): path = context.fetch_resource("source.xml", context.dataset.data.url) context.export_resource(path, "text/xml", title=context.SOURCE_TITLE) doc = context.parse_resource_xml(path) doc = h.remove_namespace(doc) for entry in doc.findall(".//sanctionEntity"): parse_entry(context, entry)
def crawl_person(context: Context) -> None:
    data = json_resource(context, context.dataset.data.url, "person")
    for row in data["data"]:
        row = clean_row(row)
        person_id = row.pop("person_id")
        name_en = row.pop("name_en", None)
        name_ru = row.pop("name_ru", None)
        name_uk = row.pop("name_uk", None)
        name = name_en or name_ru or name_uk

        entity = context.make("Person")
        entity.id = context.make_slug("person", person_id, name)
        entity.add("name", name)
        entity.add("alias", name_ru)
        entity.add("alias", name_uk)
        entity.add("birthDate", parse_date(row.pop("date_bd", None)))
        url = "https://sanctions.nazk.gov.ua/sanction-person/%s/"
        entity.add("sourceUrl", url % person_id)
        if row.get("city_bd_en") != "N/A":
            entity.add("birthPlace", row.pop("city_bd_en", None))
            entity.add("birthPlace", row.pop("city_bd_ru", None))
            entity.add("birthPlace", row.pop("city_bd_uk", None))
        entity.add("position", row.pop("position_en", None))
        entity.add("position", row.pop("position_ru", None))
        entity.add("position", row.pop("position_uk", None))
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))
        country = row.get("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        context.emit(entity, target=True)
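# `json_resource` is not defined in this file. A plausible sketch, assuming
# the dataset URL contains a "%s" placeholder for the part name and that the
# helper registers the download as a resource before parsing it — the name
# comes from the calls above, the implementation is an assumption:
import json


def json_resource(context: Context, url: str, name: str):
    path = context.fetch_resource("%s.json" % name, url % name)
    context.export_resource(path, "application/json", title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        return json.load(fh)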
def crawl_company(context: Context) -> None:
    data = json_resource(context, context.dataset.data.url, "company")
    for row in data["data"]:
        row = clean_row(row)
        # context.pprint(row)
        company_id = row.pop("company_id")
        name_en = row.pop("name_en", None)
        name = row.pop("name", None) or name_en

        entity = context.make("Organization")
        entity.id = context.make_slug("company", company_id, name)
        if entity.id is None:
            entity.id = context.make_slug(
                "company",
                company_id,
                row.pop("ogrn", None),
                strict=False,
            )
        entity.add("name", name)
        entity.add("name", name_en)
        entity.add("name", row.pop("name_uk", None))
        entity.add("name", row.pop("name_ru", None))
        entity.add("innCode", row.pop("inn", None))
        entity.add_cast("Company", "ogrnCode", row.pop("ogrn", None))
        country = row.pop("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))
        context.emit(entity, target=True)
        # discard unused field:
        row.pop("logo_en", None)
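# A minimal sketch of the `clean_row` helper used above, assuming it strips
# whitespace and drops empty or placeholder values — the name comes from the
# calls above, the behaviour is inferred from how rows are consumed:
from typing import Any, Dict


def clean_row(row: Dict[str, Any]) -> Dict[str, Any]:
    out = {}
    for key, value in row.items():
        if isinstance(value, str):
            value = value.strip()
        if value in (None, "", "null"):
            continue
        out[key] = value
    return out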
def crawl_common(context: Context, data: Dict[str, str], part: str, schema: str):
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")
    entity.add("notes", h.clean_note(data.pop("COMMENTS1")))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))
    h.apply_name(
        entity,
        name1=data.pop("FIRST_NAME", None),
        name2=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    sanction = h.make_sanction(context, entity)
    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))
    entity.add("createdAt", submitted_on or listed_on or modified_at)
    entity.add("modifiedAt", modified_at)

    sanction.add("listingDate", submitted_on or listed_on)
    sanction.add("startDate", listed_on)
    sanction.add("program", data.pop("UN_LIST_TYPE"))
    sanction.add("program", data.pop("LIST_TYPE"))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
def crawl_legislature(context: Context, country, legislature):
    lastmod_ = int(legislature.get("lastmod"))
    lastmod = datetime.utcfromtimestamp(lastmod_)
    url = legislature.get("popolo_url")
    # this isn't being updated, hence long interval:
    data = context.fetch_json(url, cache_days=30)

    persons: Dict[str, Optional[str]] = {}
    for person in data.pop("persons", []):
        pid = person.get("id")
        persons[pid] = parse_person(context, person, country, lastmod)

    organizations: Dict[str, Optional[str]] = {}
    for org in data.pop("organizations", []):
        org_id = org.pop("id", None)
        org_id = context.lookup_value("org_id", org_id, org_id)
        if org_id is None:
            continue
        name = org.pop("name", org.pop("sort_name", None))
        organizations[org_id] = name

    events = data.pop("events", [])
    events = {e.get("id"): e for e in events}

    for membership in data.pop("memberships", []):
        parse_membership(context, membership, persons, organizations, events)
def crawl(context: Context): path = context.fetch_resource("source.json", context.dataset.data.url) context.export_resource(path, JSON, title=context.SOURCE_TITLE) with open(path, "r") as file: data = json.load(file) for result in data.get("results"): parse_result(context, result)
def parse_person(context: Context, data, country, lastmod):
    person_id = data.pop("id", None)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("nationality", country)
    name = data.get("name")
    if name is None or name.lower().strip() in ("unknown",):
        return
    person.add("modifiedAt", lastmod.date())
    person.add("name", data.pop("name", None))
    person.add("alias", data.pop("sort_name", None))
    for other in data.pop("other_names", []):
        person.add("alias", other.get("name"))
    person.add("gender", data.pop("gender", None))
    person.add("title", data.pop("honorific_prefix", None))
    person.add("title", data.pop("honorific_suffix", None))
    person.add("firstName", data.pop("given_name", None))
    person.add("lastName", data.pop("family_name", None))
    person.add("fatherName", data.pop("patronymic_name", None))
    person.add("birthDate", data.pop("birth_date", None))
    person.add("deathDate", data.pop("death_date", None))
    person.add("email", h.clean_emails(data.pop("email", None)))
    person.add("notes", data.pop("summary", None))
    person.add("topics", "role.pep")

    for link in data.pop("links", []):
        url = link.get("url")
        if link.get("note") in ("website", "blog", "twitter", "facebook"):
            person.add("website", url)
        # elif "Wikipedia (" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # elif "wikipedia" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # else:
        #     person.log.info("Unknown URL", url=url, note=link.get("note"))

    for ident in data.pop("identifiers", []):
        identifier = ident.get("identifier")
        scheme = ident.get("scheme")
        if scheme == "wikidata" and identifier.startswith("Q"):
            person.add("wikidataId", identifier)

    for contact_detail in data.pop("contact_details", []):
        value = contact_detail.get("value")
        if "email" == contact_detail.get("type"):
            person.add("email", h.clean_emails(value))
        if "phone" == contact_detail.get("type"):
            person.add("phone", h.clean_phones(value))

    if check_person_cutoff(person):
        return

    # data.pop("image", None)
    # data.pop("images", None)
    # if len(data):
    #     pprint(data)
    context.emit(person, target=True)
    # entities[person_id] = person.id
    return person.id
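# `check_person_cutoff` decides whether a politician record is too stale to
# keep. A hedged sketch, assuming the policy drops people who died before a
# module-level cutoff date — both the constant and the rule are assumptions:
DEATH_CUTOFF = "2000-01-01"  # assumed constant


def check_person_cutoff(person) -> bool:
    """Return True if the person should be dropped from the dataset."""
    for death in person.get("deathDate"):
        # ISO date strings compare correctly as plain strings:
        if death < DEATH_CUTOFF:
            return True
    return False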
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r", encoding="utf-8") as fh: doc = html.fromstring(fh.read()) for table in doc.findall('.//div[@class="editor-content"]//table'): headers = None schema = None for row in table.findall(".//tr"): cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] if headers is None: headers = [slugify(c, sep="_") for c in cells] continue if len(cells) == 1: schema = TYPES[cells[0]] continue row = dict(zip(headers, cells)) entity = context.make(schema) name = row.pop("imie_i_nazwisko_nazwa_podmiotu") entity.id = context.make_slug(name) names = name.split("(") entity.add("name", names[0]) for alias in names[1:]: entity.add("alias", alias.split(")")[0]) notes = row.pop("uzasadnienie_wpisu_na_liste") entity.add("notes", notes) details = row.pop("dane_identyfikacyjne_osoby_podmiotu") for (chop, prop) in CHOPSKA: parts = details.rsplit(chop, 1) details = parts[0] if len(parts) > 1: if prop == "address": addr = h.make_address(context, full=parts[1]) h.apply_address(context, entity, addr) else: entity.add(prop, parts[1]) if len(details.strip()): result = context.lookup("details", details) if result is None: context.log.warning("Unhandled details", details=details) else: for prop, value in result.props.items(): entity.add(prop, value) sanction = h.make_sanction(context, entity) provisions = row.pop("zastosowane_srodki_sankcyjne") sanction.add("provisions", provisions) start_date = row.pop("data_umieszczenia_na_liscie") start_date = start_date.replace(" r.", "") sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"])) h.audit_data(row) context.emit(entity, target=True) context.emit(sanction)
def crawl(context: Context): path = context.fetch_resource("source.xml", context.dataset.data.url) context.export_resource(path, XML, title=context.SOURCE_TITLE) doc = context.parse_resource_xml(path) doc = h.remove_namespace(doc) for el in doc.findall(".//FinancialSanctionsTarget"): parse_row(context, make_row(el))
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
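# Example invocation of the enrichment runner, assuming a "default" scope
# and an "opencorporates" external dataset exist in the local configuration
# (both names and the threshold are placeholders, not part of the source):
if __name__ == "__main__":
    run_enrich("default", "opencorporates", threshold=0.7)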
def crawl(context: Context): path = context.fetch_resource("source.xml", context.dataset.data.url) context.export_resource(path, "text/xml", title=context.SOURCE_TITLE) doc = context.parse_resource_xml(path) for node in doc.findall(".//INDIVIDUAL"): parse_individual(context, node) for node in doc.findall(".//ENTITY"): parse_entity(context, node)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) for sec_id, (section, schema) in SECTIONS.items(): el = doc.find(".//div[@id='%s']" % sec_id) for item in el.findall(".//li"): text = item.text_content().strip() index, text = text.split(".", 1) text = text.strip() if text.endswith(";"): text = text.rstrip(";") entity = context.make(schema) entity.id = context.make_id(text) sanction = h.make_sanction(context, entity) sanction.add("program", section) sanction.add("recordId", index) if sec_id == "russianUL": parse_russian_orgs(context, entity, text) if sec_id == "russianFL": parse_russian_persons(context, entity, text) if sec_id == "foreignUL": parse_foreign_orgs(context, entity, text) if sec_id == "foreignFL": parse_foreign_persons(context, entity, text) if entity.has("name"): context.emit(entity, target=True) context.emit(sanction)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) for table in doc.findall("//table"): headers = table.findall("./thead/tr/td") headers = [h.text_content() for h in headers] assert "Vendor name" in headers, headers assert "From" in headers, headers for row in table.findall("./tbody/tr"): cells = [h.text_content() for h in row.findall("./td")] if len(cells[0]) == 0: continue entity = context.make("LegalEntity") entity.id = context.make_id(*cells) entity.add("name", cells[0]) entity.add("country", cells[1]) entity.add("topics", "crime.fraud") cc = entity.first("country") address = h.make_address(context, full=cells[2], country_code=cc) h.apply_address(context, entity, address) sanction = h.make_sanction(context, entity) sanction.add("reason", cells[3]) sanction.add("program", cells[4]) sanction.add("startDate", parse_date(cells[5])) sanction.add("endDate", parse_date(cells[6])) context.emit(sanction) context.emit(entity, target=True)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) table = doc.find('//div[@class="sanctioned-table"]/table') headers = None for row in table.findall(".//tr"): if headers is None: headers = [slugify(el.text) for el in row.findall("./th")] continue cells = [collapse_spaces(el.text) for el in row.findall("./td")] data = {hdr: c for hdr, c in zip(headers, cells)} entity = context.make("Person") entity.id = context.make_id(data["id"], data["ad-soyad-ata-adi"]) entity.add("name", data["ad-soyad-ata-adi"]) entity.add("idNumber", data["id"]) entity.add("birthDate", parse_date(data["dogum-tarixi"])) entity.add("country", "az") entity.add("topics", "sanction") addr = h.make_address(context, full=data["malumat"]) h.apply_address(context, entity, addr) sanction = h.make_sanction(context, entity) context.emit(sanction) context.emit(entity, target=True)
def parse_common(context: Context, node, entity):
    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    inclusion_date = h.parse_date(node.findtext("./DateInclusion"), FORMATS)
    sanction.add("startDate", inclusion_date)
    sanction.add("listingDate", inclusion_date)
    entity.add("createdAt", inclusion_date)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl_row(context: Context, row: Dict[str, str]):
    qid = row.get("qid", "").strip()
    if not len(qid):
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return
    entity = context.make("Person")
    entity.id = qid
    entity.add("topics", "role.oligarch")
    context.emit(entity, target=True)
def make_entity(context: Context, el, schema, entity_id):
    entity = context.make(schema, target=True)
    entity.id = entity_id
    entity.add("notes", h.clean_note(el.findtext("./note")))
    entity.add("topics", "sanction")
    sanction = h.make_sanction(context, entity)
    sanction.add("summary", el.findtext("./correction"))
    context.emit(sanction)
    return entity
def salvage_entity(context: Context, entry):
    texts = [t.text for t in entry.findall("./remark")]
    assert len(texts) == 2, texts
    name, details = texts
    name = name.split("(", 1)[0]
    entity = context.make("LegalEntity")
    entity.id = context.make_id(name)
    entity.add("name", name)
    entity.add("notes", details)
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)
    context.emit(entity, target=True)
def parse_entity(context: Context, node):
    entity = context.make("LegalEntity")
    sanction = parse_common(context, entity, node)
    for alias in node.findall("./ENTITY_ALIAS"):
        parse_alias(entity, alias)
    for addr in node.findall("./ENTITY_ADDRESS"):
        h.apply_address(context, entity, parse_address(context, addr))
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    url = crawl_index(context)
    if url is None:
        context.log.error("Could not locate XML file", url=context.dataset.url)
        return
    path = context.fetch_resource("source.xml", url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    xml = context.parse_resource_xml(path)
    for person in xml.findall(".//KyrgyzPhysicPerson"):
        parse_person(context, person)
    for legal in xml.findall(".//KyrgyzLegalPerson"):
        parse_legal(context, legal)
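# `crawl_index` locates the XML download link on the dataset's index page.
# A hedged sketch, assuming the Context exposes a fetch_html helper and the
# page links to a single .xml file — the name comes from the call above, the
# implementation is an assumption:
def crawl_index(context: Context):
    doc = context.fetch_html(context.dataset.url)
    for link in doc.findall(".//a"):
        url = link.get("href")
        if url is not None and url.endswith(".xml"):
            return url
    return None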
def crawl(context: Context): path = context.fetch_resource("source.zip", context.dataset.data.url) context.export_resource(path, "application/zip", title=context.SOURCE_TITLE) with ZipFile(path, "r") as zip: for name in zip.namelist(): if name.endswith(".xml"): with zip.open(name) as fh: doc = etree.parse(fh) doc = h.remove_namespace(doc) for entry in doc.findall(".//sanctionEntity"): parse_entry(context, entry)
def crawl(context: Context): path = context.fetch_resource("source.xml", context.dataset.data.url) context.export_resource(path, "text/xml", title=context.SOURCE_TITLE) doc = context.parse_resource_xml(path) for row in doc.findall(".//Table"): data = {} for field in row.getchildren(): value = field.text if value == "NA": continue data[field.tag] = value crawl_row(context, data)
def crawl(context: Context):
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)

    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        if value == "不明":
                            continue
                        values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)

            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                # print("SHEET", sheet, row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning(
                            "Unknown column title", column=cell, sheet=sheet.name
                        )
                    headers.append(header)
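# `multi_split` splits a value on several delimiters at once. A minimal
# sketch, assuming SPLITS is a list of separator strings — this illustrates
# the idea and is not the original helper:
from typing import Iterable, List, Optional


def multi_split(value: Optional[str], splitters: Iterable[str]) -> List[Optional[str]]:
    if value is None:
        return [None]
    values = [value]
    for splitter in splitters:
        out: List[str] = []
        for item in values:
            out.extend(item.split(splitter))
        values = out
    # normalize empty fragments to None so callers can filter them out:
    return [v.strip() or None for v in values]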
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r", encoding="ISO-8859-1") as fh: doc = html.parse(fh) table = doc.find("//div[@id='viewcontainer']/table") headers = None for row in table.findall(".//tr"): if headers is None: headers = [ slugify(c.text_content(), "_") for c in row.findall("./th") ] continue cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] cells = dict(zip(headers, cells)) cells.pop(None, None) full_name = name = cells.pop("name") registration_number = None for splitter in REG_NRS: if splitter in name: name, registration_number = name.split(splitter, 1) registration_number = registration_number.replace(")", "") country = cells.pop("nationality") country = country.replace("Non ADB Member Country", "") country = country.replace("Rep. of", "") entity = context.make("LegalEntity") entity.id = context.make_id(full_name, country) entity.add("name", name) entity.add("alias", cells.pop("othername_logo")) entity.add("topics", "debarment") entity.add("country", country) entity.add("registrationNumber", registration_number) sanction = h.make_sanction(context, entity) sanction.add("reason", cells.pop("grounds")) sanction.add("program", cells.pop("sanction_type")) date_range = cells.pop("effect_date_lapse_date", "") if "|" in date_range: start_date, end_date = date_range.split("|") sanction.add("startDate", h.parse_date(start_date.strip(), FORMATS)) sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS)) address = h.make_address(context, full=cells.pop("address"), country=country) h.apply_address(context, entity, address) context.emit(entity, target=True) context.emit(sanction)
def crawl_row(context: Context, row):
    entity = context.make("Person")
    tag = row.pop("Tag")
    name_en = row.pop("Name eng")
    dob = row.pop("DOB")
    entity.id = context.make_id(name_en, tag, dob)
    entity.add("name", name_en)
    entity.add("alias", row.get("Name cyrillic"))
    entity.add("birthDate", parse_date(dob))
    entity.add("notes", collapse_spaces(row.get("Description")))
    entity.add("position", tag.split("\n"))
    entity.add("gender", row.get("Gender"))
    context.emit(entity, target=True)
def crawl_row(context: Context, row: Dict[str, str]):
    qid = row.get("qid", "").strip()
    if not len(qid):
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return
    schema = row.get("schema") or "Person"
    entity = context.make(schema)
    entity.id = qid
    topics = [t.strip() for t in row.get("topics", "").split(";")]
    topics = [t for t in topics if len(t)]
    entity.add("topics", topics)
    context.emit(entity, target=True)
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
def parse_identity(context: Context, entity, node, places):
    for name in node.findall(".//name"):
        parse_name(entity, name)

    for address in node.findall(".//address"):
        place = places.get(address.get("place-id"))
        address = compose_address(context, entity, place, address)
        h.apply_address(context, entity, address)

    for bday in node.findall(".//day-month-year"):
        bval = parse_parts(bday.get("year"), bday.get("month"), bday.get("day"))
        if entity.schema.is_a("Person"):
            entity.add("birthDate", bval)
        else:
            entity.add("incorporationDate", bval)

    for nationality in node.findall(".//nationality"):
        country = nationality.find("./country")
        if country is not None:
            entity.add("nationality", country.get("iso-code"))
            entity.add("nationality", country.text)

    for bplace in node.findall(".//place-of-birth"):
        place = places.get(bplace.get("place-id"))
        address = compose_address(context, entity, place, bplace)
        entity.add("birthPlace", address.get("full"))

    for doc in node.findall(".//identification-document"):
        country = doc.find("./issuer")
        type_ = doc.get("document-type")
        number = doc.findtext("./number")
        entity.add("nationality", country.text, quiet=True)
        schema = "Identification"
        # membership test needs a tuple; a bare string would match substrings:
        if type_ in ("id-card",):
            entity.add("idNumber", number)
        if type_ in ("passport", "diplomatic-passport"):
            entity.add("idNumber", number)
            schema = "Passport"
        passport = context.make(schema)
        passport.id = context.make_id(entity.id, type_, doc.get("ssid"))
        passport.add("holder", entity)
        passport.add("country", country.text)
        passport.add("number", number)
        passport.add("type", type_)
        passport.add("summary", doc.findtext("./remark"))
        passport.add("startDate", doc.findtext("./date-of-issue"))
        passport.add("endDate", doc.findtext("./expiry-date"))
        context.emit(passport)
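# `parse_parts` assembles a (possibly partial) ISO date from year, month and
# day attributes. A hedged sketch, assuming missing components truncate the
# date rather than invalidate it — the name comes from the call above, the
# logic is an assumption:
from typing import Optional


def parse_parts(
    year: Optional[str], month: Optional[str], day: Optional[str]
) -> Optional[str]:
    if year is None:
        return None
    parts = [year]
    if month is not None:
        parts.append(month.zfill(2))
        if day is not None:
            parts.append(day.zfill(2))
    return "-".join(parts)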