def parse_row(context: Context, row):
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(row.get("Effective_Date"), row.get("Name"))
    entity.add("name", row.get("Name"))
    entity.add("notes", row.get("Action"))
    entity.add("country", row.get("Country"))
    entity.add("modifiedAt", row.get("Last_Update"))
    address = h.make_address(
        context,
        street=row.get("Street_Address"),
        postal_code=row.get("Postal_Code"),
        city=row.get("City"),
        region=row.get("State"),
        country=row.get("Country"),
    )
    h.apply_address(context, entity, address)
    context.emit(entity, target=True)

    citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=citation)
    sanction.add("program", citation)
    sanction.add("startDate", h.parse_date(row.get("Effective_Date"), FORMATS))
    sanction.add("endDate", h.parse_date(row.get("Expiration_Date"), FORMATS))
    # pprint(row)
    context.emit(sanction)
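# `FORMATS` is referenced by most of these crawlers but defined per module as a
# list of strptime patterns tried in order by h.parse_date. An illustrative
# value only -- an assumption, not any specific crawler's actual list:
FORMATS = ["%m/%d/%Y", "%d.%m.%Y", "%d %b %Y", "%d %B %Y"]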
def crawl(context: Context):
    url = context.dataset.data.url
    headers = {"apikey": context.dataset.data.api_key}
    data = context.fetch_json(url, headers=headers)
    # TODO write this out to a source.json
    for row in data["response"]["ZPROCSUPP"]:
        # context.pprint(row)
        entity = context.make("LegalEntity")
        name = row.get("SUPP_NAME")
        ent_id = row.get("SUPP_ID")
        entity.id = context.make_slug(ent_id)
        names = clean_name(name)
        entity.add("name", names[0])
        entity.add("topics", "debarment")
        entity.add("country", row.get("COUNTRY_NAME"))
        for alias in names[1:]:
            entity.add("alias", alias)
        address = h.make_address(
            context,
            street=row.get("SUPP_ADDR"),
            city=row.get("SUPP_CITY"),
            country=row.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)
        sanction = h.make_sanction(context, entity)
        sanction.add("program", row.get("DEBAR_REASON"))
        sanction.add("startDate", h.parse_date(row.get("DEBAR_FROM_DATE"), FORMATS))
        sanction.add("endDate", h.parse_date(row.get("DEBAR_TO_DATE"), FORMATS))
        context.emit(entity, target=True)
        context.emit(sanction)
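# `clean_name` is defined elsewhere in this module; the call site above expects
# a list whose first element is the primary name and whose remainder are
# aliases. A minimal sketch under that assumption -- the separator handling is
# illustrative, not the actual implementation:
import re


def clean_name(text):
    if text is None:
        return [""]
    # treat parenthesised or slash-separated segments as alternate names
    parts = re.split(r"[/(]", text)
    names = [part.strip(" )") for part in parts]
    return [name for name in names if name]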
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r", encoding="ISO-8859-1") as fh: doc = html.parse(fh) table = doc.find("//div[@id='viewcontainer']/table") headers = None for row in table.findall(".//tr"): if headers is None: headers = [ slugify(c.text_content(), "_") for c in row.findall("./th") ] continue cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] cells = dict(zip(headers, cells)) cells.pop(None, None) full_name = name = cells.pop("name") registration_number = None for splitter in REG_NRS: if splitter in name: name, registration_number = name.split(splitter, 1) registration_number = registration_number.replace(")", "") country = cells.pop("nationality") country = country.replace("Non ADB Member Country", "") country = country.replace("Rep. of", "") entity = context.make("LegalEntity") entity.id = context.make_id(full_name, country) entity.add("name", name) entity.add("alias", cells.pop("othername_logo")) entity.add("topics", "debarment") entity.add("country", country) entity.add("registrationNumber", registration_number) sanction = h.make_sanction(context, entity) sanction.add("reason", cells.pop("grounds")) sanction.add("program", cells.pop("sanction_type")) date_range = cells.pop("effect_date_lapse_date", "") if "|" in date_range: start_date, end_date = date_range.split("|") sanction.add("startDate", h.parse_date(start_date.strip(), FORMATS)) sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS)) address = h.make_address(context, full=cells.pop("address"), country=country) h.apply_address(context, entity, address) context.emit(entity, target=True) context.emit(sanction)
def parse_date(date):
    dates = set()
    if date is None:
        return dates
    date = date.replace(" ", "")
    for part in multi_split(date, [",", "\n", ";"]):
        dates.update(h.parse_date(part, ["%d.%m.%Y"]))
    return dates
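# `multi_split` comes from the project's util module and is used by several of
# these parsers. A sketch consistent with its call sites -- it must tolerate
# None (and, in one caller below, a list of strings) and split on every
# separator in turn; an assumption, not the original:
def multi_split(text, splitters):
    fragments = list(text) if isinstance(text, (list, tuple)) else [text]
    for splitter in splitters:
        out = []
        for fragment in fragments:
            if fragment is not None:
                out.extend(fragment.split(splitter))
        fragments = out
    # drop empty fragments and surrounding whitespace
    return [f.strip() for f in fragments if f is not None and f.strip()]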
def crawl_notice(context, notice): url = notice.get("_links", {}).get("self", {}).get("href") if url in SEEN: return SEEN.add(url) res = context.http.get(url) if not res.ok: context.log.warning("HTTP error", url=res.url, error=res.status_code) return # if not res.from_cache: # time.sleep(0.5) notice = res.json() first_name = notice["forename"] or "" last_name = notice["name"] or "" entity = context.make("Person") entity.id = context.make_slug(notice.get("entity_id")) entity.add("name", first_name + " " + last_name) entity.add("firstName", first_name) entity.add("lastName", last_name) entity.add("sourceUrl", url) entity.add("nationality", notice.get("nationalities")) entity.add("gender", h.clean_gender(notice.get("sex_id"))) entity.add("birthPlace", notice.get("place_of_birth")) dob_raw = notice["date_of_birth"] entity.add("birthDate", h.parse_date(dob_raw, FORMATS)) if "v1/red" in res.url: entity.add("topics", "crime") for idx, warrant in enumerate(notice.get("arrest_warrants", []), 1): # TODO: make this a Sanction: entity.add("program", warrant["issuing_country_id"]) entity.add("notes", warrant["charge"]) context.emit(entity, target=True, unique=True)
def clean_date(date):
    splits = [
        "a)", "b)", "c)", "d)", "e)", "f)", "g)", "h)", "i)",
        " or ", " to ", " and ",
        "alt DOB:", "alt DOB",
        ";", ">>",
    ]
    dates = set()
    if isinstance(date, float):
        date = str(int(date))
    if isinstance(date, datetime):
        date = date.date().isoformat()
    date = remove_bracketed(date)
    if date is None:
        return dates
    date = date.replace("\n", " ")
    for part in multi_split(date, splits):
        part = part.strip().strip(",")
        if not len(part):
            continue
        dates.update(h.parse_date(part, FORMATS))
    return dates
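# `remove_bracketed` is assumed to drop parenthesised annotations such as
# "(approx.)" before date parsing; a plausible sketch, not the original:
import re


def remove_bracketed(text):
    if text is None:
        return None
    return re.sub(r"\([^)]*\)", " ", text)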
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r", encoding="utf-8") as fh: doc = html.fromstring(fh.read()) for table in doc.findall('.//div[@class="editor-content"]//table'): headers = None schema = None for row in table.findall(".//tr"): cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] if headers is None: headers = [slugify(c, sep="_") for c in cells] continue if len(cells) == 1: schema = TYPES[cells[0]] continue row = dict(zip(headers, cells)) entity = context.make(schema) name = row.pop("imie_i_nazwisko_nazwa_podmiotu") entity.id = context.make_slug(name) names = name.split("(") entity.add("name", names[0]) for alias in names[1:]: entity.add("alias", alias.split(")")[0]) notes = row.pop("uzasadnienie_wpisu_na_liste") entity.add("notes", notes) details = row.pop("dane_identyfikacyjne_osoby_podmiotu") for (chop, prop) in CHOPSKA: parts = details.rsplit(chop, 1) details = parts[0] if len(parts) > 1: if prop == "address": addr = h.make_address(context, full=parts[1]) h.apply_address(context, entity, addr) else: entity.add(prop, parts[1]) if len(details.strip()): result = context.lookup("details", details) if result is None: context.log.warning("Unhandled details", details=details) else: for prop, value in result.props.items(): entity.add(prop, value) sanction = h.make_sanction(context, entity) provisions = row.pop("zastosowane_srodki_sankcyjne") sanction.add("provisions", provisions) start_date = row.pop("data_umieszczenia_na_liscie") start_date = start_date.replace(" r.", "") sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"])) h.audit_data(row) context.emit(entity, target=True) context.emit(sanction)
def parse_date(text: List[Optional[str]]) -> List[str]:
    dates: List[str] = []
    for date in multi_split(text, DATE_SPLITS):
        cleaned = DATE_CLEAN.sub("", date)
        normal = decompose_nfkd(cleaned)
        for parsed in h.parse_date(normal, FORMATS, default=date):
            dates.append(parsed)
    return dates
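# Judging by the name, `decompose_nfkd` applies Unicode NFKD normalisation so
# that composed or compatibility characters in scraped dates parse cleanly; a
# sketch under that assumption:
import unicodedata


def decompose_nfkd(text: str) -> str:
    return unicodedata.normalize("NFKD", text)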
def parse_date(date): if date is None: return date = date.replace(".", "") if ";" in date: date, _ = date.split(";", 1) date = date.strip() return h.parse_date(date, FORMATS)
def parse_date(date):
    if isinstance(date, list):
        dates = []
        for d in date:
            dates.extend(parse_date(d))
        return dates
    if isinstance(date, dict):
        date = date.get("VALUE")
    return h.parse_date(date, ["%d/%m/%Y"])
def parse_common(context: Context, node, entity):
    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    inclusion_date = h.parse_date(node.findtext("./DateInclusion"), FORMATS)
    sanction.add("startDate", inclusion_date)
    sanction.add("listingDate", inclusion_date)
    entity.add("createdAt", inclusion_date)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) table = doc.find(".//article//table") headers = None for row in table.findall(".//tr"): if headers is None: headers = [ slugify(c.text_content(), "_") for c in row.findall("./td") ] headers = headers[:-2] + ["from", "to"] + headers[-1:] continue cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] cells = dict(zip(headers, cells)) if "prohibited_practice" not in cells: continue name = cells.pop("firm_name") nationality = cells.pop("nationality") entity = context.make("Company") entity.id = context.make_id(name, nationality) entity.add("name", name) entity.add("topics", "debarment") entity.add("country", nationality) sanction = h.make_sanction(context, entity) sanction.add("reason", cells.pop("prohibited_practice")) sanction.add("startDate", h.parse_date(cells.pop("from"), FORMATS)) sanction.add("endDate", h.parse_date(cells.pop("to"), FORMATS)) full = cells.pop("address") address = h.make_address(context, full=full, country=nationality) h.apply_address(context, entity, address) context.emit(entity, target=True) context.emit(sanction)
def parse_person(context, node):
    entity = context.make("Person")
    last_name = node.findtext("./Surname")
    entity.add("lastName", last_name)
    first_name = node.findtext("./Name")
    entity.add("firstName", first_name)
    patronymic = node.findtext("./Patronomic")
    entity.add("fatherName", patronymic)
    entity.add("name", jointext(first_name, patronymic, last_name))
    entity.add("birthDate", h.parse_date(node.findtext("./DataBirth"), FORMATS))
    entity.add("birthPlace", node.findtext("./PlaceBirth"))
    parse_common(context, node, entity)
def parse_common(context, node, entity):
    entity.id = context.make_slug(node.tag, node.findtext("./Number"))
    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    inclusion_date = h.parse_date(node.findtext("./DateInclusion"), FORMATS)
    sanction.add("startDate", inclusion_date)
    if inclusion_date is not None:
        entity.context["created_at"] = inclusion_date
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) table = doc.find('.//table[@id="datatable-1"]') headers = None for row in table.findall(".//tr"): if headers is None: headers = [slugify(c.text, "_") for c in row.findall("./th")] continue cells = [collapse_spaces(c.text) for c in row.findall("./td")] cells = dict(zip(headers, cells)) # AfDB lists several individuals as firms in places where the IADB # shows them to be people (and they have normal personal names) # type_ = cells.pop("type") # schema = context.lookup_value("types", type_) # if schema is None: # context.log.error("Unknown entity type", type=type_) # continue name = cells.pop("name") country = cells.pop("nationality") entity = context.make("LegalEntity") entity.id = context.make_id(name, country) entity.add("name", name) entity.add("topics", "debarment") entity.add("country", country) sanction = h.make_sanction(context, entity) sanction.add("reason", cells.pop("basis")) sanction.add("startDate", h.parse_date(cells.pop("from"), FORMATS)) sanction.add("endDate", h.parse_date(cells.pop("to"), FORMATS)) context.emit(entity, target=True, unique=True) context.emit(sanction)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) for node in doc.findall(".//td[@class='tailSTxt']"): if not node.text_content().startswith("2."): continue for item in node.findall(".//tr"): number = item.find(".//td[@class='sProvP1No']").text_content() text = item.findtext(".//td[@class='sProvP1']") text = text.strip().rstrip(";").rstrip(".") name, _ = text.split("(", 1) names = multi_split(name, ["s/o", "@"]) entity = context.make("Person") entity.id = context.make_slug(number, name) entity.add("name", names) entity.add("topics", "sanction") sanction = h.make_sanction(context, entity) sanction.add("program", PROGRAM) for match in IN_BRACKETS.findall(text): # match = match.replace("\xa0", "") res = context.lookup("props", match) if res is not None: for prop, value in res.props.items(): entity.add(prop, value) continue if match.endswith("citizen"): nat = match.replace("citizen", "") entity.add("nationality", nat) continue if match.startswith(DOB): dob = match.replace(DOB, "").strip() entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"])) continue if match.startswith(PASSPORT): passport = match.replace(PASSPORT, "").strip() entity.add("passportNumber", passport) continue context.log.warn("Unparsed bracket term", term=match) context.emit(entity, target=True) context.emit(sanction)
def parse_person(context: Context, node):
    entity = context.make("Person")
    h.apply_name(
        entity,
        given_name=node.findtext("./Name"),
        patronymic=node.findtext("./Patronomic"),
        last_name=node.findtext("./Surname"),
    )
    entity.id = context.make_id(
        node.tag,
        node.findtext("./Number"),
        node.findtext("./Name"),
        node.findtext("./Patronomic"),
        node.findtext("./Surname"),
    )
    entity.add("birthDate", h.parse_date(node.findtext("./DataBirth"), FORMATS))
    entity.add("birthPlace", node.findtext("./PlaceBirth"))
    parse_common(context, node, entity)
def crawl_notice(context, notice): url = notice.get("_links", {}).get("self", {}).get("href") if url in SEEN: return SEEN.add(url) try: notice = context.fetch_json(url, cache_days=7) except HTTPError as err: context.log.warning( "HTTP error", url=str(err.request.url), error=err.response.status_code, ) return first_name = notice["forename"] or "" last_name = notice["name"] or "" entity = context.make("Person") entity.id = context.make_slug(notice.get("entity_id")) entity.add("name", first_name + " " + last_name) entity.add("firstName", first_name) entity.add("lastName", last_name) entity.add("sourceUrl", url) entity.add("nationality", notice.get("nationalities")) entity.add("gender", notice.get("sex_id")) entity.add("birthPlace", notice.get("place_of_birth")) dob_raw = notice["date_of_birth"] entity.add("birthDate", h.parse_date(dob_raw, FORMATS)) if "v1/red" in url: entity.add("topics", "crime") for idx, warrant in enumerate(notice.get("arrest_warrants", []), 1): # TODO: make this a Sanction: entity.add("program", warrant["issuing_country_id"]) entity.add("notes", warrant["charge"]) context.emit(entity, target=True)
def crawl(context: Context): path = context.fetch_resource("source.xml", context.dataset.data.url) context.export_resource(path, XML, title=context.SOURCE_TITLE) doc = context.parse_resource_xml(path) for el in doc.findall(".//person"): fname = el.findtext("./fname") mname = el.findtext("./mname") lname = el.findtext("./lname") bdate = el.findtext("./birthdate") iin = el.findtext("./iin") name = h.make_name(given_name=fname, middle_name=mname, last_name=lname) entity_id = context.make_id(name, bdate, iin) entity = make_entity(context, el, "Person", entity_id) h.apply_name(entity, given_name=fname, middle_name=mname, last_name=lname) entity.add("innCode", iin) entity.add("birthDate", h.parse_date(bdate, FORMATS, bdate)) context.emit(entity, target=True) for el in doc.findall(".//org"): name = el.findtext(".//org_name") entity_id = context.make_id(el.findtext("./num"), name) entity = make_entity(context, el, "Organization", entity_id) for tag in (".//org_name", ".//org_name_en"): names = el.findtext(tag) if names is None: continue names = names.split("; ") entity.add("name", names) context.emit(entity, target=True)
def parse_date(date):
    dates = []
    for part in multi_split(date, ["OR", ";", " - "]):
        dates.extend(h.parse_date(part, FORMATS))
    return dates
def crawl_person(context: Context, name, url):
    context.log.debug("Crawling member", name=name, url=url)
    doc = context.fetch_html(url)
    _, person_id = url.rsplit("/", 1)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("sourceUrl", url)
    person.add("name", name)
    person.add("topics", "role.pep")
    last_name, first_name = name.split(", ", 1)
    person.add("firstName", first_name)
    person.add("lastName", last_name)

    address = {}
    details = doc.find('.//div[@class="regular-details"]')
    for row in details.findall('.//ul[@class="no-bullet"]/li'):
        children = row.getchildren()
        title = children[0]
        title_text = collapse_spaces(stringify(title.text_content()))
        title_text = title_text or title.get("class")
        value = collapse_spaces(title.tail)
        if title_text in ("Full name:", "Address:", "Declaration of interests"):
            # ignore these.
            continue
        if title_text == "Emails:":
            emails = [e.text for e in row.findall(".//a")]
            person.add("email", emails)
            continue
        if "glyphicon-phone" in title_text:
            person.add("phone", value.split(","))
            continue
        if "fa-fax" in title_text:
            # TODO: yeah, no
            # person.add("phone", value)
            continue
        if title_text in ("Web sites:", "list-inline"):
            sites = [e.get("href") for e in row.findall(".//a")]
            person.add("website", sites)
            continue
        if title_text == "Represented Country:":
            person.add("country", value)
            continue
        if title_text == "Languages:":
            # TODO: missing in FtM
            # person.add("languages", value.split(','))
            continue
        if "Regions since:" in title_text:
            date = h.parse_date(value, FORMATS)
            person.add("createdAt", date)
            continue
        if "Date of birth:" in title_text:
            person.add("birthDate", h.parse_date(value, FORMATS))
            continue
        if "Commissions:" in title_text:
            for com in row.findall(".//li"):
                text = collapse_spaces(com.text_content())
                sep = "Mandate - "
                if sep in text:
                    _, text = text.split(sep, 1)
                person.add("sector", text)
            continue
        if "Areas of interest:" in title_text:
            for area in row.findall(".//li"):
                person.add("keywords", area.text_content())
            continue
        if title.tag == "i" and value is None:
            person.add("position", title_text)
            continue
        if title_text == "Country:":
            person.add("country", value)
        if title_text in ("Street:", "Postal code:", "City:", "Country:"):
            address[title_text.replace(":", "")] = value
            continue
        if title_text == "Political group:":
            group = context.make("Organization")
            group.add("name", value)
            slug = value
            if "(" in slug:
                _, slug = slug.rsplit("(", 1)
            slug = slugify(slug, sep="-")
            group.id = f"eu-cor-group-{slug}"
            context.emit(group)
            member = context.make("Membership")
            member.id = context.make_id("Membership", person.id, group.id)
            member.add("member", person)
            member.add("organization", group)
            context.emit(member)
            continue

    address = h.make_address(
        context,
        street=address.get("Street"),
        city=address.get("City"),
        postal_code=address.get("Postal code"),
        country=address.get("Country"),
    )
    h.apply_address(context, person, address)
    context.emit(person, target=True)
def parse_date(date):
    return h.parse_date(date.strip(), ["%d.%m.%Y"])
def parse_date(date): if date == "permanent": return None date = date.replace("Sept", "Sep") date = date.replace("ago", "Aug") return h.parse_date(date, ["%d-%b-%y", "%d-%b-%Y"])
def clean_date(text):
    return h.parse_date(text, DATE_FORMATS)
def parse_date(text):
    if text is None:
        return None
    text = text.replace("Sept", "Sep")
    return h.parse_date(text, FORMATS)
def parse_date(date): return h.parse_date(date, ["%m/%d/%Y"])
def parse_row(context, row):
    group_type = row.pop("GroupTypeDescription")
    org_type = row.pop("OrgType", None)
    if group_type == "Individual":
        base_schema = "Person"
    elif row.get("TypeOfVessel") is not None:
        base_schema = "Vessel"
    elif group_type == "Entity":
        base_schema = context.lookup_value("org_type", org_type, "Organization")
    else:
        context.log.error("Unknown entity type", group_type=group_type)
        return
    entity = context.make(base_schema)
    entity.id = context.make_slug(row.pop("GroupID"))
    if org_type is not None:
        org_types = split_items(org_type)
        entity.add_cast("LegalEntity", "legalForm", org_types)

    sanction = h.make_sanction(context, entity)
    # entity.add("position", row.pop("Position"), quiet=True)
    entity.add("notes", row.pop("OtherInformation", None), quiet=True)
    entity.add("notes", row.pop("FurtherIdentifiyingInformation", None), quiet=True)

    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    sanction.add("startDate", h.parse_date(row.pop("DateListed"), FORMATS))
    sanction.add("recordId", row.pop("FCOId", None))
    sanction.add("status", row.pop("GroupStatus", None))
    sanction.add("reason", row.pop("UKStatementOfReasons", None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    if last_updated is not None:
        sanction.add("modifiedAt", last_updated)
        sanction.context["updated_at"] = last_updated
        entity.add("modifiedAt", last_updated)
        entity.context["updated_at"] = last_updated

    # DoB is sometimes a year only
    row.pop("DateOfBirth", None)
    dob = parse_parts(
        row.pop("YearOfBirth", 0),
        row.pop("MonthOfBirth", 0),
        row.pop("DayOfBirth", 0),
    )
    entity.add_cast("Person", "birthDate", dob)

    gender = h.clean_gender(row.pop("Gender", None))
    entity.add_cast("Person", "gender", gender)

    id_number = row.pop("NationalIdNumber", None)
    entity.add_cast("LegalEntity", "idNumber", split_items(id_number))
    passport = row.pop("PassportDetails", None)
    entity.add_cast("Person", "passportNumber", split_items(passport))

    flag = row.pop("FlagOfVessel", None)
    entity.add_cast("Vessel", "flag", flag)
    prev_flag = row.pop("PreviousFlags", None)
    entity.add_cast("Vessel", "pastFlags", prev_flag)
    year = row.pop("YearBuilt", None)
    entity.add_cast("Vehicle", "buildDate", year)
    type_ = row.pop("TypeOfVessel", None)
    entity.add_cast("Vehicle", "type", type_)
    imo = row.pop("IMONumber", None)
    entity.add_cast("Vessel", "imoNumber", imo)
    tonnage = row.pop("TonnageOfVessel", None)
    entity.add_cast("Vessel", "tonnage", tonnage)
    row.pop("LengthOfVessel", None)
    # entity.add("legalForm", org_type)

    title = split_items(row.pop("NameTitle", None))
    entity.add("title", title, quiet=True)
    entity.add("firstName", row.pop("name1", None), quiet=True)
    entity.add("secondName", row.pop("name2", None), quiet=True)
    entity.add("middleName", row.pop("name3", None), quiet=True)
    entity.add("middleName", row.pop("name4", None), quiet=True)
    entity.add("middleName", row.pop("name5", None), quiet=True)
    name6 = row.pop("Name6", None)
    entity.add("lastName", name6, quiet=True)
    full_name = row.pop("FullName", name6)
    row.pop("AliasTypeName")
    if row.pop("AliasType") == "AKA":
        entity.add("alias", full_name)
    else:
        entity.add("name", full_name)

    nationalities = parse_countries(row.pop("Nationality", None))
    entity.add("nationality", nationalities, quiet=True)
    position = split_items(row.pop("Position", None))
    entity.add("position", position, quiet=True)

    birth_countries = parse_countries(row.pop("CountryOfBirth", None))
    entity.add("country", birth_countries, quiet=True)
    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)
    pob = split_items(row.pop("TownOfBirth", None))
    entity.add("birthPlace", pob, quiet=True)

    address = h.make_address(
        context,
        full=row.pop("FullAddress", None),
        street=row.pop("address1", None),
        street2=row.pop("address2", None),
        street3=row.pop("address3", None),
        city=row.pop("address4", None),
        place=row.pop("address5", None),
        region=row.pop("address6", None),
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    reg_number = row.pop("BusinessRegNumber", None)
    entity.add_cast("LegalEntity", "registrationNumber", reg_number)
    phones = split_items(row.pop("PhoneNumber", None), comma=True)
    phones = h.clean_phones(phones)
    entity.add_cast("LegalEntity", "phone", phones)
    website = split_items(row.pop("Website", None), comma=True)
    entity.add_cast("LegalEntity", "website", website)
    emails = split_items(row.pop("EmailAddress", None), comma=True)
    emails = h.clean_emails(emails)
    entity.add_cast("LegalEntity", "email", emails)

    # TODO: graph
    row.pop("Subsidiaries", None)
    row.pop("ParentCompany", None)
    row.pop("CurrentOwners", None)

    row.pop("DateListedDay", None)
    row.pop("DateListedMonth", None)
    row.pop("DateListedYear", None)
    row.pop("LastUpdatedDay", None)
    row.pop("LastUpdatedMonth", None)
    row.pop("LastUpdatedYear", None)
    row.pop("GrpStatus", None)
    row.pop("ID", None)
    row.pop("DateOfBirthId", None)
    if len(row):
        pprint(row)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
def parse_result(context, result):
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))

    entity_number = result.pop("entity_number", None)
    if entity_number is not None:
        assert int(entity_number)
        entity.id = SDN.make_slug(entity_number)

    entity.add("name", result.pop("name", None))
    for alias in ensure_list(result.pop("alt_names", "")):
        entity.add("alias", alias.split("; "))
    entity.add("notes", result.pop("remarks", None))
    entity.add("country", result.pop("country", None))

    if entity.schema.is_a("Person"):
        entity.add("position", result.pop("title", None))
        entity.add("nationality", result.pop("nationalities", None))
        entity.add("nationality", result.pop("citizenships", None))
        for dob in result.pop("dates_of_birth", []):
            entity.add("birthDate", h.parse_date(dob, FORMATS))
        entity.add("birthPlace", result.pop("places_of_birth", None))
    elif entity.schema.is_a("Vessel"):
        entity.add("flag", result.pop("vessel_flag", None))
        entity.add("callSign", result.pop("call_sign", None))
        entity.add("type", result.pop("vessel_type", None))
        grt = result.pop("gross_registered_tonnage", None)
        entity.add("grossRegisteredTonnage", grt)
        gt = result.pop("gross_tonnage", None)
        entity.add("tonnage", gt)
        # TODO: make adjacent owner entity
        result.pop("vessel_owner", None)

    assert result.pop("title", None) is None
    assert not len(result.pop("nationalities", []))
    assert not len(result.pop("citizenships", []))
    assert not len(result.pop("dates_of_birth", []))
    assert not len(result.pop("places_of_birth", []))

    for address in result.pop("addresses", []):
        obj = h.make_address(
            context,
            street=address.get("address"),
            city=address.get("city"),
            postal_code=address.get("postal_code"),
            region=address.get("state"),
            country=address.get("country"),
        )
        h.apply_address(context, entity, obj)

    for ident in result.pop("ids", []):
        country = ident.pop("country")
        entity.add("country", country)
        h.apply_feature(
            context,
            entity,
            ident.pop("type"),
            ident.pop("number"),
            country=country,
            date_formats=FORMATS,
            start_date=ident.pop("issue_date", None),
            end_date=ident.pop("expiration_date", None),
        )

    sanction = context.make("Sanction")
    sanction.id = context.make_id(entity.id, "Sanction")
    sanction.add("entity", entity)
    sanction.add("program", result.pop("programs", []))
    sanction.add("status", result.pop("license_policy", []))
    sanction.add("reason", result.pop("license_requirement", []))
    sanction.add("reason", result.pop("federal_register_notice", None))
    sanction.add("startDate", result.pop("start_date", None))
    sanction.add("endDate", result.pop("end_date", None))
    sanction.add("country", "us")
    sanction.add("authority", result.pop("source", None))
    # TODO: deref
    source_url = deref_url(context, result.pop("source_information_url"))
    sanction.add("sourceUrl", source_url)
    result.pop("source_list_url")

    # TODO: what is this?
    result.pop("standard_order", None)

    context.emit(sanction)
    context.emit(entity, target=True)
    if len(result):
        context.pprint(result)
def crawl(context: Context):
    for page in count(1):
        url = str(context.dataset.data.url)
        url = url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        headers = {
            "Accept": "application/json",
            "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
        }
        res = context.http.get(url, headers=headers)
        ids = []
        for row in res.json():
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""
            row_id = row.pop("id")
            ids.append(row_id)
            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue
            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))
            nat = "nationality"
            if schema == "Company":
                nat = "jurisdiction"
            entity.add(nat, parse_countries(row.pop("nationality")))

            affiliated = row.pop("affiliatedWithEntityId")
            if len(affiliated):
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)

            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS))
            sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS))
            # context.pprint(row)
            context.emit(sanction)
            context.emit(entity, target=True)
        # stop when the API returns an empty page or the lowest ID, i.e. the
        # listing has been paged through to the beginning
        if not len(ids) or min(ids) == 1:
            return
def parse_date(date): if date is not None: date = date.replace("Sept.", "Sep.") return h.parse_date(date, FORMATS)