def crawl(context: Context):
    """Fetch the sanctions HTML table and emit one Person per data row."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for row in table.findall(".//tr"):
        # The first row carries <th> captions; slugify them into keys.
        if headers is None:
            headers = [slugify(el.text) for el in row.findall("./th")]
            continue
        cells = [collapse_spaces(el.text) for el in row.findall("./td")]
        data = dict(zip(headers, cells))
        person = context.make("Person")
        person.id = context.make_id(data["id"], data["ad-soyad-ata-adi"])
        person.add("name", data["ad-soyad-ata-adi"])
        person.add("idNumber", data["id"])
        person.add("birthDate", parse_date(data["dogum-tarixi"]))
        person.add("country", "az")
        person.add("topics", "sanction")
        addr = h.make_address(context, full=data["malumat"])
        h.apply_address(context, person, addr)
        sanction = h.make_sanction(context, person)
        context.emit(sanction)
        context.emit(person, target=True)
def crawl(context: Context):
    """Fetch the sectioned HTML list and emit one entity per list item."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    # Dispatch table replacing the original if-chain on section IDs.
    parsers = {
        "russianUL": parse_russian_orgs,
        "russianFL": parse_russian_persons,
        "foreignUL": parse_foreign_orgs,
        "foreignFL": parse_foreign_persons,
    }
    for sec_id, (section, schema) in SECTIONS.items():
        el = doc.find(".//div[@id='%s']" % sec_id)
        for item in el.findall(".//li"):
            raw = item.text_content().strip()
            # Items look like "<index>. <text>[;]".
            index, raw = raw.split(".", 1)
            text = raw.strip().rstrip(";")
            entity = context.make(schema)
            entity.id = context.make_id(text)
            sanction = h.make_sanction(context, entity)
            sanction.add("program", section)
            sanction.add("recordId", index)
            parser = parsers.get(sec_id)
            if parser is not None:
                parser(context, entity, text)
            # Only emit when a parser managed to extract a name.
            if entity.has("name"):
                context.emit(entity, target=True)
                context.emit(sanction)
def crawl(context: Context):
    """Fetch the JSON source and parse each record under "results".

    Fix: ``data.get("results")`` returned ``None`` when the key was absent,
    crashing the ``for`` loop with a TypeError; default to an empty list.
    """
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as file:
        data = json.load(file)
    for result in data.get("results", []):
        parse_result(context, result)
def crawl(context: Context):
    """Download the XML source, strip namespaces, and parse every sanction entity."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(path)
    tree = h.remove_namespace(tree)
    for entry in tree.findall(".//sanctionEntity"):
        parse_entry(context, entry)
def crawl(context: Context):
    """Scrape the vendor-debarment HTML tables into LegalEntity records."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    for table in doc.findall("//table"):
        # NOTE: comprehension variable renamed from `h` to avoid confusion
        # with the helpers module of the same name.
        headers = [cell.text_content() for cell in table.findall("./thead/tr/td")]
        # Sanity-check the expected column layout before parsing rows.
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers
        for row in table.findall("./tbody/tr"):
            cells = [cell.text_content() for cell in row.findall("./td")]
            if not cells[0]:
                continue
            entity = context.make("LegalEntity")
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")
            cc = entity.first("country")
            address = h.make_address(context, full=cells[2], country_code=cc)
            h.apply_address(context, entity, address)
            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))
            context.emit(sanction)
            context.emit(entity, target=True)
def crawl(context: Context):
    """Parse the Polish sanctions HTML tables into entities and sanctions."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())
    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [collapse_spaces(c.text_content()) for c in tr.findall("./td")]
            if headers is None:
                # First row provides the column captions.
                headers = [slugify(c, sep="_") for c in cells]
                continue
            if len(cells) == 1:
                # Single-cell rows switch the schema for the rows that follow.
                schema = TYPES[cells[0]]
                continue
            record = dict(zip(headers, cells))
            entity = context.make(schema)
            name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            # Parenthesised segments of the name are aliases.
            names = name.split("(")
            entity.add("name", names[0])
            for alias in names[1:]:
                entity.add("alias", alias.split(")")[0])
            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))
            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            # Peel known suffixes off the details string, right to left.
            for (chop, prop) in CHOPSKA:
                parts = details.rsplit(chop, 1)
                details = parts[0]
                if len(parts) > 1:
                    if prop == "address":
                        addr = h.make_address(context, full=parts[1])
                        h.apply_address(context, entity, addr)
                    else:
                        entity.add(prop, parts[1])
            if len(details.strip()):
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)
            sanction = h.make_sanction(context, entity)
            sanction.add("provisions", record.pop("zastosowane_srodki_sankcyjne"))
            start_date = record.pop("data_umieszczenia_na_liscie")
            start_date = start_date.replace(" r.", "")
            sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"]))
            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
def crawl(context: Context):
    """Fetch the XML source and parse each financial sanctions target row."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(path)
    tree = h.remove_namespace(tree)
    for el in tree.findall(".//FinancialSanctionsTarget"):
        parse_row(context, make_row(el))
def crawl(context: Context):
    """Fetch the XML list and parse individuals and entities separately."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    for individual in doc.findall(".//INDIVIDUAL"):
        parse_individual(context, individual)
    for org in doc.findall(".//ENTITY"):
        parse_entity(context, org)
def crawl(context: Context):
    """Parse the debarment HTML table into LegalEntity + Sanction records."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)
    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            # First row holds the <th> captions.
            headers = [slugify(c.text_content(), "_") for c in row.findall("./th")]
            continue
        values = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
        data = dict(zip(headers, values))
        # Drop cells beyond the header count (keyed None by zip).
        data.pop(None, None)
        full_name = name = data.pop("name")
        registration_number = None
        # Some names embed a registration number after a known splitter.
        for splitter in REG_NRS:
            if splitter in name:
                name, registration_number = name.split(splitter, 1)
                registration_number = registration_number.replace(")", "")
        country = data.pop("nationality")
        country = country.replace("Non ADB Member Country", "")
        country = country.replace("Rep. of", "")
        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", data.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)
        sanction = h.make_sanction(context, entity)
        sanction.add("reason", data.pop("grounds"))
        sanction.add("program", data.pop("sanction_type"))
        # Date range cell is "start | end" when present.
        date_range = data.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_date, end_date = date_range.split("|")
            sanction.add("startDate", h.parse_date(start_date.strip(), FORMATS))
            sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS))
        address = h.make_address(context, full=data.pop("address"), country=country)
        h.apply_address(context, entity, address)
        context.emit(entity, target=True)
        context.emit(sanction)
def crawl(context: Context):
    """Locate the XML download via the index page, then parse persons and orgs."""
    url = crawl_index(context)
    if url is None:
        context.log.error("Could not locate XML file", url=context.dataset.url)
        return
    path = context.fetch_resource("source.xml", url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(path)
    for node in tree.findall(".//KyrgyzPhysicPerson"):
        parse_person(context, node)
    for node in tree.findall(".//KyrgyzLegalPerson"):
        parse_legal(context, node)
def crawl(context: Context):
    """Fetch the XML source and forward each Table row as a tag→text mapping."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    for row in doc.findall(".//Table"):
        # "NA" marks an absent value in this source; skip those fields.
        data = {
            field.tag: field.text
            for field in row.getchildren()
            if field.text != "NA"
        }
        crawl_row(context, data)
def crawl(context: Context):
    """Download the ZIP archive and parse sanction entries from each XML member.

    Fix: the archive handle was named ``zip``, shadowing the builtin of the
    same name; renamed to ``archive``.
    """
    path = context.fetch_resource("source.zip", context.dataset.data.url)
    context.export_resource(path, "application/zip", title=context.SOURCE_TITLE)
    with ZipFile(path, "r") as archive:
        for name in archive.namelist():
            if not name.endswith(".xml"):
                continue
            with archive.open(name) as fh:
                doc = etree.parse(fh)
            doc = h.remove_namespace(doc)
            for entry in doc.findall(".//sanctionEntity"):
                parse_entry(context, entry)
def crawl(context: Context):
    """Parse each sheet of the XLS workbook, emitting one record per data row.

    Header rows are detected mid-sheet (the cell containing the marker
    "告示日付"); rows are only treated as data once a header has been seen,
    so the data-processing branch deliberately runs BEFORE the header check
    in each iteration.
    """
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)
    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        # Row 0 holds section captions; join the non-empty ones into a label.
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]
            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    # Columns without a mapped header are ignored.
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        # "不明" means "unknown" — treat as absent.
                        if value == "不明":
                            continue
                        if value is not None:
                            values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)
            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                # print("SHEET", sheet, row)
                # Translate each caption to a canonical column name via lookup;
                # unmapped columns become None and are skipped above.
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning("Unknown column title", column=cell, sheet=sheet.name)
                    headers.append(header)
def crawl(context: Context):
    """Read the XLS source and process rows grouped by cleaned reference."""
    path = context.fetch_resource("source.xls", context.dataset.data.url)
    context.export_resource(path, EXCEL, title=context.SOURCE_TITLE)
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheet_by_index(0)
    # NOTE: comprehension variable renamed from `h` to avoid confusion with
    # the helpers module of the same name.
    headers = [slugify(col, sep="_") for col in sheet.row_values(0)]
    references = defaultdict(list)
    for idx in range(1, sheet.nrows):
        cells = [h.convert_excel_cell(workbook, c) for c in sheet.row(idx)]
        row = dict(zip(headers, cells))
        reference = clean_reference(row.get("reference"))
        references[reference].append(row)
    for ref, rows in references.items():
        parse_reference(context, ref, rows)
def crawl(context: Context):
    """Parse enforcement JSON: emit the bank plus the sanctioned party per record."""
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        data = json.load(fh)
    for record in data:
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank = context.make("Company")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        # Records without bank identifiers produce no slug; skip emitting then.
        if bank.id is not None:
            context.emit(bank)
        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        # The sanctioned party is either a company or an individual.
        if company_name:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        else:
            entity = context.make("Person")
            entity.id = context.make_id(charter_no, bank_name, first_name, last_name)
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")
        addr = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        # Consumed but unused — the full state name is kept instead.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)
        sanction = h.make_sanction(context, entity)
        sanction.add("startDate", record.pop("CompleteDate", None))
        sanction.add("endDate", record.pop("TerminationDate", None))
        sanction.add("program", record.pop("EnforcementTypeDescription", None))
        sanction.add("authorityId", record.pop("DocketNumber", None))
        context.emit(entity, target=True)
        context.emit(sanction)
def crawl(context: Context):
    """Walk the travel-ban file listing, fetching and parsing each XML file."""
    listing = context.fetch_json(context.dataset.data.url)
    for ban in listing.get("data", {}).get("travelBansFiles"):
        if not ban.get("fileName").endswith(".xml"):
            continue
        data_url = URL % ban.get("id")
        path = context.fetch_resource("source.xml", data_url)
        context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
        doc = context.parse_resource_xml(path)
        doc = h.remove_namespace(doc)
        for entry in doc.findall(".//sanctionEntity"):
            # Entries lacking a subjectType can't be parsed normally; try to
            # salvage what we can instead.
            if entry.find("./subjectType") is None:
                salvage_entity(context, entry)
                continue
            parse_entry(context, entry)
def crawl(context: Context):
    """Parse the HTML provisions table, extracting names and bracketed details."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    for node in doc.findall(".//td[@class='tailSTxt']"):
        # Only the section whose text starts with "2." holds the target list.
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            # Everything before the first bracket is the name section.
            name, _ = text.split("(", 1)
            names = multi_split(name, ["s/o", "@"])
            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")
            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)
            # Classify each bracketed term: mapped props, citizenship, DOB,
            # passport — or log it as unparsed.
            for term in IN_BRACKETS.findall(text):
                res = context.lookup("props", term)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                if term.endswith("citizen"):
                    entity.add("nationality", term.replace("citizen", ""))
                    continue
                if term.startswith(DOB):
                    dob = term.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if term.startswith(PASSPORT):
                    passport = term.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                context.log.warn("Unparsed bracket term", term=term)
            context.emit(entity, target=True)
            context.emit(sanction)
def crawl(context: Context):
    """Parse the XML: collect program names and places, then each target."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    updated_at = doc.getroot().get("date")
    # Map sanctions-set SSIDs to their English program names.
    programs = {}
    for sanc in doc.findall(".//sanctions-program"):
        key = sanc.find("./sanctions-set").get("ssid")
        programs[key] = sanc.findtext('./program-name[@lang="eng"]')
    # Pre-parse all places so targets can reference them by SSID.
    places = {
        place.get("ssid"): parse_address(place)
        for place in doc.findall(".//place")
    }
    for target in doc.findall("./target"):
        parse_entry(context, target, programs, places, updated_at)
def crawl(context: Context):
    """Parse the JSON feed into CryptoWallet entities, one per address."""
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        data = json.load(fh)
    for entry in data.get("result", []):
        wallet = context.make("CryptoWallet", target=True)
        wallet.id = context.make_slug(entry.get("address"))
        wallet.add("publicKey", entry.pop("address"))
        wallet.add("topics", "crime.theft")
        wallet.add("createdAt", entry.pop("createdAt"))
        wallet.add("modifiedAt", entry.pop("updatedAt"))
        wallet.add("alias", entry.pop("family"))
        wallet.add("balance", format_number(entry.pop("balance")))
        wallet.add("amountUsd", format_number(entry.pop("balanceUSD")))
        wallet.add("currency", entry.pop("blockchain"))
        # Everything except transaction details should have been consumed.
        h.audit_data(entry, ignore=["transactions"])
        context.emit(wallet)
def crawl(context: Context):
    """Read the CSV of PEPs, emitting one wikidata-keyed Person per row."""
    path = context.fetch_resource("source.csv", context.dataset.data.url)
    context.export_resource(path, CSV, title=context.SOURCE_TITLE)
    prev_country = None
    with open(path, "r") as fh:
        for row in csv.DictReader(fh):
            country = row.get("catalog")
            # Rows arrive grouped by country; log once per group.
            if country != prev_country:
                context.log.info("Crawl country", country=country)
                prev_country = country
            qid: Optional[str] = row.get("personID")
            # Only rows keyed by a valid wikidata QID are usable.
            if qid is None or not is_qid(qid):
                continue
            person = context.make("Person")
            person.id = qid
            person.add("name", row.get("person"))
            person.add("topics", "role.pep")
            person.add("country", country)
            context.emit(person, target=True)
def crawl(context: Context):
    """Parse the debarred-firms HTML table into Company + Sanction records."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    table = doc.find(".//article//table")
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(c.text_content(), "_") for c in row.findall("./td")]
            # The two date columns have no usable captions; name them here.
            headers = headers[:-2] + ["from", "to"] + headers[-1:]
            continue
        values = [collapse_spaces(c.text_content()) for c in row.findall("./td")]
        data = dict(zip(headers, values))
        # Skip separator/short rows that lack the practice column.
        if "prohibited_practice" not in data:
            continue
        name = data.pop("firm_name")
        nationality = data.pop("nationality")
        company = context.make("Company")
        company.id = context.make_id(name, nationality)
        company.add("name", name)
        company.add("topics", "debarment")
        company.add("country", nationality)
        sanction = h.make_sanction(context, company)
        sanction.add("reason", data.pop("prohibited_practice"))
        sanction.add("startDate", h.parse_date(data.pop("from"), FORMATS))
        sanction.add("endDate", h.parse_date(data.pop("to"), FORMATS))
        address = h.make_address(context, full=data.pop("address"), country=nationality)
        h.apply_address(context, company, address)
        context.emit(company, target=True)
        context.emit(sanction)
def crawl_individuals(context: Context):
    """Read the individuals XLSX and emit a Person + Sanction per designated row."""
    path = context.fetch_resource("individuals.xlsx", PEOPLE_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)
    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        if seq_id is None:
            continue
        name_en = record.pop("name_of_individual_english", None)
        name_he = record.pop("name_of_individual_hebrew", None)
        name_ar = record.pop("name_of_individual_arabic", None)
        person = context.make("Person")
        person.id = context.make_id(name_en, name_he, name_ar)
        if person.id is None:
            continue
        # Prefer the English name; keep the others as aliases.
        person.add("name", name_en or name_he or name_ar)
        person.add("alias", name_he)
        person.add("alias", name_ar)
        person.add("topics", "crime.terror")
        person.add("birthDate", parse_date(record.pop("d_o_b", None)))
        person.add("nationality", record.pop("nationality_residency", None))
        person.add("idNumber", record.pop("individual_id", None))
        sanction = h.make_sanction(context, person)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("foreign_designation_id", None))
        sanction.add("program", record.pop("designation", None))
        sanction.add("program", record.pop("foreign_designation", None))
        sanction.add("authority", lang_pick(record, "designated_by"))
        # Consumed but unused fields.
        lang_pick(record, "designated_by_abroad")
        record.pop("date_of_foreign_designation_date", None)
        for field in ("date_of_designation_in_israel",):
            sanction.add("startDate", parse_date(record.pop(field, None)))
        context.emit(person, target=True)
        context.emit(sanction)
        # Surface any leftover, unhandled columns for review.
        if len(record):
            context.pprint(record)
def crawl(context: Context):
    """Parse the XML: load reference data, then parties, entries and relations."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    context.log.info("Loading reference values...")
    load_ref_values(doc)
    context.log.info("Loading locations...")
    locations = load_locations(context, doc)
    context.log.info("Loading ID reg documents...")
    documents = load_documents(doc)
    # Parties must all be parsed before entries/relations can reference them.
    parties = {}
    for distinct_party in doc.findall(".//DistinctParty"):
        party = parse_party(context, distinct_party, locations, documents)
        parties[party.id] = party
    for entry in doc.findall(".//SanctionsEntry"):
        parse_entry(context, entry, parties)
    for relation in doc.findall(".//ProfileRelationship"):
        parse_relation(context, relation, parties)
def crawl(context: Context):
    """Parse the debarment HTML datatable into LegalEntity + Sanction records."""
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    table = doc.find('.//table[@id="datatable-1"]')
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(c.text, "_") for c in row.findall("./th")]
            continue
        values = [collapse_spaces(c.text) for c in row.findall("./td")]
        data = dict(zip(headers, values))
        # The source's "type" column is unreliable — AfDB lists several
        # individuals as firms where the IADB shows them to be people — so
        # everything is emitted as a generic LegalEntity.
        name = data.pop("name")
        country = data.pop("nationality")
        entity = context.make("LegalEntity")
        entity.id = context.make_id(name, country)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", country)
        sanction = h.make_sanction(context, entity)
        sanction.add("reason", data.pop("basis"))
        sanction.add("startDate", parse_date(data.pop("from")))
        sanction.add("endDate", parse_date(data.pop("to")))
        context.emit(entity, target=True)
        context.emit(sanction)
def crawl(context: Context):
    """Parse the Atom-style XML feed and emit one Organization per entry.

    Fixes: a missing ``<link>`` element previously raised AttributeError on
    ``.get()``; a missing ``<summary>`` (findtext → None) raised on
    ``.split()``. Both are now guarded; "N/A" summaries yield no alias.
    """
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for node in doc.findall("./entry"):
        entity = context.make("Organization")
        name = node.findtext("./title")
        entity.id = context.make_slug(node.findtext("./id"), name)
        entity.add("name", name)
        link = node.find("./link")
        if link is not None:
            entity.add("sourceUrl", link.get("href"))
        # Summary holds a comma-separated alias list, or the "N/A" marker.
        aliases = node.findtext("./summary")
        if aliases is not None and aliases != "N/A":
            entity.add("alias", aliases.split(", "))
        entity.add("notes", node.findtext("./content"))
        entity.add("createdAt", node.findtext("./published"))
        entity.add("modifiedAt", node.findtext("./updated"))
        entity.add("topics", "crime.terror")
        context.emit(entity, target=True)
def crawl(context: Context):
    """Parse the XML source: persons keyed by name/birth/IIN, then organizations."""
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    for el in doc.findall(".//person"):
        fname = el.findtext("./fname")
        mname = el.findtext("./mname")
        lname = el.findtext("./lname")
        bdate = el.findtext("./birthdate")
        iin = el.findtext("./iin")
        # The composed name feeds the ID so it stays stable across runs.
        name = h.make_name(given_name=fname, middle_name=mname, last_name=lname)
        person = make_entity(context, el, "Person", context.make_id(name, bdate, iin))
        h.apply_name(person, given_name=fname, middle_name=mname, last_name=lname)
        person.add("innCode", iin)
        # Fall back to the raw string when the date doesn't match FORMATS.
        person.add("birthDate", h.parse_date(bdate, FORMATS, bdate))
        context.emit(person, target=True)
    for el in doc.findall(".//org"):
        name = el.findtext(".//org_name")
        org = make_entity(context, el, "Organization", context.make_id(el.findtext("./num"), name))
        for tag in (".//org_name", ".//org_name_en"):
            names = el.findtext(tag)
            if names is None:
                continue
            org.add("name", names.split("; "))
        context.emit(org, target=True)
def crawl(context: Context):
    """Download the CSV source and hand each row to the row parser."""
    path = context.fetch_resource("source.csv", context.dataset.data.url)
    context.export_resource(path, CSV, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            crawl_row(context, row)
def crawl_organizations(context: Context):
    """Read the organizations XLSX, emit Organization + Sanction records, and
    link organizations that reference each other's internal sequence IDs.

    Fix: the final loop's variable was named ``object``, shadowing the
    builtin; renamed to ``obj``.
    """
    path = context.fetch_resource("organizations.xlsx", ORG_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)
    seq_ids = {}  # internal_seq_id -> entity.id, for resolving links below
    links = []
    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        name_en = record.pop("organization_name_english", None)
        name_he = record.pop("organization_name_hebrew", None)
        entity = context.make("Organization")
        entity.id = context.make_id(name_en, name_he)
        if entity.id is None:
            continue
        if seq_id is not None:
            seq_ids[seq_id] = entity.id
        entity.add("name", name_en)
        entity.add("name", name_he)
        entity.add("topics", "crime.terror")
        entity.add("notes", lang_pick(record, "comments"))
        entity.add("notes", record.pop("column_42", None))
        entity.add("email", record.pop("email", None))
        entity.add("country", record.pop("country_hebrew", None))
        entity.add("country", record.pop("country_english", None))
        entity.add("registrationNumber", record.pop("corporation_id", None))
        entity.add("legalForm", lang_pick(record, "corporation_type"))
        entity.add("jurisdiction", lang_pick(record, "location_of_formation"))
        date = parse_date(record.pop("date_of_corporation", None))
        entity.add("incorporationDate", date)
        # Sweep up the remaining name/phone/website columns by prefix.
        for field in list(record.keys()):
            if field.startswith("organization_name_"):
                entity.add("alias", record.pop(field, None))
            if field.startswith("telephone"):
                entity.add("phone", record.pop(field, None))
            if field.startswith("website"):
                entity.add("website", record.pop(field, None))
        entity.add("phone", record.pop("column_70", None))
        entity.add("website", record.pop("column_73", None))
        sanction = h.make_sanction(context, entity)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("seq_num_in_other_countries", None))
        sanction.add("program", record.pop("designation_type", None))
        sanction.add("reason", lang_pick(record, "designation_justification"))
        sanction.add("authority", lang_pick(record, "designated_by"))
        sanction.add("publisher", record.pop("public_records_references", None))
        # Consumed but unused fields.
        lang_pick(record, "designated_by_abroad")
        record.pop("date_designated_in_other_countries", None)
        linked = record.pop("linked_to_internal_seq_id", "")
        for link in linked.split(";"):
            # Normalise the pair ordering so each link is recorded once.
            # NOTE(review): if seq_id is None here, max/min would raise a
            # TypeError comparing str with None — presumably linked rows
            # always carry a seq_id; confirm against the source data.
            links.append((max(link, seq_id), min(link, seq_id)))
        street = lang_pick(record, "street")
        city = lang_pick(record, "city_village")
        if street or city:
            address = h.make_address(
                context, street=street, city=city, country_code=entity.first("country")
            )
            h.apply_address(context, entity, address)
        for field in (
            "date_of_temporary_designation",
            "date_of_permenant_designation",
            "date_designation_in_west_bank",
        ):
            sanction.add("startDate", parse_date(record.pop(field, None)))
        context.emit(entity, target=True)
        context.emit(sanction)
        # Surface any leftover, unhandled columns for review.
        if len(record):
            context.pprint(record)
    for (subject, obj) in links:
        subject_id = seq_ids.get(subject)
        object_id = seq_ids.get(obj)
        if subject_id is None or object_id is None:
            continue
        link = context.make("UnknownLink")
        link.id = context.make_id(subject_id, object_id)
        link.add("subject", subject_id)
        link.add("object", object_id)
        context.emit(link)
def fetch(context: Context, part: str):
    """Download one JSON part, register it as a resource, and return the parsed data."""
    path = context.fetch_resource("%s.json" % part, URL % part)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as handle:
        return json.load(handle)
def crawl(context: Context):
    """Download the tab-separated source and hand each row to the row parser."""
    path = context.fetch_resource("source.tsv", context.dataset.data.url)
    context.export_resource(path, "text/tsv", title=context.SOURCE_TITLE)
    with open(path, "r") as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t")
        for row in reader:
            parse_row(context, row)