def crawl(context: Context):
    """Fetch the sanctions XML, strip its namespaces, and parse each entity."""
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    tree = h.remove_namespace(tree)
    # One sanctionEntity element per listed subject; parse_entry emits it.
    for sanction_el in tree.findall(".//sanctionEntity"):
        parse_entry(context, sanction_el)
def crawl(context: Context):
    """Download the sanctions feed and hand each target row to parse_row."""
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, XML, title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    tree = h.remove_namespace(tree)
    # make_row flattens the element into the dict shape parse_row expects.
    for target_el in tree.findall(".//FinancialSanctionsTarget"):
        parse_row(context, make_row(target_el))
def crawl(context: Context):
    """Fetch the source XML and dispatch INDIVIDUAL and ENTITY nodes to their parsers."""
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    for individual_el in tree.findall(".//INDIVIDUAL"):
        parse_individual(context, individual_el)
    for entity_el in tree.findall(".//ENTITY"):
        parse_entity(context, entity_el)
def crawl(context: Context):
    """Locate the data file via the index page, then parse persons and legal entities.

    Aborts with an error log entry when the index yields no XML URL.
    """
    data_url = crawl_index(context)
    if data_url is None:
        context.log.error("Could not locate XML file", url=context.dataset.url)
        return
    source_path = context.fetch_resource("source.xml", data_url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    for person_el in tree.findall(".//KyrgyzPhysicPerson"):
        parse_person(context, person_el)
    for legal_el in tree.findall(".//KyrgyzLegalPerson"):
        parse_legal(context, legal_el)
def crawl(context: Context):
    """Fetch the source XML and convert each Table row into a field dict.

    Fields whose text is the literal "NA" are treated as absent and skipped;
    each completed dict is handed to crawl_row.
    """
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    for row in doc.findall(".//Table"):
        data = {}
        # Iterate the element directly: Element.getchildren() is deprecated
        # and was removed from xml.etree in Python 3.9.
        for field in row:
            value = field.text
            if value == "NA":
                continue
            data[field.tag] = value
        crawl_row(context, data)
def crawl(context: Context):
    """List travel-ban files from the JSON index and parse every XML edition.

    Entries without a subjectType element cannot be parsed normally and are
    routed to salvage_entity instead.
    """
    index = context.fetch_json(context.dataset.data.url)
    # Default to empty containers so a missing or partial index does not
    # raise a TypeError when iterating / testing the file name.
    bans = index.get("data", {}).get("travelBansFiles") or []
    for ban in bans:
        file_name = ban.get("fileName") or ""
        if not file_name.endswith(".xml"):
            continue
        data_url = URL % ban.get("id")
        path = context.fetch_resource("source.xml", data_url)
        context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
        doc = context.parse_resource_xml(path)
        doc = h.remove_namespace(doc)
        for entry in doc.findall(".//sanctionEntity"):
            subject_type = entry.find("./subjectType")
            if subject_type is None:
                salvage_entity(context, entry)
                continue
            parse_entry(context, entry)
def crawl(context: Context):
    """Fetch the sanctions XML and parse every top-level target.

    Two lookup tables are built first: English program names keyed by the
    ssid of their sanctions-set, and parsed addresses keyed by place ssid.
    The document root's date attribute is passed through as the update stamp.
    """
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    updated_at = tree.getroot().get("date")
    program_names = {}
    for program_el in tree.findall(".//sanctions-program"):
        set_ssid = program_el.find("./sanctions-set").get("ssid")
        program_names[set_ssid] = program_el.findtext('./program-name[@lang="eng"]')
    address_index = {
        place_el.get("ssid"): parse_address(place_el)
        for place_el in tree.findall(".//place")
    }
    for target_el in tree.findall("./target"):
        parse_entry(context, target_el, program_names, address_index, updated_at)
def crawl(context: Context):
    """Fetch the sanctions XML and emit parties, entries, and relationships.

    Reference values, locations, and ID documents are loaded up front so the
    party parser can resolve them; parsed parties are indexed by id for the
    entry and relationship passes that follow.
    """
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    tree = h.remove_namespace(tree)
    context.log.info("Loading reference values...")
    load_ref_values(tree)
    context.log.info("Loading locations...")
    locations = load_locations(context, tree)
    context.log.info("Loading ID reg documents...")
    documents = load_documents(tree)
    parties = {}
    for party_el in tree.findall(".//DistinctParty"):
        party = parse_party(context, party_el, locations, documents)
        parties[party.id] = party
    for entry_el in tree.findall(".//SanctionsEntry"):
        parse_entry(context, entry_el, parties)
    for relation_el in tree.findall(".//ProfileRelationship"):
        parse_relation(context, relation_el, parties)
def crawl(context: Context):
    """Fetch the feed XML and emit one Organization per entry element.

    The summary field carries comma-separated aliases, with the literal
    "N/A" standing in for "no aliases".
    """
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for node in doc.findall("./entry"):
        entity = context.make("Organization")
        name = node.findtext("./title")
        entity.id = context.make_slug(node.findtext("./id"), name)
        entity.add("name", name)
        # Guard against a missing <link> element instead of crashing on .get.
        link_el = node.find("./link")
        if link_el is not None:
            entity.add("sourceUrl", link_el.get("href"))
        aliases = node.findtext("./summary")
        # findtext may return None; only split and add real alias strings.
        if aliases is not None and aliases != "N/A":
            entity.add("alias", aliases.split(", "))
        entity.add("notes", node.findtext("./content"))
        entity.add("createdAt", node.findtext("./published"))
        entity.add("modifiedAt", node.findtext("./updated"))
        entity.add("topics", "crime.terror")
        context.emit(entity, target=True)
def crawl(context: Context):
    """Fetch the source XML and emit Person and Organization records.

    Person ids are derived from the assembled name, birth date, and IIN;
    organization ids from the registry number and primary name. Organization
    names may hold several values separated by "; ".
    """
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, XML, title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    for person_el in tree.findall(".//person"):
        fname = person_el.findtext("./fname")
        mname = person_el.findtext("./mname")
        lname = person_el.findtext("./lname")
        bdate = person_el.findtext("./birthdate")
        iin = person_el.findtext("./iin")
        full_name = h.make_name(given_name=fname, middle_name=mname, last_name=lname)
        person_id = context.make_id(full_name, bdate, iin)
        person = make_entity(context, person_el, "Person", person_id)
        h.apply_name(person, given_name=fname, middle_name=mname, last_name=lname)
        person.add("innCode", iin)
        person.add("birthDate", h.parse_date(bdate, FORMATS, bdate))
        context.emit(person, target=True)
    for org_el in tree.findall(".//org"):
        primary_name = org_el.findtext(".//org_name")
        org_id = context.make_id(org_el.findtext("./num"), primary_name)
        org = make_entity(context, org_el, "Organization", org_id)
        for name_tag in (".//org_name", ".//org_name_en"):
            raw_names = org_el.findtext(name_tag)
            if raw_names is None:
                continue
            org.add("name", raw_names.split("; "))
        context.emit(org, target=True)
def crawl(context: Context):
    """Fetch the members XML and hand each mep element to crawl_node."""
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    for mep_el in tree.findall(".//mep"):
        crawl_node(context, mep_el)
def crawl(context: Context):
    """Fetch the source XML and parse each listed entry."""
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, "text/xml", title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    # NOTE(review): the tag "acount-list" looks misspelled, but it must match
    # the feed's actual element name — confirm against the source before changing.
    for entry_el in tree.findall(".//acount-list"):
        parse_entry(context, entry_el)
def crawl(context: Context):
    """Fetch the source XML and parse every record element."""
    source_path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(source_path, XML, title=context.SOURCE_TITLE)
    tree = context.parse_resource_xml(source_path)
    for record_el in tree.findall(".//record"):
        parse_entry(context, record_el)