def parse(context, data):
    urls = context.params.get('urls')
    for url in urls:
        res = context.http.get(url)
        doc = res.xml
        remove_namespace(doc, 'http://tempuri.org/sdnList.xsd')
        updated_at = doc.findtext('.//Publish_Date')
        updated_at = stringify(parse_date(updated_at, format_hint='%m/%d/%Y'))
        for entry in doc.findall('.//sdnEntry'):
            parse_entry(context, entry, url, updated_at)
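
# For context: `remove_namespace` is not shown above. A minimal sketch of
# what such a helper plausibly does -- stripping the xmlns from every tag
# in place so the bare-name XPath queries above can match. This is an
# assumption, not the original implementation.
from lxml import etree

def remove_namespace(doc, namespace):
    prefix = "{%s}" % namespace
    for el in doc.iter():
        # lxml uses Clark notation ("{uri}tag"); drop the URI part.
        if isinstance(el.tag, str) and el.tag.startswith(prefix):
            el.tag = el.tag[len(prefix):]
    etree.cleanup_namespaces(doc)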

def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for entry in doc.findall(".//sanctionEntity"):
        parse_entry(context, entry)
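
# `parse_entry` is defined elsewhere in the crawler. A hypothetical sketch
# of the pattern, assuming an EU-style schema where <subjectType> carries a
# classification code and <nameAlias> a wholeName attribute; the element and
# attribute names here are assumptions, not the original code.
def parse_entry(context: Context, entry):
    subject_type = entry.find("./subjectType")
    schema = "Person" if subject_type.get("code") == "person" else "LegalEntity"
    entity = context.make(schema)
    entity.id = context.make_slug(entry.get("logicalId"))
    for alias in entry.findall("./nameAlias"):
        entity.add("name", alias.get("wholeName"))
    entity.add("topics", "sanction")
    context.emit(entity, target=True)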

def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for el in doc.findall(".//FinancialSanctionsTarget"):
        parse_row(context, make_row(el))
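
# `make_row` is not shown. A plausible minimal version, assuming each
# <FinancialSanctionsTarget> child element holds a flat text value
# (illustrative only, not the original helper):
def make_row(el):
    row = {}
    for child in el:
        # Flatten children into a {tag: text} mapping, dropping blanks.
        text = child.text.strip() if child.text else None
        if text:
            row[child.tag] = text
    return row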

def crawl(context: Context):
    path = context.fetch_resource("source.zip", context.dataset.data.url)
    context.export_resource(path, "application/zip", title=context.SOURCE_TITLE)
    with ZipFile(path, "r") as archive:
        for name in archive.namelist():
            if name.endswith(".xml"):
                with archive.open(name) as fh:
                    doc = etree.parse(fh)
                    doc = h.remove_namespace(doc)
                    for entry in doc.findall(".//sanctionEntity"):
                        parse_entry(context, entry)

def crawl(context: Context):
    data = context.fetch_json(context.dataset.data.url)
    for ban in data.get("data", {}).get("travelBansFiles", []):
        if not ban.get("fileName", "").endswith(".xml"):
            continue
        data_url = URL % ban.get("id")
        path = context.fetch_resource("source.xml", data_url)
        context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
        doc = context.parse_resource_xml(path)
        doc = h.remove_namespace(doc)
        for entry in doc.findall(".//sanctionEntity"):
            subject_type = entry.find("./subjectType")
            if subject_type is None:
                salvage_entity(context, entry)
                continue
            parse_entry(context, entry)
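
# `salvage_entity` handles entries that lack a <subjectType> and so cannot
# be mapped to a concrete schema. A hypothetical fallback mirroring the
# parse_entry sketch above, minus the type dispatch: emit an untyped
# LegalEntity so the record is not silently dropped (attribute names are
# assumptions about the feed's schema).
def salvage_entity(context: Context, entry):
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(entry.get("logicalId"))
    for alias in entry.findall("./nameAlias"):
        entity.add("name", alias.get("wholeName"))
    context.emit(entity)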

def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    context.log.info("Loading reference values...")
    load_ref_values(doc)
    context.log.info("Loading locations...")
    locations = load_locations(context, doc)
    context.log.info("Loading ID reg documents...")
    documents = load_documents(doc)
    parties = {}
    for distinct_party in doc.findall(".//DistinctParty"):
        party = parse_party(context, distinct_party, locations, documents)
        parties[party.id] = party
    for entry in doc.findall(".//SanctionsEntry"):
        parse_entry(context, entry, parties)
    for relation in doc.findall(".//ProfileRelationship"):
        parse_relation(context, relation, parties)
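
# The three passes above are ordered deliberately: every DistinctParty must
# be materialized into the `parties` index before entries and relationships
# can resolve references to it. A hypothetical sketch of that lookup pattern
# in `parse_relation` (the attribute names and schema choice are assumptions,
# not the original code):
def parse_relation(context: Context, relation, parties):
    from_party = parties.get(relation.get("From-ProfileID"))
    to_party = parties.get(relation.get("To-ProfileID"))
    if from_party is None or to_party is None:
        # One endpoint was filtered out or never parsed; skip the link.
        return
    rel = context.make("UnknownLink")
    rel.id = context.make_id("relation", relation.get("ID"))
    rel.add("subject", from_party.id)
    rel.add("object", to_party.id)
    context.emit(rel)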

def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for node in doc.findall("./entry"):
        entity = context.make("Organization")
        name = node.findtext("./title")
        entity.id = context.make_slug(node.findtext("./id"), name)
        entity.add("name", name)
        link = node.find("./link").get("href")
        entity.add("sourceUrl", link)
        aliases = node.findtext("./summary")
        if aliases != "N/A":
            # Only record aliases when the feed provides real values.
            entity.add("alias", aliases.split(", "))
        entity.add("notes", node.findtext("./content"))
        entity.add("createdAt", node.findtext("./published"))
        entity.add("modifiedAt", node.findtext("./updated"))
        entity.add("topics", "crime.terror")
        context.emit(entity, target=True)
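
# A note on the alias handling above: FollowTheMoney entity proxies accept
# either a single value or a list in add(), and duplicate values collapse,
# which is why the comma-split list can be passed straight through. A
# standalone illustration using the followthemoney library directly:
from followthemoney import model

entity = model.make_entity("Organization")
entity.id = "demo-org"
entity.add("alias", ["Example Front", "Example Shell"])
entity.add("alias", "Example Front")  # duplicate is ignored
assert sorted(entity.get("alias")) == ["Example Front", "Example Shell"]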