Ejemplo n.º 1
0
def crawl_legislature(context: Context, country, legislature):
    """Import one EveryPolitician legislature: its people, organizations
    and events, then wire them together through memberships."""
    # NOTE(review): datetime.utcfromtimestamp is deprecated since 3.12;
    # kept as-is because downstream code may expect a naive datetime.
    timestamp = int(legislature.get("lastmod"))
    lastmod = datetime.utcfromtimestamp(timestamp)

    popolo_url = legislature.get("popolo_url")
    # this isn't being updated, hence long interval:
    data = context.fetch_json(popolo_url, cache_days=30)

    # Map source person IDs to whatever parse_person returns for them.
    persons: Dict[str, Optional[str]] = {
        raw.get("id"): parse_person(context, raw, country, lastmod)
        for raw in data.pop("persons", [])
    }

    organizations: Dict[str, Optional[str]] = {}
    for raw_org in data.pop("organizations", []):
        raw_id = raw_org.pop("id", None)
        mapped_id = context.lookup_value("org_id", raw_id, raw_id)
        if mapped_id is None:
            continue

        # Fall back to the sort name when no display name is present.
        fallback = raw_org.pop("sort_name", None)
        organizations[mapped_id] = raw_org.pop("name", fallback)

    events = {ev.get("id"): ev for ev in data.pop("events", [])}

    for membership in data.pop("memberships", []):
        parse_membership(context, membership, persons, organizations, events)
Ejemplo n.º 2
0
def crawl(context: Context):
    """Fetch the governments index and crawl each country's page."""
    params = {"_": settings.RUN_DATE}
    response = context.fetch_json(context.dataset.data.url, params=params)
    payload = response.get("result", {}).get("data", {})
    edges = payload.get("governments", {}).get("edges", [])
    for edge in edges:
        node = edge.get("node", {})
        crawl_country(context, params, node.get("path"), node.get("title"))
Ejemplo n.º 3
0
def crawl(context: Context):
    """Crawl the World Bank procurement debarment list.

    Emits a LegalEntity, an address and a Sanction for each debarred
    supplier in the JSON feed.
    """
    url = context.dataset.data.url
    headers = {"apikey": context.dataset.data.api_key}
    data = context.fetch_json(url, headers=headers)
    # TODO write this out to a source.json
    # BUG FIX: the loop variable used to shadow ``data`` itself; use a
    # distinct name so the response stays readable and accessible.
    for row in data["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        name = row.get("SUPP_NAME")
        ent_id = row.get("SUPP_ID")
        entity.id = context.make_slug(ent_id)
        names = clean_name(name)
        if not names:
            # BUG FIX: names[0] raised IndexError when cleaning removed
            # everything; skip the row instead of crashing the crawl.
            context.log.warning("No usable name", id=ent_id, name=name)
            continue
        entity.add("name", names[0])
        for alias in names[1:]:
            entity.add("alias", alias)
        entity.add("topics", "debarment")
        entity.add("country", row.get("COUNTRY_NAME"))

        address = h.make_address(
            context,
            street=row.get("SUPP_ADDR"),
            city=row.get("SUPP_CITY"),
            country=row.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", row.get("DEBAR_REASON"))
        sanction.add("startDate", h.parse_date(row.get("DEBAR_FROM_DATE"), FORMATS))
        sanction.add("endDate", h.parse_date(row.get("DEBAR_TO_DATE"), FORMATS))
        context.emit(entity, target=True)
        context.emit(sanction)
Ejemplo n.º 4
0
def crawl(context: Context):
    """Crawl the countries index and descend into each legislature."""
    data = context.fetch_json(context.dataset.data.url)

    for country in data:
        # BUG FIX: the country code was re-derived inside the inner loop
        # (loop-invariant) and .lower() raised AttributeError when the
        # "code" key was missing.
        code = country.get("code")
        if code is None:
            continue
        code = code.lower()
        for legislature in country.get("legislatures", []):
            context.log.info("Country: %s" % code)
            crawl_legislature(context, code, legislature)
Ejemplo n.º 5
0
def http_get(context: Context, url, params=None, cache_days=None):
    """Fetch *url* as JSON using the shared credentials.

    Raises AbortCrawl on rate-limit/range errors (429, 416); for any
    other HTTP error, logs it and returns None (best effort).
    """
    try:
        return context.fetch_json(
            url, params=params, auth=AUTH, cache_days=cache_days
        )
    except HTTPError as err:
        status = err.response.status_code
        if status in (429, 416):
            raise AbortCrawl()
        context.log.info("HTTP error: %r", err)
Ejemplo n.º 6
0
def crawl(context: Context):
    """Crawl the travel-ban XML file listing and parse every sanction entry."""
    data = context.fetch_json(context.dataset.data.url)
    # BUG FIX: default to [] — a missing "travelBansFiles" key used to
    # make the for-loop raise TypeError on None.
    for ban in data.get("data", {}).get("travelBansFiles", []):
        file_name = ban.get("fileName")
        # BUG FIX: guard None before .endswith (AttributeError otherwise).
        if file_name is None or not file_name.endswith(".xml"):
            continue
        data_url = URL % ban.get("id")
        path = context.fetch_resource("source.xml", data_url)
        context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
        doc = context.parse_resource_xml(path)
        doc = h.remove_namespace(doc)
        for entry in doc.findall(".//sanctionEntity"):
            subject_type = entry.find("./subjectType")
            if subject_type is None:
                # Entries without a subject type still carry usable data.
                salvage_entity(context, entry)
                continue
            parse_entry(context, entry)
Ejemplo n.º 7
0
def crawl_country(context: Context, country, age_max=120, age_min=0):
    """Crawl notices for one nationality, bisecting the age range
    recursively whenever a query would exceed the page-size cap.

    :param country: nationality code to query.
    :param age_max: inclusive upper bound of the age filter.
    :param age_min: inclusive lower bound of the age filter.
    """
    params = {
        "ageMin": int(age_min),
        "ageMax": int(age_max),
        "nationality": country,
        "resultPerPage": MAX_RESULTS,
    }
    try:
        data = context.fetch_json(context.dataset.data.url, params=params)
    except HTTPError as err:
        context.log.warning(
            "HTTP error",
            url=str(err.request.url),
            country=country,
            error=err.response.status_code,
        )
        return
    notices = data.get("_embedded", {}).get("notices", [])
    for notice in notices:
        crawl_notice(context, notice)
    # BUG FIX: "total" may be missing/None, which made the comparison
    # below raise TypeError; coerce to 0 in that case.
    total = data.get("total") or 0
    if total > MAX_RESULTS:
        # Too many hits for one page: split the age range and recurse on
        # both halves until each query fits within MAX_RESULTS.
        age_range = age_max - age_min
        if age_range > 1:
            age_split = age_min + (age_range // 2)
            crawl_country(context, country, age_max, age_split)
            crawl_country(context, country, age_split, age_min)
        elif age_range == 1:
            crawl_country(context, country, age_max, age_max)
            crawl_country(context, country, age_min, age_min)
Ejemplo n.º 8
0
def crawl_country(context: Context, params, path, country):
    """Crawl one country's page and emit a PEP Person per office-holder.

    The page body is CMS HTML where <h3> headings name the function and
    the following elements carry the incumbents' names.
    """
    source_url = UI_URL % path
    context.log.debug("Crawling country: %s" % country)
    res = context.fetch_json(DATA_URL % path, params=params)
    data = res.get("result", {}).get("data", {}).get("page", {})
    blocks = data.get("acf", {}).get("blocks", [{}])[0]
    # BUG FIX: the fallback default must be a dict — a list has no .get,
    # so a missing "free_form_content" key raised AttributeError.
    content = blocks.get("free_form_content", {}).get("content")
    if content is None:
        # Nothing to parse; html.fromstring(None) would crash.
        context.log.warning("No page content", country=country, url=source_url)
        return
    doc = html.fromstring(content)
    function = None
    for i, el in enumerate(doc.getchildren()):
        text = el.text_content().strip()
        if el.tag == "h2":
            continue
        if el.tag == "h3":
            # Section heading names the office held by the people below it.
            function = text
            continue
        if i == 0 and el.tag == "p":
            # this paragraph at the start is a note, not a person
            continue
        name = text.replace("(Acting)", "")
        if is_empty(name):
            continue
        context.log.debug(
            "Person",
            country=country,
            name=name,
            function=function,
            url=source_url,
        )
        person = context.make("Person")
        person.id = context.make_slug(country, name, function)
        person.add("name", name)
        person.add("country", country)
        person.add("position", function)
        person.add("sourceUrl", source_url)
        person.add("topics", "role.pep")
        context.emit(person, target=True)
Ejemplo n.º 9
0
def crawl(context: Context):
    """Crawl the IADB sanctioned firms and individuals listing.

    Pages through the JSON endpoint, emitting an entity plus a sanction
    per row, and stops when the last page (lowest row ID == 1) or an
    empty page is reached.
    """
    for page in count(1):
        url = str(context.dataset.data.url)
        url = url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        headers = {
            "Accept": "application/json",
            "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
        }
        page_data = context.fetch_json(url, headers=headers)
        ids = []
        for row in page_data:
            # The feed uses the literal string "N/A" as a null marker.
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""
            row_id = row.pop("id")
            ids.append(row_id)
            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue
            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))

            # Companies get a jurisdiction, natural persons a nationality.
            nat = "jurisdiction" if schema == "Company" else "nationality"
            entity.add(nat, parse_countries(row.pop("nationality")))

            affiliated = row.pop("affiliatedWithEntityId")
            if len(affiliated):
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)

            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS))
            sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS))

            context.emit(sanction)
            context.emit(entity, target=True)

        # BUG FIX: min(ids) raised ValueError on an empty result page;
        # treat an empty page as the end of the listing.
        if not ids or min(ids) == 1:
            return