Example #1
0
def parse_reference(context: Context, reference: int, rows):
    """Build and emit one entity, with its sanction, from the rows of a reference.

    Every row is consumed destructively via ``pop``. ``rows`` is iterated
    three times, so it must be a re-iterable sequence of mutable mappings.

    Args:
        context: Crawler context used for lookups, logging, entity creation
            and emission.
        reference: Reference number; combined with the primary name to form
            the entity ID.
        rows: All rows that belong to ``reference``.

    Returns:
        None. Nothing is emitted when any row carries an entity type or a
        name type that the lookups do not know.

    Raises:
        AssertionError: If the rows do not resolve to exactly one schema.
    """
    # Pass 1: all rows of one reference must agree on a single schema.
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Pass 2: collect names; the one mapped to "name" feeds the entity ID.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    # Pass 3: the remaining attributes of every row go onto entity/sanction.
    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            # TODO: consider parsing if it's not a datetime?
            sanction.add("summary", listing_info)

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Example #2
0
def crawl_legislature(context: Context, country, legislature):
    """Fetch one legislature's Popolo JSON and parse its persons and memberships.

    Args:
        context: Crawler context used for fetching and lookups.
        country: Passed through unchanged to ``parse_person``.
        legislature: Mapping providing ``lastmod`` (epoch seconds) and
            ``popolo_url``.
    """
    modified_at = datetime.utcfromtimestamp(int(legislature.get("lastmod")))

    # this isn't being updated, hence long interval:
    popolo = context.fetch_json(legislature.get("popolo_url"), cache_days=30)

    # Person ID -> whatever parse_person returns for that person.
    persons: Dict[str, Optional[str]] = {}
    for person in popolo.pop("persons", []):
        person_key = person.get("id")
        persons[person_key] = parse_person(context, person, country, modified_at)

    # Organization ID (after the "org_id" lookup override) -> display name.
    organizations: Dict[str, Optional[str]] = {}
    for org in popolo.pop("organizations", []):
        raw_id = org.pop("id", None)
        org_id = context.lookup_value("org_id", raw_id, raw_id)
        if org_id is not None:
            # "sort_name" is always consumed; it is only used as a fallback.
            sort_name = org.pop("sort_name", None)
            organizations[org_id] = org.pop("name", sort_name)

    events = {}
    for event in popolo.pop("events", []):
        events[event.get("id")] = event

    for membership in popolo.pop("memberships", []):
        parse_membership(context, membership, persons, organizations, events)
Example #3
0
def crawl(context: Context):
    """Download the source workbook and emit one record per data row.

    For every sheet, the first row supplies the section label. Rows are then
    scanned until a header row (first cell containing "告示日付") is found;
    every row after that point is mapped through the header names and handed
    to ``emit_row``.
    """
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)

    workbook = xlrd.open_workbook(path)
    for sheet in workbook.sheets():
        headers = None
        first_row = [h.convert_excel_cell(workbook, c) for c in sheet.row(0)]
        section = collapse_spaces(
            " / ".join(c for c in first_row if c is not None)
        )
        for index in range(1, sheet.nrows):
            row = [h.convert_excel_cell(workbook, c) for c in sheet.row(index)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    # Drop empty fragments and the "不明" placeholder.
                    data[header] = [
                        value
                        for value in multi_split(stringify(cell), SPLITS)
                        if value is not None and value != "不明"
                    ]
                emit_row(context, sheet.name, section, data)

            if not row or row[0] is None:
                continue
            # the first column of the common headers:
            if "告示日付" not in row[0].strip():
                continue

            if headers is not None:
                context.log.error("Found double header?", row=row)
            headers = []
            for cell in row:
                title = collapse_spaces(cell)
                header = context.lookup_value("columns", title)
                if header is None:
                    context.log.warning(
                        "Unknown column title",
                        column=title,
                        sheet=sheet.name,
                    )
                # Unknown columns stay in the list as None so that positions
                # keep lining up with the data cells.
                headers.append(header)
Example #4
0
def crawl(context: Context):
    """Page through the JSON listing and emit an entity and sanction per row.

    Pages are requested by substituting the page number into the dataset URL.
    Crawling stops once a page contains the row with ID 1, or when a page
    comes back empty.

    Args:
        context: Crawler context providing the dataset URL, HTTP session,
            lookups, logging and emission.
    """
    # Both are loop-invariant, so build them once.
    base_url = str(context.dataset.data.url)
    headers = {
        "Accept": "application/json",
        "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
    }
    for page in count(1):
        url = base_url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        res = context.http.get(url, headers=headers)
        ids = []
        for row in res.json():
            # Normalise the "N/A" placeholder to an empty string.
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""
            row_id = row.pop("id")
            ids.append(row_id)
            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue
            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))

            # Companies carry a jurisdiction rather than a nationality.
            nat = "jurisdiction" if schema == "Company" else "nationality"
            entity.add(nat, parse_countries(row.pop("nationality")))

            affiliated = row.pop("affiliatedWithEntityId")
            # Truthiness also tolerates a missing (None) value, which
            # ``len()`` would reject.
            if affiliated:
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)

            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS))
            sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS))

            context.emit(sanction)
            context.emit(entity, target=True)

        # An empty page means there is nothing more to fetch; calling
        # ``min()`` on it would raise ValueError.
        if not ids or min(ids) == 1:
            return
Example #5
0
def emit_row(
    context: Context, sheet: str, section: str, row: Dict[str, List[str]]
):
    """Turn one parsed spreadsheet row into an entity and its sanction.

    Args:
        context: Crawler context used for lookups, logging, entity creation
            and emission.
        sheet: Sheet name; only used in the warning for an unknown section.
        section: Section label; selects the schema and is recorded as the
            sanction program.
        row: Column name -> list of cell values. Consumed destructively.

    Returns:
        None. Nothing is emitted when the section has no schema or when no
        ID can be derived from the names.
    """
    schema = context.lookup_value("schema", section)
    if schema is None:
        context.log.warning("No schema for section",
                            section=section,
                            sheet=sheet)
        return
    entity = context.make(schema)

    # A sheet may lack one of the name columns; treat a missing column like
    # an empty one instead of failing on ``*None`` or a KeyError.
    names_english = row.get("name_english") or []
    names_japanese = row.get("name_japanese") or []
    entity.id = context.make_id(*names_english, *names_japanese)
    if entity.id is None:
        return
    row.pop("name_english", None)
    row.pop("name_japanese", None)

    # The Japanese name becomes the primary name only if no English name
    # was usable; otherwise it is stored as an alias.
    entity.add("name", parse_names(names_english))
    if not entity.has("name"):
        entity.add("name", parse_names(names_japanese))
    else:
        entity.add("alias", parse_names(names_japanese))

    entity.add("alias", parse_names(row.pop("alias", [])))
    entity.add("alias", parse_names(row.pop("known_alias", [])))
    entity.add("weakAlias", parse_names(row.pop("weak_alias", [])))
    entity.add("weakAlias", parse_names(row.pop("nickname", [])))
    entity.add("previousName", parse_names(row.pop("past_alias", [])))
    entity.add("previousName", parse_names(row.pop("old_name", [])))
    entity.add_cast("Person", "position", row.pop("position", []))
    birth_date = parse_date(row.pop("birth_date", []))
    entity.add_cast("Person", "birthDate", birth_date)
    entity.add_cast("Person", "birthPlace", row.pop("birth_place", []))
    entity.add_cast("Person", "passportNumber", row.pop("passport_number", []))
    entity.add("idNumber", row.pop("id_number", []))
    entity.add("idNumber", row.pop("identification_number", []))
    entity.add("notes", row.pop("other_information", []))
    entity.add("notes", row.pop("details", []))
    entity.add("phone", row.pop("phone", []))
    entity.add("phone", row.pop("fax", []))

    for address_full in row.pop("address", []):
        address = h.make_address(context, full=address_full)
        h.apply_address(context, entity, address)

    for address_full in row.pop("where", []):
        address = h.make_address(context, full=address_full)
        h.apply_address(context, entity, address)

    # "title" is only a proper title for persons; keep it as a note otherwise.
    title = row.pop("title", [])
    if entity.schema.is_a("Person"):
        entity.add("title", title)
    else:
        entity.add("notes", title)
    entity.add("country", row.pop("citizenship", []))
    entity.add("country", row.pop("activity_area", []))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", section)
    sanction.add("reason", row.pop("root_nomination", None))
    sanction.add("reason", row.pop("reason_res1483", None))
    sanction.add("recordId", row.pop("notification_number", None))

    sanction.add("startDate", parse_date(row.pop("notification_date", [])))
    sanction.add("startDate", parse_date(row.pop("designated_date", [])))
    sanction.add("listingDate", parse_date(row.pop("publication_date", [])))

    # Consumed but deliberately not stored.
    row.pop("designated_un", None)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Example #6
0
def crawl_persons(context: Context):
    """Emit a Person entity, plus identity documents, for every "personas" record.

    Each record is consumed destructively; whatever is left over is handed
    to ``h.audit_data`` so that unhandled fields are reported.
    """
    # (property, source key) pairs, in the order in which they are consumed.
    multi_fields = (
        ("title", "TITLE"),
        ("nationality", "NATIONALITY"),
        ("position", "DESIGNATION"),
    )
    birth_fields = (
        ("birthDate", "DATE_OF_BIRTH"),
        ("birthDate", "YEAR"),
        ("birthPlace", "CITY_OF_BIRTH"),
        ("country", "COUNTRY_OF_BIRTH"),
    )

    for data in fetch(context, "personas"):
        entity = crawl_common(context, data, "personas", "Person")
        for prop, key in multi_fields:
            entity.add(prop, values(data.pop(key, None)))
        entity.add("gender", data.pop("GENDER", None))
        for prop, key in birth_fields:
            entity.add(prop, data.pop(key, None))

        for dob in data.pop("INDIVIDUAL_DATE_OF_BIRTH", []):
            # NOTE(review): TYPE_OF_DATE is run through parse_date and added
            # as a birth date too -- confirm this is intended.
            for key in ("DATE", "TYPE_OF_DATE"):
                entity.add("birthDate", parse_date(dob.pop(key, None)))
            for key in ("YEAR", "FROM_YEAR", "TO_YEAR"):
                entity.add("birthDate", dob.pop(key, None))
            h.audit_data(dob, ignore=["NOTE"])

        for doc in data.pop("INDIVIDUAL_DOCUMENT", []):
            doc_type = doc.pop("TYPE_OF_DOCUMENT", None)
            doc_number = doc.pop("NUMBER", None)
            doc_schema = context.lookup_value("doc_types", doc_type)
            if doc_schema is None:
                context.log.warning("Unknown document type", type=doc_type)
                continue
            document = context.make(doc_schema)
            document.id = context.make_id("ID", entity.id, doc_number)
            document.add("holder", entity)
            document.add("type", doc_type)
            document.add("number", doc_number)
            document.add("type", doc.pop("TYPE_OF_DOCUMENT2", None))
            issued = parse_date(doc.pop("DATE_OF_ISSUE", None))
            document.add("startDate", issued)
            for key in ("ISSUING_COUNTRY", "COUNTRY_OF_ISSUE"):
                document.add("country", doc.pop(key, None))
            document.add("summary", doc.pop("NOTE", None))
            context.emit(document)
            h.audit_data(doc, ignore=["CITY_OF_ISSUE"])

        for addr in data.pop("INDIVIDUAL_ADDRESS", []):
            h.apply_address(context, entity, parse_address(context, addr))

        for addr in data.pop("INDIVIDUAL_PLACE_OF_BIRTH", []):
            birth_address = parse_address(context, addr)
            if birth_address is None:
                continue
            entity.add("birthPlace", birth_address.get("full"))
            entity.add("country", birth_address.get("country"))

        for alias in data.pop("INDIVIDUAL_ALIAS", []):
            # Aliases may carry their own birth details; fold them in first.
            for prop, key in birth_fields:
                entity.add(prop, alias.pop(key, None))
            parse_alias(entity, alias)

        h.audit_data(data, ["VERSIONNUM"])
        context.emit(entity, target=True)
Example #7
0
def crawl_person(context: Context, data: Dict[str, Any]):
    """Emit a Person entity together with its related persons and relations.

    Args:
        context: Crawler context used for lookups, logging, entity creation
            and emission.
        data: Raw person record. Consumed destructively; keys that are not
            mapped are popped and discarded at the end.

    Raises:
        KeyError: If a key that is popped without a default (e.g. ``id``,
            or ``person_id`` on a related person) is missing.
    """
    is_pep = data.pop("is_pep", False)
    entity = context.make("Person", target=is_pep)
    wikidata_id = clean_wdid(data.pop("wikidata_id", None))
    entity.id = person_id(context, data.pop("id"), wikidata_id)
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)
    entity.add("modifiedAt", data.pop("last_change", None))
    entity.add("wikidataId", wikidata_id)
    entity.add("name", data.pop("full_name_en", None))
    entity.add("name", data.pop("full_name_ru", None))
    entity.add("alias", data.pop("inversed_full_name_en", None))
    entity.add("alias", data.pop("inversed_full_name_ru", None))
    entity.add("alias", data.pop("also_known_as_en", None))
    entity.add("alias", data.pop("also_known_as_ru", None))
    entity.add("alias", split_names(data.pop("names", [])))
    entity.add("birthDate", parse_date(data.pop("date_of_birth", None)))
    entity.add("deathDate", parse_date(data.pop("termination_date_human", None)))
    entity.add("birthPlace", data.pop("city_of_birth_ru", None))
    entity.add("birthPlace", data.pop("city_of_birth_en", None))
    entity.add("innCode", data.pop("inn", None))
    entity.add("firstName", data.pop("first_name_en", None))
    entity.add("firstName", data.pop("first_name_ru", None))
    entity.add("fatherName", data.pop("patronymic_en", None))
    entity.add("fatherName", data.pop("patronymic_ru", None))
    entity.add("lastName", data.pop("last_name_en", None))
    entity.add("lastName", data.pop("last_name_ru", None))

    # A position is "<workplace> (<job title>)"; it is only recorded when
    # the workplace is non-blank.
    for suffix in ("", "_en", "_ru"):
        role = data.pop(f"last_job_title{suffix}", None)
        org = data.pop(f"last_workplace{suffix}", None)
        if org is None or not org.strip():
            continue
        position = org
        if role is not None and role.strip():
            position = f"{org} ({role})"
        entity.add("position", position)

    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru")
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warning(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        # A known link type without a property is skipped on purpose.
        if res.prop is not None:
            entity.add(res.prop, country_name)

    for rel_data in data.pop("related_persons", []):
        other_pep = rel_data.pop("is_pep", False)
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id"))
        other = context.make("Person", target=other_pep)
        other.id = person_id(context, rel_data.pop("person_id"), other_wdid)
        other.add("name", rel_data.pop("person_en", None))
        other.add("name", rel_data.pop("person_ru", None))
        other.add("wikidataId", other_wdid)

        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown person/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other,
            )
            continue

        # Sort the two IDs so that the relation ID does not depend on which
        # side of the pair is being crawled.
        id_a, id_b = sorted((entity.id, other.id))
        rel = context.make(res.schema)
        id_a_short = short_id(context, id_a)
        id_b_short = short_id(context, id_b)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, id_a)
        rel.add(res.to_prop, id_b)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed")))
        rel.add("startDate", parse_date(rel_data.pop("date_established")))
        rel.add("endDate", parse_date(rel_data.pop("date_finished")))

        context.emit(other, target=other_pep)
        context.emit(rel)

    data.pop("type_of_official_ru", None)
    person_type = data.pop("type_of_official_en", None)
    person_topic = context.lookup_value("person_type", person_type)
    if person_topic is None:
        context.log.warning("Unknown type of official", type=person_type)
    entity.add("topics", person_topic)
    if is_pep:
        entity.add("topics", "role.pep")
    entity.add("status", person_type)

    # Fields that are consumed but intentionally not mapped.
    data.pop("died", None)
    data.pop("tags", None)
    data.pop("reason_of_termination_en", None)
    data.pop("reason_of_termination_ru", None)
    # TODO: store images
    data.pop("photo", None)
    data.pop("related_companies", None)
    data.pop("declarations", None)
    context.emit(entity, target=is_pep)
Example #8
0
def parse_entry(context: Context, entry: Element):
    """Convert one XML entry into a sanctioned entity and emit it.

    Identification documents found in the entry are emitted as separate
    Passport / Identification entities that point back at the entity.

    Args:
        context: Crawler context used for lookups, logging, entity creation
            and emission.
        entry: XML element describing one listed subject.

    Returns:
        None. Nothing is emitted when the subject type code is unknown.
    """
    subject_type = entry.find("./subjectType")
    schema = context.lookup_value(
        "subject_type",
        subject_type.get("code"),
        dataset="eu_fsf",
    )
    if schema is None:
        context.log.warning("Unknown subject type", type=subject_type)
        return

    entity = context.make(schema)
    # Prefer the EU reference number for the ID; fall back to the logical ID.
    eu_ref = entry.get("euReferenceNumber")
    if eu_ref is not None:
        entity.id = context.make_slug(eu_ref, dataset="eu_fsf")
    else:
        entity.id = context.make_slug("logical", entry.get("logicalId"))
    entity.add("notes", h.clean_note(entry.findtext("./remark")))
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)

    for name in entry.findall("./nameAlias"):
        is_weak = not as_bool(name.get("strong"))
        h.apply_name(
            entity,
            full=name.get("wholeName"),
            first_name=name.get("firstName"),
            middle_name=name.get("middleName"),
            last_name=name.get("lastName"),
            is_weak=is_weak,
            quiet=True,
        )
        entity.add("title", name.get("title"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        entity.add("gender", name.get("gender"), quiet=True)

    for node in entry.findall("./identification"):
        id_type = node.get("identificationTypeCode")
        doc_schema = "Passport" if id_type == "passport" else "Identification"
        passport = context.make(doc_schema)
        passport.id = context.make_id("ID", entity.id, node.get("logicalId"))
        passport.add("holder", entity)
        passport.add("authority", node.get("issuedBy"))
        passport.add("type", node.get("identificationTypeDescription"))
        passport.add("number", node.get("number"))
        passport.add("number", node.get("latinNumber"))
        # NOTE(review): only the issue date is mapped here; confirm whether
        # the source also provides an expiry date that should be read.
        passport.add("startDate", node.get("issueDate"))
        passport.add("country", parse_country(node))
        passport.add("country", node.get("countryDescription"))
        for remark in node.findall("./remark"):
            passport.add("summary", remark.text)
        context.emit(passport)

    for node in entry.findall("./address"):
        address = parse_address(context, node)
        h.apply_address(context, entity, address)

        # Iterate the element directly: ``getchildren()`` is deprecated and
        # no longer exists on the standard-library Element.
        for child in node:
            # Exact comparison; ``in ("regulationSummary")`` was a substring
            # test against a plain string.
            if child.tag == "regulationSummary":
                continue
            elif child.tag == "remark":
                entity.add("notes", child.text)
            elif child.tag == "contactInfo":
                prop = context.lookup_value(
                    "contact_info",
                    child.get("key"),
                    dataset="eu_fsf",
                )
                if prop is None:
                    context.log.warning("Unknown contact info", node=child)
                else:
                    entity.add(prop, child.get("value"))
            else:
                context.log.warning("Unknown address component", node=child)

    for birth in entry.findall("./birthdate"):
        partial_birth = parse_parts(
            birth.get("year"), birth.get("month"), birth.get("day")
        )
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthDate", partial_birth)
        address = parse_address(context, birth)
        if address is not None:
            entity.add("birthPlace", address.get("full"))
            entity.add("country", address.get("country"))

    for node in entry.findall("./citizenship"):
        entity.add("nationality", parse_country(node), quiet=True)
        entity.add("nationality", node.get("countryDescription"), quiet=True)

    context.emit(entity, target=True)
Example #9
0
def parse_result(context: Context, result):
    """Convert one search result record into an entity and its sanction.

    Args:
        context: Crawler context used for lookups, logging, entity creation
            and emission.
        result: Raw result mapping. Consumed destructively; leftovers are
            reported through ``h.audit_data``.

    Returns:
        None. Nothing is emitted when the result type is unknown.

    Raises:
        AssertionError: If person-only fields are still populated after the
            schema-specific handling, or ``entity_number`` is not a non-zero
            integer.
        KeyError: If a key popped without a default (e.g. ``id``,
            ``source_information_url``) is missing.
    """
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))

    # When present, the entity number replaces the ID with a slug in the
    # "us_ofac_sdn" dataset.
    entity_number = result.pop("entity_number", None)
    if entity_number is not None:
        assert int(entity_number)
        entity.id = context.make_slug(entity_number, dataset="us_ofac_sdn")

    name = result.pop("name", None)
    # The name is optional in the pop above, so do not call str methods on
    # a missing value.
    if name is not None:
        name = name.replace(
            "and any successor, sub-unit, or subsidiary thereof", ""
        )
    entity.add("name", name)
    for alias in ensure_list(result.pop("alt_names", "")):
        entity.add("alias", alias.split("; "))
    entity.add("notes", result.pop("remarks", None))
    entity.add("country", result.pop("country", None))
    if entity.schema.is_a("Person"):
        entity.add("position", result.pop("title", None))
        entity.add("nationality", result.pop("nationalities", None))
        entity.add("nationality", result.pop("citizenships", None))
        for dob in result.pop("dates_of_birth", []):
            entity.add("birthDate", h.parse_date(dob, FORMATS))
        entity.add("birthPlace", result.pop("places_of_birth", None))
    elif entity.schema.is_a("Vessel"):
        entity.add("flag", result.pop("vessel_flag", None))
        entity.add("callSign", result.pop("call_sign", None))
        entity.add("type", result.pop("vessel_type", None))
        grt = result.pop("gross_registered_tonnage", None)
        entity.add("grossRegisteredTonnage", grt)
        gt = result.pop("gross_tonnage", None)
        entity.add("tonnage", gt)

        # TODO: make adjacent owner entity
        result.pop("vessel_owner", None)

    # Person-only fields must be empty by now: either consumed above or
    # never populated for a non-person.
    assert result.pop("title", None) is None
    assert not len(result.pop("nationalities", []))
    assert not len(result.pop("citizenships", []))
    assert not len(result.pop("dates_of_birth", []))
    assert not len(result.pop("places_of_birth", []))

    for address in result.pop("addresses", []):
        obj = h.make_address(
            context,
            street=address.get("address"),
            city=address.get("city"),
            postal_code=address.get("postal_code"),
            region=address.get("state"),
            country=address.get("country"),
        )
        h.apply_address(context, entity, obj)

    for ident in result.pop("ids", []):
        country = ident.pop("country")
        entity.add("country", country)
        h.apply_feature(
            context,
            entity,
            ident.pop("type"),
            ident.pop("number"),
            country=country,
            date_formats=FORMATS,
            start_date=ident.pop("issue_date", None),
            end_date=ident.pop("expiration_date", None),
        )

    sanction = context.make("Sanction")
    sanction.id = context.make_id(entity.id, "Sanction")
    sanction.add("entity", entity)
    sanction.add("program", result.pop("programs", []))
    sanction.add("provisions", result.pop("license_policy", []))
    sanction.add("reason", result.pop("license_requirement", []))
    sanction.add("authorityId", result.pop("federal_register_notice", None))
    sanction.add("startDate", result.pop("start_date", None))
    sanction.add("endDate", result.pop("end_date", None))
    sanction.add("country", "us")
    sanction.add("authority", result.pop("source", None))

    # TODO: deref
    source_url = deref_url(context, result.pop("source_information_url"))
    sanction.add("sourceUrl", source_url)
    result.pop("source_list_url")

    context.emit(sanction)
    context.emit(entity, target=True)

    h.audit_data(result, ignore=["standard_order"])