def parse_entry(context, node):
    """Build and emit one listed entity, plus its sanction, from ``node``.

    A ``LegalEntity`` is made when ``node`` has an ``Entity`` child; otherwise
    a ``Person`` is made from ``GivenName``, ``LastName`` and ``DateOfBirth``.
    The entity ID is a slug of country, schedule and item.

    Args:
        context: crawler context used to make and emit entities.
        node: XML element holding a single list entry.
    """
    entity_name = node.findtext("./Entity")
    if entity_name is not None:
        entity = context.make("LegalEntity")
        # A "/"-separated value contributes every part as a name.
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        entity.add("firstName", given_name)
        last_name = node.findtext("./LastName")
        entity.add("lastName", last_name)
        entity.add("name", jointext(given_name, last_name))
        entity.add("birthDate", node.findtext("./DateOfBirth"))

    # ids are per country and entry type (individual/entity)
    item = node.findtext("./Item")
    schedule = node.findtext("./Schedule")
    country = node.findtext("./Country")
    # findtext() returns None when the element is missing; test for that
    # before the substring check so a missing country does not raise.
    if country is not None and "/" in country:
        country, _ = country.split("/")
    entity.id = context.make_slug(country, schedule, item, strict=False)
    entity.add("country", country)
    sanction = h.make_sanction(context, entity)
    sanction.add("program", schedule)

    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            entity.add("alias", collapse_spaces(name))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemple #2
0
def parse_entry(context, entry):
    """Emit the sanction described by ``entry``.

    A ``Thing`` party is made with an ID slugged from the ``ProfileID``
    attribute so the sanction can point at it; only the sanction is emitted
    here. The earliest and latest event dates are stored on the party's
    ``context`` mapping as ``created_at`` / ``updated_at``.
    """
    party = context.make("Thing")
    party.id = context.make_slug(entry.get("ProfileID"))

    sanction = h.make_sanction(context, party, key=entry.get("ID"))
    list_name = ref_value("List", entry.get("ListID"))
    sanction.add("program", list_name)

    event_dates = set()
    for event in entry.findall("./EntryEvent"):
        event_date = parse_date(event.find("./Date"))
        event_dates.add(event_date)
        sanction.add("startDate", event_date)
        sanction.add("summary", event.findtext("./Comment"))
        legal_basis = ref_value("LegalBasis", event.get("LegalBasisID"))
        sanction.add("reason", legal_basis)

    if event_dates:
        party.context["created_at"] = min(event_dates)
        party.context["updated_at"] = max(event_dates)

    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        measure_type = ref_value(
            "SanctionsType", measure.get("SanctionsTypeID")
        )
        sanction.add("program", measure_type)

    context.emit(sanction)
Exemple #3
0
def parse_entry(context: Context, entry, parties):
    """Attach the sanction described by ``entry`` to its party; emit both.

    The party is taken from ``parties``, indexed by the slug of the entry's
    ``ProfileID`` attribute.
    """
    party = parties[context.make_slug(entry.get("ProfileID"))]

    sanction = h.make_sanction(context, party, key=entry.get("ID"))
    list_name = ref_value("List", entry.get("ListID"))
    sanction.add("program", list_name)

    for event in entry.findall("./EntryEvent"):
        party.add("createdAt", parse_date(event.find("./Date")))
        sanction.add("summary", event.findtext("./Comment"))
        legal_basis = ref_value("LegalBasis", event.get("LegalBasisID"))
        sanction.add("reason", legal_basis)

    party.add("topics", "sanction")
    # The sanction dates are copied from whatever the party holds by now.
    created = party.get("createdAt")
    sanction.add("listingDate", created)
    modified = party.get("modifiedAt")
    sanction.add("startDate", modified)

    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        measure_type = ref_value(
            "SanctionsType", measure.get("SanctionsTypeID")
        )
        sanction.add("program", measure_type)

    context.emit(sanction)
    context.emit(party, target=True)
def crawl(context: Context):
    """Fetch the JSON feed and emit one debarred ``LegalEntity`` per record.

    Records are read from ``response -> ZPROCSUPP`` of the fetched document.
    Each record yields an entity, an address applied to it, and a sanction
    carrying the debarment reason and date range.
    """
    payload = context.fetch_json(
        context.dataset.data.url,
        headers={"apikey": context.dataset.data.api_key},
    )
    # TODO write this out to a source.json
    for record in payload["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        supplier_name = record.get("SUPP_NAME")
        entity.id = context.make_slug(record.get("SUPP_ID"))

        # The first cleaned name is the primary one, the rest are aliases.
        cleaned = clean_name(supplier_name)
        entity.add("name", cleaned[0])
        entity.add("topics", "debarment")
        entity.add("country", record.get("COUNTRY_NAME"))
        for alias in cleaned[1:]:
            entity.add("alias", alias)

        address = h.make_address(
            context,
            street=record.get("SUPP_ADDR"),
            city=record.get("SUPP_CITY"),
            country=record.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", record.get("DEBAR_REASON"))
        debarred_from = h.parse_date(record.get("DEBAR_FROM_DATE"), FORMATS)
        sanction.add("startDate", debarred_from)
        debarred_to = h.parse_date(record.get("DEBAR_TO_DATE"), FORMATS)
        sanction.add("endDate", debarred_to)

        context.emit(entity, target=True)
        context.emit(sanction)
def parse_reference(context, reference, rows):
    """Merge all ``rows`` sharing ``reference`` into one entity and sanction.

    The entity starts as a ``LegalEntity`` and is switched to ``Person`` as
    soon as a row has type ``Individual``. Every consumed column is popped
    from its row.
    """
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        is_individual = row.pop("type") == "Individual"
        if is_individual:
            entity.schema = model.get("Person")

        name = row.pop("name_of_individual_or_entity", None)
        # "aka" rows are aliases; every other name type is a primary name.
        name_prop = "alias" if row.pop("name_type") == "aka" else "name"
        entity.add(name_prop, name)

        address = h.make_address(context, full=row.pop("address"))
        h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))

        citizenships = multi_split(
            row.pop("citizenship"), ["a)", "b)", "c)", "d)"]
        )
        entity.add("nationality", citizenships, quiet=True)
        birth_dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", birth_dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", row.pop("additional_information"))
        entity.add("notes", row.pop("listing_information"), quiet=True)

        control_date = row.pop("control_date")
        sanction.add("modifiedAt", control_date)
        entity.add("modifiedAt", control_date)
        entity.context["updated_at"] = control_date.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemple #6
0
def parse_row(context: Context, row):
    """Emit a ``LegalEntity`` with address and sanction for one table row.

    The entity ID is a slug of the row's ``Effective_Date`` and ``Name``;
    the sanction is keyed by, and its program set to, ``FR_Citation``.
    """
    name = row.get("Name")
    country = row.get("Country")
    effective_date = row.get("Effective_Date")

    entity = context.make("LegalEntity")
    entity.id = context.make_slug(effective_date, name)
    entity.add("name", name)
    entity.add("notes", row.get("Action"))
    entity.add("country", country)
    entity.add("modifiedAt", row.get("Last_Update"))

    address_fields = {
        "street": row.get("Street_Address"),
        "postal_code": row.get("Postal_Code"),
        "city": row.get("City"),
        "region": row.get("State"),
        "country": country,
    }
    address = h.make_address(context, **address_fields)
    h.apply_address(context, entity, address)
    context.emit(entity, target=True)

    citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=citation)
    sanction.add("program", citation)
    starts = h.parse_date(effective_date, FORMATS)
    sanction.add("startDate", starts)
    ends = h.parse_date(row.get("Expiration_Date"), FORMATS)
    sanction.add("endDate", ends)
    context.emit(sanction)
Exemple #7
0
def crawl_common(context: Context, data: Dict[str, str], part: str,
                 schema: str):
    """Build the entity shared by all record kinds and emit its sanction.

    Consumed keys are popped from ``data``. The sanction is emitted here;
    the entity is returned to the caller without being emitted.
    """
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")
    comments = h.clean_note(data.pop("COMMENTS1"))
    entity.add("notes", comments)
    extra_note = h.clean_note(data.pop("NOTE", None))
    entity.add("notes", extra_note)
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    # Map FIRST_NAME..FOURTH_NAME onto the name1..name4 keyword arguments.
    source_keys = ("FIRST_NAME", "SECOND_NAME", "THIRD_NAME", "FOURTH_NAME")
    name_parts = {
        "name%d" % position: data.pop(key, None)
        for position, key in enumerate(source_keys, start=1)
    }
    h.apply_name(entity, quiet=True, **name_parts)

    sanction = h.make_sanction(context, entity)
    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))
    # Fall back through the available dates, earliest-stage first.
    first_listed = submitted_on or listed_on
    entity.add("createdAt", first_listed or modified_at)
    entity.add("modifiedAt", modified_at)

    sanction.add("listingDate", first_listed)
    sanction.add("startDate", listed_on)
    for program_key in ("UN_LIST_TYPE", "LIST_TYPE"):
        sanction.add("program", data.pop(program_key))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
Exemple #8
0
def crawl(context: Context):
    """Parse the sanctioned-persons HTML table and emit a ``Person`` per row.

    The first ``tr`` supplies the slugified column headers; every later row
    is zipped against them and turned into a person, an address and a
    sanction.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for position, tr in enumerate(table.findall(".//tr")):
        if position == 0:
            headers = [slugify(th.text) for th in tr.findall("./th")]
            continue
        values = [collapse_spaces(td.text) for td in tr.findall("./td")]
        record = dict(zip(headers, values))

        full_name = record["ad-soyad-ata-adi"]
        entity = context.make("Person")
        entity.id = context.make_id(record["id"], full_name)
        entity.add("name", full_name)
        entity.add("idNumber", record["id"])
        entity.add("birthDate", parse_date(record["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        address = h.make_address(context, full=record["malumat"])
        h.apply_address(context, entity, address)

        context.emit(h.make_sanction(context, entity))
        context.emit(entity, target=True)
Exemple #9
0
def crawl(context: Context):
    """Emit a ``LegalEntity`` and sanction for each vendor row of each table.

    Every table must have "Vendor name" and "From" among its header cells.
    Cells are read by position: name, country, address, reason, program,
    start date, end date.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        headers = [
            cell.text_content() for cell in table.findall("./thead/tr/td")
        ]
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers

        for tr in table.findall("./tbody/tr"):
            cells = [cell.text_content() for cell in tr.findall("./td")]
            # Rows whose first cell is empty carry no vendor.
            if not cells[0]:
                continue

            entity = context.make("LegalEntity")
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            address = h.make_address(
                context,
                full=cells[2],
                country_code=entity.first("country"),
            )
            h.apply_address(context, entity, address)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))

            context.emit(sanction)
            context.emit(entity, target=True)
def crawl(context: Context):
    """Walk each configured section of the page and emit its list items.

    ``SECTIONS`` maps a ``div`` id to a ``(section, schema)`` pair. Each
    ``li`` is split into its leading index and remaining text, handed to the
    parser matching the section id, and emitted only if that parser gave the
    entity a name.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    for sec_id, (section, schema) in SECTIONS.items():
        container = doc.find(".//div[@id='%s']" % sec_id)
        for item in container.findall(".//li"):
            index, body = item.text_content().strip().split(".", 1)
            # Drop surrounding whitespace, then any trailing semicolons.
            body = body.strip().rstrip(";")

            entity = context.make(schema)
            entity.id = context.make_id(body)
            sanction = h.make_sanction(context, entity)
            sanction.add("program", section)
            sanction.add("recordId", index)

            if sec_id == "russianUL":
                parse_russian_orgs(context, entity, body)
            elif sec_id == "russianFL":
                parse_russian_persons(context, entity, body)
            elif sec_id == "foreignUL":
                parse_foreign_orgs(context, entity, body)
            elif sec_id == "foreignFL":
                parse_foreign_persons(context, entity, body)

            if not entity.has("name"):
                continue
            context.emit(entity, target=True)
            context.emit(sanction)
Exemple #11
0
def crawl(context: Context):
    """Parse the listing tables and emit an entity and sanction per data row.

    In each table the first row supplies the (slugified) headers. A row with
    a single cell switches the schema for the rows that follow, looked up in
    ``TYPES``. All other rows are data rows.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [
                collapse_spaces(td.text_content()) for td in tr.findall("./td")
            ]
            if headers is None:
                headers = [slugify(cell, sep="_") for cell in cells]
                continue
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue
            record = dict(zip(headers, cells))

            entity = context.make(schema)
            name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            # Text before the first "(" is the name; each later
            # parenthesised chunk is an alias.
            primary, *bracketed = name.split("(")
            entity.add("name", primary)
            for chunk in bracketed:
                entity.add("alias", chunk.split(")")[0])
            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))

            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            # Peel known markers off the end of the details string, one
            # marker at a time, in the order CHOPSKA lists them.
            for marker, prop in CHOPSKA:
                details, *tail = details.rsplit(marker, 1)
                if not tail:
                    continue
                if prop == "address":
                    address = h.make_address(context, full=tail[0])
                    h.apply_address(context, entity, address)
                else:
                    entity.add(prop, tail[0])

            if details.strip():
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            sanction.add(
                "provisions", record.pop("zastosowane_srodki_sankcyjne")
            )

            listed = record.pop("data_umieszczenia_na_liscie")
            listed = listed.replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed, ["%d.%m.%Y"]))

            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
Exemple #12
0
def handle_sanction(context, entity, row):
    """Make a sanction for ``entity`` from ``row`` and emit it.

    Each source column is popped from ``row`` (missing columns yield
    ``None``) and added to the matching sanction property.
    """
    sanction = h.make_sanction(context, entity)
    column_for_prop = (
        ("status", "action"),
        ("summary", "restriction_period"),
        ("program", "restriction_type"),
        ("startDate", "ukaz_date"),
        ("endDate", "restriction_end_date"),
    )
    for prop, column in column_for_prop:
        sanction.add(prop, row.pop(column, None))
    context.emit(sanction)
def parse_reference(context: Context, reference: int, rows):
    """Merge all ``rows`` sharing ``reference`` into one entity and sanction.

    ``rows`` is iterated three times (schema, names, details), so it must be
    a re-iterable collection such as a list, not a one-shot iterator.
    Consumed columns are popped from each row.

    Nothing is emitted, and a warning is logged, when a row has an entity
    type or name type that the dataset lookups do not know.

    Args:
        context: crawler context used for lookups, logging and emitting.
        reference: reference number shared by ``rows``; part of the entity ID.
        rows: row mappings belonging to this reference.
    """
    # Pass 1: all rows must agree on a single schema.
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Pass 2: names. The last row mapped to "name" supplies the name used
    # in the entity ID.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    # Pass 3: addresses, biographical details and sanction metadata.
    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        # The listing column holds either a datetime or free text.
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
        # TODO: consider parsing if it's not a datetime?

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemple #14
0
def crawl(context: Context):
    """Parse the sanctions table and emit a ``LegalEntity`` per data row.

    The first ``tr`` supplies the slugified headers. A registration number
    is split off the name when one of the ``REG_NRS`` markers occurs in it,
    and the date-range cell is split on "|" into start and end dates.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for tr in table.findall(".//tr"):
        if headers is None:
            headers = [
                slugify(th.text_content(), "_") for th in tr.findall("./th")
            ]
            continue
        values = [
            collapse_spaces(td.text_content()) for td in tr.findall("./td")
        ]
        record = dict(zip(headers, values))
        # Discard the column whose header slugified to None, if any.
        record.pop(None, None)

        full_name = record.pop("name")
        name = full_name
        registration_number = None
        for marker in REG_NRS:
            if marker not in name:
                continue
            name, registration_number = name.split(marker, 1)
            registration_number = registration_number.replace(")", "")

        country = (
            record.pop("nationality")
            .replace("Non ADB Member Country", "")
            .replace("Rep. of", "")
        )

        # The ID uses the unsplit name, so it does not depend on REG_NRS.
        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", record.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", record.pop("grounds"))
        sanction.add("program", record.pop("sanction_type"))
        date_range = record.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            starts, ends = date_range.split("|")
            sanction.add("startDate", h.parse_date(starts.strip(), FORMATS))
            sanction.add("endDate", h.parse_date(ends.strip(), FORMATS))

        address = h.make_address(
            context, full=record.pop("address"), country=country
        )
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
Exemple #15
0
def parse_common(context: Context, node, entity):
    """Add the shared sanction data from ``node`` and emit both objects.

    The parsed ``DateInclusion`` value is used as the sanction's start and
    listing date and as the entity's ``createdAt``.
    """
    included_on = h.parse_date(node.findtext("./DateInclusion"), FORMATS)

    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    for date_prop in ("startDate", "listingDate"):
        sanction.add(date_prop, included_on)

    entity.add("createdAt", included_on)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemple #16
0
def make_entity(context: Context, el, schema, entity_id):
    """Make an entity of ``schema`` from ``el`` and emit its sanction.

    The entity gets the given ``entity_id``, the cleaned ``note`` text and
    the ``sanction`` topic. Only the sanction is emitted here; the entity is
    returned to the caller.
    """
    entity = context.make(schema, target=True)
    entity.id = entity_id
    note = h.clean_note(el.findtext("./note"))
    entity.add("notes", note)
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    correction = el.findtext("./correction")
    sanction.add("summary", correction)
    context.emit(sanction)

    return entity
Exemple #17
0
def parse_common(context, node, entity):
    """Assign the entity ID, add shared sanction data and emit both objects.

    The ID is a slug of the node's tag and its ``Number`` text. A parsed
    ``DateInclusion`` becomes the sanction start date and, when present,
    the ``created_at`` value on the entity's ``context`` mapping.
    """
    entity.id = context.make_slug(node.tag, node.findtext("./Number"))

    sanction = h.make_sanction(context, entity)
    reason = node.findtext("./BasicInclusion")
    sanction.add("reason", reason)
    program = node.findtext("./CategoryPerson")
    sanction.add("program", program)

    included_on = h.parse_date(node.findtext("./DateInclusion"), FORMATS)
    sanction.add("startDate", included_on)
    if included_on is not None:
        entity.context["created_at"] = included_on

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemple #18
0
def parse_entry(context, target, programs, places, updated_at):
    """Build and emit the entity and sanction described by ``target``.

    The schema is chosen by the first child element found: ``entity`` gives
    a ``LegalEntity``, ``individual`` a ``Person`` and ``object`` a
    ``Vessel``. A target with none of these is logged and skipped.

    Args:
        context: crawler context used to make, log and emit.
        target: XML element for one sanctions target.
        programs: mapping from ``sanctions-set-id`` to a program value.
        places: passed through to ``parse_identity``.
        updated_at: fallback date used when the target has no modification
            with a ``publication-date``.
    """
    # Decide the schema first so only one entity object is ever made.
    schema = "LegalEntity"
    node = target.find("./entity")
    if node is None:
        schema = "Person"
        node = target.find("./individual")
    if node is None:
        schema = "Vessel"
        node = target.find("./object")
        if node is None:
            # None of the known child elements exist; nothing to parse.
            context.log.warning("Unknown target type",
                                target=target,
                                object_type=None)
            return
        object_type = node.get("object-type")
        if object_type != "vessel":
            # Non-vessel objects are still emitted as Vessel, with a warning.
            context.log.warning("Unknown target type",
                                target=target,
                                object_type=object_type)
    entity = context.make(schema)

    dates = set()
    for mod in target.findall("./modification"):
        date = mod.get("publication-date")
        if date is not None:
            dates.add(date)
    if not dates:
        dates.add(updated_at)
    entity.context["created_at"] = min(dates)
    entity.context["updated_at"] = max(dates)

    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)
    is_vessel = entity.schema.is_a("Vessel")
    for other in node.findall("./other-information"):
        # .text is None for an empty element.
        value = (other.text or "").strip()
        # partition() never raises; an "imo" note without a colon falls
        # through to the notes branch instead of breaking the unpacking.
        _, colon, imo_num = value.partition(":")
        if is_vessel and colon and value.lower().startswith("imo"):
            entity.add("imoNumber", imo_num)
        else:
            entity.add("notes", value)

    sanction = make_sanction(context, entity)
    sanction.add("modifiedAt", max(dates))

    for justification in node.findall("./justification"):
        entity.add("notes", justification.text)

    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))

    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
Exemple #19
0
def crawl_entity(context, data):
    """Build one registry entity with its sanction from ``data``; emit both.

    The schema comes from ``SCHEMATA`` keyed by the record's ``Nature``.
    Each value of each ``RegistreDetail`` item is handed to ``apply_prop``
    together with the entity and the sanction. Consumed keys are popped
    from ``data``.

    Args:
        context: crawler context used to make and emit entities.
        data: mapping describing a single registry record.
    """
    nature = data.pop("Nature")
    schema = SCHEMATA.get(nature)
    entity = context.make(schema)
    entity.id = context.make_slug(data.pop("IdRegistre"))
    entity.add("name", data.pop("Nom"))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    for detail in data.pop("RegistreDetail"):
        field = detail.pop("TypeChamp")
        for value in detail.pop("Valeur"):
            apply_prop(context, entity, sanction, field, value)

    context.emit(entity, target=True)
    # The sanction receives values through apply_prop above; emit it so
    # those values are not discarded.
    context.emit(sanction)
Exemple #20
0
def crawl(context: Context):
    """Emit the bank, the sanctioned party and the sanction for each record.

    Each JSON record yields a ``Company`` for the bank (emitted only if it
    got an ID) and either a ``Company`` or a ``Person`` for the subject,
    depending on whether ``CompanyName`` is set.
    """
    source_path = context.fetch_resource(
        "source.json", context.dataset.data.url
    )
    context.export_resource(source_path, JSON, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        records = json.load(fh)

    for record in records:
        bank = context.make("Company")
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        if bank.id is not None:
            context.emit(bank)

        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if not company_name:
            entity = context.make("Person")
            entity.id = context.make_id(
                charter_no, bank_name, first_name, last_name
            )
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        else:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        address = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        # Consumed but unused.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        optional_fields = (
            ("startDate", "CompleteDate"),
            ("endDate", "TerminationDate"),
            ("program", "EnforcementTypeDescription"),
            ("authorityId", "DocketNumber"),
        )
        for prop, key in optional_fields:
            sanction.add(prop, record.pop(key, None))

        context.emit(entity, target=True)
        context.emit(sanction)
Exemple #21
0
def crawl(context: Context):
    """Parse the numbered list in section "2." and emit a ``Person`` per row.

    Only ``td.tailSTxt`` cells whose text starts with "2." are read. In each
    row, the text before the first "(" is the name (split further on "s/o"
    and "@"), and every bracketed term found by ``IN_BRACKETS`` is mapped to
    a property through the "props" lookup, the citizen suffix, or the
    ``DOB`` / ``PASSPORT`` prefixes. Terms matching none of these are logged.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number_cell = item.find(".//td[@class='sProvP1No']")
            text = item.findtext(".//td[@class='sProvP1']")
            if number_cell is None or text is None:
                # Row lacks the number or text cell, so it is not an entry.
                continue
            number = number_cell.text_content()
            text = text.strip().rstrip(";").rstrip(".")
            # partition() keeps the whole text as the name when no bracketed
            # details follow, where a two-way split() unpack would raise.
            name, _, _ = text.partition("(")
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                if match.endswith("citizen"):
                    nat = match.replace("citizen", "")
                    entity.add("nationality", nat)
                    continue
                if match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)
def parse_entry(context: Context, node: _Element):
    """Build and emit one listed entity, plus its sanction, from ``node``.

    A ``LegalEntity`` is made when ``node`` has an ``Entity`` child;
    otherwise a ``Person`` is made from the given and last name. The entity
    ID is a slug of schedule, item, country and name. The ``Country`` text
    is used as the sanction program, and its part before "/" (when it has
    one) as the entity country.
    """
    item = node.findtext("./Item")
    program = node.findtext("./Country")
    schedule = node.findtext("./Schedule")
    if schedule == "N/A":
        schedule = ""

    entity_name = node.findtext("./Entity")
    if entity_name is None:
        entity = context.make("Person")
        entity_name = h.make_name(
            given_name=node.findtext("./GivenName"),
            last_name=node.findtext("./LastName"),
        )
        entity.add("name", entity_name)
        entity.add("birthDate", node.findtext("./DateOfBirth"))
    else:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))

    country = program
    has_slash = program is not None and "/" in program
    if has_slash:
        country, _ = program.split("/")
    entity.add("country", country)

    slug_parts = (schedule, item, entity.first("country"), entity_name)
    entity.id = context.make_slug(*slug_parts, strict=False)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", program)
    sanction.add("reason", schedule)
    sanction.add("authorityId", item)

    aliases = node.findtext("./Aliases")
    if aliases is not None:
        for alias in aliases.split(", "):
            entity.add("alias", collapse_spaces(alias))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemple #23
0
def crawl(context: Context):
    """Parse the debarment table and emit a ``Company`` per data row.

    The first ``tr`` supplies the slugified headers, whose second-to-last
    column is replaced by two columns named "from" and "to". Rows without a
    ``prohibited_practice`` cell are skipped.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find(".//article//table")
    headers = None
    for tr in table.findall(".//tr"):
        if headers is None:
            slugs = [
                slugify(td.text_content(), "_") for td in tr.findall("./td")
            ]
            headers = [*slugs[:-2], "from", "to", *slugs[-1:]]
            continue

        values = [
            collapse_spaces(td.text_content()) for td in tr.findall("./td")
        ]
        record = dict(zip(headers, values))
        if "prohibited_practice" not in record:
            continue

        name = record.pop("firm_name")
        nationality = record.pop("nationality")
        entity = context.make("Company")
        entity.id = context.make_id(name, nationality)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", nationality)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", record.pop("prohibited_practice"))
        starts = h.parse_date(record.pop("from"), FORMATS)
        sanction.add("startDate", starts)
        ends = h.parse_date(record.pop("to"), FORMATS)
        sanction.add("endDate", ends)

        address = h.make_address(
            context, full=record.pop("address"), country=nationality
        )
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
Exemple #24
0
def parse_common(context, entity, node):
    """Fill ``entity`` from ``node`` and return a sanction for it.

    Nothing is emitted here. The name prefers ``NAME_ORIGINAL_SCRIPT`` and
    falls back to ``FIRST_NAME``. Update and listing dates are also stored
    on the entity's ``context`` mapping when present.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))
    original_script = node.findtext("./NAME_ORIGINAL_SCRIPT")
    entity.add("name", original_script or node.findtext("./FIRST_NAME"))
    entity.add("notes", node.findtext("./COMMENTS1"))
    entity.add("topics", "sanction")

    update_dates = values(node.find("./LAST_DAY_UPDATED"))
    if len(update_dates) > 0:
        entity.add("modifiedAt", update_dates)
        entity.context["updated_at"] = max(update_dates)

    listed_on = node.findtext("./LISTED_ON")
    if listed_on is not None:
        entity.context["created_at"] = listed_on

    sanction = make_sanction(context, entity)
    sanction.add("startDate", listed_on)
    sanction.add("modifiedAt", values(node.find("./LAST_DAY_UPDATED")))
    for prop, tag in (
        ("program", "./UN_LIST_TYPE"),
        ("recordId", "./REFERENCE_NUMBER"),
    ):
        sanction.add(prop, node.findtext(tag))
    return sanction
def crawl_individuals(context: Context):
    """Read the individuals spreadsheet and emit a Person and sanction per row.

    Rows without an ``internal_seq_id``, and rows for which no entity ID can
    be derived from the three name columns, are skipped. Any columns still
    left in a row after processing are dumped through ``context.pprint``.
    """
    xlsx_path = context.fetch_resource("individuals.xlsx", PEOPLE_URL)
    context.export_resource(xlsx_path, XLSX, title=context.SOURCE_TITLE)

    for row in excel_records(xlsx_path):
        seq_id = row.pop("internal_seq_id", None)
        if seq_id is None:
            continue

        english = row.pop("name_of_individual_english", None)
        hebrew = row.pop("name_of_individual_hebrew", None)
        arabic = row.pop("name_of_individual_arabic", None)

        person = context.make("Person")
        person.id = context.make_id(english, hebrew, arabic)
        if person.id is None:
            continue

        # The first available name (English, then Hebrew, then Arabic) is
        # the primary name; Hebrew and Arabic are also kept as aliases.
        person.add("name", english or hebrew or arabic)
        for alias in (hebrew, arabic):
            person.add("alias", alias)
        person.add("topics", "crime.terror")
        birth_date = parse_date(row.pop("d_o_b", None))
        person.add("birthDate", birth_date)
        person.add("nationality", row.pop("nationality_residency", None))
        person.add("idNumber", row.pop("individual_id", None))

        sanction = h.make_sanction(context, person)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", row.pop("foreign_designation_id", None))
        sanction.add("program", row.pop("designation", None))
        sanction.add("program", row.pop("foreign_designation", None))
        sanction.add("authority", lang_pick(row, "designated_by"))

        # These values are not mapped to any property. NOTE(review): the
        # lang_pick call presumably consumes the per-language columns so they
        # do not show up in the leftover dump below -- confirm.
        lang_pick(row, "designated_by_abroad")
        row.pop("date_of_foreign_designation_date", None)

        start_date = parse_date(row.pop("date_of_designation_in_israel", None))
        sanction.add("startDate", start_date)

        context.emit(person, target=True)
        context.emit(sanction)

        # Surface any column that was not handled above.
        if row:
            context.pprint(row)
Exemple #26
0
def crawl(context: Context):
    """Scrape the debarment table of the source page and emit its rows.

    The page is fetched and exported as a resource. In the table with id
    ``datatable-1`` the first row supplies the column names (slugified
    ``th`` texts); every following row is turned into a ``LegalEntity``
    with the ``debarment`` topic plus a sanction, and both are emitted.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('.//table[@id="datatable-1"]')
    rows = iter(table.findall(".//tr"))

    # The first row is the header; nothing to do if the table has no rows.
    header_row = next(rows, None)
    if header_row is None:
        return
    headers = [slugify(th.text, "_") for th in header_row.findall("./th")]

    for row in rows:
        texts = [collapse_spaces(td.text) for td in row.findall("./td")]
        record = dict(zip(headers, texts))

        # AfDB lists several individuals as firms in places where the IADB
        # shows them to be people (and they have normal personal names).
        # For that reason the "type" column is not used to pick a schema and
        # every row is emitted as a generic LegalEntity.
        name = record.pop("name")
        country = record.pop("nationality")

        entity = context.make("LegalEntity")
        entity.id = context.make_id(name, country)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", country)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", record.pop("basis"))
        start_date = parse_date(record.pop("from"))
        sanction.add("startDate", start_date)
        end_date = parse_date(record.pop("to"))
        sanction.add("endDate", end_date)

        context.emit(entity, target=True)
        context.emit(sanction)
Exemple #27
0
def parse_common(context: Context, entity, node):
    """Apply the fields common to all entry types and return the sanction.

    Sets the entity ID from ``DATAID``, applies the four name parts, and
    adds alias, notes, topic and listing/modification dates. The sanction
    is created and filled in here but not emitted; it is returned so the
    caller can finish and emit it.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))

    name_parts = {
        "given_name": node.findtext("./FIRST_NAME"),
        "second_name": node.findtext("./SECOND_NAME"),
        "name3": node.findtext("./THIRD_NAME"),
        "name4": node.findtext("./FOURTH_NAME"),
    }
    h.apply_name(entity, quiet=True, **name_parts)

    entity.add("alias", node.findtext("./NAME_ORIGINAL_SCRIPT"))
    entity.add("notes", h.clean_note(node.findtext("./COMMENTS1")))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)

    # The listing date serves as the entity's creation date and as both the
    # listing and start date of the sanction.
    listed_on = node.findtext("./LISTED_ON")
    entity.add("createdAt", listed_on)
    sanction.add("listingDate", listed_on)
    sanction.add("startDate", listed_on)

    # Update dates are recorded on the sanction and on the entity alike.
    for target in (sanction, entity):
        target.add("modifiedAt", values(node.find("./LAST_DAY_UPDATED")))

    sanction.add("program", node.findtext("./UN_LIST_TYPE"))
    sanction.add("unscId", node.findtext("./REFERENCE_NUMBER"))
    return sanction
Exemple #28
0
def parse_sanctions(context: Context, entity: Entity, entry):
    """Emit one sanction for every ``regulation`` child of ``entry``.

    Each sanction is keyed by the regulation's publication URL, which must
    be present (an ``AssertionError`` carrying the serialized regulation is
    raised otherwise). The publication date is also added to the entity as
    ``createdAt``. An entry without regulations produces no sanctions; a
    warning for that case exists in the history of this code but is
    currently disabled.
    """
    # Identifiers taken from the entry itself are the same for all of its
    # regulations.
    un_id = entry.get("unitedNationId")
    eu_reference = entry.get("euReferenceNumber")

    for regulation in entry.findall("./regulation"):
        url = regulation.findtext("./publicationUrl")
        assert url is not None, etree.tostring(regulation)
        published = regulation.get("publicationDate")

        sanction = h.make_sanction(context, entity, key=url)
        sanction.set("sourceUrl", url)
        sanction.add("program", regulation.get("programme"))
        sanction.add("reason", regulation.get("numberTitle"))
        sanction.add("startDate", regulation.get("entryIntoForceDate"))
        sanction.add("listingDate", published)
        entity.add("createdAt", published)
        sanction.add("unscId", un_id)
        sanction.add("authorityId", eu_reference)
        context.emit(sanction)
def crawl_entity(context: Context, data):
    """Convert one registry record into an entity and its sanction; emit both.

    ``data`` is consumed destructively (keys are popped). The schema is
    looked up in ``SCHEMATA`` by the record's ``Nature``; records with an
    unknown nature are logged as an error and skipped. Every value of every
    ``RegistreDetail`` item is handed to ``apply_prop`` together with the
    entity and the sanction.
    """
    nature = data.pop("Nature")
    schema = SCHEMATA.get(nature)
    if schema is None:
        context.log.error("Unknown entity type", nature=nature)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(data.pop("IdRegistre"))
    sanction = h.make_sanction(context, entity)
    for detail in data.pop("RegistreDetail"):
        field = detail.pop("TypeChamp")
        for value in detail.pop("Valeur"):
            apply_prop(context, entity, sanction, field, value)

    # "Nom" is applied as the tail name, combined with whatever first name
    # the detail fields above have already put on the entity.
    name = data.pop("Nom")
    h.apply_name(
        entity,
        first_name=entity.first("firstName", quiet=True),
        tail_name=name,
        quiet=True,
    )
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    # The sanction is passed to apply_prop for every detail value above;
    # without this emit whatever was recorded on it would be dropped.
    context.emit(sanction)
Exemple #30
0
def crawl(context: Context):
    """Page through the sanctions JSON API and emit entities, links, sanctions.

    Successive pages are requested by replacing ``pPageNumber=1`` in the
    dataset URL with the current page number. Every row becomes an entity
    of the schema looked up from its ``entity`` field (rows with an unknown
    type are logged and skipped), an optional ``UnknownLink`` to an
    affiliated entity, and a sanction.

    Paging stops when a page contains the row with id ``1`` or when a page
    has no rows at all.
    """
    # Both the URL template and the request headers are the same for every
    # page, so build them once.
    base_url = str(context.dataset.data.url)
    headers = {
        "Accept": "application/json",
        "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
    }

    for page in count(1):
        url = base_url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        res = context.http.get(url, headers=headers)
        ids = []
        for row in res.json():
            # The API uses the literal "N/A" for missing values; blank them.
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""

            row_id = row.pop("id")
            ids.append(row_id)
            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue

            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))

            # Companies record the value as a jurisdiction, everything else
            # as a nationality.
            nat = "jurisdiction" if schema == "Company" else "nationality"
            entity.add(nat, parse_countries(row.pop("nationality")))

            affiliated = row.pop("affiliatedWithEntityId")
            if affiliated:
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)

            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            start_date = h.parse_date(row.pop("datefrom"), FORMATS)
            sanction.add("startDate", start_date)
            end_date = h.parse_date(row.pop("dateto"), FORMATS)
            sanction.add("endDate", end_date)

            context.emit(sanction)
            context.emit(entity, target=True)

        # An empty page means there is nothing left to fetch; without this
        # guard min() would raise ValueError on the empty list. Otherwise
        # stop once the row with id 1 has been seen.
        if not ids or min(ids) == 1:
            return