Example #1
def parse_reference(context: Context, reference: int, rows):
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
        # TODO: consider parsing if it's not a datetime?

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
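All of these snippets lean on a multi_split() helper that is not shown on this page. A minimal sketch of what it is assumed to do (split one or more values on several delimiters, trim the fragments, and drop empties and None) could look like this:

from typing import Iterable, List, Optional, Union

def multi_split(
    values: Union[None, str, Iterable[Optional[str]]],
    splitters: Iterable[str],
) -> List[str]:
    """Split the given value(s) on every splitter, returning trimmed,
    non-empty fragments. A None input yields an empty list."""
    if values is None:
        return []
    if isinstance(values, str):
        values = [values]
    fragments = [v for v in values if v is not None]
    for splitter in splitters:
        split_fragments = []
        for fragment in fragments:
            split_fragments.extend(fragment.split(splitter))
        fragments = split_fragments
    return [f.strip() for f in fragments if f.strip()]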
Example #2
def parse_date(date):
    if date is not None:
        date = date.replace(" ", "")
    dates = set()
    for part in multi_split(date, [",", "\n", ";"]):
        dates.update(h.parse_date(part, ["%d.%m.%Y", "dd.%m.%Y"]))
    return dates
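For illustration, assuming h.parse_date() returns ISO-formatted strings for values that match one of the given formats, the helper would behave roughly like this:

parse_date("01.02.1990; 03.04.1991")
# -> {"1990-02-01", "1991-04-03"}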
Example #3
def parse_reference(context, reference, rows):
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    # entity.add("sourceUrl", context.dataset.url)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        if row.pop("type") == "Individual":
            entity.schema = model.get("Person")

        name = row.pop("name_of_individual_or_entity", None)
        if row.pop("name_type") == "aka":
            entity.add("alias", name)
        else:
            entity.add("name", name)

        address = h.make_address(context, full=row.pop("address"))
        h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", row.pop("additional_information"))
        entity.add("notes", row.pop("listing_information"), quiet=True)

        control_date = row.pop("control_date")
        sanction.add("modifiedAt", control_date)
        entity.add("modifiedAt", control_date)
        entity.context["updated_at"] = control_date.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Example #4
def clean_date(date):
    splits = [
        "a)",
        "b)",
        "c)",
        "d)",
        "e)",
        "f)",
        "g)",
        "h)",
        "i)",
        " or ",
        " to ",
        " and ",
        "alt DOB:",
        "alt DOB",
        ";",
        ">>",
    ]
    dates = set()
    if isinstance(date, float):
        date = str(int(date))
    if isinstance(date, datetime):
        date = date.date().isoformat()
    date = remove_bracketed(date)
    if date is None:
        return dates
    date = date.replace("\n", " ")
    for part in multi_split(date, splits):
        part = part.strip().strip(",")
        if not len(part):
            continue
        dates.update(h.parse_date(part, FORMATS))
    return dates
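remove_bracketed() is another helper assumed by several of the examples on this page (also #11, #15 and #17). A plausible sketch, dropping any parenthesised annotation from a value:

import re
from typing import Optional

BRACKETED = re.compile(r"\([^)]*\)")

def remove_bracketed(text: Optional[str]) -> Optional[str]:
    """Hypothetical helper: strip '(...)' groups, e.g. source notes
    appended to a date or country string."""
    if text is None:
        return None
    return BRACKETED.sub(" ", text).strip()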
Example #5
def parse_date(text: List[Optional[str]]) -> List[str]:
    dates: List[str] = []
    for date in multi_split(text, DATE_SPLITS):
        cleaned = DATE_CLEAN.sub("", date)
        normal = decompose_nfkd(cleaned)
        for parsed in h.parse_date(normal, FORMATS, default=date):
            dates.append(parsed)
    return dates
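decompose_nfkd() is presumably a thin wrapper around Unicode NFKD normalisation, used here so that compatibility characters and diacritics do not break date parsing. One possible sketch (whether combining marks are also discarded is an assumption):

from unicodedata import combining, normalize

def decompose_nfkd(text: str) -> str:
    """Apply NFKD decomposition and drop combining marks."""
    decomposed = normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not combining(ch))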
Example #6
def crawl_physical(context: Context) -> None:
    data = json_resource(context, PHYSICAL_URL, "physical")
    for row in data:
        entity = context.make("Person")
        entity.id = context.make_slug(row.pop("ukaz_id"), row.pop("index"))
        entity.add("name", row.pop("name_ukr", None))
        entity.add("name", row.pop("name_original", None))
        for alias in multi_split(row.pop("name_alternative", None),
                                 [";", "/"]):
            entity.add("alias", alias)
        entity.add("notes", row.pop("additional", None))
        for country in multi_split(row.pop("citizenship", None), [", "]):
            entity.add("nationality", country)
        entity.add("birthDate", row.pop("birthdate", None))
        entity.add("birthPlace", row.pop("birthplace", None))
        entity.add("position", row.pop("occupation", None))
        handle_address(context, entity, row.pop("livingplace", None))
        handle_sanction(context, entity, row)

        context.emit(entity, target=True)
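Examples #6 and #9 both call a json_resource() helper that is not included on this page. Judging from the fetch/export pattern in Examples #7 and #8, a plausible sketch might be the following; the JSON mime constant and the exact argument names are assumptions:

import json

def json_resource(context: Context, url: str, name: str):
    """Hypothetical helper: download a JSON document, register it as a
    dataset resource and return the parsed payload."""
    path = context.fetch_resource(f"{name}.json", url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        return json.load(fh)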
Example #7
def crawl(context: Context):
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)

    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        if value == "不明":
                            continue
                        values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)

            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                # print("SHEET", sheet, row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning("Unknown column title",
                                            column=cell,
                                            sheet=sheet.name)
                    headers.append(header)
Example #8
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            name, _ = text.split("(", 1)
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                # match = match.replace("\xa0", "")
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                if match.endswith("citizen"):
                    nat = match.replace("citizen", "")
                    entity.add("nationality", nat)
                    continue
                if match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)
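This crawler relies on an IN_BRACKETS regex plus DOB and PASSPORT prefix constants defined elsewhere in the module. Purely illustrative stand-ins (the exact prefix strings are guesses) could be:

import re

IN_BRACKETS = re.compile(r"\(([^()]*)\)")  # capture text between parentheses
DOB = "DOB:"                               # assumed prefix for date-of-birth fragments
PASSPORT = "Passport:"                     # assumed prefix for passport-number fragments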
Example #9
def crawl_legal(context: Context) -> None:
    data = json_resource(context, LEGAL_URL, "legal")
    for row in data:
        entity = context.make("Organization")
        entity.id = context.make_slug(row.pop("ukaz_id"), row.pop("index"))
        entity.add("name", row.pop("name_ukr", None))
        entity.add("name", row.pop("name_original", None))
        for alias in multi_split(row.pop("name_alternative", None),
                                 [";", "/"]):
            entity.add("alias", alias)
        entity.add("notes", row.pop("additional", None))
        ipn = row.pop("ipn", "") or ""
        entity.add("taxNumber", ipn.replace("ІПН", ""))
        odrn = row.pop("odrn_edrpou", "") or ""
        entity.add("registrationNumber", odrn.replace("ОДРН", ""))

        handle_address(context, entity, row.pop("place", None))
        handle_address(context, entity, row.pop("place_alternative", None))
        handle_sanction(context, entity, row)
        # context.pprint(row)
        context.emit(entity, target=True)
Example #10
def parse_date(date):
    dates = []
    for part in multi_split(date, ["OR", ";", " - "]):
        dates.extend(h.parse_date(part, FORMATS))
    return dates
Example #11
def parse_countries(text):
    countries = set()
    for country in multi_split(text, COUNTRY_SPLIT):
        country = remove_bracketed(country)
        countries.add(country)
    return countries
Example #12
def split_new(text):
    # It's 2022 and they can't multi-value a thing...
    return multi_split(text, [". ", ", "])
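Assuming multi_split() behaves as sketched under Example #1, this turns a single free-text cell into a list of values:

split_new("First Name. Second Name, Third Name")
# -> ["First Name", "Second Name", "Third Name"]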
Example #13
def clean_phones(phones):
    out = []
    for phone in multi_split(phones, SPLITS):
        phone = REMOVE.sub("", phone)
        out.append(phone)
    return out
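SPLITS and REMOVE are module-level constants not shown here; REMOVE is presumably a regex that strips everything except digits and a leading plus sign. Hypothetical stand-ins and a usage example:

import re

SPLITS = [";", ","]             # assumed separators between phone numbers
REMOVE = re.compile(r"[^\d+]")  # assumed: drop spaces, dashes, brackets, etc.

clean_phones("+380 (44) 123-45-67; +380 44 765 43 21")
# -> ["+380441234567", "+380447654321"]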
Example #14
def split_names(names):
    return multi_split(names, ["\n", ", "])
Example #15
def parse_entry(context: Context, entry):
    entity = context.make("LegalEntity")
    if entry.findtext("./type-entry") == "2":
        entity = context.make("Person")
    entry_id = entry.findtext("number-entry")
    entity.id = context.make_slug(entry_id)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))
    date_entry = entry.findtext("./date-entry")
    if date_entry:
        date = datetime.strptime(date_entry, "%Y%m%d")
        entity.add("createdAt", date.date())
        sanction.add("listingDate", date.date())
        sanction.add("startDate", date.date())

    for aka in entry.findall("./aka-list"):
        h.apply_name(
            entity,
            name1=aka.findtext("./aka-name1"),
            name2=aka.findtext("./aka-name2"),
            name3=aka.findtext("./aka-name3"),
            tail_name=aka.findtext("./aka-name4"),
            alias=aka.findtext("type-aka") != "N",
            is_weak=aka.findtext("./quality-aka") == "2",
            quiet=True,
        )

    for node in entry.findall("./title-list"):
        entity.add("title", node.text, quiet=True)

    for doc in entry.findall("./document-list"):
        reg = doc.findtext("./document-reg")
        number = doc.findtext("./document-id")
        country = doc.findtext("./document-country")
        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number, country)
        passport.add("holder", entity)
        passport.add("passportNumber", number)
        passport.add("summary", reg)
        passport.add("country", country)
        context.emit(passport)

    for doc in entry.findall("./id-number-list"):
        entity.add("idNumber", doc.text)

    for node in entry.findall("./address-list"):
        address = h.make_address(context, full=node.findtext("./address"))
        h.apply_address(context, entity, address)

    for pob in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob.text)

    for dob in entry.findall("./date-of-birth-list"):
        date = parse_date(dob.text)
        entity.add_cast("Person", "birthDate", date)

    for nat in entry.findall("./nationality-list"):
        for country in multi_split(nat.text, [";", ","]):
            country = remove_bracketed(country)
            entity.add("nationality", country, quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Example #16
def letter_split(text):
    return multi_split(text, SPLITS)
Example #17
def parse_entry(context, entry):
    entity = context.make("LegalEntity")
    if entry.findtext("./type-entry") == "2":
        entity = context.make("Person")
    entry_id = entry.findtext("number-entry")
    entity.id = context.make_slug(entry_id)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))
    date_entry = entry.findtext("./date-entry")
    if date_entry:
        date = datetime.strptime(date_entry, "%Y%m%d")
        entity.context["created_at"] = date.isoformat()
        sanction.add("startDate", date.date())

    for aka in entry.findall("./aka-list"):
        first_name = aka.findtext("./aka-name1")
        entity.add("firstName", first_name, quiet=True)
        second_name = aka.findtext("./aka-name2")
        entity.add("secondName", second_name, quiet=True)
        third_name = aka.findtext("./aka-name3")
        entity.add("middleName", third_name, quiet=True)
        last_name = aka.findtext("./aka-name4")
        entity.add("lastName", last_name, quiet=True)
        name = jointext(first_name, second_name, third_name, last_name)
        if aka.findtext("type-aka") == "N":
            entity.add("name", name)
        else:
            if aka.findtext("./quality-aka") == "2":
                entity.add("weakAlias", name)
            else:
                entity.add("alias", name)

    for node in entry.findall("./title-list"):
        entity.add("title", node.text, quiet=True)

    for doc in entry.findall("./document-list"):
        reg = doc.findtext("./document-reg")
        number = doc.findtext("./document-id")
        country = doc.findtext("./document-country")
        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number,
                                      country)
        passport.add("holder", entity)
        passport.add("passportNumber", number)
        passport.add("summary", reg)
        passport.add("country", country)
        context.emit(passport)

    for doc in entry.findall("./id-number-list"):
        entity.add("idNumber", doc.text)

    for node in entry.findall("./address-list"):
        address = h.make_address(context, full=node.findtext("./address"))
        h.apply_address(context, entity, address)

    for pob in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob.text)

    for dob in entry.findall("./date-of-birth-list"):
        date = parse_date(dob.text)
        entity.add_cast("Person", "birthDate", date)

    for nat in entry.findall("./nationality-list"):
        for country in multi_split(nat.text, [";", ","]):
            country = remove_bracketed(country)
            entity.add("nationality", country, quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
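jointext() in this last example is assumed to concatenate the non-empty name fragments with single spaces. A minimal sketch:

from typing import Optional

def jointext(*parts: Optional[str], sep: str = " ") -> str:
    """Join the given fragments with sep, skipping None and
    empty or whitespace-only values."""
    cleaned = [p.strip() for p in parts if p is not None and p.strip()]
    return sep.join(cleaned)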