コード例 #1
0
ファイル: normalizers.py プロジェクト: opennem/opennem
def _old_station_name_cleaner(facility_name: str) -> str:
    """Clean a raw AEMO facility name into something humans can read.

    Legacy cleaner. The name is lower-cased, capacity units and punctuation
    are removed, entries of ``STRIP_WORDS`` are dropped, each remaining word
    is re-capitalised (or upper-cased when listed in ``ACRONYMS``) and a few
    hard-coded special cases are applied. Whenever ``station_map_name``
    returns a name different from the one passed in, that mapped name is
    returned immediately.

    Args:
        facility_name: Raw facility name. Non-string values are converted
            with ``str()``; ``None`` is treated as an empty name.

    Returns:
        The mapped station name if a mapping applies, otherwise the cleaned
        name.
    """
    # Treat None as an empty name rather than the literal string "None".
    if facility_name is None:
        name_clean = ""
    elif isinstance(facility_name, str):
        name_clean = facility_name.strip()
    else:
        name_clean = str(facility_name).strip()

    # @todo check has duid / unit other

    name_clean = name_clean.lower()

    # @TODO replace with the re character stripped - this is unicode junk
    name_clean = name_clean.replace("\u00a0", " ")

    # A manual mapping wins over any cleaning.
    mapped_name = station_map_name(name_clean)
    if mapped_name != name_clean:
        return mapped_name

    # strip units from name
    name_clean = re.sub(r"\d+\ ?(mw|kw|MW|KW)", "", name_clean)

    # strip other chars
    name_clean = re.sub(r",|-|\(|\)|\–|\"|\'", "", name_clean)

    # collapse runs of spaces left behind by the removals above
    name_clean = re.sub(" +", " ", name_clean)

    name_clean = name_clean.replace("yalumba winery", "yalumba")
    name_clean = name_clean.replace("university of melbourne", "uom")

    # @TODO remove these hard codes
    # name_clean is lower case at this point, so the exempted names must be
    # written in lower case as well or they can never match.
    if name_clean not in ("barcaldine solar farm", "darling downs solar farm"):
        for w in STRIP_WORDS:
            if name_clean.startswith("todae solar") and w == "solar":
                continue

            # Only multi-word entries are removed here; single words are
            # dropped component by component in the loop below.
            if " " in w:
                name_clean = name_clean.replace(w, "")

    name_components_parsed = []

    for raw_comp in name_clean.strip().split(" "):
        if not raw_comp:
            continue

        comp: Optional[str] = re.sub(r",|-|\(|\)|\–", "", raw_comp.strip())

        # None marks a component that is dropped when the name is re-joined.
        if comp == "" or comp in STRIP_WORDS:
            comp = None

        if comp is not None:
            if comp in ACRONYMS:
                comp = comp.upper()
            elif comp.startswith("mc"):
                # e.g. "mcfoo" -> "McFoo"
                comp = "Mc" + comp[2:].capitalize()
            else:
                comp = comp.capitalize()

        # strip numbers greater than 5
        comp_clean = clean_station_numbers_to_string(comp)

        if comp_clean:
            comp = comp_clean

        name_components_parsed.append(comp)

    # A blank name produces no components, so guard the index access.
    # NOTE(review): components are capitalised or upper-cased above, so this
    # lower-case comparison looks unreachable unless
    # clean_station_numbers_to_string changes the case - confirm.
    if name_components_parsed and name_components_parsed[0] == "uom":
        name_components_parsed = name_components_parsed[:-1]

    name_clean = " ".join(
        str(i) for i in name_components_parsed if i is not None)

    name_clean = re.sub(" +", " ", name_clean)

    name_clean = name_clean.strip()

    if "/" in name_clean:
        name_clean = " / ".join(
            i.strip().title() for i in name_clean.split("/"))

    # Second chance for a manual mapping, now against the cleaned name.
    mapped_name = station_map_name(name_clean)
    if mapped_name != name_clean:
        return mapped_name

    # uom special case
    name_clean = name_clean.replace("UOM ", "UoM ")

    # todae special case: "Todae <rest>" becomes "<rest> (Todae)"
    todae_match = re.match(r"^(Todae)\ (.*)", name_clean)
    if todae_match:
        todae_name, todae_rest = todae_match.groups()

        name_clean = f"{todae_rest} ({todae_name})"

    return name_clean
コード例 #2
0
ファイル: normalizers.py プロジェクト: opennem/opennem
def station_name_cleaner(station_name: str) -> str:
    """Normalise a station name, deferring to manual mappings where they apply.

    ``station_map_name`` is consulted three times: on the raw name, after the
    first cleaning pass and on the final result. The first lookup that returns
    a name different from its input is returned as-is. Names for which
    ``skip_clean_for_matching`` is truthy are returned untouched.
    """
    # A manual mapping on the raw name short-circuits everything else.
    mapped = station_map_name(station_name)
    if mapped != station_name:
        return mapped

    if skip_clean_for_matching(station_name):
        return station_name

    # First pass: whole-string normalisation, applied in order.
    pre_clean_steps = (
        str.strip,
        str.lower,
        strip_double_spaces,
        strip_encoded_non_breaking_spaces,
        strip_station_name_numbering,
        strip_capacity_from_string,
        strip_non_alpha_characters_from_string,
        strip_words_from_sentence,
        strip_double_spaces,
        station_name_hyphenate,
        str.strip,
    )
    cleaned = station_name
    for step in pre_clean_steps:
        cleaned = step(cleaned)  # type: ignore

    # The normalised name may now match a manual mapping.
    mapped = station_map_name(cleaned)
    if mapped != cleaned:
        return mapped

    # Second pass: handle the name word by word.
    words: List[str] = []
    for word in cleaned.strip().split(" "):
        if not word:
            continue

        if word in STRIP_WORDS:
            # Stripped words leave an empty slot; the resulting extra spaces
            # are collapsed by the post-clean steps.
            words.append("")
            continue

        if word in ACRONYMS:
            word = word.upper()
        elif word.startswith("mc"):
            word = "Mc" + word[2:].capitalize()
        else:
            word = word.capitalize()

        # strip numbers greater than 5
        words.append(clean_station_numbers_to_string(word))

    # Third pass: tidy up the re-joined name.
    post_clean_steps = (
        str.strip,
        strip_double_spaces,
        clean_and_format_slashed_station_names,
        station_name_run_replacements,
        strip_station_name_numbering,
        station_name_hyphenate,
    )
    cleaned = " ".join(words)
    for step in post_clean_steps:
        cleaned = step(cleaned)  # type: ignore

    # Last chance for a manual mapping.
    mapped = station_map_name(cleaned)
    if mapped != cleaned:
        return mapped

    return cleaned