def _old_station_name_cleaner(facility_name: str) -> str:
    """Clean a messy AEMO station name into something humans can read.

    Legacy implementation; ``station_name_cleaner`` is the refactored version.

    The name is lower-cased and checked against ``station_map_name``; if a
    mapping exists it is returned straight away. Otherwise capacity units,
    punctuation and strip words are removed, every remaining word is
    re-cased (acronyms upper-cased, ``mc`` prefixes as ``McXxx``, everything
    else capitalised), and the mapping is consulted once more on the result.

    Args:
        facility_name: Raw station name. Non-string values are converted with
            ``str()``; ``None`` is treated as an empty name.

    Returns:
        The mapped name when ``station_map_name`` returns something different
        for the lower-cased or the fully cleaned name, otherwise the cleaned
        name. An empty string is returned when nothing is left after cleaning
        (unless the empty name itself is mapped).
    """
    # A missing name is an empty name. Previously ``None`` was stringified
    # and came out of the cleaner as the literal station name "None".
    if facility_name is None:
        facility_name = ""

    # @todo check has duid / unit other
    name_clean = str(facility_name).strip().lower()

    # @TODO replace with the re character stripped - this is unicode junk
    name_clean = name_clean.replace("\u00a0", " ")

    # Exit early if the lower-cased name is already mapped
    if station_map_name(name_clean) != name_clean:
        return station_map_name(name_clean)

    # strip units from name
    name_clean = re.sub(r"\d+\ ?(mw|kw|MW|KW)", "", name_clean)

    # strip other chars, then collapse the runs of spaces that leaves behind
    name_clean = re.sub(r",|-|\(|\)|\–|\"|\'", "", name_clean)
    name_clean = re.sub(" +", " ", name_clean)

    name_clean = name_clean.replace("yalumba winery", "yalumba")
    name_clean = name_clean.replace("university of melbourne", "uom")

    # @TODO remove these hard codes
    # NOTE(review): name_clean is lower-case here, so the mixed-case
    # "Darling Downs Solar Farm" entry can never match - confirm whether that
    # station is really meant to be exempt before lower-casing the literal.
    if name_clean not in ["barcaldine solar farm", "Darling Downs Solar Farm"]:
        # Only multi-word strip phrases are removed from the whole string;
        # single strip words are dropped per component in the loop below.
        # (The former "todae solar"/"solar" exemption could never fire here
        # because "solar" contains no space.)
        for phrase in STRIP_WORDS:
            if " " in phrase:
                name_clean = name_clean.replace(phrase, "")

    name_components_parsed = []

    for raw_comp in name_clean.strip().split(" "):
        if not raw_comp:
            continue

        comp: Optional[str] = re.sub(r",|-|\(|\)|\–", "", raw_comp.strip())

        # Emptied components and strip words become None so that they are
        # left out when the name is joined back together.
        if comp == "" or comp in STRIP_WORDS:
            comp = None

        if comp is not None:
            if comp in ACRONYMS:
                comp = comp.upper()
            elif comp.startswith("mc"):
                comp = "Mc" + comp[2:].capitalize()
            else:
                comp = comp.capitalize()

        # strip numbers greater than 5
        comp_clean = clean_station_numbers_to_string(comp)

        if comp_clean:
            comp = comp_clean

        name_components_parsed.append(comp)

    # Nothing may be left after cleaning (e.g. "" or "-"); indexing the empty
    # list used to raise IndexError.
    # NOTE(review): components are re-cased above, so a first component equal
    # to the lower-case "uom" looks unreachable - confirm the intent.
    if name_components_parsed and name_components_parsed[0] == "uom":
        name_components_parsed = name_components_parsed[:-1]

    name_clean = " ".join(str(i) for i in name_components_parsed if i is not None)

    name_clean = re.sub(" +", " ", name_clean)
    name_clean = name_clean.strip()

    # Names joined by a slash get each side title-cased and evenly spaced
    if "/" in name_clean:
        name_clean = " / ".join(i.strip().title() for i in name_clean.split("/"))

    if station_map_name(name_clean) != name_clean:
        return station_map_name(name_clean)

    # uom special case
    name_clean = name_clean.replace("UOM ", "UoM ")

    # todae special case: "Todae <rest>" is rewritten as "<rest> (Todae)"
    todae_match = re.match(r"^(Todae)\ (.*)", name_clean)

    if todae_match:
        todae_name, todae_rest = todae_match.groups()
        name_clean = f"{todae_rest} ({todae_name})"

    return name_clean
def station_name_cleaner(station_name: str) -> str:
    """Refactored version of the station name cleaner.

    Cleans up a station name prior to applying any manual mappings. The name
    is checked against ``station_map_name`` three times - as given, after the
    normalising pass and after the final formatting pass - and the first
    check that yields a different name returns that mapped name. Names for
    which ``skip_clean_for_matching`` is truthy are returned untouched.
    """
    cleaned = station_name

    # A raw name that already maps needs no cleaning at all
    if station_map_name(cleaned) != cleaned:
        return station_map_name(cleaned)

    if skip_clean_for_matching(cleaned):
        return cleaned

    # First pass: normalise the raw string. Order matters.
    normalisers = (
        str.strip,
        str.lower,
        strip_double_spaces,
        strip_encoded_non_breaking_spaces,
        strip_station_name_numbering,
        strip_capacity_from_string,
        strip_non_alpha_characters_from_string,
        strip_words_from_sentence,
        strip_double_spaces,
        station_name_hyphenate,
        str.strip,
    )

    for normalise in normalisers:
        cleaned = normalise(cleaned)  # type: ignore

    # The normalised name may map even though the raw one did not
    if station_map_name(cleaned) != cleaned:
        return station_map_name(cleaned)

    # Second pass: re-case the name one space-separated word at a time
    words: List[str] = []

    for word in cleaned.strip().split(" "):
        # Empty pieces (from repeated spaces) are dropped outright
        if not word:
            continue

        # Strip words are blanked, but still hold their slot in the join
        if word in STRIP_WORDS:
            words.append("")
            continue

        if word in ACRONYMS:
            word = word.upper()
        elif word.startswith("mc"):
            word = "Mc" + word[2:].capitalize()
        else:
            word = word.capitalize()

        # strip numbers greater than 5
        words.append(clean_station_numbers_to_string(word))

    # Join the name back up
    cleaned = " ".join(words)

    # Final pass: tidy spacing and apply the formatting helpers, in order
    formatters = (
        str.strip,
        strip_double_spaces,
        clean_and_format_slashed_station_names,
        station_name_run_replacements,
        strip_station_name_numbering,
        station_name_hyphenate,
    )

    for apply_format in formatters:
        cleaned = apply_format(cleaned)  # type: ignore

    # Last chance for a manual mapping
    if station_map_name(cleaned) != cleaned:
        return station_map_name(cleaned)

    return cleaned