def process_item(self, item, spider=None):
    """Attach cleaned names to the item's generators and group them.

    Every generator dict gains two keys:

    * ``name`` - the result of ``station_name_cleaner`` on ``station_name``.
    * ``name_join`` - ``False`` when ``facility_station_join_by_name``
      accepts the cleaned name, otherwise the generator's ``duid``.

    The generators are then bucketed by the ``(name, name_join)`` pair.

    Args:
        item: Mapping with a ``"generators"`` list of dicts. Each dict needs
            a ``station_name`` key, and a ``duid`` key whenever joining by
            name is not allowed for its cleaned name.
        spider: Not used by this method.

    Returns:
        A shallow copy of ``item`` whose ``"generators"`` value is a dict
        mapping ``(name, name_join)`` tuples to lists of generator dicts.

    Raises:
        Exception: If ``item`` has no ``"generators"`` key.
    """
    if "generators" not in item:
        raise Exception("No generators found in item pipeline failed")

    generators = []
    for generator in item["generators"]:
        # Clean the name once and reuse it for both derived fields.
        name = station_name_cleaner(generator["station_name"])
        name_join = (
            False if facility_station_join_by_name(name) else generator["duid"]
        )
        generators.append({**generator, "name": name, "name_join": name_join})

    # groupby() only merges adjacent runs and the input is not sorted, so
    # runs that share a key are accumulated into the same bucket here.
    generators_grouped = {}
    for key, group in groupby(
        generators, key=lambda g: (g["name"], g["name_join"])
    ):
        generators_grouped.setdefault(key, []).extend(group)

    return {**item, "generators": generators_grouped}
def stations_grouper(tables):
    """Convert PARTICIPANT_REGISTRATION_STATION rows into station dicts.

    Each row becomes a station dict (ids are assigned sequentially starting
    at 1000) and is stored in ``tables["mms"]`` under its ``STATIONID``.
    An existing ``tables["mms"]`` mapping is reused and updated in place.

    Args:
        tables: Mapping that must contain ``PARTICIPANT_REGISTRATION_STATION``.

    Returns:
        The same ``tables`` object with its ``"mms"`` entry set.

    Raises:
        Exception: If the station table is missing from ``tables``.
    """
    if "PARTICIPANT_REGISTRATION_STATION" not in tables:
        raise Exception("No PARTICIPANT_REGISTRATION_STATION table")

    rows = tables["PARTICIPANT_REGISTRATION_STATION"]
    mms = tables["mms"] if "mms" in tables else {}

    # Build every station first so a bad row fails before mms is touched.
    stations = []
    for station_id, row in enumerate(rows, start=1000):
        stations.append(
            {
                "id": station_id,
                "updated_at": parse_mms_date(row["LASTCHANGED"]),
                "name": station_name_cleaner(row["STATIONNAME"]),
                "code": row["STATIONID"],
                "station_code": row["STATIONID"],
                "network_name": row["STATIONNAME"],
                "address1": row["ADDRESS1"],
                "address2": row["ADDRESS2"],
                "locality": row["CITY"],
                "state": row["STATE"],
                "postcode": row["POSTCODE"],
                "facilities": [],
            }
        )

    # A later row with the same station code replaces the earlier one.
    for station in stations:
        mms[station["station_code"]] = station

    tables["mms"] = mms

    return tables
def test_dashed_names_whitespace_capitalized(self):
    """A lowercase, unevenly spaced slash name is spaced and capitalized."""
    raw_name = "catagunya/liapootah / wayatinah"
    expected = "Catagunya / Liapootah / Wayatinah"

    cleaned = station_name_cleaner(raw_name)

    assert (
        cleaned == expected
    ), "Catagunya slash name whitespaced and capitalized correctly"
def process_item(self, item, spider=None):
    """Create or update one ``Station`` row per station record in ``item``.

    Stations are matched on ``Station.network_code`` against the normalized
    ``STATIONID``. Each record is committed on its own; a record whose
    commit fails is rolled back, logged and skipped so the remaining records
    can still be written.

    Args:
        item: Iterable of mappings with the keys ``STATIONID``,
            ``STATIONNAME``, ``ADDRESS1``, ``ADDRESS2``, ``CITY``, ``STATE``
            and ``POSTCODE``.
        spider: Not used by this method.

    Returns:
        None.
    """
    s = self.session()

    records_updated = 0
    records_created = 0

    try:
        for record in item:
            duid = normalize_duid(record["STATIONID"])
            name = station_name_cleaner(record["STATIONNAME"])
            network_name = normalize_string(record["STATIONNAME"])
            address1 = normalize_string(record["ADDRESS1"])
            address2 = normalize_string(record["ADDRESS2"])
            city = normalize_string(record["CITY"])
            state = normalize_string(record["STATE"]).capitalize()
            postcode = normalize_string(record["POSTCODE"])

            station = (
                s.query(Station)
                .filter(Station.network_code == duid)
                .one_or_none()
            )

            created = not station

            if created:
                station = Station(
                    code=duid,
                    network_code=duid,
                    created_by="au.nem.mms.stations",
                )
            else:
                station.updated_by = "au.nem.mms.stations"

            station.name = name
            station.network_id = "NEM"
            station.network_name = network_name
            station.address1 = address1
            station.address2 = address2
            station.locality = city
            station.state = state
            station.postcode = postcode

            try:
                s.add(station)
                s.commit()
            except Exception as e:
                # A failed flush leaves the session's transaction unusable
                # until it is rolled back; without this every later commit
                # in the loop would fail as well.
                s.rollback()
                logger.error(e)
                continue

            # Only count and report records that were actually persisted.
            if created:
                records_created += 1
            else:
                records_updated += 1

            logger.debug(
                "{} station record with id {}".format(
                    "Created" if created else "Updated", duid
                )
            )
    finally:
        s.close()

    logger.info(
        "Created {} records and updated {}".format(
            records_created, records_updated
        )
    )
def test_hallet_is_three_units(self):
    """The three Hallett names must stay distinct after cleaning."""
    hallet_names = [
        "Hallett Power Station",
        "Hallett 1 Wind Farm",
        "Hallett 2 Wind Farm",
    ]

    # A set collapses duplicates, so equal sizes means no two names merged.
    distinct_cleaned = {station_name_cleaner(raw) for raw in hallet_names}

    assert len(hallet_names) == len(
        distinct_cleaned
    ), "Hallet should have three distinct names"
def parse_aemo_general_information(filename: str) -> List[AEMOGIRecord]:
    """Parse the generation sheet of a GI workbook into records.

    Reads the ``ExistingGeneration&NewDevs`` sheet from its third row and
    stops at the first row whose first cell is empty. For each row the
    columns listed in ``GI_EXISTING_NEW_GEN_KEYS`` are extracted and three
    derived fields (``name``, ``status_id``, ``fueltech_id``) are added.

    Args:
        filename: Path of the workbook to load.

    Returns:
        One ``AEMOGIRecord`` per data row, in sheet order.

    Raises:
        Exception: If the workbook has no ``ExistingGeneration&NewDevs``
            sheet.
    """
    wb = load_workbook(filename, data_only=True)

    SHEET_KEY = "ExistingGeneration&NewDevs"

    if SHEET_KEY not in wb:
        raise Exception("Doesn't look like a GI spreadsheet")

    ws = wb[SHEET_KEY]

    # The sheet has many hidden columns; resolve the wanted column letters
    # to zero-based tuple indexes once instead of once per row.
    column_indexes = [
        excel_column_to_column_index(column) - 1
        for column in GI_EXISTING_NEW_GEN_KEYS.values()
    ]

    records = []

    for row in ws.iter_rows(min_row=3, values_only=True):
        # break at end of data records
        # GI has a blank line before garbage notes
        if row[0] is None:
            break

        fields = dict(
            zip(GI_EXISTING_NEW_GEN_KEYS, (row[idx] for idx in column_indexes))
        )

        fields.update(
            name=station_name_cleaner(fields["StationName"]),
            status_id=aemo_gi_status_map(fields["UnitStatus"]),
            fueltech_id=aemo_gi_fueltech_to_fueltech(fields["FuelSummary"]),
        )

        records.append(AEMOGIRecord(**fields))

    return records
def wikidata_parse() -> None:
    """Enrich the wikidata export and write it to a JSON file.

    Loads ``wikidata.json``, and for every entry derives the wikipedia
    article id, the wikidata id and a cleaned station name, then tries to
    fetch a wikipedia summary. The results are written to
    ``data/wikidata-parsed.json``. Progress is printed per entry.
    """
    # query: https://w.wiki/dVi
    # download the simplified json and save to wikidata.json
    wikidata = load_data("wikidata.json", from_project=True)

    out_entries = []
    total_entries = len(wikidata)

    for current, entry in enumerate(wikidata, start=1):
        wikilink = article_from_wikipedia(entry["article"])
        # Kept under its own name so the collection being iterated is not
        # rebound inside the loop.
        wikidata_id = dataid_from_url(entry["item"])
        station_name = station_name_cleaner(entry["itemLabel"])

        description = None

        try:
            description = wikipedia.summary(wikilink)
        except Exception as e:
            # Best effort: an entry without a summary is still written out.
            print(e)

        out_entries.append(
            {
                "wikipedia": entry["article"],
                "wikidata": entry["item"],
                "wiki_id": wikilink,
                "wikidata_id": wikidata_id,
                "name": station_name,
                "name_original": entry["itemLabel"],
                "description": description,
            }
        )

        print("Done {} of {}".format(current, total_entries))

    with open("data/wikidata-parsed.json", "w") as fh:
        json.dump(out_entries, fh)
def test_station_name_cleaner(station_name: str, station_name_clean: str) -> None:
    """Check that cleaning ``station_name`` yields ``station_name_clean``."""
    cleaned = station_name_cleaner(station_name)

    assert cleaned == station_name_clean, "Clean name matches"
def test_dashed_names_with_stripping(self):
    """A trailing "Power Station" is removed from a slash-separated name."""
    expected = "Catagunya / Liapootah / Wayatinah"

    cleaned = station_name_cleaner(
        "Catagunya / Liapootah / Wayatinah Power Station"
    )

    assert cleaned == expected, "Catagunya hyphenated name"
def test_strip_combined_cycle(self):
    """"Combined Cycle" is removed from the end of a station name."""
    cleaned = station_name_cleaner("Tamar Valley Combined Cycle")

    assert (
        cleaned == "Tamar Valley"
    ), "Tamar Valley Combined Cycle stripped to suburb"
def test_strip_landfill(self):
    """"Landfill" is removed from the end of a station name."""
    cleaned = station_name_cleaner("Broadmeadows Landfill")

    assert (
        cleaned == "Broadmeadows"
    ), "Broadmeadows Landfill becomes Broadmeadows"
def test_grosvenor_stripping(self):
    """The fuel/plant description after "Grosvenor 1" is removed."""
    raw_name = "Grosvenor 1 Waste Coal Mine Gas Power Station"

    cleaned = station_name_cleaner(raw_name)

    assert cleaned == "Grosvenor 1", "Grosvenor strips specifications"
def test_dashed_names_with_stripping_capitalized(self):
    """A lowercase last segment is capitalized and "power station" dropped."""
    expected = "Catagunya / Liapootah / Woy Woy"

    cleaned = station_name_cleaner(
        "Catagunya / Liapootah / woy woy power station"
    )

    assert cleaned == expected, "Catagunya hyphenated name"
def test_power_stripper(self):
    """"Test Power Station" is reduced to "Test"."""
    cleaned = station_name_cleaner("Test Power Station")

    assert cleaned == "Test", "Test power station becomes just Test"
def rel_grouper(records, station_code_map):
    """Turn registration/exemption rows into stations keyed by station code.

    Every input row becomes a facility dict. Facilities are grouped by the
    station code returned from ``lookup_station_code``, and every group is
    wrapped in a station dict whose ids are assigned sequentially from 2000.

    Args:
        records: Iterable of row mappings with the keys ``station_name``,
            ``duid``, ``unit_no``, ``fuel_source_primary``,
            ``fuel_source_descriptor``, ``tech_primary``,
            ``tech_primary_descriptor``, ``dispatch_type``, ``region``,
            ``unit_size``, ``reg_cap`` and ``max_cap``.
        station_code_map: Passed through to ``lookup_station_code``.

    Returns:
        Dict mapping station code to a station dict with the keys ``name``,
        ``network_name``, ``code``, ``id`` and ``facilities``.

    Raises:
        Exception: If a group of facilities has no (falsy) station code.
    """
    records_parsed = []

    for row in records:
        name = station_name_cleaner(row["station_name"])
        duid = normalize_duid(row["duid"])
        unit = parse_unit_duid(row["unit_no"], duid)
        fueltech = lookup_fueltech(
            row["fuel_source_primary"],
            row["fuel_source_descriptor"],
            row["tech_primary"],
            row["tech_primary_descriptor"],
            row["dispatch_type"],
        )
        station_code = lookup_station_code(
            [duid], row["station_name"], station_code_map
        )

        records_parsed.append(
            {
                "name": name,
                "code": duid,
                "status": parse_facility_status("operating"),
                "station_code": station_code,
                "network_region": row["region"].strip(),
                "network_name": row["station_name"].strip(),
                "unit_size": clean_capacity(row["unit_size"]),
                "unit_code": get_unit_code(unit, duid, name),
                "dispatch_type": parse_dispatch_type(row["dispatch_type"]),
                "fueltech": parse_facility_fueltech(fueltech),
                "capacity_registered": clean_capacity(row["reg_cap"]),
                "capacity_maximum": clean_capacity(row["max_cap"]),
            }
        )

    # groupby() only merges adjacent rows and the input is not sorted, so
    # runs with the same station code are accumulated into one bucket.
    grouped_records = {}
    for key, group in groupby(records_parsed, key=lambda v: v["station_code"]):
        grouped_records.setdefault(key, []).extend(group)

    # Keys of grouped_records are unique, so two groups can never map to the
    # same station code; only the unmapped case needs checking.
    coded_records = {}
    for _id, (station_code, rel) in enumerate(
        grouped_records.items(), start=2000
    ):
        if not station_code:
            raise Exception("Unmapped station: {}".format(rel))

        # The station takes its name from the first facility in the group.
        station_name = rel[0]["network_name"]

        coded_records[station_code] = {
            "name": station_name_cleaner(station_name),
            "network_name": station_name,
            "code": station_code,
            "id": _id,
            "facilities": rel,
        }

    return coded_records
def test_acronyms(self):
    """Lowercase "bhp power" is cleaned to the upper-case "BHP"."""
    cleaned = station_name_cleaner("bhp power")

    assert cleaned == "BHP", "Acronym is BHP"
def test_hallett_power(self):
    """"Hallett Power Station" is reduced to "Hallett"."""
    cleaned = station_name_cleaner("Hallett Power Station")

    # Failure message spells the station the same way as the test data.
    assert cleaned == "Hallett", "Hallett Power Station is Hallett"
def test_name_mapping_and_stripping(self):
    """The SA virtual power plant name, with stage suffix, maps to "SA VPP"."""
    raw_name = "SA Government Virtual Power Plant - stage 1"

    cleaned = station_name_cleaner(raw_name)

    assert (
        cleaned == "SA VPP"
    ), "SA Government Virtual Power Plant maps to SA VPP"
def test_stripping_units(self):
    """"LFG PS Units 1-4" is removed, leaving "Eastern Creek"."""
    cleaned = station_name_cleaner("Eastern Creek LFG PS Units 1-4")

    assert cleaned == "Eastern Creek", "Eastern Creek should strip units"
def test_name_mapping_hornsdale(self):
    """The "Unit 1" suffix is removed from the Hornsdale Power Reserve name."""
    expected = "Hornsdale Power Reserve"

    cleaned = station_name_cleaner("Hornsdale Power Reserve Unit 1")

    assert cleaned == expected, "Hornsdale maps"
def test_unit_letters(self):
    """A quoted unit letter is kept, without quotes, after the name."""
    cleaned = station_name_cleaner("Yallourn 'W' Power Station")

    assert cleaned == "Yallourn W", "Yallourn has a unit letter"
def test_name_uni_melbourne(self):
    """"University of Melbourne" is shortened to "UoM" in the name."""
    raw_name = "University of Melbourne Archives Brunswick"

    cleaned = station_name_cleaner(raw_name)

    assert (
        cleaned == "UoM Archives Brunswick"
    ), "UoM is abbreviated and suburb name added"
def test_strip_waste_disposal(self):
    """"Waste Disposal Facility" is removed, leaving "Wyndham"."""
    cleaned = station_name_cleaner("Wyndham Waste Disposal Facility")

    # Failure message spells the suburb the same way as the test data.
    assert cleaned == "Wyndham", "Wyndham Waste Disposal stripped to suburb"
def test_name_energy_brix(self):
    """"Energy Brix Complex" is mapped to "Morwell"."""
    cleaned = station_name_cleaner("Energy Brix Complex")

    assert cleaned == "Morwell", "Energy Brix Complex becomes Morwell"
def test_dashed_names(self):
    """An already clean slash-separated name comes back unchanged."""
    raw_name = "Catagunya / Liapootah / Wayatinah"

    cleaned = station_name_cleaner(raw_name)

    assert (
        cleaned == "Catagunya / Liapootah / Wayatinah"
    ), "Catagunya slash name"
def test_government_virtual(self):
    """The SA Government Virtual Power Plant name maps to "SA VPP"."""
    cleaned = station_name_cleaner(
        "SA Government Virtual Power Plant - stage 1"
    )

    assert cleaned == "SA VPP", "SA Government Virtual becomes SA VPP"
def update_existing_geos() -> None:
    """
    Old method to update geos from existing facilities file on OpenNEM

    Loads the ``facility_registry.json`` fixture. For every fixture station
    that already exists in the database (matched on
    ``Station.network_code``) it stores the fixture location on the station,
    then back-fills a missing fueltech on each of that station's facilities
    (matched on ``Facility.network_code``). Stations and facilities that
    cannot be found are logged and skipped. Changes are committed once per
    station.
    """
    station_fixture = load_data("facility_registry.json", from_fixture=True)

    stations = [{"station_code": k, **v} for k, v in station_fixture.items()]

    s = SessionLocal()

    try:
        for station_data in stations:
            station_code = normalize_duid(station_data["station_code"])

            station = (
                s.query(Station)
                .filter(Station.network_code == station_code)
                .one_or_none()
            )

            if not station:
                logger.info("Could not find station {}".format(station_code))
                continue

            if (
                "location" in station_data
                and "latitude" in station_data["location"]
                and station_data["location"]["latitude"]
            ):
                # Must be a plain EWKT string: a trailing comma previously
                # turned this value into a one-element tuple.
                # NOTE(review): EWKT is POINT(x y), which for SRID 4326 is
                # normally longitude first - confirm the latitude/longitude
                # order used here before relying on these geometries.
                station.geom = "SRID=4326;POINT({} {})".format(
                    station_data["location"]["latitude"],
                    station_data["location"]["longitude"],
                )
                station.geocode_processed_at = datetime.now()
                station.geocode_by = "opennem"
                station.geocode_approved = True
                station.updated_by = "fixture.registry"

                s.add(station)

                logger.info(
                    "Updated station geo location {} ({})".format(
                        station.code,
                        station.name,
                    )
                )

            # Use the facilities of the station being processed (this used
            # to read the first fixture station's facilities every time).
            facilities = [
                {"code": k, **v}
                for k, v in (station_data.get("duid_data") or {}).items()
            ]

            # update fueltechs
            for facility_data in facilities:
                facility_duid = facility_data["code"]
                facility_fueltech = lookup_fueltech(facility_data["fuel_tech"])

                facility = (
                    s.query(Facility)
                    .filter(Facility.network_code == facility_duid)
                    .first()
                )

                if not facility:
                    logger.error(
                        "Could not find existing facility {} for station {}".format(
                            facility_duid, station_code
                        )
                    )
                    continue

                if not facility.fueltech_id:
                    facility.fueltech_id = facility_fueltech

                if facility.fueltech_id != facility_fueltech:
                    # Old is what the database holds, new is the fixture.
                    logger.error(
                        "Fueltech mismatch for {}. Old is {} and new is {}".format(
                            station_code,
                            facility.fueltech_id,
                            facility_fueltech,
                        )
                    )

                s.add(facility)

            s.commit()
    finally:
        s.close()
def test_swanbank_b(self):
    """A combined "B & E" name is cleaned to just "Swanbank B"."""
    raw_name = "Swanbank B Power Station & Swanbank E Gas Turbine"

    cleaned = station_name_cleaner(raw_name)

    assert cleaned == "Swanbank B", "Swanbank B"
def process_facilities(self, records):
    """Create or update stations and facilities from grouped GI records.

    For each group of facility records this first tries to find an existing
    ``Station`` (through a facility with a matching DUID and region, then by
    station name where ``facility_station_join_by_name`` allows it) and
    creates one when nothing is found. It then walks the group's records,
    finds or creates the matching ``Facility`` and fills in fields that are
    still empty. Each facility is committed as it is processed.

    Args:
        records: Mapping whose values are sequences of facility record
            mappings; the keys are ignored. The record keys read here are
            ``FuelType``, ``UnitStatus``, ``station_name``, ``duid``,
            ``NameCapacity``, ``Units``, ``unit_capacity``,
            ``SurveyEffective``, ``Region`` and ``TechType``.

    Returns:
        None.

    Raises:
        Exception: If a facility ends up with ``status_id`` of ``None``, or
            re-raised from the final commit.
    """
    s = self.session()

    # Store a list of all existing duids
    # NOTE(review): all_duids is not read again in this function, so this
    # query looks like leftover work - confirm before removing.
    all_duids = list(
        set([
            i[0] for i in s.query(Facility.network_code).filter(
                Facility.network_code != None).all()
        ]))

    for _, facility_records in records.items():
        # Running unit counter used to derive unit ids within this station.
        facility_index = 1
        facility_station = None
        created_station = False

        station_network_name = record_get_station_name(facility_records)
        station_name = station_name_cleaner(station_network_name)

        duid_unique = has_unique_duid(facility_records)
        facility_count = len(facility_records)

        # Step 1. Find the station
        # First by duid if it's unique
        duid = get_unique_duid(facility_records)

        # all GI records should have a region
        station_network_region = get_unique_reqion(facility_records)

        # This is the most suitable unit record to use for the station
        # see helper above
        facility_station_record = get_station_record_from_facilities(
            facility_records)

        # Single record with a unique DUID: expect at most one facility.
        if duid and duid_unique and facility_count == 1:
            facility_lookup = None

            try:
                facility_lookup = (s.query(Facility).filter(
                    Facility.network_code == duid).filter(
                        Facility.network_region ==
                        station_network_region).one_or_none())
            except MultipleResultsFound:
                logger.error(
                    "Found multiple duid for station with code {}".format(
                        duid))
                # Ambiguous match: skip this whole group of records.
                continue

            if facility_lookup and facility_lookup.station:
                facility_station = facility_lookup.station

        # NOTE(review): `and` binds tighter than `or`, so this reads as
        # (duid and duid_unique and facility_count > 1) or (not duid_unique);
        # the second arm runs the query even when duid is falsy - confirm
        # that is intended.
        if (duid and (duid_unique and facility_count > 1) or not duid_unique):
            facility_lookup = (s.query(Facility).filter(
                Facility.network_code == duid).filter(
                    Facility.network_region ==
                    station_network_region).first())

            if facility_lookup and facility_lookup.station:
                facility_station = facility_lookup.station

        # Fall back to an exact station-name match when joining by name is
        # allowed for this name.
        if not facility_station and facility_station_join_by_name(
                station_name):
            try:
                facility_station = (s.query(Station).filter(
                    Station.name == station_name).one_or_none())
            except MultipleResultsFound:
                logger.warning(
                    "Multiple results found for station name : {}".format(
                        station_name))
                facility_station = None

        # If we have a station name, and no duid, and it's ok to join by name
        # then find the station (make sure to region lock)
        if (station_name and not duid and not facility_station
                and facility_station_join_by_name(station_name)):
            facility = (s.query(Facility).join(Facility.station).filter(
                Facility.network_region == station_network_region).filter(
                    Station.name == station_name).first())

            if facility:
                facility_station = facility.station

        # Create one as it doesn't exist
        if not facility_station:
            facility_station = Station(
                name=station_name,
                network_name=name_normalizer(
                    facility_station_record["station_name"]),
                network_id="NEM",
                created_by="pipeline.aemo.general_information",
            )

            s.add(facility_station)
            # Committed immediately so the station exists before facilities
            # are attached to it below.
            s.commit()

            created_station = True
        else:
            facility_station.updated_by = (
                "pipeline.aemo.general_information")

        for facility_record in facility_records:
            if facility_record["FuelType"] in ["Natural Gas Pipeline"]:
                continue

            # skip these statuses too
            if facility_record["UnitStatus"] in FACILITY_INVALID_STATUS:
                continue

            facility = None
            created_facility = False

            facility_network_name = name_normalizer(
                facility_record["station_name"])
            # NOTE(review): facility_name is not used below.
            facility_name = station_name_cleaner(
                facility_record["station_name"])
            # From here on `duid` is this record's DUID and replaces the
            # station-level value computed above.
            duid = normalize_duid(facility_record["duid"])
            reg_cap = clean_capacity(facility_record["NameCapacity"])

            # A missing/zero unit count is treated as a single unit.
            units_num = facility_record["Units"] or 1
            unit_id = facility_index + (units_num - 1)

            unit = parse_unit_duid(unit_id, duid)
            unit_size = clean_capacity(facility_record["unit_capacity"])
            unit_code = get_unit_code(unit, duid,
                                      facility_record["station_name"])

            facility_comissioned = facility_record["SurveyEffective"]
            facility_comissioned_dt = None

            # Exact type checks: subclasses of datetime/str are not matched.
            if type(facility_comissioned) is datetime:
                facility_comissioned_dt = facility_comissioned

            try:
                if type(facility_comissioned) is str:
                    # String dates are day/month/two-digit-year.
                    facility_comissioned_dt = datetime.strptime(
                        facility_comissioned, "%d/%m/%y")
            except ValueError:
                logger.error(
                    "Error parsing date: {}".format(facility_comissioned))

            facility_status = map_aemo_facility_status(
                facility_record["UnitStatus"])
            facility_network_region = normalize_aemo_region(
                facility_record["Region"])
            # Only look up a fueltech when the record has a non-empty
            # FuelType.
            facility_fueltech = (lookup_fueltech(
                facility_record["FuelType"],
                techtype=facility_record["TechType"],
            ) if ("FuelType" in facility_record
                  and facility_record["FuelType"]) else None)

            if not facility_fueltech:
                logger.error("Error looking up fueltech: {} {} ".format(
                    facility_record["FuelType"],
                    facility_record["TechType"],
                ))

            # check if we have it by code first
            facility = (s.query(Facility).filter(
                Facility.code == unit_code).one_or_none())

            # Otherwise fall back to DUID + region.
            if not facility and duid:
                try:
                    facility = (
                        s.query(Facility).filter(
                            Facility.network_code == duid).filter(
                                Facility.network_region ==
                                facility_network_region)
                        # .filter(Facility.nameplate_capacity != None)
                        .one_or_none())
                except MultipleResultsFound:
                    # NOTE(review): logger.warn is a deprecated alias of
                    # logger.warning.
                    logger.warn(
                        "Multiple results found for duid : {}".format(
                            duid))

                if facility:
                    # NOTE(review): facility_station appears to always be
                    # set by this point, so this branch may never run.
                    if facility.station and not facility_station:
                        facility_station = facility.station

                    logger.info(
                        "GI: Found facility by DUID: code {} station {}".
                        format(
                            facility.code,
                            facility.station.name
                            if facility.station else None,
                        ))

            # Done trying to find existing
            if not facility:
                facility = Facility(
                    code=unit_code,
                    network_code=duid,
                    created_by="pipeline.aemo.general_information",
                )
                facility.station = facility_station
                created_facility = True

            # The updates below only fill in values that are still empty.
            if duid and not facility.network_code:
                facility.network_code = duid
                facility.updated_by = "pipeline.aemo.general_information"

            if not facility.network_region:
                facility.network_region = facility_network_region
                facility.updated_by = "pipeline.aemo.general_information"

            if not facility.network_name:
                facility.network_name = facility_network_name
                facility.updated_by = "pipeline.aemo.general_information"

            if not facility.fueltech_id and facility_fueltech:
                facility.fueltech_id = facility_fueltech
                facility.updated_by = "pipeline.aemo.general_information"

            # Capacity is also overwritten when the facility has a status
            # other than "operating".
            if not facility.capacity_registered or (
                    facility.status and facility.status != "operating"):
                facility.capacity_registered = reg_cap
                facility.updated_by = "pipeline.aemo.general_information"

            # @TODO work this out
            # facility.dispatch_type = facility_dispatch_type

            if not facility.unit_id:
                facility.unit_id = unit.id
                facility.unit_number = unit.number
                facility.unit_size = unit_size
                facility.unit_alias = unit.alias

            if not facility.unit_capacity or (
                    facility.status and facility.status != "operating"):
                facility.unit_capacity = unit_size
                facility.updated_by = "pipeline.aemo.general_information"

            # The status is always overwritten from the record.
            # if not facility.status_id:
            facility.status_id = facility_status
            # facility.updated_by = "pipeline.aemo.general_information"

            if not facility.registered and facility_comissioned_dt:
                facility.registered = facility_comissioned_dt
                facility.updated_by = "pipeline.aemo.general_information"

            facility.station = facility_station

            if facility.fueltech_id is None:
                logger.warning("Could not find fueltech for: {} {}".format(
                    facility.code, facility.network_code))

            # facility.status_id = facility_status

            if facility_station and not facility.station:
                facility.station = facility_station

            if facility.status_id is None:
                raise Exception(
                    "GI: Failed to map status ({}) on row: {}".format(
                        facility.status_id, facility_record))

            s.add(facility)
            s.commit()

            # Advance the counter past the units covered by this record.
            facility_index += units_num

            if created_station:
                logger.info("GI: {} station with name {} ".format(
                    "Created" if created_station else "Updated",
                    station_name,
                    # facility_station.id,
                ))

            if created_facility:
                logger.info(
                    "GI: {} facility with duid {} to station {}".format(
                        "Created" if created_facility else "Updated",
                        duid,
                        station_name,
                    ))

    # Final commit for anything still pending; the session is always closed.
    try:
        s.commit()
    except Exception as e:
        logger.error(e)
        raise e
    finally:
        s.close()
def test_swanbank_e_single(self):
    """A bare "Swanbank E" comes back unchanged."""
    cleaned = station_name_cleaner("Swanbank E")

    assert cleaned == "Swanbank E", "Swanbank E"