def measure_naptan_groups(gdf, naptan_column_name="LocalityName"):
    """Measures the number of groups present within the given
    geodataframe when grouped by the given naptan column.

    Args:
        gdf (geopandas.GeoDataFrame): the naptan stops dataframe.
        naptan_column_name (str, optional): the column to group by.
            Defaults to "LocalityName".

    Returns:
        tuple: a dataframe of group sizes and the groupby object itself.
    """
    # filter dataset to the bare minimum of columns needed.
    gdf2 = gdf[[
        "AreaName", "LocalityName", "geometry", "Longitude", "Latitude"
    ]]
    # reconvert to geo and drop the now redundant coordinate columns.
    gdf3 = geo.calculate_naptan_geometry(gdf2)
    gdf3 = gdf3.drop(["Longitude", "Latitude"], axis=1)
    # group by the given naptan column.
    groups = gdf3.groupby(naptan_column_name)
    # size() gives us the number of stops in each group.
    counts = groups.size().reset_index()
    counts.columns = [naptan_column_name, f"Size_{naptan_column_name}"]
    return counts, groups
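# A usage sketch, not part of the pipeline: `stops_gdf` is a hypothetical
# name for an already loaded naptan stops geodataframe. Prints the
# largest groups by stop count for the chosen column.
def show_largest_groups(stops_gdf, naptan_column_name="LocalityName", top=10):
    counts, _groups = measure_naptan_groups(stops_gdf, naptan_column_name)
    largest = counts.sort_values(f"Size_{naptan_column_name}",
                                 ascending=False)
    print(largest.head(top))
    return largest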
def group_naptan_datatypes(gdf, naptan_column='LocalityName'):
    """Groups the naptan dataset into subsets keyed on the given naptan
    column, reducing each subset to a single centroid row.

    Args:
        gdf (geopandas.GeoDataFrame): the naptan stops dataframe.
        naptan_column (str, optional): the column to group by.
            Defaults to 'LocalityName'.

    Returns:
        geopandas.GeoDataFrame: one row per group, with a centroid
            geometry point; also written out to csv.
    """
    # collapse dataset to the minimum, keeping possibly usable columns.
    gdf2 = gdf[[
        'LocalityName', 'NptgLocalityCode', 'AreaName', 'StopAreaCode',
        'Latitude', 'Longitude'
    ]]
    # calculate the mean coordinates of each given naptan segment.
    gdf3 = gdf2.groupby([naptan_column],
                        as_index=False)[['Latitude', 'Longitude']].mean()
    # convert the lat lon into centroid geometry points.
    gdf4 = geo.calculate_naptan_geometry(gdf3)
    # save output to csv.
    gdf4.to_csv(f'{naptan_column}.csv', encoding='utf-8', sep=',')
    return gdf4
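# Illustrative only: averaging raw Latitude/Longitude values is a fair
# centroid approximation across a small area such as a locality, but it
# drifts for large or irregularly shaped groups. This sketch compares the
# coordinate mean against the shapely geometry centroid for one group;
# `stops_gdf` is a hypothetical loaded naptan geodataframe.
def compare_centroids(stops_gdf, locality):
    subset = stops_gdf[stops_gdf['LocalityName'] == locality]
    mean_lat = subset['Latitude'].mean()
    mean_lon = subset['Longitude'].mean()
    geom_centroid = subset.geometry.unary_union.centroid
    print(f"coordinate mean: ({mean_lat:.5f}, {mean_lon:.5f})")
    print(f"geometry centroid: ({geom_centroid.y:.5f}, {geom_centroid.x:.5f})")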
def naptan_gazette_localities():
    """Returns the gazette locality data for use with the stops data.

    Returns:
        geopandas.GeoDataFrame: the nptg localities with lat/long and
            geometry columns added.
    """
    # just the basics
    cols = [
        'NptgLocalityCode', 'LocalityName', 'AdministrativeAreaCode',
        'QualifierName', 'NptgDistrictCode', 'SourceLocalityType',
        'GridType', 'Easting', 'Northing'
    ]
    # read the file
    gaz_locs = pd.read_csv(f'{nptg_dir}/Localities.csv',
                           encoding='iso-8859-1',
                           low_memory=True,
                           usecols=cols)
    gaz_locs = gaz_locs.rename(
        columns={'AdministrativeAreaCode': 'AdminCode'})
    gaz_locs['AdminCode'] = gaz_locs['AdminCode'].astype(str)
    # convert eastings/northings to lat long.
    gaz_locs = geo.convert_to_lat_long(gaz_locs)
    # calculate geometry point for the geodataframe.
    gaz_locs = geo.calculate_naptan_geometry(gaz_locs)
    # rename for the later merge; only the columns that change are listed.
    gaz_locs.rename(columns={
        'GridType': 'NptgGridType',
        'Longitude': 'Gazette_Longitude',
        'Latitude': 'Gazette_Latitude',
        'geometry': 'Gazette_geometry'
    }, inplace=True)
    # TODO new column merges Locality and qualifier name; check duplicates.
    gaz_locs['Qualified_Locality'] = (gaz_locs['LocalityName'] + ', ' +
                                      gaz_locs['QualifierName'])
    return gaz_locs
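# A minimal sketch of the duplicate check flagged in the TODO above; this
# is a hypothetical helper, not part of the pipeline. A qualified
# locality name that still appears more than once suggests the
# LocalityName/QualifierName merge did not disambiguate it.
def find_duplicate_qualified_localities():
    gaz_locs = naptan_gazette_localities()
    dupes = gaz_locs[gaz_locs['Qualified_Locality'].duplicated(keep=False)]
    return dupes.sort_values('Qualified_Locality')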
def find_unused_localities(cls, gdf):
    """Returns the gazette localities whose NptgLocalityCode does not
    appear in the given nodes dataframe, i.e. localities that no stop
    makes use of.

    Args:
        gdf (geopandas.GeoDataFrame): the naptan nodes dataframe.

    Returns:
        pandas.core.frame.DataFrame: localities that are not used in the
            nodes file.
    """
    # gazette locality values.
    localities = etl_pipe.naptan_gazette_localities()
    unused = localities[~localities['NptgLocalityCode'].
                        isin(gdf['NptgLocalityCode'])]
    # rename in preparation for geometry conversion.
    unused = unused.rename(columns={
        "Gazette_Longitude": "Longitude",
        "Gazette_Latitude": "Latitude"
    })
    # unused = geo_pipe.calculate_naptan_geometry(unused)
    # reporting function
    rep.report_failing_nodes(gdf,
                             'unused localities near stops',
                             failed_nodes=unused)
    # TODO find out if any stops fall inside the boundaries of the unused
    # localities; the geometries here are just points, so we need to find
    # the closest stops to those points instead.
    return unused
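# A sketch for the closest-stops TODO above, under stated assumptions: it
# is hypothetical, it needs geopandas >= 0.10 for sjoin_nearest and a
# module level `import geopandas as gpd`, and both frames must already
# carry point geometries in EPSG:4326. Reprojecting to the British
# National Grid (EPSG:27700) makes the reported distances metres.
def nearest_stops_to_unused(unused, stops_gdf):
    unused_m = unused.to_crs(epsg=27700)
    stops_m = stops_gdf.to_crs(epsg=27700)
    # one row per unused locality, joined to its nearest stop.
    return gpd.sjoin_nearest(unused_m, stops_m, distance_col="metres")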
def main(named_area):
    """Downloads the naptan dataset and runs the basic internal
    consistency checks and geospatial checks."""
    # etl pipeline functions.
    etl.naptan_data_source("nptg", "csv")
    etl.naptan_data_source("naptan_nodes", "csv")
    nodes = Path(f"{dl_home}/{timestr}_naptan_nodes.zip")
    nptg = Path(f"{dl_home}/{timestr}_nptg.zip")
    etl.extract_naptan_files(nodes)
    etl.extract_naptan_files(nptg)
    # dataframe creation
    gdf = etl.read_naptan_file("Stops")
    gdf = etl.deactivated_nodes(gdf)
    # join the gazette locality code and admin code data onto the nodes
    # dataframe; this gives us accurate locality and admin area names.
    locality_codes = etl.naptan_gazette_localities()
    gdf = etl.map_gazette_to_nodes(gdf, locality_codes, "NptgLocalityCode")
    admin_codes = etl.naptan_gazette_admin_area_codes()
    gdf = etl.map_gazette_to_nodes(gdf, admin_codes, "AdminCode")
    # merge on the stop area data and corresponding stop area codes.
    gdf = etl.merge_stop_areas(gdf)
    gdf = geopipe.calculate_naptan_geometry(gdf)
    # check the downloaded naptan data structure is within acceptable
    # tolerances.
    NaptanStructureChecks.check_naptan_stop_number_limits(gdf)
    # cli provides a named administrative area within the naptan dataset.
    naptan_area_level = "AreaName"  # TODO support locality level too.
    # TODO make the named area a geojson polygon with feature data.
    gdf_sub = etl.create_naptan_subframe(gdf, naptan_area_level, named_area)
    # data cleansing functions
    # illegal capitals
    IllegalCaptials.check_illegal_caps(gdf_sub, "StopPoint")
    # illegal characters
    IllegalCharacters.check_illegal_characters(gdf_sub, "StopPoint")
    # check for illegal spaces in required string columns.
    IllegalSpaces.check_illegal_spaces(gdf_sub)
    # the internal data consistency checks
    LocalitiesIDStops.localities_with_identical_stops(gdf_sub)
    NameContainsLocality.stop_name_contains_locality_name(gdf_sub)
    BearingMissing.stop_with_bearing_missing(gdf_sub)
    StopNameHighRisks.stop_names_with_high_risk_words(gdf_sub)
    StopsDifferentNamedAdminArea.stops_in_different_admin_area(gdf_sub)
    # TODO new checks - add to release notes
    CheckDateTime.check_stop_dates_not_after_today(gdf_sub)
    CheckName.check_name_length(gdf_sub)
    MultiRoadName.stop_with_multiple_road_names(gdf_sub, "CommonName")
    AtcocodeCheck.check_atcocode_length(gdf_sub)
    print("All internal consistency checks have been completed.")
    # geospatial data checks
    CoastlineStops.naptan_coastal_nodes(gdf_sub)
    # checks that should only be performed at locality level are passed
    # to this function collection, which runs each in turn.
    etl.locality_level_checks(gdf_sub)
    print("All geospatial functions have been completed.")
    # make the map and populate it with the node cluster.
    generate_base_map(gdf_sub)
    return gdf_sub
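# A minimal command line entry point sketch; the project may already wire
# main() up through its own cli, so the argument handling here is an
# assumption rather than the project's interface.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Run naptan consistency checks for one admin area.")
    parser.add_argument("named_area",
                        help="an AreaName present in the naptan dataset.")
    args = parser.parse_args()
    main(args.named_area)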
def create_naptan_subframe(gdf, naptan_area_level, col_value):
    """Creates a naptan subframe from a given column name and a value
    that must be present in that column.

    Arguments:
        gdf {geopandas.GeoDataFrame} -- the naptan dataframe.
        naptan_area_level {str} -- the name of the column to split the
            dataframe on.
        col_value {str} -- the value in the column to query.

    Returns:
        geodataframe -- the naptan subframe.
    """
    # normalise col_value: coerce ints to string, then lower-case it.
    if not isinstance(col_value, str):
        col_value = f'{col_value}'
    col_value = col_value.lower()
    # lower-case the matching column so comparisons are case-insensitive.
    lower_case = gdf[naptan_area_level].str.lower()
    gdf.update(pd.DataFrame(lower_case))
    # we put this here so we can filter out all areas that are managed by
    # dft centrally.
    dft_authorities = [
        'National - National Rail', 'National - National Air',
        'National - National Ferry', 'National - National Tram'
    ]
    # for grouping we need to pass in wildcard values, ideally string
    # contains, but the stop area code will always start with the
    # atcocode for the area or mode.
    if naptan_area_level == 'AreaName':
        # if the user passes in a DfT managed area, we exit out.
        if col_value in [a.lower() for a in dft_authorities]:
            sys.exit(f'{col_value} is a DfT central authority.')
        # we take every stop in the given admin area, as this includes
        # all forms of transport for the area and not just bus transport
        # infrastructure.
        gdf_sub = gdf[gdf[naptan_area_level] == col_value]
        if gdf_sub.empty:
            sys.exit(f"{col_value} was not found in the given dataframe.")
        return geo.calculate_naptan_geometry(gdf_sub)
    elif naptan_area_level == 'StopType':
        try:
            gdf1 = gdf[gdf['StopType'].str.match(col_value)]
            return geo.calculate_naptan_geometry(gdf1)
        except KeyError:
            # catch if the value just isn't found.
            sys.exit(f"{col_value} is not a known stoptype.")
    elif naptan_area_level == 'StopAreaCode':
        mask = gdf[naptan_area_level].str.startswith(col_value)
        gdf_subframe = gdf[mask]
        return geo.calculate_naptan_geometry(gdf_subframe)
    # expects the full string of the nptg locality code or name to work.
    elif naptan_area_level in ('NptgLocalityCode', 'LocalityName'):
        columngroup = gdf.groupby(naptan_area_level)
        gdf_subframe = columngroup.get_group(col_value)
        gdf_subframe.reset_index(drop=True, inplace=True)
        return geo.calculate_naptan_geometry(gdf_subframe)
    else:
        sys.exit('Column type is not supported.')
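# Usage sketch with hypothetical inputs: filter the full stops frame down
# to one admin area, then to one stop type within it ('BCS' is the naptan
# code for bus and coach stations). `stops_gdf` is assumed to be the
# loaded naptan stops geodataframe; values are lower-cased internally, so
# the casing of the query strings does not matter.
def example_subframes(stops_gdf):
    leeds = create_naptan_subframe(stops_gdf, 'AreaName', 'leeds')
    stations = create_naptan_subframe(leeds, 'StopType', 'bcs')
    return stations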