Code example #1
    def locality_with_unusually_elongated_shape(cls, gdf_locality):
        """Flag localities whose enclosing shape is unusually elongated.

        The enclosing bounding box is arbitrary: this check, and every other
        check that needs to build a shape, is a variant of the minimum
        bounding box / convex hull problem
        (https://en.wikipedia.org/wiki/Minimum_bounding_box_algorithms). There
        should be something pre-made in Python; otherwise we can implement an
        existing algorithm. Many of these problems are arbitrarily defined by
        ITO, so we need to settle on a definition of "elongated" (assuming it
        is really a problem). For example, we could say a shape is elongated
        if its longest edge is 10x longer than its shortest edge. We need to
        think about what makes sense.

        Args:
            gdf_locality ([geopandas dataframe]): [the locality sub-frame to
            check.]

        Returns:
            [str]: [placeholder for the failing nodes; not yet implemented.]
        """
        check_name = "locality_with_unusually_elongated_shape"

        PolygonStructure.check_area_length_is_regular(gdf_locality,
                                                      "Name of locality")
        # TODO: build the list of localities whose shape fails the
        # elongation test; currently a placeholder.
        failed_nodes = ""

        rep.report_failing_nodes(gdf_locality, check_name, failed_nodes)
        return failed_nodes
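The docstring suggests one concrete definition. As a minimal sketch (assuming shapely is available and that make_naptan_polygon returns a shapely Polygon; the 10x cut-off is the docstring's example, not a confirmed rule), the elongation of the minimum rotated rectangle could be measured like this:

from shapely.geometry import Point

def elongation_ratio(polygon):
    # the minimum rotated rectangle is the smallest-area enclosing box.
    rect = polygon.minimum_rotated_rectangle
    xs, ys = rect.exterior.coords.xy
    # measure two adjacent edges of the rectangle.
    edge_a = Point(xs[0], ys[0]).distance(Point(xs[1], ys[1]))
    edge_b = Point(xs[1], ys[1]).distance(Point(xs[2], ys[2]))
    long_side, short_side = max(edge_a, edge_b), min(edge_a, edge_b)
    return long_side / short_side if short_side else float("inf")

# a locality might then fail the check when elongation_ratio(poly) > 10.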
Code example #2
    def locality_not_unique(cls, gdf):
        """The name of the locality with its qualifier (if any) is not
        unique nationally. A search for a locality in the National Gazetteer
        (NPTG) must be able to differentiate between identically named
        localities, so an appropriate qualifier should be applied to each
        ambiguous entry. This check finds localities whose combined
        locality/qualifier name is still ambiguous.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [geopandas dataframe]: [the nodes with non-unique names.]
        """
        check_name = "locality_not_unique"
        gdf['Locality_Qualifier_Name'] = gdf['LocalityName'] + \
            ', ' + gdf['QualifierName']
        # remove the rows without a combined name from the dataframe.
        nodes = gdf[gdf["Locality_Qualifier_Name"].notna()]
        # keep=False marks every member of a duplicated group, so this
        # returns all the ambiguous rows, not just the repeats.
        duplicated_mask = nodes.duplicated(subset=['Locality_Qualifier_Name'],
                                           keep=False)
        failed_nodes = nodes[duplicated_mask]
        # TODO this returns roughly 8000 of 28000 localities as non-unique;
        # the result still needs verifying.
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
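A small demonstration of the pandas semantics the TODO above is unsure about: duplicated() marks only the repeats after the first occurrence, while duplicated(keep=False) marks every member of a duplicated group.

import pandas as pd

names = pd.Series(["Ashton", "Ashton", "Barton", "Ashton", "Carlton"])
print(names.duplicated().tolist())            # [False, True, False, True, False]
print(names.duplicated(keep=False).tolist())  # [True, True, False, True, False]
# so nodes[nodes.duplicated(subset=..., keep=False)] returns *all* the rows
# that share a locality/qualifier name, which is what this check wants.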
Code example #3
    def check_area_length_is_regular(cls, gdf, naptan_locality):
        """When given a geodataframe, checks that the matching locality
        polygon's longest side is under 1000 (in the units of the frame's
        coordinate reference system).

        Args:
            gdf ([naptan master geodataframe]): [the naptan master.]
            naptan_locality ([str]): [the locality whose polygon is built
            and measured.]
        """
        check_name = cls.check_area_length_is_regular.__name__
        # make the polygon from the area data, then measure its sides.
        area_polygon = make_naptan_polygon(naptan_locality)
        # get the longest side.
        poly_long = cls.polygon_longest_side(area_polygon)
        # get the shortest side (not yet used by the check).
        poly_short = cls.polygon_shortest_side(area_polygon)
        # TODO: decide whether very large areas that are not localities
        # should be excluded from this check entirely.
        if poly_long <= 1000:
            print("Polygon area is regular")
        else:
            print("Polygon area is irregular")
            rep.report_failing_nodes(gdf, check_name, naptan_locality)
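The two side-length helpers used above are not shown in the snippet; a possible sketch, assuming shapely polygons, is below. Note that the 1000 cut-off only makes sense if the geometry is in a metric CRS such as EPSG:27700 rather than raw latitude/longitude degrees.

from shapely.geometry import Point

def polygon_side_lengths(polygon):
    # exterior.coords repeats the first point at the end, closing the ring.
    coords = list(polygon.exterior.coords)
    return [Point(a).distance(Point(b)) for a, b in zip(coords, coords[1:])]

def polygon_longest_side(polygon):
    return max(polygon_side_lengths(polygon))

def polygon_shortest_side(polygon):
    return min(polygon_side_lengths(polygon))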
Code example #4
    def check_name_length(cls, gdf):
        """A stop point fails if the StopPoint has a full name [Locality,
        CommonName (Indicator)] that is more than 80 characters in length.

        Arguments:
            gdf {[geopandas dataframe]} -- [The naptan master dataframe.]
        Returns:
            df_str.ATCOCode {[pandas series]} -- [ATCOCodes of the nodes
            that failed the check.]
        """
        try:
            # get name for report
            check_name = "check_name_length"
            # work on a copy so the helper column does not mutate the input.
            gdf1 = gdf.copy()
            # build the full stop point name.
            gdf1["newName"] = (
                gdf1["CommonName"].astype(str) + ", " + gdf1["LocalityName"].astype(str)
            )
            # mask the names against the 80-character limit.
            mask = gdf1["newName"].str.len() > 80
            df_str = gdf1.loc[mask]
            # send to report
            rep.report_failing_nodes(gdf, check_name, df_str)
            return df_str.ATCOCode
        except Exception as e:
            sys.exit(f"{check_name} failed because of {e}.")
Code example #5
    def stop_with_bearing_missing(cls, gdf):
        """The data does not include a value for "bearing" for all BCT
        stops except those in the FLX (flexible zone) sub-type.

        Args:
            gdf {[geopandas dataframe]} -- [The naptan master dataframe.]

        Returns:
            [geopandas dataframe]: [the BCT stops whose bearing is missing
            or invalid.]
        """
        # get the check name
        check_name = "stop_with_bearing_missing"
        try:
            # the permitted bearings that can be present in that field.
            valid_bearing = ["SW", "NE", "SE", "S", "N", "NW", "E", "W"]
            # BCT stops (other than the FLX sub-type) whose bearing is not
            # in the valid list, including missing values.
            failed_nodes = gdf[(gdf["StopType"] == "BCT")
                               & (gdf["BusStopType"] != "FLX")
                               & (~gdf["Bearing"].isin(valid_bearing))]
            # reporting.
            rep.report_failing_nodes(gdf, check_name, failed_nodes)
            return failed_nodes
        except Exception as e:
            sys.exit(f"{check_name} has failed because of {e}")
Code example #6
def road_name_matches_coordinates(gdf, atcocode):
    """Checks that the "street" recorded against the stop corresponds with
    the name attached to the road segment to which the stop is snapped in
    the Navteq mapping data used by Ito.
    Arguments:
        gdf {[geopandas dataframe]} -- [pass in the chosen dataframe]
        atcocode {[str]} -- [Pass in the given naptan unique stop id.]

    Returns:
        [pandas series] -- [the ATCOCode of the failing stop, if any.]
    """
    # check name
    check_name = road_name_matches_coordinates.__name__
    # select the node under test.
    node = gdf.loc[gdf['ATCOCode'] == atcocode]
    # api call to get the nearest road name.
    found_name = geopipe.get_nearest_road_name(gdf, atcocode)
    if found_name[1] == node['Street'].iloc[0]:
        print('Road Name Matches')
    else:
        # TODO - needs testing.
        res = node["ATCOCode"]
        rep.report_failing_nodes(gdf, check_name, res)
        return res
Code example #7
    def check_stop_dates_not_after_today(cls, gdf):
        """Checks that bus stop records have not been added to the naptan
        database with a modification date in the future.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [geopandas dataframe]: [the nodes with dates in the future.]
        """

        check_name = "Check stop dates are after today"
        check_geographic_level = "stop"
        check_warning_level = "low"

        # compare the date field against today's date.
        today = pd.Timestamp(datetime.today().date())
        # nodes whose modification date is after today; only the
        # modification date is checked, CreationDateTime could be added.
        bad_timeframe = gdf[gdf.ModificationDateTime > today]
        # check if we report.
        if bad_timeframe.empty:
            print("No stop dates are in the future.")
        else:
            print(f"Stop creation or modification date after {today}.")
            rep.report_failing_nodes(gdf, check_name, bad_timeframe)
            return bad_timeframe
Code example #8
    def naptan_coastal_nodes(cls, gdf):
        # TODO - add a column to the master naptan dataframe, then count up
        #  the false values to get the percentage of stops that fail, and
        #  then compare those stops to find out which ones are near the
        #  coast and how near the coast they are.
        """Provided a dataframe, returns a list of nodes that are near the
        coastline. This uses the global land mask library, a numpy & pandas
        extension, for mapping the boundaries of the coastline.

        Arguments:
            gdf {[geospatial dataframe]} -- [the naptan master dataframe.]

        Returns:
            [pandas series] -- [counts of failing nodes per locality.]
        """

        check_name = "naptan_coastal_nodes"
        try:
            # remove ferry based stops / jetty stop types, as their
            # proximity to the coastline isn't a problem.
            coastal_infrastructure = ['FTD', 'FBT', 'FER']
            gdf = gdf[~gdf['StopType'].isin(coastal_infrastructure)].copy()
            # we compare against the compressed land geometry dataset for
            # coordinates outside the coastline.
            gdf['Land_State'] = globe.is_land(gdf['Latitude'],
                                              gdf['Longitude'])
            coastal_nodes = gdf.loc[~gdf.Land_State]
            # get the count of failing nodes per locality.
            high_node_areas = coastal_nodes['LocalityName'].value_counts()
            percent = (len(coastal_nodes) / len(gdf)) * 100.0
            # if the share of failing nodes is over the threshold, warn.
            if coastal_nodes.empty:
                print('No nodes were found along the coastline.')
            elif percent >= 1.1:
                print(f"The {gdf.AreaName.iloc[0]} area has "
                      f"{len(coastal_nodes)} stops that are off the UK "
                      f"coastline; that is {percent:0.2f}% of all stops in "
                      f"the named admin area.")
            else:
                print(f"The area has {len(coastal_nodes)} nodes that are "
                      f"off the UK coastline boundary; this is "
                      f"{percent:0.2f}% of all nodes in the area.")
            rep.report_failing_nodes(gdf, check_name, coastal_nodes)
            return high_node_areas

        except ValueError as ve:
            raise ve

        except Exception as e:
            print(e)
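The TODO asks how near the coast the failing stops actually are. A hedged sketch, assuming the offshore nodes form a GeoDataFrame with point geometry and that a UK coastline layer can be loaded from a file (the coastline_file argument is a placeholder, not part of the original pipeline):

import geopandas as gpd

def distance_to_coast(coastal_nodes, coastline_file):
    # load the coastline layer; the file is a placeholder assumption.
    coastline = gpd.read_file(coastline_file)
    # project both layers to a metric CRS so distances come out in metres.
    nodes = coastal_nodes.to_crs(epsg=27700)
    coast = coastline.to_crs(epsg=27700).unary_union
    nodes["coast_distance_m"] = nodes.geometry.distance(coast)
    # furthest offshore first.
    return nodes.sort_values("coast_distance_m", ascending=False)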
Code example #9
def detect_nan_values(gdf, col_name):
    """This is an internal open naptan method; it shouldn't be exposed
    to the user unless required for reporting widespread data issues.
    Returns the presence of nan values in the naptan dataset for
    a given column.
    Arguments:
        gdf {[pandas dataframe]} -- [A naptan dataframe.]
        col_name {[str]} -- [a column name as expected in the naptan data
        columns.]

    Returns:
        [pandas dataframe] -- [the rows with missing values in the column.]
    """
    # failing nodes go into a report.
    check_name = 'Required columns contain null values.'
    # the below list is the columns which must not contain null values.
    required_cols = [
        'ATCOCode', 'CommonName', 'Street', 'Indicator', 'Bearing',
        'NptgLocalityCode', 'Town', 'TownLang', 'Suburb', 'LocalityCentre',
        'Longitude', 'Latitude', 'StopType', 'BusStopType', 'TimingStatus',
        'AdminCode', 'CreationDateTime', 'ModificationDateTime', 'Status',
        'StopPoint', 'LocalityName', 'QualifierName', 'AtcoAreaCode',
        'AreaName', 'RegionCode', 'Status_area', 'geometry'
    ]
    # check if the column is a required column. Defensive.
    if col_name not in required_cols:
        message = (f'{col_name} for the area {gdf.AreaName.iloc[0]} is not '
                   'a required column and can have null values, so the '
                   'check is skipped.')
        write_basic_log_file(message)
    # check if the column contains any null or na values.
    elif gdf[col_name].isnull().values.any():
        nan_array = gdf[col_name].isnull()
        # build the frame of rows with missing values, using masking.
        missing_values = gdf[nan_array]
        # fraction of missing rows; the :.4% format multiplies by 100.
        percent_missing = gdf[col_name].isna().sum() / len(gdf)
        print(f'{percent_missing:.4%}')
        report_failing_nodes(gdf,
                             test_name=check_name,
                             failed_nodes=missing_values)
        return missing_values
    else:
        message = (f'{col_name} for the area {gdf.AreaName.iloc[0]} has no '
                   'missing values in a required column and has passed '
                   'this check.')
        write_basic_log_file(message)
        print('all good.')
Code example #10
def test_report_failing_nodes(naptan_sample):
    """[summary]
    """
    assert naptan_sample.report_failing_nodes()
    assert rep.report_failing_nodes(complete_gdf,
                                    'Check Name Length',
                                    gdf)
Code example #11
    def check_atcocode_length(cls, gdf):
        """[summary] checks the atcocode (unique identifier) length is 12 and if
        not the stop fails the check.

        Args:
            gdf ([geopandas dataset, master or sub]): [the naptan frame to
            check.]

        Returns:
            [geopandas dataframe]: [Geopandas dataframe of failed nodes.]
        """
        # TODO: does the expected length vary by stop type or authority?
        check_name = "check_atcocode_length_is_12"
        gdf["AtcoCode_Character_Len"] = gdf["ATCOCode"].apply(len)
        fail_range = gdf["AtcoCode_Character_Len"].unique()

        try:
            # exclude deleted nodes and select the atcocodes that are not
            # exactly 12 characters long.
            if (fail_range != 12).any():
                mask = (gdf["Status"] != "del") & (gdf["AtcoCode_Character_Len"] != 12)
                # get the failing nodes.
                fn = gdf[mask]
                # makes report
                rep.report_failing_nodes(gdf, check_name, fn)
                # TODO make a sample level map of the failing area with codes.
                # get the name of the area that is failing
                fail_area = gdf.AreaName.iloc[0]
                print(fail_area)
                # the below returns a short dataframe counting the number
                # of atcocodes that are not 12 alphanumeric characters in
                # length.
                result_agg = (
                    fn[["AtcoCode_Character_Len", "ATCOCode"]]
                    .groupby(["AtcoCode_Character_Len"])
                    .count()
                )

                return result_agg

        except ValueError as ve:
            sys.exit(f"This error occured {ve}")
        except Exception as e:
            sys.exit(f"{e} was encounter check has been cancelled.")
        else:
            message = f"{gdf.AreaName.iloc[0]} all Atcocode unique identifiers are the correct length."
            rep.write_basic_log_file(message)
Code example #12
    def stop_road_distance(cls, gdf):
        """Checks the distance between each stop and its nearest road.
        Not yet implemented.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "stop_road_distance"
        # TODO: compute the stop-to-road distances; currently a placeholder.
        failed_nodes = ''
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
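A hedged sketch of what this check might compute, assuming a roads layer is available as a GeoDataFrame of LineStrings; the roads argument and the 50 m threshold are assumptions, not part of the original pipeline.

import geopandas as gpd

def stop_road_distance_sketch(stops, roads, max_distance_m=50.0):
    # metric CRS so the join distance is in metres.
    stops_m = stops.to_crs(epsg=27700)
    roads_m = roads.to_crs(epsg=27700)
    # attach the nearest road to each stop, recording the separation.
    joined = gpd.sjoin_nearest(stops_m, roads_m,
                               distance_col="road_distance_m")
    # stops further than the threshold from any road would fail the check.
    return joined[joined["road_distance_m"] > max_distance_m]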
Code example #13
    def stop_names_with_high_risk_words(cls, gdf):
        """Description: StopPoint has a CommonName that contains one of
        the following high risk words: DELETE, DELETED, N/A, N/K,
        NOT IN USE, OBSOLETE, UNUSED (case-insensitive).
        Args:
            gdf ([geopandas dataframe]): [a dataframe of the current naptan
            file.]

        Returns:
            df_risks [geopandas dataframe]: [the nodes flagged with risk
            words.]
        """

        # name of check.
        check_name = "stop_names_with_high_risk_words"
        # work on a copy so the upper-casing does not mutate the input.
        gdf1 = gdf.copy()
        try:
            # list of risk words.
            riskwords = [
                "DELETE",
                "DELETED",
                "N/A",
                "NOT IN USE",
                "N/K",
                "OBSOLETE",
                "UNUSED",
            ]
            # capitalise for a case-insensitive comparison.
            gdf1["CommonName"] = gdf1["CommonName"].str.upper()
            gdf1["RiskWords"] = gdf1["CommonName"].apply(
                lambda x: 1 if any(i in x for i in riskwords) else 0)
            # keep only the stops whose name contains a risk word.
            df_risks = gdf1.loc[gdf1["RiskWords"] != 0]
            # flag the failing rows with the name of the check.
            endcol = len(df_risks.columns)
            df_risks.insert(endcol, "Warning Flag", check_name)
            # save the report.
            rep.report_failing_nodes(gdf, check_name, df_risks)
            return df_risks
            # TODO indicate if it's a bus stop; if so, flag the locality or
            # TODO authorities that should confirm the stop's deletion from
            # TODO the database.
        except Exception as e:
            sys.exit(f"{check_name} has failed due to {e}.")
Code example #14
    def stop_with_wrong_types(cls, gdf):
        """Checks for stops recorded with the wrong stop type.
        Not yet implemented.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "stop_with_wrong_types"
        # TODO: identify the stops with wrong types; currently a
        # placeholder.
        failed_nodes = ""
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
Code example #15
    def hail_ride_section_length(cls, gdf):
        """Hail and Ride Bus Stop where the total length of the section is
        greater than 1km. Not yet implemented.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "hail_ride_section_length"
        # TODO: measure each hail-and-ride section; currently a placeholder.
        failed_nodes = ''
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
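A minimal sketch under stated assumptions: hail-and-ride stops carry BusStopType == "HAR", and the points of one section share an identifier ("SectionId" here is a placeholder column name, not a confirmed NaPTAN field).

import geopandas as gpd
from shapely.geometry import LineString

def hail_ride_section_length_sketch(gdf):
    # hail-and-ride stops, projected so lengths come out in metres.
    har = gdf[gdf["BusStopType"] == "HAR"].to_crs(epsg=27700)
    failed_sections = []
    for section_id, rows in har.groupby("SectionId"):
        if len(rows) < 2:
            continue  # a single point cannot form a section line.
        section = LineString(rows.geometry.tolist())
        if section.length > 1000:  # "greater than 1km" per the check.
            failed_sections.append(section_id)
    return failed_sections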
Code example #16
    def hail_ride_invalid(cls, gdf):
        """Hail and Ride Bus Stops that do not have a valid entry,
        centroid or exit record. Not yet implemented.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "hail_ride_invalid"
        # TODO: validate the entry/centroid/exit records; currently a
        # placeholder.
        failed_nodes = ''
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
Code example #17
    def localities_with_identical_stops(cls, gdf_locality):
        """StopArea containing StopPoints that do not have identical
        CommonNames.

        The CommonName of stops within a single stop area should be the same
        as each other (and the same as the name of the stop area) wherever
        possible. This test identifies examples where the stop names are not
        identical. At present this test does not identify cases where the
        stop area name is different from any one or more of the individual
        stops' CommonNames, but this may be added.

        Given a stop point within a locality, check if the stop point is
        duplicated at any point.

        Arguments:
            gdf_locality {[geopandas dataframe]} -- [a single-locality
            sub-frame of the master naptan nodes.]

        Returns:
            failed_nodes {[geopandas dataframe]} -- [the duplicated stops.]
        """
        # for reporting
        check_name = "Check localities for identical stops."
        check_warning_level = "high"
        check_geographic_level = "localities"
        # clone the locality frame.
        gdf1 = gdf_locality
        try:
            # check that the frame covers exactly one nptg locality,
            # otherwise this is not a single locality.
            if len(gdf1["NptgLocalityCode"].unique()) == 1:
                # mask of the duplicated stop points.
                mask = gdf1["StopPoint"].duplicated()
                failed_nodes = gdf1[mask]
                rep.report_failing_nodes(gdf_locality, check_name,
                                         failed_nodes)
                return failed_nodes

        except Exception as e:
            # if this is not a locality the check cannot run; just catch.
            print(f"Not a locality, test can not be performed. {e}")
Code example #18
    def stops_area_members_without_identical_names(cls, gdf):
        """StopArea containing StopPoints that do not have identical
        CommonNames. Not yet implemented.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "stops_area_members_without_identical_names"
        # TODO: group stops by stop area and compare names; currently a
        # placeholder.
        failed_nodes = ''
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
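A hedged sketch of the intended check, assuming each stop row carries the code of its parent stop area in a "StopAreaCode" column (an assumption; the stub never names the grouping column).

def stop_areas_with_mixed_names(gdf):
    # ignore stops without a parent stop area.
    grouped = gdf.dropna(subset=["StopAreaCode"]).groupby("StopAreaCode")
    # a stop area fails when its members carry more than one distinct name.
    name_counts = grouped["CommonName"].nunique()
    mixed_areas = name_counts[name_counts > 1].index
    return gdf[gdf["StopAreaCode"].isin(mixed_areas)]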
Code example #19
    def stops_in_alternate_localities(cls, gdf):
        """Locality is an alternative but has members or children that
        should be connected to the primary Locality. This checks if the
        stop can be linked to an nptg locality. Not yet implemented.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "stops_in_alternate_localities"
        # TODO: link stops to their primary nptg locality; currently a
        # placeholder.
        failed_nodes = ""
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
Code example #20
    def find_unused_localities(cls, gdf):
        """Returns the NPTG gazette localities that are not referenced by
        any node in the nodes file.

        Args:
            gdf ([geopandas dataframe]): [The naptan master dataframe.]

        Returns:
            [pandas.core.frame.DataFrame]: [localities that are not used in
            the nodes file.]
        """
        # gazette localities not referenced by the nodes file.
        localities = etl_pipe.naptan_gazette_localities()
        unused = localities[~localities['NptgLocalityCode'].
                            isin(gdf['NptgLocalityCode'])]
        # conversion for geometry.
        unused = unused.rename(columns={
            "Gazette_Longitude": "Longitude",
            "Gazette_Latitude": "Latitude"
        })
        # build point geometry for the unused localities.
        unused = geo_pipe.calculate_naptan_geometry(unused)
        # reporting function
        rep.report_failing_nodes(gdf,
                                 'unused localities near stops',
                                 failed_nodes=unused)
        # m = vis.generate_base_map(unused, 'LocalityName')
        # m
        # TODO find out if any stops are inside the boundaries of the
        # TODO unused areas; the geometries are just points for the unused
        # TODO localities, so find the closest stops to these points.
        return unused
Code example #21
    def stops_in_different_admin_area(cls, gdf):
        """Checks if a stop is in a different administrative area, based
        on the AtcoAreaCode column. We take the first 3 characters prefix
        of the atcocode and check them against the atcoareacode for the
        admin area. They should match.
        Args:
            gdf ([pandas dataframe]): [The Master naptan node frame.]
        Returns:
            [pandas dataframe] -- [the stops whose prefix does not match.]
        """
        check_name = "stops_in_different_admin_area"
        gdf1 = gdf
        try:
            # get the prefix from the atcocode column.
            gdf1["atcocodeprefix"] = gdf1["ATCOCode"].str[:3]
            # get the AtcoAreaCode column value, making sure that we account
            # for 2-digit atcocode prefixes and int types, using to_numeric.
            gdf1["atcocodeprefix"] = pd.to_numeric(gdf1["atcocodeprefix"])
            gdf1["AtcoAreaCode"] = pd.to_numeric(
                gdf1["AtcoAreaCode"].astype(str), errors="coerce")
            # compare the two together; they should match.
            gdf1["prefix_matches"] = gdf1["atcocodeprefix"].eq(
                gdf1["AtcoAreaCode"])
            failed_nodes = gdf1[~gdf1["prefix_matches"]]
            rep.report_failing_nodes(gdf, check_name, failed_nodes)
            return failed_nodes
            # TODO compare the geometry point to the polygon boundaries of
            #  the expected admin area
            # TODO if the geometry point is further than 500 metres outside
            #  the boundaries of the given area, then the node fails
        except Exception as e:
            sys.exit(f"{check_name} has failed because of {e}")
Code example #22
    def stops_in_different_admin_authority_geo_position(cls, gdf):
        """[summary] The AtcoCode prefix for the StopPoint represents an
        AdminArea other than the one associated with the stop‟s Locality.
        This test highlights those stops which are associated with a locality that
        is itself not in the same administrative area. This is often not wrong – 
        but in some cases it indicates a stop that is incorrectly located, or 
        associated with the wrong locality.

        Check each example and confirm that each represents a stop close to the
        boundary of your authority‟s area – and consider whether the locality
        association with each stop is reasonable, even if it is with a locality
        that is in the adjacent admin area. Check that the coordinates of the stop
        are right, and correct them if not. 
        Args:
            gdf ([gdf]): [the naptan total dataframe]
            stops ([node_type_stops]): [description]
            authorities ([gdf]): [description]

        Raises:
            NotImplementedError: [description]

        Returns:
            [type]: [description]
        """
        check_name = "stops_in_different_admin_authority_geo_position"
        # TODO - check if any other stops or points are within the authority
        # TODO polygon of this boundary,
        # TODO check from the surrounding admin areas
        # TODO if so add the stop to the failed nodes report file
        # TODO include how the name of the other area and distance outside.
        #
        # list of stops not in correct admin areas by geo position.
        failed_nodes = ''
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
        raise NotImplementedError
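A hedged sketch of this check: join each stop to its NPTG locality and compare admin codes. It assumes the gazette localities frame exposes NptgLocalityCode and an admin code column named AdminCode, matching the stop frame's own AdminCode column.

def stops_vs_locality_admin_area(gdf, localities):
    # join each stop to its locality's admin code via the shared
    # NptgLocalityCode key.
    merged = gdf.merge(localities[["NptgLocalityCode", "AdminCode"]],
                       on="NptgLocalityCode",
                       suffixes=("", "_locality"))
    # flag stops whose own admin code differs from their locality's;
    # these are review candidates, not automatic errors.
    mismatch = merged["AdminCode"] != merged["AdminCode_locality"]
    return merged[mismatch]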
Code example #23
    def unused_locality_near_stops(cls, nodes, nptg):
        """Locality has no stops or child Localities, but is within
        250 metres of a StopPoint associated with a different Locality.
        Not yet implemented.

        Args:
            nodes ([geopandas dataframe]): [the naptan nodes frame.]
            nptg ([pandas dataframe]): [the nptg gazette data.]

        Returns:
            [str]: [placeholder for the failing nodes.]
        """
        check_name = "unused_locality_near_stops"
        unused_localities = check_nodes_match_nptg_data(nodes, "")
        # TODO: for each unused locality, find the nearest naptan stop
        # within 250 metres of it; currently a placeholder.
        failed_nodes = ""
        rep.report_failing_nodes(nodes, check_name, failed_nodes)
        return failed_nodes
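A minimal geopandas sketch of the 250-metre rule, assuming the candidate localities (as produced by a check like find_unused_localities above) and the stops are both GeoDataFrames with point geometry and an NptgLocalityCode column.

import geopandas as gpd

def unused_localities_near_stops(unused, stops):
    # metric CRS so the 250-metre cap is meaningful.
    unused_m = unused.to_crs(epsg=27700)
    stops_m = stops.to_crs(epsg=27700)
    # attach each locality's nearest stop, capped at 250 metres.
    near = gpd.sjoin_nearest(unused_m, stops_m,
                             max_distance=250,
                             distance_col="stop_distance_m")
    # keep localities whose nearest stop belongs to a different locality.
    return near[near["NptgLocalityCode_left"] != near["NptgLocalityCode_right"]]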
Code example #24
        Arguments:
            gdf {[geopandas dataframe]} -- [the naptan master dataframe]
            col_name {[str]} -- [a given column name to search through]
        Returns:
            df -- a df object that consists of only the stops containing
            illegal characters in the given field.
        """
        check_name = 'Check Illegal Characters'
        # our regex pattern of allowed special characters in stop point names
        pattern = r"\bO/S|NO\.|P\.H\.|P\.O\.|ST\.|'s|St\.|st\.\b"
        # our disallowed non-alphanumeric characters.
        searchfor = ['!', '[', ']', '.', ',', '/', '/?']
        # clone the dataframe, keeping only the bus stops.
        gdf1 = IllegalCharacters.filter_bus_stops(gdf)
        # find the nodes with the permitted exceptions.
        excluded_nodes = gdf1[gdf1[col_name].str.contains(pattern,
                                                          case=False,
                                                          regex=True)]
        mask = gdf1[col_name].isin(excluded_nodes[col_name])
        # removing the excluded nodes from the bus stops frame.
        gdf_filter = gdf1[~mask]
        # use map and re.escape to build a literal-match alternation.
        pat = '|'.join(map(re.escape, searchfor))
        # check the given column for any illegal characters.
        filtered_nodes = gdf_filter[gdf_filter[col_name].str.contains(pat)]
        # report on failing nodes that contain illegal characters.
        report_failing_nodes(gdf, check_name, filtered_nodes)
        return filtered_nodes
Code example #25
                    [
                        mns["CommonName"].str.contains(t, regex=False, case=False)
                        for t in terms
                    ]
                )
                # apply the common name mask.
                mns = mns[cn_mask]
                # build the same mask over the locality names.
                ln_mask = np.logical_or.reduce(
                    [
                        mns["LocalityName"].str.contains(t, regex=False, case=False)
                        for t in terms
                    ]
                )
                # keep the rows matching on locality name as well.
                mns = mns[ln_mask]
                # the report returns the percentage of bad stops out of all
                # stops, about 0.03%.
                rep.report_failing_nodes(gdf, check_name, mns)
                return mns
            # double check whether the returned dataframe is empty; if it
            # is empty, we are good for that area.
            elif df["NameMatch"].isnull().all():
                success_message = f"{gdf.AreaName.iloc[0]} has no stop names containing locality names."
                rep.write_basic_log_file(success_message)

        except ValueError as ve:
            # ValueError: cannot mask with a non-boolean array containing
            # NA / NaN values.
            sys.exit(f"{check_name} failed because of {ve}")
Code example #26
def naptan_sample():
    """Test fixture; returns the output of the reporting function."""
    t = rep.report_failing_nodes()
    return t
Code example #27
    def check_illegal_caps(cls, gdf, col_name="StopPoint"):
        """Description: CommonNames should not contain acronyms as single
        capitals separated by spaces or full stops, with the exception of
        'R C', 'P.H.', and 'P.O.'. CommonNames should not contain a
        sequence of a lowercase letter followed by an uppercase letter,
        with the exceptions of 'McX' and 'MacX'.
        Args:
            gdf ([pandas dataframe]): [the master naptan nodes file.]
            col_name ([str]): [the column to check, defaults to StopPoint.]

        Returns:
            illegal_caps [pandas dataframe]: [the nodes with illegal
            capitals.]
        """

        except_caps = [
            "AFC",
            "ASDA",
            "BBC",
            "BP",
            "CE",
            "DHSS",
            "DLR",
            "FC",
            "GMEX",
            "HMP",
            "HQ",
            "HSBC",
            "II",
            "III",
            "IKEA",
            "IV",
            "IX",
            "MFI",
            "MOD",
            "NCP",
            "NE",
            "NR",
            "NW",
            "PH",
            "PO",
            "RAF",
            "RC",
            "RSPCA",
            "SE",
            "SPT",
            "SW",
            "VI",
            "VII",
            "VIII",
            "WMC",
            "XI",
            "XII",
            "YMCA",
            "YWCA",
        ]
        try:
            # work on a copy so the helper columns don't mutate the input.
            gdf1 = gdf.copy()
            gdf1["capitals"] = gdf1[col_name].str.count("[A-Z]{3,}")
            gdf1 = gdf1[gdf1["capitals"] != 0]
            # compare the words of each name against the exceptions list;
            # keep rows containing no excepted acronym.
            mask = ~gdf1[col_name].apply(
                lambda x: np.intersect1d(x.split(), except_caps).size > 0)
            # masking if required.
            illegal_caps = gdf1[mask]
            # save the report.
            report_failing_nodes(gdf, "Check illegal capitals", illegal_caps)
            print("Illegal Capitals check has completed.")
            return illegal_caps
        except ValueError as ve:
            print(f"{ve}")
        except Exception as e:
            print(f"{e}")
Code example #28
    def check_nodes_match_nptg_data(cls, gdf, named_area):
        """Returns the list of admin areas in the nptg, checks those are in
        the nodes file, and reports the localities whose admin area code
        (aac) is in the nptg but not in the nodes file.

        Args:
            gdf ([geopandas dataframe]): [the master or named area naptan
            data file]
            named_area ([str]): [the named area of the naptan subframe]

        Returns:
            [pandas dataframe]: [the localities missing from the nodes
            file.]
        """
        check_name = "check_nodes_match_nptg_data"
        # list of all geographic admin areas
        admin_areas = [
            "Aberdeen",
            "Aberdeenshire",
            "Angus",
            "Argyll & Bute",
            "Bath & North East Somerset",
            "Bedford",
            "Blackburn with Darwen",
            "Blackpool",
            "Blaenau Gwent",
            "Bournemouth",
            "Bracknell Forest",
            "Bridgend",
            "Brighton and Hove",
            "Bristol",
            "Buckinghamshire",
            "Caerphilly",
            "Cambridgeshire",
            "Cardiff",
            "Carmarthenshire",
            "Central Bedfordshire",
            "Ceredigion",
            "Cheshire East",
            "Cheshire West & Chester",
            "Clackmannanshire",
            "Conwy",
            "Cornwall",
            "Cumbria",
            "Darlington",
            "Denbighshire",
            "Derby",
            "Derbyshire",
            "Devon",
            "Dorset",
            "Dumfries & Galloway",
            "Dundee",
            "Durham",
            "East Ayrshire",
            "East Dunbartonshire",
            "East Lothian",
            "East Renfrewshire",
            "East Riding of Yorkshire",
            "East Sussex",
            "Edinburgh",
            "Essex",
            "Falkirk",
            "Fife",
            "Flintshire",
            "Glasgow",
            "Gloucestershire",
            "Greater London",
            "Greater Manchester",
            "Gwynedd",
            "Halton",
            "Hampshire",
            "Hartlepool",
            "Herefordshire",
            "Hertfordshire",
            "Highland",
            "Inverclyde",
            "Isle of Anglesey",
            "Isle of Wight",
            "Kent",
            "Kingston upon Hull",
            "Lancashire",
            "Leicester",
            "Leicestershire",
            "Lincolnshire",
            "Luton",
            "Medway",
            "Merseyside",
            "Merthyr Tydfil",
            "Middlesbrough",
            "Midlothian",
            "Milton Keynes",
            "Monmouthshire",
            "Moray",
            "Neath Port Talbot",
            "Newport",
            "Norfolk",
            "North Ayrshire",
            "North East Lincolnshire",
            "North Lanarkshire",
            "North Lincolnshire",
            "North Somerset",
            "North Yorkshire",
            "Northamptonshire",
            "Northumberland",
            "Nottingham",
            "Nottinghamshire",
            "Orkney Islands",
            "Oxfordshire",
            "Pembrokeshire",
            "Perth & Kinross",
            "Peterborough",
            "Plymouth",
            "Poole",
            "Portsmouth",
            "Powys",
            "Reading",
            "Redcar & Cleveland",
            "Renfrewshire",
            "Rhondda Cynon Taff",
            "Rutland",
            "Scottish Borders",
            "Shetland Islands",
            "Shropshire",
            "Slough",
            "Somerset",
            "South Ayrshire",
            "South Gloucestershire",
            "South Lanarkshire",
            "South Yorkshire",
            "Southampton",
            "Southend-on-Sea",
            "Staffordshire",
            "Stirling",
            "Stockton-on-Tees",
            "Stoke-on-Trent",
            "Suffolk",
            "Surrey",
            "Swansea",
            "Swindon",
            "Telford & Wrekin",
            "Thurrock",
            "Torbay",
            "Torfaen",
            "Tyne & Wear",
            "Vale of Glamorgan",
            "Warrington",
            "Warwickshire",
            "West Berkshire",
            "West Dunbartonshire",
            "West Lothian",
            "West Midlands",
            "West Sussex",
            "West Yorkshire",
            "Western Isles",
            "Wiltshire",
            "Windsor & Maidenhead",
            "Wokingham",
            "Worcestershire",
            "Wrexham",
            "York",
        ]

        # TODO get the admin areas from the nodes file and compare them
        # against the list of area names.
        # nptg values
        adjanct_locals = etl.load_gazette_adjanct_localities()
        admin_codes = etl.naptan_gazette_admin_area_codes()
        districts = etl.naptan_gazette_districts()
        localities = etl.naptan_gazette_localities()
        locality_alternate = etl.load_gazette_localities_alternative_names()
        locality_hierarch = etl.load_gazette_locality_hierarchy()
        plusbusmap = etl.load_gazette_plusbus_mapping()
        plusbuszone = etl.load_gazette_plusbus_zones()
        regions = etl.naptan_gazette_region()

        # node values
        node_locs = gdf["LocalityName"].unique()
        # get the nptg localities.
        nptg_locs = localities["LocalityName"].unique()
        # TODO filter nptg to the nodes; get all the localities in nptg for
        #  this area
        # get the unique area codes for this admin area from the nodes.
        area_admin_code = gdf["AdminCode"].unique()
        # check the area admin codes against the nptg file for the
        #  corresponding localities.
        missing_localities = localities[~localities["AdminCode"].isin(
            area_admin_code)]
        # the localities present in both the gazette and the nodes file.
        df3 = localities[localities.LocalityName.isin(gdf.LocalityName)]
        # TODO list the localities in nptg but not in the nodes file.
        # TODO plot a sample on a map.
        # TODO write the unused localities in the given area to file.
        report_failing_nodes(gdf,
                             check_name,
                             failed_nodes=missing_localities)
        return missing_localities
Code example #29
    def stop_with_multiple_road_names(cls, gdf, col_name="CommonName"):
        """CommonNames in NaPTAN should be simple and not composite.
        Most examples of commonnames which include two of the designated
        words are ones where two road names are used in a composite name,
        contrary to NaPTAN guidance.
        This uses regex, but there could be some other way of doing this.
        Arguments:
            gdf {[geopandas dataframe]} -- [The naptan master dataframe.]
            col_name {[str]} -- [the name column to check.]
        """
        swmrn_gdf = gdf
        swmrn_gdf[col_name] = swmrn_gdf[col_name].str.lower()
        try:
            # the designated road-type words, kept for reference even
            # though the regexes below spell them out.
            targets = [
                "road",
                "roads",
                "street",
                "streets",
                "avenue",
                "avenues",
                "garden",
                "gardens",
                "lane",
                "lanes",
                "drive",
                "drives",
                "way",
                "ways",
            ]

            # regex patterns for detection; non-capturing groups avoid the
            # pandas "match groups" warning, and each fail_* pattern lists
            # every road word except the one already matched.
            pattern = (r"\b(?:road|roads|street|streets|avenue|avenues|"
                       r"garden|gardens|lane|lanes|drive|drives|way|ways)\b")

            fail_rds_re = (r"\b(?:street|streets|avenue|avenues|garden|"
                           r"gardens|lane|lanes|drive|drives|way|ways)\b")
            fail_aves_re = (r"\b(?:road|roads|street|streets|garden|gardens|"
                            r"lane|lanes|drive|drives|way|ways)\b")
            fail_gdns_re = (r"\b(?:road|roads|street|streets|avenue|avenues|"
                            r"lane|lanes|drive|drives|way|ways)\b")
            fail_lanes_re = (r"\b(?:road|roads|street|streets|avenue|avenues|"
                             r"garden|gardens|drive|drives|way|ways)\b")
            fail_drives_re = (r"\b(?:road|roads|street|streets|avenue|avenues|"
                              r"garden|gardens|lane|lanes|way|ways)\b")
            fail_ways_re = (r"\b(?:road|roads|street|streets|avenue|avenues|"
                            r"garden|gardens|lane|lanes|drive|drives)\b")

            tn = swmrn_gdf[swmrn_gdf[col_name].str.contains(pattern,
                                                            regex=True)]
            roads = tn[tn[col_name].str.contains(r"\b(?:road|roads)\b")]
            fail_rds = roads[roads[col_name].str.contains(fail_rds_re,
                                                          regex=True)]
            aves = tn[tn[col_name].str.contains(r"\b(?:avenue|avenues)\b")]
            fail_aves = aves[aves[col_name].str.contains(fail_aves_re,
                                                         regex=True)]
            gdns = tn[tn[col_name].str.contains(r"\b(?:garden|gardens)\b")]
            failgdns = gdns[gdns[col_name].str.contains(fail_gdns_re,
                                                        regex=True)]
            lanes = tn[tn[col_name].str.contains(r"\b(?:lane|lanes)\b")]
            faillanes = lanes[lanes[col_name].str.contains(fail_lanes_re,
                                                           regex=True)]
            drives = tn[tn[col_name].str.contains(r"\b(?:drive|drives)\b")]
            faildrives = drives[drives[col_name].str.contains(fail_drives_re,
                                                              regex=True)]
            ways = tn[tn[col_name].str.contains(r"\b(?:way|ways)\b")]
            failways = ways[ways[col_name].str.contains(fail_ways_re,
                                                        regex=True)]
            all_dfs = [
                fail_rds, fail_aves, failgdns, faillanes, faildrives, failways
            ]
            failed_nodes = pd.concat(all_dfs)
            failed_nodes[col_name] = failed_nodes[col_name].str.title()
            rep.report_failing_nodes(gdf, "Stop with Multiple road type names",
                                     failed_nodes)
            return failed_nodes
        except Exception as e:
            raise e
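A hedged alternative to the six hand-written regexes above: count the distinct road-type words in each name and flag names containing two or more.

import re

ROAD_WORDS_RE = re.compile(
    r"\b(?:road|roads|street|streets|avenue|avenues|garden|gardens|"
    r"lane|lanes|drive|drives|way|ways)\b")

def names_with_multiple_road_words(gdf, col_name="CommonName"):
    # count distinct road-type words per name, folding plurals together
    # by stripping a trailing "s".
    counts = gdf[col_name].str.lower().map(
        lambda name: len({w.rstrip("s")
                          for w in ROAD_WORDS_RE.findall(str(name))}))
    return gdf[counts >= 2]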