Example #1
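# Excerpt from gtfspy's import validator. It assumes the surrounding module
# provides GTFS, WarningsContainer, source_table_txt_to_pandas,
# str_time_to_day_seconds, pandas as pd, six's string_types, and module-level
# constants such as SOURCE_TABLE_NAMES, DB_TABLE_NAMES, FIELDS_WHERE_NULL_OK,
# and the *_WARNINGS / DANGLER_QUERIES lists.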
class ImportValidator(object):
    def __init__(self, gtfssource, gtfs):
        """
        Parameters
        ----------
        gtfssource: str, dict, or list thereof
            path(s) to the GTFS source file(s), or a dict containing the GTFS data directly
        gtfs: gtfspy.gtfs.GTFS object, or path to a GTFS sqlite database
        """
        self.df_freq_dict = {}
        if isinstance(gtfssource, string_types + (dict, )):
            self.gtfs_sources = [gtfssource]
        else:
            assert isinstance(gtfssource, list)
            self.gtfs_sources = gtfssource
        assert len(
            self.gtfs_sources
        ) > 0, "There needs to be some source files for validating an import"

        if not isinstance(gtfs, GTFS):
            self.gtfs = GTFS(gtfs)
        else:
            self.gtfs = gtfs

        self.location = self.gtfs.get_location_name()
        self.warnings_container = WarningsContainer()

    def get_warnings(self):
        self.warnings_container.clear()
        self._validate_table_counts()
        self._validate_no_nulls()
        self._validate_danglers()
        self.warnings_container.print_summary()
        return self.warnings_container

    def _validate_table_counts(self):
        """
        Import the source .txt files, count their rows, and compare the counts
        against the corresponding tables in the GTFS object.
        """
        for table_name_txt, db_table_name, row_warning in zip(
                SOURCE_TABLE_NAMES, DB_TABLE_NAMES, ROW_WARNINGS):
            source_row_count = 0

            for gtfs_source in self.gtfs_sources:
                frequencies_in_source = source_table_txt_to_pandas(
                    gtfs_source, 'frequencies.txt')
                try:
                    if table_name_txt == 'trips' and not frequencies_in_source.empty:
                        source_row_count += self._frequency_generated_trips(
                            gtfs_source, table_name_txt)

                    elif table_name_txt == 'stop_times' and not frequencies_in_source.empty:
                        source_row_count += self._frequency_generated_stop_times(
                            gtfs_source, table_name_txt)
                    else:
                        df = source_table_txt_to_pandas(
                            gtfs_source, table_name_txt)

                        source_row_count += len(df.index)
                except IOError as e:
                    # Tolerate a missing source file for this table and move on.
                    print(e)

            # Result from GTFSobj:
            database_row_count = self.gtfs.get_row_count(db_table_name)
            if source_row_count == database_row_count:
                print("Row counts match for " + table_name_txt +
                      " between the source and database (" +
                      str(database_row_count) + ")")

            else:
                difference = database_row_count - source_row_count
                print('Row counts do not match for ' + str(table_name_txt) +
                      ': (source=' + str(source_row_count) + ', database=' +
                      str(database_row_count) + ")")
                if table_name_txt == "calendar" and difference > 0:
                    query = "SELECT count(*) FROM (SELECT * FROM calendar ORDER BY service_I DESC LIMIT " \
                            + str(int(difference)) + \
                            ") WHERE start_date=end_date AND m=0 AND t=0 AND w=0 AND th=0 AND f=0 AND s=0 AND su=0"
                    number_of_entries_added_by_calendar_dates_loader = self.gtfs.execute_custom_query(
                        query).fetchone()[0]
                    if number_of_entries_added_by_calendar_dates_loader == difference:
                        print(
                            "    But don't worry, the extra entries seem to just dummy entries due to calendar_dates"
                        )
                    else:
                        print("    Reason for this is unknown.")
                        self.warnings_container.add_warning(
                            self.location, row_warning, difference)
                else:
                    self.warnings_container.add_warning(
                        self.location, row_warning, difference)

    def _validate_no_nulls(self):
        """
        Loads the tables from the gtfs object and counts the number of rows that have null values in
        fields that should not be null. Stores the number of null rows in warnings_container
        """
        for table, null_warning in zip(DB_TABLE_NAMES, NULL_WARNINGS):
            # TODO: make this validation source by source
            df = self.gtfs.get_table(table)
            df.drop(FIELDS_WHERE_NULL_OK[table], inplace=True, axis=1)
            len_table = len(df.index)
            df.dropna(inplace=True, axis=0)
            len_non_null = len(df.index)
            nullrows = len_table - len_non_null
            if nullrows > 0:
                self.warnings_container.add_warning(self.location,
                                                    null_warning,
                                                    value=nullrows)

    def _validate_danglers(self):
        """
        Check for rows that are not referenced in the tables that should be linked:

        stops <> stop_times using stop_I
        stop_times <> trips <> days, using trip_I
        trips <> routes, using route_I
        """
        for query, warning in zip(DANGLER_QUERIES, DANGLER_WARNINGS):
            dangler_count = self.gtfs.execute_custom_query(query).fetchone()[0]
            if dangler_count > 0:
                print(str(dangler_count) + " " + warning)
                self.warnings_container.add_warning(self.location,
                                                    warning,
                                                    value=dangler_count)

    def _frequency_generated_trips(self, source, txt):
        """
        Calculate the equivalent trips row count, taking into account the
        rows generated from frequencies.txt in the GTFS object.
        :param source: path to the GTFS source (directory or zip)
        :param txt: name of the trips source file
        :return: total number of trips, including frequency-generated ones
        """
        df_freq = source_table_txt_to_pandas(source, u'frequencies.txt')
        df_trips = source_table_txt_to_pandas(source, txt)
        df_freq['n_trips'] = df_freq.apply(
            lambda row: len(range(str_time_to_day_seconds(row['start_time']),
                                  str_time_to_day_seconds(row['end_time']),
                                  row['headway_secs'])),
            axis=1)
        self.df_freq_dict[source] = df_freq
        df_trips_freq = pd.merge(df_freq, df_trips, how='outer', on='trip_id')

        return int(df_trips_freq['n_trips'].fillna(1).sum(axis=0))

    def _frequency_generated_stop_times(self, source, txt):
        """
        Same as _frequency_generated_trips, but for the stop_times table.
        :param source: path to the GTFS source (directory or zip)
        :param txt: name of the stop_times source file
        :return: total number of stop_times rows, including frequency-generated ones
        """
        df_stop_times = source_table_txt_to_pandas(source, txt)
        df_freq = self.df_freq_dict[source]
        df_stop_freq = pd.merge(df_freq,
                                df_stop_times,
                                how='outer',
                                on='trip_id')

        return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))
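
A minimal usage sketch for this variant, assuming gtfspy's package layout (GTFS lives in gtfspy.gtfs; the module path for ImportValidator is an assumption) and placeholder paths:

from gtfspy.gtfs import GTFS
from gtfspy.import_validator import ImportValidator  # assumed module path

gtfs = GTFS("city.sqlite")  # placeholder: an already-imported GTFS database
validator = ImportValidator(["city_gtfs/"], gtfs)  # placeholder: raw source dir
warnings = validator.get_warnings()  # runs all checks and prints a summary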
Example #2
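# Later revision of the same validator. Besides pandas, six, GTFS, and
# WarningsContainer, it assumes source_csv_to_pandas, str_time_to_day_seconds,
# and the DB_TABLE_NAME_TO_* mapping constants from the same module.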
class ImportValidator(object):
    def __init__(self, gtfssource, gtfs, verbose=True):
        """
        Parameters
        ----------
        gtfssource: str, dict, or list thereof
            path(s) to the GTFS source file(s), or a dict containing the GTFS data directly
        gtfs: gtfspy.gtfs.GTFS, or path to a relevant .sqlite GTFS database
        verbose: bool
            Whether or not to print warnings on-the-fly.
        """
        if isinstance(gtfssource, string_types + (dict, )):
            self.gtfs_sources = [gtfssource]
        else:
            assert isinstance(gtfssource, list)
            self.gtfs_sources = gtfssource
        assert len(
            self.gtfs_sources
        ) > 0, "There needs to be some source files for validating an import"

        if not isinstance(gtfs, GTFS):
            self.gtfs = GTFS(gtfs)
        else:
            self.gtfs = gtfs

        self.location = self.gtfs.get_location_name()
        self.warnings_container = WarningsContainer()
        self.verbose = verbose

    def validate_and_get_warnings(self):
        self.warnings_container.clear()
        self._validate_table_row_counts()
        self._validate_no_null_values()
        self._validate_danglers()
        return self.warnings_container

    def _validate_table_row_counts(self):
        """
        Import the source .txt files, count their rows, and compare the counts
        against the corresponding tables in the GTFS object.
        """
        for db_table_name in DB_TABLE_NAME_TO_SOURCE_FILE.keys():
            table_name_source_file = DB_TABLE_NAME_TO_SOURCE_FILE[
                db_table_name]
            row_warning_str = DB_TABLE_NAME_TO_ROWS_MISSING_WARNING[
                db_table_name]

            # Row count in GTFS object:
            database_row_count = self.gtfs.get_row_count(db_table_name)

            # Row counts in source files:
            source_row_count = 0
            for gtfs_source in self.gtfs_sources:
                frequencies_in_source = source_csv_to_pandas(
                    gtfs_source, 'frequencies.txt')
                try:
                    if table_name_source_file == 'trips' and not frequencies_in_source.empty:
                        source_row_count += self._frequency_generated_trips_rows(
                            gtfs_source)

                    elif table_name_source_file == 'stop_times' and not frequencies_in_source.empty:
                        source_row_count += self._compute_number_of_frequency_generated_stop_times(
                            gtfs_source)
                    else:
                        df = source_csv_to_pandas(gtfs_source,
                                                  table_name_source_file)

                        source_row_count += len(df.index)
                except IOError as e:
                    # A missing source file for this table is tolerated;
                    # anything else is re-raised.
                    if hasattr(e, "filename") and db_table_name in e.filename:
                        pass
                    else:
                        raise

            if source_row_count == database_row_count:
                if self.verbose:
                    print("Row counts match for " + table_name_source_file +
                          " between the source and database (" +
                          str(database_row_count) + ")")
            else:
                difference = database_row_count - source_row_count
                if self.verbose:
                    print('Row counts do not match for ' +
                          str(table_name_source_file) + ': (source=' +
                          str(source_row_count) + ', database=' +
                          str(database_row_count) + ")")
                if table_name_source_file == "calendar" and difference > 0:
                    query = "SELECT count(*) FROM (SELECT * FROM calendar ORDER BY service_I DESC LIMIT " \
                            + str(int(difference)) + \
                            ") WHERE start_date=end_date AND m=0 AND t=0 AND w=0 AND th=0 AND f=0 AND s=0 AND su=0"
                    number_of_entries_added_by_calendar_dates_loader = self.gtfs.execute_custom_query(
                        query).fetchone()[0]
                    if number_of_entries_added_by_calendar_dates_loader == difference and self.verbose:
                        print(
                            "    But don't worry, the extra entries seem to just dummy entries due to calendar_dates"
                        )
                    else:
                        if self.verbose:
                            print("    Reason for this is unknown.")
                        self.warnings_container.add_warning(
                            row_warning_str, self.location, difference)
                else:
                    self.warnings_container.add_warning(
                        row_warning_str, self.location, difference)

    def _validate_no_null_values(self):
        """
        Loads the tables from the gtfs object and counts the number of rows that have null values in
        fields that should not be null. Stores the number of null rows in warnings_container
        """
        for table in DB_TABLE_NAMES:
            null_not_ok_warning = "Null values in must-have columns in table {table}".format(
                table=table)
            null_warn_warning = "Null values in good-to-have columns in table {table}".format(
                table=table)
            null_not_ok_fields = DB_TABLE_NAME_TO_FIELDS_WHERE_NULL_NOT_OK[
                table]
            null_warn_fields = DB_TABLE_NAME_TO_FIELDS_WHERE_NULL_OK_BUT_WARN[
                table]

            # CW, TODO: make this validation source by source
            df = self.gtfs.get_table(table)

            for warning, fields in zip(
                [null_not_ok_warning, null_warn_warning],
                [null_not_ok_fields, null_warn_fields]):
                null_unwanted_df = df[fields]
                rows_having_null = null_unwanted_df.isnull().any(axis=1)
                if sum(rows_having_null) > 0:
                    rows_having_unwanted_null = df[rows_having_null.values]
                    self.warnings_container.add_warning(
                        warning, rows_having_unwanted_null,
                        len(rows_having_unwanted_null))

    def _validate_danglers(self):
        """
        Check for rows that are not referenced in the tables that should be linked:

        stops <> stop_times using stop_I
        stop_times <> trips <> days, using trip_I
        trips <> routes, using route_I
        """
        for query, warning in zip(DANGLER_QUERIES, DANGLER_WARNINGS):
            dangler_count = self.gtfs.execute_custom_query(query).fetchone()[0]
            if dangler_count > 0:
                if self.verbose:
                    print(str(dangler_count) + " " + warning)
                self.warnings_container.add_warning(warning,
                                                    self.location,
                                                    count=dangler_count)

    def _frequency_generated_trips_rows(self,
                                        gtfs_source_path,
                                        return_df_freq=False):
        """
        Calculate the equivalent trips row count, taking into account the
        rows generated from frequencies.txt in the GTFS object.

        Parameters
        ----------
        gtfs_source_path: path to the GTFS source (directory or zip)
        return_df_freq: bool
            if True, return the merged frequencies/trips DataFrame instead of the row count

        Returns
        -------
        int or pandas.DataFrame
        """
        df_freq = source_csv_to_pandas(gtfs_source_path, 'frequencies')
        df_trips = source_csv_to_pandas(gtfs_source_path, "trips")
        df_freq['n_trips'] = df_freq.apply(
            lambda row: len(range(str_time_to_day_seconds(row['start_time']),
                                  str_time_to_day_seconds(row['end_time']),
                                  row['headway_secs'])),
            axis=1)
        df_trips_freq = pd.merge(df_freq, df_trips, how='outer', on='trip_id')
        if return_df_freq:
            return df_trips_freq
        return int(df_trips_freq['n_trips'].fillna(1).sum(axis=0))

    def _compute_number_of_frequency_generated_stop_times(
            self, gtfs_source_path):
        """
        Parameters
        ----------
        Same as for "_frequency_generated_trips_rows" but for stop times table
        gtfs_source_path:
        table_name:

        Return
        ------
        """
        df_freq = self._frequency_generated_trips_rows(gtfs_source_path,
                                                       return_df_freq=True)
        df_stop_times = source_csv_to_pandas(gtfs_source_path, "stop_times")
        df_stop_freq = pd.merge(df_freq,
                                df_stop_times,
                                how='outer',
                                on='trip_id')
        return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))
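
Both variants count frequency-generated trips as headway slots in [start_time, end_time), start inclusive and end exclusive, via len(range(...)). A self-contained sketch of that arithmetic, using hypothetical times and headway (no gtfspy required):

import pandas as pd

df_freq = pd.DataFrame({
    "trip_id": ["t1"],
    "start_time_sec": [6 * 3600],  # 06:00:00 in day seconds
    "end_time_sec": [8 * 3600],    # 08:00:00 in day seconds
    "headway_secs": [600],         # one departure every 10 minutes
})
# One generated trip per headway slot, exactly as in the apply() calls above.
df_freq["n_trips"] = df_freq.apply(
    lambda row: len(range(row["start_time_sec"],
                          row["end_time_sec"],
                          row["headway_secs"])),
    axis=1)
print(int(df_freq["n_trips"].sum()))  # 12: departures 06:00, 06:10, ..., 07:50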