class ImportValidator(object):
    """Validate a GTFS database against the source .txt files it was imported from.

    Compares row counts, checks for unwanted NULL values, and looks for
    "dangling" rows that are not referenced by the tables they should be
    linked to.  Results are accumulated into a WarningsContainer.
    """

    def __init__(self, gtfssource, gtfs):
        """
        Parameters
        ----------
        gtfssource: str, dict, or list of str/dict
            path(s) to the GTFS source(s), or dict(s) containing the GTFS
            data directly
        gtfs: GTFS, or path to a GTFS object
            A GTFS object
        """
        # Per-source cache of the frequencies DataFrame (with the derived
        # 'n_trips' column).  Filled by _frequency_generated_trips and
        # reused by _frequency_generated_stop_times.
        self.df_freq_dict = {}
        if isinstance(gtfssource, string_types + (dict, )):
            # A single source: wrap it into a one-element list.
            self.gtfs_sources = [gtfssource]
        else:
            assert isinstance(gtfssource, list)
            self.gtfs_sources = gtfssource
        assert len(
            self.gtfs_sources
        ) > 0, "There needs to be some source files for validating an import"
        # Accept either a ready-made GTFS object or a path to one.
        if not isinstance(gtfs, GTFS):
            self.gtfs = GTFS(gtfs)
        else:
            self.gtfs = gtfs
        self.location = self.gtfs.get_location_name()
        self.warnings_container = WarningsContainer()

    def get_warnings(self):
        """Run all validations, print a summary, and return the warnings.

        Returns
        -------
        WarningsContainer
        """
        self.warnings_container.clear()
        self._validate_table_counts()
        self._validate_no_nulls()
        self._validate_danglers()
        self.warnings_container.print_summary()
        return self.warnings_container

    def _validate_table_counts(self):
        """
        Imports source .txt files, checks row counts and then compares the
        rowcounts with the gtfsobject

        :return: None (warnings are recorded in self.warnings_container)
        """
        for table_name_txt, db_table_name, row_warning in zip(
                SOURCE_TABLE_NAMES, DB_TABLE_NAMES, ROW_WARNINGS):
            source_row_count = 0
            for gtfs_source in self.gtfs_sources:
                # Needed to decide whether trips/stop_times rows are
                # expanded from frequencies.txt entries during import.
                frequencies_in_source = source_table_txt_to_pandas(
                    gtfs_source, 'frequencies.txt')
                try:
                    if table_name_txt == 'trips' and not frequencies_in_source.empty:
                        source_row_count += self._frequency_generated_trips(
                            gtfs_source, table_name_txt)
                    elif table_name_txt == 'stop_times' and not frequencies_in_source.empty:
                        source_row_count += self._frequency_generated_stop_times(
                            gtfs_source, table_name_txt)
                    else:
                        df = source_table_txt_to_pandas(
                            gtfs_source, table_name_txt)
                        source_row_count += len(df.index)
                except IOError as e:
                    # Missing optional source file: report and keep counting
                    # the remaining sources (best-effort by design).
                    print(e)

            # Result from GTFSobj:
            database_row_count = self.gtfs.get_row_count(db_table_name)
            if source_row_count == database_row_count:
                print("Row counts match for " + table_name_txt +
                      " between the source and database (" +
                      str(database_row_count) + ")")
            else:
                difference = database_row_count - source_row_count
                print('Row counts do not match for ' + str(table_name_txt) +
                      ': (source=' + str(source_row_count) + ', database=' +
                      str(database_row_count) + ")")
                if table_name_txt == "calendar" and difference > 0:
                    # The calendar_dates loader may add dummy calendar rows
                    # (no active weekdays, start_date == end_date); if the
                    # surplus is fully explained by those, no warning is due.
                    query = "SELECT count(*) FROM (SELECT * FROM calendar ORDER BY service_I DESC LIMIT " \
                        + str(int(difference)) + \
                        ") WHERE start_date=end_date AND m=0 AND t=0 AND w=0 AND th=0 AND f=0 AND s=0 AND su=0"
                    number_of_entries_added_by_calendar_dates_loader = self.gtfs.execute_custom_query(
                        query).fetchone()[0]
                    if number_of_entries_added_by_calendar_dates_loader == difference:
                        print(
                            " But don't worry, the extra entries seem to just dummy entries due to calendar_dates"
                        )
                    else:
                        print(" Reason for this is unknown.")
                        self.warnings_container.add_warning(
                            self.location, row_warning, difference)
                else:
                    self.warnings_container.add_warning(
                        self.location, row_warning, difference)

    def _validate_no_nulls(self):
        """
        Loads the tables from the gtfs object and counts the number of rows
        that have null values in fields that should not be null.

        Stores the number of null rows in warnings_container
        """
        for table, null_warning in zip(DB_TABLE_NAMES, NULL_WARNINGS):
            # TODO: make this validation source by source
            df = self.gtfs.get_table(table)
            # Columns where NULLs are acceptable are excluded before counting.
            df.drop(FIELDS_WHERE_NULL_OK[table], inplace=True, axis=1)
            len_table = len(df.index)
            df.dropna(inplace=True, axis=0)
            len_non_null = len(df.index)
            nullrows = len_table - len_non_null
            if nullrows > 0:
                self.warnings_container.add_warning(self.location,
                                                    null_warning,
                                                    value=nullrows)

    def _validate_danglers(self):
        """
        Checks for rows that are not referenced in the the tables that should be linked

        stops <> stop_times using stop_I
        stop_times <> trips <> days, using trip_I
        trips <> routes, using route_I
        :return: None (warnings are recorded in self.warnings_container)
        """
        for query, warning in zip(DANGLER_QUERIES, DANGLER_WARNINGS):
            dangler_count = self.gtfs.execute_custom_query(query).fetchone()[0]
            if dangler_count > 0:
                print(str(dangler_count) + " " + warning)
                self.warnings_container.add_warning(self.location,
                                                    warning,
                                                    value=dangler_count)

    def _frequency_generated_trips(self, source, txt):
        """
        This function calculates the equivalent rowcounts for trips when
        taking into account the generated rows in the gtfs object

        :param source: path to the source file
        :param txt: txt file in question
        :return: sum of all trips
        """
        df_freq = source_table_txt_to_pandas(source, u'frequencies.txt')
        df_trips = source_table_txt_to_pandas(source, txt)
        # Each frequencies.txt row expands into one trip per headway step
        # within [start_time, end_time).
        df_freq['n_trips'] = df_freq.apply(lambda row: len(
            range(str_time_to_day_seconds(row['start_time']),
                  str_time_to_day_seconds(row['end_time']),
                  row['headway_secs'])), axis=1)
        # Cache for _frequency_generated_stop_times.
        # NOTE(review): this assumes `source` is hashable (a path string) —
        # a dict source would fail here; confirm against callers.
        self.df_freq_dict[source] = df_freq
        # Trips without a frequencies entry count once (fillna(1)).
        df_trips_freq = pd.merge(df_freq, df_trips, how='outer', on='trip_id')
        return int(df_trips_freq['n_trips'].fillna(1).sum(axis=0))

    def _frequency_generated_stop_times(self, source, txt):
        """
        same as above except for stop times table
        :param source: path to the source file
        :param txt: txt file in question
        :return: sum of all stop_times rows, frequencies expansion included
        """
        df_stop_times = source_table_txt_to_pandas(source, txt)
        # Previously this indexed self.df_freq_dict directly and raised
        # KeyError if trips had not been processed first for this source;
        # recompute the cache entry on demand instead.
        if source not in self.df_freq_dict:
            self._frequency_generated_trips(source, 'trips')
        df_freq = self.df_freq_dict[source]
        df_stop_freq = pd.merge(df_freq, df_stop_times, how='outer',
                                on='trip_id')
        return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))
class ImportValidator(object):
    """Validate an imported GTFS database against its source files.

    Checks table row counts against the sources, NULL values in columns that
    must or should be populated, and dangling (unreferenced) rows.  Warnings
    are collected into a WarningsContainer.
    """

    def __init__(self, gtfssource, gtfs, verbose=True):
        """
        Parameters
        ----------
        gtfs_sources: list, string, dict
            list of paths to the strings, or a dictionary directly containing
            the gtfs data directly
        gtfs: gtfspy.gtfs.GTFS, or path to a relevant .sqlite GTFS database
        verbose: bool
            Whether or not to print warnings on-the-fly.
        """
        if isinstance(gtfssource, string_types + (dict, )):
            # A single source: wrap it into a one-element list.
            self.gtfs_sources = [gtfssource]
        else:
            assert isinstance(gtfssource, list)
            self.gtfs_sources = gtfssource
        assert len(
            self.gtfs_sources
        ) > 0, "There needs to be some source files for validating an import"
        # Accept either a ready-made GTFS object or a path to the database.
        if not isinstance(gtfs, GTFS):
            self.gtfs = GTFS(gtfs)
        else:
            self.gtfs = gtfs
        self.location = self.gtfs.get_location_name()
        self.warnings_container = WarningsContainer()
        self.verbose = verbose

    def validate_and_get_warnings(self):
        """Run all validations and return the collected warnings.

        Returns
        -------
        WarningsContainer
        """
        self.warnings_container.clear()
        self._validate_table_row_counts()
        self._validate_no_null_values()
        self._validate_danglers()
        return self.warnings_container

    def _validate_table_row_counts(self):
        """
        Imports source .txt files, checks row counts and then compares the
        rowcounts with the gtfsobject

        :return: None (warnings are recorded in self.warnings_container)
        """
        for db_table_name, table_name_source_file in DB_TABLE_NAME_TO_SOURCE_FILE.items():
            row_warning_str = DB_TABLE_NAME_TO_ROWS_MISSING_WARNING[
                db_table_name]

            # Row count in GTFS object:
            database_row_count = self.gtfs.get_row_count(db_table_name)

            # Row counts in source files:
            source_row_count = 0
            for gtfs_source in self.gtfs_sources:
                # Needed to decide whether trips/stop_times rows are expanded
                # from frequencies.txt entries during import.
                frequencies_in_source = source_csv_to_pandas(
                    gtfs_source, 'frequencies.txt')
                try:
                    if table_name_source_file == 'trips' and not frequencies_in_source.empty:
                        source_row_count += self._frequency_generated_trips_rows(
                            gtfs_source)
                    elif table_name_source_file == 'stop_times' and not frequencies_in_source.empty:
                        source_row_count += self._compute_number_of_frequency_generated_stop_times(
                            gtfs_source)
                    else:
                        df = source_csv_to_pandas(gtfs_source,
                                                  table_name_source_file)
                        source_row_count += len(df.index)
                except IOError as e:
                    # Ignore only a missing file for this very table; any
                    # other IOError is unexpected and must propagate.
                    # NOTE(review): the check uses the DB table name, while
                    # e.filename would contain the source file name — confirm
                    # the two always overlap for every table.
                    if hasattr(e, "filename") and db_table_name in e.filename:
                        pass
                    else:
                        raise e

            if source_row_count == database_row_count:
                # Match: nothing to record; only report when verbose.
                # (Previously the match test was ANDed with self.verbose, so a
                # non-verbose run recorded a spurious warning with difference 0.)
                if self.verbose:
                    print("Row counts match for " + table_name_source_file +
                          " between the source and database (" +
                          str(database_row_count) + ")")
            else:
                difference = database_row_count - source_row_count
                # (Previously this message was a bare, discarded expression —
                # the print() call had been dropped.)
                if self.verbose:
                    print('Row counts do not match for ' +
                          str(table_name_source_file) + ': (source=' +
                          str(source_row_count) + ', database=' +
                          str(database_row_count) + ")")
                if table_name_source_file == "calendar" and difference > 0:
                    # The calendar_dates loader may add dummy calendar rows
                    # (no active weekdays, start_date == end_date); if the
                    # surplus is fully explained by those, no warning is due.
                    query = "SELECT count(*) FROM (SELECT * FROM calendar ORDER BY service_I DESC LIMIT " \
                        + str(int(difference)) + \
                        ") WHERE start_date=end_date AND m=0 AND t=0 AND w=0 AND th=0 AND f=0 AND s=0 AND su=0"
                    number_of_entries_added_by_calendar_dates_loader = self.gtfs.execute_custom_query(
                        query).fetchone()[0]
                    if number_of_entries_added_by_calendar_dates_loader == difference:
                        # Fully explained by dummy entries -> no warning.
                        # (Previously ANDed with self.verbose, so a non-verbose
                        # run recorded a spurious warning here.)
                        if self.verbose:
                            print(
                                " But don't worry, the extra entries seem to just dummy entries due to calendar_dates"
                            )
                    else:
                        if self.verbose:
                            print(" Reason for this is unknown.")
                        self.warnings_container.add_warning(
                            row_warning_str, self.location, difference)
                else:
                    self.warnings_container.add_warning(
                        row_warning_str, self.location, difference)

    def _validate_no_null_values(self):
        """
        Loads the tables from the gtfs object and counts the number of rows
        that have null values in fields that should not be null.

        Stores the number of null rows in warnings_container
        """
        for table in DB_TABLE_NAMES:
            null_not_ok_warning = "Null values in must-have columns in table {table}".format(
                table=table)
            null_warn_warning = "Null values in good-to-have columns in table {table}".format(
                table=table)
            null_not_ok_fields = DB_TABLE_NAME_TO_FIELDS_WHERE_NULL_NOT_OK[
                table]
            null_warn_fields = DB_TABLE_NAME_TO_FIELDS_WHERE_NULL_OK_BUT_WARN[
                table]

            # CW, TODO: make this validation source by source
            df = self.gtfs.get_table(table)

            for warning, fields in zip(
                    [null_not_ok_warning, null_warn_warning],
                    [null_not_ok_fields, null_warn_fields]):
                null_unwanted_df = df[fields]
                # Row mask: True where any of the inspected columns is null.
                rows_having_null = null_unwanted_df.isnull().any(axis=1)
                if sum(rows_having_null) > 0:
                    rows_having_unwanted_null = df[rows_having_null.values]
                    self.warnings_container.add_warning(
                        warning, rows_having_unwanted_null,
                        len(rows_having_unwanted_null))

    def _validate_danglers(self):
        """
        Checks for rows that are not referenced in the the tables that should be linked

        stops <> stop_times using stop_I
        stop_times <> trips <> days, using trip_I
        trips <> routes, using route_I
        :return: None (warnings are recorded in self.warnings_container)
        """
        for query, warning in zip(DANGLER_QUERIES, DANGLER_WARNINGS):
            dangler_count = self.gtfs.execute_custom_query(query).fetchone()[0]
            if dangler_count > 0:
                if self.verbose:
                    print(str(dangler_count) + " " + warning)
                self.warnings_container.add_warning(warning,
                                                    self.location,
                                                    count=dangler_count)

    def _frequency_generated_trips_rows(self,
                                        gtfs_source_path,
                                        return_df_freq=False):
        """
        This function calculates the equivalent rowcounts for trips when
        taking into account the generated rows in the gtfs object

        Parameters
        ----------
        gtfs_source_path: path to the source file
        return_df_freq: bool
            if True, return the merged frequencies/trips DataFrame instead of
            the row count

        Returns
        -------
        int (row count) or pandas.DataFrame (if return_df_freq is True)
        """
        # NOTE(review): file name passed without ".txt" here, unlike the
        # 'frequencies.txt' used elsewhere — presumably source_csv_to_pandas
        # accepts both; confirm.
        df_freq = source_csv_to_pandas(gtfs_source_path, 'frequencies')
        df_trips = source_csv_to_pandas(gtfs_source_path, "trips")
        # Each frequencies row expands into one trip per headway step within
        # [start_time, end_time).
        df_freq['n_trips'] = df_freq.apply(lambda row: len(
            range(str_time_to_day_seconds(row['start_time']),
                  str_time_to_day_seconds(row['end_time']),
                  row['headway_secs'])), axis=1)
        # Trips without a frequencies entry count once (fillna(1)).
        df_trips_freq = pd.merge(df_freq, df_trips, how='outer', on='trip_id')
        n_freq_generated_trips = int(
            df_trips_freq['n_trips'].fillna(1).sum(axis=0))
        if return_df_freq:
            return df_trips_freq
        else:
            return n_freq_generated_trips

    def _compute_number_of_frequency_generated_stop_times(
            self, gtfs_source_path):
        """
        Same as "_frequency_generated_trips_rows" but for the stop_times table.

        Parameters
        ----------
        gtfs_source_path: path to the source file

        Returns
        -------
        int
            equivalent stop_times row count, frequencies expansion included
        """
        df_freq = self._frequency_generated_trips_rows(gtfs_source_path,
                                                       return_df_freq=True)
        df_stop_times = source_csv_to_pandas(gtfs_source_path, "stop_times")
        df_stop_freq = pd.merge(df_freq, df_stop_times, how='outer',
                                on='trip_id')
        return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))