def main(execution_date, **kwargs):
    in_path = f"schedule/{execution_date}"
    print(in_path)

    status = pd.read_csv(read_gcfs(f"{in_path}/status.csv"))

    success = status[status.status == "success"]

    agency_errors = []
    loadable_agencies = []
    for ii, row in success.iterrows():
        path_agency = f"{in_path}/{row['itp_id']}_{row['url_number']}"
        path_validation = f"{path_agency}/{VALIDATION_FILE}"

        print(f"reading validation file: {path_validation}")
        validation = json.load(read_gcfs(path_validation))

        unique_codes = get_notice_codes(validation)

        if ERROR_MISSING_FILE not in unique_codes:
            loadable_agencies.append(path_agency)
        else:
            agency = dict(itp_id=row["itp_id"], url_number=row["url_number"])
            agency_errors.append(agency)

    errors_df = pd.DataFrame(agency_errors)
    errors_str = errors_df.to_csv(index=False).encode()

    save_to_gcfs(
        errors_str, f"{in_path}/processed/agency_load_errors.csv", use_pipe=True
    )

    return loadable_agencies
def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"

    status = pd.read_csv(read_gcfs(f"{base_path}/status.csv"))
    success = status[lambda d: d.status == "success"]

    # hold on to notices, so we can infer schema after
    # note that I've commented out the code for inferring schema below,
    # but it was useful for generating, then hand-tweaking to load
    # into bigquery
    # notice_entries = []

    for k, row in success.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        json_codes = json.dumps(raw_codes).encode()

        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"

    successes = get_successfully_downloaded_feeds(execution_date)

    # hold on to notices, so we can infer schema after
    # note that I've commented out the code for inferring schema below,
    # but it was useful for generating, then hand-tweaking to load
    # into bigquery
    # notice_entries = []

    for k, row in successes.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_extracted_at"] = execution_date.to_date_string()
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        # coerce fields labeled "string" in the schema to a string
        coerce_notice_values_to_str(raw_codes, COERCE_TO_STRING)

        json_codes = json.dumps(raw_codes).encode()

        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")

    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (
        success[["itp_id", "url_number"]]
        .assign(gtfs_file=gtfs_files)
        .explode("gtfs_file")
        .loc[lambda d: d.gtfs_file != "processed"]
    )

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
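For intuition, the .explode("gtfs_file") step above takes each agency row, whose gtfs_file cell holds the list returned by fs.glob, and expands it into one row per file, repeating itp_id and url_number. A minimal sketch with made-up paths (the file names are illustrative, not real pipeline output):

example = pd.DataFrame(
    {
        "itp_id": [1, 2],
        "url_number": [0, 0],
        "gtfs_file": [["1_0/agency.txt", "1_0/stops.txt"], ["2_0/agency.txt"]],
    }
)
example.explode("gtfs_file")
#    itp_id  url_number       gtfs_file
# 0       1           0  1_0/agency.txt
# 0       1           0   1_0/stops.txt
# 1       2           0  2_0/agency.txt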
def get_successfully_downloaded_feeds(execution_date):
    """Get the feeds that were successfully downloaded (as noted in the
    `schedule/{execution_date}/status.csv` file) for a given execution date.
    """
    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    return status[lambda d: d.status == "success"]
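As a rough illustration of what this helper returns (the column set shown is an assumption based only on what the surrounding snippets reference), it keeps just the successfully downloaded rows of status.csv:

status = pd.DataFrame(
    {
        "itp_id": [1, 2],
        "url_number": [0, 0],
        "status": ["success", "error"],
    }
)
status[lambda d: d.status == "success"]
# keeps only the first row; downstream tasks iterate over these rows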
def _keep_columns(
    gcs_dirs, dst_dir, filename, required_cols, optional_cols, prepend_ids=True
):
    for path in gcs_dirs:
        full_src_path = f"{path}/{filename}"
        full_dst_path = f"{path}/{dst_dir}/{filename}"

        final_header = [*required_cols, *optional_cols]

        # read csv using object dtype, so pandas does not coerce data
        df = pd.read_csv(read_gcfs(full_src_path), dtype="object")

        # preprocess data to include cal-itp id columns ----
        # column names: calitp_itp_id, calitp_url_number
        if prepend_ids:
            # hacky, but parse /path/.../{itp_id}_{url_number}
            basename = path.split("/")[-1]
            itp_id, url_number = map(int, basename.split("_"))

            df = df.assign(calitp_itp_id=itp_id, calitp_url_number=url_number)

        # get specified columns, inserting NA columns where needed ----
        df_cols = set(df.columns)
        opt_cols_present = [x for x in optional_cols if x in df_cols]

        df_select = df[[*required_cols, *opt_cols_present]]

        # fill in missing columns ----
        for ii, colname in enumerate(final_header):
            if colname not in df_select:
                print("INSERTING MISSING COLUMN")
                df_select.insert(ii, colname, pd.NA)

        print("SHAPE: ", df_select.shape)

        # save result ----
        csv_result = df_select
        encoded = csv_result.to_csv(index=False).encode()

        save_to_gcfs(encoded, full_dst_path, use_pipe=True)
def _keep_columns(
    src_path,
    dst_path,
    colnames,
    itp_id=None,
    url_number=None,
    extracted_at=None,
    **kwargs,
):
    """Save a CSV file with only the needed columns for a particular table.

    Args:
        src_path (string): Location of the input CSV file
        dst_path (string): Location of the output CSV file
        colnames (list): List of the column names that should be included in the
            output CSV file.
        itp_id (string, optional): itp_id to use when saving record. Defaults to None.
        url_number (string, optional): url_number to use when saving record.
            Defaults to None.
        extracted_at (string, optional): date string of extraction time.
            Defaults to None.

    Raises:
        pandas.errors.ParserError: Can be thrown when the given input file is not a
            valid CSV file. Ex: a single row could have too many columns.
    """
    # Read csv using object dtype, so pandas does not coerce data.
    # The following line of code inside the try block can throw a
    # pandas.errors.ParserError, but the responsibility to catch this error is
    # assumed to be implemented in the code that calls this method.
    try:
        df = pd.read_csv(
            read_gcfs(src_path), dtype="object", encoding_errors="replace", **kwargs
        )
    except EmptyDataError:
        # in the rare case of a totally empty data file, create a DataFrame
        # with no rows, and the target columns
        df = pd.DataFrame({k: [] for k in colnames})

    if itp_id is not None:
        df["calitp_itp_id"] = itp_id

    if url_number is not None:
        df["calitp_url_number"] = url_number

    # get specified columns, inserting NA columns where needed ----
    df_cols = set(df.columns)
    cols_present = [x for x in colnames if x in df_cols]

    df_select = df.loc[:, cols_present]

    # fill in missing columns ----
    print("DataFrame missing columns: ", set(colnames) - set(df_select.columns))

    for ii, colname in enumerate(colnames):
        if colname not in df_select:
            df_select.insert(ii, colname, pd.NA)

    if extracted_at is not None:
        df_select["calitp_extracted_at"] = extracted_at

    # save result ----
    csv_result = df_select.to_csv(index=False).encode()

    save_to_gcfs(csv_result, dst_path, use_pipe=True)
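For context, a call to this refactored _keep_columns might look like the sketch below; the paths, column list, and ids are illustrative assumptions rather than values taken from the pipeline:

_keep_columns(
    src_path="schedule/2021-01-01T00:00:00+00:00/1_0/agency.txt",  # hypothetical
    dst_path="schedule/2021-01-01T00:00:00+00:00/1_0/processed/agency.txt",  # hypothetical
    colnames=["agency_id", "agency_name", "agency_url", "agency_timezone"],
    itp_id=1,
    url_number=0,
    extracted_at="2021-01-01",
)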