def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"

    successes = get_successfully_downloaded_feeds(execution_date)

    # hold on to notices, so we can infer schema after
    # note that I've commented out the code for inferring schema below,
    # but it was useful for generating, then hand-tweaking to load
    # into bigquery
    # notice_entries = []
    for k, row in successes.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_extracted_at"] = execution_date.to_date_string()
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        # coerce types labeled "string" to a string
        coerce_notice_values_to_str(raw_codes, COERCE_TO_STRING)

        json_codes = json.dumps(raw_codes).encode()

        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
def execute(self, context):
    # use the DAG's logical date as the data interval start,
    # and ensure the 'start' hour is 0 no matter what the 'schedule_interval' is.
    start_datetime = context.get("execution_date").set(hour=0)
    # add 23 hours to the start date to make the total range equal to 24 hours.
    # (the 'end' parameter is inclusive:
    # https://developers.amplitude.com/docs/export-api#export-api-parameters)
    start = start_datetime.strftime(DATE_FORMAT)
    end = (start_datetime + timedelta(hours=23)).strftime(DATE_FORMAT)

    events_df = amplitude_to_df(
        start,
        end,
        api_key_env=self.api_key_env,
        secret_key_env=self.secret_key_env,
        rename_fields=self.rename_fields,
    )

    events_jsonl = events_df.to_json(orient="records", lines=True, date_format="iso")
    gcs_file_path = f"{self.app_name}/{start}-{end}.jsonl"
    bucket_name = (
        "ingest_amplitude_raw_dev" if is_development() else "ingest_amplitude_raw_prod"
    )

    # if a file already exists at `gcs_file_path`, GCS will overwrite the existing file
    calitp.save_to_gcfs(
        events_jsonl.encode(), gcs_file_path, bucket=bucket_name, use_pipe=True
    )
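
# Illustration only (not part of the operator above): a worked example of the
# 24-hour window computed from the DAG's logical date. The pendulum import and
# the DATE_FORMAT value are assumptions for this sketch; the real constant is
# defined elsewhere in the module.
import pendulum
from datetime import timedelta

DATE_FORMAT = "%Y%m%dT%H"  # assumed hour-granular format for the Amplitude Export API

execution_date = pendulum.datetime(2021, 6, 1, 7)

start_datetime = execution_date.set(hour=0)
start = start_datetime.strftime(DATE_FORMAT)                        # "20210601T00"
end = (start_datetime + timedelta(hours=23)).strftime(DATE_FORMAT)  # "20210601T23"

# because the Export API's 'end' parameter is inclusive, this range covers a full day
print(start, end)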
def main(execution_date, **kwargs):
    in_path = f"schedule/{execution_date}"
    print(in_path)

    successes = get_successfully_downloaded_feeds(execution_date)

    agency_errors = []
    loadable_agencies = []
    for ii, row in successes.iterrows():
        path_agency = f"{in_path}/{row['itp_id']}_{row['url_number']}"
        path_validation = f"{path_agency}/{VALIDATION_FILE}"

        print(f"reading validation file: {path_validation}")
        validation = json.load(read_gcfs(path_validation))

        unique_codes = get_notice_codes(validation)

        if ERROR_MISSING_FILE not in unique_codes:
            loadable_agencies.append(path_agency)
        else:
            agency = dict(itp_id=row["itp_id"], url_number=row["url_number"])
            agency_errors.append(agency)

    errors_df = pd.DataFrame(agency_errors)
    errors_str = errors_df.to_csv(index=False).encode()
    save_to_gcfs(
        errors_str,
        f"{in_path}/processed/agency_load_errors.csv",
        use_pipe=True,
    )

    return loadable_agencies
def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")
    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (
        success[["itp_id", "url_number"]]
        .assign(gtfs_file=gtfs_files)
        .explode("gtfs_file")
        .loc[lambda d: d.gtfs_file != "processed"]
    )

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
def main(execution_date, **kwargs):
    fs = get_fs()
    bucket = get_bucket()

    successes = get_successfully_downloaded_feeds(execution_date)

    gtfs_file = []
    for ii, row in successes.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        agency_url = f"{bucket}/schedule/{execution_date}/{agency_folder}"

        dir_files = [x for x in fs.listdir(agency_url) if x["type"] == "file"]

        for x in dir_files:
            gtfs_file.append(
                {
                    "calitp_itp_id": row["itp_id"],
                    "calitp_url_number": row["url_number"],
                    "calitp_extracted_at": execution_date.to_date_string(),
                    "full_path": x["name"],
                    "name": x["name"].split("/")[-1],
                    "size": x["size"],
                    "md5_hash": x["md5Hash"],
                }
            )

    res = pd.DataFrame(gtfs_file)

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
def gen_list(execution_date, **kwargs):
    """
    task callable to generate the list and push into xcom
    """

    # get a table of feed urls from agencies.yml
    # we fetch both the raw and filled w/ API key versions to save
    filled_agencies_file = (
        "data/agencies.filled.yml" if is_development() else "data/agencies.yml"
    )
    feeds_raw = make_gtfs_list(pipe_file_name("data/agencies_raw.yml"))
    feeds = make_gtfs_list(pipe_file_name(filled_agencies_file))

    path_metadata = f"schedule/{execution_date}/metadata"

    save_to_gcfs(
        feeds_raw.to_csv(index=False).encode(),
        f"{path_metadata}/feeds_raw.csv",
        use_pipe=True,
    )
    save_to_gcfs(
        feeds.to_csv(index=False).encode(),
        f"{path_metadata}/feeds.csv",
        use_pipe=True,
    )

    # note that right now we use airflow's xcom functionality in this dag.
    # because xcom can only store a small amount of data, we have to drop some
    # columns. this is the only dag that uses xcom, and we should remove it!
    df_subset = feeds.drop(
        columns=[
            "gtfs_rt_vehicle_positions_url",
            "gtfs_rt_service_alerts_url",
            "gtfs_rt_trip_updates_url",
        ]
    )

    return df_subset.to_dict("records")
def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"

    status = pd.read_csv(read_gcfs(f"{base_path}/status.csv"))
    success = status[lambda d: d.status == "success"]

    # hold on to notices, so we can infer schema after
    # note that I've commented out the code for inferring schema below,
    # but it was useful for generating, then hand-tweaking to load
    # into bigquery
    # notice_entries = []
    for k, row in success.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        json_codes = json.dumps(raw_codes).encode()

        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
def downloader(task_instance, execution_date, **kwargs):
    """Download gtfs data from agency urls

    Returns dict of form {gtfs_paths, errors}
    """
    provider_set = task_instance.xcom_pull(task_ids="generate_provider_list")

    url_status = []
    gtfs_paths = []
    for row in provider_set:
        print(row)
        try:
            res_path = download_url(
                row["gtfs_schedule_url"],
                row["itp_id"],
                row["url_number"],
                execution_date,
            )
            gtfs_paths.append(res_path)
            status = "success"
        except NoFeedError as e:
            logging.warning(f"error downloading agency {row['agency_name']}")
            logging.info(e)
            status = str(e)

        url_status.append(status)

    df_status = pd.DataFrame(provider_set).assign(status=url_status)

    src_path = Path(SRC_DIR) / f"{execution_date}/status.csv"
    dst_path = Path(DST_DIR) / f"{execution_date}/status.csv"

    df_status.convert_dtypes().to_csv(src_path, index=False)
    save_to_gcfs(src_path, dst_path)

    df_errors = df_status[lambda d: d.status != "success"]
    error_agencies = df_errors[["agency_name", "gtfs_schedule_url", "status"]]
    error_records = error_agencies.to_dict(orient="records")

    logging.info(f"error agencies: {error_agencies.agency_name.tolist()}")

    return {"gtfs_paths": gtfs_paths, "errors": error_records}
def glob_daily_files(date_string, fs, logger):
    # TODO: remove hard-coded project string
    # <datetime>/<itp_id>/<url_number>/<filename>
    path_to_glob = f"{get_bucket()}/rt/{date_string}*/*/*/*"

    logger.info("Globbing {}".format(path_to_glob))
    all_files = fs.glob(path_to_glob, detail=True)
    logger.info("Finished globbing")

    fs.dircache.clear()

    raw_res = pd.DataFrame(all_files.values())

    if not len(raw_res):
        logger.info("No data for this date")
        return

    # ----
    # Do some light pre-processing to add internal data

    ser_id = raw_res["name"].str.split("/")

    raw_res["calitp_itp_id"] = ser_id.str.get(-3).astype(int)
    raw_res["calitp_url_number"] = ser_id.str.get(-2).astype(int)
    raw_res["calitp_extracted_at"] = ser_id.str.get(-4)
    raw_res["full_path"] = raw_res["name"]
    raw_res["name"] = ser_id.str.get(-1)
    raw_res["md5_hash"] = raw_res["md5Hash"]

    res = raw_res[
        [
            "calitp_itp_id",
            "calitp_url_number",
            "calitp_extracted_at",
            "name",
            "size",
            "md5_hash",
        ]
    ]

    output_path = f"rt-processed/calitp_files/{date_string}.csv"

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        output_path,
        use_pipe=True,
    )
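
# Illustration only: how the negative-index splits above map onto the expected
# object layout <bucket>/rt/<datetime>/<itp_id>/<url_number>/<filename>.
# The bucket and file names below are made up for this example.
sample = "example-bucket/rt/2021-06-01T00:00:00/200/0/gtfs_rt_vehicle_positions"
parts = sample.split("/")

print(parts[-4])  # "2021-06-01T00:00:00"        -> calitp_extracted_at
print(parts[-3])  # "200"                        -> calitp_itp_id
print(parts[-2])  # "0"                          -> calitp_url_number
print(parts[-1])  # "gtfs_rt_vehicle_positions"  -> name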
def _keep_columns(gcs_dirs, dst_dir, filename, required_cols, optional_cols, prepend_ids=True):
    for path in gcs_dirs:
        full_src_path = f"{path}/{filename}"
        full_dst_path = f"{path}/{dst_dir}/{filename}"

        final_header = [*required_cols, *optional_cols]

        # read csv using object dtype, so pandas does not coerce data
        df = pd.read_csv(read_gcfs(full_src_path), dtype="object")

        # preprocess data to include cal-itp id columns ---
        # column names: calitp_id, calitp_url_number
        if prepend_ids:
            # hacky, but parse /path/.../{itp_id}/{url_number}
            basename = path.split("/")[-1]
            itp_id, url_number = map(int, basename.split("_"))

            df = df.assign(calitp_itp_id=itp_id, calitp_url_number=url_number)

        # get specified columns, inserting NA columns where needed ----
        df_cols = set(df.columns)
        opt_cols_present = [x for x in optional_cols if x in df_cols]

        df_select = df[[*required_cols, *opt_cols_present]]

        # fill in missing columns ----
        for ii, colname in enumerate(final_header):
            if colname not in df_select:
                print("INSERTING MISSING COLUMN")
                df_select.insert(ii, colname, pd.NA)
                print("SHAPE: ", df_select.shape)

        # save result ----
        csv_result = df_select

        encoded = csv_result.to_csv(index=False).encode()

        save_to_gcfs(encoded, full_dst_path, use_pipe=True)
def execute(self, context):
    df = airtable_to_df(
        self.air_base_id,
        self.air_table_name,
        self.id_name,
        self.rename_fields,
        self.column_prefix,
        self.api_key,
    )

    if self.table_name:
        print(f"Writing table with shape: {df.shape}")
        write_table(df, self.table_name)

    if self.gcs_path:
        clean_gcs_path = re.sub(r"\/+$", "", self.gcs_path)
        gcs_file = (
            f"{clean_gcs_path}/{context['execution_date']}/{self.table_name}.csv"
        )
        print(f"Uploading to gcs at {gcs_file}")
        save_to_gcfs(df.to_csv(index=False).encode(), f"{gcs_file}", use_pipe=True)
def download_url(url, itp_id, url_number, execution_date):
    """Download a URL as a task item using airflow.

    **kwargs are airflow
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/83.0.4103.97 Safari/537.36"
        )
    }

    if pd.isna(itp_id):
        raise NoFeedError("missing itp_id")

    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        logging.warning(f"No feed found for {url}, {err}")
        raise NoFeedError("error: HTTPError")
    except UnicodeError:
        # this occurs when a misformatted url is given
        raise NoFeedError("error: UnicodeError")
    except Exception as e:
        raise NoFeedError(f"error: {e}")

    try:
        z = zipfile.ZipFile(io.BytesIO(r.content))

        # replace here with s3fs
        rel_path = Path(f"{execution_date}/{int(itp_id)}_{int(url_number)}")
        src_path = SRC_DIR / rel_path
        dst_path = DST_DIR / rel_path

        src_path.mkdir(parents=True, exist_ok=True)
        z.extractall(src_path)

        full_path = save_to_gcfs(src_path, dst_path, recursive=True)
    except zipfile.BadZipFile:
        logging.warning(f"failed to zipfile {url}")
        raise NoFeedError("error: BadZipFile")

    return full_path
    }
)

# second entry removed
df2 = pd.DataFrame(
    {
        "calitp_itp_id": [1],
        "calitp_url_number": [0],
        "x": [1],
        "calitp_extracted_at": "2021-01-02",
    }
)

# new first entry, second entry returns
df3 = pd.DataFrame(
    {
        "calitp_itp_id": [1, 1],
        "calitp_url_number": [0, 0],
        "x": [99, 2],
        "calitp_extracted_at": "2021-01-03",
    }
)

for ii, df in enumerate([df1, df2, df3]):
    save_to_gcfs(
        df.to_csv(index=False).encode(),
        f"sandbox/external_table_{ii + 1}.csv",
        use_pipe=True,
    )
def main(execution_date, ti, **kwargs):
    tables = get_table(f"{DATASET}.calitp_included_gtfs_tables", as_df=True)

    # TODO: replace w/ pybigquery pulling schemas directly from tables
    # pull schemas from external table tasks. these tasks only run once, so their
    # xcom data is stored as a prior date.
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]
    # ti.xcom_pull(
    #     dag_id="gtfs_schedule_history", task_ids=tables, include_prior_dates=True
    # )

    # fetch latest feeds that need loading from warehouse ----
    date_string = execution_date.to_date_string()

    tbl_feed = get_table(f"{DATASET}.calitp_feed_updates")
    q_today = tbl_feed.select().where(
        tbl_feed.c.calitp_extracted_at == date_string
    )

    df_latest_updates = (
        pd.read_sql(q_today, q_today.bind)
        .rename(columns=lambda s: s.replace("calitp_", ""))
        .convert_dtypes()
    )

    # this zip needs to be converted to a list in order to be iterated through multiple
    # times in an inner loop per each feed update below. This resolves a regression as
    # described in https://github.com/cal-itp/data-infra/issues/848.
    table_details = list(zip(tables.file_name, tables.is_required, schemas))

    fs = get_fs()
    bucket = get_bucket()

    # load new feeds ----
    print(f"Number of feeds being loaded: {df_latest_updates.shape[0]}")

    ttl_feeds_copied = 0
    feed_tables_process_results = []
    feed_process_results = []
    for k, row in df_latest_updates.iterrows():
        # initialize variable to track whether a parsing error occurred and which tables
        # were loaded so far
        parse_error_encountered_in_this_feed = False

        id_and_url = f"{row['itp_id']}_{row['url_number']}"

        # process and copy over tables into external table folder ----
        for table_file, is_required, colnames in table_details:
            # validation report handled in a separate task, since it is in a subfolder
            # and should be run separately in case the feed is unparseable.
            if table_file == constants.VALIDATION_REPORT:
                continue

            src_path = "/".join(
                ["schedule", str(execution_date), id_and_url, table_file]
            )
            dst_path = "/".join(
                ["schedule", "processed", f"{date_string}_{id_and_url}", table_file]
            )

            print(f"Copying from {src_path} to {dst_path}")

            if not is_required and not fs.exists(f"{bucket}/{src_path}"):
                print(f"Skipping missing optional file: {src_path}")
            else:
                parse_error_encountered = False
                try:
                    _keep_columns(
                        src_path,
                        dst_path,
                        colnames,
                        row["itp_id"],
                        row["url_number"],
                        date_string,
                    )
                except ParserError:
                    print(
                        f"Fatal parsing error encountered in {table_file} for id and "
                        f"URL: {id_and_url}."
                    )
                    parse_error_encountered = True
                    parse_error_encountered_in_this_feed = True

                feed_tables_process_results.append(
                    {
                        "calitp_itp_id": row["itp_id"],
                        "calitp_url_number": row["url_number"],
                        "calitp_extracted_at": execution_date.to_date_string(),
                        "filename": table_file,
                        "parse_error_encountered": parse_error_encountered,
                    }
                )

        # note the parse result for this feed
        feed_process_results.append(
            {
                "calitp_itp_id": row["itp_id"],
                "calitp_url_number": row["url_number"],
                "calitp_extracted_at": execution_date.to_date_string(),
                "parse_error_encountered": parse_error_encountered_in_this_feed,
            }
        )

        ttl_feeds_copied += 1

    print("total feeds copied:", ttl_feeds_copied)

    # save feed and feed table process results to external tables
    save_to_gcfs(
        pd.DataFrame(feed_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_parse_result.csv",
        use_pipe=True,
    )
    save_to_gcfs(
        pd.DataFrame(feed_tables_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_tables_parse_result.csv",
        use_pipe=True,
    )
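
# Illustration only: why table_details above is materialized with list(zip(...)).
# A bare zip object is an iterator that is exhausted after one pass, so reusing it
# in the inner loop for every feed would silently yield nothing after the first
# feed (the regression referenced in the comment). The file names here are made up.
pairs = zip(["agency.txt", "routes.txt"], [True, True])

print(list(pairs))  # [('agency.txt', True), ('routes.txt', True)]
print(list(pairs))  # [] -- the iterator is already consumed

pairs = list(zip(["agency.txt", "routes.txt"], [True, True]))
print(list(pairs))  # a list can be iterated repeatedly
print(list(pairs))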
def _keep_columns(
    src_path,
    dst_path,
    colnames,
    itp_id=None,
    url_number=None,
    extracted_at=None,
    **kwargs,
):
    """Save a CSV file with only the needed columns for a particular table.

    Args:
        src_path (string): Location of the input CSV file
        dst_path (string): Location of the output CSV file
        colnames (list): List of the colnames that should be included in output CSV
            file.
        itp_id (string, optional): itp_id to use when saving record. Defaults to None.
        url_number (string, optional): url_number to use when saving record. Defaults
            to None.
        extracted_at (string, optional): date string of extraction time. Defaults to
            None.

    Raises:
        pandas.errors.ParserError: Can be thrown when the given input file is not a
            valid CSV file. Ex: a single row could have too many columns.
    """
    # Read csv using object dtype, so pandas does not coerce data.
    # The following line of code inside the try block can throw a
    # pandas.errors.ParserError, but the responsibility to catch this error is assumed
    # to be implemented in the code that calls this method.
    try:
        df = pd.read_csv(
            read_gcfs(src_path), dtype="object", encoding_errors="replace", **kwargs
        )
    except EmptyDataError:
        # in the rare case of a totally empty data file, create a DataFrame
        # with no rows, and the target columns
        df = pd.DataFrame({k: [] for k in colnames})

    if itp_id is not None:
        df["calitp_itp_id"] = itp_id

    if url_number is not None:
        df["calitp_url_number"] = url_number

    # get specified columns, inserting NA columns where needed ----
    df_cols = set(df.columns)
    cols_present = [x for x in colnames if x in df_cols]

    df_select = df.loc[:, cols_present]

    # fill in missing columns ----
    print("DataFrame missing columns: ", set(colnames) - set(df_select.columns))

    for ii, colname in enumerate(colnames):
        if colname not in df_select:
            df_select.insert(ii, colname, pd.NA)

    if extracted_at is not None:
        df_select["calitp_extracted_at"] = extracted_at

    # save result ----
    csv_result = df_select.to_csv(index=False).encode()

    save_to_gcfs(csv_result, dst_path, use_pipe=True)
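
# Illustration only: the column-selection and fill behavior of _keep_columns,
# reproduced on an in-memory CSV instead of GCS so it can be run locally.
# The column names below are made up for this example.
import io

import pandas as pd

raw = io.StringIO("route_id,unexpected_col\n1,foo\n2,bar\n")
colnames = ["route_id", "agency_id"]  # agency_id is missing from the file

df = pd.read_csv(raw, dtype="object")
cols_present = [x for x in colnames if x in set(df.columns)]

df_select = df.loc[:, cols_present]
for ii, colname in enumerate(colnames):
    if colname not in df_select:
        df_select.insert(ii, colname, pd.NA)

print(df_select)  # unexpected_col is dropped, agency_id is filled with <NA>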