Example #1
def main(execution_date, **kwargs):
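    # build one metadata row (path, name, size, md5) per file in each successfully
    # downloaded feed's folder, then save the listing as processed/files.csv in GCS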
    fs = get_fs()
    bucket = get_bucket()

    successes = get_successfully_downloaded_feeds(execution_date)

    gtfs_file = []
    for ii, row in successes.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        agency_url = f"{bucket}/schedule/{execution_date}/{agency_folder}"

        # keep only files (skip any subdirectories) in this agency's folder
        dir_files = [x for x in fs.listdir(agency_url) if x["type"] == "file"]

        for x in dir_files:
            gtfs_file.append(
                {
                    "calitp_itp_id": row["itp_id"],
                    "calitp_url_number": row["url_number"],
                    "calitp_extracted_at": execution_date.to_date_string(),
                    "full_path": x["name"],
                    "name": x["name"].split("/")[-1],
                    "size": x["size"],
                    "md5_hash": x["md5Hash"],
                }
            )

    res = pd.DataFrame(gtfs_file)

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
Example #2
    def schedule_path(self):
        return os.path.join(
            get_bucket(),
            "schedule",
            # we timestamp with Airflow _execution date_ currently which is fun
            str(self.tick.subtract(days=1).replace(hour=0, minute=0, second=0)),
            f"{self.itp_id}_{self.url}",
        )
Example #3
def validation_notice_fields():
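    # glob every processed validation_report.json, map each notice code to the
    # field names that appear in its notices, and write the result to the warehouse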
    bucket = get_bucket()

    print(f"Globbing: {bucket}/schedule/processed/*/validation_report.json")

    fs = get_fs()
    reports = fs.glob(f"{bucket}/schedule/processed/*/validation_report.json")

    code_fields = defaultdict(lambda: set())

    print(f"Iterating through {len(reports)} reports")
    for fname in reports:
        report = json.load(fs.open(fname))
        # one entry per code (e.g. the code: invalid phone number)
        for notice in report["notices"]:
            # one entry per specific code violation (e.g. each invalid phone number)
            for entry in notice["notices"]:
                # map each code to the fields in its notice
                # (e.g. duplicate_route_name has a duplicatedField field)
                for field_name, value in entry.items():
                    if isinstance(value, dict):
                        # handle the few cases where there's one level of nesting
                        sub_fields = [field_name + "." + v for v in value]
                        code_fields[notice["code"]].update(sub_fields)
                    else:
                        # handle the common case of no sub-objects
                        code_fields[notice["code"]].update(entry.keys())

    validation_json_fields = pd.DataFrame({
        "code": code_fields.keys(),
        "field": list(map(list, code_fields.values())),
    }).explode("field")

    write_table(validation_json_fields,
                "gtfs_schedule_history.validation_notice_fields")
Example #4
def main(execution_date, ti, **kwargs):
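    # copy each successfully downloaded feed's processed validation report into the
    # date-stamped processed/ folder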
    fs = get_fs()
    bucket = get_bucket()
    successes = get_successfully_downloaded_feeds(execution_date)

    ttl_feeds_copied = 0
    for k, row in successes.iterrows():
        date_string = execution_date.to_date_string()

        # only handle today's updated data (backfill dag to run all) ----

        # copy processed validator results ----
        id_and_url = f"{row['itp_id']}_{row['url_number']}"
        src_validator = "/".join([
            bucket,
            "schedule",
            str(execution_date),
            id_and_url,
            "processed",
            constants.VALIDATION_REPORT,
        ])
        dst_validator = "/".join([
            bucket,
            "schedule",
            "processed",
            f"{date_string}_{id_and_url}",
            constants.VALIDATION_REPORT,
        ])

        print(f"Copying from {src_validator} to {dst_validator}")

        fs.copy(src_validator, dst_validator)

        ttl_feeds_copied += 1

    print("total feeds copied:", ttl_feeds_copied)
def main(execution_date, ti, **kwargs):
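    # for each of the latest feed updates, copy its GTFS table files into the
    # processed folder via _keep_columns and record per-table and per-feed parse results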
    tables = get_table(f"{DATASET}.calitp_included_gtfs_tables", as_df=True)

    # TODO: replace w/ pybigquery pulling schemas directly from tables
    # pull schemas from external table tasks. these tasks only run once, so their
    # xcom data is stored as a prior date.
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]
    # ti.xcom_pull(
    #     dag_id="gtfs_schedule_history", task_ids=tables, include_prior_dates=True
    # )

    # fetch latest feeds that need loading  from warehouse ----
    date_string = execution_date.to_date_string()

    tbl_feed = get_table(f"{DATASET}.calitp_feed_updates")
    q_today = tbl_feed.select().where(
        tbl_feed.c.calitp_extracted_at == date_string)

    df_latest_updates = (
        pd.read_sql(q_today, q_today.bind)
        .rename(columns=lambda s: s.replace("calitp_", ""))
        .convert_dtypes()
    )

    # this zip needs to be converted to a list in order to be iterated through multiple
    # times in an inner loop per each feed update below. This resolves a regression as
    # described in https://github.com/cal-itp/data-infra/issues/848.
    table_details = list(zip(tables.file_name, tables.is_required, schemas))
    fs = get_fs()
    bucket = get_bucket()

    # load new feeds ----
    print(f"Number of feeds being loaded: {df_latest_updates.shape[0]}")

    ttl_feeds_copied = 0
    feed_tables_process_results = []
    feed_process_results = []
    for k, row in df_latest_updates.iterrows():
        # track whether a parsing error occurred anywhere in this feed
        parse_error_encountered_in_this_feed = False
        id_and_url = f"{row['itp_id']}_{row['url_number']}"

        # process and copy over tables into external table folder ----
        for table_file, is_required, colnames in table_details:
            # validation report handled in a separate task, since it is in a subfolder
            # and should be run separately in case the feed is unparseable.
            if table_file == constants.VALIDATION_REPORT:
                continue

            src_path = "/".join(
                ["schedule",
                 str(execution_date), id_and_url, table_file])
            dst_path = "/".join([
                "schedule", "processed", f"{date_string}_{id_and_url}",
                table_file
            ])

            print(f"Copying from {src_path} to {dst_path}")

            if not is_required and not fs.exists(f"{bucket}/{src_path}"):
                print(f"Skipping missing optional file: {src_path}")
            else:
                parse_error_encountered = False
                try:
                    _keep_columns(
                        src_path,
                        dst_path,
                        colnames,
                        row["itp_id"],
                        row["url_number"],
                        date_string,
                    )
                except ParserError:
                    print(
                        f"Fatal parsing error encountered in {table_file} for id and "
                        f"URL: {id_and_url}."
                    )
                    parse_error_encountered = True
                    parse_error_encountered_in_this_feed = True

                feed_tables_process_results.append({
                    "calitp_itp_id": row["itp_id"],
                    "calitp_url_number": row["url_number"],
                    "calitp_extracted_at": execution_date.to_date_string(),
                    "filename": table_file,
                    "parse_error_encountered": parse_error_encountered,
                })

        # note the parse result for this feed
        feed_process_results.append({
            "calitp_itp_id": row["itp_id"],
            "calitp_url_number": row["url_number"],
            "calitp_extracted_at": execution_date.to_date_string(),
            "parse_error_encountered": parse_error_encountered_in_this_feed,
        })

        ttl_feeds_copied += 1

        print("total feeds copied:", ttl_feeds_copied)

    # save feed and feed table process results to external tables
    save_to_gcfs(
        pd.DataFrame(feed_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_parse_result.csv",
        use_pipe=True,
    )
    save_to_gcfs(
        pd.DataFrame(feed_tables_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_tables_parse_result.csv",
        use_pipe=True,
    )
Example #6
    def fix_prefix(self, entry):
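        # normalize an entry to "<bucket>/<path>", dropping any leading "gs://" scheme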
        bucket = get_bucket() if not self.bucket else self.bucket
        entry = entry.replace("gs://",
                              "") if entry.startswith("gs://") else entry

        return f"{bucket}/{entry}"