Example #1
def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"
    successes = get_successfully_downloaded_feeds(execution_date)

    # hold on to notices, so we can infer schema after
    # note that I've commented out the code for inferring schema below,
    # but it was useful for generating, then hand-tweaking to load
    # into bigquery
    # notice_entries = []
    for k, row in successes.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_extracted_at"] = execution_date.to_date_string()
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        # coerce types labeled "string" to a string
        coerce_notice_values_to_str(raw_codes, COERCE_TO_STRING)

        json_codes = json.dumps(raw_codes).encode()
        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
Example #2
    def execute(self, context):
        # use the DAG's logical date as the data interval start,
        # and ensure the 'start' hour is 0 no matter what the 'schedule_interval' is.
        start_datetime = context.get("execution_date").set(hour=0)

        # add 23 hours to the start date to make the total range equal to 24 hours.
        # (the 'end' parameter is inclusive: https://developers.amplitude.com/docs/export-api#export-api-parameters)
        start = start_datetime.strftime(DATE_FORMAT)
        end = (start_datetime + timedelta(hours=23)).strftime(DATE_FORMAT)

        events_df = amplitude_to_df(
            start,
            end,
            api_key_env=self.api_key_env,
            secret_key_env=self.secret_key_env,
            rename_fields=self.rename_fields,
        )

        events_jsonl = events_df.to_json(orient="records",
                                         lines=True,
                                         date_format="iso")
        gcs_file_path = f"{self.app_name}/{start}-{end}.jsonl"

        bucket_name = ("ingest_amplitude_raw_dev"
                       if is_development() else "ingest_amplitude_raw_prod")

        # if a file already exists at `gcs_file_path`, GCS will overwrite the existing file
        calitp.save_to_gcfs(events_jsonl.encode(),
                            gcs_file_path,
                            bucket=bucket_name,
                            use_pipe=True)
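The 23-hour offset can look odd at first glance, so here is a minimal standalone sketch of the same window arithmetic, assuming a DATE_FORMAT along the lines of "%Y%m%dT%H" (the actual constant is defined elsewhere in the module):

from datetime import datetime, timedelta

DATE_FORMAT = "%Y%m%dT%H"  # assumed format, for illustration only

start_datetime = datetime(2021, 1, 1, 0)  # logical date with hour forced to 0
start = start_datetime.strftime(DATE_FORMAT)                        # "20210101T00"
end = (start_datetime + timedelta(hours=23)).strftime(DATE_FORMAT)  # "20210101T23"

# Because Amplitude's `end` parameter is inclusive, requesting T00 through T23
# covers the full 24-hour day.
print(start, end)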
Example #3
def main(execution_date, **kwargs):
    in_path = f"schedule/{execution_date}"
    print(in_path)

    successes = get_successfully_downloaded_feeds(execution_date)

    agency_errors = []
    loadable_agencies = []
    for ii, row in successes.iterrows():
        path_agency = f"{in_path}/{row['itp_id']}_{row['url_number']}"
        path_validation = f"{path_agency}/{VALIDATION_FILE}"

        print(f"reading validation file: {path_validation}")
        validation = json.load(read_gcfs(path_validation))

        unique_codes = get_notice_codes(validation)

        if ERROR_MISSING_FILE not in unique_codes:
            loadable_agencies.append(path_agency)
        else:
            agency = dict(itp_id=row["itp_id"], url_number=row["url_number"])
            agency_errors.append(agency)

    errors_df = pd.DataFrame(agency_errors)
    errors_str = errors_df.to_csv(index=False).encode()
    save_to_gcfs(errors_str,
                 f"{in_path}/processed/agency_load_errors.csv",
                 use_pipe=True)

    return loadable_agencies
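get_notice_codes is defined elsewhere in the repo; purely as a hypothetical sketch, and assuming the validator report lists notices as dicts carrying a "code" field, it might look roughly like this:

def get_notice_codes(validation):
    # Hypothetical sketch only -- the real helper lives elsewhere and the exact
    # report structure may differ.
    return {notice["code"] for notice in validation["data"]["report"]["notices"]}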
Example #4
def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")

    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (
        success[["itp_id", "url_number"]]
        .assign(gtfs_file=gtfs_files)
        .explode("gtfs_file")
        .loc[lambda d: d.gtfs_file != "processed"]
    )

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
Example #5
def main(execution_date, **kwargs):
    fs = get_fs()
    bucket = get_bucket()

    successes = get_successfully_downloaded_feeds(execution_date)

    gtfs_file = []
    for ii, row in successes.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        agency_url = f"{bucket}/schedule/{execution_date}/{agency_folder}"

        dir_files = [x for x in fs.listdir(agency_url) if x["type"] == "file"]

        for x in dir_files:
            gtfs_file.append(
                {
                    "calitp_itp_id": row["itp_id"],
                    "calitp_url_number": row["url_number"],
                    "calitp_extracted_at": execution_date.to_date_string(),
                    "full_path": x["name"],
                    "name": x["name"].split("/")[-1],
                    "size": x["size"],
                    "md5_hash": x["md5Hash"],
                }
            )

    res = pd.DataFrame(gtfs_file)

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
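The loop above relies on the GCS filesystem object returning one metadata dict per file from fs.listdir; a sketch of the shape it assumes, with illustrative values:

entry = {
    "name": "my-bucket/schedule/2021-01-01T00:00:00+00:00/200_0/agency.txt",
    "type": "file",
    "size": 1024,
    "md5Hash": "1B2M2Y8AsgTpgAmY7PhCfg==",
}
print(entry["name"].split("/")[-1])  # "agency.txt" -- stored as the short "name"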
Example #6
def gen_list(execution_date, **kwargs):
    """
    Task callable to generate the feeds list and push it into XCom.
    """

    # get a table of feed urls from agencies.yml
    # we save both the raw version and the version filled in with API keys
    filled_agencies_file = ("data/agencies.filled.yml"
                            if is_development() else "data/agencies.yml")
    feeds_raw = make_gtfs_list(pipe_file_name("data/agencies_raw.yml"))
    feeds = make_gtfs_list(pipe_file_name(filled_agencies_file))

    path_metadata = f"schedule/{execution_date}/metadata"

    save_to_gcfs(
        feeds_raw.to_csv(index=False).encode(),
        f"{path_metadata}/feeds_raw.csv",
        use_pipe=True,
    )
    save_to_gcfs(feeds.to_csv(index=False).encode(),
                 f"{path_metadata}/feeds.csv",
                 use_pipe=True)

    # note that right now we use airflow's xcom functionality in this dag.
    # because xcom can only store a small amount of data, we have to drop some
    # columns. this is the only dag that uses xcom, and we should remove it!
    df_subset = feeds.drop(columns=[
        "gtfs_rt_vehicle_positions_url",
        "gtfs_rt_service_alerts_url",
        "gtfs_rt_trip_updates_url",
    ])

    return df_subset.to_dict("records")
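Downstream tasks read these records back out of XCom; a minimal sketch, assuming gen_list runs under the task id "generate_provider_list" used by the downloader example further below:

def downstream_task(task_instance, **kwargs):
    # the task id here is an assumption taken from the downloader example below
    provider_set = task_instance.xcom_pull(task_ids="generate_provider_list")
    for row in provider_set:
        print(row["itp_id"], row["url_number"], row["gtfs_schedule_url"])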
Example #7
def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"

    status = pd.read_csv(read_gcfs(f"{base_path}/status.csv"))
    success = status[lambda d: d.status == "success"]

    # hold on to notices, so we can infer schema after
    # note that I've commented out the code for inferring schema below,
    # but it was useful for generating, then hand-tweaking to load
    # into bigquery
    # notice_entries = []
    for k, row in success.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        json_codes = json.dumps(raw_codes).encode()
        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
Example #8
def downloader(task_instance, execution_date, **kwargs):
    """Download gtfs data from agency urls

    Returns dict of form {gtfs_paths, errors}
    """

    provider_set = task_instance.xcom_pull(task_ids="generate_provider_list")
    url_status = []

    gtfs_paths = []
    for row in provider_set:
        print(row)
        try:
            res_path = download_url(
                row["gtfs_schedule_url"],
                row["itp_id"],
                row["url_number"],
                execution_date,
            )
            gtfs_paths.append(res_path)

            status = "success"
        except NoFeedError as e:
            logging.warning(f"error downloading agency {row['agency_name']}")
            logging.info(e)

            status = str(e)

        url_status.append(status)

    df_status = pd.DataFrame(provider_set).assign(status=url_status)

    src_path = Path(SRC_DIR) / f"{execution_date}/status.csv"
    dst_path = Path(DST_DIR) / f"{execution_date}/status.csv"

    df_status.convert_dtypes().to_csv(src_path, index=False)
    save_to_gcfs(src_path, dst_path)

    df_errors = df_status[lambda d: d.status != "success"]
    error_agencies = df_errors[["agency_name", "gtfs_schedule_url", "status"]]
    error_records = error_agencies.to_dict(orient="records")

    logging.info(f"error agencies: {error_agencies.agency_name.tolist()}")

    return {"gtfs_paths": gtfs_paths, "errors": error_records}
Example #9
def glob_daily_files(date_string, fs, logger):
    # TODO: remove hard-coded project string

    # <datetime>/<itp_id>/<url_number>/<filename>
    path_to_glob = f"{get_bucket()}/rt/{date_string}*/*/*/*"
    logger.info("Globbing {}".format(path_to_glob))
    all_files = fs.glob(path_to_glob, detail=True)
    logger.info("Finished globbing")
    fs.dircache.clear()

    raw_res = pd.DataFrame(all_files.values())

    if not len(raw_res):
        logger.info("No data for this date")

        return

    # ----
    # Do some light pre-processing to add internal data

    ser_id = raw_res["name"].str.split("/")

    raw_res["calitp_itp_id"] = ser_id.str.get(-3).astype(int)
    raw_res["calitp_url_number"] = ser_id.str.get(-2).astype(int)
    raw_res["calitp_extracted_at"] = ser_id.str.get(-4)

    raw_res["full_path"] = raw_res["name"]
    raw_res["name"] = ser_id.str.get(-1)
    raw_res["md5_hash"] = raw_res["md5Hash"]

    res = raw_res[[
        "calitp_itp_id",
        "calitp_url_number",
        "calitp_extracted_at",
        "name",
        "size",
        "md5_hash",
    ]]
    output_path = f"rt-processed/calitp_files/{date_string}.csv"
    save_to_gcfs(
        res.to_csv(index=False).encode(),
        output_path,
        use_pipe=True,
    )
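A worked illustration of the split-based parsing above, using a hypothetical blob path that follows the <datetime>/<itp_id>/<url_number>/<filename> layout:

name = "my-bucket/rt/2021-01-01T00:00:00+00:00/200/0/vehicle_positions"
parts = name.split("/")

itp_id = int(parts[-3])      # 200
url_number = int(parts[-2])  # 0
extracted_at = parts[-4]     # "2021-01-01T00:00:00+00:00"
filename = parts[-1]         # "vehicle_positions"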
Example #10
def _keep_columns(gcs_dirs,
                  dst_dir,
                  filename,
                  required_cols,
                  optional_cols,
                  prepend_ids=True):
    for path in gcs_dirs:
        full_src_path = f"{path}/{filename}"
        full_dst_path = f"{path}/{dst_dir}/{filename}"

        final_header = [*required_cols, *optional_cols]

        # read csv using object dtype, so pandas does not coerce data
        df = pd.read_csv(read_gcfs(full_src_path), dtype="object")

        # preprocess data to include cal-itp id columns ---
        # column names: calitp_itp_id, calitp_url_number
        if prepend_ids:
            # hacky, but parse /path/.../{itp_id}/{url_number}
            basename = path.split("/")[-1]
            itp_id, url_number = map(int, basename.split("_"))

            df = df.assign(calitp_itp_id=itp_id, calitp_url_number=url_number)

        # get specified columns, inserting NA columns where needed ----
        df_cols = set(df.columns)
        opt_cols_present = [x for x in optional_cols if x in df_cols]

        df_select = df[[*required_cols, *opt_cols_present]]

        # fill in missing columns ----
        for ii, colname in enumerate(final_header):
            if colname not in df_select:
                print("INSERTING MISSING COLUMN")
                df_select.insert(ii, colname, pd.NA)
            print("SHAPE: ", df_select.shape)

        # save result ----
        csv_result = df_select

        encoded = csv_result.to_csv(index=False).encode()
        save_to_gcfs(encoded, full_dst_path, use_pipe=True)
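The column back-filling pattern above can be seen in isolation in this small sketch (the column names are made up):

import pandas as pd

required_cols = ["a", "b"]
optional_cols = ["c"]
final_header = [*required_cols, *optional_cols]

df_select = pd.DataFrame({"a": ["1"], "b": ["2"]})  # "c" is missing from the source

for ii, colname in enumerate(final_header):
    if colname not in df_select:
        df_select.insert(ii, colname, pd.NA)

print(df_select.columns.tolist())  # ['a', 'b', 'c']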
Example #11
    def execute(self, context):
        df = airtable_to_df(
            self.air_base_id,
            self.air_table_name,
            self.id_name,
            self.rename_fields,
            self.column_prefix,
            self.api_key,
        )

        if self.table_name:
            print(f"Writing table with shape: {df.shape}")
            write_table(df, self.table_name)

        if self.gcs_path:
            clean_gcs_path = re.sub(r"\/+$", "", self.gcs_path)
            gcs_file = (
                f"{clean_gcs_path}/{context['execution_date']}/{self.table_name}.csv"
            )
            print(f"Uploading to gcs at {gcs_file}")
            save_to_gcfs(df.to_csv(index=False).encode(), f"{gcs_file}", use_pipe=True)
Example #12
def download_url(url, itp_id, url_number, execution_date):
    """
    Download a URL as a task item
    using airflow. **kwargs are airflow
    """

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/83.0.4103.97 Safari/537.36"
        )
    }
    if pd.isna(itp_id):
        raise NoFeedError("missing itp_id")

    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        logging.warning(f"No feed found for {url}, {err}")
        raise NoFeedError("error: HTTPError")
    except UnicodeError:
        # this occurs when a misformatted url is given
        raise NoFeedError("error: UnicodeError")
    except Exception as e:
        raise NoFeedError(f"error: {e}")

    try:
        z = zipfile.ZipFile(io.BytesIO(r.content))
        # replace here with s3fs
        rel_path = Path(f"{execution_date}/{int(itp_id)}_{int(url_number)}")
        src_path = SRC_DIR / rel_path
        dst_path = DST_DIR / rel_path

        src_path.mkdir(parents=True, exist_ok=True)
        z.extractall(src_path)

        full_path = save_to_gcfs(src_path, dst_path, recursive=True)
    except zipfile.BadZipFile:
        logging.warning(f"failed to zipfile {url}")
        raise NoFeedError("error: BadZipFile")

    return full_path
Example #13
# first snapshot: two entries (values inferred from the comments that follow)
df1 = pd.DataFrame(
    {
        "calitp_itp_id": [1, 1],
        "calitp_url_number": [0, 0],
        "x": [1, 2],
        "calitp_extracted_at": "2021-01-01",
    }
)

# second entry removed
df2 = pd.DataFrame(
    {
        "calitp_itp_id": [1],
        "calitp_url_number": [0],
        "x": [1],
        "calitp_extracted_at": "2021-01-02",
    }
)


# new first entry, second entry returns
df3 = pd.DataFrame(
    {
        "calitp_itp_id": [1, 1],
        "calitp_url_number": [0, 0],
        "x": [99, 2],
        "calitp_extracted_at": "2021-01-03",
    }
)

for ii, df in enumerate([df1, df2, df3]):
    save_to_gcfs(
        df.to_csv(index=False).encode(),
        f"sandbox/external_table_{ii + 1}.csv",
        use_pipe=True,
    )
Example #14
def main(execution_date, ti, **kwargs):
    tables = get_table(f"{DATASET}.calitp_included_gtfs_tables", as_df=True)

    # TODO: replace w/ pybigquery pulling schemas directly from tables
    # pull schemas from external table tasks. these tasks only run once, so their
    # xcom data is stored as a prior date.
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]
    # ti.xcom_pull(
    #     dag_id="gtfs_schedule_history", task_ids=tables, include_prior_dates=True
    # )

    # fetch latest feeds that need loading  from warehouse ----
    date_string = execution_date.to_date_string()

    tbl_feed = get_table(f"{DATASET}.calitp_feed_updates")
    q_today = tbl_feed.select().where(tbl_feed.c.calitp_extracted_at == date_string)

    df_latest_updates = (
        pd.read_sql(q_today, q_today.bind)
        .rename(columns=lambda s: s.replace("calitp_", ""))
        .convert_dtypes()
    )

    # this zip needs to be converted to a list in order to be iterated through multiple
    # times in an inner loop per each feed update below. This resolves a regression as
    # described in https://github.com/cal-itp/data-infra/issues/848.
    table_details = list(zip(tables.file_name, tables.is_required, schemas))
    fs = get_fs()
    bucket = get_bucket()

    # load new feeds ----
    print(f"Number of feeds being loaded: {df_latest_updates.shape[0]}")

    ttl_feeds_copied = 0
    feed_tables_process_results = []
    feed_process_results = []
    for k, row in df_latest_updates.iterrows():
        # initialize variable to track whether a parsing error occurred and which tables
        # were loaded so far
        parse_error_encountered_in_this_feed = False
        id_and_url = f"{row['itp_id']}_{row['url_number']}"

        # process and copy over tables into external table folder ----
        for table_file, is_required, colnames in table_details:
            # validation report handled in a separate task, since it is in a subfolder
            # and should be run separately in case the feed is unparseable.
            if table_file == constants.VALIDATION_REPORT:
                continue

            src_path = "/".join(
                ["schedule",
                 str(execution_date), id_and_url, table_file])
            dst_path = "/".join([
                "schedule", "processed", f"{date_string}_{id_and_url}",
                table_file
            ])

            print(f"Copying from {src_path} to {dst_path}")

            if not is_required and not fs.exists(f"{bucket}/{src_path}"):
                print(f"Skipping missing optional file: {src_path}")
            else:
                parse_error_encountered = False
                try:
                    _keep_columns(
                        src_path,
                        dst_path,
                        colnames,
                        row["itp_id"],
                        row["url_number"],
                        date_string,
                    )
                except ParserError:
                    print(
                        f"Fatal parsing error encountered in {table_file} for id and "
                        f"URL: {id_and_url}."
                    )
                    parse_error_encountered = True
                    parse_error_encountered_in_this_feed = True

                feed_tables_process_results.append(
                    {
                        "calitp_itp_id": row["itp_id"],
                        "calitp_url_number": row["url_number"],
                        "calitp_extracted_at": execution_date.to_date_string(),
                        "filename": table_file,
                        "parse_error_encountered": parse_error_encountered,
                    }
                )

        # note the parse result for this feed
        feed_process_results.append(
            {
                "calitp_itp_id": row["itp_id"],
                "calitp_url_number": row["url_number"],
                "calitp_extracted_at": execution_date.to_date_string(),
                "parse_error_encountered": parse_error_encountered_in_this_feed,
            }
        )

        ttl_feeds_copied += 1

        print("total feeds copied:", ttl_feeds_copied)

    # save feed and feed table process results to external tables
    save_to_gcfs(
        pd.DataFrame(feed_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_parse_result.csv",
        use_pipe=True,
    )
    save_to_gcfs(
        pd.DataFrame(feed_tables_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_tables_parse_result.csv",
        use_pipe=True,
    )
Example #15
def _keep_columns(
    src_path,
    dst_path,
    colnames,
    itp_id=None,
    url_number=None,
    extracted_at=None,
    **kwargs,
):
    """Save a CSV file with only the needed columns for a particular table.

    Args:
        src_path (string): Location of the input CSV file
        dst_path (string): Location of the output CSV file
        colnames (list): List of the colnames that should be included in output CSV
            file.
        itp_id (string, optional): itp_id to use when saving record. Defaults to None.
        url_number (string, optional): url_number to use when saving record. Defaults to
            None.
        extracted_at (string, optional): date string of extraction time. Defaults to
            None.

    Raises:
        pandas.errors.ParserError: Can be thrown when the given input file is not a
            valid CSV file. Ex: a single row could have too many columns.
    """

    # Read csv using object dtype, so pandas does not coerce data.
    # The following line of code inside the try block can throw a
    # pandas.errors.ParserError, but the responsibility to catch this error is assumed
    # to be implemented in the code that calls this method.
    try:
        df = pd.read_csv(
            read_gcfs(src_path), dtype="object", encoding_errors="replace", **kwargs
        )
    except EmptyDataError:
        # in the rare case of a totally empty data file, create a DataFrame
        # with no rows, and the target columns
        df = pd.DataFrame({k: [] for k in colnames})

    if itp_id is not None:
        df["calitp_itp_id"] = itp_id

    if url_number is not None:
        df["calitp_url_number"] = url_number

    # get specified columns, inserting NA columns where needed ----
    df_cols = set(df.columns)
    cols_present = [x for x in colnames if x in df_cols]

    df_select = df.loc[:, cols_present]

    # fill in missing columns ----
    print("DataFrame missing columns: ", set(df_select.columns) - set(colnames))

    for ii, colname in enumerate(colnames):
        if colname not in df_select:
            df_select.insert(ii, colname, pd.NA)

    if extracted_at is not None:
        df_select["calitp_extracted_at"] = extracted_at

    # save result ----
    csv_result = df_select.to_csv(index=False).encode()

    save_to_gcfs(csv_result, dst_path, use_pipe=True)
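For reference, a hedged usage sketch of _keep_columns that mirrors the call site in the schedule-loading task above; the paths and column names here are purely illustrative:

_keep_columns(
    src_path="schedule/2021-01-01T00:00:00+00:00/200_0/agency.txt",
    dst_path="schedule/processed/2021-01-01_200_0/agency.txt",
    colnames=["agency_id", "agency_name", "agency_url", "agency_timezone"],
    itp_id="200",
    url_number="0",
    extracted_at="2021-01-01",
)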