Example #1
    def execute(self, context):
        # pull the configured Airtable table into a pandas DataFrame
        df = airtable_to_df(
            self.air_base_id,
            self.air_table_name,
            self.id_name,
            self.rename_fields,
            self.column_prefix,
            self.api_key,
        )

        if self.table_name:
            print(f"Writing table with shape: {df.shape}")
            write_table(df, self.table_name)

        if self.gcs_path:
            # strip trailing slashes before building the target path
            clean_gcs_path = re.sub(r"/+$", "", self.gcs_path)
            gcs_file = (
                f"{clean_gcs_path}/{context['execution_date']}/{self.table_name}.csv"
            )
            print(f"Uploading to gcs at {gcs_file}")
            save_to_gcfs(df.to_csv(index=False).encode(), gcs_file, use_pipe=True)
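This `execute` method is a fragment of a custom Airflow operator, so `re`, `airtable_to_df`, `write_table`, and `save_to_gcfs` are imported at module level. The path handling is easy to sanity-check on its own; a minimal, runnable sketch in which the bucket, table name, and execution date are made-up stand-ins for the operator's attributes and Airflow's context:

import re
from datetime import date

# stand-ins for self.gcs_path, context["execution_date"], and self.table_name
gcs_path = "gs://example-bucket/airtable///"
execution_date = date(2021, 1, 1)
table_name = "sandbox.airtable_example"

clean_gcs_path = re.sub(r"/+$", "", gcs_path)  # drop any trailing slashes
gcs_file = f"{clean_gcs_path}/{execution_date}/{table_name}.csv"
print(gcs_file)  # gs://example-bucket/airtable/2021-01-01/sandbox.airtable_example.csv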
Example #2
import json
from collections import defaultdict

import pandas as pd

# import path assumed -- get_bucket, get_fs, and write_table are the calitp
# helpers this function relies on
from calitp import get_bucket, get_fs, write_table


def validation_notice_fields():
    bucket = get_bucket()

    print(f"Globbing: {bucket}/schedule/processed/*/validation_report.json")

    fs = get_fs()
    reports = fs.glob(f"{bucket}/schedule/processed/*/validation_report.json")

    code_fields = defaultdict(set)

    print(f"Iterating through {len(reports)} reports")
    for fname in reports:
        with fs.open(fname) as f:
            report = json.load(f)
        # one entry per code (e.g. the code: invalid phone number)
        for notice in report["notices"]:
            # one entry per specific code violation (e.g. each invalid phone number)
            for entry in notice["notices"]:
                # map each code to the fields in its notice
                # (e.g. duplicate_route_name has a duplicatedField field)
                for field_name, value in entry.items():
                    if isinstance(value, dict):
                        # handle the few cases where there's one level of nesting
                        sub_fields = [field_name + "." + v for v in value]
                        code_fields[notice["code"]].update(sub_fields)
                    else:
                        # handle the common case of no sub-objects
                        code_fields[notice["code"]].update(entry.keys())

    validation_json_fields = pd.DataFrame(
        {
            "code": code_fields.keys(),
            "field": list(map(list, code_fields.values())),
        }
    ).explode("field")

    write_table(
        validation_json_fields, "gtfs_schedule_history.validation_notice_fields"
    )
Example #3
# ---
# operator: operators.PythonToWarehouseOperator
# table_name: "gtfs_rt.validation_code_descriptions"
# fields:
#   code: RT Validation error code name
#   description: A description of the validation error
#   is_critical: Whether this error is considered a Cal-ITP critical error
# ---

import pandas as pd
from calitp import write_table, to_snakecase

sheet_url = (
    "https://docs.google.com/spreadsheets"
    "/d/1GDDaDlsBPCYn3dtYPSABnce9ns3ekJ8Jzfgyy56lZz4/export?gid=617612870&format=csv"
)

code_descriptions = pd.read_csv(sheet_url).pipe(to_snakecase)

write_table(code_descriptions, "gtfs_rt.validation_code_descriptions")
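`to_snakecase` normalizes the sheet's human-readable headers into warehouse-safe column names (the frontmatter fields `code`, `description`, and `is_critical` are the post-conversion names). A minimal sketch of the same pipeline run against a local CSV instead of the live sheet; the rows are made up:

import io

import pandas as pd
from calitp import to_snakecase

# stand-in for the Google Sheets CSV export, with made-up contents
csv = io.StringIO(
    "Code,Description,Is Critical\n"
    "E001,Vehicle position outside agency bounds,True\n"
)

code_descriptions = pd.read_csv(csv).pipe(to_snakecase)
# snake_case headers, e.g. ['code', 'description', 'is_critical']
print(code_descriptions.columns.tolist())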
Example #4
import pandas as pd
from calitp import get_engine, write_table

from testing import Tester

COLNAMES = ["x", "y"]

df_has_null = pd.DataFrame([(1, None), (2, "b")], columns=COLNAMES)

df_not_uniq = pd.DataFrame([(1, "a"), (1, "b")], columns=COLNAMES)

df_not_composite_uniq = pd.DataFrame([(1, "a"), (2, "b"), (1, "a")], columns=COLNAMES)

engine = get_engine()

write_table(df_has_null, "sandbox.testing_has_null")
write_table(df_not_uniq, "sandbox.testing_not_uniq")
write_table(df_not_composite_uniq, "sandbox.testing_not_composite_uniq")

# FAIL: nulls
tester = Tester.from_tests(
    engine, "sandbox.testing_has_null", {"check_null": ["x", "y"]}
)
print(tester.get_test_results())
assert not tester.all_passed()

# PASS: no nulls
tester = Tester.from_tests(
    engine, "sandbox.testing_not_uniq", {"check_null": ["x", "y"]}
)
print(tester.get_test_results())
assert tester.all_passed()
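Each fixture table encodes one failure mode. A quick pandas-only sketch (no warehouse needed) of why each frame passes or fails the corresponding check:

import pandas as pd

COLNAMES = ["x", "y"]
df_has_null = pd.DataFrame([(1, None), (2, "b")], columns=COLNAMES)
df_not_uniq = pd.DataFrame([(1, "a"), (1, "b")], columns=COLNAMES)
df_not_composite_uniq = pd.DataFrame([(1, "a"), (2, "b"), (1, "a")], columns=COLNAMES)

print(df_has_null["y"].isna().any())  # True -> check_null fails
print(df_not_uniq["x"].is_unique)  # False -> a single-column uniqueness check fails
print(df_not_composite_uniq.duplicated(subset=COLNAMES).any())  # True -> (x, y) not unique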
Example #5
# ---
# operator: operators.PythonToWarehouseOperator
# table_name: "gtfs_schedule_history.validation_code_descriptions"
# fields:
#   severity: Severity of the error code (e.g. validation_codes.severity)
#   code: Code name (e.g. validation_codes.code)
# ---

import pandas as pd
from calitp import write_table, to_snakecase

sheet_url = (
    "https://docs.google.com/spreadsheets"
    "/d/1GDDaDlsBPCYn3dtYPSABnce9ns3ekJ8Jzfgyy56lZz4/export?gid=0&format=csv"
)

code_descriptions = (
    pd.read_csv(sheet_url)
    .pipe(to_snakecase)
    .rename(columns={"type": "severity", "name": "code"})
)

code_descriptions["code"] = (code_descriptions.code.str.replace(
    r"(?<!^)(?=[A-Z])", "_").str.lower().str.replace("_notice$", "").replace({
        "i_o_error":
        "io_error",
        "u_r_i_syntax_error":
        "uri_syntax_error"
    }))

write_table(code_descriptions, "gtfs_schedule_history.validation_code_descriptions")
Example #6
# ---
# operator: operators.PythonToWarehouseOperator
# table_name: "sandbox.python_to_warehouse"
# fields:
#   g: The g field python
#   x: The x field python
# doc_md: |
#   This is an example of the PythonOperator.
#
# dependencies:
#   - create_dataset
# ---

import pandas as pd
from calitp import write_table

df = pd.DataFrame({"g": ["a", "b"], "x": [1, 2]})

write_table(df, "sandbox.python_to_warehouse")
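To confirm the load, the table can be read back through the same engine the testing example uses; a minimal sketch, assuming warehouse credentials are configured:

import pandas as pd
from calitp import get_engine

engine = get_engine()
print(pd.read_sql("SELECT * FROM sandbox.python_to_warehouse", engine))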