Example #1
    def test_get_sql(self, tmp_path):
        os.makedirs(tmp_path / "telemetry_derived")
        query_file = tmp_path / "telemetry_derived" / "query.sql"

        sql_content = "SELECT 123 "
        query_file.write_text(sql_content)

        assert DryRun(sqlfile=str(query_file)).get_sql() == sql_content
        with pytest.raises(ValueError):
            DryRun(sqlfile="invalid path").get_sql()
Example #2
def prod_schemas_uri():
    """Return URI for the schemas tarball deployed to shared-prod.

    We construct a fake query and send it to the dry run service in order
    to read dataset labels, which contains the commit hash associated
    with the most recent production schemas deploy.
    """
    dryrun = DryRun("telemetry_derived/foo/query.sql", content="SELECT 1")
    build_id = dryrun.get_dataset_labels()["schemas_build_id"]
    commit_hash = build_id.split("_")[-1]
    return f"{MPS_URI}/archive/{commit_hash}.tar.gz"
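A minimal sketch of the label parsing above, using a hypothetical schemas_build_id value (the real value is read from the dataset labels returned by the dry run service):

# Hypothetical label value; the real one is set by the schemas deploy.
build_id = "202001010000_abc123def"
# The commit hash is taken to be the last underscore-separated token.
commit_hash = build_id.split("_")[-1]
assert commit_hash == "abc123def"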
Example #3
    def test_get_referenced_tables(self, tmp_path):
        os.makedirs(tmp_path / "telemetry_derived")
        query_file = tmp_path / "telemetry_derived" / "query.sql"
        query_file.write_text(
            "SELECT * FROM telemetry_derived.clients_daily_v6 "
            "WHERE submission_date = '2020-01-01'"
        )
        dryrun = DryRun(str(query_file))
        response = dryrun.get_referenced_tables()

        assert len(response) == 1
        assert response[0]["datasetId"] == "telemetry_derived"
        assert response[0]["tableId"] == "clients_daily_v6"
Example #4
    def test_view_file_valid(self, tmp_query_path):
        view_file = tmp_query_path / "view.sql"
        view_file.write_text(
            """
            SELECT
            *
            FROM
            `moz-fx-data-shared-prod.telemetry_derived.clients_daily_v6`
        """
        )

        # this view file is only valid with strip_dml flag
        dryrun = DryRun(sqlfile=str(view_file), strip_dml=True)
        assert dryrun.get_error() is Errors.DATE_FILTER_NEEDED
        assert dryrun.is_valid()
Example #5
def generate_derived_dataset_docs(out_dir, project_dir):
    """Generate documentation for derived datasets."""
    project_doc_dir = Path(out_dir) / "mozdata"

    # get a list of all user-facing datasets
    data_sets = [
        item for item in os.listdir(project_dir)
        if os.path.isdir(os.path.join(project_dir, item)) and all(
            name not in item for name in NON_USER_FACING_DATASET_SUFFIXES)
    ]

    for table in data_sets:
        output = []
        source_urls = {}
        with open(project_doc_dir / f"{table}.md", "w") as dataset_doc:
            # Manually set title to prevent Mkdocs from removing
            # underscores and capitalizing file names
            # https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
            dataset_doc.write(f"---\ntitle: {table}\n---\n\n")

            for root, dirs, files in os.walk(Path(project_dir) / table):
                # show views in alphabetical order
                dirs.sort()
                if dirs:
                    continue
                dataset_name = root.split("/")[-1]
                source_urls["Source Directory"] = f"{SOURCE_URL}/{root}"

                metadata = {}
                referenced_tables = []
                if METADATA_FILE in files:
                    source_urls[
                        "Metadata File"] = f"{SOURCE_URL}/{root}/{METADATA_FILE}"
                    with open(os.path.join(root, METADATA_FILE)) as stream:
                        try:
                            metadata = yaml.safe_load(stream)
                        except yaml.YAMLError as error:
                            print(error)
                if VIEW_FILE in files:
                    source_urls[
                        "View Definition"] = f"{SOURCE_URL}/{root}/{VIEW_FILE}"
                    view_file = os.path.join(root, VIEW_FILE)

                    referenced_tables = DryRun(
                        sqlfile=view_file,
                        strip_dml=True).get_referenced_tables()
                file_loader = FileSystemLoader(
                    "bigquery_etl/docs/derived_datasets/templates")
                # Set up a new template environment
                env = Environment(loader=file_loader)
                # Create template with the markdown source text
                template = env.get_template("table.md")

                output = template.render(
                    metadata=metadata,
                    table_name=dataset_name,
                    source_urls=source_urls,
                    referenced_tables=referenced_tables,
                    project_url=f"{SOURCE_URL}/sql",
                )
                dataset_doc.write(output)
Example #6
def validate(project_dirs):
    """Validate UDF docs."""
    is_valid = True

    for project_dir in project_dirs:
        if os.path.isdir(project_dir):
            parsed_routines = read_routine_dir(project_dir)

            for root, dirs, files in os.walk(project_dir):
                if os.path.basename(root) == EXAMPLE_DIR:
                    for file in files:
                        dry_run_sql = sub_local_routines(
                            (Path(root) / file).read_text(),
                            project_dir,
                            parsed_routines,
                        )

                        # store sql in temporary file for dry_run
                        tmp_dir = Path(tempfile.mkdtemp()) / Path(root)
                        tmp_dir.mkdir(parents=True, exist_ok=True)
                        tmp_example_file = tmp_dir / file
                        tmp_example_file.write_text(dry_run_sql)

                        if not DryRun(str(tmp_example_file)).is_valid():
                            is_valid = False

    if not is_valid:
        print("Invalid examples.")
        sys.exit(1)
Example #7
    def test_dry_run_sql_file(self, tmp_path):
        query_file = tmp_path / "query.sql"
        query_file.write_text("SELECT 123")

        dryrun = DryRun(str(query_file))
        response = dryrun.dry_run_result
        assert response["valid"]
Example #8
def validate(project_dirs):
    """Validate UDF docs."""
    # parse UDFs
    parsed_udfs = read_udf_dirs(*project_dirs)
    is_valid = True

    for project_dir in project_dirs:
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                if os.path.basename(root) == EXAMPLE_DIR:
                    for file in files:
                        dry_run_sql = sql_for_dry_run(os.path.join(root, file),
                                                      parsed_udfs, project_dir)

                        # store sql in temporary file for dry_run
                        tmp_dir = Path(tempfile.mkdtemp()) / Path(root)
                        tmp_dir.mkdir(parents=True, exist_ok=True)
                        tmp_example_file = tmp_dir / file
                        tmp_example_file.write_text(dry_run_sql)

                        if not DryRun(str(tmp_example_file)).is_valid():
                            is_valid = False

    if not is_valid:
        print("Invalid examples.")
        sys.exit(1)
Example #9
    def _get_referenced_tables(self):
        """
        Perform a dry_run to get tables the query depends on.

        Queries that reference more than 50 tables will not have a complete
        list of dependencies. See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.referenced_tables
        """
        logging.info(f"Get dependencies for {self.task_name}")

        if self.referenced_tables is None:
            table_names = set()
            query_files = [self.query_file]

            if self.multipart:
                # dry_run all files if query is split into multiple parts
                query_files = glob.glob(self.sql_file_path + "/*.sql")

            for query_file in query_files:
                referenced_tables = DryRun(query_file).get_referenced_tables()

                if len(referenced_tables) >= 50:
                    logging.warning(
                        "Query has 50 or more tables. Queries that reference more "
                        "than 50 tables will not have a complete list of "
                        "dependencies.")

                for t in referenced_tables:
                    table_names.add((t["datasetId"], t["tableId"]))

            # the order of table dependencies changes between requests
            # sort to maintain same order between DAG generation runs
            self.referenced_tables = sorted(table_names)
        return self.referenced_tables
Example #10
def validate_fully_qualified_references(view_file):
    """Check that referenced tables and views are fully qualified."""
    with open(view_file) as f:
        sql = f.read()

    # dry run only returns referenced tables, not views
    referenced_tables = DryRun(str(view_file)).get_referenced_tables()
    for line in sqlparse.format(sql, strip_comments=True).split("\n"):
        for referenced_table in referenced_tables:
            # If a view is referenced in the query then, instead of the view name,
            # the tables referenced in the view definition are returned. Some of
            # these tables are not part of the SQL of the view being validated.
            ref_dataset = referenced_table[1]
            ref_table = referenced_table[2]
            # check for references to standard view names
            ref_view_dataset = ref_dataset.rsplit("_", 1)[0]
            ref_view = re.match(r"^(.*?)(_v\d+)?$", ref_table)[1]
            for ref_dataset, ref_table in [
                (ref_dataset, ref_table),
                (ref_view_dataset, ref_view),
            ]:
                if re.search(fr"(?<!\.)`?\b{ref_dataset}`?\.`?{ref_table}\b",
                             line):
                    print(
                        f"{view_file} ERROR\n"
                        f"{ref_dataset}.{ref_table} missing project ID qualifier."
                    )
                    return False
    return True
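As a rough illustration of the suffix-stripping heuristic above (a sketch with made-up names, not part of the original function):

import re

# Hypothetical referenced table as the dry run might report it.
ref_dataset, ref_table = "telemetry_derived", "clients_daily_v6"

# Derive the names the corresponding user-facing view would use:
# drop the trailing dataset suffix and the "_vN" version suffix.
ref_view_dataset = ref_dataset.rsplit("_", 1)[0]
ref_view = re.match(r"^(.*?)(_v\d+)?$", ref_table)[1]
assert (ref_view_dataset, ref_view) == ("telemetry", "clients_daily")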
Example #11
    def from_query_file(cls, query_file: Path):
        """Create schema from a query file."""
        if not query_file.is_file() or query_file.suffix != ".sql":
            raise Exception(f"{query_file} is not a valid SQL file.")

        schema = DryRun(str(query_file)).get_schema()
        return cls(schema)
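A hedged usage sketch (assuming this classmethod lives on the project's Schema class; the import path and query path are illustrative only):

from pathlib import Path
from bigquery_etl.schema import Schema  # assumed import path

schema = Schema.from_query_file(Path("sql/telemetry_derived/example_v1/query.sql"))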
Example #12
    def test_get_error(self, tmp_query_path):
        view_file = tmp_query_path / "view.sql"

        view_file.write_text(
            """
        CREATE OR REPLACE VIEW
          `moz-fx-data-shared-prod.telemetry.clients_daily`
        AS
        SELECT
        *
        FROM
          `moz-fx-data-shared-prod.telemetry_derived.clients_daily_v6`
        """
        )

        valid_dml_stripped = """
        SELECT
        *
        FROM
          `moz-fx-data-shared-prod.telemetry_derived.clients_daily_v6`
        WHERE submission_date > current_date()
        """

        invalid_dml_stripped = """
        SELECT
        *
        FROM
          `moz-fx-data-shared-prod.telemetry_derived.clients_daily_v6`
        WHERE something
        WHERE submission_date > current_date()
        """

        assert DryRun(sqlfile=str(view_file)).get_error() is Errors.READ_ONLY
        assert (
            DryRun(sqlfile=str(view_file), strip_dml=True).get_error()
            is Errors.DATE_FILTER_NEEDED
        )
        assert (
            DryRun(sqlfile=str(view_file), content=invalid_dml_stripped).get_error()
            is Errors.DATE_FILTER_NEEDED_AND_SYNTAX
        )
        assert (
            DryRun(
                sqlfile=str(view_file), content=valid_dml_stripped, strip_dml=True
            ).get_error()
            is None
        )
Example #13
    def for_table(cls, project, dataset, table, partitioned_by=None):
        """Get the schema for a BigQuery table."""
        query = f"SELECT * FROM {project}.{dataset}.{table}"

        if partitioned_by:
            query += f" WHERE DATE({partitioned_by}) = DATE('2020-01-01')"

        # write query to a temporary file so it can be dry run
        tmp = NamedTemporaryFile()
        with open(tmp.name, "w") as f:
            f.write(query)

        try:
            return cls(DryRun(str(tmp.name)).get_schema())
        except Exception as e:
            print(f"Cannot get schema for {project}.{dataset}.{table}: {e}")
            return cls({"fields": []})
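A hedged usage sketch for the partitioned case (again assuming a Schema class; the project, dataset, and table names are only illustrative):

# The partitioned_by argument adds a WHERE DATE(submission_date) = DATE('2020-01-01')
# filter to the dry-run query so the partition column is constrained.
schema = Schema.for_table(
    "moz-fx-data-shared-prod",
    "telemetry_derived",
    "clients_daily_v6",
    partitioned_by="submission_date",
)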
Example #14
    def test_get_referenced_tables(self, tmp_query_path):
        query_file = tmp_query_path / "query.sql"
        query_file.write_text(
            "SELECT * FROM telemetry_derived.clients_daily_v6 "
            "WHERE submission_date = '2020-01-01'"
        )
        query_dryrun = DryRun(str(query_file)).get_referenced_tables()

        assert len(query_dryrun) == 1
        assert query_dryrun[0]["datasetId"] == "telemetry_derived"
        assert query_dryrun[0]["tableId"] == "clients_daily_v6"

        view_file = tmp_query_path / "view.sql"
        view_file.write_text(
            """
            CREATE OR REPLACE VIEW
            `moz-fx-data-shared-prod.telemetry.clients_daily`
            AS
            SELECT
            *
            FROM
            `moz-fx-data-shared-prod.telemetry_derived.clients_daily_v6`
        """
        )
        view_dryrun = DryRun(str(view_file), strip_dml=True).get_referenced_tables()

        assert len(view_dryrun) == 1
        assert view_dryrun[0]["datasetId"] == "telemetry_derived"
        assert view_dryrun[0]["tableId"] == "clients_daily_v6"

        view_file.write_text(
            """
        SELECT document_id
        FROM mozdata.org_mozilla_firefox.baseline
        WHERE submission_timestamp > current_timestamp()
        UNION ALL
        SELECT document_id
        FROM mozdata.org_mozilla_fenix.baseline
        WHERE submission_timestamp > current_timestamp()
        """
        )
        multiple_tables = DryRun(str(view_file)).get_referenced_tables()
        multiple_tables.sort(key=lambda x: x["datasetId"])

        assert len(multiple_tables) == 2
        assert multiple_tables[0]["datasetId"] == "org_mozilla_fenix_stable"
        assert multiple_tables[0]["tableId"] == "baseline_v1"
        assert multiple_tables[1]["datasetId"] == "org_mozilla_firefox_stable"
        assert multiple_tables[1]["tableId"] == "baseline_v1"
Example #15
def main():
    """Validate SQL examples."""
    args = parser.parse_args()

    # parse UDFs
    parsed_udfs = read_udf_dirs(*args.project_dirs)

    for project_dir in args.project_dirs:
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                if os.path.basename(root) == EXAMPLE_DIR:
                    for file in files:
                        dry_run_sql = sql_for_dry_run(os.path.join(root, file),
                                                      parsed_udfs, project_dir)

                        # store sql in temporary file for dry_run
                        tmp_dir = Path(tempfile.mkdtemp()) / Path(root)
                        tmp_dir.mkdir(parents=True, exist_ok=True)
                        tmp_example_file = tmp_dir / file
                        tmp_example_file.write_text(dry_run_sql)

                        DryRun(str(tmp_example_file)).is_valid()
Example #16
    def test_sql_file_invalid(self, tmp_path):
        query_file = tmp_path / "query.sql"
        query_file.write_text("SELECT INVALID 123")

        dryrun = DryRun(str(query_file))
        assert dryrun.is_valid() is False
Example #17
def referenced_table_exists(view_sql):
    """Dry run the given view SQL to see if its referent exists."""
    dryrun = DryRun("foo/bar/view.sql", content=view_sql)
    return 404 not in [e.get("code") for e in dryrun.errors()]
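A small usage sketch (the view SQL is made up):

view_sql = """
CREATE OR REPLACE VIEW `moz-fx-data-shared-prod.telemetry.example` AS
SELECT * FROM `moz-fx-data-shared-prod.telemetry_derived.example_v1`
"""

# True only when none of the dry run errors is a 404 (missing referent).
if not referenced_table_exists(view_sql):
    print("Skipping view: referenced table does not exist yet.")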
Example #18
    def test_get_referenced_tables_empty(self, tmp_path):
        query_file = tmp_path / "query.sql"
        query_file.write_text("SELECT 123")

        dryrun = DryRun(str(query_file))
        assert dryrun.get_referenced_tables() == []