Example #1
def main():
    """Publish query data as JSON to GCS."""
    args, query_arguments = parser.parse_known_args()

    try:
        metadata = Metadata.of_sql_file(args.query_file)
    except FileNotFoundError:
        print("No metadata file for: {}".format(args.query_file))
        return

    # check if the data should be published as JSON
    if not metadata.is_public_json():
        return

    if not validate_public_data(metadata, args.query_file):
        sys.exit(1)

    storage_client = storage.Client()
    client = bigquery.Client(args.public_project_id)

    publisher = JsonPublisher(
        client,
        storage_client,
        args.public_project_id,
        args.query_file,
        args.api_version,
        args.target_bucket,
        args.parameter,
    )
    publisher.publish_json()
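
main() reads its arguments from a module-level argparse parser that is not part of this excerpt. The sketch below shows, as an assumption rather than the project's actual definition, the kind of parser that would supply the attributes read above; the argument names mirror the attribute accesses, while help text and required flags are made up. parse_known_args() is used so that unrecognized flags end up in query_arguments instead of causing an error.

# Hypothetical sketch of the module-level parser main() expects; only the
# argument names are implied by the attribute accesses above, everything
# else (help text, required flags) is assumed.
from argparse import ArgumentParser

parser = ArgumentParser(description="Publish query data as JSON to GCS.")
parser.add_argument("--query_file", required=True, help="path to the query file to publish")
parser.add_argument("--public_project_id", help="project the public data is written to")
parser.add_argument("--api_version", help="API version used in the GCS path")
parser.add_argument("--target_bucket", help="GCS bucket the JSON files are exported to")
parser.add_argument("--parameter", action="append", help="query parameter, e.g. submission_date:DATE:2020-06-01")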
Example #2
    def of_query(cls, query_file):
        """
        Create task that schedules the corresponding query in Airflow.

        Raises FileNotFoundError if no metadata file exists for the query.
        """
        metadata = Metadata.of_sql_file(query_file)
        return cls(query_file, metadata)
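
A minimal usage sketch of the classmethod above; `Task` stands in for whatever class defines of_query() (the class itself is not shown in this excerpt), and the query path is illustrative.

# Hypothetical caller; Task and the path are assumptions for illustration.
try:
    task = Task.of_query("sql/telemetry/my_table_v1/query.sql")
except FileNotFoundError:
    # no metadata.yaml next to the query file, so there is nothing to schedule
    task = None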
Example #3
    def test_of_sql_file(self):
        metadata_file = (TEST_DIR / "data" / "test_sql" / "test" /
                         "non_incremental_query_v1" / "query.sql")
        metadata = Metadata.of_sql_file(metadata_file)

        assert metadata.friendly_name == "Test table for a non-incremental query"
        assert metadata.description == "Test table for a non-incremental query"
        assert metadata.review_bug() == "1999999"
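
The assertions imply a metadata.yaml sitting next to the query file. The snippet below reconstructs a plausible shape for that file; only friendly_name, description, and the review bug value are implied by the test, and the remaining layout (e.g. nesting the bug under labels) is a guess, not the project's real schema.

# Assumed metadata.yaml contents for the test above; key layout beyond
# friendly_name/description is a guess.
import yaml

assumed_metadata = yaml.safe_load(
    """
friendly_name: Test table for a non-incremental query
description: Test table for a non-incremental query
labels:
  review_bug: "1999999"
"""
)
assert assumed_metadata["friendly_name"] == "Test table for a non-incremental query"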
Example #4
def run(
    query_file,
    dataset_id,
    destination_table,
    query_arguments,
    public_project_id=PUBLIC_PROJECT_ID,
):
    """Execute bq to run a query."""
    if dataset_id is not None:
        # dataset ID was parsed by argparse but needs to be passed as parameter
        # when running the query
        query_arguments.append("--dataset_id={}".format(dataset_id))

    use_public_table = False

    try:
        metadata = Metadata.of_sql_file(query_file)
        if metadata.is_public_bigquery():
            if not validate_public_data(metadata, query_file):
                sys.exit(1)

            # change the destination table to write results to the public dataset;
            # a view to the public table in the internal dataset is created
            # when CI runs
            if (dataset_id is not None and destination_table is not None
                    and re.match(DESTINATION_TABLE_RE, destination_table)):
                destination_table = "{}:{}.{}".format(public_project_id,
                                                      dataset_id,
                                                      destination_table)
                query_arguments.append(
                    "--destination_table={}".format(destination_table))
                use_public_table = True
            else:
                print("ERROR: Cannot run public dataset query. Parameters"
                      " --destination_table=<table without dataset ID> and"
                      " --dataset_id=<dataset> required")
                sys.exit(1)
    except yaml.YAMLError as e:
        print(e)
        sys.exit(1)
    except FileNotFoundError:
        print("INFO: No metadata.yaml found for {}", query_file)

    if not use_public_table and destination_table is not None:
        # destination table was parsed by argparse, however if it wasn't modified to
        # point to a public table it needs to be passed as parameter for the query
        query_arguments.append(
            "--destination_table={}".format(destination_table))

    with open(query_file) as query_stream:
        # run the query as a shell command so that passed parameters can be used as-is
        subprocess.check_call(["bq"] + query_arguments, stdin=query_stream)
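
An illustrative call to run(); the paths, dataset, and table names are made up, and PUBLIC_PROJECT_ID and DESTINATION_TABLE_RE are defined elsewhere in the module. For a query whose metadata enables public BigQuery publishing, the destination is rewritten to <public_project_id>:<dataset_id>.<destination_table> before the bq CLI is invoked.

# Illustrative only: names and arguments are assumptions.
run(
    query_file="sql/telemetry_derived/my_table_v1/query.sql",
    dataset_id="telemetry_derived",
    destination_table="my_table_v1",
    query_arguments=["query", "--use_legacy_sql=false"],
)
# If metadata.yaml marks the query as public BigQuery data, this is roughly:
#   bq query --use_legacy_sql=false \
#       --dataset_id=telemetry_derived \
#       --destination_table=<public_project_id>:telemetry_derived.my_table_v1 \
#       < sql/telemetry_derived/my_table_v1/query.sql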
Example #5
    def of_query(cls, query_file, metadata=None, dag_collection=None):
        """
        Create task that schedules the corresponding query in Airflow.

        Raises FileNotFoundError if no metadata file exists for the query.
        If `metadata` is set, then it is used instead of the metadata.yaml
        file that might exist alongside the query file.
        """
        converter = cattr.Converter()
        if metadata is None:
            metadata = Metadata.of_sql_file(query_file)

        dag_name = metadata.scheduling.get("dag_name")
        if dag_name is None:
            raise UnscheduledTask(
                f"Metadata for {query_file} does not contain scheduling information."
            )

        task_config = {"query_file": str(query_file)}
        task_config.update(metadata.scheduling)

        if len(metadata.owners) <= 0:
            raise TaskParseException(
                f"No owner specified in metadata for {query_file}."
            )

        # Airflow only allows setting one owner, so we just take the first
        task_config["owner"] = metadata.owners[0]

        # Get default email from default_args if available
        default_email = []
        if dag_collection is not None:
            dag = dag_collection.dag_by_name(dag_name)
            if dag is not None:
                default_email = dag.default_args.email
        email = task_config.get("email", default_email)
        # owners get added to the email list
        task_config["email"] = list(set(email + metadata.owners))

        # data processed in task should be published
        if metadata.is_public_json():
            task_config["public_json"] = True

        try:
            return converter.structure(task_config, cls)
        except TypeError as e:
            raise TaskParseException(
                f"Invalid scheduling information format for {query_file}: {e}"
            )
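
A hedged usage sketch for the fuller of_query() variant above; the names Task and dags are illustrative, as is the query path. The point of interest is how owner and email are derived: the first metadata owner becomes the Airflow owner, and the email list is the DAG's default emails merged with all owners.

# Hypothetical caller; Task, dags and the query path are illustrative.
task = Task.of_query(
    "sql/telemetry_derived/my_table_v1/query.sql",
    dag_collection=dags,  # a DagCollection built elsewhere; construction not shown
)
# task.owner -> the first entry of metadata.owners
# task.email -> the DAG's default emails merged with all owners (de-duplicated)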
Example #6
    def __init__(
        self,
        client,
        storage_client,
        project_id,
        query_file,
        api_version,
        target_bucket,
        parameter=None,
        gcs_path="",
    ):
        """Init JsonPublisher."""
        self.project_id = project_id
        self.query_file = query_file
        self.api_version = api_version
        self.target_bucket = target_bucket
        self.gcs_path = gcs_path
        self.parameter = parameter
        self.client = client
        self.storage_client = storage_client
        self.temp_table = None
        self.date = None
        self.stage_gcs_path = self.gcs_path + "stage/json/"

        self.metadata = Metadata.of_sql_file(self.query_file)

        # files are written into separate per-date directories only for incremental
        # exports; date parameters are ignored for non-incremental exports
        if self.metadata.is_incremental_export() and self.parameter:
            for p in self.parameter:
                date_search = re.search(SUBMISSION_DATE_RE, p)

                if date_search:
                    self.date = date_search.group(1)

        query_file_re = re.search(QUERY_FILE_RE, self.query_file)
        if query_file_re:
            self.dataset = query_file_re.group(1)
            self.table = query_file_re.group(2)
            self.version = query_file_re.group(3)
        else:
            logging.error("Invalid file naming format: {}", self.query_file)
            sys.exit(1)
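
The constructor extracts dataset, table, and version from the query file path through QUERY_FILE_RE, which is defined elsewhere in the module. The sketch below is one regex that would yield those three groups for a layout like sql/<dataset>/<table>_v<version>/query.sql; it is an assumption about the pattern, not the project's actual definition.

# Assumed approximation of QUERY_FILE_RE; the real pattern is defined
# elsewhere and may differ.
import re

QUERY_FILE_RE_SKETCH = re.compile(
    r".*/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+)_v([0-9]+)/query\.sql$"
)

match = QUERY_FILE_RE_SKETCH.search("sql/telemetry/my_table_v1/query.sql")
if match:
    dataset, table, version = match.groups()
    # dataset == "telemetry", table == "my_table", version == "1"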
Example #7
    def test_of_sql_file_no_metadata(self):
        metadata_file = (TEST_DIR / "data" / "test_sql" /
                         "moz-fx-data-test-project" / "test" /
                         "no_metadata_query_v1" / "query.sql")
        with pytest.raises(FileNotFoundError):
            Metadata.of_sql_file(metadata_file)