def main():
    """Publish query data as JSON to GCS."""
    args, query_arguments = parser.parse_known_args()

    try:
        metadata = Metadata.of_sql_file(args.query_file)
    except FileNotFoundError:
        print("No metadata file for: {}".format(args.query_file))
        return

    # check if the data should be published as JSON
    if not metadata.is_public_json():
        return

    if not validate_public_data(metadata, args.query_file):
        sys.exit(1)

    storage_client = storage.Client()
    client = bigquery.Client(args.public_project_id)

    publisher = JsonPublisher(
        client,
        storage_client,
        args.public_project_id,
        args.query_file,
        args.api_version,
        args.target_bucket,
        args.parameter,
    )
    publisher.publish_json()
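# A minimal sketch (assumed, not taken from the source) of the module-level
# `parser` that main() relies on. parse_known_args() splits the command line:
# recognized flags land in `args`, everything else stays in `query_arguments`.
# The flag names mirror the attributes main() reads; defaults and help texts
# are illustrative.
import argparse

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--query_file", help="Path to the query.sql file to publish")
parser.add_argument("--public_project_id", help="Project the public data lives in")
parser.add_argument("--api_version", help="API version used in the GCS path, e.g. v1")
parser.add_argument("--target_bucket", help="GCS bucket the JSON files are written to")
parser.add_argument("--parameter", action="append", help="Query parameter, repeatable")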
def of_query(cls, query_file):
    """
    Create task that schedules the corresponding query in Airflow.

    Raises FileNotFoundError if no metadata file exists for query.
    """
    metadata = Metadata.of_sql_file(query_file)
    return cls(query_file, metadata)
def test_of_sql_file(self):
    metadata_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "test"
        / "non_incremental_query_v1"
        / "query.sql"
    )
    metadata = Metadata.of_sql_file(metadata_file)

    assert metadata.friendly_name == "Test table for a non-incremental query"
    assert metadata.description == "Test table for a non-incremental query"
    assert metadata.review_bug() == "1999999"
def run(
    query_file,
    dataset_id,
    destination_table,
    query_arguments,
    public_project_id=PUBLIC_PROJECT_ID,
):
    """Execute bq to run a query."""
    if dataset_id is not None:
        # dataset ID was parsed by argparse but needs to be passed as parameter
        # when running the query
        query_arguments.append("--dataset_id={}".format(dataset_id))

    use_public_table = False

    try:
        metadata = Metadata.of_sql_file(query_file)
        if metadata.is_public_bigquery():
            if not validate_public_data(metadata, query_file):
                sys.exit(1)

            # change the destination table to write results to the public dataset;
            # a view to the public table in the internal dataset is created
            # when CI runs
            if (
                dataset_id is not None
                and destination_table is not None
                and re.match(DESTINATION_TABLE_RE, destination_table)
            ):
                destination_table = "{}:{}.{}".format(
                    public_project_id, dataset_id, destination_table
                )
                query_arguments.append(
                    "--destination_table={}".format(destination_table)
                )
                use_public_table = True
            else:
                print(
                    "ERROR: Cannot run public dataset query. Parameters"
                    " --destination_table=<table without dataset ID> and"
                    " --dataset_id=<dataset> required"
                )
                sys.exit(1)
    except yaml.YAMLError as e:
        print(e)
        sys.exit(1)
    except FileNotFoundError:
        print("INFO: No metadata.yaml found for {}".format(query_file))

    if not use_public_table and destination_table is not None:
        # destination table was parsed by argparse, however if it wasn't modified
        # to point to a public table it needs to be passed as parameter for the query
        query_arguments.append("--destination_table={}".format(destination_table))

    with open(query_file) as query_stream:
        # run the query as shell command so that passed parameters can be used as is
        subprocess.check_call(["bq"] + query_arguments, stdin=query_stream)
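# A hedged usage sketch, not from the source: how run() might be invoked once
# argparse has split recognized flags from the pass-through bq arguments. The
# example path, dataset, and table names are illustrative assumptions.
run(
    "sql/telemetry_derived/example_v1/query.sql",  # hypothetical query file
    dataset_id="telemetry_derived",
    destination_table="example_v1",
    query_arguments=["query", "--use_legacy_sql=false"],  # handed to bq as-is
)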
def of_query(cls, query_file, metadata=None, dag_collection=None):
    """
    Create task that schedules the corresponding query in Airflow.

    Raises FileNotFoundError if no metadata file exists for query.

    If `metadata` is set, then it is used instead of the metadata.yaml
    file that might exist alongside the query file.
    """
    converter = cattr.Converter()
    if metadata is None:
        metadata = Metadata.of_sql_file(query_file)

    dag_name = metadata.scheduling.get("dag_name")
    if dag_name is None:
        raise UnscheduledTask(
            f"Metadata for {query_file} does not contain scheduling information."
        )

    task_config = {"query_file": str(query_file)}
    task_config.update(metadata.scheduling)

    if len(metadata.owners) <= 0:
        raise TaskParseException(
            f"No owner specified in metadata for {query_file}."
        )

    # Airflow only allows setting one owner, so we take the first
    task_config["owner"] = metadata.owners[0]

    # get default email from default_args if available
    default_email = []
    if dag_collection is not None:
        dag = dag_collection.dag_by_name(dag_name)
        if dag is not None:
            default_email = dag.default_args.email

    email = task_config.get("email", default_email)
    # owners get added to the email list
    task_config["email"] = list(set(email + metadata.owners))

    # data processed in task should be published
    if metadata.is_public_json():
        task_config["public_json"] = True

    try:
        return converter.structure(task_config, cls)
    except TypeError as e:
        raise TaskParseException(
            f"Invalid scheduling information format for {query_file}: {e}"
        )
def __init__(
    self,
    client,
    storage_client,
    project_id,
    query_file,
    api_version,
    target_bucket,
    parameter=None,
    gcs_path="",
):
    """Init JsonPublisher."""
    self.project_id = project_id
    self.query_file = query_file
    self.api_version = api_version
    self.target_bucket = target_bucket
    self.gcs_path = gcs_path
    self.parameter = parameter
    self.client = client
    self.storage_client = storage_client
    self.temp_table = None
    self.date = None
    self.stage_gcs_path = self.gcs_path + "stage/json/"

    self.metadata = Metadata.of_sql_file(self.query_file)

    # files are written into separate directories per date only for
    # incremental exports; ignore date parameters for non-incremental exports
    if self.metadata.is_incremental_export() and self.parameter:
        for p in self.parameter:
            date_search = re.search(SUBMISSION_DATE_RE, p)
            if date_search:
                self.date = date_search.group(1)

    query_file_re = re.search(QUERY_FILE_RE, self.query_file)
    if query_file_re:
        self.dataset = query_file_re.group(1)
        self.table = query_file_re.group(2)
        self.version = query_file_re.group(3)
    else:
        logging.error("Invalid file naming format: %s", self.query_file)
        sys.exit(1)
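# Plausible definitions (assumptions, not copied from the source) for the two
# module constants used above: SUBMISSION_DATE_RE pulls the date out of a
# --parameter value such as "submission_date:DATE:2020-03-15", and
# QUERY_FILE_RE captures dataset, table, and version from a path like
# "sql/telemetry/mobile_usage_v2/query.sql".
SUBMISSION_DATE_RE = r"^submission_date:DATE:(\d{4}-\d{2}-\d{2})$"
QUERY_FILE_RE = r"sql/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+)_(v[0-9]+)/query\.sql"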
def test_of_sql_file_no_metadata(self):
    metadata_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "no_metadata_query_v1"
        / "query.sql"
    )

    with pytest.raises(FileNotFoundError):
        Metadata.of_sql_file(metadata_file)
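# A minimal sketch (an assumption, not the actual implementation) of the
# behavior both tests exercise: Metadata.of_sql_file() looks for a
# metadata.yaml next to the given query file and raises FileNotFoundError
# when it is missing. Shown as a standalone function for brevity.
from pathlib import Path

import yaml

def of_sql_file(query_file):
    metadata_path = Path(query_file).parent / "metadata.yaml"
    if not metadata_path.exists():
        raise FileNotFoundError(f"No metadata file at {metadata_path}")
    with open(metadata_path) as stream:
        # the real classmethod presumably builds a Metadata instance from this
        return yaml.safe_load(stream)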