def get_dags(project_id, dags_config):
    """Return all configured DAGs including associated tasks."""
    tasks = []
    dag_collection = DagCollection.from_file(dags_config)

    for project_dir in project_dirs(project_id):
        # parse metadata.yaml to retrieve scheduling information
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                try:
                    if QUERY_FILE in files:
                        query_file = os.path.join(root, QUERY_FILE)
                        task = Task.of_query(query_file, dag_collection=dag_collection)
                    elif QUERY_PART_FILE in files:
                        # multipart query
                        query_file = os.path.join(root, QUERY_PART_FILE)
                        task = Task.of_multipart_query(
                            query_file, dag_collection=dag_collection
                        )
                    elif SCRIPT_FILE in files:
                        query_file = os.path.join(root, SCRIPT_FILE)
                        task = Task.of_script(query_file, dag_collection=dag_collection)
                    elif PYTHON_SCRIPT_FILE in files:
                        query_file = os.path.join(root, PYTHON_SCRIPT_FILE)
                        task = Task.of_python_script(
                            query_file, dag_collection=dag_collection
                        )
                    else:
                        continue
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    # logging.debug(f"No scheduling information for {query_file}.")
                    # most tasks lack scheduling information for now
                    pass
                except Exception as e:
                    # in the case that there was some other error, report the
                    # query that failed before exiting
                    logging.error(f"Error processing task for query {query_file}")
                    raise e
                else:
                    tasks.append(task)
        else:
            logging.error(
                """
                Invalid project_dir: {}, project_dir must be a directory
                with structure <sql>/<project>/<dataset>/<table>/metadata.yaml.
                """.format(project_dir)
            )

    return dag_collection.with_tasks(tasks)
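The file-name sentinels above (QUERY_FILE, QUERY_PART_FILE, SCRIPT_FILE, PYTHON_SCRIPT_FILE) are assumed to be module-level constants that mark what kind of task a directory defines. A minimal sketch; the specific file names here are assumptions for illustration, not values confirmed by this listing:

# Hypothetical module-level constants assumed by get_dags();
# the exact file names are illustrative, not confirmed here.
QUERY_FILE = "query.sql"          # single-statement query
QUERY_PART_FILE = "part1.sql"     # first part of a multipart query
SCRIPT_FILE = "script.sql"        # SQL script
PYTHON_SCRIPT_FILE = "query.py"   # Python-based query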
def parsed_routines():
    """Get cached parsed routines."""
    global _parsed_routines
    if _parsed_routines is None:
        _parsed_routines = {
            routine.filepath: routine
            for project in (project_dirs() + ["tests/assert"])
            for routine in parse_routines(project)
        }
    return _parsed_routines
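parsed_routines() is a module-level lazy cache: the first call walks every project (plus tests/assert) and parses all routines; subsequent calls return the same dict without re-parsing. A minimal sketch of the pattern's contract, assuming the module initializes the cache variable to None:

_parsed_routines = None  # module-level cache, populated on first use

def demo_cache_behavior():
    first = parsed_routines()   # triggers parsing on first call
    second = parsed_routines()  # served from the cache
    # both calls return the identical dict object, not a copy
    assert first is second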
def main():
    """Publish routine."""
    args = parser.parse_args()

    if args.target is not None:
        projects = [args.target]
    else:
        projects = project_dirs()

    for project in projects:
        publish(
            project,
            args.project_id,
            os.path.join(SQL_DIR, project, args.dependency_dir),
            args.gcs_bucket,
            args.gcs_path,
            args.public,
        )
def main():
    """Publish csv files as BigQuery tables."""
    args = _parse_args()
    projects = project_dirs(args.project_id)

    for data_dir in projects:
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename == DATA_FILENAME:
                    schema_file_path = (
                        os.path.join(root, SCHEMA_FILENAME)
                        if SCHEMA_FILENAME in files
                        else None
                    )
                    description_file_path = (
                        os.path.join(root, DESCRIPTION_FILENAME)
                        if DESCRIPTION_FILENAME in files
                        else None
                    )
                    _load_table(
                        os.path.join(root, filename),
                        schema_file_path,
                        description_file_path,
                        args.project_id,
                    )
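DATA_FILENAME, SCHEMA_FILENAME, and DESCRIPTION_FILENAME are assumed module-level constants naming the files recognized inside each table directory. Plausible values as a sketch; these are assumptions, not confirmed by this listing:

# Assumed sentinel file names for static table directories;
# the exact values are illustrative, not confirmed here.
DATA_FILENAME = "data.csv"                # table rows to load
SCHEMA_FILENAME = "schema.json"           # optional explicit schema
DESCRIPTION_FILENAME = "description.txt"  # optional table description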
def main():
    """Generate and upload GCS metadata."""
    args = parser.parse_args()
    storage_client = storage.Client(args.project_id)

    # set log level
    try:
        logging.basicConfig(level=args.log_level, format="%(levelname)s %(message)s")
    except ValueError as e:
        parser.error(f"argument --log-level: {e}")

    projects = project_dirs()
    all_metadata = []

    for target in projects:
        if os.path.isdir(target):
            gcs_table_metadata = get_public_gcs_table_metadata(
                storage_client,
                args.target_bucket,
                args.api_version,
                args.endpoint,
                target,
            )
            all_metadata += gcs_table_metadata
            publish_table_metadata(
                storage_client, gcs_table_metadata, args.target_bucket
            )
        else:
            print(
                f"Invalid target: {target}, target must be a directory with "
                "structure <project>/<dataset>/<table>/metadata.yaml."
            )

    output_file = f"gs://{args.target_bucket}/all-datasets.json"
    publish_all_datasets_metadata(all_metadata, output_file)
    set_content_type(
        storage_client,
        args.target_bucket,
        "all-datasets.json",
        "application/json",
    )
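set_content_type is not shown in this listing. A minimal sketch of what it might do with the google-cloud-storage client, assuming the blob already exists; the helper name and argument order follow the call above, but the body is an assumption:

def set_content_type(storage_client, bucket_name, blob_name, content_type):
    """Patch an existing GCS blob's Content-Type metadata (illustrative)."""
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.content_type = content_type
    blob.patch()  # push only the changed metadata fields to GCS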
def main():
    """Publish csv files as BigQuery tables."""
    args = _parse_args()

    # This machinery is only compatible with
    # the sql/moz-fx-data-shared-prod/static directory.
    projects = project_dirs("moz-fx-data-shared-prod")

    for data_dir in projects:
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename == DATA_FILENAME:
                    schema_file_path = (
                        os.path.join(root, SCHEMA_FILENAME)
                        if SCHEMA_FILENAME in files
                        else None
                    )
                    description_file_path = (
                        os.path.join(root, DESCRIPTION_FILENAME)
                        if DESCRIPTION_FILENAME in files
                        else None
                    )
                    _load_table(
                        os.path.join(root, filename),
                        schema_file_path,
                        description_file_path,
                        args.project_id,
                    )
def test_project_dirs(self):
    assert project_dirs("test") == ["sql/test"]

    existing_projects = project_dirs()
    assert "sql/moz-fx-data-shared-prod" in existing_projects
def is_valid_project(ctx, param, value):
    """Check if the provided project_id corresponds to an existing project."""
    if value is None or value in [Path(p).name for p in project_dirs()]:
        return value
    raise click.BadParameter(f"Invalid project {value}")
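The (ctx, param, value) signature matches click's option-callback protocol, so the validator runs before the command body. A hedged usage sketch; the command and option names here are hypothetical:

import click

@click.command()
@click.option("--project-id", callback=is_valid_project, default=None)
def info(project_id):
    """Print the resolved project (hypothetical command)."""
    # by the time we get here, project_id is either None or a known project
    click.echo(f"project: {project_id}")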