Example 1
def get_dags(project_id, dags_config):
    """Return all configured DAGs including associated tasks."""
    tasks = []
    dag_collection = DagCollection.from_file(dags_config)

    for project_dir in project_dirs(project_id):
        # parse metadata.yaml to retrieve scheduling information
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                try:
                    if QUERY_FILE in files:
                        query_file = os.path.join(root, QUERY_FILE)
                        task = Task.of_query(query_file,
                                             dag_collection=dag_collection)
                    elif QUERY_PART_FILE in files:
                        # multipart query
                        query_file = os.path.join(root, QUERY_PART_FILE)
                        task = Task.of_multipart_query(
                            query_file, dag_collection=dag_collection)
                    elif SCRIPT_FILE in files:
                        query_file = os.path.join(root, SCRIPT_FILE)
                        task = Task.of_script(query_file,
                                              dag_collection=dag_collection)
                    elif PYTHON_SCRIPT_FILE in files:
                        query_file = os.path.join(root, PYTHON_SCRIPT_FILE)
                        task = Task.of_python_script(
                            query_file, dag_collection=dag_collection)
                    else:
                        continue
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    # most tasks lack scheduling information for now; skip
                    # them without logging to avoid noisy output
                    pass
                except Exception:
                    # report which query failed before propagating the error
                    logging.error(
                        f"Error processing task for query {query_file}")
                    raise
                else:
                    tasks.append(task)

        else:
            logging.error(
                f"Invalid project_dir: {project_dir}, project_dir must be a "
                "directory with structure "
                "<sql>/<project>/<dataset>/<table>/metadata.yaml."
            )

    return dag_collection.with_tasks(tasks)
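
A hedged usage sketch of get_dags, with "moz-fx-data-shared-prod" taken from the other examples and "dags.yaml" as a hypothetical config path; the return value is the DagCollection with every discovered task attached:

# hypothetical arguments; real callers supply their own project id and DAGs config
dag_collection = get_dags("moz-fx-data-shared-prod", "dags.yaml")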
Example 2
def parsed_routines():
    """Get cached parsed routines."""
    global _parsed_routines
    if _parsed_routines is None:
        _parsed_routines = {
            routine.filepath: routine
            for project in (project_dirs() + ["tests/assert"])
            for routine in parse_routines(project)
        }

    return _parsed_routines
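
The global _parsed_routines above implies a module-level cache that starts out unset; a minimal sketch of the assumed declaration and of how repeat calls reuse it (the lookup key below is purely illustrative):

# assumed module-level declaration, populated lazily on first call
_parsed_routines = None

# the first call walks every project (plus tests/assert) and parses each routine;
# later calls return the same dict, keyed by routine.filepath
routines = parsed_routines()
routine = routines.get("sql/moz-fx-data-shared-prod/udf/mode_last/udf.sql")  # hypothetical key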
Example 3
def main():
    """Publish routine."""
    args = parser.parse_args()

    if args.target is not None:
        projects = [args.target]
    else:
        projects = project_dirs()

    for project in projects:
        publish(
            args.target,
            args.project_id,
            os.path.join(SQL_DIR, project, args.dependency_dir),
            args.gcs_bucket,
            args.gcs_path,
            args.public,
        )
Example 4
def main():
    """Publish csv files as BigQuery tables."""
    args = _parse_args()
    projects = project_dirs(args.project_id)

    for data_dir in projects:
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename == DATA_FILENAME:
                    schema_file_path = (
                        os.path.join(root, SCHEMA_FILENAME)
                        if SCHEMA_FILENAME in files
                        else None
                    )
                    description_file_path = (
                        os.path.join(root, DESCRIPTION_FILENAME)
                        if DESCRIPTION_FILENAME in files
                        else None
                    )
                    _load_table(
                        os.path.join(root, filename),
                        schema_file_path,
                        description_file_path,
                        args.project_id,
                    )
Example 5
def main():
    """Generate and upload GCS metadata."""
    args = parser.parse_args()
    storage_client = storage.Client(args.project_id)

    # set log level
    try:
        logging.basicConfig(level=args.log_level, format="%(levelname)s %(message)s")
    except ValueError as e:
        parser.error(f"argument --log-level: {e}")

    projects = project_dirs()
    all_metadata = []

    for target in projects:
        if os.path.isdir(target):
            gcs_table_metadata = get_public_gcs_table_metadata(
                storage_client,
                args.target_bucket,
                args.api_version,
                args.endpoint,
                target,
            )
            all_metadata += gcs_table_metadata
            publish_table_metadata(
                storage_client, gcs_table_metadata, args.target_bucket
            )
        else:
            print(
                f"Invalid target: {target}, target must be a directory with "
                "structure <project>/<dataset>/<table>/metadata.yaml."
            )

    output_file = f"gs://{args.target_bucket}/all-datasets.json"
    publish_all_datasets_metadata(all_metadata, output_file)
    set_content_type(
        storage_client,
        args.target_bucket,
        "all-datasets.json",
        "application/json",
    )
Example 6
def main():
    """Publish csv files as BigQuery tables."""
    args = _parse_args()

    # This machinery is only compatible with
    # the sql/moz-fx-data-shared-prod/static directory.
    projects = project_dirs("moz-fx-data-shared-prod")

    for data_dir in projects:
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename == DATA_FILENAME:
                    schema_file_path = (
                        os.path.join(root, SCHEMA_FILENAME)
                        if SCHEMA_FILENAME in files
                        else None
                    )
                    description_file_path = (
                        os.path.join(root, DESCRIPTION_FILENAME)
                        if DESCRIPTION_FILENAME in files
                        else None
                    )
                    _load_table(
                        os.path.join(root, filename),
                        schema_file_path,
                        description_file_path,
                        args.project_id,
                    )
Example 7
    def test_project_dirs(self):
        assert project_dirs("test") == ["sql/test"]

        existing_projects = project_dirs()
        assert "sql/moz-fx-data-shared-prod" in existing_projects
Example 8
def is_valid_project(ctx, param, value):
    """Check if the provided project_id corresponds to an existing project."""
    if value is None or value in [Path(p).name for p in project_dirs()]:
        return value
    raise click.BadParameter(f"Invalid project {value}")
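
is_valid_project is shaped as a click parameter callback (it receives ctx, param, and value and raises click.BadParameter on failure); a hedged sketch of wiring it to an option, with --project-id as an assumed option name:

import click


@click.command()
@click.option(
    "--project-id",
    default=None,
    callback=is_valid_project,  # rejects project ids with no matching sql/ directory
    help="Project to operate on; must match a directory under sql/.",
)
def cmd(project_id):
    """Example command guarded by the project check."""
    click.echo(f"Running against {project_id}")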