def should_work(_):
    yield DynamicOutput(1, output_name="numbers", mapping_key="1")
    yield DynamicOutput(2, output_name="numbers", mapping_key="2")
    yield DynamicOutput("a", output_name="letters", mapping_key="a")
    yield DynamicOutput("b", output_name="letters", mapping_key="b")
    yield DynamicOutput("c", output_name="letters", mapping_key="c")
    yield Output("*", "wildcard")
def test_invalid_mapping_keys():
    with pytest.raises(DagsterInvalidDefinitionError):
        DynamicOutput(True, mapping_key="")

    with pytest.raises(DagsterInvalidDefinitionError):
        DynamicOutput(True, mapping_key="?")

    with pytest.raises(DagsterInvalidDefinitionError):
        DynamicOutput(True, mapping_key="foo.baz")
Example #3
def get_gtfs_files(context, original_filepath):
    feed_files = gk.list_feed(original_filepath)['file_name']
    for item in feed_files:
        filename = Path(item).stem
        yield DynamicOutput(filename,
                            mapping_key=filename,
                            output_name='filename')
Example #4
def solid2(_, _input1):
    for i in range(4):
        yield DynamicOutput(
            7,
            mapping_key=str(i),
            output_name="output2",
            metadata_entries=[entry2],
        )
Example #5
def files_in_directory(context):
    path = context.solid_config["path"]
    dirname, _, filenames = next(os.walk(path))
    for file in filenames:
        yield DynamicOutput(
            value=os.path.join(dirname, file),
            # create a mapping key from the file name
            mapping_key=file.replace(".", "_").replace("-", "_"),
        )
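The example above rewrites "." and "-" to "_" because mapping keys only accept a restricted character set, as the earlier test_invalid_mapping_keys snippet shows (empty strings, "?", and "foo.baz" are all rejected). Below is a minimal, hypothetical helper for that sanitization step; the name sanitize_mapping_key and the exact allowed-character assumption are mine, not from the source.

import re

def sanitize_mapping_key(raw: str) -> str:
    # Assumes mapping keys may only contain letters, digits, and underscores;
    # every other character is replaced with "_".
    return re.sub(r"[^A-Za-z0-9_]", "_", raw)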
Example #6
def ingest_metadata_type(
    context: AbstractComputeExecutionContext, result: list[JobId],
    scratch_dataset_name: HcaScratchDatasetName
) -> Iterator[MetadataTypeFanoutResult]:
    """
    For each metadata type, return a dynamic output over which we can later map
    This saves us from hardcoding solids for each type
    """
    for metadata_type in context.solid_config["metadata_types"]:
        yield DynamicOutput(
            value=MetadataTypeFanoutResult(
                scratch_dataset_name, metadata_type.value,
                context.solid_config["prefix"]),
            mapping_key=metadata_type.value,
            output_name="table_fanout_result")
Example #7
def resolve_dependencies_and_execute(context, queries_names,
                                     materialization_locked: bool,
                                     materialization_lock):

    try:
        # Setup directed graph for DAG sorting
        graph = nx.DiGraph()

        # Get dependencies
        dependencies = {}
        rp = RedisPal(constants.REDIS_HOST.value)
        materialized_views: dict = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
        if materialized_views:
            for query_name in queries_names:
                if query_name in materialized_views[
                        "views"] and materialized_views["views"][query_name][
                            "materialized"]:
                    graph.add_node(query_name)
                    dependencies[query_name] = materialized_views["views"][
                        query_name]["depends_on"]
                else:
                    context.log.warning(
                        f"{query_name} not found on Redis! Skipping...")

        # Log dependencies
        context.log.info(f"Dependencies: {dependencies}")

        # Add edges to graph
        for query_name in queries_names:
            if query_name in dependencies:
                for dep in dependencies[query_name]:
                    if dep in graph.nodes:
                        graph.add_edge(dep, query_name)

        context.log.info(f"Graph: {graph.edges()}")

        # Get topological order
        order = list(nx.topological_sort(graph))

        # Log topological order
        context.log.info(f"Order: {order}")

        # Execute queries in topological order
        for q in order:
            yield DynamicOutput(q, mapping_key=q.replace(".", "_"))
    except:
        locks.release(materialization_lock)
        raise
Example #8
def get_runs(context, execution_date):
    execution_date = datetime.strptime(execution_date, "%Y-%m-%d")
    now = execution_date + timedelta(hours=11, minutes=30)
    this_time_yesterday = now - timedelta(days=1)
    min_timestamp = convert_datetime_to_unix_time(this_time_yesterday)
    max_timestamp = convert_datetime_to_unix_time(now)
    context.log.info(f"{execution_date} of type {type(execution_date)}")
    ftp_client = connect_ftp(os.getenv("FTPS_HOST"),
                             os.getenv("FTPS_USERNAME"), os.getenv("FTPS_PWD"))

    # Change to working directory
    ftp_client.cwd("/")
    for folder in ftp_client.mlsd():

        # Config yaml file will be folder_fileprefix.yaml
        if folder[1]["type"] == "dir" and folder[0] in ALLOWED_FOLDERS:
            # CWD to folder
            context.log.info(f"Entering folder {folder[0]}")
            folder_name = folder[0].lower()

            # Read file list
            for filepath in ftp_client.mlsd(folder_name):
                filename = filepath[0]
                fileprefix = filename.split("_")[0].lower()
                timestamp = filepath[1]["modify"]
                file_mtime = datetime.timestamp(parser.parse(timestamp))

                if file_mtime >= min_timestamp and file_mtime < max_timestamp:

                    # Download file to local folder
                    try:
                        config = read_config(
                            Path(__file__).parent /
                            f"{folder_name}_{fileprefix}.yaml")
                        table_id = config["resources"]["basedosdados_config"][
                            "config"]["table_id"]
                        date = tuple(re.findall(r"\d+", filename))
                        ano = date[2][:4]
                        mes = date[2][4:6]
                        dia = date[2][6:]
                        relative_filepath = Path(
                            "raw/br_rj_riodejaneiro_rdo",
                            table_id,
                            f"ano={ano}",
                            f"mes={mes}",
                            f"dia={dia}",
                        )
                        local_filepath = Path(FTPS_DIRECTORY,
                                              relative_filepath)
                        Path(local_filepath).mkdir(parents=True, exist_ok=True)

                        ftp_path = str(Path(folder_name, filename))
                        local_path = str(Path(local_filepath, filename))

                        # Run pipeline
                        config["solids"]["download_file_from_ftp"][
                            "inputs"] = {
                                "ftp_path": {
                                    "value": ftp_path
                                },
                                "local_path": {
                                    "value": local_path
                                },
                            }
                        config["solids"]["parse_file_path_and_partitions"][
                            "inputs"]["bucket_path"][
                                "value"] = f"{relative_filepath}/{filename}"
                        config["solids"]["upload_file_to_storage"] = {
                            "inputs": {
                                "file_path": {
                                    "value": local_path
                                }
                            }
                        }
                        yield DynamicOutput(
                            config,
                            mapping_key=f"{folder_name}_{fileprefix}_{uuid.uuid4().hex}",
                        )

                    except jinja2.TemplateNotFound as err:
                        context.log.warning(
                            f"Config file for file {filename} was not found. Skipping file."
                        )
                        context.log.warning(f"{Path(__file__).parent}")
                ftp_client.cwd("/")
        else:
            context.log.warning(
                f"Skipping file {folder[0]} since it is not inside a folder")
            continue
Example #9
def get_configs_for_materialized_view(context, query_names: list,
                                      materialization_locked: bool,
                                      materialization_lock) -> Iterator[DynamicOutput]:
    """Retrieves configs for materialized views"""
    try:
        for query_name in query_names:

            # Split query name into dataset_name and view_name
            dataset_name, view_name = query_name.split(".")

            # Load configs from GCS
            view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.yaml'
            defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name)}/defaults.yaml'
            context.log.info(f"Defaults blob: {defaults_yaml}")
            context.log.info(f"View blob: {view_yaml}")
            defaults_blob = get_blob(defaults_yaml,
                                     SENSOR_BUCKET,
                                     mode="staging")
            view_blob = get_blob(view_yaml, SENSOR_BUCKET, mode="staging")
            if defaults_blob is None:
                raise Exception(f"Blob {defaults_yaml} not found!")
            defaults_dict = yaml.safe_load(defaults_blob.download_as_string())
            if view_blob:
                view_dict = yaml.safe_load(view_blob.download_as_string())
            else:
                context.log.warning(
                    f"Blob {view_yaml} not found. This is not an error.")
                view_dict = {}

            # Merge configs
            query_params = {**defaults_dict, **view_dict}

            # Build base configs
            now = datetime.datetime.now(pytz.timezone("America/Sao_Paulo"))
            run_key = build_run_key(query_name, now)
            with open(
                    str(
                        Path(__file__).parent /
                        "materialized_views_base_config.yaml"), "r") as f:
                base_params: dict = yaml.safe_load(f)
            base_params["run_timestamp"] = "'{}'".format(
                convert_datetime_to_datetime_string(now))
            base_params["maestro_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_REPOSITORY.value,
                                 constants.MAESTRO_DEFAULT_BRANCH.value))
            base_params["maestro_bq_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_BQ_REPOSITORY.value,
                                 constants.MAESTRO_BQ_DEFAULT_BRANCH.value))
            base_params["run_key"] = "'{}'".format(run_key)

            # Few more params
            r = Redis(constants.REDIS_HOST.value)
            rp = RedisPal(constants.REDIS_HOST.value)
            lock = Redlock(
                key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
                masters=[r],
                auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
            )
            table_name = parse_filepath_to_tablename(view_yaml)
            with lock:
                managed = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if managed is None:
                    managed = {}
                    managed["views"] = {}
                if query_name not in managed["views"]:
                    raise Exception(
                        f"Query {query_name} not found in managed views: {managed}"
                    )
                d = managed["views"][query_name]
                changed = d["query_modified"]
                context.log.info(f"{query_name} changed: {changed}")
                d["query_modified"] = False
                last_run = d["last_run"]
                d["last_run"] = now
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       managed)

            # Get query on GCS
            query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.sql'
            query_blob = get_blob(query_file, SENSOR_BUCKET, mode="staging")
            if query_blob is None:
                raise Exception(f"Blob {query_file} not found!")
            base_query = query_blob.download_as_string().decode("utf-8")

            # Get parent queries on GCS
            parent_queries = {}
            for query_name in d["depends_on"]:
                if query_name in managed["views"] and managed["views"][
                        query_name]["materialized"]:
                    continue
                query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(query_name.split(".")[:2]))}.sql'
                query_blob = get_blob(query_file,
                                      SENSOR_BUCKET,
                                      mode="staging")
                if query_blob is None:
                    context.log.warning(
                        f"Blob for parent query \"{query_file}\" not found, skipping..."
                    )
                    continue
                parent_view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(query_name.split(".")[:2]))}.yaml'
                parent_view_blob = get_blob(parent_view_yaml,
                                            SENSOR_BUCKET,
                                            mode="staging")
                if parent_view_blob is not None:
                    parent_view_dict = yaml.safe_load(
                        parent_view_blob.download_as_string())
                else:
                    parent_view_dict = {}
                parent_defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(query_name.split(".")[:1]))}/defaults.yaml'
                parent_defaults_blob = get_blob(parent_defaults_yaml,
                                                SENSOR_BUCKET,
                                                mode="staging")
                if parent_defaults_blob is not None:
                    parent_defaults_dict = yaml.safe_load(
                        parent_defaults_blob.download_as_string())
                else:
                    context.log.warning(
                        f"Blob for parent query \"{parent_defaults_yaml}\" not found, skipping..."
                    )
                    continue
                parent_queries[query_name] = {}
                parent_queries[query_name][
                    "base_query"] = query_blob.download_as_string().decode(
                        "utf-8")
                parent_queries[query_name]["query_params"] = {
                    **parent_defaults_dict,
                    **parent_view_dict
                }
            context.log.info(f"Parent queries: {parent_queries}")

            # Build configs
            # - table_name: str
            # - changed: bool
            # - base_query: str
            # - base_params: dict
            # - query_params: dict
            # - now: str
            # - last_run: str
            date_ranges = get_date_ranges(
                last_run
                if last_run else query_params["backfill"]["start_timestamp"],
                query_params["backfill"]["interval"], now)
            context.log.info(f"{date_ranges}")
            for i, _ in enumerate(date_ranges[:-1]):
                configs = {
                    "table_name": table_name,
                    "changed": changed if i == 0 else False,
                    "base_query": base_query,
                    "base_params": base_params,
                    "query_params": query_params,
                    "now": date_ranges[i + 1],
                    "last_run": date_ranges[i],
                    "parent_queries": parent_queries,
                }
                yield DynamicOutput(
                    {
                        "config_dict": configs,
                        "materialization_lock": materialization_lock
                    },
                    mapping_key=f'{configs["table_name"]}_{configs["last_run"]}_{configs["now"]}'
                    .replace(".", "_").replace("-", "_")
                    .replace(" ", "_").replace(":", "_"))
    except Exception as e:
        try:
            locks.release(materialization_lock)
        except:
            pass
        raise e
Example #10
def update_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    try:
        # Setup Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        views_lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )

        # Initialize graph
        graph = nx.DiGraph()

        # If blob_name ends with "defaults.yaml", we need to
        # either add it to Redis or update its values and add
        # runs for every child it has and its dependencies.
        for blob_name in [
                b for b in blob_names if b.endswith("defaults.yaml")
        ]:

            # Get dataset name
            blob_path = "/".join([n for n in blob_name.split("/")
                                  if n != ""][:-1])
            dataset_name: str = blob_path.split("/")[-1]

            context.log.info("#" * 80)
            context.log.info(f"Updating {dataset_name} defaults")

            # Read the blob
            blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
            if blob is None:
                raise Exception(f"Blob {blob_name} not found")
            blob_dict: dict = yaml.safe_load(blob.download_as_string())

            # Add it to Redis
            with views_lock:
                materialized_views: dict = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if materialized_views is None:
                    materialized_views = {}
                    materialized_views["views"] = {}
                # Add every child to Redis
                if "views" not in blob_dict:
                    raise Exception(
                        f"Malformed blob (missing views key): {blob_name}")
                for key in blob_dict["views"].keys():

                    # Build key with dataset_name
                    m_key = f"{dataset_name}.{key}"

                    # This child also needs a run
                    context.log.info(f"Adding {m_key} to runs")
                    if m_key not in graph.nodes:
                        graph.add_node(m_key)

                    # Avoid KeyError
                    if "views" not in materialized_views:
                        materialized_views["views"] = {}

                    # Add to Redis
                    if m_key not in materialized_views["views"]:
                        materialized_views["views"][m_key] = {}
                    update_dict_with_dict(
                        materialized_views["views"][m_key], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "materialized":
                            blob_dict["views"][key]["materialized"],
                            "query_modified": True,
                            "depends_on":
                            blob_dict["views"][key]["depends_on"],
                        })

                    # Adds dependencies to runs
                    for dep in blob_dict["views"][key]["depends_on"]:
                        context.log.info(
                            f"Adding {dep} to runs as dependency of {m_key}")
                        if dep not in graph.nodes:
                            graph.add_node(dep)
                        graph.add_edge(dep, m_key)

                    # Try to find specific values for this view
                    # NOTE: join with "/" so the view YAML resolves under the dataset path
                    blob = get_blob(f"{blob_path}/{key}.yaml",
                                    SENSOR_BUCKET,
                                    mode="staging")
                    if blob:
                        # Replace values in Redis
                        specific = yaml.safe_load(
                            blob.download_as_string().decode("utf-8"))
                        materialized_views["views"][m_key][
                            "cron_expression"] = specific["scheduling"]["cron"]
                    else:
                        context.log.warning(
                            f"No specific values for {m_key} found. This is not an error."
                        )

                # Update Redis effectively
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       materialized_views)

        # Otherwise, we need to add the blob_name and its
        # dependencies to the graph.
        for blob_name in [
                b for b in blob_names if not b.endswith("defaults.yaml")
        ]:

            # Get table name
            file_name = ".".join(blob_name.split("/")[-2:])
            table_name = ".".join(file_name.split(".")[:-1])

            context.log.info("#" * 80)
            context.log.info(f"Updating {table_name} specific values...")

            # If it's YAML file, update values on Redis
            if blob_name.endswith(".yaml"):

                # Read the blob
                blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
                if blob is None:
                    raise Exception(f"Blob {blob_name} not found")
                blob_dict: dict = yaml.safe_load(blob.download_as_string())

                # Update Redis
                with views_lock:
                    materialized_views: dict = rp.get(
                        constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                    if materialized_views is None:
                        materialized_views = {}
                        materialized_views["views"] = {}

                    if table_name not in materialized_views["views"]:
                        materialized_views["views"][table_name] = {}
                    update_dict_with_dict(
                        materialized_views["views"][table_name], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "query_modified": True,
                        })
                    rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                           materialized_views)

            # Add table_name and its dependencies to runs
            context.log.info(f"Adding {table_name} to runs")
            if table_name not in graph.nodes:
                graph.add_node(table_name)

            materialized_views: dict = rp.get(
                constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
            if materialized_views is None:
                materialized_views = {}
                materialized_views["views"] = {}
            if table_name in materialized_views["views"]:
                for dep in materialized_views["views"][table_name][
                        "depends_on"]:
                    context.log.info(
                        f"Adding {dep} to runs as dependency of {table_name}")
                    if dep not in graph.nodes:
                        graph.add_node(dep)
                    graph.add_edge(dep, table_name)

        context.log.info(f"Graph edges: {graph.edges()}")

        # Get topological order
        order = list(nx.topological_sort(graph))

        # Filter out views that are not on materialized_views["views"]
        order = [o for o in order if o in materialized_views["views"]]

        # Log topological order
        context.log.info(f"Order: {order}")

        # Execute queries in topological order
        for q in order:
            yield DynamicOutput(
                {
                    "view_name": q,
                    "materialization_lock": materialization_lock
                },
                mapping_key=q.replace(".", "_"))
    except Exception as e:
        try:
            materialization_lock.release()
        except:
            pass
        raise e
Example #11
def dynamic_numbers(_):
    yield DynamicOutput(1, mapping_key="1")
    yield DynamicOutput(2, mapping_key="2")
Example #12
def dynamic_solid(_):
    yield DynamicOutput(1, mapping_key="mapping_1")
    yield DynamicOutput(2, mapping_key="mapping_2")
def emit(_context, range_input):
    for i in range(range_input):
        yield DynamicOutput(value=i, mapping_key=str(i))
def emit(context):
    for i in range(context.solid_config["range"]):
        yield DynamicOutput(value=i, mapping_key=str(i))
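The config-driven emit above reads its range from context.solid_config. A hedged sketch of the run config that would drive it, assuming the solid is wired into a pipeline under the name emit:

run_config = {
    "solids": {
        "emit": {
            "config": {
                "range": 5,  # emit yields DynamicOutputs with values 0 through 4
            }
        }
    }
}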
Example #15
def dyn_desc(_) -> Iterator[DynamicOutput]:
    """
    Returns:
        numbers
    """
    yield DynamicOutput(4, "4")
Example #16
def dynamic_add(_, x):
    yield DynamicOutput(x + 1, mapping_key="1")
    yield DynamicOutput(x + 2, mapping_key="2")
def should_fail(_):
    yield DynamicOutput(True, mapping_key="dunk")
    yield DynamicOutput(True, mapping_key="dunk")
def emit(context):
    if context.solid_config["fail"]:
        raise Exception("FAILURE")

    for i in range(context.solid_config["range"]):
        yield DynamicOutput(value=i, mapping_key=str(i))
def should_work(_):
    yield DynamicOutput(1, mapping_key="1")
    yield DynamicOutput(2, mapping_key="2")
def dynamic_echo(_, nums):
    for x in nums:
        yield DynamicOutput(value=x, mapping_key=str(x))
Example #21
def emit(_):
    for i in range(3):
        yield DynamicOutput(value=i, mapping_key=str(i))
Example #22
def numbers(_):
    for i in range(3):
        yield DynamicOutput(i, mapping_key=str(i))