def should_work(_):
    yield DynamicOutput(1, output_name="numbers", mapping_key="1")
    yield DynamicOutput(2, output_name="numbers", mapping_key="2")
    yield DynamicOutput("a", output_name="letters", mapping_key="a")
    yield DynamicOutput("b", output_name="letters", mapping_key="b")
    yield DynamicOutput("c", output_name="letters", mapping_key="c")
    yield Output("*", "wildcard")
def test_invalid_mapping_keys():
    with pytest.raises(DagsterInvalidDefinitionError):
        DynamicOutput(True, mapping_key="")
    with pytest.raises(DagsterInvalidDefinitionError):
        DynamicOutput(True, mapping_key="?")
    with pytest.raises(DagsterInvalidDefinitionError):
        DynamicOutput(True, mapping_key="foo.baz")
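# As the test above shows, mapping keys may only contain letters, digits,
# and underscores. A minimal sketch of sanitizing arbitrary strings before
# using them as mapping keys; `sanitize_mapping_key` is a hypothetical
# helper, not part of the Dagster API, and empty inputs still need
# separate handling since "" remains invalid after sanitizing.
import re

def sanitize_mapping_key(raw: str) -> str:
    # Replace every disallowed character with an underscore.
    return re.sub(r"[^A-Za-z0-9_]", "_", raw)

assert sanitize_mapping_key("foo.baz") == "foo_baz"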
def get_gtfs_files(context, original_filepath):
    feed_files = gk.list_feed(original_filepath)['file_name']
    for item in feed_files:
        # Use the file's stem (name without extension) as the mapping key
        filename = Path(item).stem
        yield DynamicOutput(filename, mapping_key=filename, output_name='filename')
def solid2(_, _input1):
    for i in range(4):
        yield DynamicOutput(
            7,
            mapping_key=str(i),
            output_name="output2",
            metadata_entries=[entry2],
        )
def files_in_directory(context):
    path = context.solid_config["path"]
    dirname, _, filenames = next(os.walk(path))
    for file in filenames:
        yield DynamicOutput(
            value=os.path.join(dirname, file),
            # create a mapping key from the file name
            mapping_key=file.replace(".", "_").replace("-", "_"),
        )
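# A dynamic output such as files_in_directory is normally fanned out with
# .map() and fanned back in with .collect(). A minimal composition sketch,
# assuming hypothetical process_file and summarize solids:
@pipeline
def process_directory():
    file_results = files_in_directory().map(process_file)
    summarize(file_results.collect())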
def ingest_metadata_type(
    context: AbstractComputeExecutionContext,
    result: list[JobId],
    scratch_dataset_name: HcaScratchDatasetName,
) -> Iterator[MetadataTypeFanoutResult]:
    """
    For each metadata type, return a dynamic output over which we can later map.
    This saves us from hardcoding solids for each type.
    """
    for metadata_type in context.solid_config["metadata_types"]:
        yield DynamicOutput(
            value=MetadataTypeFanoutResult(
                scratch_dataset_name,
                metadata_type.value,
                context.solid_config["prefix"],
            ),
            mapping_key=metadata_type.value,
            output_name="table_fanout_result",
        )
def resolve_dependencies_and_execute(context, queries_names,
                                     materialization_locked: bool,
                                     materialization_lock):
    try:
        # Set up directed graph for DAG sorting
        graph = nx.DiGraph()
        # Get dependencies
        dependencies = {}
        rp = RedisPal(constants.REDIS_HOST.value)
        materialized_views: dict = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
        if materialized_views:
            for query_name in queries_names:
                if (query_name in materialized_views["views"] and
                        materialized_views["views"][query_name]["materialized"]):
                    graph.add_node(query_name)
                    dependencies[query_name] = materialized_views["views"][
                        query_name]["depends_on"]
                else:
                    context.log.warning(
                        f"{query_name} not found on Redis! Skipping...")
        # Log dependencies
        context.log.info(f"Dependencies: {dependencies}")
        # Add edges to graph
        for query_name in queries_names:
            if query_name in dependencies:
                for dep in dependencies[query_name]:
                    if dep in graph.nodes:
                        graph.add_edge(dep, query_name)
        context.log.info(f"Graph: {graph.edges()}")
        # Get topological order
        order = list(nx.topological_sort(graph))
        # Log topological order
        context.log.info(f"Order: {order}")
        # Emit queries in topological order
        for q in order:
            yield DynamicOutput(q, mapping_key=q.replace(".", "_"))
    except:
        # Release the lock on any failure, then re-raise
        locks.release(materialization_lock)
        raise
def get_runs(context, execution_date):
    execution_date = datetime.strptime(execution_date, "%Y-%m-%d")
    now = execution_date + timedelta(hours=11, minutes=30)
    this_time_yesterday = now - timedelta(days=1)
    min_timestamp = convert_datetime_to_unix_time(this_time_yesterday)
    max_timestamp = convert_datetime_to_unix_time(now)
    context.log.info(f"{execution_date} of type {type(execution_date)}")
    ftp_client = connect_ftp(os.getenv("FTPS_HOST"),
                             os.getenv("FTPS_USERNAME"),
                             os.getenv("FTPS_PWD"))
    # Change to working directory
    ftp_client.cwd("/")
    for folder in ftp_client.mlsd():
        # Config yaml file will be folder_fileprefix.yaml
        if folder[1]["type"] == "dir" and folder[0] in ALLOWED_FOLDERS:
            # CWD to folder
            context.log.info(f"Entering folder {folder[0]}")
            folder_name = folder[0].lower()
            # Read file list
            for filepath in ftp_client.mlsd(folder_name):
                filename = filepath[0]
                fileprefix = filename.split("_")[0].lower()
                timestamp = filepath[1]["modify"]
                file_mtime = datetime.timestamp(parser.parse(timestamp))
                if file_mtime >= min_timestamp and file_mtime < max_timestamp:
                    # Download file to local folder
                    try:
                        config = read_config(
                            Path(__file__).parent /
                            f"{folder_name}_{fileprefix}.yaml")
                        table_id = config["resources"]["basedosdados_config"][
                            "config"]["table_id"]
                        # Use a raw string for the regex to avoid an
                        # invalid escape sequence
                        date = tuple(re.findall(r"\d+", filename))
                        ano = date[2][:4]
                        mes = date[2][4:6]
                        dia = date[2][6:]
                        relative_filepath = Path(
                            "raw/br_rj_riodejaneiro_rdo",
                            table_id,
                            f"ano={ano}",
                            f"mes={mes}",
                            f"dia={dia}",
                        )
                        local_filepath = Path(FTPS_DIRECTORY, relative_filepath)
                        Path(local_filepath).mkdir(parents=True, exist_ok=True)
                        ftp_path = str(Path(folder_name, filename))
                        local_path = str(Path(local_filepath, filename))
                        # Run pipeline
                        config["solids"]["download_file_from_ftp"]["inputs"] = {
                            "ftp_path": {"value": ftp_path},
                            "local_path": {"value": local_path},
                        }
                        config["solids"]["parse_file_path_and_partitions"][
                            "inputs"]["bucket_path"][
                                "value"] = f"{relative_filepath}/{filename}"
                        config["solids"]["upload_file_to_storage"] = {
                            "inputs": {"file_path": {"value": local_path}}
                        }
                        yield DynamicOutput(
                            config,
                            mapping_key=
                            f"{folder_name}_{fileprefix}_{uuid.uuid4().hex}",
                        )
                    except jinja2.TemplateNotFound:
                        context.log.warning(
                            f"Config file for file {filename} was not found. "
                            "Skipping file.")
                        context.log.warning(f"{Path(__file__).parent}")
            ftp_client.cwd("/")
        else:
            context.log.warning(
                f"Skipping file {folder[0]} since it is not inside a folder")
            continue
def get_configs_for_materialized_view(context, query_names: list,
                                      materialization_locked: bool,
                                      materialization_lock) -> dict:
    """Retrieves configs for materialized views."""
    try:
        for query_name in query_names:
            # Split query name into dataset_name and view_name
            dataset_name, view_name = query_name.split(".")
            # Load configs from GCS
            view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.yaml'
            defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name)}/defaults.yaml'
            context.log.info(f"Defaults blob: {defaults_yaml}")
            context.log.info(f"View blob: {view_yaml}")
            defaults_blob = get_blob(defaults_yaml, SENSOR_BUCKET, mode="staging")
            view_blob = get_blob(view_yaml, SENSOR_BUCKET, mode="staging")
            if defaults_blob is None:
                raise Exception(f"Blob {defaults_yaml} not found!")
            defaults_dict = yaml.safe_load(defaults_blob.download_as_string())
            if view_blob:
                view_dict = yaml.safe_load(view_blob.download_as_string())
            else:
                context.log.warning(
                    f"Blob {view_yaml} not found. This is not an error.")
                view_dict = {}
            # Merge configs
            query_params = {**defaults_dict, **view_dict}
            # Build base configs
            now = datetime.datetime.now(pytz.timezone("America/Sao_Paulo"))
            run_key = build_run_key(query_name, now)
            with open(
                    str(Path(__file__).parent /
                        "materialized_views_base_config.yaml"), "r") as f:
                base_params: dict = yaml.safe_load(f)
            base_params["run_timestamp"] = "'{}'".format(
                convert_datetime_to_datetime_string(now))
            base_params["maestro_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_REPOSITORY.value,
                                 constants.MAESTRO_DEFAULT_BRANCH.value))
            base_params["maestro_bq_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_BQ_REPOSITORY.value,
                                 constants.MAESTRO_BQ_DEFAULT_BRANCH.value))
            base_params["run_key"] = "'{}'".format(run_key)
            # A few more params
            r = Redis(constants.REDIS_HOST.value)
            rp = RedisPal(constants.REDIS_HOST.value)
            lock = Redlock(
                key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
                masters=[r],
                auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
            )
            table_name = parse_filepath_to_tablename(view_yaml)
            with lock:
                managed = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if managed is None:
                    managed = {}
                    managed["views"] = {}
                if query_name not in managed["views"]:
                    raise Exception(
                        f"Query {query_name} not found in managed views: {managed}")
                d = managed["views"][query_name]
                changed = d["query_modified"]
                context.log.info(f"{query_name} changed: {changed}")
                d["query_modified"] = False
                last_run = d["last_run"]
                d["last_run"] = now
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       managed)
            # Get query on GCS
            query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.sql'
            query_blob = get_blob(query_file, SENSOR_BUCKET, mode="staging")
            if query_blob is None:
                raise Exception(f"Blob {query_file} not found!")
            base_query = query_blob.download_as_string().decode("utf-8")
            # Get parent queries on GCS. Use a distinct loop variable so
            # the outer `query_name` is not shadowed.
            parent_queries = {}
            for parent_name in d["depends_on"]:
                if (parent_name in managed["views"] and
                        managed["views"][parent_name]["materialized"]):
                    continue
                query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:2]))}.sql'
                query_blob = get_blob(query_file, SENSOR_BUCKET, mode="staging")
                if query_blob is None:
                    context.log.warning(
                        f'Blob for parent query "{query_file}" not found, skipping...')
                    continue
                parent_view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:2]))}.yaml'
                parent_view_blob = get_blob(parent_view_yaml,
                                            SENSOR_BUCKET,
                                            mode="staging")
                if parent_view_blob is not None:
                    parent_view_dict = yaml.safe_load(
                        parent_view_blob.download_as_string())
                else:
                    parent_view_dict = {}
                parent_defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:1]))}/defaults.yaml'
                parent_defaults_blob = get_blob(parent_defaults_yaml,
                                                SENSOR_BUCKET,
                                                mode="staging")
                if parent_defaults_blob is not None:
                    parent_defaults_dict = yaml.safe_load(
                        parent_defaults_blob.download_as_string())
                else:
                    context.log.warning(
                        f'Blob for parent query "{parent_defaults_yaml}" not found, skipping...')
                    continue
                parent_queries[parent_name] = {
                    "base_query":
                    query_blob.download_as_string().decode("utf-8"),
                    "query_params": {
                        **parent_defaults_dict,
                        **parent_view_dict
                    },
                }
            context.log.info(f"Parent queries: {parent_queries}")
            # Build configs:
            # - table_name: str
            # - changed: bool
            # - base_query: str
            # - base_params: dict
            # - query_params: dict
            # - now: str
            # - last_run: str
            date_ranges = get_date_ranges(
                last_run if last_run else
                query_params["backfill"]["start_timestamp"],
                query_params["backfill"]["interval"], now)
            context.log.info(f"{date_ranges}")
            for i, _ in enumerate(date_ranges[:-1]):
                configs = {
                    "table_name": table_name,
                    "changed": changed if i == 0 else False,
                    "base_query": base_query,
                    "base_params": base_params,
                    "query_params": query_params,
                    "now": date_ranges[i + 1],
                    "last_run": date_ranges[i],
                    "parent_queries": parent_queries,
                }
                yield DynamicOutput(
                    {
                        "config_dict": configs,
                        "materialization_lock": materialization_lock,
                    },
                    mapping_key=(
                        f'{configs["table_name"]}_{configs["last_run"]}_{configs["now"]}'
                        .replace(".", "_").replace("-", "_")
                        .replace(" ", "_").replace(":", "_")))
    except Exception as e:
        try:
            locks.release(materialization_lock)
        except Exception:
            pass
        raise e
def update_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    try:
        # Set up Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        views_lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )
        # Initialize graph
        graph = nx.DiGraph()
        # If blob_name ends with "defaults.yaml", we need to either add it
        # to Redis or update its values, and add runs for every child it
        # has and their dependencies.
        for blob_name in [b for b in blob_names if b.endswith("defaults.yaml")]:
            # Get dataset name
            blob_path = "/".join(
                [n for n in blob_name.split("/") if n != ""][:-1])
            dataset_name: str = blob_path.split("/")[-1]
            context.log.info("#" * 80)
            context.log.info(f"Updating {dataset_name} defaults")
            # Read the blob
            blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
            if blob is None:
                raise Exception(f"Blob {blob_name} not found")
            blob_dict: dict = yaml.safe_load(blob.download_as_string())
            # Add it to Redis
            with views_lock:
                materialized_views: dict = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if materialized_views is None:
                    materialized_views = {}
                    materialized_views["views"] = {}
                # Add every child to Redis
                if "views" not in blob_dict:
                    raise Exception(
                        f"Malformed blob (missing views key): {blob_name}")
                for key in blob_dict["views"].keys():
                    # Build key with dataset_name
                    m_key = f"{dataset_name}.{key}"
                    # This child also needs a run
                    context.log.info(f"Adding {m_key} to runs")
                    if m_key not in graph.nodes:
                        graph.add_node(m_key)
                    # Avoid KeyError
                    if "views" not in materialized_views:
                        materialized_views["views"] = {}
                    # Add to Redis
                    if m_key not in materialized_views["views"]:
                        materialized_views["views"][m_key] = {}
                    update_dict_with_dict(
                        materialized_views["views"][m_key], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "materialized": blob_dict["views"][key]["materialized"],
                            "query_modified": True,
                            "depends_on": blob_dict["views"][key]["depends_on"],
                        })
                    # Add dependencies to runs
                    for dep in blob_dict["views"][key]["depends_on"]:
                        context.log.info(
                            f"Adding {dep} to runs as dependency of {m_key}")
                        if dep not in graph.nodes:
                            graph.add_node(dep)
                        graph.add_edge(dep, m_key)
                    # Try to find specific values for this view. Note the
                    # "/" separator between the dataset path and the view
                    # name, which the original concatenation was missing.
                    blob = get_blob(f"{blob_path}/{key}.yaml",
                                    SENSOR_BUCKET,
                                    mode="staging")
                    if blob:
                        # Replace values in Redis
                        specific = yaml.safe_load(
                            blob.download_as_string().decode("utf-8"))
                        materialized_views["views"][m_key][
                            "cron_expression"] = specific["scheduling"]["cron"]
                    else:
                        context.log.warning(
                            f"No specific values for {m_key} found. "
                            "This is not an error.")
                # Update Redis effectively
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       materialized_views)
        # Otherwise, we need to add the blob_name and its
        # dependencies to the graph.
        for blob_name in [
                b for b in blob_names if not b.endswith("defaults.yaml")
        ]:
            # Get table name
            file_name = ".".join(blob_name.split("/")[-2:])
            table_name = ".".join(file_name.split(".")[:-1])
            context.log.info("#" * 80)
            context.log.info(f"Updating {table_name} specific values...")
            # If it's a YAML file, update values on Redis
            if blob_name.endswith(".yaml"):
                # Read the blob
                blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
                if blob is None:
                    raise Exception(f"Blob {blob_name} not found")
                blob_dict: dict = yaml.safe_load(blob.download_as_string())
                # Update Redis
                with views_lock:
                    materialized_views: dict = rp.get(
                        constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                    if materialized_views is None:
                        materialized_views = {}
                        materialized_views["views"] = {}
                    if table_name not in materialized_views["views"]:
                        materialized_views["views"][table_name] = {}
                    update_dict_with_dict(
                        materialized_views["views"][table_name], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "query_modified": True,
                        })
                    rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                           materialized_views)
            # Add table_name and its dependencies to runs
            context.log.info(f"Adding {table_name} to runs")
            if table_name not in graph.nodes:
                graph.add_node(table_name)
            materialized_views: dict = rp.get(
                constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
            if materialized_views is None:
                materialized_views = {}
                materialized_views["views"] = {}
            if table_name in materialized_views["views"]:
                for dep in materialized_views["views"][table_name]["depends_on"]:
                    context.log.info(
                        f"Adding {dep} to runs as dependency of {table_name}")
                    if dep not in graph.nodes:
                        graph.add_node(dep)
                    graph.add_edge(dep, table_name)
        context.log.info(f"Graph edges: {graph.edges()}")
        # Get topological order
        order = list(nx.topological_sort(graph))
        # Filter out views that are not in materialized_views["views"]
        order = [o for o in order if o in materialized_views["views"]]
        # Log topological order
        context.log.info(f"Order: {order}")
        # Emit views in topological order
        for q in order:
            yield DynamicOutput(
                {
                    "view_name": q,
                    "materialization_lock": materialization_lock,
                },
                mapping_key=q.replace(".", "_"))
    except Exception as e:
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise e
def dynamic_numbers(_):
    yield DynamicOutput(1, mapping_key="1")
    yield DynamicOutput(2, mapping_key="2")
def dynamic_solid(_):
    yield DynamicOutput(1, mapping_key="mapping_1")
    yield DynamicOutput(2, mapping_key="mapping_2")
def emit(_context, range_input):
    for i in range(range_input):
        yield DynamicOutput(value=i, mapping_key=str(i))
def emit(context):
    for i in range(context.solid_config["range"]):
        yield DynamicOutput(value=i, mapping_key=str(i))
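# A minimal run-config sketch for the config-driven emitter above, using
# the legacy solid-style config schema (the solid is assumed to be
# registered under the name "emit"):
run_config = {"solids": {"emit": {"config": {"range": 3}}}}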
def dyn_desc(_) -> Iterator[DynamicOutput]:
    """
    Returns: numbers
    """
    yield DynamicOutput(4, "4")
def dynamic_add(_, x):
    yield DynamicOutput(x + 1, mapping_key="1")
    yield DynamicOutput(x + 2, mapping_key="2")
def should_fail(_):
    # Duplicate mapping keys are invalid, so this is expected to fail
    yield DynamicOutput(True, mapping_key="dunk")
    yield DynamicOutput(True, mapping_key="dunk")
def emit(context):
    if context.solid_config["fail"]:
        raise Exception("FAILURE")
    for i in range(context.solid_config["range"]):
        yield DynamicOutput(value=i, mapping_key=str(i))
def should_work(_):
    yield DynamicOutput(1, mapping_key="1")
    yield DynamicOutput(2, mapping_key="2")
def dynamic_echo(_, nums):
    for x in nums:
        yield DynamicOutput(value=x, mapping_key=str(x))
def emit(_):
    for i in range(3):
        yield DynamicOutput(value=i, mapping_key=str(i))
def numbers(_):
    for i in range(3):
        yield DynamicOutput(i, mapping_key=str(i))
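# Putting the pattern together: a minimal pipeline sketch that fans the
# numbers solid out over a hypothetical double solid and fans back in with
# .collect(). It assumes numbers is declared with
# output_defs=[DynamicOutputDefinition()], which the snippets above omit.
@solid
def double(_, x):
    return x * 2

@solid
def total(_, values):
    return sum(values)

@pipeline
def dynamic_pipeline():
    total(numbers().map(double).collect())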