def gen_flow(self) -> Flow:
    if self.parallel:
        # generate a separate task for each account
        supermetrics_downloads = apply_map(
            self.gen_supermetrics_task, self.ds_accounts, flow=self
        )
    else:
        supermetrics_downloads = self.gen_supermetrics_task(
            ds_accounts=self.ds_accounts, flow=self
        )

    csv_to_blob_storage_task.bind(
        from_path=self.local_file_path,
        to_path=self.blob_path,
        overwrite=self.overwrite_blob,
        flow=self,
    )
    blob_to_azure_sql_task.bind(
        blob_path=self.blob_path,
        schema=self.schema,
        table=self.table,
        dtypes=self.dtypes,
        sep=self.sep,
        if_exists=self.if_exists,
        flow=self,
    )

    csv_to_blob_storage_task.set_upstream(supermetrics_downloads, flow=self)
    blob_to_azure_sql_task.set_upstream(csv_to_blob_storage_task, flow=self)
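# The gen_flow methods in this section all use Prefect 1.x's imperative API:
# `task.bind(...)` attaches a task and its inputs to a flow, while
# `set_upstream(...)` adds a state-only ordering dependency. A minimal,
# self-contained sketch of that pattern, assuming Prefect 1.x is installed;
# every name below is an illustrative toy, not part of this codebase:

from prefect import Flow, Task


class ToyDownload(Task):
    def run(self):
        print("downloading data.csv")


class ToyUpload(Task):
    def run(self, path):
        print(f"uploading {path}")


toy_download = ToyDownload()
toy_upload = ToyUpload()

toy_flow = Flow("toy_imperative_api")
toy_download.bind(flow=toy_flow)
toy_upload.bind(path="data.csv", flow=toy_flow)  # data inputs go through bind
toy_upload.set_upstream(toy_download, flow=toy_flow)  # ordering via set_upstream
toy_flow.run()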
def gen_flow(self) -> Flow:
    extract_flow_runs = apply_map(
        self.gen_start_flow_run_task, self.extract_flows_names, flow=self
    )
    transform_flow_run = start_flow_run_task_2.bind(
        flow_name=self.transform_flow_name,
        project_name=self.project_name,
        flow=self,
    )
    transform_flow_run.set_upstream(extract_flow_runs, flow=self)
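# `apply_map` (Prefect 1.x) maps a pipeline-building function over an
# iterable at flow-construction time, which is how each name in
# extract_flows_names above gets its own flow-run task, all upstream of the
# single transform run. A toy sketch, assuming Prefect 1.x; the names below
# are illustrative:

from prefect import Flow, apply_map, task


@task
def run_toy_extract(name):
    print(f"running extract flow: {name}")


def gen_toy_extract_run(name):
    return run_toy_extract(name)


with Flow("toy_apply_map") as toy_flow:
    toy_runs = apply_map(gen_toy_extract_run, ["extract_a", "extract_b"])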
def gen_flow(self) -> Flow:
    # either download the report month by month (report_url) or query the
    # API endpoint directly (url); one of the two is expected to be set
    if self.report_url:
        dfs = apply_map(
            self.gen_c4c_report_months, self.report_urls_with_filters, flow=self
        )
        df = union_dfs_task.bind(dfs, flow=self)
    elif self.url:
        df = self.gen_c4c(
            url=self.url,
            report_url=self.report_url,
            env=self.env,
            endpoint=self.endpoint,
            params=self.params,
            flow=self,
        )

    df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self)

    if self.output_file_extension == ".parquet":
        df_to_file = df_to_parquet_task.bind(
            df=df_with_metadata,
            path=self.local_file_path,
            if_exists=self.if_exists,
            flow=self,
        )
    else:
        df_to_file = df_to_csv_task.bind(
            df=df_with_metadata,
            path=self.local_file_path,
            if_exists=self.if_exists,
            flow=self,
        )

    file_to_adls_task.bind(
        from_path=self.local_file_path,
        to_path=self.adls_file_path,
        sp_credentials_secret=self.adls_sp_credentials_secret,
        flow=self,
    )

    df_with_metadata.set_upstream(df, flow=self)
    df_to_file.set_upstream(df_with_metadata, flow=self)
    file_to_adls_task.set_upstream(df_to_file, flow=self)

    # record the path of the newest extract in the Prefect KV store
    set_key_value(key=self.adls_dir_path, value=self.adls_file_path)
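# The final `set_key_value` call stores the latest ADLS file path in the
# Prefect backend's key-value store, keyed by the directory path, so other
# flows can discover the newest extract. A retrieval sketch, assuming a
# configured Prefect Cloud/Server backend; the key below is hypothetical:

from prefect.backend import get_key_value

latest_file_path = get_key_value(key="raw/c4c/report_dir")
print(latest_file_path)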
def gen_flow(self) -> Flow:
    if self.parallel:
        # generate a separate task for each account
        dfs = apply_map(self.gen_supermetrics_task, self.ds_accounts, flow=self)
        df = union_dfs_task.bind(dfs, flow=self)
    else:
        df = self.gen_supermetrics_task(ds_accounts=self.ds_accounts, flow=self)

    # materialize the expectation suite so the validation task can read it
    write_json = write_to_json.bind(
        dict_=self.expectation_suite,
        path=os.path.join(
            self.expectations_path, self.expectation_suite_name + ".json"
        ),
        flow=self,
    )

    validation = validation_task.bind(
        df=df,
        expectations_path=self.expectations_path,
        expectation_suite_name=self.expectation_suite_name,
        evaluation_parameters=self.evaluation_parameters,
        keep_output=self.keep_validation_output,
        flow=self,
    )

    if not self.keep_validation_output:
        validation_cleanup = cleanup_validation_clutter.bind(
            expectations_path=self.expectations_path, flow=self
        )
        validation_cleanup.set_upstream(validation, flow=self)
        validation_upstream = validation_cleanup
    else:
        validation_upstream = validation

    df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self)
    dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self)
    # cast mixed-type object columns so they survive Parquet serialization
    df_to_be_loaded = df_map_mixed_dtypes_for_parquet(
        df_with_metadata, dtypes_dict, flow=self
    )

    if self.output_file_extension == ".parquet":
        df_to_file = df_to_parquet_task.bind(
            df=df_to_be_loaded,
            path=self.local_file_path,
            if_exists=self.if_exists,
            flow=self,
        )
    else:
        # CSV does not need the dtype mapping, so write the metadata-enriched frame
        df_to_file = df_to_csv_task.bind(
            df=df_with_metadata,
            path=self.local_file_path,
            if_exists=self.if_exists,
            flow=self,
        )

    file_to_adls_task.bind(
        from_path=self.local_file_path,
        to_path=self.adls_file_path,
        overwrite=self.overwrite_adls,
        sp_credentials_secret=self.adls_sp_credentials_secret,
        vault_name=self.vault_name,
        flow=self,
    )

    # upload the inferred schema alongside the data file
    dtypes_updated = update_dtypes_dict(dtypes_dict, flow=self)
    dtypes_to_json_task.bind(
        dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self
    )
    json_to_adls_task.bind(
        from_path=self.local_json_path,
        to_path=self.adls_schema_file_dir_file,
        overwrite=self.overwrite_adls,
        sp_credentials_secret=self.adls_sp_credentials_secret,
        vault_name=self.vault_name,
        flow=self,
    )

    write_json.set_upstream(df, flow=self)
    validation.set_upstream(write_json, flow=self)
    # gate the rest of the pipeline on validation (and its cleanup, if any)
    df_with_metadata.set_upstream(validation_upstream, flow=self)
    df_to_be_loaded.set_upstream(dtypes_dict, flow=self)
    dtypes_dict.set_upstream(df_with_metadata, flow=self)
    dtypes_to_json_task.set_upstream(dtypes_updated, flow=self)
    file_to_adls_task.set_upstream(df_to_file, flow=self)
    json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self)

    # record the path of the newest extract in the Prefect KV store
    set_key_value(key=self.adls_dir_path, value=self.adls_file_path)
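# The Parquet branch above maps dtypes first because pyarrow cannot
# serialize object columns holding mixed Python types. A conceptual sketch
# of that casting step, assuming dtypes_dict values like "Object"; this is
# an illustration, not this codebase's df_map_mixed_dtypes_for_parquet:

import pandas as pd


def toy_map_mixed_dtypes_for_parquet(df: pd.DataFrame, dtypes: dict) -> pd.DataFrame:
    df_mapped = df.copy()
    for col, dtype in dtypes.items():
        if dtype == "Object":
            # cast to string so each Parquet column has one consistent type
            df_mapped[col] = df_mapped[col].astype(str)
    return df_mapped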