Example #1
    def gen_flow(self) -> Flow:
        if self.parallel:
            # generate a separate task for each account
            supermetrics_downloads = apply_map(self.gen_supermetrics_task,
                                               self.ds_accounts,
                                               flow=self)
        else:
            supermetrics_downloads = self.gen_supermetrics_task(
                ds_accounts=self.ds_accounts, flow=self)

        csv_to_blob_storage_task.bind(
            from_path=self.local_file_path,
            to_path=self.blob_path,
            overwrite=self.overwrite_blob,
            flow=self,
        )
        blob_to_azure_sql_task.bind(
            blob_path=self.blob_path,
            schema=self.schema,
            table=self.table,
            dtypes=self.dtypes,
            sep=self.sep,
            if_exists=self.if_exists,
            flow=self,
        )

        csv_to_blob_storage_task.set_upstream(supermetrics_downloads,
                                              flow=self)
        blob_to_azure_sql_task.set_upstream(csv_to_blob_storage_task,
                                            flow=self)
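
Example #1 builds a Supermetrics extract in Prefect 1.x's imperative style: apply_map fans one task out per account, bind attaches each downstream task to the flow with its arguments, and set_upstream adds explicit state dependencies. A minimal runnable sketch of that fan-out, assuming prefect<2.0 and with hypothetical task and account names:

from prefect import Flow, Task, task, apply_map

@task
def download_account(account: str) -> str:
    # stand-in for a per-account Supermetrics download
    return f"rows for {account}"

def gen_download(account: str, flow: Flow = None) -> Task:
    # apply_map invokes this once per list element, so every
    # account becomes its own task node in the flow
    return download_account.bind(account, flow=flow)

flow = Flow("fan-out-per-account")
downloads = apply_map(gen_download, ["acct_a", "acct_b"], flow=flow)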
Example #2
    def gen_flow(self) -> Flow:
        # start all extract flow runs in parallel
        extract_flow_runs = apply_map(self.gen_start_flow_run_task,
                                      self.extract_flows_names,
                                      flow=self)
        transform_flow_run = start_flow_run_task_2.bind(
            flow_name=self.transform_flow_name,
            project_name=self.project_name,
            flow=self,
        )
        # the transform flow starts only after every extract run finishes
        transform_flow_run.set_upstream(extract_flow_runs, flow=self)
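
Example #2 is a "flow of flows": each extract flow is kicked off as its own flow run, and the transform flow starts only once every extract run has finished. The tasks here are presumably built on Prefect 1.x's StartFlowRun; a hedged sketch of that pattern, with hypothetical flow and project names:

from prefect import Flow
from prefect.tasks.prefect import StartFlowRun

# wait=True makes the task block until the child flow run finishes
start_flow_run = StartFlowRun(project_name="my-project", wait=True)

with Flow("flow-of-flows") as flow:
    extracts = [start_flow_run(flow_name=name) for name in ("extract_a", "extract_b")]
    transform = start_flow_run(flow_name="transform", upstream_tasks=extracts)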
Example #3
    def gen_flow(self) -> Flow:
        if self.report_url:
            # fan out: one download task per filtered report URL, then union
            dfs = apply_map(self.gen_c4c_report_months,
                            self.report_urls_with_filters,
                            flow=self)
            df = union_dfs_task.bind(dfs, flow=self)
        elif self.url:
            # single direct download from the C4C endpoint
            df = self.gen_c4c(
                url=self.url,
                report_url=self.report_url,
                env=self.env,
                endpoint=self.endpoint,
                params=self.params,
                flow=self,
            )

        df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self)

        if self.output_file_extension == ".parquet":
            df_to_file = df_to_parquet_task.bind(
                df=df_with_metadata,
                path=self.local_file_path,
                if_exists=self.if_exists,
                flow=self,
            )
        else:
            df_to_file = df_to_csv_task.bind(
                df=df_with_metadata,
                path=self.local_file_path,
                if_exists=self.if_exists,
                flow=self,
            )

        file_to_adls_task.bind(
            from_path=self.local_file_path,
            to_path=self.adls_file_path,
            sp_credentials_secret=self.adls_sp_credentials_secret,
            flow=self,
        )

        df_with_metadata.set_upstream(df, flow=self)
        df_to_file.set_upstream(df_with_metadata, flow=self)
        file_to_adls_task.set_upstream(df_to_file, flow=self)

        set_key_value(key=self.adls_dir_path, value=self.adls_file_path)
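
The closing set_key_value call records where the newest output file landed so other flows can look it up. This matches Prefect 1.x's backend KV store API, which the helper presumably wraps; a short sketch, assuming a Prefect Cloud backend and a hypothetical key:

from prefect.backend import set_key_value, get_key_value

# store the latest output path under a well-known key ...
set_key_value(key="raw/c4c", value="raw/c4c/2021-09-01.parquet")
# ... so a downstream flow can read it back later
latest_file = get_key_value(key="raw/c4c")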
Example #4
    def gen_flow(self) -> Flow:
        if self.parallel:
            # generate a separate task for each account
            dfs = apply_map(self.gen_supermetrics_task, self.ds_accounts, flow=self)
            df = union_dfs_task.bind(dfs, flow=self)
        else:
            df = self.gen_supermetrics_task(ds_accounts=self.ds_accounts, flow=self)

        write_json = write_to_json.bind(
            dict_=self.expectation_suite,
            path=os.path.join(
                self.expectations_path, self.expectation_suite_name + ".json"
            ),
            flow=self,
        )

        validation = validation_task.bind(
            df=df,
            expectations_path=self.expectations_path,
            expectation_suite_name=self.expectation_suite_name,
            evaluation_parameters=self.evaluation_parameters,
            keep_output=self.keep_validation_output,
            flow=self,
        )

        if not self.keep_validation_output:
            validation_cleanup = cleanup_validation_clutter.bind(
                expectations_path=self.expectations_path, flow=self
            )
            validation_cleanup.set_upstream(validation, flow=self)
            # downstream tasks wait for the cleanup to finish
            validation_upstream = validation_cleanup
        else:
            # downstream tasks wait on the validation itself
            validation_upstream = validation

        df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self)
        dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self)

        df_to_be_loaded = df_mapp_mixed_dtypes_for_parquet(
            df_with_metadata, dtypes_dict, flow=self
        )

        if self.output_file_extension == ".parquet":
            df_to_file = df_to_parquet_task.bind(
                df=df_to_be_loaded,
                path=self.local_file_path,
                if_exists=self.if_exists,
                flow=self,
            )
        else:
            df_to_file = df_to_csv_task.bind(
                df=df_with_metadata,
                path=self.local_file_path,
                if_exists=self.if_exists,
                flow=self,
            )

        file_to_adls_task.bind(
            from_path=self.local_file_path,
            to_path=self.adls_file_path,
            overwrite=self.overwrite_adls,
            sp_credentials_secret=self.adls_sp_credentials_secret,
            vault_name=self.vault_name,
            flow=self,
        )

        dtypes_updated = update_dtypes_dict(dtypes_dict, flow=self)
        dtypes_to_json_task.bind(
            dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self
        )
        json_to_adls_task.bind(
            from_path=self.local_json_path,
            to_path=self.adls_schema_file_dir_file,
            overwrite=self.overwrite_adls,
            sp_credentials_secret=self.adls_sp_credentials_secret,
            vault_name=self.vault_name,
            flow=self,
        )

        write_json.set_upstream(df, flow=self)
        validation.set_upstream(write_json, flow=self)
        df_with_metadata.set_upstream(validation_upstream, flow=self)
        df_to_be_loaded.set_upstream(dtypes_dict, flow=self)
        dtypes_dict.set_upstream(df_with_metadata, flow=self)

        dtypes_to_json_task.set_upstream(dtypes_updated, flow=self)
        file_to_adls_task.set_upstream(df_to_file, flow=self)
        json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self)
        set_key_value(key=self.adls_dir_path, value=self.adls_file_path)
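
Beyond the Great Expectations steps, the wiring trick worth noting in Example #4 is validation_upstream: when the validation output is discarded, a cleanup task is spliced in after the validation and becomes the gate that downstream tasks wait on; otherwise they wait on the validation itself. A condensed runnable sketch of that gate, assuming Prefect 1.x and with hypothetical tasks standing in for the viadot ones:

from prefect import Flow, task

@task
def validate(df: str) -> str:
    return df

@task
def cleanup_clutter() -> None:
    pass

@task
def add_metadata(df: str) -> str:
    return df + "+metadata"

def gen_flow(keep_validation_output: bool) -> Flow:
    flow = Flow("validation-gate")
    validation = validate.bind(df="df", flow=flow)
    if not keep_validation_output:
        clean = cleanup_clutter.bind(flow=flow)
        clean.set_upstream(validation, flow=flow)
        gate = clean  # downstream waits for the cleanup
    else:
        gate = validation  # downstream waits on validation directly
    enriched = add_metadata.bind(df="df", flow=flow)
    enriched.set_upstream(gate, flow=flow)
    return flow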