def _internal_root_pipeline():
    root_pipeline = Pipeline(id='mara_singer',
                             description='Root pipeline of module mara_singer')

    root_pipeline.add(pipeline)

    return root_pipeline
def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description="Loads daily Euro exchange rates since 1999 from the European Central Bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql',
                            echo_queries=False)
             ]))

    pipeline.add(
        Task(id='load_exchange_rate',
             description='Loads exchange rates from the European Central Bank',
             commands=[
                 ReadScriptOutput(file_name='load_exchange_rate.py',
                                  target_table='euro_fx.exchange_rate',
                                  db_alias=db_alias)
             ]),
        upstreams=['create_schema_and_table'])

    pipeline.add(
        Task(id="postprocess_exchange_rate",
             description="Adds values for missing days",
             commands=[
                 ExecuteSQL(sql_file_name='postprocess_exchange_rate.sql',
                            echo_queries=False)
             ]),
        upstreams=['load_exchange_rate'])

    return pipeline
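# Usage sketch for the pipeline factory above (hedged: the 'dwh' db alias and the
# enclosing root pipeline are illustrative assumptions, not part of the original code):
example_root = Pipeline(id='exchange_rates_example', description='Example root pipeline')
example_root.add(euro_exchange_rates_pipeline(db_alias='dwh'))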
def add_schema_copying_to_pipeline(pipeline: Pipeline, schema_name,
                                   source_db_alias: str, target_db_alias: str,
                                   max_number_of_parallel_tasks: int = 4):
    """
    Adds schema copying to the end of a pipeline.

    When the pipeline already has a final node, then all of its commands except
    the last are run before the copying, and the last command after it.

    Args:
        pipeline: The pipeline to modify
        schema_name: The schema to copy
        source_db_alias: The alias of the PostgreSQL database to copy from
        target_db_alias: The alias of the PostgreSQL database to copy to
        max_number_of_parallel_tasks: How many operations to run in parallel at most
    """
    task_id = "copy_schema"
    description = f"Copies the {schema_name} schema to the {target_db_alias} db"
    commands = []

    if pipeline.final_node:
        assert isinstance(pipeline.final_node, Task)
        description = pipeline.final_node.description + ' + ' + description
        task_id = pipeline.final_node.id + '_and_' + task_id
        commands = pipeline.final_node.commands
        pipeline.remove(pipeline.final_node)

    pipeline.add_final(
        ParallelCopySchema(id=task_id, description=description,
                           schema_name=schema_name,
                           source_db_alias=source_db_alias,
                           target_db_alias=target_db_alias,
                           max_number_of_parallel_tasks=max_number_of_parallel_tasks,
                           commands_before=commands[:-1],
                           commands_after=commands[-1:]))
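# Usage sketch for add_schema_copying_to_pipeline (hedged: the 'dwh' and 'frontend'
# aliases and the example pipeline are illustrative assumptions):
example_pipeline = Pipeline(id='example', description='A pipeline whose schema gets copied')
add_schema_copying_to_pipeline(example_pipeline, schema_name='ec_dim',
                               source_db_alias='dwh', target_db_alias='frontend')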
def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description="Creates a utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql', echo_queries=False,
                            replace={'number_of_chunks': lambda: config.number_of_chunks()})
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension + '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in ['consistency_checks', 'data_sets', 'partitioning',
                                        'indexes_and_constraints', 'schema_switching', 'enums']:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
import pathlib

import etl_tools.utils
from mara_pipelines.commands.files import Compression
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ParallelReadFile, ReadMode
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="github",
    description="Builds a GitHub activity cube using the public GitHub archive data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "gh_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_repo_activity_file_dependencies = ["create_repo_activity_data_table.sql",
                                        "create_data_schema.sql"]

pipeline.add(
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL, Copy
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines import config

pipeline = Pipeline(
    id="load_marketing_data",
    description="Jobs related to loading marketing leads data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the marketing data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_marketing_data_schema.sql',
                        file_dependencies=[pathlib.Path(__file__).parent.parent
                                           / 'recreate_marketing_data_schema.sql'])
         ]))

tables = ['closed_deal', 'marketing_qualified_lead']

for table in tables:
    pipeline.add(
        Task(id=f"load_{table}",
             description=f'Loads the {table}s from the backend database',
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="marketing",
    description="Builds the Leads cube based on marketing and e-commerce data",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name="recreate_schemas.sql")]))

pipeline.add(
    Task(id="preprocess_lead",
         description="Preprocesses the marketing leads",
         commands=[ExecuteSQL(sql_file_name="preprocess_lead.sql")]))

pipeline.add(
    Task(id="transform_smaller_dimensions",
         description="Transforms smaller marketing dimensions",
         commands=[ExecuteSQL(sql_file_name="transform_smaller_dimensions.sql")]),
    upstreams=["preprocess_lead"])

pipeline.add(
    Task(id="transform_lead",
         description="Creates the lead dim table",
         commands=[
             ExecuteSQL(sql_file_name="transform_lead.sql",
import pathlib

from mara_pipelines.commands.python import RunFunction
from mara_pipelines.commands.sql import ExecuteSQL, Copy
from mara_pipelines.pipelines import Pipeline, Task
from mara_schema.config import data_sets
from mara_schema.sql_generation import data_set_sql_query

from .cstore_tables import create_cstore_table_for_query
from .. import initialize_db

pipeline = Pipeline(
    id="flatten_data_sets_for_metabase",
    description="Creates data set tables for Metabase (completely flattened, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'metabase'})

pipeline.add_initial(
    Task(id="initialize_schema",
         description="Recreates the metabase schema",
         commands=[
             ExecuteSQL(sql_statement="""
                        DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;
                        DROP SCHEMA IF EXISTS metabase_next CASCADE; CREATE SCHEMA metabase_next;
                        """,
                        echo_queries=False, db_alias='metabase-data-write'),
import pathlib

import etl_tools.utils
from mara_pipelines.commands.files import Compression
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ParallelReadFile, ReadMode
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="pypi",
    description="Builds a PyPI downloads cube using the public PyPI BigQuery data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pypi_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_download_file_dependencies = ["create_download_counts_data_table.sql",
                                   "create_data_schema.sql"]

pipeline.add(
import pathlib

import etl_tools.utils
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="python_projects",
    description="Combines PyPI downloads and GitHub activities into a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(id="extract_python_repo_activity",
         description='Extracts activity metrics for GitHub repos that have a corresponding PyPI package (by name)',
         commands=[
             ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")
         ]))

pipeline.add(
    ParallelExecuteSQL(
        id="transform_python_project_activity",
def root_pipeline():
    import app.pipelines.initialize_db
    import app.pipelines.load_data.load_ecommerce_data
    import app.pipelines.load_data.load_marketing_data
    import app.pipelines.e_commerce
    import app.pipelines.marketing
    import app.pipelines.generate_artifacts
    import app.pipelines.update_frontends
    import app.pipelines.consistency_checks

    pipeline = Pipeline(
        id='mara_example_project_1',
        description='An example pipeline that integrates e-commerce and marketing funnel data')

    pipeline.add_initial(app.pipelines.initialize_db.pipeline)
    pipeline.add(app.pipelines.load_data.load_ecommerce_data.pipeline)
    pipeline.add(app.pipelines.load_data.load_marketing_data.pipeline)
    pipeline.add(app.pipelines.e_commerce.pipeline, upstreams=['load_ecommerce_data'])
    pipeline.add(app.pipelines.marketing.pipeline, upstreams=['load_marketing_data', 'e_commerce'])
    pipeline.add(app.pipelines.generate_artifacts.pipeline, upstreams=['marketing'])
    pipeline.add(app.pipelines.update_frontends.pipeline, upstreams=['generate_artifacts'])
    pipeline.add(app.pipelines.consistency_checks.pipeline, upstreams=['generate_artifacts'])

    return pipeline
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="e_commerce",
    description="Builds the e-commerce cubes and datasets",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "ec_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name="recreate_schemas.sql")]))

pipeline.add(
    Task(id="preprocess_customer",
         description="Preprocesses customers and consolidates the data to a single record per customer with "
                     "a unique ID",
         commands=[ExecuteSQL(sql_file_name="preprocess_customer.sql")]))

pipeline.add(
    Task(id="preprocess_order",
         description="Preprocesses orders to get the correct unique customer ID",
         commands=[
             ExecuteSQL(sql_file_name="preprocess_order.sql")
"""Contains an internal pipeline for singer tasks runable """ from mara_pipelines.commands.bash import RunBash from mara_pipelines.commands.sql import ExecuteSQL from mara_pipelines.pipelines import Pipeline, Task from mara_singer.commands.singer import SingerTapDiscover import mara_singer.config pipeline = Pipeline(id='_singer', description="Internal Singer.io management pipeline") for tap_name in mara_singer.config.tap_names(): tap_pipeline = Pipeline(id=tap_name.replace('-', '_'), description=f'Package {tap_name}') tap_pipeline.add( Task(id='discover', description=f'Reload the {tap_name} catalog', commands=[SingerTapDiscover(tap_name=tap_name)])) pipeline.add(tap_pipeline) def _internal_root_pipeline(): root_pipeline = Pipeline(id='mara_singer', description='Root pipeline of module mara_singer') root_pipeline.add(pipeline)
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    source_db = mara_db.dbs.db(self.source_db_alias)
    target_db = mara_db.dbs.db(self.target_db_alias)
    assert isinstance(source_db, mara_db.dbs.PostgreSQLDB)
    assert isinstance(target_db, mara_db.dbs.PostgreSQLDB)

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        pg_version = cursor.connection.server_version

    ddl_task = Task(
        id='create_tables_and_functions',
        description='Re-creates the schema, table structure and functions on the target db',
        commands=[
            # schema and table structure
            bash.RunBash(
                command="(echo 'DROP SCHEMA IF EXISTS " + self.schema_name + " CASCADE;';\\\n"
                        + "    pg_dump --username=" + source_db.user + " --host=" + source_db.host
                        + " --schema=" + self.schema_name
                        + " --section=pre-data --no-owner --no-privileges "
                        + source_db.database + ") \\\n"
                        + "  | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False)
                        + ' --quiet'),
            # function definitions
            bash.RunBash(
                command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid), ';') AS def
FROM (SELECT oid, prokind, pronamespace
      FROM pg_proc p
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc,
     pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
  AND nspname = '{self.schema_name}'" \\\n'''
                        + "  | " + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                        + "  | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False))
        ])
    sub_pipeline.add(ddl_task)

    # copy content of tables
    number_of_chunks = self.max_number_of_parallel_tasks * 3
    table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_table_copy_chunk = [0] * number_of_chunks
    table_types = {}

    with mara_db.postgresql.postgres_cursor_context(
            self.source_db_alias) as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute("""
SELECT pg_class.relname AS table,
       relkind,
       CASE WHEN relkind = 'f'
              THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy
            ELSE pg_total_relation_size(pg_class.oid)
       END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name + """'
  AND relkind IN ('r', 'f')
  AND relhassubclass = 'f'
ORDER BY size DESC""")

        for table_name, type, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_table_copy_chunk)),
                                       key=current_size_per_table_copy_chunk.__getitem__)
            current_size_per_table_copy_chunk[smallest_chunk_index] += size
            table_copy_chunks[smallest_chunk_index].append(table_name)
            table_types[table_name] = type

    copy_tasks = []
    for i, tables in table_copy_chunks.items():
        if tables:
            task = Task(
                id=f'copy_tables_{i}',
                description='Copies table content to the frontend db',
                commands=[
                    RunBash(command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                    + '  | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                    + '  | ' + mara_db.shell.copy_from_stdin_command(
                                        self.target_db_alias,
                                        target_table=f'{self.schema_name}.{table_name}'))
                    for table_name in tables
                ])
            copy_tasks.append(task)
            sub_pipeline.add(task, upstreams=[ddl_task])

    # create indexes
    index_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_index_chunk = [0] * number_of_chunks

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        cursor.execute("""
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")

        for ddl, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_index_chunk)),
                                       key=current_size_per_index_chunk.__getitem__)
            current_size_per_index_chunk[smallest_chunk_index] += size
            index_chunks[smallest_chunk_index].append(ddl)

    for i, index_statements in index_chunks.items():
        if index_statements:
            index_task = Task(id=f'add_indexes_{i}',
                              description='Re-creates indexes on frontend db',
                              commands=[ExecuteSQL(sql_statement=statement,
                                                   db_alias=self.target_db_alias)
                                        for statement in index_statements])
            sub_pipeline.add(index_task, upstreams=copy_tasks)
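# A standalone sketch of the greedy size balancing used above (my restatement for
# illustration, not part of the original module): each table goes to whichever
# chunk currently has the smallest total size, largest tables first.
def balance_into_chunks(table_sizes: dict, number_of_chunks: int) -> dict:
    chunks = {i: [] for i in range(number_of_chunks)}
    totals = [0.0] * number_of_chunks
    for table, size in sorted(table_sizes.items(), key=lambda kv: kv[1], reverse=True):
        smallest = min(range(number_of_chunks), key=totals.__getitem__)  # least-loaded chunk
        totals[smallest] += size
        chunks[smallest].append(table)
    return chunks

# e.g. balance_into_chunks({'a': 90, 'b': 50, 'c': 40, 'd': 10}, 2)
# -> {0: ['a', 'd'], 1: ['b', 'c']}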
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL, Copy
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="load_ecommerce_data",
    description="Jobs related to loading e-commerce data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "ec_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the e-commerce data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_ecommerce_data_schema.sql',
                        file_dependencies=[pathlib.Path(__file__).parent.parent
                                           / 'recreate_ecommerce_data_schema.sql'])
         ]))

tables = ['customer', 'order', 'order_item', 'product',
          'product_category_name_translation', 'seller']

for table in tables:
    pipeline.add(
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="generate_artifacts",
    description="Creates flattened data set tables for various front-ends",
    base_path=pathlib.Path(__file__).parent)

from .metabase import pipeline as metabase_pipeline

pipeline.add(metabase_pipeline)

from .mara_data_explorer import pipeline as mara_data_explorer_pipeline

pipeline.add(mara_data_explorer_pipeline)

from .mondrian import pipeline as mondrian_pipeline

pipeline.add(mondrian_pipeline)

pipeline.add_final(
    Task(id='replace_schemas',
         description='Replaces the frontend schemas with their next versions',
         commands=[
             ExecuteSQL(sql_file_name='switch_metabase_schema.sql',
                        db_alias='metabase-data-write'),
             ExecuteSQL(sql_statement="SELECT util.replace_schema('data_sets', 'data_sets_next')",
import pathlib

import mara_mondrian.connection
import mara_schema.config
from mara_pipelines.commands.python import RunFunction
from mara_pipelines.logging import logger
from mara_pipelines.pipelines import Pipeline, Task

import mara_metabase.metadata

pipeline = Pipeline(id="update_frontends",
                    description="Updates Metabase & Mondrian",
                    base_path=pathlib.Path(__file__).parent)

pipeline.add(
    Task(id='update_metabase_metadata',
         description='Flushes all field value caches in Metabase and updates metadata',
         commands=[RunFunction(mara_metabase.metadata.update_metadata)]))

def write_mondrian_schema():
    import mara_mondrian.schema_generation

    file_name = pathlib.Path('.mondrian-schema.xml')
    logger.log(f'Writing {file_name}', logger.Format.ITALICS)
    mara_mondrian.schema_generation.write_mondrian_schema(
        file_name=file_name,
        data_set_tables={
            data_set: ('mondrian', data_set.id())
            for data_set in mara_schema.config.data_sets()
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from mara_schema.sql_generation import data_set_sql_query, database_identifier
from mara_schema.config import data_sets

pipeline = Pipeline(
    id="flatten_data_sets_for_mondrian",
    description="Creates data set tables for Mondrian (star schema, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'mondrian'})

pipeline.add_initial(
    Task(id="initialize_schema",
         description="Recreates the mondrian schema",
         commands=[
             ExecuteSQL(sql_statement="""
                        DROP SCHEMA IF EXISTS mondrian_next CASCADE;
                        CREATE SCHEMA mondrian_next;
                        """,
                        echo_queries=False)]))

for data_set in data_sets():
    pipeline.add(
        Task(id=f"flatten_{data_set.id()}_for_mondrian",
             description=f'Flattens the "{data_set.name}" data set for best use in Mondrian',
             commands=[
                 ExecuteSQL(f"""
CREATE TABLE mondrian_next.{database_identifier(data_set.name)} AS
{data_set_sql_query(data_set=data_set, pre_computed_metrics=False, human_readable_columns=False, star_schema=True,
import pathlib

import mara_db.postgresql
from facebook_ads_performance_pipeline import config

from mara_pipelines.commands.files import ReadSQLite
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ReadMode, ParallelReadSqlite
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines.config import default_db_alias

pipeline = Pipeline(
    id="facebook",
    description="Processes the data downloaded from the Facebook Ads API",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "fb_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the tmp and dim_next schemas",
         commands=[
             ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS fb_dim_next CASCADE; CREATE SCHEMA fb_dim_next;"),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        echo_queries=False,
                        file_dependencies=["create_data_schema.sql"]),
             ExecuteSQL(sql_file_name="recreate_schemas.sql",
                        echo_queries=False)
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

from etl_tools import config

pipeline = Pipeline(id="create_time_dimensions",
                    description="Creates a day and a duration dimension table",
                    labels={"Schema": "time"},
                    base_path=pathlib.Path(__file__).parent)

pipeline.add(
    Task(id="create_tables",
         description="Re-creates the day and duration table and their schema",
         commands=[
             ExecuteSQL(sql_file_name='create_tables.sql', echo_queries=False,
                        file_dependencies=['create_tables.sql'])
         ]))

pipeline.add(
    Task(id="populate_time_dimensions",
         description="Fills the time dimensions for a configured time range",
         commands=[
             ExecuteSQL(sql_statement=lambda: "SELECT time.populate_time_dimensions('"
                                              + config.first_date_in_time_dimensions().isoformat()
                                              + "'::DATE, '"
                                              + config.last_date_in_time_dimensions().isoformat()
                                              + "'::DATE);")
         ]),
    upstreams=['create_tables'])
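# Note on the lambda above: passing sql_statement as a callable defers building the
# query until the task actually runs, so the configured date range is read at run
# time rather than at module import (assuming, as the lambda here implies, that
# mara_pipelines command parameters may be callables).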
import pathlib

from etl_tools import initialize_utils, create_time_dimensions
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines.commands.sql import ExecuteSQL

pipeline = Pipeline(
    id="initialize_db",
    description="Adds a number of utility functions & creates time dimensions",
    base_path=pathlib.Path(__file__).parent)

pipeline.add(initialize_utils.utils_pipeline(with_cstore_fdw=True))

pipeline.add(create_time_dimensions.pipeline, upstreams=['initialize_utils'])

pipeline.add(
    Task(id='create_read_only_user',
         description='Creates a read-only user "dwh_read_only", useful for giving analysts direct access to the DWH',
         commands=[ExecuteSQL(sql_file_name='create_read_only_user.sql')]),
    upstreams=['initialize_utils'])
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

    ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL,
    value     TEXT NOT NULL,
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

    commands = []

    with mara_db.postgresql.postgres_cursor_context(
            self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
        # find all text-like and enum columns of the source table
        # (the {'%s'} interpolations leave literal %s placeholders for psycopg2)
        cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT typname, nspname
    FROM pg_type
    JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
    JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
)
SELECT column_name
FROM information_schema.columns
LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
WHERE table_schema = {'%s'}
  AND table_name = {'%s'}
  AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

        i = 0
        for column_name, in cursor.fetchall():
            i += 1
            ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
            commands.append(
                ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i}
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value
    ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

    sub_pipeline.add_initial(
        Task(id='create_table',
             description='Creates the attributes table',
             commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

    chunk_size = math.ceil(len(commands)
                           / (2 * mara_pipelines.config.max_number_of_parallel_tasks()))
    for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
        task = Task(id=str(n), description='Processes a portion of the attributes')
        task.add_commands(chunk)
        sub_pipeline.add(task)
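# A minimal, hypothetical illustration of the chunking above (numbers are
# assumptions, not from the original code): with 10 commands and
# max_number_of_parallel_tasks() == 2, chunk_size = ceil(10 / 4) = 3, so the
# commands are spread over tasks of 3, 3, 3 and 1 commands.
import math
import more_itertools

example_commands = list(range(10))  # stand-ins for ExecuteSQL commands
chunk_size = math.ceil(len(example_commands) / (2 * 2))
print([len(chunk) for chunk in more_itertools.chunked(example_commands, chunk_size)])
# -> [3, 3, 3, 1]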