Example #1
def _internal_root_pipeline():
    root_pipeline = Pipeline(id='mara_singer',
                             description='Root pipeline of module mara_singer')

    root_pipeline.add(pipeline)

    return root_pipeline
Example #2
def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description=
        "Loads daily Euro exchange rates since 1999 from the European central bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql',
                            echo_queries=False)
             ]))

    pipeline.add(Task(
        id='load_exchange_rate',
        description='Loads exchange rates from the European central bank',
        commands=[
            ReadScriptOutput(file_name='load_exchange_rate.py',
                             target_table='euro_fx.exchange_rate',
                             db_alias=db_alias)
        ]),
                 upstreams=['create_schema_and_table'])

    pipeline.add(Task(id="postprocess_exchange_rate",
                      description="Adds values for missing days",
                      commands=[
                          ExecuteSQL(
                              sql_file_name='postprocess_exchange_rate.sql',
                              echo_queries=False)
                      ]),
                 upstreams=['load_exchange_rate'])

    return pipeline
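The function above is a factory rather than a module-level pipeline: it builds and returns the pipeline so that the caller picks the target database. A minimal usage sketch (the parent pipeline and the 'dwh' alias are assumptions, not part of the example):

parent_pipeline = Pipeline(id='load_data', description='Loads external data sets')
# 'dwh' is a hypothetical mara_db alias; pass whichever alias points at your DWH
parent_pipeline.add(euro_exchange_rates_pipeline(db_alias='dwh'))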
Example #3
def add_schema_copying_to_pipeline(pipeline: Pipeline,
                                   schema_name,
                                   source_db_alias: str,
                                   target_db_alias: str,
                                   max_number_of_parallel_tasks: int = 4):
    """
    Adds schema copying to the end of a pipeline.

    When the pipeline already has a final node, then all of its commands except the last one are run
    before the copying, and the last command after it.

    Args:
        pipeline: The pipeline to modify
        schema_name: The schema to copy
        source_db_alias: The alias of the PostgreSQL database to copy from
        target_db_alias: The alias of the PostgreSQL database to copy to
        max_number_of_parallel_tasks: The maximum number of operations to run in parallel.
    """
    task_id = "copy_schema"
    description = f"Copies the {schema_name} schema to the {target_db_alias} db"
    commands = []
    if pipeline.final_node:
        assert (isinstance(pipeline.final_node, Task))
        description = pipeline.final_node.description + ' + ' + description
        task_id = pipeline.final_node.id + '_and_' + task_id
        commands = pipeline.final_node.commands
        pipeline.remove(pipeline.final_node)

    pipeline.add_final(
        ParallelCopySchema(
            id=task_id,
            description=description,
            schema_name=schema_name,
            source_db_alias=source_db_alias,
            target_db_alias=target_db_alias,
            max_number_of_parallel_tasks=max_number_of_parallel_tasks,
            commands_before=commands[:-1],
            commands_after=commands[-1:]))
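Because the helper splices the copying into the pipeline's final node, it should be called only after all other nodes have been added. A hedged usage sketch (the schema name and both db aliases are assumptions):

# hypothetically copy the 'ec_dim' schema from the ETL db to a frontend db
add_schema_copying_to_pipeline(pipeline=pipeline,
                               schema_name='ec_dim',
                               source_db_alias='dwh',
                               target_db_alias='frontend',
                               max_number_of_parallel_tasks=8)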
Example #4
def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description=
        "Creates an utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(
                     sql_statement=
                     "DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql',
                            echo_queries=False,
                            replace={
                                'number_of_chunks':
                                lambda: config.number_of_chunks()
                            })
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension +
                                '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in [
            'consistency_checks', 'data_sets', 'partitioning',
            'indexes_and_constraints', 'schema_switching', 'enums'
    ]:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
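Example #21 below shows this factory in use; a minimal call looks like this (the parent pipeline is an assumption):

# enable the optional hll tasks, skip the cstore_fdw ones
parent_pipeline.add(utils_pipeline(with_hll=True, with_cstore_fdw=False))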
Example #5
import pathlib

import etl_tools.utils
from mara_pipelines.commands.files import Compression
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ParallelReadFile, ReadMode
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="github",
    description=
    "Builds a Github activity cube using the public Github archive data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "gh_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_repo_activity_file_dependencies = [
    "create_repo_activity_data_table.sql", "create_data_schema.sql"
]

pipeline.add(
Example #6
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL, Copy
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines import config

pipeline = Pipeline(
    id="load_marketing_data",
    description=
    "Jobs related with loading marketing leads data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the marketing data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_marketing_data_schema.sql',
                        file_dependencies=[
                            pathlib.Path(__file__).parent.parent /
                            'recreate_marketing_data_schema.sql'
                        ])
         ]))

tables = ['closed_deal', 'marketing_qualified_lead']

for table in tables:
    pipeline.add(
        Task(id=f"load_{table}",
             description=f'Loads the {table}s from the backend database',
Example #7
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="marketing",
    description="Builds the Leads cube based on marketing and e-commerce data",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name="recreate_schemas.sql")]))

pipeline.add(
    Task(id="preprocess_lead",
         description="Preprocess the marketing leads",
         commands=[ExecuteSQL(sql_file_name="preprocess_lead.sql")]))

pipeline.add(Task(
    id="transform_smaller_dimensions",
    description="Transform smaller marketing dimensions",
    commands=[ExecuteSQL(sql_file_name="transform_smaller_dimensions.sql")]),
             upstreams=["preprocess_lead"])

pipeline.add(Task(id="transform_lead",
                  description="Creates the lead dim table",
                  commands=[
                      ExecuteSQL(sql_file_name="transform_lead.sql",
Example #8
import pathlib

from mara_pipelines.commands.python import RunFunction
from mara_pipelines.commands.sql import ExecuteSQL, Copy
from mara_pipelines.pipelines import Pipeline, Task
from mara_schema.config import data_sets
from mara_schema.sql_generation import data_set_sql_query

from .cstore_tables import create_cstore_table_for_query
from .. import initialize_db

pipeline = Pipeline(
    id="flatten_data_sets_for_metabase",
    description=
    "Creates data set tables for Metabase (completely flattened, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'metabase'})

pipeline.add_initial(
    Task(id="initialize_schema",
         description="Recreates the metabase schema",
         commands=[
             ExecuteSQL(sql_statement=f"""
DROP SCHEMA IF EXISTS util CASCADE;
CREATE SCHEMA util;            

DROP SCHEMA IF EXISTS metabase_next CASCADE;
CREATE SCHEMA metabase_next;
""",
                        echo_queries=False,
                        db_alias='metabase-data-write'),
Example #9
import pathlib

import etl_tools.utils
from mara_pipelines.commands.files import Compression
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ParallelReadFile, ReadMode
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="pypi",
    description=
    "Builds a PyPI downloads cube using the public PyPi BigQuery data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pypi_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_download_file_dependencies = [
    "create_download_counts_data_table.sql", "create_data_schema.sql"
]

pipeline.add(
Example #10
import pathlib

import etl_tools.utils
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="python_projects",
    description=
    "Combines PyPI downloads and github activities to a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(
        id="extract_python_repo_activity",
        description=
        'Extracts activity metrics for github repos that have a corresponding pypi package (by name)',
        commands=[
            ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")
        ]))

pipeline.add(ParallelExecuteSQL(
    id="transform_python_project_activity",
Example #11
def root_pipeline():
    import app.pipelines.initialize_db
    import app.pipelines.load_data.load_ecommerce_data
    import app.pipelines.load_data.load_marketing_data
    import app.pipelines.e_commerce
    import app.pipelines.marketing
    import app.pipelines.generate_artifacts
    import app.pipelines.update_frontends
    import app.pipelines.consistency_checks

    pipeline = Pipeline(
        id='mara_example_project_1',
        description='An example pipeline that integrates e-commerce and marketing funnel data')

    pipeline.add_initial(app.pipelines.initialize_db.pipeline)
    pipeline.add(app.pipelines.load_data.load_ecommerce_data.pipeline)
    pipeline.add(app.pipelines.load_data.load_marketing_data.pipeline)
    pipeline.add(app.pipelines.e_commerce.pipeline, upstreams=['load_ecommerce_data'])
    pipeline.add(app.pipelines.marketing.pipeline,
                 upstreams=['load_marketing_data', 'e_commerce'])

    pipeline.add(app.pipelines.generate_artifacts.pipeline, upstreams=['marketing'])
    pipeline.add(app.pipelines.update_frontends.pipeline, upstreams=['generate_artifacts'])
    pipeline.add(app.pipelines.consistency_checks.pipeline, upstreams=['generate_artifacts'])

    return pipeline
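A hedged sketch of executing this root pipeline from a script, assuming run_pipeline is importable from mara_pipelines.ui.cli (the exact module path may differ between mara_pipelines versions):

from mara_pipelines.ui.cli import run_pipeline

# runs all sub-pipelines in dependency order, respecting the declared upstreams
run_pipeline(root_pipeline())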
Example #12
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="e_commerce",
    description="Builds the e-commerce cubes and datasets",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "ec_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name="recreate_schemas.sql")
         ]))

pipeline.add(
    Task(id="preprocess_customer",
         description="Preprocess customers and consolidate the data to a single record per customer with "
                     "a unique ID",
         commands=[
             ExecuteSQL(sql_file_name="preprocess_customer.sql")
         ]))

pipeline.add(
    Task(id="preprocess_order",
         description="Preprocess orders to get correct unique customer ID",
         commands=[
             ExecuteSQL(sql_file_name="preprocess_order.sql")
Example #13
"""Contains an internal pipeline for singer tasks runable """

from mara_pipelines.commands.bash import RunBash
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

from mara_singer.commands.singer import SingerTapDiscover

import mara_singer.config

pipeline = Pipeline(id='_singer',
                    description="Internal Singer.io management pipeline")

for tap_name in mara_singer.config.tap_names():
    tap_pipeline = Pipeline(id=tap_name.replace('-', '_'),
                            description=f'Package {tap_name}')

    tap_pipeline.add(
        Task(id='discover',
             description=f'Reload the {tap_name} catalog',
             commands=[SingerTapDiscover(tap_name=tap_name)]))

    pipeline.add(tap_pipeline)


def _internal_root_pipeline():
    root_pipeline = Pipeline(id='mara_singer',
                             description='Root pipeline of module mara_singer')

    root_pipeline.add(pipeline)
Example #14
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        source_db = mara_db.dbs.db(self.source_db_alias)
        target_db = mara_db.dbs.db(self.target_db_alias)
        assert (isinstance(source_db, mara_db.dbs.PostgreSQLDB))
        assert (isinstance(target_db, mara_db.dbs.PostgreSQLDB))

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias) as cursor:
            pg_version = cursor.connection.server_version

        ddl_task = Task(
            id='create_tables_and_functions',
            description=
            'Re-creates the schema, tables structure and functions on the target db',
            commands=[
                # schema and table structure
                bash.RunBash(
                    command="(echo 'DROP SCHEMA IF EXISTS " +
                    self.schema_name + " CASCADE;';\\\n" +
                    "    pg_dump --username="******" --host=" +
                    source_db.host + " --schema=" + self.schema_name +
                    " --section=pre-data --no-owner --no-privileges " +
                    source_db.database + ") \\\n" + "  | " +
                    mara_db.shell.query_command(
                        self.target_db_alias, echo_queries=False) + ' --quiet'
                ),

                # function definitions
                bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid),';') AS def 
FROM (SELECT oid, prokind, pronamespace
      FROM pg_proc p 
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc, pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
     AND nspname = '{self.schema_name}'" \\\n''' + "  | " +
                             mara_db.shell.copy_to_stdout_command(
                                 self.source_db_alias) + ' \\\n' + "  | " +
                             mara_db.shell.query_command(self.target_db_alias,
                                                         echo_queries=False))
            ])
        sub_pipeline.add(ddl_task)

        # copy content of tables
        number_of_chunks = self.max_number_of_parallel_tasks * 3
        table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
        current_size_per_table_copy_chunk = [0] * number_of_chunks
        table_types = {}

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias
        ) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(
                """
SELECT 
    pg_class.relname AS table,
    relkind,
    CASE WHEN relkind = 'f' 
         THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy 
         ELSE  pg_total_relation_size(pg_class.oid)
    END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name +
                """' AND relkind IN ('r', 'f') AND relhassubclass = 'f'
ORDER BY size DESC""")
            for table_name, type, size in cursor.fetchall():
                smallest_chunk_index = min(
                    range(len(current_size_per_table_copy_chunk)),
                    key=current_size_per_table_copy_chunk.__getitem__)
                current_size_per_table_copy_chunk[smallest_chunk_index] += size
                table_copy_chunks[smallest_chunk_index].append(table_name)
                table_types[table_name] = type

            copy_tasks = []
            for i, tables in table_copy_chunks.items():
                if tables:
                    task = Task(
                        id=f'copy_tables_{i}',
                        description='Copies table content to the frontend db',
                        commands=[
                            RunBash(
                                command=
                                f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                + '  | ' +
                                mara_db.shell.copy_to_stdout_command(
                                    self.source_db_alias) + ' \\\n' + '  | ' +
                                mara_db.shell.copy_from_stdin_command(
                                    self.target_db_alias,
                                    target_table=
                                    f'{self.schema_name}.{table_name}'))
                            for table_name in tables
                        ])
                    copy_tasks.append(task)
                    sub_pipeline.add(task, upstreams=[ddl_task])

            # create indexes
            index_chunks = {i: [] for i in range(0, number_of_chunks)}
            current_size_per_index_chunk = [0] * number_of_chunks

            with mara_db.postgresql.postgres_cursor_context(
                    self.source_db_alias) as cursor:
                cursor.execute(""" 
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
                for ddl, size in cursor.fetchall():
                    smallest_chunk_index = min(
                        range(len(current_size_per_index_chunk)),
                        key=current_size_per_index_chunk.__getitem__)
                    current_size_per_index_chunk[smallest_chunk_index] += size
                    index_chunks[smallest_chunk_index].append(ddl)

            for i, index_statements in index_chunks.items():
                if index_statements:
                    index_task = Task(
                        id=f'add_indexes_{i}',
                        description='Re-creates indexes on frontend db',
                        commands=[
                            ExecuteSQL(sql_statement=statement,
                                       db_alias=self.target_db_alias)
                            for statement in index_statements
                        ])
                    sub_pipeline.add(index_task, upstreams=copy_tasks)
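The chunking in the example above is a greedy balancing heuristic: tables (or indexes) are sorted by size descending and each one is assigned to the currently smallest chunk, so the parallel tasks finish at roughly the same time. A self-contained sketch of the same idea (names are illustrative):

def balance_into_chunks(items_with_sizes, number_of_chunks):
    """Greedily distributes (item, size) pairs so that chunk sizes stay roughly equal."""
    chunks = {i: [] for i in range(number_of_chunks)}
    chunk_sizes = [0] * number_of_chunks
    # largest items first, as in the SQL above (ORDER BY size DESC)
    for item, size in sorted(items_with_sizes, key=lambda pair: -pair[1]):
        smallest = min(range(number_of_chunks), key=chunk_sizes.__getitem__)
        chunk_sizes[smallest] += size
        chunks[smallest].append(item)
    return chunks

balance_into_chunks([('orders', 120.5), ('customers', 80.0), ('events', 300.2)], 2)
# -> {0: ['events'], 1: ['orders', 'customers']}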
Example #15
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL, Copy
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="load_ecommerce_data",
    description=
    "Jobs related with loading e-commerce data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "ec_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the e-commerce data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_ecommerce_data_schema.sql',
                        file_dependencies=[
                            pathlib.Path(__file__).parent.parent /
                            'recreate_ecommerce_data_schema.sql'
                        ])
         ]))

tables = [
    'customer', 'order', 'order_item', 'product',
    'product_category_name_translation', 'seller'
]

for table in tables:
    pipeline.add(
Example #16
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="generate_artifacts",
    description="Create flattened data set tables for various front-ends",
    base_path=pathlib.Path(__file__).parent)

from .metabase import pipeline as metabase_pipeline

pipeline.add(metabase_pipeline)

from .mara_data_explorer import pipeline as mara_data_explorer_pipeline

pipeline.add(mara_data_explorer_pipeline)

from .mondrian import pipeline as mondrian_pipeline

pipeline.add(mondrian_pipeline)

pipeline.add_final(
    Task(id='replace_schemas',
         description='Replaces the frontend schemas with their next versions',
         commands=[
             ExecuteSQL(sql_file_name='switch_metabase_schema.sql',
                        db_alias='metabase-data-write'),
             ExecuteSQL(
                 sql_statement=
                 f"SELECT util.replace_schema('data_sets', 'data_sets_next')",
Example #17
import pathlib

import mara_mondrian.connection
import mara_schema.config
from mara_pipelines.commands.python import RunFunction
from mara_pipelines.logging import logger
from mara_pipelines.pipelines import Pipeline, Task

import mara_metabase.metadata

pipeline = Pipeline(id="update_frontends",
                    description="Updates Metabase & Mondrian",
                    base_path=pathlib.Path(__file__).parent)

pipeline.add(
    Task(id='update_metabase_metadata',
         description=
         'Flushes all field value caches in Metabase and updates metadata',
         commands=[RunFunction(mara_metabase.metadata.update_metadata)]))


def write_mondrian_schema():
    import mara_mondrian.schema_generation
    file_name = pathlib.Path('.mondrian-schema.xml')
    logger.log(f'Writing {file_name}', logger.Format.ITALICS)

    mara_mondrian.schema_generation.write_mondrian_schema(
        file_name=file_name,
        data_set_tables={
            data_set: ('mondrian', data_set.id())
            for data_set in mara_schema.config.data_sets()
Example #18
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from mara_schema.sql_generation import data_set_sql_query, database_identifier
from mara_schema.config import data_sets

pipeline = Pipeline(
    id="flatten_data_sets_for_mondrian",
    description="Creates data set tables for Mondrian (star schema, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'mondrian'})

pipeline.add_initial(
    Task(
        id="initialize_schema",
        description="Recreates the mondrian schema",
        commands=[
            ExecuteSQL(sql_statement=f"""
DROP SCHEMA IF EXISTS mondrian_next CASCADE;
CREATE SCHEMA mondrian_next;
""", echo_queries=False)]))

for data_set in data_sets():
    pipeline.add(
        Task(id=f"flatten_{data_set.id()}_for_mondrian",
             description=f'Flattens the "{data_set.name}" data set for best use in Mondrian',
             commands=[
                 ExecuteSQL(f"""
CREATE TABLE mondrian_next.{database_identifier(data_set.name)} AS
{data_set_sql_query(data_set=data_set, pre_computed_metrics=False, human_readable_columns=False, star_schema=True,
Example #19
import pathlib
import mara_db.postgresql
from facebook_ads_performance_pipeline import config

from mara_pipelines.commands.files import ReadSQLite
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ReadMode, ParallelReadSqlite
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines.config import default_db_alias

pipeline = Pipeline(
    id="facebook",
    description="Processes the data downloaded from the FacebookAds API",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "fb_dim"})

pipeline.add_initial(
    Task(
        id="initialize_schemas",
        description="Recreates the tmp and dim_next schemas",
        commands=[
            ExecuteSQL(
                sql_statement=
                "DROP SCHEMA IF EXISTS fb_dim_next CASCADE; CREATE SCHEMA fb_dim_next;"
            ),
            ExecuteSQL(sql_file_name="create_data_schema.sql",
                       echo_queries=False,
                       file_dependencies=["create_data_schema.sql"]),
            ExecuteSQL(sql_file_name="recreate_schemas.sql",
                       echo_queries=False)
Example #20
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools import config

pipeline = Pipeline(id="create_time_dimensions",
                    description="Creates a day and a duration dimension table",
                    labels={"Schema": "time"},
                    base_path=pathlib.Path(__file__).parent)

pipeline.add(
    Task(id="create_tables",
         description="Re-creates the day and duration table and their schema",
         commands=[
             ExecuteSQL(sql_file_name='create_tables.sql',
                        echo_queries=False,
                        file_dependencies=['create_tables.sql'])
         ]))

pipeline.add(Task(
    id="populate_time_dimensions",
    description="fills the time dimensions for a configured time range",
    commands=[
        ExecuteSQL(
            sql_statement=lambda: "SELECT time.populate_time_dimensions('" +
            config.first_date_in_time_dimensions().isoformat() + "'::DATE, '" +
            config.last_date_in_time_dimensions().isoformat() + "'::DATE);")
    ]),
             upstreams=['create_tables'])
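Note that sql_statement receives a lambda rather than a string, so the SQL is rendered when the task runs and configuration changes take effect without rebuilding the pipeline; the replace argument in Example #4 uses the same trick. A minimal sketch of the pattern (the DELETE statement and the time.day table are illustrative only):

ExecuteSQL(sql_statement=lambda: "DELETE FROM time.day WHERE day_date < '" +
           config.first_date_in_time_dimensions().isoformat() + "'::DATE;")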
Example #21
import pathlib

from etl_tools import initialize_utils, create_time_dimensions
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines.commands.sql import ExecuteSQL

pipeline = Pipeline(
    id="initialize_db",
    description="Adds a number of utility functions & creates time dimensions",
    base_path=pathlib.Path(__file__).parent)

pipeline.add(initialize_utils.utils_pipeline(with_cstore_fdw=True))
pipeline.add(create_time_dimensions.pipeline, upstreams=['initialize_utils'])

pipeline.add(
    Task(id='create_read_only_user',
         description='Creates a read-only user "dwh_read_only", useful for giving analysts direct access to DWH',
         commands=[
             ExecuteSQL(sql_file_name='create_read_only_user.sql')
         ]),
    upstreams=['initialize_utils'])
Example #22
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

        ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL, 
    value     TEXT NOT NULL, 
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

        commands = []

        with mara_db.postgresql.postgres_cursor_context(
                self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(
                f'''
WITH enums AS (
    SELECT DISTINCT
      typname,
      nspname
    FROM pg_type
      JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
      JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
  )
SELECT column_name
FROM information_schema.columns
  LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
  WHERE table_schema = {'%s'}
      AND table_name = {'%s'}
      AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

            i = 0

            for column_name, in cursor.fetchall():
                i += 1
                ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
                commands.append(
                    ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i} 
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value 
   ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''',
                               echo_queries=False))

        sub_pipeline.add_initial(
            Task(id='create_table',
                 description='Creates the attributes table',
                 commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

        chunk_size = math.ceil(
            len(commands) /
            (2 * mara_pipelines.config.max_number_of_parallel_tasks()))
        for n, chunk in enumerate(more_itertools.chunked(commands,
                                                         chunk_size)):
            task = Task(id=str(n),
                        description='Process a portion of the attributes')
            task.add_commands(chunk)
            sub_pipeline.add(task)
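The final loop spreads the per-column commands over a limited number of tasks: chunk_size is chosen so that roughly twice as many tasks as parallel slots exist, and more_itertools.chunked splits the command list accordingly. A standalone sketch of that splitting (the command list is illustrative):

import math
import more_itertools

commands = [f'command_{i}' for i in range(10)]
max_parallel_tasks = 2

chunk_size = math.ceil(len(commands) / (2 * max_parallel_tasks))  # -> 3
for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
    print(n, chunk)  # 4 chunks of at most 3 commands each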