Code example #1
def bc_reg_pipeline_single_thread():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_event_processor_single_thread',
        description=
        'A pipeline that processes BC Registries events and generates credentials.'
    )

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_data_single_thread',
        description='Load BC Reg data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_single_thread',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/register_un_processed_events')]),
        ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials_single_thread',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds-single-thread.py')
                       ]))
    pipeline1.add(sub_pipeline1_3,
                  ['load_and_process_bc_reg_data_single_thread'])

    return pipeline1
Code example #2
File: __init__.py Project: efulet/etl-tools
def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description=
        "Loads daily Euro exchange rates since 1999 from the European central bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql',
                            echo_queries=False)
             ]))

    pipeline.add(Task(
        id='load_exchange_rate',
        description='Loads exchange rates from the European central bank',
        commands=[
            ReadScriptOutput(file_name='load_exchange_rate.py',
                             target_table='euro_fx.exchange_rate',
                             db_alias='mdwh-etl')
        ]),
                 upstreams=['create_schema_and_table'])

    pipeline.add(Task(id="postprocess_exchange_rate",
                      description="Adds values for missing days",
                      commands=[
                          ExecuteSQL(
                              sql_file_name='postprocess_exchange_rate.sql',
                              echo_queries=False)
                      ]),
                 upstreams=['load_exchange_rate'])

    return pipeline
Code example #3
def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials_a',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    sub_pipeline1_4 = Pipeline(
        id='populate_evp_audit_table_a',
        description='Populate Event Processor Audit Table')
    sub_pipeline1_4.add(
        Task(id='populate_audit_table_a',
             description='Populate Audit Table',
             commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    pipeline1.add(sub_pipeline1_4, ['submit_bc_reg_credentials_a'])

    return pipeline1
Code example #4
def bc_reg_pipeline():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_event_processor',
        description='A pipeline that processes BC Registries events and generates credentials.')

    sub_pipeline1_2 = Pipeline(id='load_and_process_bc_reg_data', description='Load BC Reg data and generate credentials')
    sub_pipeline1_2.add(Task(id='register_un_processed_events', description='Register un-processed events',
                          commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(Task(id='load_bc_reg_data', description='Load BC Registries data',
                          commands=[ExecutePython('./bcreg/process-corps-generate-creds.py')]), ['register_un_processed_events'])
    sub_pipeline1_2.add(Task(id='create_bc_reg_credentials', description='Create credentials',
                          commands=[ExecutePython('./bcreg/generate-creds.py')]), ['load_bc_reg_data'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials', description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(Task(id='submit_credentials', description='Submit credentials',
                          commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_bc_reg_data'])

    sub_pipeline1_4 = Pipeline(id='populate_evp_audit_table', description='Populate Event Processor Audit Table')
    sub_pipeline1_4.add(Task(id='populate_audit_table', description='Populate Audit Table',
                          commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    pipeline1.add(sub_pipeline1_4, ['submit_bc_reg_credentials'])

    return pipeline1
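These factory functions only assemble and return a Pipeline object; nothing runs until the pipeline is handed to an executor. Below is a minimal sketch of running the pipeline from code example #4, assuming the run_pipeline helper in data_integration.ui.cli (part of the mara data-integration package these examples use, but not shown in the listing itself):

from data_integration.ui.cli import run_pipeline

# Build the pipeline from code example #4 and execute it,
# streaming node output and a summary to the console.
pipeline = bc_reg_pipeline()
run_pipeline(pipeline)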
Code example #5
def von_data_pipeline():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_event_processor',
        description=
        'A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='create_von_data_credentials',
             description='Create credentials',
             commands=[ExecutePython('./von_pipeline/generate-creds.py')]))
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_von_data_data'])

    return pipeline1
Code example #6
def bc_reg_pipeline_initial_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_corp_loader',
        description=
        'A pipeline that does the initial data load and credentials for all corporations.'
    )

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_corps',
        description='Load Active BC Reg corps and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_corps',
             description='Register un-processed active corps',
             commands=[
                 ExecutePython('./bcreg/find-unprocessed-corps_actve.py')
             ]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_a',
             description='Load BC Registries data',
             commands=[
                 ExecutePython('./bcreg/process-corps-generate-creds.py')
             ]), ['register_un_processed_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1
Code example #7
def bc_reg_pipeline_bn_credential_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_bn_loader',
        description=
        'A pipeline that creates BN credentials for all existing corporations.'
    )

    sub_pipeline1_2 = Pipeline(
        id='load_existing_corps_no_bn',
        description='Load BC Reg corps with no BN credential')
    sub_pipeline1_2.add(
        Task(id='register_un_bned_corps',
             description='Register corps with no BN',
             commands=[ExecutePython('./bcreg/find-un-bned-corps.py')]))
    sub_pipeline1_2.add(
        Task(id='load_corp_bn_data',
             description='Load BN credentials from company data',
             commands=[
                 ExecutePython('./bcreg/process-corps-generate-bn-creds.py')
             ]), ['register_un_bned_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1
Code example #8
def db_init_pipeline():
    import bcreg

    pipeline = Pipeline(
      id = 'bc_reg_db_init',
      description = 'Initialize BC Registries Event Processor database')

    pipeline.add(Task(id='create_tables', description='Create event processing tables',
                        commands=[ExecutePython('./bcreg/create.py')]))
    pipeline.add(Task(id='initialize_tables', description='Insert configuration data',
                        commands=[ExecutePython('./bcreg/insert.py')]), ['create_tables'])

    return pipeline
Code example #9
def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description=
        "Creates an utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(
                     sql_statement=
                     "DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql',
                            echo_queries=False,
                            replace={
                                'number_of_chunks':
                                lambda: config.number_of_chunks()
                            })
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension +
                                '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in [
            'consistency_checks', 'data_sets', 'partitioning',
            'indexes_and_constraints', 'schema_switching', 'enums'
    ]:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
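Because utils_pipeline is parameterized, it is usually composed into a project's root pipeline rather than executed on its own. A sketch of that composition, assuming a hypothetical root pipeline (the id 'etl' and its description are illustrative, not taken from the listing):

from data_integration.pipelines import Pipeline

# Hypothetical root pipeline; the optional hll and cstore_fdw tasks from
# code example #9 are switched on via the function parameters.
root_pipeline = Pipeline(id='etl', description='Root pipeline of an example project')
root_pipeline.add(utils_pipeline(with_hll=True, with_cstore_fdw=True))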
Code example #10
def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')

    pipeline.add(Task(id='display_pipeline_status', description='Display status of the overall pipeline processing status',
                        commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))
    # remove these from the pipeline due to issues connecting to DB's on openshift
    #pipeline.add(Task(id='display_pipeline_stats', description='Display stats of each stage in the pipeline processing',
    #                    commands=[ExecutePython('./bcreg/display_processed_corps_counts.py')]))
    pipeline.add(Task(id='display_event_processor_stats', description='Display stats of each event processor stage',
                        commands=[ExecutePython('./bcreg/display_event_processor_counts.py')]))

    return pipeline
Code example #11
def db_init_pipeline():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_db_init',
        description='Initialize von_data Event Processor database')

    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./von_pipeline/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./von_pipeline/insert.py')]),
        ['create_tables'])

    return pipeline
Code example #12
def add_task_for_file(file_name_without_extension):
    pipeline.add(
        Task(id=file_name_without_extension,
             description=f'Runs file "{file_name_without_extension}.sql"',
             commands=[
                 ExecuteSQL(sql_file_name=file_name_without_extension +
                            '.sql',
                            echo_queries=False)
             ]))
Code example #13
def bc_reg_populate_audit_table():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_populate_audit_table',
        description='Populate Audit Table.')

    pipeline.add(Task(id='populate_audit_table', description='Populate audit table.',
                        commands=[ExecutePython('./bcreg/populate_audit_table.py')]))

    return pipeline
Code example #14
def bc_reg_test_corps():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_corps',
        description='A pipeline that queues up a small set of test corporations.')

    pipeline.add(Task(id='register_test_corps', description='Register some test corps for processing',
                        commands=[ExecutePython('./bcreg/find-test-corps.py')]))

    return pipeline
Code example #15
def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')

    pipeline.add(Task(id='display_pipeline_status', description='Display status of the overall pipeline processing status',
                        commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))

    return pipeline
Code example #16
def bc_init_test_data():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_data',
        description='A pipeline that initializes event processor database for testing.')

    pipeline.add(Task(id='register_test_corps', description='Insert some test data for processing',
                        commands=[ExecutePython('./bcreg/insert-test.py')]))

    return pipeline
Code example #17
def von_list_mongo_data():
    import von_pipeline

    pipeline = Pipeline(id='von_list_mongo_data',
                        description='A pipeline that lists data in mongodb.')

    pipeline.add(
        Task(id='list_mongo_data',
             description='List data queued for processing',
             commands=[ExecutePython('./von_pipeline/list_mongo_data.py')]))

    return pipeline
Code example #18
def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials_a', description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(Task(id='submit_credentials_a', description='Submit credentials',
                          commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1
Code example #19
def bc_reg_pipeline_jsonbender():
    import bcreg

    pipeline2 = Pipeline(
        id='bc_reg_event_processor_json_transform_demo',
        description='A demo pipeline that processes events and generates credentials using JSONBender.')

    sub_pipeline2_2 = Pipeline(id='load_and_process_bc_reg_data', description='Load BC Reg data and generate credentials')
    sub_pipeline2_2.add(Task(id='register_un_processed_events', description='Register un-processed events',
                          commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline2_2.add(Task(id='load_bc_reg_data', description='Load BC Registries data',
                          commands=[ExecutePython('./bcreg/process-corps.py')]), ['register_un_processed_events'])
    sub_pipeline2_2.add(Task(id='create_credentials_jsonbender', description='Create credentials using JSONBender transform',
                          commands=[ExecutePython('./bcreg/generate-creds-bender.py')]), ['load_bc_reg_data'])
    pipeline2.add(sub_pipeline2_2)

    sub_pipeline2_3 = Pipeline(id='submit_bc_reg_credentials', description='Submit BC Reg credentials to P-X')
    sub_pipeline2_3.add(Task(id='submit_credentials', description='Submit credentials',
                          commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline2.add(sub_pipeline2_3, ['load_and_process_bc_reg_data'])

    return pipeline2
Code example #20
def von_data_pipeline_single_thread():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_pipeline_single_thread',
        description=
        'A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data_single_thread',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[
                 ExecutePython('./von_pipeline/find-unprocessed-events.py')
             ]))
    sub_pipeline1_2.add(
        Task(id='load_von_data_data_single_thread',
             description='Load von_data data',
             commands=[
                 ExecutePython('./von_pipeline/register_un_processed_events')
             ]), ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_single_thread',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[
                 ExecutePython('./von_pipeline/submit-creds-single-thread.py')
             ]))
    pipeline1.add(sub_pipeline1_3,
                  ['load_and_process_von_data_data_single_thread'])

    return pipeline1
Code example #21
    def __init__(self, task: pipelines.Task, event_queue: multiprocessing.Queue, status_queue: multiprocessing.Queue):
        """
        Runs a task in a separate sub process.

        Args:
            task: The task to run
            event_queue: The queue for writing events to
            status_queue: A queue for reporting whether the task succeeded
        """
        super().__init__(name='task-' + '-'.join(task.path()))
        self.task = task
        self.event_queue = event_queue
        self.status_queue = status_queue
        self.start_time = datetime.datetime.now()
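The constructor above only stores the task and the two queues; a surrounding executor is expected to start the process and read the result back afterwards. A rough usage sketch, assuming TaskProcess subclasses multiprocessing.Process (its super().__init__(name=...) call and docstring suggest this) and that the success flag is reported on status_queue:

import multiprocessing

# Illustrative only: wire up the two queues the constructor expects,
# run the task in a child process and collect the reported status.
event_queue = multiprocessing.Queue()
status_queue = multiprocessing.Queue()

process = TaskProcess(task, event_queue, status_queue)  # task: a pipelines.Task instance
process.start()
process.join()
succeeded = status_queue.get()  # assumed reporting protocol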
Code example #22
def von_data_pipeline_status():
    import von_pipeline

    pipeline = Pipeline(id='von_data_pipeline_status',
                        description='Display overall event processing status.')

    pipeline.add(
        Task(id='display_pipeline_status',
             description=
             'Display status of the overall pipeline processing status',
             commands=[
                 ExecutePython('./von_pipeline/display_pipeline_status.py')
             ]))

    return pipeline
Code example #23
def von_data_pipeline_post_credentials():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_a',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1
Code example #24
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        source_db = mara_db.dbs.db(self.source_db_alias)
        target_db = mara_db.dbs.db(self.target_db_alias)
        assert (isinstance(source_db, mara_db.dbs.PostgreSQLDB))
        assert (isinstance(target_db, mara_db.dbs.PostgreSQLDB))

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias) as cursor:
            pg_version = cursor.connection.server_version

        ddl_task = Task(
            id='create_tables_and_functions',
            description=
            'Re-creates the schema, tables structure and functions on the target db',
            commands=[
                # schema and table structure
                bash.RunBash(
                    command="(echo 'DROP SCHEMA IF EXISTS " +
                    self.schema_name + " CASCADE;';\\\n" +
                    "    pg_dump --username="******" --host=" +
                    source_db.host + " --schema=" + self.schema_name +
                    " --section=pre-data --no-owner --no-privileges " +
                    source_db.database + ") \\\n" + "  | " +
                    mara_db.shell.query_command(
                        self.target_db_alias, echo_queries=False) + ' --quiet'
                ),

                # function definitions
                bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid),';') AS def 
FROM (SELECT oid, * 
      FROM pg_proc p 
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc, pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
     AND nspname = '{self.schema_name}'" \\\n''' + "  | " +
                             mara_db.shell.copy_to_stdout_command(
                                 self.source_db_alias) + ' \\\n' + "  | " +
                             mara_db.shell.query_command(self.target_db_alias,
                                                         echo_queries=False))
            ])
        sub_pipeline.add(ddl_task)

        # copy content of tables
        number_of_chunks = self.max_number_of_parallel_tasks * 3
        table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
        current_size_per_table_copy_chunk = [0] * number_of_chunks
        table_types = {}

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias
        ) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(
                """
SELECT 
    pg_class.relname AS table,
    relkind,
    CASE WHEN relkind = 'f' 
         THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy 
         ELSE  pg_total_relation_size(pg_class.oid)
    END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name +
                """' AND relkind IN ('r', 'f') AND relhassubclass = 'f'
ORDER BY size DESC""")
            for table_name, type, size in cursor.fetchall():
                smallest_chunk_index = min(
                    range(len(current_size_per_table_copy_chunk)),
                    key=current_size_per_table_copy_chunk.__getitem__)
                current_size_per_table_copy_chunk[smallest_chunk_index] += size
                table_copy_chunks[smallest_chunk_index].append(table_name)
                table_types[table_name] = type

            copy_tasks = []
            for i, tables in table_copy_chunks.items():
                if tables:
                    task = Task(
                        id=f'copy_tables_{i}',
                        description='Copies table content to the frontend db',
                        commands=[
                            RunBash(
                                command=
                                f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                + '  | ' +
                                mara_db.shell.copy_to_stdout_command(
                                    self.source_db_alias) + ' \\\n' + '  | ' +
                                mara_db.shell.copy_from_stdin_command(
                                    self.target_db_alias,
                                    target_table=
                                    f'{self.schema_name}.{table_name}'))
                            for table_name in tables
                        ])
                    copy_tasks.append(task)
                    sub_pipeline.add(task, upstreams=[ddl_task])

            # create indexes
            index_chunks = {i: [] for i in range(0, number_of_chunks)}
            current_size_per_index_chunk = [0] * number_of_chunks

            with mara_db.postgresql.postgres_cursor_context(
                    self.source_db_alias) as cursor:
                cursor.execute(""" 
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
                for ddl, size in cursor.fetchall():
                    smallest_chunk_index = min(
                        range(len(current_size_per_index_chunk)),
                        key=current_size_per_index_chunk.__getitem__)
                    current_size_per_index_chunk[smallest_chunk_index] += size
                    index_chunks[smallest_chunk_index].append(ddl)

            for i, index_statements in index_chunks.items():
                if index_statements:
                    index_task = Task(
                        id=f'add_indexes_{i}',
                        description='Re-creates indexes on frontend db',
                        commands=[
                            ExecuteSQL(sql_statement=statement,
                                       db_alias=self.target_db_alias)
                            for statement in index_statements
                        ])
                    sub_pipeline.add(index_task, upstreams=copy_tasks)
Code example #25
File: __init__.py Project: efulet/etl-tools
import pathlib

from data_integration.commands.sql import ExecuteSQL
from data_integration.pipelines import Pipeline, Task
from etl_tools import config

pipeline = Pipeline(id="create_time_dimensions",
                    description="Creates a day and a duration dimension table",
                    labels={"Schema": "time"},
                    base_path=pathlib.Path(__file__).parent)

pipeline.add(
    Task(id="create_tables",
         description="Re-creates the day and duration table and their schema",
         commands=[
             ExecuteSQL(sql_file_name='create_tables.sql',
                        echo_queries=False,
                        file_dependencies=['create_tables.sql'])
         ]))

pipeline.add(Task(
    id="populate_time_dimensions",
    description="fills the time dimensions for a configured time range",
    commands=[
        ExecuteSQL(
            sql_statement=lambda: "SELECT time.populate_time_dimensions('" +
            config.first_date_in_time_dimensions().isoformat() + "'::DATE, '" +
            config.last_date_in_time_dimensions().isoformat() + "'::DATE);")
    ]),
             upstreams=['create_tables'])
Code example #26
from data_integration.config import default_db_alias

pipeline = Pipeline(
    id="facebook",
    description="Processes the data downloaded from the FacebookAds API",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "fb_dim"})

pipeline.add_initial(
    Task(
        id="initialize_schemas",
        description="Recreates the tmp and dim_next schemas",
        commands=[
            ExecuteSQL(
                sql_statement=
                "DROP SCHEMA IF EXISTS fb_dim_next CASCADE; CREATE SCHEMA fb_dim_next;"
            ),
            ExecuteSQL(sql_file_name="create_data_schema.sql",
                       echo_queries=False,
                       file_dependencies=["create_data_schema.sql"]),
            ExecuteSQL(sql_file_name="recreate_schemas.sql",
                       echo_queries=False)
        ]))

pipeline.add(
    Task(id="read_campaign_structure",
         description="Loads the adwords campaign structure",
         commands=[
             ExecuteSQL(
                 sql_file_name='create_campaign_structure_data_table.sql',
                 echo_queries=False),
             ReadSQLite(
Code example #27
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'
        sub_pipeline.add_initial(
            Task(id='create_table',
                 description='Creates the attributes table',
                 commands=[
                     ExecuteSQL(sql_statement=f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL, 
    value     TEXT NOT NULL, 
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
''')
                 ]))

        with mara_db.postgresql.postgres_cursor_context(
                self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(
                f'''
WITH enums AS (
    SELECT DISTINCT
      typname,
      nspname
    FROM pg_type
      JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
      JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
  )
SELECT column_name
FROM information_schema.columns
  LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
  WHERE table_schema = {'%s'}
      AND table_name = {'%s'}
      AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

            i = 0

            for column_name, in cursor.fetchall():
                i += 1
                sub_pipeline.add(
                    Task(id=re.sub(r'\W+', '_', column_name.lower()),
                         description=
                         f'Extracts attributes for the {column_name} column',
                         commands=[
                             ExecuteSQL(sql_statement=f'''
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');

INSERT INTO {attributes_table_name}_{i} 
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value 
   ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''',
                                        echo_queries=False)
                         ]))
Code example #28
import etl_tools.utils
from data_integration.commands.sql import ExecuteSQL
from data_integration.parallel_tasks.sql import ParallelExecuteSQL
from data_integration.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="python_projects",
    description=
    "Combines PyPI downloads and github activities to a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(
        id="extract_python_repo_activity",
        description=
        'Extracts activity metrics for github repos that have a corresponding pypi package (by name)',
        commands=[
            ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")
        ]))

pipeline.add(ParallelExecuteSQL(
    id="transform_python_project_activity",
    description=
    "Aggregates downloads at project level and combines them with github activity metrics",
Code example #29
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

        ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL, 
    value     TEXT NOT NULL, 
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

        commands = []

        with mara_db.postgresql.postgres_cursor_context(self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT
      typname,
      nspname
    FROM pg_type
      JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
      JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
  )
SELECT column_name
FROM information_schema.columns
  LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
  WHERE table_schema = {'%s'}
      AND table_name = {'%s'}
      AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

            i = 0

            for column_name, in cursor.fetchall():
                i += 1
                ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
                commands.append(
                    ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i} 
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value 
   ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

        sub_pipeline.add_initial(
            Task(id='create_table', description='Creates the attributes table',
                 commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

        chunk_size = math.ceil(len(commands) / (2 * data_integration.config.max_number_of_parallel_tasks()))
        for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
            task = Task(id=str(n), description='Process a portion of the attributes')
            task.add_commands(chunk)
            sub_pipeline.add(task)
Code example #30
from data_integration.parallel_tasks.files import ParallelReadFile, ReadMode
from data_integration.parallel_tasks.sql import ParallelExecuteSQL
from data_integration.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="pypi",
    description=
    "Builds a PyPI downloads cube using the public PyPi BigQuery data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pypi_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_download_file_dependencies = [
    "create_download_data_table.sql", "create_data_schema.sql"
]

pipeline.add(
    ParallelReadFile(
        id="read_download",
        description="Loads PyPI downloads from pre_downloaded csv files",
        file_pattern="*/*/*/pypi/downloads-v1.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
        compression=Compression.GZIP,