def bc_reg_pipeline_single_thread():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_event_processor_single_thread',
        description='A pipeline that processes BC Registries events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_data_single_thread',
        description='Load BC Reg data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_single_thread',
             description='Load BC Registries data',
             # NOTE: kept verbatim; this path has no .py extension and echoes
             # the previous task's id, so it looks like a typo in the source
             commands=[ExecutePython('./bcreg/register_un_processed_events')]),
        ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_bc_reg_credentials_single_thread',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds-single-thread.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_bc_reg_data_single_thread'])

    return pipeline1

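# The second positional argument to Pipeline.add() is a list of upstream node
# ids; that is how all of these pipeline definitions wire their stages
# together. A minimal, self-contained sketch of the pattern (the ids and echo
# commands below are hypothetical, not part of the BC Registries pipelines):
from data_integration.commands.bash import RunBash
from data_integration.pipelines import Pipeline, Task

demo = Pipeline(id='dependency_demo', description='Shows upstream wiring')
demo.add(Task(id='first', description='Runs first',
              commands=[RunBash(command='echo first')]))
demo.add(Task(id='second', description='Runs after "first"',
              commands=[RunBash(command='echo second')]),
         ['first'])  # upstream ids: 'second' runs only after 'first' completes
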
def bc_init_test_data():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_data',
        description='A pipeline that initializes the event processor database for testing.')
    pipeline.add(
        Task(id='register_test_corps',
             description='Insert some test data for processing',
             commands=[ExecutePython('./bcreg/insert-test.py')]))
    return pipeline

def bc_reg_test_corps():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_corps',
        description='A pipeline that queues up a small set of test corporations.')
    pipeline.add(
        Task(id='register_test_corps',
             description='Register some test corps for processing',
             commands=[ExecutePython('./bcreg/find-test-corps.py')]))
    return pipeline

def bc_reg_populate_audit_table():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_populate_audit_table',
        description='Populate Audit Table.')
    pipeline.add(
        Task(id='populate_audit_table',
             description='Populate audit table.',
             commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    return pipeline

def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')
    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))
    return pipeline

def von_list_mongo_data():
    import von_pipeline

    pipeline = Pipeline(
        id='von_list_mongo_data',
        description='A pipeline that lists data in MongoDB.')
    pipeline.add(
        Task(id='list_mongo_data',
             description='List data queued for processing',
             commands=[ExecutePython('./von_pipeline/list_mongo_data.py')]))
    return pipeline

def db_init_pipeline():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_db_init',
        description='Initialize BC Registries Event Processor database')
    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./bcreg/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./bcreg/insert.py')]),
        ['create_tables'])
    return pipeline

def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_bc_reg_credentials_a',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1

def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description="Creates a utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql', echo_queries=False,
                            replace={'number_of_chunks': lambda: config.number_of_chunks()})
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension + '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in ['consistency_checks', 'data_sets', 'partitioning',
                                        'indexes_and_constraints', 'schema_switching', 'enums']:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')
    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline

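# A hedged usage sketch: mounting the utils pipeline under a root pipeline,
# as root_pipeline() further below does with its 'utils' sub-pipeline. The
# flag values and the 'example_root' id are illustrative; enable hll or
# cstore_fdw only when the target database has those extensions installed.
from data_integration.pipelines import Pipeline

utils = utils_pipeline(with_hll=True, with_cstore_fdw=False)

root = Pipeline(id='example_root', description='Hypothetical root pipeline')
root.add(utils)
# Downstream pipelines can then declare it as an upstream by id:
# root.add(some_other_pipeline, upstreams=['initialize_utils'])
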
def von_data_pipeline():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_event_processor',
        description='A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='create_von_data_credentials',
             description='Create credentials',
             commands=[ExecutePython('./von_pipeline/generate-creds.py')]))
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_von_data_data'])

    return pipeline1

def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_bc_reg_credentials_a',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    sub_pipeline1_4 = Pipeline(
        id='populate_evp_audit_table_a',
        description='Populate Event Processor Audit Table')
    sub_pipeline1_4.add(
        Task(id='populate_audit_table_a',
             description='Populate Audit Table',
             commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    pipeline1.add(sub_pipeline1_4, ['submit_bc_reg_credentials_a'])

    return pipeline1

def von_data_pipeline_status():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_pipeline_status',
        description='Display overall event processing status.')
    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./von_pipeline/display_pipeline_status.py')]))
    return pipeline

def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')
    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))
    # Removed from the pipeline due to issues connecting to DBs on OpenShift:
    # pipeline.add(
    #     Task(id='display_pipeline_stats',
    #          description='Display stats of each stage in the pipeline processing',
    #          commands=[ExecutePython('./bcreg/display_processed_corps_counts.py')]))
    pipeline.add(
        Task(id='display_event_processor_stats',
             description='Display stats of each event processor stage',
             commands=[ExecutePython('./bcreg/display_event_processor_counts.py')]))
    return pipeline

def von_data_pipeline_post_credentials():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_a',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1

def db_init_pipeline():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_db_init',
        description='Initialize von_data Event Processor database')
    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./von_pipeline/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./von_pipeline/insert.py')]),
        ['create_tables'])
    return pipeline

def bc_reg_pipeline_initial_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_corp_loader',
        description='A pipeline that does the initial data load and credentials for all corporations.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_corps',
        description='Load Active BC Reg corps and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_corps',
             description='Register un-processed active corps',
             commands=[ExecutePython('./bcreg/find-unprocessed-corps_actve.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_a',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps-generate-creds.py')]),
        ['register_un_processed_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1

def bc_reg_pipeline_bn_credential_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_bn_loader',
        description='A pipeline that creates BN credentials for all existing corporations.')

    sub_pipeline1_2 = Pipeline(
        id='load_existing_corps_no_bn',
        description='Load BC Reg corps with no BN credential')
    sub_pipeline1_2.add(
        Task(id='register_un_bned_corps',
             description='Register corps with no BN',
             commands=[ExecutePython('./bcreg/find-un-bned-corps.py')]))
    sub_pipeline1_2.add(
        Task(id='load_corp_bn_data',
             description='Load BN credentials from company data',
             commands=[ExecutePython('./bcreg/process-corps-generate-bn-creds.py')]),
        ['register_un_bned_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1

def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description="Loads daily Euro exchange rates since 1999 from the European Central Bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates the currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql', echo_queries=False)
             ]))

    pipeline.add(
        Task(id='load_exchange_rate',
             description='Loads exchange rates from the European Central Bank',
             commands=[
                 ReadScriptOutput(file_name='load_exchange_rate.py',
                                  target_table='euro_fx.exchange_rate',
                                  # assumed: the db_alias parameter is meant to be
                                  # used here instead of a hard-coded 'mdwh-etl'
                                  db_alias=db_alias)
             ]),
        upstreams=['create_schema_and_table'])

    pipeline.add(
        Task(id="postprocess_exchange_rate",
             description="Adds values for missing days",
             commands=[
                 ExecuteSQL(sql_file_name='postprocess_exchange_rate.sql', echo_queries=False)
             ]),
        upstreams=['load_exchange_rate'])

    return pipeline

def von_data_pipeline_single_thread():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_pipeline_single_thread',
        description='A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data_single_thread',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[ExecutePython('./von_pipeline/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_von_data_data_single_thread',
             description='Load von_data data',
             # NOTE: kept verbatim; this path has no .py extension and echoes
             # the previous task's id, so it looks like a typo in the source
             commands=[ExecutePython('./von_pipeline/register_un_processed_events')]),
        ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_single_thread',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds-single-thread.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_von_data_data_single_thread'])

    return pipeline1

def root_pipeline():
    import app.data_integration.pipelines.github
    import app.data_integration.pipelines.pypi
    import app.data_integration.pipelines.utils
    import app.data_integration.pipelines.python_projects

    pipeline = Pipeline(
        id='mara_example_project',
        description='An example pipeline that integrates PyPI download stats with the Github activity of a project')

    pipeline.add(app.data_integration.pipelines.utils.pipeline)
    pipeline.add(app.data_integration.pipelines.pypi.pipeline, upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.github.pipeline, upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.python_projects.pipeline,
                 upstreams=['pypi', 'github'])
    return pipeline

def bc_reg_pipeline_jsonbender():
    import bcreg

    pipeline2 = Pipeline(
        id='bc_reg_event_processor_json_transform_demo',
        description='A demo pipeline that processes events and generates credentials using JSONBender.')

    sub_pipeline2_2 = Pipeline(
        id='load_and_process_bc_reg_data',
        description='Load BC Reg data and generate credentials')
    sub_pipeline2_2.add(
        Task(id='register_un_processed_events',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline2_2.add(
        Task(id='load_bc_reg_data',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps.py')]),
        ['register_un_processed_events'])
    sub_pipeline2_2.add(
        Task(id='create_credentials_jsonbender',
             description='Create credentials using JSONBender transform',
             commands=[ExecutePython('./bcreg/generate-creds-bender.py')]),
        ['load_bc_reg_data'])
    pipeline2.add(sub_pipeline2_2)

    sub_pipeline2_3 = Pipeline(
        id='submit_bc_reg_credentials',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline2_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline2.add(sub_pipeline2_3, ['load_and_process_bc_reg_data'])

    return pipeline2

def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    source_db = mara_db.dbs.db(self.source_db_alias)
    target_db = mara_db.dbs.db(self.target_db_alias)
    assert isinstance(source_db, mara_db.dbs.PostgreSQLDB)
    assert isinstance(target_db, mara_db.dbs.PostgreSQLDB)

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        pg_version = cursor.connection.server_version

    ddl_task = Task(
        id='create_tables_and_functions',
        description='Re-creates the schema, table structure and functions on the target db',
        commands=[
            # schema and table structure
            bash.RunBash(
                command="(echo 'DROP SCHEMA IF EXISTS " + self.schema_name + " CASCADE;';\\\n"
                        + " pg_dump --username="******" --host=" + source_db.host  # username redacted in the source
                        + " --schema=" + self.schema_name
                        + " --section=pre-data --no-owner --no-privileges "
                        + source_db.database + ") \\\n"
                        + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False)
                        + ' --quiet'),
            # function definitions
            bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid), ';') AS def
FROM (SELECT oid, *
      FROM pg_proc p
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc,
     pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
  AND nspname = '{self.schema_name}'" \\\n'''
                                 + " | " + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                 + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False))
        ])
    sub_pipeline.add(ddl_task)

    # copy content of tables: spread tables over chunks, balanced by size
    number_of_chunks = self.max_number_of_parallel_tasks * 3
    table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_table_copy_chunk = [0] * number_of_chunks
    table_types = {}

    with mara_db.postgresql.postgres_cursor_context(
            self.source_db_alias) as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute("""
SELECT pg_class.relname AS table,
       relkind,
       CASE WHEN relkind = 'f'
            THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy
            ELSE pg_total_relation_size(pg_class.oid)
       END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name + """'
  AND relkind IN ('r', 'f')
  AND relhassubclass = 'f'
ORDER BY size DESC""")
        for table_name, type, size in cursor.fetchall():
            # greedy assignment: put each table into the currently smallest chunk
            smallest_chunk_index = min(range(len(current_size_per_table_copy_chunk)),
                                       key=current_size_per_table_copy_chunk.__getitem__)
            current_size_per_table_copy_chunk[smallest_chunk_index] += size
            table_copy_chunks[smallest_chunk_index].append(table_name)
            table_types[table_name] = type

    copy_tasks = []
    for i, tables in table_copy_chunks.items():
        if tables:
            task = Task(
                id=f'copy_tables_{i}',
                description='Copies table content to the frontend db',
                commands=[
                    RunBash(command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                    + ' | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                    + ' | ' + mara_db.shell.copy_from_stdin_command(
                                        self.target_db_alias,
                                        target_table=f'{self.schema_name}.{table_name}'))
                    for table_name in tables
                ])
            copy_tasks.append(task)
            sub_pipeline.add(task, upstreams=[ddl_task])

    # create indexes, again balanced by size across chunks
    index_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_index_chunk = [0] * number_of_chunks

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        cursor.execute("""
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """'
  AND relkind = 'i'
ORDER BY size DESC;""")
        for ddl, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_index_chunk)),
                                       key=current_size_per_index_chunk.__getitem__)
            current_size_per_index_chunk[smallest_chunk_index] += size
            index_chunks[smallest_chunk_index].append(ddl)

    for i, index_statements in index_chunks.items():
        if index_statements:
            index_task = Task(
                id=f'add_indexes_{i}',
                description='Re-creates indexes on frontend db',
                commands=[ExecuteSQL(sql_statement=statement, db_alias=self.target_db_alias)
                          for statement in index_statements])
            sub_pipeline.add(index_task, upstreams=copy_tasks)

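# Both chunk assignments above use the same greedy heuristic: take items
# largest-first and put each one into the chunk with the smallest accumulated
# size so far. A standalone sketch of that balancing step (names are
# illustrative, not part of the pipeline code):
def balance_into_chunks(sizes_by_name: dict, number_of_chunks: int) -> dict:
    """Greedy longest-processing-time assignment of sized items to chunks."""
    chunks = {i: [] for i in range(number_of_chunks)}
    chunk_sizes = [0] * number_of_chunks
    for name, size in sorted(sizes_by_name.items(), key=lambda kv: -kv[1]):
        smallest = min(range(number_of_chunks), key=chunk_sizes.__getitem__)
        chunk_sizes[smallest] += size
        chunks[smallest].append(name)
    return chunks

# balance_into_chunks({'a': 9, 'b': 5, 'c': 4, 'd': 1}, 2)
# -> {0: ['a', 'd'], 1: ['b', 'c']}   (chunk sizes 10 vs. 9)
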
def bc_reg_root_pipeline():
    import bcreg

    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description='Holder for the different versions of the BC Registries pipeline.')
    parent_pipeline.add(bc_reg_pipeline())
    parent_pipeline.add(bc_reg_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')
    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(bc_reg_pipeline_initial_load())
    init_pipeline.add(bc_reg_pipeline_post_credentials())
    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(
        id='test_and_demo_tasks',
        description='Holder for test and demo tasks.')
    test_pipeline.add(bc_init_test_data())
    test_pipeline.add(bc_reg_test_corps())
    test_pipeline.add(bc_reg_pipeline_single_thread())
    test_pipeline.add(bc_reg_pipeline_jsonbender())
    parent_pipeline.add(test_pipeline)

    return parent_pipeline

import os

from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration

from bcreg.bcreg_pipelines import bc_reg_pipeline_initial_load

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')
parent_pipeline.add(bc_reg_pipeline_initial_load())

run_pipeline(parent_pipeline)

pipeline = Pipeline(
    id="python_projects",
    description="Combines PyPI downloads and github activities to a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(id="extract_python_repo_activity",
         description='Extracts activity metrics for github repos that have a corresponding pypi package (by name)',
         commands=[ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")]))

pipeline.add(ParallelExecuteSQL(
    id="transform_python_project_activity",
    description="Aggregates downloads at project level and combines them with github activity metrics",
    commands_before=[ExecuteSQL(sql_file_name="transform_python_project_activity.sql")],
    sql_statement="SELECT pp_tmp.insert_python_project_activity(@chunk@::SMALLINT);",
    parameter_function=etl_tools.utils.chunk_parameter_function,
    parameter_placeholders=["@chunk@"]),

def von_root_pipeline():
    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description='Holder for the different versions of the VON Data Pipeline.')
    parent_pipeline.add(von_data_pipeline())
    parent_pipeline.add(von_data_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')
    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(von_data_pipeline_initial_load())
    init_pipeline.add(von_data_pipeline_post_credentials())
    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(
        id='test_and_demo_tasks',
        description='Holder for test and demo tasks.')
    test_pipeline.add(von_data_init_test_data())
    test_pipeline.add(von_data_test_registrations())
    test_pipeline.add(von_data_pipeline_single_thread())
    parent_pipeline.add(test_pipeline)

    return parent_pipeline

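# A hedged runner sketch for the VON root pipeline, modeled on the BC
# Registries runner modules elsewhere in this section. The module path
# 'von_pipeline.von_pipelines' and the 'vondb' host default are assumptions,
# not confirmed by the source.
import os

from data_integration.ui.cli import run_pipeline
import mara_db.config
import mara_db.dbs

from von_pipeline.von_pipelines import von_root_pipeline  # assumed module path

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(
        user=os.environ.get('MARA_DB_USER', 'mara_db'),
        password=os.environ.get('MARA_DB_PASSWORD'),
        host=os.environ.get('MARA_DB_HOST', 'vondb'),  # assumed default
        database=os.environ.get('MARA_DB_DATABASE', 'mara_db'),
        port=os.environ.get('MARA_DB_PORT', '5432'))}

run_pipeline(von_root_pipeline())
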
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

    ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL,
    value     TEXT NOT NULL,
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

    commands = []
    with mara_db.postgresql.postgres_cursor_context(self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
        # find all text-like and enum columns of the source table
        cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT typname, nspname
    FROM pg_type
    JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
    JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid)

SELECT column_name
FROM information_schema.columns
LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
WHERE table_schema = {'%s'}
  AND table_name = {'%s'}
  AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

        i = 0
        for column_name, in cursor.fetchall():
            i += 1
            ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
            commands.append(
                ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i}
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value
    ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

    sub_pipeline.add_initial(
        Task(id='create_table',
             description='Creates the attributes table',
             commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

    chunk_size = math.ceil(len(commands) / (2 * data_integration.config.max_number_of_parallel_tasks()))
    for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
        task = Task(id=str(n), description='Process a portion of the attributes')
        task.add_commands(chunk)
        sub_pipeline.add(task)

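# The final loop above spreads the per-column commands over roughly twice as
# many tasks as there are parallel slots. The arithmetic, with made-up values:
import math
import more_itertools

commands = list(range(25))   # pretend 25 per-column INSERT commands were built
max_parallel_tasks = 4       # stand-in for data_integration.config.max_number_of_parallel_tasks()

chunk_size = math.ceil(len(commands) / (2 * max_parallel_tasks))  # ceil(25 / 8) = 4
chunks = list(more_itertools.chunked(commands, chunk_size))
assert len(chunks) == 7      # six full chunks of 4 commands plus one chunk of 1
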
import os

from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration

from bcreg.bcreg_pipelines import bc_reg_pipeline

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')
parent_pipeline.add(bc_reg_pipeline())

run_pipeline(parent_pipeline)

import os

from data_integration.commands.bash import RunBash
from data_integration.commands.python import ExecutePython
from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline, run_interactively

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration

from bcreg.bcreg_pipelines import db_init_pipeline

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')
parent_pipeline.add(db_init_pipeline())

run_pipeline(parent_pipeline)

    ]))

read_download_file_dependencies = ["create_download_data_table.sql", "create_data_schema.sql"]

pipeline.add(
    ParallelReadFile(
        id="read_download",
        description="Loads PyPI downloads from pre_downloaded csv files",
        file_pattern="*/*/*/pypi/downloads-v1.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
        compression=Compression.GZIP,
        target_table="pypi_data.download",
        delimiter_char="\t",
        skip_header=True,
        csv_format=True,
        file_dependencies=read_download_file_dependencies,
        date_regex=r"^(?P<year>\d{4})\/(?P<month>\d{2})\/(?P<day>\d{2})/",
        partition_target_table_by_day_id=True,
        timezone="UTC",
        commands_before=[
            ExecuteSQL(sql_file_name="create_download_data_table.sql",
                       file_dependencies=read_download_file_dependencies)
        ]))

pipeline.add(ParallelExecuteSQL(
    id="preprocess_project_version",
    description='Assigns unique ids to projects and versions',
    commands_before=[
        ExecuteSQL(sql_file_name="preprocess_project_version_1.sql")