Code example #1
File: pipelines.py, Project: xia0204/data-integration
def demo_pipeline():
    """Returns a demo pipeline"""
    from data_integration.commands import bash, python
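    # Pipeline and Task are defined in this same module (data_integration/pipelines.py),
    # hence no import here; when copying this snippet elsewhere, use:
    #   from data_integration.pipelines import Pipeline, Task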
    pipeline = Pipeline(id='demo',
                        description='A small pipeline that demonstrates the interplay between pipelines, tasks and commands')

    pipeline.add(Task(id='ping_localhost', description='Pings localhost',
                      commands=[bash.RunBash('ping -c 3 localhost')]))

    sub_pipeline = Pipeline(id='sub_pipeline', description='Pings a number of hosts')

    for host in ['google', 'amazon', 'facebook']:
        sub_pipeline.add(Task(id=f'ping_{host}', description=f'Pings {host}',
                              commands=[bash.RunBash(f'ping -c 3 {host}.com'),
                                        python.RunFunction(lambda: 1)]))
    sub_pipeline.add_dependency('ping_amazon', 'ping_facebook')
    sub_pipeline.add(Task(id='ping_foo', description='Pings foo',
                          commands=[bash.RunBash('ping foo')]), ['ping_amazon'])

    pipeline.add(sub_pipeline, ['ping_localhost'])

    pipeline.add(Task(id='sleep', description='Sleeps for 2 seconds',
                      commands=[bash.RunBash('sleep 2')]), ['sub_pipeline'])

    return pipeline
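A quick usage sketch, assuming this fork matches upstream mara data-integration, where pipelines can be executed headlessly via data_integration.ui.cli.run_pipeline (treat that import path as an assumption and check the fork before relying on it):

from data_integration.ui.cli import run_pipeline  # assumed upstream location

# runs all nodes, respecting task/sub-pipeline dependencies;
# returns True when every node succeeded
succeeded = run_pipeline(demo_pipeline())
print('demo succeeded' if succeeded else 'demo failed')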
Code example #2
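The method below fills in a parallel pipeline stage that mirrors a PostgreSQL schema into a target database: it first re-creates the schema DDL and function definitions, then copies table contents in parallel chunks balanced by relation size, and finally re-creates indexes the same way. It assumes the surrounding module's imports: shlex, the mara_db modules (dbs, postgresql, shell), bash and ExecuteSQL from data_integration.commands, and Pipeline and Task from data_integration.pipelines.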
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        source_db = mara_db.dbs.db(self.source_db_alias)
        target_db = mara_db.dbs.db(self.target_db_alias)
        assert isinstance(source_db, mara_db.dbs.PostgreSQLDB)
        assert isinstance(target_db, mara_db.dbs.PostgreSQLDB)

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias) as cursor:
            pg_version = cursor.connection.server_version

        ddl_task = Task(
            id='create_tables_and_functions',
            description='Re-creates the schema, table structure and functions on the target db',
            commands=[
                # schema and table structure
                bash.RunBash(
                    command="(echo 'DROP SCHEMA IF EXISTS " +
                    self.schema_name + " CASCADE;';\\\n" +
                    "    pg_dump --username="******" --host=" +
                    source_db.host + " --schema=" + self.schema_name +
                    " --section=pre-data --no-owner --no-privileges " +
                    source_db.database + ") \\\n" + "  | " +
                    mara_db.shell.query_command(
                        self.target_db_alias, echo_queries=False) + ' --quiet'
                ),

                # function definitions
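                # PostgreSQL 11 (server_version >= 110000) replaced pg_proc.proisagg
                # with pg_proc.prokind, hence the version-dependent WHERE clause below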
                bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid),';') AS def 
FROM (SELECT oid, * 
      FROM pg_proc p 
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc, pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
     AND nspname = '{self.schema_name}'" \\\n''' + "  | " +
                             mara_db.shell.copy_to_stdout_command(
                                 self.source_db_alias) + ' \\\n' + "  | " +
                             mara_db.shell.query_command(self.target_db_alias,
                                                         echo_queries=False))
            ])
        sub_pipeline.add(ddl_task)

        # copy content of tables
        number_of_chunks = self.max_number_of_parallel_tasks * 3
        table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
        current_size_per_table_copy_chunk = [0] * number_of_chunks
        table_types = {}

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias
        ) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(
                """
SELECT 
    pg_class.relname AS table,
    relkind,
    CASE WHEN relkind = 'f' 
         THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy 
         ELSE  pg_total_relation_size(pg_class.oid)
    END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name +
                """' AND relkind IN ('r', 'f') AND relhassubclass = 'f'
ORDER BY size DESC""")
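            # greedy balancing: tables arrive largest first; each is appended to
            # the chunk with the smallest accumulated size so far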
            for table_name, table_type, size in cursor.fetchall():
                smallest_chunk_index = min(
                    range(len(current_size_per_table_copy_chunk)),
                    key=current_size_per_table_copy_chunk.__getitem__)
                current_size_per_table_copy_chunk[smallest_chunk_index] += size
                table_copy_chunks[smallest_chunk_index].append(table_name)
                table_types[table_name] = table_type

            copy_tasks = []
            for i, tables in table_copy_chunks.items():
                if tables:
                    task = Task(
                        id=f'copy_tables_{i}',
                        description='Copies table content to the frontend db',
                        commands=[
                            bash.RunBash(
                                command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                + '  | ' +
                                mara_db.shell.copy_to_stdout_command(
                                    self.source_db_alias) + ' \\\n' + '  | ' +
                                mara_db.shell.copy_from_stdin_command(
                                    self.target_db_alias,
                                    target_table=f'{self.schema_name}.{table_name}'))
                            for table_name in tables
                        ])
                    copy_tasks.append(task)
                    sub_pipeline.add(task, upstreams=[ddl_task])

            # create indexes
            index_chunks = {i: [] for i in range(0, number_of_chunks)}
            current_size_per_index_chunk = [0] * number_of_chunks

            with mara_db.postgresql.postgres_cursor_context(
                    self.source_db_alias) as cursor:
                cursor.execute(""" 
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
                for ddl, size in cursor.fetchall():
                    smallest_chunk_index = min(
                        range(len(current_size_per_index_chunk)),
                        key=current_size_per_index_chunk.__getitem__)
                    current_size_per_index_chunk[smallest_chunk_index] += size
                    index_chunks[smallest_chunk_index].append(ddl)

            for i, index_statements in index_chunks.items():
                if index_statements:
                    index_task = Task(
                        id=f'add_indexes_{i}',
                        description='Re-creates indexes on frontend db',
                        commands=[
                            ExecuteSQL(sql_statement=statement,
                                       db_alias=self.target_db_alias)
                            for statement in index_statements
                        ])
                    sub_pipeline.add(index_task, upstreams=copy_tasks)
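The same greedy size-balancing pattern appears twice above, once for table copies and once for index DDL. A minimal, self-contained sketch of that pattern (balance_into_chunks is a hypothetical helper, not part of data_integration or mara_db):

from typing import Dict, List, Tuple

def balance_into_chunks(items: List[Tuple[str, float]],
                        number_of_chunks: int) -> Dict[int, List[str]]:
    """Distributes (name, size) items over chunks so chunk sizes stay similar.

    Sorting items by size, descending, gives the classic greedy guarantee."""
    chunks: Dict[int, List[str]] = {i: [] for i in range(number_of_chunks)}
    sizes = [0.0] * number_of_chunks
    for name, size in items:
        # append each item to the chunk with the smallest accumulated size
        smallest = min(range(number_of_chunks), key=sizes.__getitem__)
        sizes[smallest] += size
        chunks[smallest].append(name)
    return chunks

print(balance_into_chunks([('a', 90.0), ('b', 60.0), ('c', 50.0), ('d', 40.0)], 3))
# -> {0: ['a'], 1: ['b'], 2: ['c', 'd']}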