def bc_reg_pipeline_single_thread():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_event_processor_single_thread',
        description='A pipeline that processes BC Registries events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_data_single_thread',
        description='Load BC Reg data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_single_thread',
             description='Load BC Registries data',
             # NOTE: kept verbatim; this path has no .py extension and echoes
             # the previous task's id, so it looks like a typo in the source
             commands=[ExecutePython('./bcreg/register_un_processed_events')]),
        ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_bc_reg_credentials_single_thread',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds-single-thread.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_bc_reg_data_single_thread'])

    return pipeline1

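# The second positional argument to Pipeline.add() is a list of upstream node
# ids; that is how all of these pipeline definitions wire their stages
# together. A minimal, self-contained sketch of the pattern (the ids and echo
# commands below are hypothetical, not part of the BC Registries pipelines):
from data_integration.commands.bash import RunBash
from data_integration.pipelines import Pipeline, Task

demo = Pipeline(id='dependency_demo', description='Shows upstream wiring')
demo.add(Task(id='first', description='Runs first',
              commands=[RunBash(command='echo first')]))
demo.add(Task(id='second', description='Runs after "first"',
              commands=[RunBash(command='echo second')]),
         ['first'])  # upstream ids: 'second' runs only after 'first' completes
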
def bc_init_test_data():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_data',
        description='A pipeline that initializes the event processor database for testing.')
    pipeline.add(
        Task(id='register_test_corps',
             description='Insert some test data for processing',
             commands=[ExecutePython('./bcreg/insert-test.py')]))
    return pipeline

def bc_reg_test_corps():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_corps',
        description='A pipeline that queues up a small set of test corporations.')
    pipeline.add(
        Task(id='register_test_corps',
             description='Register some test corps for processing',
             commands=[ExecutePython('./bcreg/find-test-corps.py')]))
    return pipeline

def bc_reg_populate_audit_table():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_populate_audit_table',
        description='Populate Audit Table.')
    pipeline.add(
        Task(id='populate_audit_table',
             description='Populate audit table.',
             commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    return pipeline

def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')
    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))
    return pipeline

def von_list_mongo_data():
    import von_pipeline

    pipeline = Pipeline(
        id='von_list_mongo_data',
        description='A pipeline that lists data in MongoDB.')
    pipeline.add(
        Task(id='list_mongo_data',
             description='List data queued for processing',
             commands=[ExecutePython('./von_pipeline/list_mongo_data.py')]))
    return pipeline

def db_init_pipeline():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_db_init',
        description='Initialize BC Registries Event Processor database')
    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./bcreg/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./bcreg/insert.py')]),
        ['create_tables'])
    return pipeline

def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_bc_reg_credentials_a',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1

def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description="Creates a utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql', echo_queries=False,
                            replace={'number_of_chunks': lambda: config.number_of_chunks()})
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension + '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in ['consistency_checks', 'data_sets', 'partitioning',
                                        'indexes_and_constraints', 'schema_switching', 'enums']:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')
    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline

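# A hedged usage sketch: mounting the utils pipeline under a root pipeline,
# as root_pipeline() further below does with its 'utils' sub-pipeline. The
# flag values and the 'example_root' id are illustrative; enable hll or
# cstore_fdw only when the target database has those extensions installed.
from data_integration.pipelines import Pipeline

utils = utils_pipeline(with_hll=True, with_cstore_fdw=False)

root = Pipeline(id='example_root', description='Hypothetical root pipeline')
root.add(utils)
# Downstream pipelines can then declare it as an upstream by id:
# root.add(some_other_pipeline, upstreams=['initialize_utils'])
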
def von_data_pipeline():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_event_processor',
        description='A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='create_von_data_credentials',
             description='Create credentials',
             commands=[ExecutePython('./von_pipeline/generate-creds.py')]))
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_von_data_data'])

    return pipeline1

def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_bc_reg_credentials_a',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    sub_pipeline1_4 = Pipeline(
        id='populate_evp_audit_table_a',
        description='Populate Event Processor Audit Table')
    sub_pipeline1_4.add(
        Task(id='populate_audit_table_a',
             description='Populate Audit Table',
             commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    pipeline1.add(sub_pipeline1_4, ['submit_bc_reg_credentials_a'])

    return pipeline1

def von_data_pipeline_status():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_pipeline_status',
        description='Display overall event processing status.')
    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./von_pipeline/display_pipeline_status.py')]))
    return pipeline

def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')
    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))
    # Removed from the pipeline due to issues connecting to DBs on OpenShift:
    # pipeline.add(
    #     Task(id='display_pipeline_stats',
    #          description='Display stats of each stage in the pipeline processing',
    #          commands=[ExecutePython('./bcreg/display_processed_corps_counts.py')]))
    pipeline.add(
        Task(id='display_event_processor_stats',
             description='Display stats of each event processor stage',
             commands=[ExecutePython('./bcreg/display_event_processor_counts.py')]))
    return pipeline

def von_data_pipeline_post_credentials():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_a',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1

def db_init_pipeline():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_db_init',
        description='Initialize von_data Event Processor database')
    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./von_pipeline/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./von_pipeline/insert.py')]),
        ['create_tables'])
    return pipeline

def bc_reg_pipeline_initial_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_corp_loader',
        description='A pipeline that does the initial data load and credentials for all corporations.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_corps',
        description='Load Active BC Reg corps and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_corps',
             description='Register un-processed active corps',
             commands=[ExecutePython('./bcreg/find-unprocessed-corps_actve.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_a',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps-generate-creds.py')]),
        ['register_un_processed_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1

def bc_reg_pipeline_bn_credential_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_bn_loader',
        description='A pipeline that creates BN credentials for all existing corporations.')

    sub_pipeline1_2 = Pipeline(
        id='load_existing_corps_no_bn',
        description='Load BC Reg corps with no BN credential')
    sub_pipeline1_2.add(
        Task(id='register_un_bned_corps',
             description='Register corps with no BN',
             commands=[ExecutePython('./bcreg/find-un-bned-corps.py')]))
    sub_pipeline1_2.add(
        Task(id='load_corp_bn_data',
             description='Load BN credentials from company data',
             commands=[ExecutePython('./bcreg/process-corps-generate-bn-creds.py')]),
        ['register_un_bned_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1

def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description="Loads daily Euro exchange rates since 1999 from the European Central Bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates the currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql', echo_queries=False)
             ]))

    pipeline.add(
        Task(id='load_exchange_rate',
             description='Loads exchange rates from the European Central Bank',
             commands=[
                 ReadScriptOutput(file_name='load_exchange_rate.py',
                                  target_table='euro_fx.exchange_rate',
                                  # assumed: the db_alias parameter is meant to be
                                  # used here instead of a hard-coded 'mdwh-etl'
                                  db_alias=db_alias)
             ]),
        upstreams=['create_schema_and_table'])

    pipeline.add(
        Task(id="postprocess_exchange_rate",
             description="Adds values for missing days",
             commands=[
                 ExecuteSQL(sql_file_name='postprocess_exchange_rate.sql', echo_queries=False)
             ]),
        upstreams=['load_exchange_rate'])

    return pipeline

def von_data_pipeline_single_thread():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_pipeline_single_thread',
        description='A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data_single_thread',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[ExecutePython('./von_pipeline/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_von_data_data_single_thread',
             description='Load von_data data',
             # NOTE: kept verbatim; this path has no .py extension and echoes
             # the previous task's id, so it looks like a typo in the source
             commands=[ExecutePython('./von_pipeline/register_un_processed_events')]),
        ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_single_thread',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds-single-thread.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_von_data_data_single_thread'])

    return pipeline1

def root_pipeline():
    import app.data_integration.pipelines.github
    import app.data_integration.pipelines.pypi
    import app.data_integration.pipelines.utils
    import app.data_integration.pipelines.python_projects

    pipeline = Pipeline(
        id='mara_example_project',
        description='An example pipeline that integrates PyPI download stats with the Github activity of a project')

    pipeline.add(app.data_integration.pipelines.utils.pipeline)
    pipeline.add(app.data_integration.pipelines.pypi.pipeline, upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.github.pipeline, upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.python_projects.pipeline,
                 upstreams=['pypi', 'github'])
    return pipeline

def bc_reg_pipeline_jsonbender():
    import bcreg

    pipeline2 = Pipeline(
        id='bc_reg_event_processor_json_transform_demo',
        description='A demo pipeline that processes events and generates credentials using JSONBender.')

    sub_pipeline2_2 = Pipeline(
        id='load_and_process_bc_reg_data',
        description='Load BC Reg data and generate credentials')
    sub_pipeline2_2.add(
        Task(id='register_un_processed_events',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline2_2.add(
        Task(id='load_bc_reg_data',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps.py')]),
        ['register_un_processed_events'])
    sub_pipeline2_2.add(
        Task(id='create_credentials_jsonbender',
             description='Create credentials using JSONBender transform',
             commands=[ExecutePython('./bcreg/generate-creds-bender.py')]),
        ['load_bc_reg_data'])
    pipeline2.add(sub_pipeline2_2)

    sub_pipeline2_3 = Pipeline(
        id='submit_bc_reg_credentials',
        description='Submit BC Reg credentials to P-X')
    sub_pipeline2_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline2.add(sub_pipeline2_3, ['load_and_process_bc_reg_data'])

    return pipeline2

def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    source_db = mara_db.dbs.db(self.source_db_alias)
    target_db = mara_db.dbs.db(self.target_db_alias)
    assert isinstance(source_db, mara_db.dbs.PostgreSQLDB)
    assert isinstance(target_db, mara_db.dbs.PostgreSQLDB)

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        pg_version = cursor.connection.server_version

    ddl_task = Task(
        id='create_tables_and_functions',
        description='Re-creates the schema, table structure and functions on the target db',
        commands=[
            # schema and table structure
            bash.RunBash(
                command="(echo 'DROP SCHEMA IF EXISTS " + self.schema_name + " CASCADE;';\\\n"
                        + " pg_dump --username="******" --host=" + source_db.host  # username redacted in the source
                        + " --schema=" + self.schema_name
                        + " --section=pre-data --no-owner --no-privileges "
                        + source_db.database + ") \\\n"
                        + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False)
                        + ' --quiet'),
            # function definitions
            bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid), ';') AS def
FROM (SELECT oid, *
      FROM pg_proc p
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc,
     pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
  AND nspname = '{self.schema_name}'" \\\n'''
                                 + " | " + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                 + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False))
        ])
    sub_pipeline.add(ddl_task)

    # copy content of tables: spread tables over chunks, balanced by size
    number_of_chunks = self.max_number_of_parallel_tasks * 3
    table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_table_copy_chunk = [0] * number_of_chunks
    table_types = {}

    with mara_db.postgresql.postgres_cursor_context(
            self.source_db_alias) as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute("""
SELECT pg_class.relname AS table,
       relkind,
       CASE WHEN relkind = 'f'
            THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy
            ELSE pg_total_relation_size(pg_class.oid)
       END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name + """'
  AND relkind IN ('r', 'f')
  AND relhassubclass = 'f'
ORDER BY size DESC""")
        for table_name, type, size in cursor.fetchall():
            # greedy assignment: put each table into the currently smallest chunk
            smallest_chunk_index = min(range(len(current_size_per_table_copy_chunk)),
                                       key=current_size_per_table_copy_chunk.__getitem__)
            current_size_per_table_copy_chunk[smallest_chunk_index] += size
            table_copy_chunks[smallest_chunk_index].append(table_name)
            table_types[table_name] = type

    copy_tasks = []
    for i, tables in table_copy_chunks.items():
        if tables:
            task = Task(
                id=f'copy_tables_{i}',
                description='Copies table content to the frontend db',
                commands=[
                    RunBash(command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                    + ' | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                    + ' | ' + mara_db.shell.copy_from_stdin_command(
                                        self.target_db_alias,
                                        target_table=f'{self.schema_name}.{table_name}'))
                    for table_name in tables
                ])
            copy_tasks.append(task)
            sub_pipeline.add(task, upstreams=[ddl_task])

    # create indexes, again balanced by size across chunks
    index_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_index_chunk = [0] * number_of_chunks

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        cursor.execute("""
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """'
  AND relkind = 'i'
ORDER BY size DESC;""")
        for ddl, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_index_chunk)),
                                       key=current_size_per_index_chunk.__getitem__)
            current_size_per_index_chunk[smallest_chunk_index] += size
            index_chunks[smallest_chunk_index].append(ddl)

    for i, index_statements in index_chunks.items():
        if index_statements:
            index_task = Task(
                id=f'add_indexes_{i}',
                description='Re-creates indexes on frontend db',
                commands=[ExecuteSQL(sql_statement=statement, db_alias=self.target_db_alias)
                          for statement in index_statements])
            sub_pipeline.add(index_task, upstreams=copy_tasks)

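# Both chunk assignments above use the same greedy heuristic: take items
# largest-first and put each one into the chunk with the smallest accumulated
# size so far. A standalone sketch of that balancing step (names are
# illustrative, not part of the pipeline code):
def balance_into_chunks(sizes_by_name: dict, number_of_chunks: int) -> dict:
    """Greedy longest-processing-time assignment of sized items to chunks."""
    chunks = {i: [] for i in range(number_of_chunks)}
    chunk_sizes = [0] * number_of_chunks
    for name, size in sorted(sizes_by_name.items(), key=lambda kv: -kv[1]):
        smallest = min(range(number_of_chunks), key=chunk_sizes.__getitem__)
        chunk_sizes[smallest] += size
        chunks[smallest].append(name)
    return chunks

# balance_into_chunks({'a': 9, 'b': 5, 'c': 4, 'd': 1}, 2)
# -> {0: ['a', 'd'], 1: ['b', 'c']}   (chunk sizes 10 vs. 9)
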
def bc_reg_root_pipeline():
    import bcreg

    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description='Holder for the different versions of the BC Registries pipeline.')
    parent_pipeline.add(bc_reg_pipeline())
    parent_pipeline.add(bc_reg_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')
    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(bc_reg_pipeline_initial_load())
    init_pipeline.add(bc_reg_pipeline_post_credentials())
    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(
        id='test_and_demo_tasks',
        description='Holder for test and demo tasks.')
    test_pipeline.add(bc_init_test_data())
    test_pipeline.add(bc_reg_test_corps())
    test_pipeline.add(bc_reg_pipeline_single_thread())
    test_pipeline.add(bc_reg_pipeline_jsonbender())
    parent_pipeline.add(test_pipeline)

    return parent_pipeline

import os

from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration

from bcreg.bcreg_pipelines import bc_reg_pipeline_initial_load

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')
parent_pipeline.add(bc_reg_pipeline_initial_load())

run_pipeline(parent_pipeline)

pipeline = Pipeline(
    id="python_projects",
    description="Combines PyPI downloads and github activities to a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(id="extract_python_repo_activity",
         description='Extracts activity metrics for github repos that have a corresponding pypi package (by name)',
         commands=[ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")]))

pipeline.add(ParallelExecuteSQL(
    id="transform_python_project_activity",
    description="Aggregates downloads at project level and combines them with github activity metrics",
    commands_before=[ExecuteSQL(sql_file_name="transform_python_project_activity.sql")],
    sql_statement="SELECT pp_tmp.insert_python_project_activity(@chunk@::SMALLINT);",
    parameter_function=etl_tools.utils.chunk_parameter_function,
    parameter_placeholders=["@chunk@"]),

def von_root_pipeline():
    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description='Holder for the different versions of the VON Data Pipeline.')
    parent_pipeline.add(von_data_pipeline())
    parent_pipeline.add(von_data_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')
    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(von_data_pipeline_initial_load())
    init_pipeline.add(von_data_pipeline_post_credentials())
    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(
        id='test_and_demo_tasks',
        description='Holder for test and demo tasks.')
    test_pipeline.add(von_data_init_test_data())
    test_pipeline.add(von_data_test_registrations())
    test_pipeline.add(von_data_pipeline_single_thread())
    parent_pipeline.add(test_pipeline)

    return parent_pipeline

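# A hedged runner sketch for the VON root pipeline, modeled on the BC
# Registries runner modules elsewhere in this section. The module path
# 'von_pipeline.von_pipelines' and the 'vondb' host default are assumptions,
# not confirmed by the source.
import os

from data_integration.ui.cli import run_pipeline
import mara_db.config
import mara_db.dbs

from von_pipeline.von_pipelines import von_root_pipeline  # assumed module path

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(
        user=os.environ.get('MARA_DB_USER', 'mara_db'),
        password=os.environ.get('MARA_DB_PASSWORD'),
        host=os.environ.get('MARA_DB_HOST', 'vondb'),  # assumed default
        database=os.environ.get('MARA_DB_DATABASE', 'mara_db'),
        port=os.environ.get('MARA_DB_PORT', '5432'))}

run_pipeline(von_root_pipeline())
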
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

    ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL,
    value     TEXT NOT NULL,
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

    commands = []
    with mara_db.postgresql.postgres_cursor_context(self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
        # find all text-like and enum columns of the source table
        cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT typname, nspname
    FROM pg_type
    JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
    JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid)

SELECT column_name
FROM information_schema.columns
LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
WHERE table_schema = {'%s'}
  AND table_name = {'%s'}
  AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

        i = 0
        for column_name, in cursor.fetchall():
            i += 1
            ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
            commands.append(
                ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i}
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value
    ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

    sub_pipeline.add_initial(
        Task(id='create_table',
             description='Creates the attributes table',
             commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

    chunk_size = math.ceil(len(commands) / (2 * data_integration.config.max_number_of_parallel_tasks()))
    for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
        task = Task(id=str(n), description='Process a portion of the attributes')
        task.add_commands(chunk)
        sub_pipeline.add(task)

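# The final loop above spreads the per-column commands over roughly twice as
# many tasks as there are parallel slots. The arithmetic, with made-up values:
import math
import more_itertools

commands = list(range(25))   # pretend 25 per-column INSERT commands were built
max_parallel_tasks = 4       # stand-in for data_integration.config.max_number_of_parallel_tasks()

chunk_size = math.ceil(len(commands) / (2 * max_parallel_tasks))  # ceil(25 / 8) = 4
chunks = list(more_itertools.chunked(commands, chunk_size))
assert len(chunks) == 7      # six full chunks of 4 commands plus one chunk of 1
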
import os

from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration

from bcreg.bcreg_pipelines import bc_reg_pipeline

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')
parent_pipeline.add(bc_reg_pipeline())

run_pipeline(parent_pipeline)

import os

from data_integration.commands.bash import RunBash
from data_integration.commands.python import ExecutePython
from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline, run_interactively

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration

from bcreg.bcreg_pipelines import db_init_pipeline

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')
parent_pipeline.add(db_init_pipeline())

run_pipeline(parent_pipeline)

    ]))

read_download_file_dependencies = ["create_download_data_table.sql", "create_data_schema.sql"]

pipeline.add(
    ParallelReadFile(
        id="read_download",
        description="Loads PyPI downloads from pre_downloaded csv files",
        file_pattern="*/*/*/pypi/downloads-v1.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
        compression=Compression.GZIP,
        target_table="pypi_data.download",
        delimiter_char="\t",
        skip_header=True,
        csv_format=True,
        file_dependencies=read_download_file_dependencies,
        date_regex=r"^(?P<year>\d{4})\/(?P<month>\d{2})\/(?P<day>\d{2})/",
        partition_target_table_by_day_id=True,
        timezone="UTC",
        commands_before=[
            ExecuteSQL(sql_file_name="create_download_data_table.sql",
                       file_dependencies=read_download_file_dependencies)
        ]))

pipeline.add(ParallelExecuteSQL(
    id="preprocess_project_version",
    description='Assigns unique ids to projects and versions',
    commands_before=[
        ExecuteSQL(sql_file_name="preprocess_project_version_1.sql")