def demo_pipeline():
    """Returns a demo pipeline"""
    from data_integration.commands import bash, python

    pipeline = Pipeline(
        id='demo',
        description='A small pipeline that demonstrates the interplay between pipelines, tasks and commands')

    pipeline.add(Task(id='ping_localhost', description='Pings localhost',
                      commands=[bash.RunBash('ping -c 3 localhost')]))

    sub_pipeline = Pipeline(id='sub_pipeline', description='Pings a number of hosts')

    # one task per host; the commands of a task run sequentially
    for host in ['google', 'amazon', 'facebook']:
        sub_pipeline.add(Task(id=f'ping_{host}', description=f'Pings {host}',
                              commands=[bash.RunBash(f'ping -c 3 {host}.com'),
                                        python.RunFunction(lambda: 1)]))

    # run 'ping_facebook' only after 'ping_amazon' has succeeded
    sub_pipeline.add_dependency('ping_amazon', 'ping_facebook')

    # a task with an explicit upstream (and that fails when host 'foo' does not resolve)
    sub_pipeline.add(Task(id='ping_foo', description='Pings foo',
                          commands=[bash.RunBash('ping foo')]), ['ping_amazon'])

    pipeline.add(sub_pipeline, ['ping_localhost'])

    pipeline.add(Task(id='sleep', description='Sleeps for 2 seconds',
                      commands=[bash.RunBash('sleep 2')]), ['sub_pipeline'])

    return pipeline
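# For reference, a pipeline like the one above can be run programmatically. A minimal
# sketch: `run_pipeline` in `data_integration.ui.cli` is assumed to be this package's
# programmatic entry point; adjust the import if your version exposes it elsewhere.
from data_integration.ui.cli import run_pipeline

run_pipeline(demo_pipeline())  # runs all nodes of the pipeline and prints progress output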
def parallel_commands(self, file_name: str) -> [pipelines.Command]:
    return [self.read_command(file_name)] + (
        [python.RunFunction(
            function=lambda: _processed_files.track_processed_file(
                self.path(), file_name, self._last_modification_timestamp(file_name)))]
        if self.read_mode != ReadMode.ALL else [])
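# The incremental read modes rely on a simple contract of the `_processed_files`
# helper: `track_processed_file(node_path, file_name, timestamp)` records that a file
# was read at a given modification time, and `already_processed_files(node_path)`
# (used in add_parallel_tasks below) returns those records as a {file_name: timestamp}
# mapping. A hypothetical in-memory stand-in, for illustration only; the real module
# persists these records:
import datetime

_store = {}  # {node_path: {file_name: last modification timestamp at read time}}

def track_processed_file(node_path: str, file_name: str,
                         timestamp: datetime.datetime) -> bool:
    _store.setdefault(node_path, {})[file_name] = timestamp
    return True

def already_processed_files(node_path: str) -> {str: datetime.datetime}:
    return _store.get(node_path, {})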
def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
    parameters = self.parameter_function()

    if not isinstance(parameters, list):
        raise ValueError(f'parameter function should return a list, got "{repr(parameters)}"')

    for parameter in parameters:
        sub_pipeline.add(
            pipelines.Task(
                # derive a task id from the parameter value
                id=str(parameter).lower().replace(' ', '_').replace('-', '_'),
                description=f'Runs the function with parameter {repr(parameter)}',
                # bind `parameter` as a default argument so that each lambda captures
                # the current value rather than the loop variable
                commands=[python.RunFunction(lambda args=parameter: self.function(args))]))
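# The `args=parameter` default argument above matters: a plain closure would capture
# the loop variable by reference, so every task would run with the last parameter.
# A self-contained illustration of the difference:
late = [lambda: i for i in range(3)]
assert [f() for f in late] == [2, 2, 2]  # late binding: all lambdas see the final value of i

bound = [lambda i=i: i for i in range(3)]
assert [f() for f in bound] == [0, 1, 2]  # default argument: value bound per iteration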
def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
    files = []  # a list of (file_name, date_or_file_name) tuples
    data_dir = config.data_dir()
    first_date = config.first_date()

    # collect all files below data_dir that match the file pattern, keyed by their
    # date (when a date regex is configured) or by their file name (otherwise)
    for file in glob.iglob(str(pathlib.Path(data_dir) / self.file_pattern)):
        file = str(pathlib.Path(file).relative_to(pathlib.Path(data_dir)))
        if self.date_regex:
            match = re.match(self.date_regex, file)
            if not match:
                raise Exception(f'file name "{file}" \ndoes not match date regex "{self.date_regex}"')
            date = datetime.date(*[int(group) for group in match.groups()])
            if date >= first_date:
                files.append((file, date))
        else:
            files.append((file, file))

    # sort by date when a date regex is provided or by file name otherwise, newest first
    files.sort(key=lambda x: x[1], reverse=True)

    # remove the latest file when requested
    if self.read_mode == ReadMode.ONLY_NEW_EXCEPT_LATEST:
        files = files[1:]

    # take only the latest file when requested
    if files and self.read_mode == ReadMode.ONLY_LATEST:
        files = files[:1]

    # for incremental loading, determine which files have already been processed;
    # reprocess all files when the file dependencies changed
    if (self.read_mode not in (ReadMode.ALL, ReadMode.ONLY_LATEST)
            and (not self.file_dependencies
                 or not _file_dependencies.is_modified(self.path(), 'ParallelReadFile',
                                                       self.parent.base_path(),
                                                       self.file_dependencies))):
        processed_files = _processed_files.already_processed_files(self.path())
        files = [x for x in files
                 if x[0] not in processed_files  # everything not yet read
                 or (self.read_mode == ReadMode.ONLY_CHANGED  # everything modified since the last read
                     and self._last_modification_timestamp(x[0]) > processed_files[x[0]])]

    if not files:
        logger.log('No newer files', format=logger.Format.ITALICS)
        return

    # store the new file dependency hashes once the whole sub pipeline has run
    if self.read_mode != ReadMode.ALL and self.file_dependencies:
        def update_file_dependencies():
            _file_dependencies.update(self.path(), 'ParallelReadFile',
                                      self.parent.base_path(), self.file_dependencies)
            return True

        sub_pipeline.final_node.commands.append(python.RunFunction(update_file_dependencies))

    # spread the files over twice as many tasks as there are parallel task slots
    chunk_size = math.ceil(len(files) / (2 * config.max_number_of_parallel_tasks()))

    if self.partition_target_table_by_day_id:
        if not isinstance(mara_db.dbs.db(self.db_alias), mara_db.dbs.PostgreSQLDB):
            raise NotImplementedError(
                f'Partitioning by day_id has only been implemented for PostgreSQL so far, \n'
                f'not for {mara_db.dbs.db(self.db_alias).__class__.__name__}')

        # group the files by day
        files_per_day = {}
        for (file, date) in files:
            files_per_day.setdefault(date, []).append(file)

        # create one target table partition per day (inheritance-based partitioning)
        sql_statement = ''
        for date in files_per_day.keys():
            sql_statement += f'CREATE TABLE IF NOT EXISTS {self.target_table}_{date.strftime("%Y%m%d")}'
            sql_statement += f' ( CHECK (day_id = {date.strftime("%Y%m%d")}) ) INHERITS ({self.target_table});\n'

        create_partitions_task = pipelines.Task(
            id='create_partitions',
            description='Creates required target table partitions',
            commands=[sql.ExecuteSQL(sql_statement=sql_statement, echo_queries=False,
                                     db_alias=self.db_alias)])
        sub_pipeline.add(create_partitions_task)

        # one task per chunk of days, each depending on the partition creation
        for n, chunk in enumerate(more_itertools.chunked(files_per_day.items(), chunk_size)):
            task = pipelines.Task(id=str(n), description='Reads a portion of the files')
            for (day, day_files) in chunk:  # renamed from `files` to avoid shadowing the outer list
                target_table = self.target_table + '_' + day.strftime("%Y%m%d")
                for file in day_files:
                    task.add_commands(self.parallel_commands(file))
                task.add_command(sql.ExecuteSQL(sql_statement=f'ANALYZE {target_table}'))
            sub_pipeline.add(task, ['create_partitions'])
    else:
        for n, chunk in enumerate(more_itertools.chunked(files, chunk_size)):
            sub_pipeline.add(
                pipelines.Task(id=str(n), description=f'Reads {len(chunk)} files',
                               commands=sum([self.parallel_commands(x[0]) for x in chunk], [])))
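# For context, a hypothetical configuration of this node in a pipeline. The
# constructor arguments are inferred from the attributes used above (file_pattern,
# read_mode, date_regex, target_table, partition_target_table_by_day_id, db_alias);
# the exact signature, in particular the parameters describing the read command
# itself, may differ in your version, so treat this as a sketch.
from data_integration.parallel_tasks.files import ParallelReadFile, ReadMode

pipeline.add(
    ParallelReadFile(
        id='read_events',
        description='Reads event files that have not been processed yet',
        file_pattern='events/*/*/*/events.csv',            # searched below config.data_dir()
        read_mode=ReadMode.ONLY_NEW,                       # skip already-processed files
        date_regex=r'^events/(\d{4})/(\d{2})/(\d{2})/',    # groups must yield year, month, day
        target_table='data.events',
        partition_target_table_by_day_id=True,             # requires a PostgreSQL db alias
        db_alias='dwh'))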