Example #1
    def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
        files = []  # A list of (file_name, date_or_file_name) tuples
        data_dir = config.data_dir()
        first_date = config.first_date()

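        # collect all files under the data directory that match the file
        # pattern, keeping their paths relative to the data directory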
        for file in glob.iglob(str(pathlib.Path(data_dir) /
                                   self.file_pattern)):
            file = str(pathlib.Path(file).relative_to(pathlib.Path(data_dir)))
            if self.date_regex:
                match = re.match(self.date_regex, file)
                if not match:
                    raise Exception(
                        f'file name "{file}" does not match '
                        f'date regex "{self.date_regex}"')
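                # the date regex must capture exactly year, month and day,
                # in that order, e.g. r'(\d{4})-(\d{2})-(\d{2})'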
                date = datetime.date(*[int(group) for group in match.groups()])
                if date >= first_date:
                    files.append((file, date))
            else:
                files.append((file, file))

        # sort newest first: by date when a date regex is provided,
        # by file name otherwise
        files.sort(key=lambda x: x[1], reverse=True)

        # remove the latest file when requested (the newest file comes first)
        if self.read_mode == ReadMode.ONLY_NEW_EXCEPT_LATEST:
            files = files[1:]

        # take only latest file when requested
        if files and self.read_mode == ReadMode.ONLY_LATEST:
            files = files[:1]

        # for incremental loading, determine which files have already been
        # processed; reprocess all files when the file dependencies changed
        if (self.read_mode not in (ReadMode.ALL, ReadMode.ONLY_LATEST) and
            (not self.file_dependencies or not _file_dependencies.is_modified(
                self.path(), 'ParallelReadFile', self.parent.base_path(),
                self.file_dependencies))):
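            # mapping from file name to the timestamp of its last processing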
            processed_files = _processed_files.already_processed_files(
                self.path())

            files = [
                x for x in files
                if x[0] not in processed_files  # everything not yet read
                or (self.read_mode == ReadMode.ONLY_CHANGED  # or modified since
                    and self._last_modification_timestamp(x[0])
                    > processed_files[x[0]])
            ]

        if not files:
            logger.log('No newer files', format=logger.Format.ITALICS)
            return

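        # after the sub-pipeline succeeded, record the current state of the
        # file dependencies so that the next run can detect changes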
        if self.read_mode != ReadMode.ALL and self.file_dependencies:

            def update_file_dependencies():
                _file_dependencies.update(self.path(), 'ParallelReadFile',
                                          self.parent.base_path(),
                                          self.file_dependencies)
                return True

            sub_pipeline.final_node.commands.append(
                python.RunFunction(update_file_dependencies))

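        # aim for roughly twice as many chunks as parallel tasks,
        # presumably so that faster tasks can pick up remaining chunks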
        chunk_size = math.ceil(
            len(files) / (2 * config.max_number_of_parallel_tasks()))

        if self.partition_target_table_by_day_id:
            if not isinstance(mara_db.dbs.db(self.db_alias),
                              mara_db.dbs.PostgreSQLDB):
                raise NotImplementedError(
                    f'Partitioning by day_id has only been implemented for PostgreSQL so far, '
                    f'not for {mara_db.dbs.db(self.db_alias).__class__.__name__}')
            files_per_day = {}
            for file, date in files:
                files_per_day.setdefault(date, []).append(file)

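            # one inherited child table per day: the CHECK constraint allows
            # PostgreSQL to skip partitions when queries filter on day_id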
            sql_statement = ''
            for date in files_per_day:
                day_id = date.strftime("%Y%m%d")
                sql_statement += f'CREATE TABLE IF NOT EXISTS {self.target_table}_{day_id}'
                sql_statement += f' ( CHECK (day_id = {day_id}) ) INHERITS ({self.target_table});\n'

            create_partitions_task = pipelines.Task(
                id='create_partitions',
                description='Creates required target table partitions',
                commands=[
                    sql.ExecuteSQL(sql_statement=sql_statement,
                                   echo_queries=False,
                                   db_alias=self.db_alias)
                ])

            sub_pipeline.add(create_partitions_task)

            for n, chunk in enumerate(
                    more_itertools.chunked(files_per_day.items(), chunk_size)):
                task = pipelines.Task(
                    id=str(n), description='Reads a portion of the files')
                for day, day_files in chunk:
                    target_table = self.target_table + '_' + day.strftime(
                        "%Y%m%d")
                    for file in day_files:
                        task.add_commands(self.parallel_commands(file))
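                    # refresh planner statistics for the freshly loaded partition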
                    task.add_command(
                        sql.ExecuteSQL(
                            sql_statement=f'ANALYZE {target_table}'))
                sub_pipeline.add(task, ['create_partitions'])
        else:
            for n, chunk in enumerate(more_itertools.chunked(
                    files, chunk_size)):
                sub_pipeline.add(
                    pipelines.Task(
                        id=str(n),
                        description=f'Reads {len(chunk)} files',
                        commands=sum(
                            [self.parallel_commands(x[0]) for x in chunk],
                            [])))
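
A minimal, self-contained sketch of the chunk-size heuristic used above; the
names `file_names` and `max_parallel_tasks` are illustrative and not part of
the original code:

import math

import more_itertools

# ten hypothetical input files
file_names = [f'data/2023-01-{day:02d}.csv' for day in range(1, 11)]
max_parallel_tasks = 2

# ceil division yields at most 2 * max_parallel_tasks chunks
chunk_size = math.ceil(len(file_names) / (2 * max_parallel_tasks))  # -> 3

for n, chunk in enumerate(more_itertools.chunked(file_names, chunk_size)):
    print(n, chunk)  # chunk lengths: 3, 3, 3, 1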
Example #2
    def _last_modification_timestamp(self, file_name):
        return datetime.datetime.fromtimestamp(
            os.path.getmtime(pathlib.Path(config.data_dir()) / file_name))
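
Note that `os.path.getmtime` returns seconds since the epoch and
`datetime.datetime.fromtimestamp` converts them to a naive datetime in the
server's local timezone; the comparison against the stored timestamps of
already-processed files in `ONLY_CHANGED` mode therefore assumes both were
recorded under the same timezone.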