Example #1
from typing import Iterable, Iterator, List, Tuple, Union

def dispatch_data(num_p: int,
                  data_iter: Union[Iterator, Iterable, List],
                  args: Tuple,
                  print_out=__debug__):
    if num_p <= 0:
        raise ValueError(
            f"The number of processes specified in `nump_p` must be positive, but it is {num_p}."
        )

    tic("Splitting task", verbose=print_out)
    splits = split_iter(it=data_iter, num_splits=num_p, use_tqdm=print_out)
    toc(print_out=print_out)

    num_p = len(splits)
    if num_p == 0:
        raise ValueError(
            "The number of data splits is zero; possibly no data was read from the provided iterator."
        )
    job_args = [None] * num_p
    for pidx in range(num_p):
        if print_out:
            hprint_pairs(('pid', pidx), ('workload', len(splits[pidx])))
        job_args[pidx] = (pidx, splits[pidx]) + args
    return job_args
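
`dispatch_data` leans on the library helper `split_iter`. The following is a minimal sketch of the assumed behavior (hypothetical, not the library's actual code): materialize the iterator and cut it into at most `num_splits` roughly equal lists, which is consistent with `num_p = len(splits)` being re-derived above.

# Hypothetical sketch of `split_iter`; the real helper lives in the
# surrounding library and may differ (e.g. `use_tqdm` showing progress).
def split_iter(it, num_splits, use_tqdm=False):
    items = list(it)
    base, extra = divmod(len(items), num_splits)
    splits, start = [], 0
    for i in range(num_splits):
        end = start + base + (1 if i < extra else 0)
        if end > start:  # fewer items than splits yields fewer splits
            splits.append(items[start:end])
        start = end
    return splits
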
Example #2
from os import path

def print_basic_path_info(*path_or_paths):
    for item in path_or_paths:
        if isinstance(item, str):
            hprint_pairs(("path", item), ("is file", path.isfile(item)),
                         ("exists", path.exists(item)))
        else:
            hprint_pairs((item[0], item[1]), ("is file", path.isfile(item[1])),
                         ("exists", path.exists(item[1])))
Example #3
from itertools import chain
from multiprocessing import Queue
from time import sleep

def __call__(self, pid, iq: Queue, data, *args):
    # Build a chunked iterator over the payload: either one iterator per
    # data item, or a single iterator over the whole payload.
    if self.pass_each_data_item:
        it = chain(*(chunk_iter(self.create_iterator(dataitem, *args),
                                chunk_size=self.chunk_size,
                                as_list=True) for dataitem in data))
    else:
        it = chunk_iter(self.create_iterator(data, *args),
                        chunk_size=self.chunk_size,
                        as_list=True)
    hprint_message('initialized', f'{self.name}{pid}')
    # Producer loop: keep the queue topped up; back off while it is full,
    # and exit once the iterator is exhausted.
    while True:
        while not iq.full():
            try:
                obj = next(it)
            except StopIteration:
                return
            iq.put(obj)
        hprint_pairs(('full queue for', f'{self.name}{pid}'),
                     ('wait for', self._wait_time))
        sleep(self._wait_time)
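
The producer above assumes a `chunk_iter` helper. A minimal sketch of the assumed contract (hypothetical, not the library's implementation): group an iterator into lists of up to `chunk_size` items, so each queue entry carries a whole batch.

# Hypothetical sketch of `chunk_iter`: batch an iterator into lists of up
# to `chunk_size` items.
from itertools import islice

def chunk_iter(it, chunk_size, as_list=True):
    # Only the `as_list=True` behavior used above is sketched here.
    it = iter(it)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            return
        yield chunk
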
Example #4
from typing import List, Tuple

def dispatch_files(num_p, file_paths: List[str], args: Tuple):
    num_files = len(file_paths)
    if __debug__:
        hprint_message(
            f"Dispatching {num_p} processes for {num_files} files ...")
    # Distribute files as evenly as possible; each of the first
    # `num_files_overflow` processes takes one extra file.
    num_files_per_process = num_files // num_p
    num_files_overflow = num_files - num_files_per_process * num_p
    file_idx_start = 0
    job_args = [None] * num_p
    for pidx in range(num_p):
        file_idx_end = file_idx_start + num_files_per_process + (
            pidx < num_files_overflow)
        if num_p == 1:
            curr_file_paths = file_paths
        elif pidx == num_p - 1:
            curr_file_paths = file_paths[file_idx_start:]
        else:
            curr_file_paths = file_paths[file_idx_start:file_idx_end]
            file_idx_start = file_idx_end
        if __debug__:
            hprint_pairs(('pid', pidx), ('num of files', len(curr_file_paths)))
        job_args[pidx] = (pidx, curr_file_paths) + args
    return job_args
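
The overflow arithmetic above gives one extra file to each of the first `num_files_overflow` processes. A standalone check of the resulting split sizes:

# Worked example of the split arithmetic: 10 files over 3 processes.
num_files, num_p = 10, 3
per_process = num_files // num_p             # 3
overflow = num_files - per_process * num_p   # 1
sizes = [per_process + (pidx < overflow) for pidx in range(num_p)]
assert sizes == [4, 3, 3] and sum(sizes) == num_files
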
Example #5
import platform
import warnings
from os import path

def _solve_multi_path(multi_path_str,
                      file_pattern=None,
                      multi_path_delimiter=DEFAULT_MULTI_PATH_DELIMITER,
                      sort=True,
                      verbose=__debug__):
    if verbose:
        hprint_message('solving multi-file paths from input', multi_path_str)

    # region STEP1: get all paths

    # Split the string on the multi-path delimiter; drive letters need special treatment on Windows.
    input_paths = [
        file_or_dir_path
        for file_or_dir_path in multi_path_str.split(multi_path_delimiter)
        if file_or_dir_path
    ]
    if (platform.system() == 'Windows' and multi_path_delimiter == ':'
            and len(input_paths[0]) == 1 and input_paths[0].isalpha()
            and input_paths[1][0] == '\\'):
        # Re-join a drive letter (e.g. 'C') with the path that follows it.
        input_paths = [f'{input_paths[0]}:{input_paths[1]}'] + input_paths[2:]

    # Replace the tail of the first path with each later segment to generate the actual subdir/file paths.
    for i in range(1, len(input_paths)):
        input_paths[i] = replace_path_tail(input_paths[0], input_paths[i])
    # endregion

    # region STEP2: check whether each path exists.
    path_exists = [False] * len(input_paths)
    has_available_path = False
    for path_idx, possible_path in enumerate(input_paths):
        path_exists[path_idx] = path.exists(possible_path)
        if path_exists[path_idx]:
            has_available_path = True
        if verbose:
            hprint_pairs(('path', possible_path),
                         ('exists', path_exists[path_idx]))
    # endregion

    # region STEP3: if `file_pattern` is specified, expand each existing
    # directory path into the files matching the pattern.
    if file_pattern:
        expanded_input_paths = []
        expanded_path_exists = []
        for input_path, path_exist in zip(input_paths, path_exists):
            if path_exist and path.isdir(input_path):
                files = get_files_by_pattern(input_path, file_pattern)
                if files:
                    expanded_input_paths.extend(files)
                    expanded_path_exists.extend([True] * len(files))
                    has_available_path = True
                    if verbose:
                        hprint_pairs(('extending path', input_path),
                                     ('pattern', file_pattern),
                                     ('num found files', len(files)))
            else:  # keep the original path if it does not exist or if it is a file
                expanded_input_paths.append(input_path)
                expanded_path_exists.append(path_exist)

        if not expanded_input_paths:
            warnings.warn(
                f"File pattern '{file_pattern}' was specified, but no files matching it were found."
            )

        input_paths = expanded_input_paths
        path_exists = expanded_path_exists

    # endregion

    # Return the resolved paths, their existence flags, and a single boolean indicating whether any of the paths exists.
    if sort:
        input_paths, path_exists = zip(*sorted(zip(input_paths, path_exists)))
    return input_paths, path_exists, has_available_path
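
A usage sketch (hypothetical paths, and assuming `DEFAULT_MULTI_PATH_DELIMITER` is ':'): every segment after the first replaces the tail of the first path.

# Hypothetical illustration of the multi-path expansion, assuming the
# delimiter is ':' and `replace_path_tail` swaps the final path segment:
#   '/data/logs/part1:part2:part3'
# resolves to
#   ['/data/logs/part1', '/data/logs/part2', '/data/logs/part3']
paths, path_exists, has_available_path = _solve_multi_path(
    '/data/logs/part1:part2:part3')
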
Example #6
import os
import shutil
from os import path

def get_mp_cache_files(num_p,
                       file_paths,
                       sort=True,
                       verbose=__debug__,
                       cache_dir_path=None,
                       chunk_size=100000,
                       sort_use_basename=False,
                       rebuild_on_change=True):
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    else:
        file_paths = paex.sort_paths(file_paths,
                                     sort=sort,
                                     sort_by_basename=sort_use_basename)

    num_file_paths = len(file_paths)
    if verbose:
        hprint_pairs(('number of files', num_file_paths), ('num_p', num_p))
    # With fewer files than processes, re-chunk the input lines into cache
    # files so every process can be given at least one file.
    if num_file_paths < num_p:
        if cache_dir_path is None:
            if len(file_paths) == 1:
                cache_dir_path = paex.add_to_main_name(file_paths[0],
                                                       prefix='.mp.')
            else:
                cache_dir_path = path.join(path.dirname(file_paths[0]), '.mp')
        cache_file_ext_name = paex.get_ext_name(file_paths[0])

        tic('Constructs multi-processing cache files at path ' +
            path.join(cache_dir_path, '*' + cache_file_ext_name))

        mp_cache_file_paths = None
        files_id_path = cache_dir_path + '.id'
        if path.exists(cache_dir_path):
            if path.exists(files_id_path):
                old_files_id = ioex.read_all_text(files_id_path).strip()
                # The file paths are sorted above, so the id is stable for
                # the same set of unchanged files.
                new_files_id = ioex.get_files_id(file_paths)
                if new_files_id != old_files_id:
                    hprint_message('Files are changed; rebuilding cache at',
                                   cache_dir_path)
                    shutil.rmtree(cache_dir_path)  # remove the cached files
                    os.remove(files_id_path)  # remove the id file
                else:
                    mp_cache_file_paths = paex.get_files_by_pattern(
                        dir_or_dirs=cache_dir_path,
                        pattern='*' + cache_file_ext_name,
                        full_path=True,
                        recursive=False,
                        sort=sort,
                        sort_use_basename=sort_use_basename)
                    if not mp_cache_file_paths:
                        wprint_message(
                            'Cache directory exists, but nothing there',
                            cache_dir_path)
            else:
                hprint_message('Files id does not exist; rebuilding cache at',
                               cache_dir_path)
                shutil.rmtree(cache_dir_path)  # remove the cached files
        if not mp_cache_file_paths:
            ioex.write_all_text(ioex.get_files_id(file_paths), files_id_path)
            ioex.write_all_lines(
                iterable=ioex.iter_all_lines_from_all_files(file_paths),
                output_path=cache_dir_path,
                create_dir=True,
                chunk_size=chunk_size,
                chunked_file_ext_name=cache_file_ext_name)
            mp_cache_file_paths = paex.get_files_by_pattern(
                dir_or_dirs=cache_dir_path,
                pattern='*' + cache_file_ext_name,
                full_path=True,
                recursive=False,
                sort=sort,
                sort_use_basename=sort_use_basename)

        if mp_cache_file_paths:
            hprint_message(title='number of multi-processing cache files',
                           content=len(mp_cache_file_paths))
        else:
            raise IOError('multi-processing cache files were not found')
        file_paths = mp_cache_file_paths
        num_p = min(num_p, len(file_paths))
        toc('Done!')
    return num_p, file_paths
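
The cache is invalidated via a files id. A plausible sketch of what `ioex.get_files_id` could compute (an assumption; the real helper may differ): a digest over each file's path, size, and modification time, so any change to the inputs yields a new id.

# Hypothetical sketch of a files id: hash path, size, and mtime of every
# file; the caller passes the paths already sorted, keeping the id stable.
import hashlib
import os

def get_files_id(file_paths):
    h = hashlib.sha256()
    for p in file_paths:
        st = os.stat(p)
        h.update(f'{p}|{st.st_size}|{st.st_mtime_ns}'.encode())
    return h.hexdigest()
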
Example #7
import uuid
from multiprocessing import Queue
from os import path
from time import sleep

from tqdm import tqdm

def __call__(self, pid, data, *args):
    hprint_message('initialized', f'{self.name}{pid}')
    no_job_cnt = 0
    if self._pass_each_data_item:
        if not self._result_dump_path and self.use_queue:
            # TODO file-based queue
            # Queue mode: `data` is the input queue; the output queue and
            # the stop flags arrive through `args`.
            iq: Queue = data
            oq: Queue = args[0]
            flags = args[1]
            while True:
                while not iq.empty():
                    data = iq.get()
                    if self._data_from_files:
                        data = ioex.iter_all_lines_from_all_files(
                            input_paths=data, use_tqdm=True)
                        _data = (self.target(pid, dataitem, *args[2:])
                                 for dataitem in data)
                        oq.put(MPResultTuple(
                            (x for x in _data if x is not None)
                            if self._remove_none else _data))
                    elif self._unpack_singleton_result and len(data) == 1:
                        oq.put(self.target(pid, data[0], *args[2:]))
                    else:
                        oq.put(MPResultTuple(
                            self.target(pid, dataitem, *args[2:])
                            for dataitem in data))
                if not flags or flags[0]:
                    return
                no_job_cnt += 1
                if no_job_cnt % 10 == 0:
                    hprint_pairs(('no jobs for', f'{self.name}{pid}'),
                                 ('wait for', self._wait_time))
                sleep(self._wait_time)
        else:
            if self._data_from_files:
                data = ioex.iter_all_lines_from_all_files(input_paths=data,
                                                          use_tqdm=True)
                _data = (self.target(pid, dataitem, *args)
                         for dataitem in data)
                output = MPResultTuple(
                    (x for x in _data if x is not None)
                    if self._remove_none else _data)
            elif self._unpack_singleton_result and len(data) == 1:
                output = self.target(pid, data[0], *args)
            else:
                data = tqdm(data, desc=f'pid: {pid}')
                _data = (self.target(pid, dataitem, *args)
                         for dataitem in data)
                # Use a fake data type `MPResultTuple` (actually just a
                # tuple) to inform the outside multi-processing method that
                # the output comes from each data item.
                output = MPResultTuple(
                    (x for x in _data if x is not None)
                    if self._remove_none else _data)
    elif not self._result_dump_path and self.use_queue:
        iq: Queue = data
        oq: Queue = args[0]
        flags = args[1]
        while True:
            while not iq.empty():
                data = iq.get()
                if self._data_from_files:
                    data = ioex.iter_all_lines_from_all_files(
                        input_paths=data, use_tqdm=True)
                result = self.target(pid, data, *args[2:])
                oq.put(result[0] if self._unpack_singleton_result
                       and hasattr(result, '__len__')
                       and hasattr(result, '__getitem__')
                       and len(result) == 1 else result)
            if not flags or flags[0]:
                return
            no_job_cnt += 1
            if no_job_cnt % 10 == 0:
                hprint_pairs(('no jobs for', f'{self.name}{pid}'),
                             ('wait for', self._wait_time))
            sleep(self._wait_time)
    else:
        if self._data_from_files:
            data = ioex.iter_all_lines_from_all_files(input_paths=data,
                                                      use_tqdm=True)
        output = self.target(pid, data, *args)
        if (self._unpack_singleton_result and hasattr(output, '__len__')
                and hasattr(output, '__getitem__') and len(output) == 1):
            output = output[0]
    if self._result_dump_path:
        # Dump the result to a file and return the dump path, unless the
        # caller also wants the results back.
        dump_path = path.join(
            self._result_dump_path,
            ioex.pathex.append_timestamp(str(uuid.uuid4())) + '.mpb'
            if self._result_dump_file_pattern is None
            else self._result_dump_file_pattern.format(pid))
        self._result_dump_method(output, dump_path)
        return dump_path if not self._always_return_results else output
    else:
        return output
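
Per the comment in the non-queue branch, `MPResultTuple` is just a tuple used as a type marker. A minimal sketch of that idea (an assumption about the surrounding library):

# Minimal sketch consistent with the comment above: a plain tuple subtype
# used as a marker, so the collector can tell per-item results apart from
# a single aggregate result via isinstance(result, MPResultTuple).
class MPResultTuple(tuple):
    pass
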