from itertools import chain
from multiprocessing import Queue
from os import path
from time import sleep
from typing import Iterable, Iterator, List, Tuple, Union
import os
import platform
import shutil
import uuid
import warnings

from tqdm import tqdm

# NOTE: helpers such as `tic`, `toc`, `split_iter`, `chunk_iter`, `hprint_message`,
# `hprint_pairs`, `wprint_message`, `replace_path_tail`, `get_files_by_pattern`,
# `MPResultTuple`, `DEFAULT_MULTI_PATH_DELIMITER`, `ioex` and `paex` are assumed to be
# imported from this package's own utility modules.


def dispatch_data(num_p: int, data_iter: Union[Iterator, Iterable, List], args: Tuple, print_out=__debug__):
    if num_p <= 0:
        raise ValueError(f"The number of processes specified in `num_p` must be positive, but it is {num_p}.")
    tic("Splitting task", verbose=print_out)
    splits = split_iter(it=data_iter, num_splits=num_p, use_tqdm=print_out)
    toc(print_out=print_out)

    # `split_iter` may return fewer splits than requested (e.g. for small data), so
    # the effective number of processes is the number of splits actually produced.
    num_p = len(splits)
    if num_p == 0:
        raise ValueError("The number of data splits is zero. Possibly no data was read from the provided iterator.")

    job_args = [None] * num_p
    for pidx in range(num_p):
        if print_out:
            hprint_pairs(('pid', pidx), ('workload', len(splits[pidx])))
        # Each process receives (pid, its data split, *args).
        job_args[pidx] = (pidx, splits[pidx]) + args
    return job_args
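
# A minimal usage sketch (not part of the original module): it shows the
# (pid, data_split, *args) layout that `dispatch_data` returns, using a stdlib
# round-robin splitter as a hypothetical stand-in for `split_iter`.
def _demo_dispatch_data_layout():
    def _split_round_robin(items, num_splits):
        splits = [[] for _ in range(num_splits)]
        for i, item in enumerate(items):
            splits[i % num_splits].append(item)
        return [s for s in splits if s]  # like `split_iter`, drop empty splits

    shared_args = ('config.json',)  # hypothetical shared arguments
    splits = _split_round_robin(range(10), num_splits=3)
    job_args = [(pidx, split) + shared_args for pidx, split in enumerate(splits)]
    # Each worker receives its pid, its own data split, and the shared args.
    assert job_args[2] == (2, [2, 5, 8], 'config.json')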
def print_basic_path_info(*path_or_paths):
    for item in path_or_paths:
        if isinstance(item, str):
            hprint_pairs(("path", item), ("is file", path.isfile(item)), ("exists", path.exists(item)))
        else:
            hprint_pairs((item[0], item[1]), ("is file", path.isfile(item[1])), ("exists", path.exists(item[1])))
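
# Usage sketch: `print_basic_path_info` accepts bare path strings and (label, path)
# pairs interchangeably. The temporary file below is only for illustration.
def _demo_print_basic_path_info():
    import tempfile
    with tempfile.NamedTemporaryFile(suffix='.txt') as f:
        print_basic_path_info(f.name, ('missing file', f.name + '.bak'))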
def __call__(self, pid, iq: Queue, data, *args):
    # Build a chunked iterator over the produced items. If each data item yields its
    # own iterator, chain the per-item chunk streams; otherwise chunk a single stream.
    if self.pass_each_data_item:
        it = chain(*(
            chunk_iter(self.create_iterator(dataitem, *args), chunk_size=self.chunk_size, as_list=True)
            for dataitem in data
        ))
    else:
        it = chunk_iter(self.create_iterator(data, *args), chunk_size=self.chunk_size, as_list=True)
    hprint_message('initialized', f'{self.name}{pid}')

    # Keep filling the inter-process queue until the iterator is exhausted; back off
    # briefly whenever the queue is full.
    while True:
        while not iq.full():
            try:
                obj = next(it)
            except StopIteration:
                return
            iq.put(obj)
        hprint_pairs(('full queue for', f'{self.name}{pid}'), ('wait for', self._wait_time))
        sleep(self._wait_time)
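
# Stdlib-only sketch of the producer pattern above: emit fixed-size chunks into a
# bounded queue and stop when the source is exhausted. `_chunks` is a hypothetical
# stand-in for `chunk_iter`; the real producer additionally backs off when the
# queue is full.
def _demo_chunked_producer():
    from itertools import islice
    from queue import Queue as _LocalQueue

    def _chunks(iterable, size):
        it = iter(iterable)
        while chunk := list(islice(it, size)):
            yield chunk

    q = _LocalQueue(maxsize=4)
    for chunk in _chunks(range(10), 3):
        q.put(chunk)  # in the real producer: guarded by `iq.full()` + sleep
    assert q.get() == [0, 1, 2] and q.qsize() == 3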
def dispatch_files(num_p, file_paths: List[str], args: Tuple):
    num_files = len(file_paths)
    if __debug__:
        hprint_message(f"Dispatching {num_p} processes for {num_files} files ...")

    # Spread the files as evenly as possible: the first `num_files_overflow` processes
    # each take one extra file, so per-process loads differ by at most one.
    num_files_per_process = num_files // num_p
    num_files_overflow = num_files - num_files_per_process * num_p
    file_idx_start = 0
    job_args = [None] * num_p
    for pidx in range(num_p):
        file_idx_end = file_idx_start + num_files_per_process + (pidx < num_files_overflow)
        if num_p == 1:
            curr_file_paths = file_paths
        elif pidx == num_p - 1:
            curr_file_paths = file_paths[file_idx_start:]
        else:
            curr_file_paths = file_paths[file_idx_start:file_idx_end]
        file_idx_start = file_idx_end
        if __debug__:
            hprint_pairs(('pid', pidx), ('num of files', len(curr_file_paths)))
        job_args[pidx] = (pidx, curr_file_paths) + args
    return job_args
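
# Sketch of the even-split arithmetic used by `dispatch_files`: the first
# `num_files % num_p` processes each take one extra file, so per-process loads
# differ by at most one. Stdlib only; not part of the original module.
def _demo_file_split_sizes(num_files=10, num_p=4):
    base, overflow = divmod(num_files, num_p)
    sizes = [base + (pidx < overflow) for pidx in range(num_p)]
    assert sum(sizes) == num_files and max(sizes) - min(sizes) <= 1
    return sizes  # [3, 3, 2, 2] for 10 files over 4 processes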
def _solve_multi_path(multi_path_str, file_pattern=None, multi_path_delimiter=DEFAULT_MULTI_PATH_DELIMITER, sort=True, verbose=__debug__):
    if verbose:
        hprint_message('solving multi-file paths from input', multi_path_str)

    # region STEP 1: get all paths
    # Split the path by the multi-path delimiter; special treatment for Windows, where
    # a leading drive letter like `C:` would otherwise be split off by a `:` delimiter.
    input_paths = [file_or_dir_path for file_or_dir_path in multi_path_str.split(multi_path_delimiter) if file_or_dir_path]
    if (
            platform.system() == 'Windows' and multi_path_delimiter == ':'
            and len(input_paths[0]) == 1 and input_paths[0].isalpha()
            and input_paths[1][0] == '\\'
    ):
        input_paths = [f'{input_paths[0]}:{input_paths[1]}'] + input_paths[2:]

    # Replace the tail of the first path with each later segment to generate the actual subdir/file paths.
    for i in range(1, len(input_paths)):
        input_paths[i] = replace_path_tail(input_paths[0], input_paths[i])
    # endregion

    # region STEP 2: check path existence
    path_exists = [False] * len(input_paths)
    has_available_path = False
    for path_idx, possible_path in enumerate(input_paths):
        path_exists[path_idx] = path.exists(possible_path)
        if path_exists[path_idx]:
            has_available_path = True
        if verbose:
            hprint_pairs(('path', possible_path), ('exists', path_exists[path_idx]))
    # endregion

    # region STEP 3: if `file_pattern` is specified, expand each existing dir path into the files matching the pattern
    if file_pattern:
        expanded_input_paths = []
        expanded_path_exists = []
        for input_path, path_exist in zip(input_paths, path_exists):
            if path_exist and path.isdir(input_path):
                files = get_files_by_pattern(input_path, file_pattern)
                if files:
                    expanded_input_paths.extend(files)
                    expanded_path_exists.extend([True] * len(files))
                    has_available_path = True
                    if verbose:
                        hprint_pairs(('extending path', input_path), ('pattern', file_pattern), ('num found files', len(files)))
            else:
                # ! keep the original path if it does not exist, or if it is a file
                expanded_input_paths.append(input_path)
                expanded_path_exists.append(path_exist)
        if len(expanded_input_paths) == 0:
            warnings.warn(f"File pattern '{file_pattern}' specified, but no file of this pattern is found.")
        input_paths = expanded_input_paths
        path_exists = expanded_path_exists
    # endregion

    # Returns the solved paths, their existence flags, and a single boolean indicating whether any of the paths exists.
    if sort:
        input_paths, path_exists = zip(*sorted(zip(input_paths, path_exists)))
    return input_paths, path_exists, has_available_path
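
# Sketch of the multi-path convention `_solve_multi_path` handles: segments after
# the first one replace only the tail of the first path. `_replace_tail` is a
# hypothetical stand-in for `replace_path_tail`, using POSIX paths for clarity.
def _demo_multi_path_expansion():
    import posixpath

    def _replace_tail(full_path, new_tail):
        return posixpath.join(posixpath.dirname(full_path), new_tail)

    segments = [s for s in 'data/logs/a.txt:b.txt:c.txt'.split(':') if s]
    paths = [segments[0]] + [_replace_tail(segments[0], seg) for seg in segments[1:]]
    assert paths == ['data/logs/a.txt', 'data/logs/b.txt', 'data/logs/c.txt']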
def get_mp_cache_files(num_p, file_paths, sort=True, verbose=__debug__, cache_dir_path=None, chunk_size=100000, sort_use_basename=False, rebuild_on_change=True):
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    else:
        file_paths = paex.sort_paths(file_paths, sort=sort, sort_by_basename=sort_use_basename)
    num_file_paths = len(file_paths)
    if verbose:
        hprint_pairs(('number of files', num_file_paths), ('num_p', num_p))

    # If there are fewer files than processes, re-chunk the input into cache files so
    # that every process can get its own share of the data.
    if num_file_paths < num_p:
        if cache_dir_path is None:
            if len(file_paths) == 1:
                cache_dir_path = paex.add_to_main_name(file_paths[0], prefix='.mp.')
            else:
                cache_dir_path = path.join(path.dirname(file_paths[0]), '.mp')
        cache_file_ext_name = paex.get_ext_name(file_paths[0])
        tic('Constructs multi-processing cache files at path ' + path.join(cache_dir_path, '*' + cache_file_ext_name))
        mp_cache_file_paths = None
        files_id_path = cache_dir_path + '.id'
        if path.exists(cache_dir_path):
            if path.exists(files_id_path):
                old_files_id = ioex.read_all_text(files_id_path).strip()
                # The file paths are already sorted above, so the files id is stable
                # for the same set of unchanged files.
                new_files_id = ioex.get_files_id(file_paths)
                if new_files_id != old_files_id:
                    hprint_message('Files are changed; rebuilding cache at', cache_dir_path)
                    shutil.rmtree(cache_dir_path)  # removes the file cache
                    os.remove(files_id_path)  # removes the id file
                else:
                    mp_cache_file_paths = paex.get_files_by_pattern(dir_or_dirs=cache_dir_path, pattern='*' + cache_file_ext_name, full_path=True, recursive=False, sort=sort, sort_use_basename=sort_use_basename)
                    if not mp_cache_file_paths:
                        wprint_message('Cache directory exists, but nothing there', cache_dir_path)
            else:
                hprint_message('Files id does not exist; rebuilding cache at', cache_dir_path)
                shutil.rmtree(cache_dir_path)  # removes the file cache
        if not mp_cache_file_paths:
            ioex.write_all_text(ioex.get_files_id(file_paths), files_id_path)
            ioex.write_all_lines(iterable=ioex.iter_all_lines_from_all_files(file_paths), output_path=cache_dir_path, create_dir=True, chunk_size=chunk_size, chunked_file_ext_name=cache_file_ext_name)
            mp_cache_file_paths = paex.get_files_by_pattern(dir_or_dirs=cache_dir_path, pattern='*' + cache_file_ext_name, full_path=True, recursive=False, sort=sort, sort_use_basename=sort_use_basename)
        if mp_cache_file_paths:
            hprint_message(title='number of multi-processing cache files', content=len(mp_cache_file_paths))
        else:
            raise IOError('multi-processing cache files are not found')
        file_paths = mp_cache_file_paths
        num_p = min(num_p, len(file_paths))
        toc('Done!')
    return num_p, file_paths
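
# Sketch of the change-detection idea behind the `.id` file: fingerprint the sorted
# file paths together with their sizes and modification times, and rebuild the cache
# whenever the fingerprint changes. The real `ioex.get_files_id` may be defined
# differently; this stand-in is an assumption.
def _demo_files_id(file_paths):
    import hashlib
    h = hashlib.sha256()
    for p in sorted(file_paths):
        st = os.stat(p)
        h.update(f'{p}|{st.st_size}|{st.st_mtime_ns}'.encode())
    return h.hexdigest()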
def __call__(self, pid, data, *args):
    hprint_message('initialized', f'{self.name}{pid}')
    no_job_cnt = 0
    if self._pass_each_data_item:
        if not self._result_dump_path and self.use_queue:
            # TODO file based queue
            # Queue mode: `data` is the input queue; the output queue and the stop
            # flags are passed through `args`.
            iq: Queue = data
            oq: Queue = args[0]
            flags = args[1]
            while True:
                while not iq.empty():
                    data = iq.get()
                    if self._data_from_files:
                        data = ioex.iter_all_lines_from_all_files(input_paths=data, use_tqdm=True)
                        _data = (self.target(pid, dataitem, *args[2:]) for dataitem in data)
                        oq.put(MPResultTuple((x for x in _data if x is not None) if self._remove_none else _data))
                    elif self._unpack_singleton_result and len(data) == 1:
                        oq.put(self.target(pid, data[0], *args[2:]))
                    else:
                        oq.put(MPResultTuple(self.target(pid, dataitem, *args[2:]) for dataitem in data))
                if not flags or flags[0]:
                    return
                no_job_cnt += 1
                if no_job_cnt % 10 == 0:
                    hprint_pairs(('no jobs for', f'{self.name}{pid}'), ('wait for', self._wait_time))
                sleep(self._wait_time)
        else:
            if self._data_from_files:
                data = ioex.iter_all_lines_from_all_files(input_paths=data, use_tqdm=True)
                _data = (self.target(pid, dataitem, *args) for dataitem in data)
                output = MPResultTuple((x for x in _data if x is not None) if self._remove_none else _data)
            elif self._unpack_singleton_result and len(data) == 1:
                output = self.target(pid, data[0], *args)
            else:
                data = tqdm(data, desc=f'pid: {pid}')
                _data = (self.target(pid, dataitem, *args) for dataitem in data)
                # Use a fake data type `MPResultTuple` (actually just a tuple) to inform the
                # outside multi-processing method that the output comes from each data item.
                output = MPResultTuple((x for x in _data if x is not None) if self._remove_none else _data)
    elif not self._result_dump_path and self.use_queue:
        iq: Queue = data
        oq: Queue = args[0]
        flags = args[1]
        while True:
            while not iq.empty():
                data = iq.get()
                if self._data_from_files:
                    data = ioex.iter_all_lines_from_all_files(input_paths=data, use_tqdm=True)
                result = self.target(pid, data, *args[2:])
                oq.put(
                    result[0]
                    if self._unpack_singleton_result and hasattr(result, '__len__') and hasattr(result, '__getitem__') and len(result) == 1
                    else result
                )
            if not flags or flags[0]:
                return
            no_job_cnt += 1
            if no_job_cnt % 10 == 0:
                hprint_pairs(('no jobs for', f'{self.name}{pid}'), ('wait for', self._wait_time))
            sleep(self._wait_time)
    else:
        if self._data_from_files:
            data = ioex.iter_all_lines_from_all_files(input_paths=data, use_tqdm=True)
        output = self.target(pid, data, *args)
        if self._unpack_singleton_result and hasattr(output, '__len__') and hasattr(output, '__getitem__') and len(output) == 1:
            output = output[0]

    # Non-queue modes fall through here with `output` set: either dump the result to
    # disk and return the dump path, or return the result directly.
    if self._result_dump_path:
        dump_path = path.join(
            self._result_dump_path,
            (ioex.pathex.append_timestamp(str(uuid.uuid4())) + '.mpb'
             if self._result_dump_file_pattern is None
             else self._result_dump_file_pattern.format(pid))
        )
        self._result_dump_method(output, dump_path)
        return dump_path if not self._always_return_results else output
    else:
        return output
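
# Condensed stdlib sketch of the queue-driven worker loop above: drain the input
# queue, push mapped results to the output queue, and exit once the stop flag is
# raised and no work remains. Names and the list-based flag are illustrative only.
def _demo_queue_worker(iq, oq, stop_flag, target, wait_time=0.1):
    while True:
        while not iq.empty():
            oq.put(target(iq.get()))
        if stop_flag[0]:  # mirrors `flags[0]` in the real worker
            return
        sleep(wait_time)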