def build_vocab(self, data_entries):
    """
    Builds vocabularies for all data args whose vocab objects are in build mode. The vocabularies are saved to their configured save paths once the build is done.
    :param data_entries: the data entries to build the vocabularies from.
    :return:
    """
    vocab_build_data_args = []
    for item in self._data_args:
        if item.data_arg is not None and item.vocab_build:
            for vocab in item.vocab_build:
                if vocab.build_mode:
                    vocab_build_data_args.append(item)
                    break
    if vocab_build_data_args:
        data_entries = tqdm(data_entries)
        data_entries.set_description('build vocabulary')
        vocabs = set()
        for data_entry in data_entries:
            for item in self._data_args:
                if item.data_arg is not None and item.vocab_build:
                    vocabs.update(item.vocab_build)
                    data_entry[item.data_key](item.data_arg)  # it is assumed this `item.data_arg` has a vocabulary-based indexer for this to work
        for vocab in vocabs:
            vocab.save()
    else:
        gx.hprint_message('all vocabularies are ready')
def check_path_existence(*path_or_paths):
    has_error = False
    for item in path_or_paths:
        if not path.exists(item):
            eprint_message("path not found", item)
            has_error = True
    if not has_error:
        hprint_message("Path existence check passed!")
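# Usage sketch for `check_path_existence` (hypothetical paths): the function accepts
# any number of paths and reports the missing ones, so it is convenient for validating
# a job's inputs up front before any heavy processing starts.
def _example_check_path_existence():
    check_path_existence('/data/train.csv', '/data/dev.csv', '/data/output')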
def pack_csv(csv_path,
             output_path,
             sep='\t',
             data_seps=' ',
             header=True,
             top=None,
             use_tqdm=False,
             display_msg=None,
             verbose=__debug__):
    """
    Packs a csv file into a compressed pickle file.

    :param csv_path: the input csv file path.
    :param output_path: the pickle file will be saved at this path.
    :param sep: the csv field separator.
    :param data_seps: the separator to further split the field data; can use a dictionary to specify a data separator for each field.
    :param header: `True` if the csv file has a header; otherwise, `False`.
    :param top: if specified, only this number of lines from the top of the csv file are packed.
    :param use_tqdm: `True` to use tqdm to display packing progress; otherwise, `False`.
    :param display_msg: the message to print or display to indicate the data is being packed.
    :param verbose: `True` to print out as many internal messages as possible.
    """
    vocab = IndexDict()
    data = []
    with open(csv_path, 'r') as f:
        if header:
            header = next(f).strip('\n').split(sep)
            if isinstance(data_seps, dict):
                for i, h in enumerate(header):
                    if h in data_seps:
                        data_seps[i] = data_seps[h]
        else:
            header = None
        for line in tqdm_wrap(f if top is None else islice(f, top),
                              use_tqdm=use_tqdm,
                              tqdm_msg=display_msg,
                              verbose=verbose):
            fields = line.strip('\n').split(sep)
            if data_seps is None:
                fields = tuple(vocab.add(field) for field in fields)
            elif isinstance(data_seps, str):
                fields = tuple(
                    tuple(vocab.add(x) for x in field.split(data_seps))
                    for field in fields)
            else:
                fields = tuple(
                    tuple(
                        vocab.add(x) for x in field.split(data_seps.get(i, ' ')))
                    for i, field in enumerate(fields))
            data.append(fields)
    if verbose:
        hprint_message('data size', len(data))
        hprint_message('vocab size', len(vocab))
    pickle_save((sep, data_seps, header, data, dict(vocab.to_dicts()[0])),
                output_path,
                compressed=True)
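# A minimal usage sketch for `pack_csv` (hypothetical file paths). Packs a
# tab-separated csv whose fields are further split on spaces into a compressed
# pickle; the pickle stores the separator, the header and an integer-coded copy of
# the data together with the vocabulary, so `unpack_csv` can later restore the csv.
def _example_pack_csv():
    pack_csv('ratings.csv',
             'ratings.pkl',
             sep='\t',
             data_seps=' ',
             header=True,
             use_tqdm=True,
             display_msg='packing ratings.csv')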
def __init__(self,
             save_path,
             min_count: int,
             max_size: int = None,
             vocab_name=None,
             pad_token='[PAD]',
             unk_token='[UNK]',
             fixed_tokens: Mapping = None,
             build_mode=False,
             format=None,
             index_offset=0):
    self.vocab_name = vocab_name or path.basename(save_path)
    self.save_path = save_path
    self.min_count = min_count
    self.max_size = max_size
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.build_mode = build_mode
    self.index_offset = index_offset
    self._active_tokens = None
    if build_mode:
        # `ignore_errors` avoids a crash when the vocabulary is built for the first
        # time and the save directory does not exist yet
        shutil.rmtree(self.save_path, ignore_errors=True)
    token2index_file = path.join(self.save_path, 'token2index.txt')
    if not path.exists(token2index_file):
        self._token2index = {pad_token: 0, unk_token: 1}
        self._index2token = {0: pad_token, 1: unk_token}
        self._token_count = {}
        self.build_mode = True
    else:
        tokencount_file = path.join(self.save_path, 'tokencount.txt')
        if path.exists(tokencount_file):
            self._token_count = read_dict_from_text(tokencount_file, valtype=int)
            self._set_active_tokens_by_counts()
            self._active_tokens.add(pad_token)
            self._active_tokens.add(unk_token)
        else:
            self._token_count = None
        self._token2index = read_dict_from_text(token2index_file,
                                                valtype=int,
                                                format=format)
        self._index2token = kvswap(self._token2index)
        self.build_mode = False
    if fixed_tokens is not None:
        if self._active_tokens is not None:
            self._active_tokens.update(fixed_tokens.keys())
        for token, idx in fixed_tokens.items():
            self._token2index[token] = idx
            self._index2token[idx] = token
    if not self.build_mode:
        hprint_message(title=f'size of vocabulary {self.vocab_name}',
                       content=self.vocab_size())
def mp_read(data_iter,
            provider,
            producer,
            provider_args=(),
            producer_args=(),
            num_providers=1,
            num_producers=4,
            ctx=None,
            checking_interval=0.5,
            print_out=True):
    provider_jobs = [None] * num_providers
    producer_jobs = [None] * num_producers
    if ctx is None:
        ctx = get_context()
    manager = ctx.Manager()
    iq = manager.Queue()
    oq: Queue = manager.Queue()
    flags = manager.list([False])
    if isinstance(producer, MPTarget):
        producer.use_queue = True
    provider_args = dispatch_data(num_p=num_providers,
                                  data_iter=data_iter,
                                  args=provider_args,
                                  print_out=print_out)
    for i in range(num_providers):
        provider_jobs[i] = ctx.Process(target=provider,
                                       args=(i, iq) + provider_args[i][1:])
    for i in range(num_producers):
        producer_jobs[i] = ctx.Process(target=producer,
                                       args=(i, iq, oq, flags) + producer_args)
    start_jobs(provider_jobs)
    start_jobs(producer_jobs)
    while True:
        while not oq.empty():
            objs = oq.get()
            yield from objs
        sleep(checking_interval)
        any_active_provider = any((job.is_alive() for job in provider_jobs))
        any_active_producer = any((job.is_alive() for job in producer_jobs))
        if not any_active_provider:
            if not flags[0]:
                flags[0] = True
                hprint_message('all providers done!')
        if not any_active_producer:
            # drain results that producers enqueued after the last check,
            # so nothing is lost when the loop exits
            while not oq.empty():
                yield from oq.get()
            hprint_message('all jobs done!')
            break
def load_embeds(embeds_path, format='labeled_numpy', read_embeds=True, read_labels=True, use_tqdm: bool = True, tqdm_msg: str = None, sort=True, **kwargs): if tqdm_msg is None: if read_embeds and read_labels: tqdm_msg = f'loading embeds with labels at {embeds_path}' elif read_embeds: tqdm_msg = f'loading embeds at {embeds_path}' elif read_labels: tqdm_msg = f'loading labels at {embeds_path}' else: return embeds_it = iter_embeds(embeds_path=embeds_path, format=format, read_embeds=read_embeds, read_labels=read_labels, use_tqdm=use_tqdm, tqdm_msg=tqdm_msg, sort=sort, **kwargs) tic('Load embeddings ...') if format == 'labeled_numpy': output = list(embeds_it) if read_embeds and read_labels: embeds_list, labels_list = gx.unzip(output) gx.hprint_message( f"Total number of embedding batches at {embeds_path} to index", len(embeds_list)) output = (embeds_list, labels_list) elif read_embeds or read_labels: gx.hprint_message( f"Total number of embedding batches at {embeds_path} to index", len(output)) else: raise NotImplementedError('the embedding file format is not supported') toc(msg=f'Done!') return output
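# A hedged usage sketch for `load_embeds` (hypothetical directory). With the default
# 'labeled_numpy' format and both `read_embeds` and `read_labels` enabled, the call
# returns a list of embedding batches and a parallel list of label batches.
def _example_load_embeds():
    embeds_list, labels_list = load_embeds('embeds_dir',
                                           format='labeled_numpy',
                                           read_embeds=True,
                                           read_labels=True,
                                           use_tqdm=True)
    return embeds_list, labels_list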
def savefig__(fname, dpi: int = 1200, format=None, clear=False, verbose=__debug__, *args, **kwargs): if format is None: format = path.splitext(path.basename(fname))[1] if format: format = format[1:] else: format = 'svg' plt.savefig(fname=fname, dpi=dpi, format=format, *args, **kwargs) if verbose: hprint_message('figure saved', fname) if clear: plt.clf()
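# Usage sketch for `savefig__` (hypothetical output path). The format is inferred
# from the file extension ('png' here) and the current figure is cleared afterwards.
def _example_savefig():
    plt.plot([1, 2, 3], [4, 5, 6])
    savefig__('loss_curve.png', dpi=300, clear=True)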
def __call__(self, pid, iq: Queue, data, *args): if self.pass_each_data_item: it = chain(*(chunk_iter(self.create_iterator(dataitem, *args), chunk_size=self.chunk_size, as_list=True) for dataitem in data)) else: it = chunk_iter(self.create_iterator(data, *args), chunk_size=self.chunk_size, as_list=True) hprint_message('initialized', f'{self.name}{pid}') while True: while not iq.full(): try: obj = next(it) except StopIteration: return iq.put(obj) hprint_pairs(('full queue for', f'{self.name}{pid}'), ('wait for', self._wait_time)) sleep(self._wait_time)
def save(self):
    if self.build_mode:
        self._token_count = sort_by_values(self._token_count, reverse=True)
        self._set_active_tokens_by_counts()
        self._active_tokens.update(self._token2index.keys())
        for token in self._token_count:
            if token in self._token2index:
                # keeps the existing index for special/fixed tokens that also appear in the counts
                continue
            idx = len(self) + self.index_offset
            self._token2index[token] = idx
            self._index2token[idx] = token
        ensure_dir_existence(self.save_path, clear_dir=True)
        write_dict_as_text(self._token2index,
                           output_path=path.join(self.save_path, 'token2index.txt'))
        write_dict_as_text(self._token_count,
                           output_path=path.join(self.save_path, 'tokencount.txt'))
        hprint_message(title=f'size of vocabulary {self.vocab_name}',
                       content=self.vocab_size())
        self.build_mode = False
def unpack_csv(data_path,
               output_csv_path,
               use_tqdm=False,
               display_msg=None,
               verbose=__debug__):
    """
    Unpacks a compressed pickle file built by `pack_csv` back into a csv file.

    :param data_path: the path to the pickle file.
    :param output_csv_path: the output csv file will be saved at this path.
    :param use_tqdm: `True` to use tqdm to display unpacking progress; otherwise, `False`.
    :param display_msg: the message to print or display to indicate the data is being unpacked.
    :param verbose: `True` to print out as many internal messages as possible.
    """
    sep, data_seps, header, data, vocab = pickle_load(data_path, compressed=True)
    vocab = kvswap(vocab)
    if verbose:
        hprint_message('data size', len(data))
        hprint_message('vocab size', len(vocab))

    def _tup_iter():
        for fields in tqdm_wrap(data,
                                use_tqdm=use_tqdm,
                                tqdm_msg=display_msg,
                                verbose=verbose):
            if data_seps is None:
                yield tuple(vocab.get(field) for field in fields)
            elif isinstance(data_seps, str):
                yield tuple(
                    data_seps.join([vocab.get(x) for x in field])
                    for field in fields)
            else:
                yield tuple(
                    data_seps.get(i, ' ').join([vocab.get(x) for x in field])
                    for i, field in enumerate(fields))

    write_csv(_tup_iter(),
              output_csv_path=output_csv_path,
              sep=sep,
              header=header)
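# Round-trip sketch with `pack_csv`/`unpack_csv` (hypothetical paths): the pickle
# produced by `_example_pack_csv` above is expanded back into a csv with the original
# separators and header.
def _example_unpack_csv():
    unpack_csv('ratings.pkl', 'ratings_restored.csv', use_tqdm=True)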
def install_bashrc(rcdir, mainrc, other, target='~/.bashrc', verbose=__debug__):
    """
    Assembles multiple bashrc files. This function supports splitting a large, complicated bashrc file into multiple files in a directory, and then assembling them back into one file when needed. Use this function with the `install_bashrc.sh` bash script to achieve easier management of bashrc files.
    :param rcdir: the directory containing the bashrc files to assemble.
    :param mainrc: the name of the main bashrc file in `rcdir`; its content is copied to `target` first.
    :param other: the names of the other bashrc files in `rcdir` to append to `target`; either a list or a comma-separated string.
    :param target: the path where the assembled bashrc file is saved.
    :param verbose: `True` to print out internal messages.
    :return:
    """
    if isinstance(other, str):
        other = [x.strip() for x in other.split(',')]
    mainrc = path.join(rcdir, mainrc)
    target = path.expanduser(target)  # expands the default '~/.bashrc'
    if verbose:
        hprint_message(title='mainrc', content=mainrc)
        hprint_message(title='others', content=other)
        hprint_message(title='target', content=target)
    shutil.copyfile(mainrc, target)
    with open(target, 'a') as f:
        f.write('\n')
        for other_rc in iter__(other):
            other_rc = path.join(rcdir, other_rc)
            f.write(f'source "{other_rc}"\n')
            f.write(f'alias vim_{get_main_name(other_rc)}="vim {other_rc}"\n')
        f.write(f'alias vimsrc="vim {target}"\n')
        f.write(f'alias srcsrc="source {target}"\n')
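# A hedged usage sketch for `install_bashrc` (hypothetical directory and file names).
# The main rc file is copied to the target, then each additional rc file gets a
# `source` line plus a `vim_<name>` alias, followed by the `vimsrc`/`srcsrc` aliases.
def _example_install_bashrc():
    install_bashrc(rcdir='/home/user/bashrc.d',
                   mainrc='main.sh',
                   other='git.sh, conda.sh, cuda.sh',
                   target='~/.bashrc')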
def dispatch_files(num_p, file_paths: List[str], args: Tuple): num_files = len(file_paths) if __debug__: hprint_message( f"Dispatching {num_p} processes for {num_files} files ...") num_files_per_process = int(num_files / num_p) num_files_overflow = num_files - num_files_per_process * num_p file_idx_start = 0 job_args = [None] * num_p for pidx in range(num_p): file_idx_end = file_idx_start + num_files_per_process + ( pidx < num_files_overflow) if num_p == 1: curr_file_paths = file_paths elif pidx == num_p - 1: curr_file_paths = file_paths[file_idx_start:] else: curr_file_paths = file_paths[file_idx_start:file_idx_end] file_idx_start = file_idx_end if __debug__: hprint_pairs(('pid', pidx), ('num of files', len(curr_file_paths))) job_args[pidx] = (pidx, curr_file_paths) + args return job_args
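# Worked example for `dispatch_files`: 7 files split across 3 processes. The first
# `num_files_overflow` processes each take one extra file, so the chunks have sizes
# 3, 2 and 2, and each job tuple is (pid, file_chunk) followed by the shared args.
def _example_dispatch_files():
    files = [f'part{i}.txt' for i in range(7)]
    job_args = dispatch_files(num_p=3, file_paths=files, args=('shared_arg',))
    # job_args[0] == (0, ['part0.txt', 'part1.txt', 'part2.txt'], 'shared_arg')
    return job_args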
def build_index(embeds_path, output_path, num_clusters=65536, use_gpu=False, train_ratio=1.0, embeds_format='labeled_numpy', sort=True, **kwargs): # embeds_file_paths = pathex.get_sorted_files_from_all_sub_dirs__(embeds_path, full_path=True) # gx.write_all_lines(path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}_files.txt'), embeds_file_paths) # text_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.txt') # index_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.idx') embeds_list, _ = load_embeds(embeds_path=embeds_path, format=embeds_format, sort=sort, **kwargs) tic('Initializing index ...') if not num_clusters: num_clusters = len(embeds_list) // 100 index = faiss.index_factory(embeds_list[0].shape[-1], f"IVF{num_clusters},Flat", faiss.METRIC_INNER_PRODUCT) if use_gpu: index = faiss.index_cpu_to_all_gpus(index) tic('Concatenating embeddings ...') if 0 < train_ratio < 1: gx.hprint_message( f"will sample subset for training with ratio {train_ratio}...") all_embeds = np.concatenate(embeds_list if train_ratio == 1 else list( gx.sampled_iter(embeds_list, train_ratio))) toc(msg=f'Initialization done!') tic(f'Training embeddings of shape {all_embeds.shape} ...') index.train(all_embeds) if use_gpu: index = faiss.index_gpu_to_cpu(index) toc(msg='Index training done!') tic('Add embeddings to index ...') del all_embeds embed_index_start = 0 for embeds in tqdm(embeds_list): embed_count = embeds.shape[0] index.add_with_ids( embeds, np.arange(embed_index_start, embed_index_start + embed_count)) embed_index_start += embed_count # with open(text_file_path, 'w+') as wf: # for embeds, batch in embeds_iter(embeds_file_paths=embeds_file_paths, embeds_key=embeds_key, sample_file=sample_file, sample_ratio=train_ratio, embeds_idx=embeds_idx, use_tqdm=True, yield_batch=True): # write_all_lines_to_stream(wf=wf, iterable=batch[embeds_txt_key], use_tqdm=False) # embed_count = embeds.shape[0] # index.add_with_ids(embeds, np.arange(embed_index_start, embed_index_start + embed_count)) # embed_index_start += embed_count if path.exists(output_path): os.remove(output_path) gx.hprint_message('saving indexed embeddings to', output_path) faiss.write_index(index, output_path) toc(msg='Indexing done!') return index
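# A hedged usage sketch for `build_index` (hypothetical paths; assumes faiss is
# installed). Builds an inner-product IVF index over the saved embedding batches,
# training on a 50% sample of the batches, and writes the index to disk.
def _example_build_index():
    index = build_index(embeds_path='embeds_dir',
                        output_path='embeds.idx',
                        num_clusters=4096,
                        use_gpu=False,
                        train_ratio=0.5,
                        embeds_format='labeled_numpy')
    return index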
def __call__(self, pid, data, *args): hprint_message('initialized', f'{self.name}{pid}') no_job_cnt = 0 if self._pass_each_data_item: if not self._result_dump_path and self.use_queue: # TODO file based queue iq: Queue = data oq: Queue = args[0] flags = args[1] while True: while not iq.empty(): data = iq.get() if self._data_from_files: data = ioex.iter_all_lines_from_all_files( input_paths=data, use_tqdm=True) _data = (self.target(pid, dataitem, *args[2:]) for dataitem in data) oq.put( MPResultTuple((x for x in _data if x is not None) if self. _remove_none else _data)) else: if self._unpack_singleton_result and len( data) == 1: oq.put(self.target(pid, data[0], *args[2:])) else: oq.put( MPResultTuple( self.target(pid, dataitem, *args[2:]) for dataitem in data)) if not flags or flags[0]: return no_job_cnt += 1 if no_job_cnt % 10 == 0: hprint_pairs(('no jobs for', f'{self.name}{pid}'), ('wait for', self._wait_time)) sleep(self._wait_time) else: if self._data_from_files: data = ioex.iter_all_lines_from_all_files(input_paths=data, use_tqdm=True) _data = (self.target(pid, dataitem, *args) for dataitem in data) output = MPResultTuple(( x for x in _data if x is not None) if self._remove_none else _data) elif self._unpack_singleton_result and len(data) == 1: output = self.target(pid, data[0], *args) else: data = tqdm(data, desc=f'pid: {pid}') _data = (self.target(pid, dataitem, *args) for dataitem in data) # use a fake data type `MPResultTuple` (actually just a tuple) to inform the outside multi-processing method that the output comes from each data item output = MPResultTuple(( x for x in _data if x is not None) if self._remove_none else _data) elif not self._result_dump_path and self.use_queue: iq: Queue = data oq: Queue = args[0] flags = args[1] while True: while not iq.empty(): data = iq.get() if self._data_from_files: data = ioex.iter_all_lines_from_all_files( input_paths=data, use_tqdm=True) result = self.target(pid, data, *args[2:]) oq.put(result[0] if self._unpack_singleton_result and hasattr(result, '__len__') and hasattr(result, '__getitem__') and len(result) == 1 else result) if not flags or flags[0]: return no_job_cnt += 1 if no_job_cnt % 10 == 0: hprint_pairs(('no jobs for', f'{self.name}{pid}'), ('wait for', self._wait_time)) sleep(self._wait_time) else: if self._data_from_files: data = ioex.iter_all_lines_from_all_files(input_paths=data, use_tqdm=True) output = self.target(pid, data, *args) if self._unpack_singleton_result and hasattr( output, '__len__') and hasattr( output, '__getitem__') and len(output) == 1: output = output[0] if self._result_dump_path: dump_path = path.join( self._result_dump_path, (ioex.pathex.append_timestamp(str(uuid.uuid4())) + '.mpb' if self._result_dump_file_pattern is None else self._result_dump_file_pattern.format(pid))) self._result_dump_method(output, dump_path) return dump_path if not self._always_return_results else output else: return output
def get_mp_cache_files(num_p, file_paths, sort=True, verbose=__debug__, cache_dir_path=None, chunk_size=100000, sort_use_basename=False, rebuild_on_change=True): if isinstance(file_paths, str): file_paths = [file_paths] else: file_paths = paex.sort_paths(file_paths, sort=sort, sort_by_basename=sort_use_basename) num_file_paths = len(file_paths) if verbose: hprint_pairs(('number of files', num_file_paths), ('num_p', num_p)) if num_file_paths < num_p: if cache_dir_path is None: if len(file_paths) == 1: cache_dir_path = paex.add_to_main_name(file_paths[0], prefix='.mp.') else: cache_dir_path = path.join(path.dirname(file_paths[0]), '.mp') cache_file_ext_name = paex.get_ext_name(file_paths[0]) tic('Constructs multi-processing cache files at path ' + path.join(cache_dir_path, '*' + cache_file_ext_name)) mp_cache_file_paths = None files_id_path = cache_dir_path + '.id' if path.exists(cache_dir_path): if path.exists(files_id_path): old_files_id = ioex.read_all_text(files_id_path).strip() new_files_id = ioex.get_files_id( file_paths ) # the file paths are already sorted above, so the files_id would be the same for the same files if they are not changed if new_files_id != old_files_id: hprint_message(f'Files are changed; rebuilding cache at', cache_dir_path) import shutil, os shutil.rmtree(cache_dir_path) # removes file cache os.remove(files_id_path) # removes the id file else: mp_cache_file_paths = paex.get_files_by_pattern( dir_or_dirs=cache_dir_path, pattern='*' + cache_file_ext_name, full_path=True, recursive=False, sort=sort, sort_use_basename=sort_use_basename) if not mp_cache_file_paths: wprint_message( 'Cache directory exists, but nothing there', cache_dir_path) else: hprint_message(f'Files id does not exist; rebuilding cache at', cache_dir_path) import shutil shutil.rmtree(cache_dir_path) # removes file cache if not mp_cache_file_paths: ioex.write_all_text(ioex.get_files_id(file_paths), files_id_path) ioex.write_all_lines( iterable=ioex.iter_all_lines_from_all_files(file_paths), output_path=cache_dir_path, create_dir=True, chunk_size=chunk_size, chunked_file_ext_name=cache_file_ext_name) mp_cache_file_paths = paex.get_files_by_pattern( dir_or_dirs=cache_dir_path, pattern='*' + cache_file_ext_name, full_path=True, recursive=False, sort=sort, sort_use_basename=sort_use_basename) if mp_cache_file_paths: hprint_message(title='number of multi-processing cache files', content=len(mp_cache_file_paths)) else: raise IOError('multi-processing cache files are not found') file_paths = mp_cache_file_paths num_p = min(num_p, len(file_paths)) toc('Done!') return num_p, file_paths
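# A hedged usage sketch for `get_mp_cache_files` (hypothetical path). When a single
# large file is to be shared by 8 worker processes, the function chunks it into cache
# files under a derived cache directory (reusing the cache if the source file is
# unchanged, based on the stored files id) and returns the possibly reduced process
# count together with the cache file paths.
def _example_get_mp_cache_files():
    num_p, cache_files = get_mp_cache_files(num_p=8,
                                            file_paths='corpus.txt',
                                            chunk_size=100000)
    return num_p, cache_files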
def pd_series_plot(df, output_path, series_col, index_col, value_cols, group_cols=None, groups=None, remove_zero_vals=True, title=None, plot_args=None, plot_mode='subplots', xlabel=None): # TODO: 'ylabel' does not work if group_cols is not None: pathex.ensure_dir_existence(output_path) if isinstance(group_cols, str): group_cols = (group_cols, ) if isinstance(value_cols, str): value_cols = (value_cols, ) for group_idx, group in enumerate(groups): hprint_message(f'generating plot for group {group_idx}', group) _df = df for group_col, val in zip(group_cols, group): _df = _df.loc[_df[group_col] == val] if remove_zero_vals: for value_col in value_cols: _df[value_col] = _df[value_col].replace(0, np.nan) if title is None: _title = "_".join(map(str, group)) elif isinstance(title, str): _title = title else: _title = title[group_idx] plt.clf() if plot_mode == 'same_fig': value_col = value_cols[0] __df = _df.pivot(index=index_col, columns=series_col, values=value_col) _plot_args = plot_args[value_col] ylabel = _plot_args.pop('ylabel', None) ax = __df.plot(title=_title, **_plot_args) if ylabel is not None: ax.set_ylabel(ylabel) for value_col in value_cols[1:]: __df = _df.pivot(index=index_col, columns=series_col, values=value_col) _plot_args = plot_args[value_col] ylabel = _plot_args.pop('ylabel', None) ax = __df.plot(ax=ax, **_plot_args) if ylabel is not None: ax.set_ylabel(ylabel) fig = None elif plot_mode == 'subplots': fig, axes = init_figure(*value_cols, max_ncols=1, sharex=True) for value_col, ax in zip(value_cols, axes): _plot_args = plot_args[value_col] ylabel = _plot_args.pop('ylabel', None) __df = _df.pivot(index=index_col, columns=series_col, values=value_col) ax = __df.plot(ax=ax, **_plot_args) if ylabel is not None: ax.set_ylabel(ylabel) if xlabel is not None: plt.xlabel(xlabel) plt.savefig(path.join(output_path, f'{_title}.png')) plt.clf() plt.close(fig=fig)
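# A hedged usage sketch for `pd_series_plot` (hypothetical data; assumes `pandas` is
# imported as `pd` in this scope). Plots one line per `model` over `epoch` for each
# `dataset` group, writing one png per group into the 'plots' directory.
def _example_pd_series_plot():
    df = pd.DataFrame({
        'dataset': ['a'] * 4 + ['b'] * 4,
        'model': ['m1', 'm1', 'm2', 'm2'] * 2,
        'epoch': [1, 2, 1, 2] * 2,
        'accuracy': [0.7, 0.8, 0.65, 0.75, 0.6, 0.7, 0.55, 0.68],
    })
    pd_series_plot(df,
                   output_path='plots',
                   series_col='model',
                   index_col='epoch',
                   value_cols=('accuracy',),
                   group_cols=('dataset',),
                   groups=[('a',), ('b',)],
                   plot_args={'accuracy': {'ylabel': 'accuracy'}},
                   xlabel='epoch')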
def scp_upload(src_dir, host, username, password, dst_dir, pattern='*', recursive=True, server_path_sep='/', ignore_error='silent', ignore_unchanged_files=True, hash_block_size=65536, ssh_timout=15.0, **kwargs): ssh = SSHClient() ssh.load_system_host_keys() ssh.connect(hostname=host, username=username, password=password, timeout=ssh_timout, **kwargs) scp = SCPClient(ssh.get_transport()) src_dir = path.expandvars(src_dir) if ignore_unchanged_files: filehash_path = path.join( src_dir, f"{strex.hash_str('/'.join((host, username, dst_dir, str(pattern), str(recursive))))}_scp_file_hashes" ) filehash_dict = ioex.pickle_load( filehash_path, compressed=True) if path.exists(filehash_path) else {} for local_file, file_name in pathex.iter_files_by_pattern( dir_or_dirs=src_dir, pattern=pattern, recursive=recursive, full_path=pathex.FullPathMode.FullPathRelativePathTuple): if ignore_unchanged_files: filehash = ioex.hash_file(local_file, block_size=hash_block_size) if filehash == filehash_dict.get(local_file, None): continue remote_file = path.join(dst_dir, file_name) if server_path_sep != os.sep: remote_file = remote_file.replace(os.sep, server_path_sep) try: scp.put(local_file, remote_file) except Exception as err: if "No such file or directory" in str(err): ssh.exec_command(f"mkdir -p {path.dirname(remote_file)}") try: scp.put(local_file, remote_file) except Exception as err: if ignore_error is True: eprint_message(title='failed', content=local_file) print(type(err), err) continue elif ignore_error == 'silent': continue else: raise err else: if ignore_error is True: eprint_message(title='failed', content=local_file) print(type(err), err) continue elif ignore_error == 'silent': continue else: raise err hprint_message(title='success', content=local_file) if ignore_unchanged_files: filehash_dict[local_file] = filehash scp.close() if ignore_unchanged_files: ioex.pickle_save(filehash_dict, filehash_path, compressed=True)
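# A hedged usage sketch for `scp_upload` (hypothetical host and credentials; assumes
# paramiko/scp are available). Uploads all '*.json' files under a local directory to a
# remote directory, skipping files whose hash has not changed since the last upload.
def _example_scp_upload():
    scp_upload(src_dir='/data/exports',
               host='example.com',
               username='me',
               password='secret',
               dst_dir='/remote/exports',
               pattern='*.json',
               recursive=True,
               ignore_error='silent')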
def _solve_multi_path(multi_path_str, file_pattern=None, multi_path_delimiter=DEFAULT_MULTI_PATH_DELIMITER, sort=True, verbose=__debug__): if verbose: hprint_message('solving multi-file paths from input', multi_path_str) # region STEP1: get all paths # split the path by the subdir delimiter; special treatment for Windows system. input_paths = [ file_or_dir_path for file_or_dir_path in multi_path_str.split(multi_path_delimiter) if file_or_dir_path ] if platform.system() == 'Windows' and multi_path_delimiter == ':' and len( input_paths[0]) == 1 and input_paths[0].isalpha( ) and input_paths[1][0] == '\\': input_paths = [f'{input_paths[0]}:{input_paths[1]}'] + input_paths[2:] # replace the final segments of the first path to generate all actual subdir/file paths. for i in range(1, len(input_paths)): input_paths[i] = replace_path_tail(input_paths[0], input_paths[i]) # region STEP2: check the path existence. path_exists = [False] * len(input_paths) has_available_path = False for path_idx, possible_path in enumerate(input_paths): path_exists[path_idx] = path.exists(possible_path) if path_exists[path_idx]: has_available_path = True if verbose: hprint_pairs(('path', possible_path), ('exists', path_exists[path_idx])) # endregion # region STEP3: if the `file_pattern` is specified, then expand each existing dir path as files that match the provided pattern. if file_pattern: expanded_input_paths = [] expanded_path_exists = [] for input_path, path_exist in zip(input_paths, path_exists): if path_exist and path.isdir(input_path): files = get_files_by_pattern(input_path, file_pattern) if files: expanded_input_paths.extend(files) expanded_path_exists.extend([True] * len(files)) has_available_path = True if verbose: hprint_pairs(('extending path', input_path), ('pattern', file_pattern), ('num found files', len(files))) else: # ! keeps the original path if 1) it does not exist; 2) it is a file. expanded_input_paths.append(input_path) expanded_path_exists.append(path_exist) if len(expanded_input_paths) == 0: warnings.warn( f"File pattern '{file_pattern}' specified, but no file of this pattern is found." ) input_paths = expanded_input_paths path_exists = expanded_path_exists # endregion # returns the solved paths, their existence flags, and a single boolean value indicating if any of the path exists. if sort: input_paths, path_exists = zip(*sorted(zip(input_paths, path_exists))) return input_paths, path_exists, has_available_path
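# Illustration of the multi-path syntax handled by `_solve_multi_path` (hypothetical
# paths; assumes the default delimiter is ':'). The first segment is a full path and
# each following segment replaces its tail, so '/data/train.txt:dev.txt:test.txt' is
# solved into '/data/train.txt', '/data/dev.txt' and '/data/test.txt', returned with
# per-path existence flags and a flag telling whether any of the paths exists.
def _example_solve_multi_path():
    paths, exists_flags, has_any = _solve_multi_path('/data/train.txt:dev.txt:test.txt')
    return paths, exists_flags, has_any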