# NOTE: the snippets below come from the utix library and assume the usual module-level
# imports (e.g. `from typing import List, Tuple, Union, Iterator`, `from os import path`,
# `import random`, `import csv`, `import numpy as np`, `import matplotlib.pyplot as plt`,
# `import utix.pathex as paex`), plus the various utix helpers referenced in the bodies.

def shuffle_list_iter(ls: List, num_shuffles_to_generate: int = 1, random_seed: int = 0, index_file_dir: str = None, index_file_name_pattern: str = 'shuffle_idx_{}.idx', verbose: bool = __debug__) -> Iterator[List]:
    ensure_positive_arg(arg_val=num_shuffles_to_generate, arg_name='num_shuffles_to_generate')
    ls_len = len(ls)
    if index_file_dir:
        if not path.exists(index_file_dir):
            paex.ensure_dir_existence(index_file_dir, verbose=verbose)
        else:
            # the index directory already exists; try to replay previously saved shuffles
            from utix.ioex import pickle_load
            for i in range(num_shuffles_to_generate):
                idx_file_path = path.join(index_file_dir, index_file_name_pattern.format(i))
                if path.exists(idx_file_path):
                    idxes = pickle_load(idx_file_path)
                elif i == 0:
                    # no saved index files at all; fall through and generate fresh shuffles
                    break
                else:
                    raise ValueError(f"the index file {idx_file_path} does not exist.")
                if len(idxes) != ls_len:
                    raise ValueError(f"the number of the loaded list indexes ({len(idxes)}) is different from the size of the list to shuffle ({ls_len}).")
                yield [ls[idx] for idx in idxes]
            else:
                # all requested shuffles were replayed from saved index files
                return

    rng = random if random_seed < 0 else random.Random(random_seed)
    idxes = list(range(ls_len))
    from utix.ioex import pickle_save__
    for i in range(num_shuffles_to_generate):
        rng.shuffle(idxes)
        if index_file_dir:
            # save the shuffle indexes so the same shuffles can be replayed later
            idx_file_path = path.join(index_file_dir, index_file_name_pattern.format(i))
            pickle_save__(data=idxes, file_or_dir_path=idx_file_path, auto_timestamp=False)
        yield [ls[idx] for idx in idxes]
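
# A minimal usage sketch for shuffle_list_iter; the data and the cache directory are
# hypothetical, and it assumes `pickle_save__` writes the index file at exactly the given
# path. The first call generates and caches the shuffle indexes; a second call with the
# same index directory replays the identical shuffles from those cached files.
def _shuffle_list_iter_example():
    data = list(range(10))
    first_run = list(shuffle_list_iter(data, num_shuffles_to_generate=3, random_seed=42, index_file_dir='./shuffle_cache'))
    second_run = list(shuffle_list_iter(data, num_shuffles_to_generate=3, random_seed=42, index_file_dir='./shuffle_cache'))
    assert first_run == second_run  # the cached index files make the shuffles reproducible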
def write_dicts_to_csv(row_dicts, output_path, append=False, create_dir=True):
    if create_dir:
        paex.ensure_dir_existence(path.dirname(output_path), verbose=False)
    if not path.exists(output_path):
        # nothing to append to; fall back to write mode so the header gets written
        append = False
    # `newline=''` prevents the csv module from inserting blank lines between rows on Windows
    with open(output_path, 'a+' if append else 'w+', newline='') as csv_fout:
        writer = csv.DictWriter(csv_fout, fieldnames=row_dicts[0].keys())
        if not append:
            writer.writeheader()
        writer.writerows(row_dicts)
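
# A small usage sketch for write_dicts_to_csv with hypothetical rows and output path.
# All rows are expected to share the keys of the first dict, since those keys become the
# CSV header; the second call appends without re-writing the header.
def _write_dicts_to_csv_example():
    rows = [
        {'name': 'alice', 'score': 3},
        {'name': 'bob', 'score': 5},
    ]
    write_dicts_to_csv(rows, output_path='./out/scores.csv')                # header + 2 rows
    write_dicts_to_csv(rows, output_path='./out/scores.csv', append=True)   # 2 more rows, no header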
def write_csv(tup_iter, output_csv_path, sep='\t', header=None, append=False, encoding='utf-8', create_dir=True, flatten=False):
    """
    Writes tuples/lists to a csv file.

    :param tup_iter: an iterator of tuples or lists.
    :param output_csv_path: the output csv file will be saved at this path.
    :param sep: the csv field separator.
    :param header: provides the header for the csv file; the header is written as the first line of the csv file if `append` is set `False`.
    :param append: `True` to use append mode; otherwise, `False`.
    :param encoding: specifies the csv file encoding; the default is 'utf-8'.
    :param create_dir: `True` to create the parent directory of `output_csv_path` if it does not already exist.
    :param flatten: `True` to flatten each nested tuple or list inside a row (and inside the header) by one level, joining its items with `sep`.
    """
    if create_dir:
        paex.ensure_dir_existence(path.dirname(output_csv_path), verbose=False)
    with open(output_csv_path, 'a' if append else 'w', encoding=encoding) as csv_f:
        if flatten:
            if header is not None and append is False:
                csv_f.write(sep.join((sep.join(x) if isinstance(x, (tuple, list)) else x) for x in header))
                csv_f.write('\n')
            for tup in tup_iter:
                csv_f.write(sep.join((sep.join(map(str, x)) if isinstance(x, (tuple, list)) else str(x)) for x in tup))
                csv_f.write('\n')
        else:
            if header is not None and append is False:
                csv_f.write(sep.join(header))
                csv_f.write('\n')
            for tup in tup_iter:
                csv_f.write(sep.join(str(x) for x in tup))
                csv_f.write('\n')
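
# A usage sketch for write_csv with hypothetical rows and output path. With flatten=True,
# nested tuples/lists inside a row are joined with the same separator, so ('a', (1, 2))
# becomes three columns rather than one column holding a tuple repr.
def _write_csv_example():
    rows = [('a', (1, 2)), ('b', (3, 4))]
    write_csv(rows, './out/pairs.tsv', header=('key', ('x', 'y')), flatten=True)
    # resulting file (tab-separated):
    # key  x  y
    # a    1  2
    # b    3  4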
def save(self):
    if self.build_mode:
        self._token_count = sort_by_values(self._token_count, reverse=True)
        self._set_active_tokens_by_counts()
        self._active_tokens.update(self._token2index.keys())
        for token in self._token_count:
            idx = len(self) + self.index_offset
            self._token2index[token] = idx
            self._index2token[idx] = token
        ensure_dir_existence(self.save_path, clear_dir=True)
        write_dict_as_text(self._token2index, output_path=path.join(self.save_path, 'token2index.txt'))
        write_dict_as_text(self._token_count, output_path=path.join(self.save_path, 'tokencount.txt'))
        hprint_message(title=f'size of vocabulary {self.vocab_name}', content=self.vocab_size())
        self.build_mode = False
def pd_series_plot(df, output_path, series_col, index_col, value_cols, group_cols=None, groups=None, remove_zero_vals=True, title=None, plot_args=None, plot_mode='subplots', xlabel=None):
    # TODO: 'ylabel' does not work
    if group_cols is not None:
        # only the grouped plotting path is implemented; one figure is saved per group
        pathex.ensure_dir_existence(output_path)
        if isinstance(group_cols, str):
            group_cols = (group_cols,)
        if isinstance(value_cols, str):
            value_cols = (value_cols,)
        for group_idx, group in enumerate(groups):
            hprint_message(f'generating plot for group {group_idx}', group)
            # select the rows belonging to the current group
            _df = df
            for group_col, val in zip(group_cols, group):
                _df = _df.loc[_df[group_col] == val]
            if remove_zero_vals:
                for value_col in value_cols:
                    _df[value_col] = _df[value_col].replace(0, np.nan)
            if title is None:
                _title = "_".join(map(str, group))
            elif isinstance(title, str):
                _title = title
            else:
                _title = title[group_idx]
            plt.clf()
            if plot_mode == 'same_fig':
                # draw every value column on the same axes
                value_col = value_cols[0]
                __df = _df.pivot(index=index_col, columns=series_col, values=value_col)
                _plot_args = dict(plot_args[value_col])  # copy, so popping 'ylabel' does not mutate the caller's dict
                ylabel = _plot_args.pop('ylabel', None)
                ax = __df.plot(title=_title, **_plot_args)
                if ylabel is not None:
                    ax.set_ylabel(ylabel)
                for value_col in value_cols[1:]:
                    __df = _df.pivot(index=index_col, columns=series_col, values=value_col)
                    _plot_args = dict(plot_args[value_col])
                    ylabel = _plot_args.pop('ylabel', None)
                    ax = __df.plot(ax=ax, **_plot_args)
                    if ylabel is not None:
                        ax.set_ylabel(ylabel)
                fig = None
            elif plot_mode == 'subplots':
                # draw each value column in its own subplot, sharing the x-axis
                fig, axes = init_figure(*value_cols, max_ncols=1, sharex=True)
                for value_col, ax in zip(value_cols, axes):
                    _plot_args = dict(plot_args[value_col])
                    ylabel = _plot_args.pop('ylabel', None)
                    __df = _df.pivot(index=index_col, columns=series_col, values=value_col)
                    ax = __df.plot(ax=ax, **_plot_args)
                    if ylabel is not None:
                        ax.set_ylabel(ylabel)
            if xlabel is not None:
                plt.xlabel(xlabel)
            plt.savefig(path.join(output_path, f'{_title}.png'))
            plt.clf()
            plt.close(fig=fig)
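
# A usage sketch for pd_series_plot with a hypothetical DataFrame and column names; it
# assumes pandas/matplotlib and the utix `init_figure` helper are available. Each group
# (here, one per dataset) yields one saved figure with one subplot per value column;
# `series_col` decides which column becomes the plotted lines and `index_col` the x-axis.
def _pd_series_plot_example():
    import pandas as pd
    df = pd.DataFrame({
        'dataset': ['d1'] * 4 + ['d2'] * 4,
        'model':   ['m1', 'm1', 'm2', 'm2'] * 2,
        'epoch':   [1, 2, 1, 2] * 2,
        'loss':    [0.9, 0.7, 0.8, 0.6, 1.0, 0.8, 0.9, 0.7],
        'acc':     [0.5, 0.6, 0.55, 0.65, 0.45, 0.55, 0.5, 0.6],
    })
    pd_series_plot(
        df, output_path='./plots',
        series_col='model', index_col='epoch', value_cols=('loss', 'acc'),
        group_cols='dataset', groups=[('d1',), ('d2',)],
        plot_args={'loss': {'ylabel': 'loss'}, 'acc': {'ylabel': 'accuracy'}},
        xlabel='epoch',
    )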
def train_test_val_split_for_files(file_paths: List, train_test_val_ratios: Tuple[float, float, float], output_path: Union[str, Tuple[str, str, str]], copy_files=True, overwrite=False, sort=False, shuffle=True, rnd_seed=-1, verbose=__debug__, num_p=1):
    if verbose:
        tic(f"Splitting {len(file_paths)} files into train/test/val sets with split ratios {train_test_val_ratios}", newline=True)
    if len(train_test_val_ratios) != 3:
        raise ValueError(f"must specify three ratios for the train/test/validation set splits; got {len(train_test_val_ratios)} ratios '{','.join((str(x) for x in train_test_val_ratios))}'")
    if sort:
        file_paths.sort()
    elif shuffle:
        with numpy_local_seed(rnd_seed) as _:
            if rnd_seed >= 0:
                file_paths.sort()  # NOTE reproducibility needs this sort
            np.random.shuffle(file_paths)
    if isinstance(output_path, str):
        # a single output directory; create 'train'/'test'/'val' sub-directories under it
        train_dir = path.join(output_path, 'train')
        test_dir = path.join(output_path, 'test')
        val_dir = path.join(output_path, 'val')
    elif len(output_path) == 3:
        train_dir, test_dir, val_dir = output_path
    else:
        raise ValueError(msg_invalid_arg_value(arg_val=output_path, arg_name='output_path'))

    ensure_sum_to_one_arg(arg_val=train_test_val_ratios, arg_name='train_test_val_ratios', warning=True)
    paex.ensure_dir_existence(train_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(test_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(val_dir, clear_dir=overwrite, verbose=verbose)

    splits = split_list_by_ratios(list_to_split=file_paths, split_ratios=train_test_val_ratios, check_ratio_sum_to_one=False)
    for cur_path_list, cur_output_dir in zip(splits, (train_dir, test_dir, val_dir)):
        if copy_files:
            batch_copy(src_paths=cur_path_list,
                       dst_dir=cur_output_dir,
                       solve_conflict=True,
                       use_tqdm=verbose,
                       tqdm_msg=f"copy files to {path.basename(cur_output_dir)}" if verbose else None,
                       num_p=num_p)
        else:
            batch_move(src_paths=cur_path_list,
                       dst_dir=cur_output_dir,
                       solve_conflict=True,
                       undo_move_on_failure=True,
                       use_tqdm=verbose,
                       tqdm_msg=f"move files to {path.basename(cur_output_dir)}" if verbose else None)
    if verbose:
        toc()
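
# A usage sketch for train_test_val_split_for_files with hypothetical paths; it assumes
# `paex.get_files_by_pattern` (seen in the test script below) is available. The files are
# shuffled with a fixed seed for reproducibility and copied into 'train'/'test'/'val'
# sub-directories under the output path with an 8:1:1 split.
def _train_test_val_split_example():
    files = paex.get_files_by_pattern(dir_or_dirs='./data', pattern='*.json')
    train_test_val_split_for_files(
        file_paths=files,
        train_test_val_ratios=(0.8, 0.1, 0.1),
        output_path='./data_split',
        copy_files=True,
        rnd_seed=0,
    )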
import utix.mpex as mpex
import utix.ioex as ioex
import utix.pathex as paex

if __name__ == '__main__':
    mpex.freeze_support()
    src = paex.get_files_by_pattern(dir_or_dirs='tmp2', pattern='*.csv')
    trg = r'./tmp3'
    paex.ensure_dir_existence(trg, clear_dir=True)
    mpex.mp_chunk_file(src, trg, chunk_size=33, num_p=4)
    lines1 = sorted(ioex.read_all_lines_from_all_files(src))
    lines2 = sorted(ioex.iter_all_lines_from_all_sub_dirs(input_path=trg, pattern='*.csv'))
    print(lines1 == lines2)