Example #1
from os import path
from typing import Iterator, List
import random

# `paex` (utix.pathex) and `ensure_positive_arg` are assumed to be module-level imports.
def shuffle_list_iter(ls: List, num_shuffles_to_generate: int = 1, random_seed: int = 0, index_file_dir: str = None, index_file_name_pattern: str = 'shuffle_idx_{}.idx', verbose: bool = __debug__) -> Iterator[List]:
    ensure_positive_arg(arg_val=num_shuffles_to_generate, arg_name='num_shuffles_to_generate')
    ls_len = len(ls)
    if index_file_dir:
        if not path.exists(index_file_dir):
            paex.ensure_dir_existence(index_file_dir, verbose=verbose)
        else:
            from utix.ioex import pickle_load
            num_loaded = 0
            for i in range(num_shuffles_to_generate):
                idx_file_path = path.join(index_file_dir, index_file_name_pattern.format(i))
                if path.exists(idx_file_path):
                    idxes = pickle_load(idx_file_path)
                elif i == 0:
                    break  # no saved index files at all; fall through to fresh generation
                else:
                    raise ValueError(f"the index file {idx_file_path} does not exist.")
                if len(idxes) != ls_len:
                    raise ValueError(f"the number of loaded list indexes ({len(idxes)}) differs from the size of the list to shuffle ({ls_len}).")
                num_loaded += 1
                yield [ls[idx] for idx in idxes]
            if num_loaded == num_shuffles_to_generate:
                return  # all shuffles were replayed from the saved index files

    # generate fresh shuffles; a negative `random_seed` uses the module-level RNG
    rng = random if random_seed < 0 else random.Random(random_seed)
    idxes = list(range(ls_len))
    from utix.ioex import pickle_save__
    for i in range(num_shuffles_to_generate):
        rng.shuffle(idxes)
        if index_file_dir:
            # only build the index file path when there is a directory to save into
            idx_file_path = path.join(index_file_dir, index_file_name_pattern.format(i))
            pickle_save__(data=idxes, file_or_dir_path=idx_file_path, auto_timestamp=False)
        yield [ls[idx] for idx in idxes]
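
# --- usage sketch (hypothetical; assumes shuffle_list_iter is importable from this module) ---
# The first pass generates two shuffles and caches their index files under
# 'shuffle_cache'; the second pass replays exactly the same shuffles from disk.
data = list(range(10))
for shuffled in shuffle_list_iter(data, num_shuffles_to_generate=2, random_seed=42, index_file_dir='shuffle_cache'):
    print(shuffled)
for shuffled in shuffle_list_iter(data, num_shuffles_to_generate=2, random_seed=42, index_file_dir='shuffle_cache'):
    print(shuffled)  # identical output to the first pass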
Example #2
def write_dicts_to_csv(row_dicts, output_path, append=False, create_dir=True):
    if not row_dicts:
        raise ValueError('`row_dicts` must contain at least one dictionary.')
    if create_dir:
        paex.ensure_dir_existence(path.dirname(output_path), verbose=False)
    if not path.exists(output_path):
        append = False  # nothing to append to, so write a fresh file with a header
    # `newline=''` prevents the csv module from emitting blank lines on Windows
    with open(output_path, 'a' if append else 'w', newline='') as csv_fout:
        writer = csv.DictWriter(csv_fout, fieldnames=row_dicts[0].keys())
        if not append:
            writer.writeheader()
        writer.writerows(row_dicts)
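
# --- usage sketch (hypothetical paths and data) ---
rows = [{'name': 'a', 'score': 1}, {'name': 'b', 'score': 2}]
write_dicts_to_csv(rows, 'out/results.csv')               # writes the header and two rows
write_dicts_to_csv(rows, 'out/results.csv', append=True)  # appends two more rows, no header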
Example #3
def write_csv(tup_iter,
              output_csv_path,
              sep='\t',
              header=None,
              append=False,
              encoding='utf-8',
              create_dir=True,
              flatten=False):
    """
    Writes tuples/lists to a csv file.

    :param tup_iter: an iterator of tuples or lists.
    :param output_csv_path: the output csv file will be saved at this path.
    :param sep: the csv field separator.
    :param header: provide the header for the csv file; the header will be written as the first line of the csv file if `append` is set `False`.
    :param append: `True` to use append mode; otherwise, `False`.
    :param encoding: specifies the csv file encoding; the default is 'utf-8'.
    """
    if create_dir:
        paex.ensure_dir_existence(path.dirname(output_csv_path), verbose=False)

    with open(output_csv_path, 'a' if append else 'w',
              encoding=encoding) as csv_f:
        if flatten:
            if header is not None and append is False:
                csv_f.write(
                    sep.join(((sep.join(x) if isinstance(x,
                                                         (tuple, list)) else x)
                              for x in header)))
                csv_f.write('\n')
            for tup in tup_iter:
                csv_f.write(
                    sep.join(((sep.join(map(str, x)) if isinstance(
                        x, (tuple, list)) else str(x)) for x in tup)))
                csv_f.write('\n')
        else:
            if header is not None and append is False:
                csv_f.write(sep.join(header))
                csv_f.write('\n')
            for tup in tup_iter:
                csv_f.write(sep.join(str(x) for x in tup))
                csv_f.write('\n')
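
# --- usage sketch (hypothetical paths and data) ---
# with flatten=True, nested tuples inside a row are expanded into sibling fields,
# so ('a', (1, 2)) becomes the three-field line 'a\t1\t2'
write_csv([('a', (1, 2)), ('b', (3, 4))],
          'out/pairs.tsv',
          header=('key', ('x', 'y')),
          flatten=True)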
Example #4
    def save(self):
        if self.build_mode:
            # sort tokens by frequency (descending) so more frequent tokens get smaller indexes
            self._token_count = sort_by_values(self._token_count, reverse=True)
            self._set_active_tokens_by_counts()
            # tokens that already have an index (e.g. reserved tokens) are always kept active
            self._active_tokens.update(self._token2index.keys())
            for token in self._token_count:
                if token in self._token2index:
                    continue  # do not re-assign an index to an already-registered token
                idx = len(self) + self.index_offset  # next free index after the existing entries
                self._token2index[token] = idx
                self._index2token[idx] = token
            ensure_dir_existence(self.save_path, clear_dir=True)
            write_dict_as_text(self._token2index,
                               output_path=path.join(self.save_path,
                                                     'token2index.txt'))
            write_dict_as_text(self._token_count,
                               output_path=path.join(self.save_path,
                                                     'tokencount.txt'))
            hprint_message(title=f'size of vocabulary {self.vocab_name}',
                           content=self.vocab_size())

            self.build_mode = False
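
# --- usage sketch (hypothetical) ---
# `vocab` stands for an instance of the enclosing vocabulary class; this also
# assumes write_dict_as_text emits one tab-separated 'token<TAB>index' pair per
# line, which is an assumption about its output format.
vocab.save()
token2index = {}
with open(path.join(vocab.save_path, 'token2index.txt')) as f:
    for line in f:
        token, idx = line.rstrip('\n').split('\t')
        token2index[token] = int(idx)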
Example #5
def pd_series_plot(df,
                   output_path,
                   series_col,
                   index_col,
                   value_cols,
                   group_cols=None,
                   groups=None,
                   remove_zero_vals=True,
                   title=None,
                   plot_args=None,
                   plot_mode='subplots',
                   xlabel=None):
    # TODO: 'ylabel' does not work
    if group_cols is not None:
        pathex.ensure_dir_existence(output_path)
        if isinstance(group_cols, str):
            group_cols = (group_cols, )
        if isinstance(value_cols, str):
            value_cols = (value_cols, )

        for group_idx, group in enumerate(groups):
            hprint_message(f'generating plot for group {group_idx}', group)
            # filter the dataframe down to the rows that belong to this group
            _df = df
            for group_col, val in zip(group_cols, group):
                _df = _df.loc[_df[group_col] == val]
            if remove_zero_vals:
                _df = _df.copy()  # copy so we do not mutate (a view of) the original dataframe
                for value_col in value_cols:
                    _df[value_col] = _df[value_col].replace(0, np.nan)

            if title is None:
                _title = "_".join(map(str, group))
            elif isinstance(title, str):
                _title = title
            else:
                _title = title[group_idx]

            plt.clf()
            fig = None  # only the 'subplots' mode creates a dedicated figure to close

            if plot_mode == 'same_fig':
                # draw every value column onto a single set of axes
                ax = None
                for value_col in value_cols:
                    __df = _df.pivot(index=index_col,
                                     columns=series_col,
                                     values=value_col)
                    # copy so popping 'ylabel' does not mutate the caller's dict across groups
                    _plot_args = dict(plot_args[value_col])
                    ylabel = _plot_args.pop('ylabel', None)
                    ax = (__df.plot(title=_title, **_plot_args)
                          if ax is None else __df.plot(ax=ax, **_plot_args))
                    if ylabel is not None:
                        ax.set_ylabel(ylabel)
            elif plot_mode == 'subplots':
                # one stacked subplot per value column, sharing the x axis
                fig, axes = init_figure(*value_cols, max_ncols=1, sharex=True)
                for value_col, ax in zip(value_cols, axes):
                    _plot_args = dict(plot_args[value_col])
                    ylabel = _plot_args.pop('ylabel', None)
                    __df = _df.pivot(index=index_col,
                                     columns=series_col,
                                     values=value_col)
                    ax = __df.plot(ax=ax, **_plot_args)
                    if ylabel is not None:
                        ax.set_ylabel(ylabel)
            if xlabel is not None:
                plt.xlabel(xlabel)
            plt.savefig(path.join(output_path, f'{_title}.png'))
            plt.clf()
            plt.close(fig=fig)
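
# --- usage sketch (hypothetical data and arguments) ---
import pandas as pd

df = pd.DataFrame({'lang': ['en', 'en', 'fr', 'fr'],
                   'day': [1, 2, 1, 2],
                   'model': ['m1', 'm1', 'm1', 'm1'],
                   'count': [5, 0, 3, 4]})
# one PNG per (lang,) group is saved under 'plots'; plot_args maps each value
# column to keyword arguments forwarded to DataFrame.plot
pd_series_plot(df,
               output_path='plots',
               series_col='model',
               index_col='day',
               value_cols=('count',),
               group_cols='lang',
               groups=[('en',), ('fr',)],
               plot_args={'count': {'ylabel': 'count'}})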
Example #6
def train_test_val_split_for_files(file_paths: List,
                                   train_test_val_ratios: Tuple[float, float,
                                                                float],
                                   output_path: Union[str, Tuple[str, str,
                                                                 str]],
                                   copy_files=True,
                                   overwrite=False,
                                   sort=False,
                                   shuffle=True,
                                   rnd_seed=-1,
                                   verbose=__debug__,
                                   num_p=1):
    if verbose:
        tic(f"Splitting {len(file_paths)} files into train/test/val sets with split ratios {train_test_val_ratios}",
            newline=True)
    if len(train_test_val_ratios) != 3:
        raise ValueError(
            f"must specify three ratios for the train/test/validation set splits; got {len(train_test_val_ratios)} ratios '{','.join((str(x) for x in train_test_val_ratios))}'"
        )
    if sort:
        file_paths.sort()
    elif shuffle:
        with numpy_local_seed(rnd_seed) as _:
            if rnd_seed >= 0:
                file_paths.sort()  # NOTE reproducibility needs this sort
            np.random.shuffle(file_paths)

    if isinstance(output_path, str):
        train_dir = path.join(output_path, 'train')
        test_dir = path.join(output_path, 'test')
        val_dir = path.join(output_path, 'val')
    elif len(output_path) == 3:
        train_dir, test_dir, val_dir = output_path
    else:
        raise ValueError(
            msg_invalid_arg_value(arg_val=output_path, arg_name='output_path'))

    ensure_sum_to_one_arg(arg_val=train_test_val_ratios,
                          arg_name='train_test_val_ratios',
                          warning=True)
    paex.ensure_dir_existence(train_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(test_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(val_dir, clear_dir=overwrite, verbose=verbose)
    splits = split_list_by_ratios(list_to_split=file_paths,
                                  split_ratios=train_test_val_ratios,
                                  check_ratio_sum_to_one=False)
    for cur_path_list, cur_output_dir in zip(splits,
                                             (train_dir, test_dir, val_dir)):
        if copy_files:
            batch_copy(
                src_paths=cur_path_list,
                dst_dir=cur_output_dir,
                solve_conflict=True,
                use_tqdm=verbose,
                tqdm_msg=f"copy files to {path.basename(cur_output_dir)}"
                if verbose else None,
                num_p=num_p)
        else:
            batch_move(
                src_paths=cur_path_list,
                dst_dir=cur_output_dir,
                solve_conflict=True,
                undo_move_on_failure=True,  # roll back partially moved files on failure
                use_tqdm=verbose,
                tqdm_msg=f"move files to {path.basename(cur_output_dir)}"
                if verbose else None)
    if verbose:
        toc()
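
# --- usage sketch (hypothetical paths) ---
files = paex.get_files_by_pattern(dir_or_dirs='data/all', pattern='*.json')
train_test_val_split_for_files(files,
                               train_test_val_ratios=(0.8, 0.1, 0.1),
                               output_path='data/splits',  # creates train/, test/, val/ inside
                               copy_files=True,
                               rnd_seed=0)  # a non-negative seed makes the split reproducible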
Example #7
# Round-trip check for mp_chunk_file: split the source csv files into 33-line
# chunks using 4 worker processes, then verify that no lines were lost or
# duplicated by comparing the sorted contents of the source and target files.
import utix.mpex as mpex
import utix.ioex as ioex
import utix.pathex as paex

if __name__ == '__main__':
    mpex.freeze_support()  # required on platforms that spawn worker processes (e.g. Windows)
    src = paex.get_files_by_pattern(dir_or_dirs='tmp2', pattern='*.csv')
    trg = r'./tmp3'
    paex.ensure_dir_existence(trg, clear_dir=True)
    mpex.mp_chunk_file(src, trg, chunk_size=33, num_p=4)
    lines1 = sorted(ioex.read_all_lines_from_all_files(src))
    lines2 = sorted(
        ioex.iter_all_lines_from_all_sub_dirs(input_path=trg, pattern='*.csv'))
    print(lines1 == lines2)  # True if the chunking preserved every line exactly