Example 1
    def build_vocab(self, data_entries):
        """
        Build vocabularies. The vocabularies are
        :param data_entries:
        :return:
        """
        vocab_build_data_args = []
        for item in self._data_args:
            if item.data_arg is not None and item.vocab_build:
                for vocab in item.vocab_build:
                    if vocab.build_mode:
                        vocab_build_data_args.append(item)
                        break

        if vocab_build_data_args:
            data_entries = tqdm(data_entries)
            data_entries.set_description('build vocabulary')
            vocabs = set()
            for data_entry in data_entries:
                for item in self._data_args:
                    if item.data_arg is not None and item.vocab_build:
                        vocabs.update(item.vocab_build)
                        data_entry[item.data_key](
                            item.data_arg
                        )  # it is assumed this `item.data_arg` has a vocabulary-based indexer for this to work
            for vocab in vocabs:
                vocab.save()
        else:
            gx.hprint_message('all vocabularies are ready')
Example 2
def check_path_existence(*path_or_paths):
    has_error = False
    for item in path_or_paths:
        if not path.exists(item):
            eprint_message("path not found", item)
            has_error = True
    if not has_error:
        hprint_message("Path existence check passed!")
Example 3
def pack_csv(csv_path,
             output_path,
             sep='\t',
             data_seps=' ',
             header=True,
             top=None,
             use_tqdm=False,
             display_msg=None,
             verbose=__debug__):
    """
    Packs a csv file a compressed pickle file.

    :param csv_path: the input csv file path.
    :param output_path: the pickle file will be saved at this path.
    :param sep: the csv field separator.
    :param data_seps: the separator to further split the field data; can use a dictionary to specify a data separator for each field.
    :param header: `True` if the csv file has a header; otherwise, `False`.
    :param use_tqdm: `True` to use tqdm to display packing progress; otherwise, `False`.
    :param display_msg: the message to print or display to indicate the data is being packed.
    :param verbose: `True` to print out as much internal message as possible.
    """
    vocab = IndexDict()
    data = []
    with open(csv_path, 'r') as f:
        if header:
            header = next(f).strip('\n').split(sep)
            if isinstance(data_seps, dict):
                for i, h in enumerate(header):
                    if h in data_seps:
                        data_seps[i] = data_seps[h]
        else:
            header = None

        for line in tqdm_wrap(f if top is None else islice(f, top),
                              use_tqdm=use_tqdm,
                              tqdm_msg=display_msg,
                              verbose=verbose):
            fields = line.strip('\n').split(sep)
            if data_seps is None:
                fields = tuple(vocab.add(field) for field in fields)
            elif isinstance(data_seps, str):
                fields = tuple(
                    tuple(vocab.add(x) for x in field.split(data_seps))
                    for field in fields)
            else:
                fields = tuple(
                    tuple(
                        vocab.add(x)
                        for x in field.split(data_seps.get(i, ' ')))
                    for i, field in enumerate(fields))
            data.append(fields)
    if verbose:
        hprint_message('data size', len(data))
        hprint_message('vocab size', len(vocab))
    pickle_save((sep, data_seps, header, data, dict(vocab.to_dicts()[0])),
                output_path,
                compressed=True)
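
A usage sketch with hypothetical paths, assuming a tab-separated file with a header whose 'tokens' column should be further split on spaces:

pack_csv(csv_path='data/train.tsv',
         output_path='data/train.pkl.gz',
         sep='\t',
         data_seps={'tokens': ' '},
         header=True,
         use_tqdm=True)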
Example 4
    def __init__(self,
                 save_path,
                 min_count: int,
                 max_size: int = None,
                 vocab_name=None,
                 pad_token='[PAD]',
                 unk_token='[UNK]',
                 fixed_tokens: Mapping = None,
                 build_mode=False,
                 format=None,
                 index_offset=0):
        self.vocab_name = vocab_name or path.basename(save_path)
        self.save_path = save_path
        self.min_count = min_count
        self.max_size = max_size
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.build_mode = build_mode
        self.index_offset = index_offset
        self._active_tokens = None
        if build_mode and path.exists(self.save_path):
            # start from a clean directory when rebuilding the vocabulary
            shutil.rmtree(self.save_path)

        token2index_file = path.join(self.save_path, 'token2index.txt')
        if not path.exists(token2index_file):
            self._token2index = {pad_token: 0, unk_token: 1}
            self._index2token = {0: pad_token, 1: unk_token}
            self._token_count = {}
            self.build_mode = True
        else:
            tokencount_file = path.join(self.save_path, 'tokencount.txt')
            if path.exists(tokencount_file):
                self._token_count = read_dict_from_text(tokencount_file,
                                                        valtype=int)
                self._set_active_tokens_by_counts()
                self._active_tokens.add(pad_token)
                self._active_tokens.add(unk_token)
            else:
                self._token_count = None

            self._token2index = read_dict_from_text(token2index_file,
                                                    valtype=int,
                                                    format=format)
            self._index2token = kvswap(self._token2index)
            self.build_mode = False

        if fixed_tokens is not None:
            if self._active_tokens is not None:
                self._active_tokens.update(fixed_tokens.keys())
            for token, idx in fixed_tokens.items():
                self._token2index[token] = idx
                self._index2token[idx] = token

        if not self.build_mode:
            hprint_message(title=f'size of vocabulary {self.vocab_name}',
                           content=self.vocab_size())
Example 5
def mp_read(data_iter,
            provider,
            producer,
            provider_args=(),
            producer_args=(),
            num_providers=1,
            num_producers=4,
            ctx=None,
            checking_interval=0.5,
            print_out=True):
    provider_jobs = [None] * num_providers
    producer_jobs = [None] * num_producers

    if ctx is None:
        ctx = get_context()
    manager = ctx.Manager()
    iq = manager.Queue()
    oq: Queue = manager.Queue()
    flags = manager.list([False])
    if isinstance(producer, MPTarget):
        producer.use_queue = True

    provider_args = dispatch_data(num_p=num_providers,
                                  data_iter=data_iter,
                                  args=provider_args,
                                  print_out=print_out)

    for i in range(num_providers):
        provider_jobs[i] = ctx.Process(target=provider,
                                       args=(i, iq) + provider_args[i][1:])
    for i in range(num_producers):
        producer_jobs[i] = ctx.Process(target=producer,
                                       args=(i, iq, oq, flags) + producer_args)

    start_jobs(provider_jobs)
    start_jobs(producer_jobs)

    while True:
        while not oq.empty():
            objs = oq.get()
            yield from objs
        sleep(checking_interval)
        any_active_provider = any((job.is_alive() for job in provider_jobs))
        any_active_producer = any((job.is_alive() for job in producer_jobs))
        if not any_active_provider:
            if not flags[0]:
                flags[0] = True
                hprint_message('all providers done!')
            if not any_active_producer:
                # drain any results enqueued after the last check before exiting
                while not oq.empty():
                    yield from oq.get()
                hprint_message('all jobs done!')
                break
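
A heavily hedged usage sketch for mp_read. The provider/producer signatures below are only inferred from how mp_read launches the processes (a provider receives (pid, iq, data_chunk, *extra) and a producer receives (pid, iq, oq, flags, *extra)); the library presumably wraps real targets in MPTarget, and dispatch_data is assumed to split data_iter into per-provider chunks.

from time import sleep

def my_provider(pid, iq, data_chunk):
    for item in data_chunk:
        iq.put(item)  # push raw work items onto the input queue

def my_producer(pid, iq, oq, flags):
    while True:
        while not iq.empty():
            oq.put([iq.get() * 2])  # mp_read does `yield from` each object taken off oq
        if flags[0]:  # set by mp_read once all providers have exited
            return
        sleep(0.1)

if __name__ == '__main__':
    for result in mp_read(range(100), my_provider, my_producer,
                          num_providers=1, num_producers=2):
        print(result)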
Example 6
def load_embeds(embeds_path,
                format='labeled_numpy',
                read_embeds=True,
                read_labels=True,
                use_tqdm: bool = True,
                tqdm_msg: str = None,
                sort=True,
                **kwargs):
    if tqdm_msg is None:
        if read_embeds and read_labels:
            tqdm_msg = f'loading embeds with labels at {embeds_path}'
        elif read_embeds:
            tqdm_msg = f'loading embeds at {embeds_path}'
        elif read_labels:
            tqdm_msg = f'loading labels at {embeds_path}'
        else:
            return
    embeds_it = iter_embeds(embeds_path=embeds_path,
                            format=format,
                            read_embeds=read_embeds,
                            read_labels=read_labels,
                            use_tqdm=use_tqdm,
                            tqdm_msg=tqdm_msg,
                            sort=sort,
                            **kwargs)
    tic('Load embeddings ...')
    if format == 'labeled_numpy':
        output = list(embeds_it)
        if read_embeds and read_labels:
            embeds_list, labels_list = gx.unzip(output)
            gx.hprint_message(
                f"Total number of embedding batches at {embeds_path} to index",
                len(embeds_list))
            output = (embeds_list, labels_list)
        elif read_embeds or read_labels:
            gx.hprint_message(
                f"Total number of embedding batches at {embeds_path} to index",
                len(output))
    else:
        raise NotImplementedError('the embedding file format is not supported')

    toc(msg=f'Done!')
    return output
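
A usage sketch (the path is hypothetical): with the default 'labeled_numpy' format and both read_embeds and read_labels enabled, the call returns two parallel lists of embedding batches and label batches.

embeds_list, labels_list = load_embeds('embeds/', use_tqdm=True)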
Example 7
def savefig__(fname,
              dpi: int = 1200,
              format=None,
              clear=False,
              verbose=__debug__,
              *args,
              **kwargs):
    if format is None:
        format = path.splitext(path.basename(fname))[1]
        if format:
            format = format[1:]
        else:
            format = 'svg'

    plt.savefig(fname=fname, dpi=dpi, format=format, *args, **kwargs)
    if verbose:
        hprint_message('figure saved', fname)
    if clear:
        plt.clf()
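
A usage sketch: the format is inferred from the file extension ('png' here), and the current figure is cleared after saving.

import matplotlib.pyplot as plt

plt.plot([1, 2, 3], [4, 5, 6])
savefig__('loss_curve.png', dpi=300, clear=True)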
Example 8
    def __call__(self, pid, iq: Queue, data, *args):
        if self.pass_each_data_item:
            it = chain(*(chunk_iter(self.create_iterator(dataitem, *args),
                                    chunk_size=self.chunk_size,
                                    as_list=True) for dataitem in data))
        else:
            it = chunk_iter(self.create_iterator(data, *args),
                            chunk_size=self.chunk_size,
                            as_list=True)
        hprint_message('initialized', f'{self.name}{pid}')
        while True:
            while not iq.full():
                try:
                    obj = next(it)
                except StopIteration:
                    return
                iq.put(obj)
            hprint_pairs(('full queue for', f'{self.name}{pid}'),
                         ('wait for', self._wait_time))
            sleep(self._wait_time)
Example 9
    def save(self):
        if self.build_mode:
            self._token_count = sort_by_values(self._token_count, reverse=True)
            self._set_active_tokens_by_counts()
            self._active_tokens.update(self._token2index.keys())
            for token in self._token_count:
                if token in self._token2index:
                    # skip pad/unk/fixed tokens that already have indices
                    continue
                idx = len(self) + self.index_offset
                self._token2index[token] = idx
                self._index2token[idx] = token
            ensure_dir_existence(self.save_path, clear_dir=True)
            write_dict_as_text(self._token2index,
                               output_path=path.join(self.save_path,
                                                     'token2index.txt'))
            write_dict_as_text(self._token_count,
                               output_path=path.join(self.save_path,
                                                     'tokencount.txt'))
            hprint_message(title=f'size of vocabulary {self.vocab_name}',
                           content=self.vocab_size())

            self.build_mode = False
Example 10
def unpack_csv(data_path,
               output_csv_path,
               use_tqdm=False,
               display_msg=None,
               verbose=__debug__):
    """
    Unpacks a compressed pickle file built by `pack_csv` to a csv file.

    :param data_path: the path to the pickle file.
    :param output_csv_path: the output csv file will be saved at this path.
    :param use_tqdm: `True` to use tqdm to display unpacking progress; otherwise, `False`.
    :param display_msg: the message to print or display to indicate the data is being unpacked.
    :param verbose: `True` to print out as many internal messages as possible.
    """
    sep, data_seps, header, data, vocab = pickle_load(data_path,
                                                      compressed=True)
    vocab = kvswap(vocab)
    if verbose:
        hprint_message('data size', len(data))
        hprint_message('vocab size', len(vocab))

    def _tup_iter():
        for fields in tqdm_wrap(data,
                                use_tqdm=use_tqdm,
                                tqdm_msg=display_msg,
                                verbose=verbose):
            if data_seps is None:
                yield tuple(vocab.get(field) for field in fields)
            elif isinstance(data_seps, str):
                yield tuple(
                    data_seps.join([vocab.get(x) for x in field])
                    for field in fields)
            else:
                yield tuple(
                    data_seps.get(i, ' ').join([vocab.get(x) for x in field])
                    for i, field in enumerate(fields))

    write_csv(_tup_iter(),
              output_csv_path=output_csv_path,
              sep=sep,
              header=header)
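
A round-trip sketch following the hypothetical paths used in the pack_csv example above: the packed pickle is expanded back into a tab-separated file.

unpack_csv(data_path='data/train.pkl.gz',
           output_csv_path='data/train.unpacked.tsv',
           use_tqdm=True)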
Example 11
def install_bashrc(rcdir,
                   mainrc,
                   other,
                   target='~/.bashrc',
                   verbose=__debug__):
    """
    Assembles multiple bashrc files.
    This script supports splitting a large complicated bashrc file to multiple files in a directory, and then assemble them back into one file when needed.
    Use this function with the `install_bashrc.sh` bash script to achieve easier management of bashrc files.
    :param rcdir:
    :param mainrc:
    :param other:
    :param target:
    :param verbose:
    :return:
    """
    if isinstance(other, str):
        other = [x.strip() for x in other.split(',')]
    target = path.expanduser(target)  # expand '~' so the default target path works
    mainrc = path.join(rcdir, mainrc)
    if verbose:
        hprint_message(title='mainrc', content=mainrc)
        hprint_message(title='others', content=other)
        hprint_message(title='target', content=target)
    shutil.copyfile(mainrc, target)
    with open(target, 'a') as f:
        f.write('\n')
        for other_rc in iter__(other):
            other_rc = path.join(rcdir, other_rc)
            f.write(f'source "{other_rc}"\n')
            f.write(f'alias vim_{get_main_name(other_rc)}="vim {other_rc}"\n')
        f.write(f'alias vimsrc="vim {target}"\n')
        f.write(f'alias srcsrc="source {target}"\n')
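
A usage sketch with hypothetical file names: '/home/me/bashrc.d/main.sh' is copied to '~/.bashrc', and the other files are sourced from it, each with a vim_* editing alias.

install_bashrc(rcdir='/home/me/bashrc.d',
               mainrc='main.sh',
               other='aliases.sh, servers.sh')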
Example 12
def dispatch_files(num_p, file_paths: List[str], args: Tuple):
    num_files = len(file_paths)
    if __debug__:
        hprint_message(
            f"Dispatching {num_p} processes for {num_files} files ...")
    num_files_per_process = int(num_files / num_p)
    num_files_overflow = num_files - num_files_per_process * num_p
    file_idx_start = 0
    job_args = [None] * num_p
    for pidx in range(num_p):
        file_idx_end = file_idx_start + num_files_per_process + (
            pidx < num_files_overflow)
        if num_p == 1:
            curr_file_paths = file_paths
        elif pidx == num_p - 1:
            curr_file_paths = file_paths[file_idx_start:]
        else:
            curr_file_paths = file_paths[file_idx_start:file_idx_end]
            file_idx_start = file_idx_end
        if __debug__:
            hprint_pairs(('pid', pidx), ('num of files', len(curr_file_paths)))
        job_args[pidx] = (pidx, curr_file_paths) + args
    return job_args
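
A sketch of what dispatch_files returns, using hypothetical file names: one argument tuple per process, each starting with the process index and that process's share of the files, followed by the shared args.

files = [f'part-{i:02d}.txt' for i in range(10)]
job_args = dispatch_files(num_p=3, file_paths=files, args=('shared_arg',))
# job_args[0] -> (0, ['part-00.txt', ..., 'part-03.txt'], 'shared_arg')
# the first process gets the one overflow file; the other two get three files each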
Example 13
def build_index(embeds_path,
                output_path,
                num_clusters=65536,
                use_gpu=False,
                train_ratio=1.0,
                embeds_format='labeled_numpy',
                sort=True,
                **kwargs):
    # embeds_file_paths = pathex.get_sorted_files_from_all_sub_dirs__(embeds_path, full_path=True)

    # gx.write_all_lines(path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}_files.txt'), embeds_file_paths)
    # text_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.txt')
    # index_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.idx')

    embeds_list, _ = load_embeds(embeds_path=embeds_path,
                                 format=embeds_format,
                                 sort=sort,
                                 **kwargs)

    tic('Initializing index ...')
    if not num_clusters:
        num_clusters = len(embeds_list) // 100
    index = faiss.index_factory(embeds_list[0].shape[-1],
                                f"IVF{num_clusters},Flat",
                                faiss.METRIC_INNER_PRODUCT)
    if use_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    tic('Concatenating embeddings ...')
    if 0 < train_ratio < 1:
        gx.hprint_message(
            f"will sample a subset for training with ratio {train_ratio}...")

    all_embeds = np.concatenate(embeds_list if train_ratio == 1 else list(
        gx.sampled_iter(embeds_list, train_ratio)))
    toc(msg=f'Initialization done!')

    tic(f'Training embeddings of shape {all_embeds.shape} ...')
    index.train(all_embeds)
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)
    toc(msg='Index training done!')

    tic('Add embeddings to index ...')
    del all_embeds
    embed_index_start = 0

    for embeds in tqdm(embeds_list):
        embed_count = embeds.shape[0]
        index.add_with_ids(
            embeds,
            np.arange(embed_index_start, embed_index_start + embed_count))
        embed_index_start += embed_count

    # with open(text_file_path, 'w+') as wf:
    #     for embeds, batch in embeds_iter(embeds_file_paths=embeds_file_paths, embeds_key=embeds_key, sample_file=sample_file, sample_ratio=train_ratio, embeds_idx=embeds_idx, use_tqdm=True, yield_batch=True):
    #         write_all_lines_to_stream(wf=wf, iterable=batch[embeds_txt_key], use_tqdm=False)
    #         embed_count = embeds.shape[0]
    #         index.add_with_ids(embeds, np.arange(embed_index_start, embed_index_start + embed_count))
    #         embed_index_start += embed_count

    if path.exists(output_path):
        os.remove(output_path)
    gx.hprint_message('saving indexed embeddings to', output_path)
    faiss.write_index(index, output_path)
    toc(msg='Indexing done!')
    return index
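
A usage sketch (paths hypothetical); it requires faiss and embedding files in the 'labeled_numpy' layout expected by load_embeds.

index = build_index(embeds_path='embeds/',
                    output_path='embeds.idx',
                    num_clusters=4096,
                    use_gpu=False,
                    train_ratio=0.5)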
Example 14
    def __call__(self, pid, data, *args):
        hprint_message('initialized', f'{self.name}{pid}')
        no_job_cnt = 0
        if self._pass_each_data_item:
            if not self._result_dump_path and self.use_queue:
                # TODO file based queue
                iq: Queue = data
                oq: Queue = args[0]
                flags = args[1]
                while True:
                    while not iq.empty():
                        data = iq.get()
                        if self._data_from_files:
                            data = ioex.iter_all_lines_from_all_files(
                                input_paths=data, use_tqdm=True)
                            _data = (self.target(pid, dataitem, *args[2:])
                                     for dataitem in data)
                            oq.put(
                                MPResultTuple(
                                    (x for x in _data if x is not None)
                                    if self._remove_none else _data))
                        else:
                            if self._unpack_singleton_result and len(data) == 1:
                                oq.put(self.target(pid, data[0], *args[2:]))
                            else:
                                oq.put(
                                    MPResultTuple(
                                        self.target(pid, dataitem, *args[2:])
                                        for dataitem in data))
                    if not flags or flags[0]:
                        return
                    no_job_cnt += 1
                    if no_job_cnt % 10 == 0:
                        hprint_pairs(('no jobs for', f'{self.name}{pid}'),
                                     ('wait for', self._wait_time))
                    sleep(self._wait_time)
            else:
                if self._data_from_files:
                    data = ioex.iter_all_lines_from_all_files(input_paths=data,
                                                              use_tqdm=True)
                    _data = (self.target(pid, dataitem, *args)
                             for dataitem in data)
                    output = MPResultTuple(
                        (x for x in _data if x is not None)
                        if self._remove_none else _data)
                elif self._unpack_singleton_result and len(data) == 1:
                    output = self.target(pid, data[0], *args)
                else:
                    data = tqdm(data, desc=f'pid: {pid}')
                    _data = (self.target(pid, dataitem, *args)
                             for dataitem in data)
                    # use a fake data type `MPResultTuple` (actually just a tuple) to inform the outside multi-processing method that the output comes from each data item
                    output = MPResultTuple(
                        (x for x in _data if x is not None)
                        if self._remove_none else _data)
        elif not self._result_dump_path and self.use_queue:
            iq: Queue = data
            oq: Queue = args[0]
            flags = args[1]
            while True:
                while not iq.empty():
                    data = iq.get()
                    if self._data_from_files:
                        data = ioex.iter_all_lines_from_all_files(
                            input_paths=data, use_tqdm=True)
                    result = self.target(pid, data, *args[2:])
                    oq.put(result[0] if self._unpack_singleton_result
                           and hasattr(result, '__len__')
                           and hasattr(result, '__getitem__')
                           and len(result) == 1 else result)
                if not flags or flags[0]:
                    return
                no_job_cnt += 1
                if no_job_cnt % 10 == 0:
                    hprint_pairs(('no jobs for', f'{self.name}{pid}'),
                                 ('wait for', self._wait_time))
                sleep(self._wait_time)
        else:
            if self._data_from_files:
                data = ioex.iter_all_lines_from_all_files(input_paths=data,
                                                          use_tqdm=True)
            output = self.target(pid, data, *args)
            if (self._unpack_singleton_result and hasattr(output, '__len__')
                    and hasattr(output, '__getitem__') and len(output) == 1):
                output = output[0]
        if self._result_dump_path:
            dump_path = path.join(
                self._result_dump_path,
                (ioex.pathex.append_timestamp(str(uuid.uuid4())) + '.mpb'
                 if self._result_dump_file_pattern is None else
                 self._result_dump_file_pattern.format(pid)))
            self._result_dump_method(output, dump_path)
            return dump_path if not self._always_return_results else output
        else:
            return output
Example 15
def get_mp_cache_files(num_p,
                       file_paths,
                       sort=True,
                       verbose=__debug__,
                       cache_dir_path=None,
                       chunk_size=100000,
                       sort_use_basename=False,
                       rebuild_on_change=True):
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    else:
        file_paths = paex.sort_paths(file_paths,
                                     sort=sort,
                                     sort_by_basename=sort_use_basename)

    num_file_paths = len(file_paths)
    if verbose:
        hprint_pairs(('number of files', num_file_paths), ('num_p', num_p))
    if num_file_paths < num_p:
        if cache_dir_path is None:
            if len(file_paths) == 1:
                cache_dir_path = paex.add_to_main_name(file_paths[0],
                                                       prefix='.mp.')
            else:
                cache_dir_path = path.join(path.dirname(file_paths[0]), '.mp')
        cache_file_ext_name = paex.get_ext_name(file_paths[0])

        tic('Constructing multi-processing cache files at ' +
            path.join(cache_dir_path, '*' + cache_file_ext_name))

        mp_cache_file_paths = None
        files_id_path = cache_dir_path + '.id'
        if path.exists(cache_dir_path):
            if path.exists(files_id_path):
                old_files_id = ioex.read_all_text(files_id_path).strip()
                new_files_id = ioex.get_files_id(
                    file_paths
                )  # the file paths are already sorted above, so the files_id would be the same for the same files if they are not changed
                if new_files_id != old_files_id:
                    hprint_message('Files have changed; rebuilding cache at',
                                   cache_dir_path)
                    import shutil, os
                    shutil.rmtree(cache_dir_path)  # removes file cache
                    os.remove(files_id_path)  # removes the id file
                else:
                    mp_cache_file_paths = paex.get_files_by_pattern(
                        dir_or_dirs=cache_dir_path,
                        pattern='*' + cache_file_ext_name,
                        full_path=True,
                        recursive=False,
                        sort=sort,
                        sort_use_basename=sort_use_basename)
                    if not mp_cache_file_paths:
                        wprint_message(
                            'Cache directory exists, but nothing there',
                            cache_dir_path)
            else:
                hprint_message(f'Files id does not exist; rebuilding cache at',
                               cache_dir_path)
                import shutil
                shutil.rmtree(cache_dir_path)  # removes file cache
        if not mp_cache_file_paths:
            ioex.write_all_text(ioex.get_files_id(file_paths), files_id_path)
            ioex.write_all_lines(
                iterable=ioex.iter_all_lines_from_all_files(file_paths),
                output_path=cache_dir_path,
                create_dir=True,
                chunk_size=chunk_size,
                chunked_file_ext_name=cache_file_ext_name)
            mp_cache_file_paths = paex.get_files_by_pattern(
                dir_or_dirs=cache_dir_path,
                pattern='*' + cache_file_ext_name,
                full_path=True,
                recursive=False,
                sort=sort,
                sort_use_basename=sort_use_basename)

        if mp_cache_file_paths:
            hprint_message(title='number of multi-processing cache files',
                           content=len(mp_cache_file_paths))
        else:
            raise IOError('multi-processing cache files are not found')
        file_paths = mp_cache_file_paths
        num_p = min(num_p, len(file_paths))
        toc('Done!')
    return num_p, file_paths
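
A usage sketch, assuming a single large text file (the path is hypothetical): the file is chunked into a '.mp' cache directory so that eight workers can each receive a share of the cache files.

num_p, file_paths = get_mp_cache_files(num_p=8,
                                       file_paths='corpus/train.txt',
                                       chunk_size=50000)
print(num_p, len(file_paths))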
Example 16
def pd_series_plot(df,
                   output_path,
                   series_col,
                   index_col,
                   value_cols,
                   group_cols=None,
                   groups=None,
                   remove_zero_vals=True,
                   title=None,
                   plot_args=None,
                   plot_mode='subplots',
                   xlabel=None):
    # TODO: 'ylabel' does not work
    if group_cols is not None:
        pathex.ensure_dir_existence(output_path)
        if isinstance(group_cols, str):
            group_cols = (group_cols, )
        if isinstance(value_cols, str):
            value_cols = (value_cols, )

        for group_idx, group in enumerate(groups):
            hprint_message(f'generating plot for group {group_idx}', group)
            _df = df
            for group_col, val in zip(group_cols, group):
                _df = _df.loc[_df[group_col] == val]
            if remove_zero_vals:
                for value_col in value_cols:
                    _df[value_col] = _df[value_col].replace(0, np.nan)

            if title is None:
                _title = "_".join(map(str, group))
            elif isinstance(title, str):
                _title = title
            else:
                _title = title[group_idx]

            plt.clf()

            if plot_mode == 'same_fig':
                value_col = value_cols[0]
                __df = _df.pivot(index=index_col,
                                 columns=series_col,
                                 values=value_col)
                _plot_args = dict(plot_args[value_col])  # copy; 'ylabel' is popped below
                ylabel = _plot_args.pop('ylabel', None)
                ax = __df.plot(title=_title, **_plot_args)
                if ylabel is not None:
                    ax.set_ylabel(ylabel)
                for value_col in value_cols[1:]:
                    __df = _df.pivot(index=index_col,
                                     columns=series_col,
                                     values=value_col)
                    _plot_args = dict(plot_args[value_col])  # copy before popping 'ylabel'
                    ylabel = _plot_args.pop('ylabel', None)
                    ax = __df.plot(ax=ax, **_plot_args)
                    if ylabel is not None:
                        ax.set_ylabel(ylabel)
                fig = None
            elif plot_mode == 'subplots':
                fig, axes = init_figure(*value_cols, max_ncols=1, sharex=True)
                for value_col, ax in zip(value_cols, axes):
                    _plot_args = dict(plot_args[value_col])  # copy before popping 'ylabel'
                    ylabel = _plot_args.pop('ylabel', None)
                    __df = _df.pivot(index=index_col,
                                     columns=series_col,
                                     values=value_col)
                    ax = __df.plot(ax=ax, **_plot_args)
                    if ylabel is not None:
                        ax.set_ylabel(ylabel)
            if xlabel is not None:
                plt.xlabel(xlabel)
            plt.savefig(path.join(output_path, f'{_title}.png'))
            plt.clf()
            plt.close(fig=fig)
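
A hypothetical usage sketch illustrating the expected long-format frame: one curve per value of series_col, x-values from index_col, one subplot per entry of value_cols, and one figure per group.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'model': ['a', 'a', 'b', 'b'] * 2,       # series_col: one curve per model
    'epoch': [1, 2, 1, 2] * 2,               # index_col: x-axis values
    'accuracy': np.random.rand(8),           # value_cols: one subplot each
    'loss': np.random.rand(8),
    'dataset': ['dev'] * 4 + ['test'] * 4,   # group_cols: one figure per group
})

pd_series_plot(df,
               output_path='plots',          # hypothetical output directory
               series_col='model',
               index_col='epoch',
               value_cols=('accuracy', 'loss'),
               group_cols='dataset',
               groups=[('dev',), ('test',)],
               plot_args={'accuracy': {'ylabel': 'accuracy'},
                          'loss': {'ylabel': 'loss'}})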
Example 17
def scp_upload(src_dir,
               host,
               username,
               password,
               dst_dir,
               pattern='*',
               recursive=True,
               server_path_sep='/',
               ignore_error='silent',
               ignore_unchanged_files=True,
               hash_block_size=65536,
               ssh_timeout=15.0,
               **kwargs):
    ssh = SSHClient()
    ssh.load_system_host_keys()
    ssh.connect(hostname=host,
                username=username,
                password=password,
                timeout=ssh_timeout,
                **kwargs)

    scp = SCPClient(ssh.get_transport())

    src_dir = path.expandvars(src_dir)
    if ignore_unchanged_files:
        filehash_path = path.join(
            src_dir,
            f"{strex.hash_str('/'.join((host, username, dst_dir, str(pattern), str(recursive))))}_scp_file_hashes"
        )
        filehash_dict = ioex.pickle_load(
            filehash_path,
            compressed=True) if path.exists(filehash_path) else {}

    for local_file, file_name in pathex.iter_files_by_pattern(
            dir_or_dirs=src_dir,
            pattern=pattern,
            recursive=recursive,
            full_path=pathex.FullPathMode.FullPathRelativePathTuple):
        if ignore_unchanged_files:
            filehash = ioex.hash_file(local_file, block_size=hash_block_size)
            if filehash == filehash_dict.get(local_file, None):
                continue

        remote_file = path.join(dst_dir, file_name)
        if server_path_sep != os.sep:
            remote_file = remote_file.replace(os.sep, server_path_sep)

        try:
            scp.put(local_file, remote_file)
        except Exception as err:
            if "No such file or directory" in str(err):
                ssh.exec_command(f"mkdir -p {path.dirname(remote_file)}")
                try:
                    scp.put(local_file, remote_file)
                except Exception as err:
                    if ignore_error is True:
                        eprint_message(title='failed', content=local_file)
                        print(type(err), err)
                        continue
                    elif ignore_error == 'silent':
                        continue
                    else:
                        raise err
            else:
                if ignore_error is True:
                    eprint_message(title='failed', content=local_file)
                    print(type(err), err)
                    continue
                elif ignore_error == 'silent':
                    continue
                else:
                    raise err

        hprint_message(title='success', content=local_file)
        if ignore_unchanged_files:
            filehash_dict[local_file] = filehash

    scp.close()

    if ignore_unchanged_files:
        ioex.pickle_save(filehash_dict, filehash_path, compressed=True)
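
A usage sketch with placeholder host, credentials and paths (all hypothetical): only '*.json' files that have changed since the last upload are copied.

scp_upload(src_dir='$HOME/project/results',
           host='example.com',
           username='me',
           password='***',
           dst_dir='/remote/backup/results',
           pattern='*.json',
           recursive=True)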
Example 18
def _solve_multi_path(multi_path_str,
                      file_pattern=None,
                      multi_path_delimiter=DEFAULT_MULTI_PATH_DELIMITER,
                      sort=True,
                      verbose=__debug__):
    if verbose:
        hprint_message('solving multi-file paths from input', multi_path_str)

    # region STEP1: get all paths

    # split the path by the multi-path delimiter; special treatment for Windows drive letters.
    input_paths = [
        file_or_dir_path
        for file_or_dir_path in multi_path_str.split(multi_path_delimiter)
        if file_or_dir_path
    ]
    if platform.system() == 'Windows' and multi_path_delimiter == ':' and len(
            input_paths[0]) == 1 and input_paths[0].isalpha(
            ) and input_paths[1][0] == '\\':
        input_paths = [f'{input_paths[0]}:{input_paths[1]}'] + input_paths[2:]

    # replace the final segments of the first path to generate all actual subdir/file paths.
    for i in range(1, len(input_paths)):
        input_paths[i] = replace_path_tail(input_paths[0], input_paths[i])
    # endregion

    # region STEP2: check the path existence.
    path_exists = [False] * len(input_paths)
    has_available_path = False
    for path_idx, possible_path in enumerate(input_paths):
        path_exists[path_idx] = path.exists(possible_path)
        if path_exists[path_idx]:
            has_available_path = True
        if verbose:
            hprint_pairs(('path', possible_path),
                         ('exists', path_exists[path_idx]))
    # endregion

    # region STEP3: if the `file_pattern` is specified, then expand each existing dir path as files that match the provided pattern.
    if file_pattern:
        expanded_input_paths = []
        expanded_path_exists = []
        for input_path, path_exist in zip(input_paths, path_exists):
            if path_exist and path.isdir(input_path):
                files = get_files_by_pattern(input_path, file_pattern)
                if files:
                    expanded_input_paths.extend(files)
                    expanded_path_exists.extend([True] * len(files))
                    has_available_path = True
                    if verbose:
                        hprint_pairs(('extending path', input_path),
                                     ('pattern', file_pattern),
                                     ('num found files', len(files)))
            else:  # ! keeps the original path if 1) it does not exist; 2) it is a file.
                expanded_input_paths.append(input_path)
                expanded_path_exists.append(path_exist)

        if len(expanded_input_paths) == 0:
            warnings.warn(
                f"File pattern '{file_pattern}' specified, but no file of this pattern is found."
            )

        input_paths = expanded_input_paths
        path_exists = expanded_path_exists

    # endregion

    # returns the solved paths, their existence flags, and a single boolean value indicating whether any of the paths exists.
    if sort:
        input_paths, path_exists = zip(*sorted(zip(input_paths, path_exists)))
    return input_paths, path_exists, has_available_path
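
A usage sketch (paths hypothetical), assuming replace_path_tail swaps the final path segment as the comment above suggests: the input resolves to '/data/run1', '/data/run2' and '/data/run3', and each existing directory is further expanded into its '*.json' files.

paths, exist_flags, any_exists = _solve_multi_path('/data/run1:run2:run3',
                                                   file_pattern='*.json',
                                                   multi_path_delimiter=':')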