Example #1
 def get_vocab_path(self):
     """
     Gets the path of the module vocabulary path.
     """
     save_path = os.path.join(DATA_HOME, 'bert-base-cased', 'bert-base-cased-vocab.txt')
     if not os.path.exists(save_path) or not os.path.isfile(save_path):
         url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt"
         download(url, os.path.join(DATA_HOME, 'bert-base-cased'))
     return save_path
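For reference, the cache-miss pattern above can be reproduced with the standard library alone. The following is a minimal sketch, not the module's actual implementation: it assumes a `DATA_HOME` cache root and substitutes `urllib.request.urlretrieve` for Paddle's `download` helper.

import os
import urllib.request

# Assumption: DATA_HOME stands in for the cache root the examples import.
DATA_HOME = os.path.expanduser('~/.paddlehub/cache')

def get_vocab_path() -> str:
    """Return the local vocabulary file, downloading it on the first call."""
    save_dir = os.path.join(DATA_HOME, 'bert-base-cased')
    save_path = os.path.join(save_dir, 'bert-base-cased-vocab.txt')
    if not os.path.isfile(save_path):
        os.makedirs(save_dir, exist_ok=True)
        url = 'https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt'
        urllib.request.urlretrieve(url, save_path)
    return save_path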
Example #2
 def get_vocab_path(self):
     """
     Gets the path of the module vocabulary path.
     """
     save_path = os.path.join(DATA_HOME, 'ernie_tiny', 'vocab.txt')
     if not os.path.exists(save_path) or not os.path.isfile(save_path):
         url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/vocab.txt"
         download(url, os.path.join(DATA_HOME, 'ernie_tiny'))
     return save_path
Example #3
 def _download_and_uncompress_dataset(self, destination: str, url: str):
     """
     Downloads dataset and uncompresses it.
     Args:
        destination (:obj:`str`): The dataset cached directory.
        url (:obj: str): The link to be downloaded a dataset.
     """
     if not os.path.exists(destination):
         dataset_package = download(url=url, path=DATA_HOME)
         if is_xarfile(dataset_package):
             unarchive(dataset_package, DATA_HOME)
     else:
         logger.info("Dataset {} already cached.".format(destination))
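The same guard-then-fetch idea works for archived datasets. A hedged standalone sketch: `shutil.unpack_archive` stands in for the `is_xarfile`/`unarchive` pair, and a plain `print` replaces the module's `logger`.

import os
import shutil
import urllib.request

DATA_HOME = os.path.expanduser('~/.paddlehub/cache')  # assumed cache root

def download_and_uncompress_dataset(destination: str, url: str) -> None:
    """Fetch an archived dataset and unpack it unless it is already cached."""
    if os.path.exists(destination):
        print('Dataset {} already cached.'.format(destination))
        return
    os.makedirs(DATA_HOME, exist_ok=True)
    archive_path = os.path.join(DATA_HOME, os.path.basename(url))
    urllib.request.urlretrieve(url, archive_path)
    # unpack_archive infers the format (.tar.gz, .zip, ...) from the filename.
    shutil.unpack_archive(archive_path, DATA_HOME)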
Example #4
    def get_tokenizer(self, tokenize_chinese_chars=True):
        """
        Gets the tokenizer that is customized for this module.
        Args:
            tokenize_chinese_chars (:obj: bool , defaults to :obj: True):
                Whether to tokenize chinese characters or not.
        Returns:
            tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module.
        """
        spm_path = os.path.join(DATA_HOME, 'ernie_tiny',
                                'spm_cased_simp_sampled.model')
        if not os.path.exists(spm_path) or not os.path.isfile(spm_path):
            url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/spm_cased_simp_sampled.model"
            download(url, os.path.join(DATA_HOME, 'ernie_tiny'))

        word_dict_path = os.path.join(DATA_HOME, 'ernie_tiny',
                                      'dict.wordseg.pickle')
        if not os.path.exists(word_dict_path) or not os.path.isfile(
                word_dict_path):
            url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/dict.wordseg.pickle"
            download(url, os.path.join(DATA_HOME, 'ernie_tiny'))

        return ErnieTinyTokenizer(self.get_vocab_path(), spm_path,
                                  word_dict_path)
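The two near-identical download blocks in this example invite a small refactor. The sketch below is illustrative only: `ensure_asset` is a hypothetical helper, and `urllib.request` again replaces Paddle's `download`.

import os
import urllib.request

DATA_HOME = os.path.expanduser('~/.paddlehub/cache')  # assumed cache root
BASE_URL = 'https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny'

def ensure_asset(filename: str) -> str:
    """Download one ernie_tiny asset on cache miss and return its local path."""
    save_dir = os.path.join(DATA_HOME, 'ernie_tiny')
    save_path = os.path.join(save_dir, filename)
    if not os.path.isfile(save_path):
        os.makedirs(save_dir, exist_ok=True)
        urllib.request.urlretrieve('{}/{}'.format(BASE_URL, filename), save_path)
    return save_path

# The tokenizer above needs both assets:
spm_path = ensure_asset('spm_cased_simp_sampled.model')
word_dict_path = ensure_asset('dict.wordseg.pickle')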
Example #5
    def download_file_and_uncompress(self, url: str, save_path: str, print_progress: bool):
        """
        Downloads the file at ``url`` into a temporary directory, then
        uncompresses it into ``save_path``, optionally showing progress bars.
        """
        with utils.generate_tempdir() as _dir:
            if print_progress:
                with log.ProgressBar('Download {}'.format(url)) as bar:
                    for path, ds, ts in utils.download_with_progress(url=url, path=_dir):
                        bar.update(float(ds) / ts)
            else:
                path = utils.download(url=url, path=_dir)

            if print_progress:
                with log.ProgressBar('Decompress {}'.format(path)) as bar:
                    for path, ds, ts in xarfile.unarchive_with_progress(name=path, path=save_path):
                        bar.update(float(ds) / ts)
            else:
                path = xarfile.unarchive(name=path, path=save_path)
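Finally, the download-with-progress loop can be imitated with `urllib.request.urlretrieve`'s report hook. This sketch prints a percentage instead of driving `log.ProgressBar` and uses `shutil.unpack_archive` in place of `xarfile`; it illustrates the pattern rather than PaddleHub's actual `utils` API.

import os
import shutil
import sys
import tempfile
import urllib.request

def download_file_and_uncompress(url: str, save_path: str, print_progress: bool) -> None:
    """Download url into a temp dir, optionally report progress, unpack into save_path."""
    def report(blocks_done: int, block_size: int, total_size: int) -> None:
        if total_size > 0:
            done = min(blocks_done * block_size, total_size)
            sys.stdout.write('\rDownload {}: {:.1%}'.format(url, done / total_size))

    with tempfile.TemporaryDirectory() as _dir:
        archive_path = os.path.join(_dir, os.path.basename(url))
        urllib.request.urlretrieve(url, archive_path,
                                   reporthook=report if print_progress else None)
        if print_progress:
            sys.stdout.write('\n')
        # Format (.tar.gz, .zip, ...) is inferred from the archive filename.
        shutil.unpack_archive(archive_path, save_path)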