Example #1
    def download_checkpoint(cls,
                            pretrained_model_name: str,
                            cache_dir: Optional[str] = None) -> str:
        r"""Download the specified pre-trained checkpoint, and return the
        directory in which the checkpoint is cached.

        Args:
            pretrained_model_name (str): Name of the model checkpoint.
            cache_dir (str, optional): Path to the cache directory. If `None`,
                uses the default directory given by `default_download_dir`
                (under the user's home directory).

        Returns:
            Path to the cache directory.
        """
        if pretrained_model_name in cls._MODEL2URL:
            download_path = cls._MODEL2URL[pretrained_model_name]
        else:
            raise ValueError(
                f"Pre-trained model not found: {pretrained_model_name}")

        if cache_dir is None:
            cache_path = default_download_dir(cls._MODEL_NAME)
        else:
            cache_path = Path(cache_dir)

        cache_path = cache_path / pretrained_model_name

        if not cache_path.exists():
            if isinstance(download_path, str):
                filename = get_filename(download_path)
                maybe_download(download_path, cache_path, extract=True)

                # remove the downloaded archive now that it is extracted
                (cache_path / filename).unlink()

                folder = None
                # if the archive extracted into a nested directory, flatten it
                for file in cache_path.iterdir():
                    if file.is_dir():
                        folder = file
                if folder is not None:
                    for file in folder.iterdir():
                        file.rename(file.parents[1] / file.name)
                    folder.rmdir()
            else:
                for path in download_path:
                    maybe_download(path, cache_path)
            print(f"Pre-trained {cls._MODEL_NAME} checkpoint "
                  f"{pretrained_model_name} cached to {cache_path}")
        else:
            print(f"Using cached pre-trained {cls._MODEL_NAME} checkpoint "
                  f"from {cache_path}.")

        return str(cache_path)
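
A hedged usage sketch for the method above. SomePretrainedModel stands in for a concrete subclass that defines _MODEL_NAME and _MODEL2URL (neither is shown in this excerpt), and the model name is assumed to be a key of _MODEL2URL.

    # `SomePretrainedModel` is hypothetical; the excerpt does not show the
    # class that hosts download_checkpoint.
    path = SomePretrainedModel.download_checkpoint(
        "bert-base-uncased",  # assumed key in _MODEL2URL
        cache_dir=None)       # None -> default_download_dir(cls._MODEL_NAME)
    print(path)  # directory containing the cached checkpoint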
Example #2
    def download_checkpoint(cls,
                            pretrained_model_name: str,
                            cache_dir: Optional[str] = None) -> str:
        r"""Download the specified pre-trained checkpoint, and return the
        directory in which the checkpoint is cached.

        Args:
            pretrained_model_name (str): Name of the model checkpoint.
            cache_dir (str, optional): Path to the cache directory. If `None`,
                uses the default directory given by
                :meth:`~default_download_dir`.

        Returns:
            Path to the cache directory.
        """
        if pretrained_model_name in cls._MODEL2URL:
            download_path = cls._MODEL2URL[pretrained_model_name]
        else:
            raise ValueError(
                f"Pre-trained model not found: {pretrained_model_name}")

        if cache_dir is None:
            cache_path = default_download_dir(cls._MODEL_NAME)
        else:
            cache_path = Path(cache_dir)
        cache_path = cache_path / pretrained_model_name

        if not cache_path.exists():
            if isinstance(download_path, str):
                filename = download_path.split('/')[-1]
                maybe_download(download_path, cache_path, extract=True)
                folder = None
                # locate the directory created by archive extraction
                for file in cache_path.iterdir():
                    if file.is_dir():
                        folder = file
                assert folder is not None
                # delete the archive, then flatten the extracted directory
                (cache_path / filename).unlink()
                for file in folder.iterdir():
                    file.rename(file.parents[1] / file.name)
                folder.rmdir()
            else:
                for path in download_path:
                    maybe_download(path, cache_path)
            print(f"Pre-trained {cls._MODEL_NAME} checkpoint "
                  f"{pretrained_model_name} cached to {cache_path}")
        else:
            print(f"Using cached pre-trained {cls._MODEL_NAME} checkpoint "
                  f"from {cache_path}.")

        return str(cache_path)
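
Both examples above call two helpers that are not shown: default_download_dir and maybe_download. The following is a rough, self-contained sketch of what they might look like, inferred only from the call sites in this section; the source library's real implementations and signatures may well differ.

    import tarfile
    import urllib.request
    import zipfile
    from pathlib import Path
    from typing import Optional

    def default_download_dir(model_name: str) -> Path:
        # Assumed behavior: a per-model cache directory under the user's home.
        return Path.home() / "downloads" / model_name

    def maybe_download(url: str, path, filename: Optional[str] = None,
                       extract: bool = False) -> str:
        # Download `url` into directory `path` unless it is already there;
        # optionally extract .zip / .tar* archives in place.
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        filename = filename or url.split('/')[-1].split('?')[0]
        target = path / filename
        if not target.exists():
            urllib.request.urlretrieve(url, str(target))
        if extract:
            if zipfile.is_zipfile(str(target)):
                with zipfile.ZipFile(str(target)) as archive:
                    archive.extractall(str(path))
            elif tarfile.is_tarfile(str(target)):
                with tarfile.open(str(target)) as archive:
                    archive.extractall(str(path))
        return str(target)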
Example #3
    def setUp(self):
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.SAMPLE_VOCAB = maybe_download(
            'https://github.com/google/sentencepiece/blob/master/'
            'python/test/test_model.model?raw=true', self.tmp_dir.name)

        self.tokenizer = SentencePieceTokenizer.load(self.SAMPLE_VOCAB)

        self.tokenizer.save(self.tmp_dir.name)
Example #4
    def setUp(self):
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.SAMPLE_VOCAB = maybe_download(
            'https://github.com/huggingface/transformers/raw/main/tests/'
            'fixtures/test_sentencepiece.model', self.tmp_dir.name)

        self.tokenizer = XLNetTokenizer.load(self.SAMPLE_VOCAB,
                                             configs={'keep_accents': True})
        self.tokenizer.save(self.tmp_dir.name)
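
The two fixtures above only load and re-save a tokenizer. An illustrative follow-on test, using the same map_text_to_token / map_text_to_id / map_id_to_text API that appears in the other examples (the sample sentence is arbitrary; its accent is only meaningful when keep_accents=True):

    def test_tokenize_round_trip(self):
        sample = "I was born in 2000, and this is falsé."
        tokens = self.tokenizer.map_text_to_token(sample)
        ids = self.tokenizer.map_text_to_id(sample)
        text = self.tokenizer.map_id_to_text(ids)
        self.assertIsInstance(tokens, list)
        self.assertIsInstance(text, str)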
Example #5
    def setUp(self):
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.SAMPLE_VOCAB = maybe_download(
            'https://github.com/gpengzhi/pytorch-transformers/blob/master/'
            'pytorch_transformers/tests/fixtures/test_sentencepiece.model'
            '?raw=true', self.tmp_dir.name)

        self.tokenizer = XLNetTokenizer.load(self.SAMPLE_VOCAB,
                                             configs={'keep_accents': True})
        self.tokenizer.save(self.tmp_dir.name)
Example #6
    def test_train(self):
        tmp_dir = tempfile.TemporaryDirectory()
        TEXT_FILE = maybe_download(
            'https://github.com/google/sentencepiece/blob/master/'
            'data/botchan.txt?raw=true', tmp_dir.name)

        hparams = {
            "vocab_file": None,
            "text_file": TEXT_FILE,
            "vocab_size": 1000,
        }
        tokenizer = SentencePieceTokenizer(hparams=hparams)

        with open(TEXT_FILE, 'r', encoding='utf-8') as file:
            for line in file:
                tokenizer.map_token_to_text(tokenizer.map_text_to_token(line))
                tokenizer.map_id_to_text(tokenizer.map_text_to_id(line))
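
As written, test_train only verifies that the round-trip calls do not raise. A sketch of a slightly stronger check that could be appended to the test, assuming map_text_to_id returns a list of integer ids (consistent with its use above): every id should fall inside the 1000-piece vocabulary requested through hparams.

        with open(TEXT_FILE, 'r', encoding='utf-8') as file:
            for line in file:
                ids = tokenizer.map_text_to_id(line)
                # SentencePiece ids lie in [0, vocab_size), so this asserts
                # that the vocab_size hparam actually took effect
                self.assertTrue(all(0 <= i < 1000 for i in ids))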
Example #7
    def setUp(self):
        # Create test data
        self._test_dir = tempfile.mkdtemp()

        cat_in_snow = maybe_download(
            'https://storage.googleapis.com/download.tensorflow.org/'
            'example_images/320px-Felis_catus-cat_on_snow.jpg', self._test_dir,
            'cat_0.jpg')
        williamsburg_bridge = maybe_download(
            'https://storage.googleapis.com/download.tensorflow.org/'
            'example_images/194px-New_East_River_Bridge_from_Brooklyn_'
            'det.4a09796u.jpg', self._test_dir, 'bridge_0.jpg')

        _feature_types = {
            'height': ('tf.int64', 'FixedLenFeature', 1),
            'width': ('tf.int64', 'FixedLenFeature', 1),
            'label': ('tf.int64', 'stacked_tensor', 1),
            'shape': (np.int64, 'VarLenFeature'),
            'image_raw': (bytes, 'stacked_tensor'),
            'variable1': (str, 'FixedLenFeature'),  # np.str is removed in NumPy >= 1.24
            'variable2': ('tf.int64', 'FixedLenFeature'),
        }
        self._feature_convert_types = {
            'variable1': 'tf.float32',
            'variable2': 'tf.string',
        }
        _image_options = {}
        self._unconvert_features = ['height', 'width', 'label']

        self._dataset_valid = {
            'height': [],
            'width': [],
            'shape': [],
            'label': [],
            'image_raw': [],
            'variable1': [],
            'variable2': [],
        }
        _toy_image_labels_valid = {
            cat_in_snow: 0,
            williamsburg_bridge: 1,
        }
        _toy_image_shapes = {
            cat_in_snow: (213, 320, 3),
            williamsburg_bridge: (239, 194),
        }
        _record_filepath = os.path.join(self._test_dir, 'test.pkl')

        # Prepare Validation data
        with RecordData.writer(_record_filepath, _feature_types) as writer:
            for image_path, label in _toy_image_labels_valid.items():
                with open(image_path, 'rb') as fid:
                    image_data = fid.read()
                image_shape = _toy_image_shapes[image_path]

                single_data = {
                    'height': image_shape[0],
                    'width': image_shape[1],
                    'shape': image_shape,
                    'label': label,
                    'image_raw': image_data,
                    'variable1': "1234567890",
                    'variable2': int(9876543210),
                }
                for key, value in single_data.items():
                    self._dataset_valid[key].append(value)
                writer.write(single_data)

        self._hparams = {
            "num_epochs": 1,
            "batch_size": 1,
            "shuffle": False,
            "dataset": {
                "files": _record_filepath,
                "feature_original_types": _feature_types,
                "feature_convert_types": self._feature_convert_types,
                "image_options": [_image_options],
            }
        }
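
These hparams are presumably consumed by a RecordData dataset, the same class whose writer is used in setUp. A hedged sketch of the read-back path, assuming a Texar-style data API in which RecordData accepts the hparams dict and DataIterator yields batches (neither constructor appears in the excerpt, so names and signatures are assumptions):

        # Hedged sketch, e.g. inside a test method; assumes Texar-style
        # RecordData(hparams=...) and DataIterator(...) -- not shown above.
        dataset = RecordData(hparams=self._hparams)
        iterator = DataIterator(dataset)
        for batch in iterator:
            # each batch should expose the declared features by name
            self.assertEqual(len(batch['label']), self._hparams['batch_size'])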