def test_spm_converter_bytefallback_warning(self): spm_model_file_without_bytefallback = get_tests_dir( "fixtures/test_sentencepiece.model") spm_model_file_with_bytefallback = get_tests_dir( "fixtures/test_sentencepiece_with_bytefallback.model") original_tokenizer_without_bytefallback = FakeOriginalTokenizer( vocab_file=spm_model_file_without_bytefallback) with warnings.catch_warnings(record=True) as w: _ = SpmConverter(original_tokenizer_without_bytefallback) self.assertEqual(len(w), 0) original_tokenizer_with_bytefallback = FakeOriginalTokenizer( vocab_file=spm_model_file_with_bytefallback) with warnings.catch_warnings(record=True) as w: _ = SpmConverter(original_tokenizer_with_bytefallback) self.assertEqual(len(w), 1) self.assertIn( "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option" " which is not implemented in the fast tokenizers.", str(w[0].message), )
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest from transformers import BertGenerationTokenizer from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, slow from transformers.utils import cached_property from ...test_tokenization_common import TokenizerTesterMixin SPIECE_UNDERLINE = "▁" SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertGenerationTokenizer test_rust_tokenizer = False test_sentencepiece = True def setUp(self): super().setUp() tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer.save_pretrained(self.tmpdirname)
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest from typing import Tuple from transformers.models.mluke.tokenization_mluke import MLukeTokenizer from transformers.testing_utils import get_tests_dir, require_torch, slow from ...test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json") class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = MLukeTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"cls_token": "<s>"} def setUp(self): super().setUp() self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"} def get_tokenizer(self, task=None, **kwargs): kwargs.update(self.special_tokens_map)
Wav2Vec2Processor, ) from transformers.testing_utils import PASS, USER, get_tests_dir, is_staging_test from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) from test_module.custom_configuration import CustomConfig # noqa E402 from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402 from test_module.custom_processing import CustomProcessor # noqa E402 from test_module.custom_tokenization import CustomTokenizer # noqa E402 SAMPLE_PROCESSOR_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json") SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json") SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures") class AutoFeatureExtractorTest(unittest.TestCase): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"] def test_processor_from_model_shortcut(self): processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") self.assertIsInstance(processor, Wav2Vec2Processor) def test_processor_from_local_directory_from_repo(self): with tempfile.TemporaryDirectory() as tmpdirname: model_config = Wav2Vec2Config() processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest from typing import Tuple from transformers import AddedToken, LukeTokenizer from transformers.testing_utils import get_tests_dir, require_torch, slow from ...test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json") SAMPLE_MERGE_FILE = get_tests_dir("fixtures/merges.txt") SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json") class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = LukeTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"cls_token": "<s>"} def setUp(self): super().setUp() self.special_tokens_map = { "entity_token_1": "<ent>", "entity_token_2": "<ent2>"
import unittest from pathlib import Path import transformers.models.auto from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig from transformers.models.bert.configuration_bert import BertConfig from transformers.models.roberta.configuration_roberta import RobertaConfig from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) from test_module.custom_configuration import CustomConfig # noqa E402 SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json") class AutoConfigTest(unittest.TestCase): def test_module_spec(self): self.assertIsNotNone(transformers.models.auto.__spec__) self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto")) def test_config_from_model_shortcut(self): config = AutoConfig.from_pretrained("bert-base-uncased") self.assertIsInstance(config, BertConfig) def test_config_model_type_from_local_file(self): config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG) self.assertIsInstance(config, RobertaConfig)
sys.path.append(str(Path(__file__).parent.parent / "utils")) from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402 if is_torch_available(): import numpy as np import torch if is_vision_available(): from PIL import Image SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures") def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False): """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, or a list of PyTorch tensors if one specifies torchify=True. """ assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" if equal_resolution: image_inputs = [] for i in range(feature_extract_tester.batch_size): image_inputs.append( np.random.randint( 255,
import io import unittest try: from .utils import calculate_bleu except ImportError: from utils import calculate_bleu import json from parameterized import parameterized from transformers import FSMTForConditionalGeneration, FSMTTokenizer from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json" with io.open(filename, "r", encoding="utf-8") as f: bleu_data = json.load(f) @require_torch class ModelEvalTester(unittest.TestCase): def get_tokenizer(self, mname): return FSMTTokenizer.from_pretrained(mname) def get_model(self, mname): model = FSMTForConditionalGeneration.from_pretrained(mname).to( torch_device) if torch_device == "cuda": model.half() return model
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest from transformers import AlbertTokenizer, AlbertTokenizerFast from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow from ...test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") @require_sentencepiece @require_tokenizers class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = AlbertTokenizer rust_tokenizer_class = AlbertTokenizerFast test_rust_tokenizer = True test_sentencepiece = True test_sentencepiece_ignore_case = True def setUp(self): super().setUp()
get_tests_dir, require_deepspeed, require_torch_gpu, slow, ) from transformers.trainer_utils import set_seed if is_torch_available(): from tests.trainer.test_trainer import ( # noqa RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer, ) set_seed(42) FIXTURE_DIRECTORY = get_tests_dir("fixtures") ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir())) DS_TESTS_DIRECTORY = dirname(os.path.abspath(__file__)) # default torch.distributed port DEFAULT_MASTER_PORT = "10999" T5_SMALL = "t5-small" # *** Working Models *** ALBERT_TINY = "hf-internal-testing/tiny-albert" BART_TINY = "sshleifer/bart-tiny-random" BERT_TINY = "hf-internal-testing/tiny-bert" BIGBIRD_PEGASUS_TINY = "hf-internal-testing/tiny-random-bigbird_pegasus" BIG_BIRD_TINY = "hf-internal-testing/tiny-random-big_bird" BLENDERBOT_TINY = "hf-internal-testing/tiny-random-blenderbot"
from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, AutoConfig, AutoFeatureExtractor, Wav2Vec2Config, Wav2Vec2FeatureExtractor, ) from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) from test_module.custom_configuration import CustomConfig # noqa E402 from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402 SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures") SAMPLE_FEATURE_EXTRACTION_CONFIG = get_tests_dir( "fixtures/dummy_feature_extractor_config.json") SAMPLE_CONFIG = get_tests_dir("fixtures/dummy-config.json") class AutoFeatureExtractorTest(unittest.TestCase): def test_feature_extractor_from_model_shortcut(self): config = AutoFeatureExtractor.from_pretrained( "facebook/wav2vec2-base-960h") self.assertIsInstance(config, Wav2Vec2FeatureExtractor) def test_feature_extractor_from_local_directory_from_key(self): config = AutoFeatureExtractor.from_pretrained( SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) self.assertIsInstance(config, Wav2Vec2FeatureExtractor)