def test_new_processor_registration(self):
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        AutoProcessor.register(CustomConfig, CustomProcessor)
        # Trying to register something existing in the Transformers library will raise an error
        with self.assertRaises(ValueError):
            AutoProcessor.register(Wav2Vec2Config, Wav2Vec2Processor)

        # Now that the config is registered, it can be used as any other config with the auto-API
        feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
            tokenizer = CustomTokenizer(vocab_file)

        processor = CustomProcessor(feature_extractor, tokenizer)

        with tempfile.TemporaryDirectory() as tmp_dir:
            processor.save_pretrained(tmp_dir)
            new_processor = AutoProcessor.from_pretrained(tmp_dir)
            self.assertIsInstance(new_processor, CustomProcessor)
    finally:
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
            del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
        if CustomConfig in TOKENIZER_MAPPING._extra_content:
            del TOKENIZER_MAPPING._extra_content[CustomConfig]
        if CustomConfig in PROCESSOR_MAPPING._extra_content:
            del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_simple_s2t(self):
    model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st")
    tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st")

    asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

    waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
    output = asr(waveform)
    self.assertEqual(output, {"text": "(Applausi)"})

    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
    filename = ds[40]["file"]
    output = asr(filename)
    self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})

    filename = ds[40]["file"]
    with open(filename, "rb") as f:
        data = f.read()
    output = asr(data)
    self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
def test_feature_extractor_not_found(self):
    with self.assertRaisesRegex(
        EnvironmentError,
        "hf-internal-testing/config-no-model does not appear to have a file named preprocessor_config.json.",
    ):
        _ = AutoFeatureExtractor.from_pretrained("hf-internal-testing/config-no-model")
def test_maskformer(self):
    threshold = 0.8
    model_id = "facebook/maskformer-swin-base-ade"

    model = AutoModelForInstanceSegmentation.from_pretrained(model_id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

    image_segmenter = pipeline("image-segmentation", model=model, feature_extractor=feature_extractor)

    image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
    file = image[0]["file"]
    outputs = image_segmenter(file, threshold=threshold)

    for o in outputs:
        o["mask"] = hashimage(o["mask"])

    self.assertEqual(
        nested_simplify(outputs, decimals=4),
        [
            {"mask": "20d1b9480d1dc1501dbdcfdff483e370", "label": "wall", "score": None},
            {"mask": "0f902fbc66a0ff711ea455b0e4943adf", "label": "house", "score": None},
            {"mask": "4537bdc07d47d84b3f8634b7ada37bd4", "label": "grass", "score": None},
            {"mask": "b7ac77dfae44a904b479a0926a2acaf7", "label": "tree", "score": None},
            {"mask": "e9bedd56bd40650fb263ce03eb621079", "label": "plant", "score": None},
            {"mask": "37a609f8c9c1b8db91fbff269f428b20", "label": "road, route", "score": None},
            {"mask": "0d8cdfd63bae8bf6e4344d460a2fa711", "label": "sky", "score": None},
        ],
    )
def test_revision_not_found(self):
    with self.assertRaisesRegex(
        EnvironmentError,
        r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)",
    ):
        _ = AutoFeatureExtractor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_push_to_hub_dynamic_feature_extractor(self):
    CustomFeatureExtractor.register_for_auto_class()
    feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)

    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-feature-extractor", use_auth_token=self._token)
        feature_extractor.save_pretrained(tmp_dir)

        # This has added the proper auto_map field to the config
        self.assertDictEqual(
            feature_extractor.auto_map,
            {"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"},
        )
        # The code has been copied from fixtures
        self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_feature_extraction.py")))

        repo.push_to_hub()

    new_feature_extractor = AutoFeatureExtractor.from_pretrained(
        f"{USER}/test-dynamic-feature-extractor", trust_remote_code=True
    )
    # Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module
    self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor")
def convert_swinv2_checkpoint(swinv2_name, pytorch_dump_folder_path):
    # load original timm model and put it in eval mode
    timm_model = timm.create_model(swinv2_name, pretrained=True)
    timm_model.eval()

    config = get_swinv2_config(swinv2_name)
    model = Swinv2ForImageClassification(config)
    model.eval()

    new_state_dict = convert_state_dict(timm_model.state_dict(), model)
    model.load_state_dict(new_state_dict)

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/{}".format(swinv2_name.replace("_", "-")))
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = feature_extractor(images=image, return_tensors="pt")

    # verify that the converted model produces the same logits as the original
    timm_outs = timm_model(inputs["pixel_values"])
    hf_outs = model(**inputs).logits

    assert torch.allclose(timm_outs, hf_outs, atol=1e-3)

    print(f"Saving model {swinv2_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)

    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)

    model.push_to_hub(
        repo_path_or_name=Path(pytorch_dump_folder_path, swinv2_name),
        organization="nandwalritik",
        commit_message="Add model",
    )
def test_simple_wav2vec2(self):
    import numpy as np
    from datasets import load_dataset

    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

    asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

    waveform = np.zeros((34000,))
    output = asr(waveform)
    self.assertEqual(output, {"text": ""})

    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
    filename = ds[0]["file"]
    output = asr(filename)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

    filename = ds[0]["file"]
    with open(filename, "rb") as f:
        data = f.read()
    output = asr(data)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
def test_for_image_classification(self):
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/dit-base-finetuned-rvlcdip")
    model = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-finetuned-rvlcdip")
    model.to(torch_device)

    from datasets import load_dataset

    dataset = load_dataset("nielsr/rvlcdip-demo")

    image = dataset["train"][0]["image"].convert("RGB")

    inputs = feature_extractor(image, return_tensors="pt").to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    expected_shape = torch.Size((1, 16))
    self.assertEqual(logits.shape, expected_shape)

    expected_slice = torch.tensor(
        [-0.4158, -0.4092, -0.4347],
        device=torch_device,
        dtype=torch.float,
    )
    self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4))
def test_simple_wav2vec2(self):
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

    asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

    waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
    output = asr(waveform)
    self.assertEqual(output, {"text": ""})

    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
    filename = ds[40]["file"]
    output = asr(filename)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

    filename = ds[40]["file"]
    with open(filename, "rb") as f:
        data = f.read()
    output = asr(data)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
def convert_weight_and_push(name: str, config: ResNetConfig, save_directory: Path, push_to_hub: bool = True):
    print(f"Converting {name}...")
    with torch.no_grad():
        from_model = timm.create_model(name, pretrained=True).eval()
        our_model = ResNetForImageClassification(config).eval()
        module_transfer = ModuleTransfer(src=from_model, dest=our_model)
        x = torch.randn((1, 3, 224, 224))
        module_transfer(x)

    assert torch.allclose(from_model(x), our_model(x).logits), "The model logits don't match the original one."

    checkpoint_name = f"resnet{'-'.join(name.split('resnet'))}"
    print(checkpoint_name)

    if push_to_hub:
        our_model.push_to_hub(
            repo_path_or_name=save_directory / checkpoint_name,
            commit_message="Add model",
            use_temp_dir=True,
        )

        # we can use the convnext one
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k")
        feature_extractor.push_to_hub(
            repo_path_or_name=save_directory / checkpoint_name,
            commit_message="Add feature extractor",
            use_temp_dir=True,
        )

        print(f"Pushed {checkpoint_name}")
def test_chunking(self):
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model=model,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        framework="pt",
        chunk_length_s=10.0,
    )

    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
    audio = ds[40]["audio"]["array"]

    n_repeats = 10
    audio = np.tile(audio, n_repeats)
    output = speech_recognizer([audio], batch_size=2)
    expected_text = "A MAN SAID TO THE UNIVERSE SIR I EXIST " * n_repeats
    expected = [{"text": expected_text.strip()}]
    self.assertEqual(output, expected)
def test_chunk_iterator(self):
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    inputs = torch.arange(100).long()

    # single chunk covering the whole input
    outs = list(chunk_iter(inputs, feature_extractor, 100, 0, 0))
    self.assertEqual(len(outs), 1)
    self.assertEqual([o["stride"] for o in outs], [(100, 0, 0)])
    self.assertEqual([o["input_values"].shape for o in outs], [(1, 100)])
    self.assertEqual([o["is_last"] for o in outs], [True])

    # two chunks no stride
    outs = list(chunk_iter(inputs, feature_extractor, 50, 0, 0))
    self.assertEqual(len(outs), 2)
    self.assertEqual([o["stride"] for o in outs], [(50, 0, 0), (50, 0, 0)])
    self.assertEqual([o["input_values"].shape for o in outs], [(1, 50), (1, 50)])
    self.assertEqual([o["is_last"] for o in outs], [False, True])

    # two chunks incomplete last
    outs = list(chunk_iter(inputs, feature_extractor, 80, 0, 0))
    self.assertEqual(len(outs), 2)
    self.assertEqual([o["stride"] for o in outs], [(80, 0, 0), (20, 0, 0)])
    self.assertEqual([o["input_values"].shape for o in outs], [(1, 80), (1, 20)])
    self.assertEqual([o["is_last"] for o in outs], [False, True])
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    r"""
    Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor.

    <Tip>

    This class method is simply calling AutoFeatureExtractor's
    [`~PreTrainedFeatureExtractor.from_pretrained`] and TrOCRTokenizer's
    [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the
    methods above for more information.

    </Tip>

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
              namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a feature extractor file saved using the
              [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
            - a path or url to a saved feature extractor JSON *file*, e.g.,
              `./my_model_directory/preprocessor_config.json`.
        **kwargs
            Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
            [`PreTrainedTokenizer`].
    """
    feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

    return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
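# A minimal usage sketch for the classmethod above: the processor bundles the
# feature extractor (image preprocessing) and the tokenizer (text decoding).
# The "microsoft/trocr-base-handwritten" checkpoint and the local image path
# are illustrative assumptions, not taken from this file.
from PIL import Image

from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

image = Image.open("handwriting_sample.png").convert("RGB")  # hypothetical local file
pixel_values = processor(images=image, return_tensors="pt").pixel_values  # feature extractor path
generated_ids = model.generate(pixel_values)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]  # tokenizer path
print(text)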
def test_small_model_from_pipeline(self):
    for small_model in self.small_models:
        model = AutoModelForImageClassification.from_pretrained(small_model)
        feature_extractor = AutoFeatureExtractor.from_pretrained(small_model)
        image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)

        for valid_input in self.valid_inputs:
            output = image_classifier(**valid_input)
            top_k = valid_input.get("top_k", 5)

            def assert_valid_pipeline_output(pipeline_output):
                self.assertTrue(isinstance(pipeline_output, list))
                self.assertEqual(len(pipeline_output), top_k)
                for label_result in pipeline_output:
                    self.assertTrue(isinstance(label_result, dict))
                    self.assertIn("label", label_result)
                    self.assertIn("score", label_result)

            if isinstance(valid_input["images"], list):
                # When images are batched, pipeline output is a list of lists of dictionaries
                self.assertEqual(len(valid_input["images"]), len(output))
                for individual_output in output:
                    assert_valid_pipeline_output(individual_output)
            else:
                # When a single image is passed, pipeline output is a list of dictionaries
                assert_valid_pipeline_output(output)
def convert_cvt_checkpoint(cvt_file, pytorch_dump_folder):
    """
    Function to convert the microsoft cvt checkpoint to huggingface checkpoint
    """
    img_labels_file = "imagenet-1k-id2label.json"
    num_labels = 1000

    repo_id = "datasets/huggingface/label-files"
    id2label = json.load(open(cached_download(hf_hub_url(repo_id, img_labels_file)), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id)

    # For depth size 13 (13 = 1+2+10)
    if cvt_file.rsplit("/", 1)[-1][4:6] == "13":
        config.depth = [1, 2, 10]
    # For depth size 21 (21 = 1+4+16)
    elif cvt_file.rsplit("/", 1)[-1][4:6] == "21":
        config.depth = [1, 4, 16]
    # For wide cvt (similar to wide-resnet) depth size 24 (w24 = 2+2+20)
    else:
        config.depth = [2, 2, 20]
        config.num_heads = [3, 12, 16]
        config.embed_dim = [192, 768, 1024]

    model = CvtForImageClassification(config)
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k")

    original_weights = torch.load(cvt_file, map_location=torch.device("cpu"))

    huggingface_weights = OrderedDict()
    list_of_state_dict = []

    # build the mapping from original parameter names to huggingface names
    for idx in range(config.num_stages):
        if config.cls_token[idx]:
            list_of_state_dict = list_of_state_dict + cls_token(idx)
        list_of_state_dict = list_of_state_dict + embeddings(idx)
        for cnt in range(config.depth[idx]):
            list_of_state_dict = list_of_state_dict + attention(idx, cnt)

    list_of_state_dict = list_of_state_dict + final()
    for gg in list_of_state_dict:
        print(gg)
    for i in range(len(list_of_state_dict)):
        huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]]

    model.load_state_dict(huggingface_weights)
    model.save_pretrained(pytorch_dump_folder)
    feature_extractor.save_pretrained(pytorch_dump_folder)
def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config):
    try:
        feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    except Exception:
        feature_extractor = None
    if hasattr(tiny_config, "image_size") and feature_extractor:
        feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
    return feature_extractor
def test_feature_extractor_from_local_directory_from_config(self):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model_config = Wav2Vec2Config()

        # remove feature_extractor_type to make sure config.json alone is enough to load feature processor locally
        config_dict = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR).to_dict()

        config_dict.pop("feature_extractor_type")
        config = Wav2Vec2FeatureExtractor(**config_dict)

        # save in new folder
        model_config.save_pretrained(tmpdirname)
        config.save_pretrained(tmpdirname)

        config = AutoFeatureExtractor.from_pretrained(tmpdirname)

        self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
def __init__(self, model: str):
    """Create Hugging Face Inference Session"""
    self.model = AutoModelForImageClassification.from_pretrained(model)
    self.feature_extractor = AutoFeatureExtractor.from_pretrained(model)
    self.session = pipeline(
        "image-classification",
        model=self.model,
        feature_extractor=self.feature_extractor,
    )
def test_new_feature_extractor_registration(self):
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
        # Trying to register something existing in the Transformers library will raise an error
        with self.assertRaises(ValueError):
            AutoFeatureExtractor.register(Wav2Vec2Config, Wav2Vec2FeatureExtractor)

        # Now that the config is registered, it can be used as any other config with the auto-API
        feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)

        with tempfile.TemporaryDirectory() as tmp_dir:
            feature_extractor.save_pretrained(tmp_dir)
            new_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir)
            self.assertIsInstance(new_feature_extractor, CustomFeatureExtractor)
    finally:
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
            del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
def _onnx_export(self, test_name, name, model_name, feature, onnx_config_class_constructor, device="cpu"):
    from transformers.onnx import export

    model_class = FeaturesManager.get_model_class_for_feature(feature)
    config = AutoConfig.from_pretrained(model_name)
    model = model_class.from_config(config)
    onnx_config = onnx_config_class_constructor(model.config)

    if is_torch_available():
        from transformers.utils import torch_version

        if torch_version < onnx_config.torch_onnx_minimum_version:
            pytest.skip(
                "Skipping due to incompatible PyTorch version. Minimum required is"
                f" {onnx_config.torch_onnx_minimum_version}, got: {torch_version}"
            )

    # Check the modality of the inputs and instantiate the appropriate preprocessor
    if model.main_input_name == "input_ids":
        preprocessor = AutoTokenizer.from_pretrained(model_name)
        # Useful for causal lm models that do not use pad tokens.
        if not getattr(config, "pad_token_id", None):
            config.pad_token_id = preprocessor.eos_token_id
    elif model.main_input_name == "pixel_values":
        preprocessor = AutoFeatureExtractor.from_pretrained(model_name)
    else:
        raise ValueError(f"Unsupported model input name: {model.main_input_name}")

    with NamedTemporaryFile("w") as output:
        try:
            onnx_inputs, onnx_outputs = export(
                preprocessor, model, onnx_config, onnx_config.default_onnx_opset, Path(output.name), device=device
            )
            validate_model_outputs(
                onnx_config,
                preprocessor,
                model,
                Path(output.name),
                onnx_outputs,
                onnx_config.atol_for_validation,
            )
        except (RuntimeError, ValueError) as e:
            self.fail(f"{name}, {feature} -> {e}")
def test_feature_extractor_from_local_directory_from_config(self):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model_config = Wav2Vec2Config()

        # remove feature_extractor_type to make sure config.json alone is enough to load feature processor locally
        config_dict = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR).to_dict()

        config_dict.pop("feature_extractor_type")
        config = Wav2Vec2FeatureExtractor(**config_dict)

        # save in new folder
        model_config.save_pretrained(tmpdirname)
        config.save_pretrained(tmpdirname)

        config = AutoFeatureExtractor.from_pretrained(tmpdirname)

        # make sure private variable is not incorrectly saved
        dict_as_saved = json.loads(config.to_json_string())
        self.assertTrue("_processor_class" not in dict_as_saved)

        self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config):
    try:
        feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    except Exception:
        feature_extractor = None

    if hasattr(tiny_config, "image_size") and feature_extractor:
        feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)

    # Speech2TextModel specific.
    if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor:
        feature_extractor = feature_extractor.__class__(
            feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
        )
    return feature_extractor
def test_small_model_pt_seq2seq(self):
    model_id = "hf-internal-testing/tiny-random-speech-encoder-decoder"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        framework="pt",
    )

    waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
    output = speech_recognizer(waveform)
    self.assertEqual(output, {"text": "あл ш 湯 清 ه ܬ া लᆨしث ल eか u w 全 u"})
def convert_weight_and_push(
    name: str,
    config: VanConfig,
    checkpoint: str,
    from_model: nn.Module,
    save_directory: Path,
    push_to_hub: bool = True,
):
    print(f"Downloading weights for {name}...")
    checkpoint_path = cached_download(checkpoint)
    print(f"Converting {name}...")
    from_state_dict = torch.load(checkpoint_path)["state_dict"]
    from_model.load_state_dict(from_state_dict)
    from_model.eval()
    with torch.no_grad():
        our_model = VanForImageClassification(config).eval()
        module_transfer = ModuleTransfer(src=from_model, dest=our_model)
        x = torch.randn((1, 3, 224, 224))
        module_transfer(x)
        our_model = copy_parameters(from_model, our_model)

    assert torch.allclose(from_model(x), our_model(x).logits), "The model logits don't match the original one."

    checkpoint_name = name
    print(checkpoint_name)

    if push_to_hub:
        our_model.push_to_hub(
            repo_path_or_name=save_directory / checkpoint_name,
            commit_message="Add model",
            use_temp_dir=True,
        )

        # we can use the convnext one
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k")
        feature_extractor.push_to_hub(
            repo_path_or_name=save_directory / checkpoint_name,
            commit_message="Add feature extractor",
            use_temp_dir=True,
        )

        print(f"Pushed {checkpoint_name}")
def test_large_model_pt(self):
    model_id = "facebook/detr-resnet-50"

    model = AutoModelForObjectDetection.from_pretrained(model_id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
    object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)

    outputs = object_detector("http://images.cocodataset.org/val2017/000000039769.jpg")
    self.assertEqual(
        nested_simplify(outputs, decimals=4),
        [
            {"score": 0.9982, "label": "remote", "box": {"xmin": 40, "ymin": 70, "xmax": 175, "ymax": 117}},
            {"score": 0.9960, "label": "remote", "box": {"xmin": 333, "ymin": 72, "xmax": 368, "ymax": 187}},
            {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 639, "ymax": 473}},
            {"score": 0.9988, "label": "cat", "box": {"xmin": 13, "ymin": 52, "xmax": 314, "ymax": 470}},
            {"score": 0.9987, "label": "cat", "box": {"xmin": 345, "ymin": 23, "xmax": 640, "ymax": 368}},
        ],
    )

    outputs = object_detector(
        [
            "http://images.cocodataset.org/val2017/000000039769.jpg",
            "http://images.cocodataset.org/val2017/000000039769.jpg",
        ]
    )
    self.assertEqual(
        nested_simplify(outputs, decimals=4),
        [
            [
                {"score": 0.9982, "label": "remote", "box": {"xmin": 40, "ymin": 70, "xmax": 175, "ymax": 117}},
                {"score": 0.9960, "label": "remote", "box": {"xmin": 333, "ymin": 72, "xmax": 368, "ymax": 187}},
                {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 639, "ymax": 473}},
                {"score": 0.9988, "label": "cat", "box": {"xmin": 13, "ymin": 52, "xmax": 314, "ymax": 470}},
                {"score": 0.9987, "label": "cat", "box": {"xmin": 345, "ymin": 23, "xmax": 640, "ymax": 368}},
            ],
            [
                {"score": 0.9982, "label": "remote", "box": {"xmin": 40, "ymin": 70, "xmax": 175, "ymax": 117}},
                {"score": 0.9960, "label": "remote", "box": {"xmin": 333, "ymin": 72, "xmax": 368, "ymax": 187}},
                {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 639, "ymax": 473}},
                {"score": 0.9988, "label": "cat", "box": {"xmin": 13, "ymin": 52, "xmax": 314, "ymax": 470}},
                {"score": 0.9987, "label": "cat", "box": {"xmin": 345, "ymin": 23, "xmax": 640, "ymax": 368}},
            ],
        ],
    )
def test_simple_s2t(self):
    import numpy as np
    from datasets import load_dataset

    model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st")
    tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st")

    asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

    waveform = np.zeros((34000,))

    output = asr(waveform)
    self.assertEqual(output, {"text": "E questo è il motivo per cui non ci siamo mai incontrati."})

    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
    filename = ds[0]["file"]
    output = asr(filename)
    self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})

    filename = ds[0]["file"]
    with open(filename, "rb") as f:
        data = f.read()
    output = asr(data)
    self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
def main(args):
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first two examples as a test
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    if args.device is None:
        args.device = 0 if torch.cuda.is_available() else -1
    asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)

    # map function to decode audio
    def map_to_pred(batch):
        prediction = asr(
            batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
        )

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log_results
    # do not change function below
    log_results(result, args)
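# main() above only consumes attributes from an argparse namespace; the parser
# itself is not shown in this file. A minimal sketch of the arguments it would
# need, with names inferred from the attribute accesses above (the flag help
# strings and defaults are assumptions):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, required=True, help="Model identifier on the Hub")
    parser.add_argument("--dataset", type=str, required=True, help="Dataset name")
    parser.add_argument("--config", type=str, required=True, help="Dataset config (e.g. language)")
    parser.add_argument("--split", type=str, required=True, help="Dataset split, e.g. 'test'")
    parser.add_argument("--chunk_length_s", type=float, default=None, help="Chunk length in seconds for long audio")
    parser.add_argument("--stride_length_s", type=float, default=None, help="Stride in seconds between chunks")
    parser.add_argument("--device", type=int, default=None, help="GPU id, or -1 for CPU; auto-detected when omitted")
    args = parser.parse_args()

    main(args)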
def test_push_to_hub_dynamic_feature_extractor(self):
    CustomFeatureExtractor.register_for_auto_class()
    feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)

    feature_extractor.push_to_hub("test-dynamic-feature-extractor", use_auth_token=self._token)

    # This has added the proper auto_map field to the config
    self.assertDictEqual(
        feature_extractor.auto_map,
        {"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"},
    )

    new_feature_extractor = AutoFeatureExtractor.from_pretrained(
        f"{USER}/test-dynamic-feature-extractor", trust_remote_code=True
    )
    # Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module
    self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor")
def test_small_model_pt(self):
    model_id = "mishig/tiny-detr-mobilenetsv3"

    model = AutoModelForObjectDetection.from_pretrained(model_id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
    object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)

    outputs = object_detector("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
    self.assertEqual(
        nested_simplify(outputs, decimals=4),
        [
            {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 160, "ymin": 120, "xmax": 480, "ymax": 359}},
            {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 160, "ymin": 120, "xmax": 480, "ymax": 359}},
        ],
    )

    outputs = object_detector(
        [
            "http://images.cocodataset.org/val2017/000000039769.jpg",
            "http://images.cocodataset.org/val2017/000000039769.jpg",
        ],
        threshold=0.0,
    )
    self.assertEqual(
        nested_simplify(outputs, decimals=4),
        [
            [
                {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 160, "ymin": 120, "xmax": 480, "ymax": 359}},
                {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 160, "ymin": 120, "xmax": 480, "ymax": 359}},
            ],
            [
                {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 160, "ymin": 120, "xmax": 480, "ymax": 359}},
                {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 160, "ymin": 120, "xmax": 480, "ymax": 359}},
            ],
        ],
    )