Python Wav2Vec2Processor Examples, transformers.Wav2Vec2Processor Python Examples

Example #1

0

Show file

File: hg_training.py Project: YamSok/fairseq

def data_preparation():
    """Build (or reload) the processor and turn the raw data into model inputs.

    When ``results_hg/{MODEL}/{LABEL}/processor`` already contains files the
    saved processor is loaded from there. Otherwise a vocabulary is generated,
    a tokenizer and feature extractor are created from it, and the resulting
    processor is saved into that directory. The module-level ``processor``
    global is rebound in both cases.

    Returns:
        tuple: ``(processor, dataset_prepared, data_collator)``.
    """
    global processor
    data = import_data()

    if not glob.glob(f"results_hg/{MODEL}/{LABEL}/processor/*"):
        print(">> Creating processor ")

        gen_vocab(data)
        tokenizer = Wav2Vec2CTCTokenizer(
            f"results_hg/{MODEL}/{LABEL}/vocab.json",
            unk_token="[UNK]",
            pad_token="[PAD]",
            word_delimiter_token="|",
        )
        feature_extractor = Wav2Vec2FeatureExtractor(
            feature_size=1,
            sampling_rate=16000,
            padding_value=0.0,
            do_normalize=True,
            return_attention_mask=True,
        )
        processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
        processor.save_pretrained(f'results_hg/{MODEL}/{LABEL}/processor/')
    else:
        print(">> From pretrained processor ")
        processor = Wav2Vec2Processor.from_pretrained(f"results_hg/{MODEL}/{LABEL}/processor")

    # First pass: load the audio; second pass: batched feature/label preparation.
    raw_columns = data.column_names["train"]
    dataset = data.map(speech_file_to_array_fn, remove_columns=raw_columns, num_proc=4)

    loaded_columns = dataset.column_names["train"]
    dataset_prepared = dataset.map(
        prepare_dataset,
        remove_columns=loaded_columns,
        batch_size=8,
        num_proc=4,
        batched=True,
    )

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    return processor, dataset_prepared, data_collator

Example #2

0

Show file

File: run_forward.py Project: patrickvonplaten/convert_wav2vec2

def save_processor():
    """Write a processor to ``hf_path`` whose tokenizer comes from a local vocab.

    The pretrained ``facebook/wav2vec2-large-960h-lv60-self`` processor is
    saved to ``hf_path`` first. A CTC tokenizer built from
    ``hf_path/vocab.json`` is then saved over it, and the processor is
    reloaded from ``hf_path`` and saved once more.
    """
    Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-large-960h-lv60-self").save_pretrained(hf_path)

    # presumably create_vocab produces hf_path/vocab.json -- confirm against its definition
    create_vocab("../add_wav2vec/data/temp/dict.ltr.txt")
    vocab_tokenizer = Wav2Vec2CTCTokenizer(hf_path + "/vocab.json")
    vocab_tokenizer.save_pretrained(hf_path)

    Wav2Vec2Processor.from_pretrained(hf_path).save_pretrained(hf_path)

Example #3

0

Show file

    def test_push_to_hub(self):
        """Push a processor to the hub and check the reloaded copy matches it."""
        original = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            target_dir = os.path.join(tmp_dir, "test-processor")
            original.save_pretrained(target_dir, push_to_hub=True, use_auth_token=self._token)

            reloaded = Wav2Vec2Processor.from_pretrained(f"{USER}/test-processor")

            # Every attribute of the local feature extractor must survive the round trip.
            for attr_name, attr_value in vars(original.feature_extractor).items():
                self.assertEqual(attr_value, getattr(reloaded.feature_extractor, attr_name))

            self.assertDictEqual(reloaded.tokenizer.get_vocab(), original.tokenizer.get_vocab())

Example #4

0

Show file

def load_model():
    """Load the French XLSR wav2vec2 CTC model, its processor and a resampler.

    Returns:
        tuple: ``(model, processor, resampler)``. The resampler is built with
        identical input and output frequencies (16 kHz).
    """
    checkpoint = "facebook/wav2vec2-large-xlsr-53-french"
    ctc_model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    ctc_processor = Wav2Vec2Processor.from_pretrained(checkpoint)

    resample_kwargs = {"orig_freq": 16_000, "new_freq": 16_000}
    same_rate_resampler = torchaudio.transforms.Resample(**resample_kwargs)

    return ctc_model, ctc_processor, same_rate_resampler

Example #5

0

Show file

File: expert.py Project: simpleoier/s3prl

    def __init__(self,
                 ckpt: str = None,
                 model_config: str = None,
                 feature_selection: str = None,
                 **kwargs):
        """
        Load the processor and model from ``ckpt`` and record the output size.

        Args:
            ckpt:
                Checkpoint name or path handed to ``from_pretrained`` for
                both the processor and the model.

            model_config:
                Config path for constructing the model. Not read by this
                constructor.

            feature_selection:
                String selecting between behaviours of the same pretrained
                model (e.g. which layers to extract). Not read by this
                constructor.
        """
        super().__init__()

        self.processor = Wav2Vec2Processor.from_pretrained(ckpt)
        self.model = Wav2Vec2Model.from_pretrained(ckpt)

        # Run one random waveform of SAMPLE_RATE samples through forward()
        # and keep the size of the last dimension of its first output.
        dummy_batch = [torch.randn(SAMPLE_RATE)]
        first_output = self.forward(dummy_batch)[0]
        self._output_dim = first_output.size(-1)

Example #6

0

Show file

File: run.py Project: nelfin/electron-wav2vec

def main(input_pipe, output_pipe):
    """Serve transcriptions: receive audio on one pipe, reply on the other.

    Loads the ``facebook/wav2vec2-base-960h`` processor and model, signals
    readiness on the sink, then loops forever: receive a payload, convert it
    from webm to wav, transcribe the first channel and send back the start
    time, the processing duration and the text. Both endpoints are closed
    when the loop exits.

    Args:
        input_pipe: Address passed to ``ZeroMQSource``.
        output_pipe: Address passed to ``ZeroMQSink``.
    """
    source = ZeroMQSource(input_pipe)
    sink = ZeroMQSink(output_pipe)

    debug('[+] loading processor')
    processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
    debug('[+] loading model')
    model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')

    sink.signal_ready()
    try:
        while True:
            debug('>>> waiting for connection')
            # torchaudio seems to expect complete files so send small parts
            payload = io.BytesIO(source.recv())
            started_at = time.time()

            debug('[+] converting audio')
            wav_file = io.BytesIO(webm_to_wav(payload.read()))
            channels, sample_rate = torchaudio.load(wav_file)
            # Wav2Vec2Processor expects mono 16kHz audio: keep the first channel only
            mono = channels[0]

            debug('[+] input_values')
            encoded = processor(mono, sampling_rate=sample_rate, return_tensors='pt')
            input_values = encoded.input_values

            debug('[+] logits')
            logits = model(input_values).logits

            debug('[+] predicted_ids')
            predicted_ids = torch.argmax(logits, dim=-1)

            debug('[+] transcription')
            decoded = processor.batch_decode(predicted_ids)
            transcription = decoded[0]

            elapsed = time.time() - started_at
            sink.send(started_at, elapsed, transcription)
    finally:
        source.close()
        sink.close()

Example #7

0

Show file

File: datasets_wav2vec2.py Project: syoamakase/Transformer_ASR

    def __init__(self, csv_file, hp):
        """
        Load the annotation table, the SentencePiece model and the wav2vec2
        processor, and build the per-utterance length cache when it is missing.

        Args:
            csv_file (string): Path to the '|'-separated annotation file
                (read without a header row; column 0 is used as the wav path).
            hp: Hyper-parameter object; this method reads ``hp.spm_model``
                and ``hp.lengths_file``.
        """
        # Raw string: the separator is the regex ``\|`` (a literal pipe). The
        # non-raw spelling '\|' relies on an invalid escape sequence.
        self.landmarks_frame = pd.read_csv(csv_file, sep=r'\|', header=None)
        self.hp = hp
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.hp.spm_model)
        ## TODO: variable
        self.processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-large-lv60")

        ## TODO
        # Compute and save the lengths only when no lengths file is configured
        # or the configured file does not exist yet.
        if self.hp.lengths_file is None or not os.path.exists(
                self.hp.lengths_file):
            print('lengths_file is not exists. Make...')
            lengths_list = []
            pbar = tqdm(range(len(self.landmarks_frame)))
            for idx in pbar:
                wav_name = self.landmarks_frame.loc[idx, 0]
                audio_input, sampling_rate = sf.read(wav_name)
                wav_input = self.processor(audio_input,
                                           sampling_rate=sampling_rate,
                                           return_tensors="pt").input_values
                ## TODO: check calucation for lengths (int(wav_input.shape[1]//320))
                # Length derived from the sample count in wav_input.shape[1],
                # using a window of 400 and a step of 320 samples.
                wav2vec2_length = math.floor(
                    (wav_input.shape[1] - 400) / 320.) + 1

                lengths_list.append(wav2vec2_length)

            self.lengths_np = np.array(lengths_list)
            np.save(self.hp.lengths_file, self.lengths_np)

Example #8

0

Show file

File: model.py Project: samirt8/wav2vec2_test

 def __init__(self):
     """Assemble the CTC tokenizer, feature extractor, processor and XLSR model.

     The tokenizer is built from ``./vocab.json``; the CTC model is loaded
     from ``facebook/wav2vec2-large-xlsr-53`` with its pad token id and
     vocabulary size taken from that tokenizer.
     """
     super(ASR_CTC, self).__init__()

     tokenizer_options = dict(unk_token="<unk>",
                              pad_token="<pad>",
                              word_delimiter_token="|")
     self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", **tokenizer_options)

     extractor_options = dict(feature_size=1,
                              sampling_rate=16000,
                              padding_value=0.0,
                              do_normalize=True,
                              return_attention_mask=True)
     self.feature_extractor = Wav2Vec2FeatureExtractor(**extractor_options)

     self.processor = Wav2Vec2Processor(
         feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)

     processor_tokenizer = self.processor.tokenizer
     model_overrides = dict(
         attention_dropout=0.1,
         hidden_dropout=0.1,
         feat_proj_dropout=0.0,
         mask_time_prob=0.05,
         layerdrop=0.1,
         gradient_checkpointing=True,
         ctc_loss_reduction="mean",
         pad_token_id=processor_tokenizer.pad_token_id,
         vocab_size=len(processor_tokenizer))
     self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
         "facebook/wav2vec2-large-xlsr-53", **model_overrides)

Example #9

0

Show file

File: build_pipeline_from_huggingface_transformers.py Project: pytorch/audio

def _get_model(model_id):
    """Import a Hugging Face CTC checkpoint and return it with its labels.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            processor and the CTC model.

    Returns:
        tuple: the imported model in eval mode, and the vocabulary tokens
        ordered by their ids.
    """
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    vocab = Wav2Vec2Processor.from_pretrained(model_id).tokenizer.get_vocab()
    # Tokens sorted by the id they map to.
    labels = sorted(vocab, key=vocab.get)

    hf_model = Wav2Vec2ForCTC.from_pretrained(model_id)
    converted = import_huggingface_model(hf_model)
    return converted.eval(), labels

Example #10

0

Show file

File: test_modeling_hubert.py Project: huggingface/transformers

    def test_inference_ctc_batched(self):
        """Half-precision batched CTC inference must reproduce the reference text."""
        checkpoint = "facebook/hubert-large-ls960-ft"
        model = HubertForCTC.from_pretrained(checkpoint, torch_dtype=torch.float16)
        model = model.to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

        speech_samples = self._load_datasamples(2)
        encoded = processor(speech_samples, return_tensors="pt", padding=True)

        # The model was loaded in float16, so the inputs are cast to half too.
        half_inputs = encoded.input_values.half().to(torch_device)
        mask = encoded.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(half_inputs, attention_mask=mask).logits

        decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

        expected = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        ]
        self.assertListEqual(decoded, expected)

Example #11

0

Show file

File: our_datasets.py Project: garrisonhess/autosubs

    def __init__(self,
                 subtitle_lookup_path,
                 wav_dir,
                 wav2vec_checkpoint,
                 pretrained_chpt="facebook/wav2vec2-base-960h",
                 num_proc=4,
                 preprocess_batch_size=8):
        """
        Store the dataset settings, load the pretrained pieces and prepare the data.

        Args:
            subtitle_lookup_path (NOTE this file must be compatible with pd.read_csv())
            wav_dir: directory of wav files e.g. './wav_data/'
            wav2vec_checkpoint: accepted but not read by this constructor
            pretrained_chpt: pretrained checkpoint used for both the processor
                and the CTC model whose feature extractor is kept
            num_proc: number of processes allowed when doing dataset preprocessing
            preprocess_batch_size: this is ONLY used inside this dataset to preprocess wav files faster
        """
        self.subtitle_lookup_path = subtitle_lookup_path
        self.wav_dir = wav_dir
        self.num_proc = num_proc
        self.preprocess_batch_size = preprocess_batch_size

        self.processor = Wav2Vec2Processor.from_pretrained(pretrained_chpt)

        # Only the convolutional feature extractor of the CTC model is retained.
        ctc_model = Wav2Vec2ForCTC.from_pretrained(pretrained_chpt)
        self.feature_extractor = ctc_model.wav2vec2.feature_extractor

        self.knnw_prepared = self.setup_dataset()

Example #12

0

Show file

File: process_for_asr.py Project: divvun/lang-sme-ml-speech

def prepare(reg=True, from_scratch=False):
    """Load, clean and featurize the train/test ASR splits, then pickle them.

    Args:
        reg: When true, load the processor from
            ``./asr_output/pretrained_processor`` (this takes precedence over
            the one created for ``from_scratch``).
        from_scratch: When true, rebuild the vocabulary from both splits,
            write it to disk and initialise a new processor.

    Returns:
        tuple: ``(train, test)`` datasets after feature and label preparation.

    Raises:
        ValueError: If both ``reg`` and ``from_scratch`` are false, because no
            processor would be available for feature extraction.
    """
    # Fail fast: without either flag ``processor`` would never be bound and the
    # error would only surface later, inside the dataset mapping.
    if not (reg or from_scratch):
        raise ValueError(
            "prepare() needs reg=True or from_scratch=True to obtain a processor")

    # load data
    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    train = read_txt('./data/speech-sme-asr/train_asr.txt')

    # remove special characters
    train = train.map(remove_special_characters)
    test = test.map(remove_special_characters)

    # build vocab dict

    if from_scratch:
        vocab_dict = build_vocab_dict(train, test)
        write_vocab_dict_to_disk(vocab_dict)

        processor = processor_init()
    if reg:
        processor = Wav2Vec2Processor.from_pretrained(
            './asr_output/pretrained_processor')

    def prepare_dataset(batch):
        """Turn a batch of speech arrays and target texts into inputs and labels."""

        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values

        # Inside this context the processor call encodes the target text.
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    # speech file to array
    train = train.map(speech_file_to_array_fn,
                      remove_columns=train.column_names)
    test = test.map(speech_file_to_array_fn, remove_columns=test.column_names)

    print("Preparing train dataset")
    train = train.map(prepare_dataset,
                      remove_columns=train.column_names,
                      batch_size=1,
                      num_proc=1,
                      batched=True)
    print("Preparing test dataset")
    test = test.map(prepare_dataset,
                    remove_columns=test.column_names,
                    batch_size=1,
                    num_proc=1,
                    batched=True)
    print("Done")

    # Context managers make sure the pickle files are flushed and closed.
    with open('./data/speech-sme-asr/train_asr.pkl', 'wb') as train_file:
        pickle.dump(train, train_file)

    with open('./data/speech-sme-asr/test_asr.pkl', 'wb') as test_file:
        pickle.dump(test, test_file)

    return train, test

Example #13

0

Show file

File: test_modeling_wav2vec2.py Project: Mehrad0711/transformers

    def test_inference_ctc_normal_batched(self):
        """Batched CTC inference with the base 960h model must match the reference text."""
        checkpoint = "facebook/wav2vec2-base-960h"
        model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
        model.to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

        speech_samples = self._load_datasamples(2)
        encoded = processor(
            speech_samples,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        device_inputs = encoded.input_values.to(torch_device)
        with torch.no_grad():
            logits = model(device_inputs).logits

        decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

        expected = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
        ]
        self.assertListEqual(decoded, expected)

Example #14

0

Show file

    def __init__(self,
                 path="facebook/wav2vec2-base-960h",
                 quantize=False,
                 gpu=True,
                 batch=64):
        """
        Constructs a new transcription pipeline.

        Args:
            path: optional path to model, accepts Hugging Face model hub id or local path,
                  uses default model for task if not provided
            quantize: if model should be quantized, defaults to False
            gpu: True/False if GPU should be enabled, also supports a GPU device id
            batch: batch size used to incrementally process content

        Raises:
            ImportError: if the SOUNDFILE flag is false
        """

        # Parent constructor receives all four arguments unchanged
        super().__init__(path, quantize, gpu, batch)

        if not SOUNDFILE:
            message = "SoundFile library not installed or libsndfile not found"
            raise ImportError(message)

        # Model and processor both come from self.path
        ctc_model = Wav2Vec2ForCTC.from_pretrained(self.path)
        self.model = ctc_model
        self.processor = Wav2Vec2Processor.from_pretrained(self.path)

        # Place the model on self.device
        self.model.to(self.device)

Example #15

0

Show file

File: core.py Project: vishalraj-95/ktrain

    def __init__(self,
                 model_name="facebook/wav2vec2-base-960h",
                 device=None,
                 half=False):
        """
        ```
        basic wrapper speech transcription

        Args:
          model_name(str): name or path of the pretrained Wav2Vec2 CTC model
          device(str): device to use (e.g., 'cuda', 'cpu');
                       when None, 'cuda' is used if available, else 'cpu'
          half(bool): If True, use half precision.
        ```
        """
        if not TORCH:
            raise ImportError('Transcriber requires PyTorch to be installed.')

        # Pick a device automatically only when the caller gave none.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.torch_device = device

        if not LIBROSA:
            raise ImportError(
                "librosa library must be installed: pip install librosa. Conda users may also have to install ffmpeg: conda install -c conda-forge ffmpeg"
            )

        # load model and processor
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(
            self.torch_device)
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        if half:
            self.model = self.model.half()

Example #16

0

Show file

File: huggingface_wav2vec.py Project: qingqingxu2020/speechbrain

    def __init__(self,
                 source,
                 save_path,
                 output_norm=True,
                 freeze=True,
                 pretrain=True):
        """Load a wav2vec2 model and its processor from ``source``.

        Args:
            source: Identifier passed to ``from_pretrained``.
            save_path: Directory used as ``cache_dir`` for both downloads.
            output_norm: Stored on the instance as ``output_norm``.
            freeze: When true the model is put in eval mode, otherwise in
                train mode.
            pretrain: When false the model's layers are re-initialised via
                ``reset_layer``.
        """
        super().__init__()

        # The processor is kept only to read its normalisation setting.
        self.proc = Wav2Vec2Processor.from_pretrained(source,
                                                      cache_dir=save_path)
        self.model = Wav2Vec2Model.from_pretrained(source, cache_dir=save_path)

        # Discard the pretrained weights when pretraining is disabled.
        if not pretrain:
            self.reset_layer(self.model)

        # Whether inputs must be normalised the way the pretrained model expects.
        feature_extractor = self.proc.feature_extractor
        self.normalize_wav = feature_extractor.do_normalize

        self.freeze = freeze
        self.output_norm = output_norm
        if not self.freeze:
            self.model.train()
        else:
            self.model.eval()

Example #17

0

Show file

File: test_modeling_data2vec_audio.py Project: huggingface/transformers

    def test_inference_ctc_batched(self):
        """Batched Data2VecAudio CTC inference must match four reference transcriptions."""
        model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")
        model = model.to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True)

        speech_samples = self._load_datasamples(4)
        encoded = processor(speech_samples, return_tensors="pt", padding=True)

        device_inputs = encoded.input_values.to(torch_device)
        with torch.no_grad():
            logits = model(device_inputs).logits

        decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

        expected = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about",
            "his instant of panic was followed by a small sharp blow high on his chest",
        ]
        self.assertListEqual(decoded, expected)

Example #18

0

Show file

    def __init__(self, model_name='te'):
        """Download (if needed) and load the ASR processor and model for a language.

        Args:
            model_name: A key of ``MODEL_URLS`` or an alias from
                ``LANGUAGE_ALISASES``; matching is case-insensitive.

        If the name cannot be resolved to a key of ``MODEL_URLS``, the valid
        names are printed and the constructor returns early without setting
        ``processor`` or ``model``.
        """
        model_name = model_name.lower()
        # Replace every alias occurring inside the name by its canonical form.
        for x, y in LANGUAGE_ALISASES.items():
            model_name = model_name.replace(x, y)

        # Resolve an exact alias that is not itself a model key. The original
        # nested check for this was unreachable, so such names fell through to
        # a KeyError on MODEL_URLS below.
        if model_name not in MODEL_URLS:
            model_name = LANGUAGE_ALISASES.get(model_name, model_name)

        if model_name not in MODEL_URLS:
            print(f"model_name should be one of {list(MODEL_URLS.keys())}")
            return None

        home = os.path.expanduser("~")
        lang_path = os.path.join(home, ".IndicASR_" + model_name)
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(lang_path, exist_ok=True)

        # Fetch only the files that are not already present locally.
        for file_name, url in MODEL_URLS[model_name].items():
            file_path = os.path.join(lang_path, file_name)
            if os.path.exists(file_path):
                continue
            print(f"Downloading {file_name}")
            pydload.dload(url=url, save_to_path=file_path, max_time=None)

        self.processor = Wav2Vec2Processor.from_pretrained(lang_path)
        self.model = Wav2Vec2ForCTC.from_pretrained(lang_path)

        if torch.cuda.is_available():
            print("Using GPU")
            self.model = self.model.cuda()

Example #19

0

Show file

    def test_inference_ctc_robust_batched(self):
        """Batched TF Hubert CTC inference must match four reference transcriptions."""
        checkpoint = "facebook/hubert-large-ls960-ft"
        model = TFHubertForCTC.from_pretrained(checkpoint)
        processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

        speech_samples = self._load_datasamples(4)
        encoded = processor(
            speech_samples,
            return_tensors="tf",
            padding=True,
            sampling_rate=16000,
        )

        logits = model(encoded.input_values, attention_mask=encoded.attention_mask).logits

        decoded = processor.batch_decode(tf.argmax(logits, axis=-1))

        expected = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
            " him with the thousands of spectators were trivialities not worth thinking about",
            "his instant of panic was followed by a small sharp blow high on his chest",
        ]
        self.assertListEqual(decoded, expected)

Example #20

0

Show file

    def test_inference_ctc_batched(self):
        """Batched SEW-D CTC inference must match the reference transcriptions."""
        # TODO: enable this test once the finetuned models are available
        checkpoint = "asapp/sew-d-tiny-100k-ft-100h"
        model = SEWDForCTC.from_pretrained(checkpoint).to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

        speech_samples = self._load_datasamples(2)
        encoded = processor(speech_samples, return_tensors="pt", padding=True)

        device_inputs = encoded.input_values.to(torch_device)
        mask = encoded.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(device_inputs, attention_mask=mask).logits

        decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

        expected = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        ]
        self.assertListEqual(decoded, expected)

Example #21

0

Show file

 def __init__(self, device="cuda"):
     """Load the base wav2vec2 encoder in eval mode on ``device`` plus its processor.

     Args:
         device: Device the encoder is moved to; also stored as ``self.device``.
     """
     checkpoint = "facebook/wav2vec2-base"

     encoder = Wav2Vec2Model.from_pretrained(checkpoint)
     encoder.eval()
     self.encoder = encoder.to(device)

     preprocessor = Wav2Vec2Processor.from_pretrained(checkpoint)
     # Record the sample rate on the processor object itself.
     preprocessor._sample_rate = 16000
     self.preprocessor = preprocessor

     self.device = device

Example #22

0

Show file

File: w2v2_decode.py Project: martijnbentum/frisian_asr

 def load(self):
     """Lazily load the processor and, when requested, the CTC model.

     The processor is loaded from ``self.recognizer_dir`` only if the
     instance has no ``processor`` attribute yet. The model is loaded when
     ``self.load_model`` is true and moved to CUDA when ``self.use_cuda``
     is true.
     """
     processor_missing = not hasattr(self, 'processor')
     if processor_missing:
         self.processor = Wav2Vec2Processor.from_pretrained(
             self.recognizer_dir)

     if not self.load_model:
         return

     self.model = Wav2Vec2ForCTC.from_pretrained(self.recognizer_dir)
     if self.use_cuda:
         self.model = self.model.to("cuda")

Example #23

0

Show file

def load_processor(vocab_dir= vocab_dir, cache_dir = cache_dir, force = False):
    """Return the shared Wav2Vec2 processor, building it on first use.

    Args:
        vocab_dir: Passed to ``load_tokenizer``.
        cache_dir: Passed to ``load_tokenizer``.
        force: When true, rebuild the processor even if one is already
            cached in the module-level ``processor`` global. Previously this
            flag was accepted but ignored.

    Returns:
        The processor, which is also stored in the ``processor`` global.
    """
    global processor
    # Reuse the cached instance unless the caller explicitly asks for a rebuild.
    if processor and not force:
        return processor
    tokenizer = load_tokenizer(vocab_dir, cache_dir)
    feature_extractor = load_feature_extractor()
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    return processor

Example #24

0

Show file

    def __init__(self):
        """Load the pretrained base 960h wav2vec2 processor and CTC model."""
        self.REQUIRED_SAMPLE_RATE = 16000

        # Facebook's pretrained Wav2Vec2 base model
        # https://huggingface.co/facebook/wav2vec2-base-960h
        checkpoint = 'facebook/wav2vec2-base-960h'
        self.processor = Wav2Vec2Processor.from_pretrained(checkpoint)
        self.model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

Example #25

0

Show file

 def init(self):
     """Load processor, model and greedy decoder for ``self.model_name``.

     Returns:
         ``self``, so the call can be chained after construction.
     """
     processor = Wav2Vec2Processor.from_pretrained(self.model_name)
     self.processor = processor
     # The pipeline relies on the feature extractor normalising its input.
     assert processor.feature_extractor.do_normalize is True

     self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)

     # Vocabulary tokens in the order the tokenizer's vocab dict yields them.
     target_dictionary = list(processor.tokenizer.get_vocab())
     print(f"target_dictionary: {target_dictionary}")

     self.decoder = GreedyDecoder(target_dictionary).init()
     return self

Example #26

0

Show file

File: asr_utils.py Project: divvun/lang-sme-ml-speech

def evaluate_asr():
    """Print sample predictions and report the word error rate on the test set.

    The processor is loaded from ``asr_output/pretrained_processor`` and the
    model from ``asr_output/checkpoint-27363``; inference runs on CPU.

    Returns:
        str: ``"WER: <value>"`` where the value is the WER in percent,
        formatted with two decimals.
    """
    def read_txt(txt_path):
        """Read a text file of ``path|sentence`` lines into a ``Dataset``."""
        data = pd.read_csv(txt_path, delimiter='\n', header=None, names=['path', 'sentence'])

        # NOTE(review): str.contains interprets '|' as a regular expression by
        # default, so this mask is likely true for every row -- confirm whether
        # regex=False was intended before relying on it as a filter.
        has_colon = data['path'].str.contains('|')
        data[['path', 'sentence']] = data.loc[has_colon, 'path'].str.split('|', expand=True)

        data = Dataset.from_pandas(data)
        return data

    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    processor = Wav2Vec2Processor.from_pretrained("asr_output/pretrained_processor")
    model = Wav2Vec2ForCTC.from_pretrained("asr_output/checkpoint-27363").to("cpu")

    def speech_file_to_array_fn(batch):
        """Load the audio file of one example and keep its first channel."""
        speech_array, sampling_rate = torchaudio.load('./data/' + batch["path"])
        batch["speech"] = speech_array[0].numpy()
        return batch

    test_dataset = test.map(speech_file_to_array_fn)

    # Quick sanity check on the first 11 utterances.
    input_dict = processor(test_dataset['speech'][:11], sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_dict.input_values.to("cpu")).logits

    predicted_ids = torch.argmax(logits, dim=-1)

    print("Prediction:", processor.batch_decode(predicted_ids))
    print("Reference:", test_dataset["sentence"][:11])

    wer = load_metric("wer")

    def evaluate_batch(batch):
        """Decode one batch and store the predicted strings under ``pred_strings``."""
        inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

        with torch.no_grad():
            logits = model(inputs.input_values.to("cpu"), attention_mask=inputs.attention_mask.to("cpu")).logits

        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_strings"] = processor.batch_decode(pred_ids)
        return batch

    result = test_dataset.map(evaluate_batch, batched=True, batch_size=8) # batch_size=8 -> requires ~14.5GB GPU memory

    # ".2f" is a precision of two decimals; the previous "2f" was only a
    # minimum field width and printed six decimals.
    msg = "WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"]))
    print(msg)
    return msg

Example #27

0

Show file

File: model_ctc.py Project: vbrydik/pyw2v2

    def _init_processor(self, config: EasyDict):
        """Build the processor from ``config``, save it and keep it on the instance.

        The tokenizer config's ``vocab_file`` is overwritten with
        ``config.common.vocab_file`` before the tokenizer is created. The
        processor is saved to ``config.common.model_path`` and then stored
        as ``self._processor``.
        """
        common_cfg = config.common
        config.processor.tokenizer.vocab_file = common_cfg.vocab_file

        ctc_tokenizer = Wav2Vec2CTCTokenizer(**config.processor.tokenizer)
        extractor = Wav2Vec2FeatureExtractor(**config.processor.feature_extractor)

        built = Wav2Vec2Processor(feature_extractor=extractor, tokenizer=ctc_tokenizer)
        built.save_pretrained(config.common.model_path)
        self._processor = built

Example #28

0

Show file

    def test_mask_time_prob_ctc(self):
        # Load the tiny random UniSpeechSat CTC checkpoint with explicit
        # time-masking settings and switch it to training mode.
        model = UniSpeechSatForCTC.from_pretrained(
            "hf-internal-testing/tiny-random-unispeech-sat", mask_time_prob=0.2, mask_time_length=2
        )
        model.to(torch_device).train()
        # The processor is asked to return attention masks.
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True
        )

        # One random array of 16_000 * s values per listed duration s.
        batch_duration_in_seconds = [1, 3, 2, 6]
        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]

Example #29

0

Show file

File: finetune_wav2vec.py Project: shunnakamu/AITrialTraining

 def __init__(self, hidden_size=512, num_classes=8, device='cpu', sr=16000):
     """Set up a wav2vec2 encoder followed by an LSTM and a linear classifier.

     Args:
         hidden_size: Hidden size of the LSTM and input size of the linear layer.
         num_classes: Output size of the linear layer.
         device: Stored as ``self.device``.
         sr: Stored as ``self.sr``.
     """
     super(Wav2VecClassifier, self).__init__()
     self.hidden_size = hidden_size
     self.sr = sr
     self.device = device

     checkpoint = "facebook/wav2vec2-base-960h"
     self.processor = Wav2Vec2Processor.from_pretrained(checkpoint)
     self.model = Wav2Vec2Model.from_pretrained(checkpoint)

     # The LSTM takes 768-dimensional inputs.
     self.lstm = nn.LSTM(768, hidden_size, batch_first=True)
     self.fc = nn.Linear(hidden_size, num_classes)

Example #30

0

Show file

    def test_mask_time_prob_ctc(self):
        model = Data2VecAudioForCTC.from_pretrained(
            "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2
        )
        model.to(torch_device).train()
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
        )

        batch_duration_in_seconds = [1, 3, 2, 6]
        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]