def data_preparation():
    """Import the data, obtain a Wav2Vec2 processor and map the dataset.

    If ``results_hg/{MODEL}/{LABEL}/processor`` already contains files the
    processor is loaded from there.  Otherwise ``gen_vocab(data)`` is called,
    a tokenizer is built from ``results_hg/{MODEL}/{LABEL}/vocab.json``, a new
    processor is assembled and saved to that directory.

    The processor is stored in the module-level ``processor`` global as well
    as returned (presumably because the mapped helper functions read the
    global -- confirm against ``prepare_dataset``).

    Returns:
        tuple: ``(processor, dataset_prepared, data_collator)``.
    """
    global processor

    data = import_data()

    existing_files = glob.glob(f"results_hg/{MODEL}/{LABEL}/processor/*")
    if not existing_files:
        print(">> Creating processor ")
        gen_vocab(data)
        tokenizer = Wav2Vec2CTCTokenizer(
            f"results_hg/{MODEL}/{LABEL}/vocab.json",
            unk_token="[UNK]",
            pad_token="[PAD]",
            word_delimiter_token="|",
        )
        feature_extractor = Wav2Vec2FeatureExtractor(
            feature_size=1,
            sampling_rate=16000,
            padding_value=0.0,
            do_normalize=True,
            return_attention_mask=True,
        )
        processor = Wav2Vec2Processor(
            feature_extractor=feature_extractor,
            tokenizer=tokenizer,
        )
        processor.save_pretrained(f'results_hg/{MODEL}/{LABEL}/processor/')
    else:
        print(">> From pretrained processor ")
        processor = Wav2Vec2Processor.from_pretrained(
            f"results_hg/{MODEL}/{LABEL}/processor"
        )

    # First pass: apply speech_file_to_array_fn, dropping the original columns.
    dataset = data.map(
        speech_file_to_array_fn,
        remove_columns=data.column_names["train"],
        num_proc=4,
    )
    # Second pass: apply prepare_dataset in batches of 8.
    dataset_prepared = dataset.map(
        prepare_dataset,
        remove_columns=dataset.column_names["train"],
        batch_size=8,
        num_proc=4,
        batched=True,
    )

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    return processor, dataset_prepared, data_collator
def save_processor():
    """Write a processor with a custom vocabulary into ``hf_path``.

    Steps, in order:
      1. save the ``facebook/wav2vec2-large-960h-lv60-self`` processor to
         ``hf_path``;
      2. call ``create_vocab`` on the letter dictionary file;
      3. build a CTC tokenizer from ``hf_path + "/vocab.json"`` and save it
         to ``hf_path``;
      4. reload the processor from ``hf_path`` and save it again.
    """
    base_processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-large-960h-lv60-self"
    )
    base_processor.save_pretrained(hf_path)

    create_vocab("../add_wav2vec/data/temp/dict.ltr.txt")

    vocab_file = hf_path + "/vocab.json"
    Wav2Vec2CTCTokenizer(vocab_file).save_pretrained(hf_path)

    # Round-trip through disk so the saved processor reflects the new tokenizer files.
    Wav2Vec2Processor.from_pretrained(hf_path).save_pretrained(hf_path)
def test_push_to_hub(self):
    """Push a processor to the hub and check that the reloaded copy matches.

    The feature extractor is compared attribute by attribute and the
    tokenizer by its vocabulary.
    """
    original = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = os.path.join(tmp_dir, "test-processor")
        original.save_pretrained(
            target_dir, push_to_hub=True, use_auth_token=self._token
        )

        reloaded = Wav2Vec2Processor.from_pretrained(f"{USER}/test-processor")

        original_attrs = original.feature_extractor.__dict__
        for attr_name, attr_value in original_attrs.items():
            self.assertEqual(
                attr_value, getattr(reloaded.feature_extractor, attr_name)
            )

        self.assertDictEqual(
            reloaded.tokenizer.get_vocab(), original.tokenizer.get_vocab()
        )
def load_model():
    """Load the French XLSR-53 CTC model, its processor and a resampler.

    Returns:
        tuple: ``(model, processor, resampler)``.
    """
    checkpoint = "facebook/wav2vec2-large-xlsr-53-french"

    ctc_model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    ctc_processor = Wav2Vec2Processor.from_pretrained(checkpoint)

    # NOTE(review): source and target rates are both 16 kHz here, so this
    # transform does not change the sampling rate -- confirm that is intended.
    resample = torchaudio.transforms.Resample(orig_freq=16_000, new_freq=16_000)

    return ctc_model, ctc_processor, resample
def __init__(self, ckpt: str = None, model_config: str = None, feature_selection: str = None, **kwargs):
    """Load a Wav2Vec2 processor and model and probe the output dimension.

    Args:
        ckpt: Checkpoint name or path handed to ``from_pretrained`` for both
            the processor and the model.
        model_config: Config path for constructing the model.  Not used in
            this constructor.
        feature_selection: String meant to select different behaviours of
            the same pretrained model (e.g. which layers to extract).  Not
            used in this constructor.
        **kwargs: Accepted and ignored here.
    """
    super().__init__()

    self.processor = Wav2Vec2Processor.from_pretrained(ckpt)
    self.model = Wav2Vec2Model.from_pretrained(ckpt)

    # Run one forward pass on SAMPLE_RATE random samples to read the size of
    # the last dimension of the first output.
    probe_batch = [torch.randn(SAMPLE_RATE)]
    probe_result = self.forward(probe_batch)
    self._output_dim = probe_result[0].size(-1)
def main(input_pipe, output_pipe):
    """Serve speech-to-text requests between two ZeroMQ endpoints, forever.

    Each message received on ``input_pipe`` is converted with
    ``webm_to_wav``, transcribed with the ``facebook/wav2vec2-base-960h``
    model, and the result is sent on ``output_pipe`` together with the start
    timestamp and the processing duration.

    Args:
        input_pipe: Address passed to ``ZeroMQSource``.
        output_pipe: Address passed to ``ZeroMQSink``.

    The loop only ends through an exception (including ``KeyboardInterrupt``);
    both endpoints are closed on the way out.
    """
    source = ZeroMQSource(input_pipe)
    sink = ZeroMQSink(output_pipe)

    debug('[+] loading processor')
    processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
    debug('[+] loading model')
    model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')

    sink.signal_ready()

    try:
        while True:
            debug('>>> waiting for connection')
            # torchaudio seems to expect complete files so send small parts
            buf = io.BytesIO(source.recv())

            start = time.time()

            debug('[+] converting audio')
            wav = io.BytesIO(webm_to_wav(buf.read()))
            waveform, sample_rate = torchaudio.load(wav)
            # Keep the first channel only:
            # Wav2Vec2Processor expects mono 16kHz audio
            waveform = waveform[0]

            debug('[+] input_values')
            input_values = processor(
                waveform, sampling_rate=sample_rate, return_tensors='pt'
            ).input_values

            # Inference only: disable autograd so no graph is built (and
            # kept alive) for every request handled by this endless loop.
            with torch.no_grad():
                debug('[+] logits')
                logits = model(input_values).logits

                debug('[+] predicted_ids')
                predicted_ids = torch.argmax(logits, dim=-1)

            debug('[+] transcription')
            transcription = processor.batch_decode(predicted_ids)[0]

            duration = time.time() - start
            sink.send(start, duration, transcription)
    finally:
        # Close the sink even if closing the source fails.
        try:
            source.close()
        finally:
            sink.close()
def __init__(self, csv_file, hp):
    """Read the annotation file and prepare per-utterance wav2vec2 lengths.

    Args:
        csv_file (string): Path to the ``|``-separated annotation file
            (read without a header row).  Column 0 is used as the wav path.
        hp: Hyper-parameter object.  ``hp.spm_model`` (SentencePiece model
            path) and ``hp.lengths_file`` (cache path for the lengths array,
            may be ``None``) are read here.

    When ``hp.lengths_file`` exists it is loaded; otherwise the lengths are
    computed from every wav file and, if a path was given, saved to it.
    """
    # Raw string: '\|' in a plain literal is an invalid escape sequence.
    self.landmarks_frame = pd.read_csv(csv_file, sep=r'\|', header=None)
    self.hp = hp

    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(self.hp.spm_model)

    ## TODO: variable
    self.processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-large-lv60")

    ## TODO
    lengths_file = self.hp.lengths_file
    if lengths_file is not None and os.path.exists(lengths_file):
        # Previously nothing assigned ``lengths_np`` on this path.
        # NOTE(review): np.save appends ".npy" when the name lacks it, so the
        # configured path should already end in ".npy" for the cache to hit.
        self.lengths_np = np.load(lengths_file)
    else:
        print('lengths_file is not exists. Make...')
        lengths_list = []
        for wav_name in tqdm(self.landmarks_frame[0]):
            audio_input, sampling_rate = sf.read(wav_name)
            wav_input = self.processor(
                audio_input,
                sampling_rate=sampling_rate,
                return_tensors="pt").input_values
            ## TODO: check calculation for lengths (int(wav_input.shape[1]//320))
            # wav_input is indexed as [1, length of wav]; shape[1] is the
            # number of samples.
            wav2vec2_length = math.floor(
                (wav_input.shape[1] - 400) / 320.) + 1
            lengths_list.append(wav2vec2_length)

        self.lengths_np = np.array(lengths_list)
        # np.save(None, ...) raises, so only persist when a path was given.
        if lengths_file is not None:
            np.save(lengths_file, self.lengths_np)
def __init__(self):
    """Build the tokenizer, feature extractor, processor and CTC model.

    The tokenizer reads ``./vocab.json``; the model starts from
    ``facebook/wav2vec2-large-xlsr-53`` with its vocabulary size and pad
    token id taken from the processor's tokenizer.
    """
    super(ASR_CTC, self).__init__()

    self.tokenizer = Wav2Vec2CTCTokenizer(
        "./vocab.json",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
    )
    self.feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    self.processor = Wav2Vec2Processor(
        feature_extractor=self.feature_extractor,
        tokenizer=self.tokenizer,
    )

    # Configuration overrides applied on top of the pretrained checkpoint.
    ctc_tokenizer = self.processor.tokenizer
    model_overrides = {
        "attention_dropout": 0.1,
        "hidden_dropout": 0.1,
        "feat_proj_dropout": 0.0,
        "mask_time_prob": 0.05,
        "layerdrop": 0.1,
        "gradient_checkpointing": True,
        "ctc_loss_reduction": "mean",
        "pad_token_id": ctc_tokenizer.pad_token_id,
        "vocab_size": len(ctc_tokenizer),
    }
    self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-large-xlsr-53", **model_overrides
    )
def _get_model(model_id):
    """Return the imported model in eval mode and its labels.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            processor and the CTC model.

    Returns:
        tuple: ``(model, labels)`` where ``labels`` are the tokenizer's
        vocabulary tokens ordered by their integer id.
    """
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    vocab = Wav2Vec2Processor.from_pretrained(model_id).tokenizer.get_vocab()
    # Sorting the keys by their mapped value gives the tokens in id order.
    labels = sorted(vocab, key=vocab.get)

    hf_model = Wav2Vec2ForCTC.from_pretrained(model_id)
    converted = import_huggingface_model(hf_model)
    return converted.eval(), labels
def test_inference_ctc_batched(self):
    """Greedy-decode two samples with half-precision Hubert CTC and compare text."""
    checkpoint = "facebook/hubert-large-ls960-ft"

    model = HubertForCTC.from_pretrained(checkpoint, torch_dtype=torch.float16)
    model = model.to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

    speech = self._load_datasamples(2)
    encoded = processor(speech, return_tensors="pt", padding=True)

    # Inputs are cast to fp16 to match the model's dtype.
    input_values = encoded.input_values.half().to(torch_device)
    attention_mask = encoded.attention_mask.to(torch_device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    best_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(best_ids)

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
    ]
    self.assertListEqual(decoded, expected)
def __init__(self,
             subtitle_lookup_path,
             wav_dir,
             wav2vec_checkpoint,
             pretrained_chpt="facebook/wav2vec2-base-960h",
             num_proc=4,
             preprocess_batch_size=8):
    """Store the dataset settings, load the pretrained parts and build the dataset.

    Args:
        subtitle_lookup_path: Path to the subtitle lookup file (NOTE: this
            file must be compatible with ``pd.read_csv()``).
        wav_dir: Directory of wav files, e.g. ``'./wav_data/'``.
        wav2vec_checkpoint: Not used by the active code in this constructor
            (only referenced by a disabled load-from-disk path).
        pretrained_chpt: Pretrained checkpoint to load the processor and
            feature extractor from.
        num_proc: Number of processes allowed for dataset preprocessing.
        preprocess_batch_size: Batch size used ONLY inside this dataset to
            preprocess wav files faster.
    """
    # Plain settings first.
    self.subtitle_lookup_path = subtitle_lookup_path
    self.wav_dir = wav_dir
    self.num_proc = num_proc
    self.preprocess_batch_size = preprocess_batch_size

    # Pretrained components.
    self.processor = Wav2Vec2Processor.from_pretrained(pretrained_chpt)
    ctc_model = Wav2Vec2ForCTC.from_pretrained(pretrained_chpt)
    # Only the feature-extractor submodule of the loaded CTC model is kept.
    self.feature_extractor = ctc_model.wav2vec2.feature_extractor

    # Disabled alternative kept from the original:
    # self.knnw_prepared = dataset_dict.load_from_disk(wav2vec_checkpoint)
    # self.knnw_prepared.save_to_disk(wavdir)
    self.knnw_prepared = self.setup_dataset()
def prepare(reg=True, from_scratch=False):
    """Build the processed train/test ASR datasets and pickle them to disk.

    Args:
        reg: When true, the processor is loaded from
            ``./asr_output/pretrained_processor`` (overriding any processor
            created by ``from_scratch``).
        from_scratch: When true, a vocabulary is built from the train and
            test sets, written to disk, and a processor is created with
            ``processor_init()``.

    Returns:
        tuple: ``(train, test)`` processed datasets.
    """
    # load data
    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    train = read_txt('./data/speech-sme-asr/train_asr.txt')

    # remove special characters
    train = train.map(remove_special_characters)
    test = test.map(remove_special_characters)

    processor = None

    # build vocab dict
    if from_scratch:
        vocab_dict = build_vocab_dict(train, test)
        write_vocab_dict_to_disk(vocab_dict)
        processor = processor_init()

    if reg:
        processor = Wav2Vec2Processor.from_pretrained(
            './asr_output/pretrained_processor')

    if processor is None:
        # Neither flag produced a processor; previously this left the name
        # unbound and failed later inside ``map``.
        # NOTE(review): falling back to processor_init() is an assumption
        # about the intended behaviour -- confirm.
        processor = processor_init()

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values

        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    # speech file to array
    train = train.map(speech_file_to_array_fn,
                      remove_columns=train.column_names)
    test = test.map(speech_file_to_array_fn,
                    remove_columns=test.column_names)

    print("Preparing train dataset")
    train = train.map(prepare_dataset,
                      remove_columns=train.column_names,
                      batch_size=1,
                      num_proc=1,
                      batched=True)
    print("Preparing test dataset")
    test = test.map(prepare_dataset,
                    remove_columns=test.column_names,
                    batch_size=1,
                    num_proc=1,
                    batched=True)
    print("Done")

    # Context managers make sure the pickle files are flushed and closed.
    with open('./data/speech-sme-asr/train_asr.pkl', 'wb') as train_file:
        pickle.dump(train, train_file)
    with open('./data/speech-sme-asr/test_asr.pkl', 'wb') as test_file:
        pickle.dump(test, test_file)

    return train, test
def test_inference_ctc_normal_batched(self):
    """Greedy-decode two samples with wav2vec2-base-960h and compare text."""
    checkpoint = "facebook/wav2vec2-base-960h"

    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    model.to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

    speech = self._load_datasamples(2)
    encoded = processor(
        speech, return_tensors="pt", padding=True, truncation=True
    )
    input_values = encoded.input_values.to(torch_device)

    with torch.no_grad():
        logits = model(input_values).logits

    decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
    ]
    self.assertListEqual(decoded, expected)
def __init__(self, path="facebook/wav2vec2-base-960h", quantize=False, gpu=True, batch=64):
    """
    Constructs a new transcription pipeline.

    Args:
        path: model location forwarded to the parent constructor; the model
              and processor are then loaded from self.path
        quantize: forwarded to the parent constructor, defaults to False
        gpu: forwarded to the parent constructor, defaults to True
        batch: forwarded to the parent constructor, defaults to 64

    Raises:
        ImportError: if the SOUNDFILE flag is false
    """

    # Parent constructor handles path, quantize, gpu and batch
    super().__init__(path, quantize, gpu, batch)

    # Audio loading dependency must be available before anything is loaded
    if not SOUNDFILE:
        raise ImportError(
            "SoundFile library not installed or libsndfile not found")

    # Model and matching processor come from the same location
    source = self.path
    self.model = Wav2Vec2ForCTC.from_pretrained(source)
    self.processor = Wav2Vec2Processor.from_pretrained(self.path)

    # Place model on the configured device
    self.model.to(self.device)
def __init__(self, model_name="facebook/wav2vec2-base-960h", device=None, half=False):
    """
    ```
    basic wrapper for speech transcription

    Args:
      model_name(str): name or path passed to from_pretrained for both
                       the Wav2Vec2 CTC model and its processor
      device(str): device to use (e.g., 'cuda', 'cpu'). If None, 'cuda'
                   is chosen when available, else 'cpu'.
      half(bool): If True, use half precision.
    ```
    """
    if not TORCH:
        raise ImportError('Transcriber requires PyTorch to be installed.')

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.torch_device = device

    # A SoundFile availability check was disabled in the original; only
    # librosa is required here.
    if not LIBROSA:
        raise ImportError(
            "librosa library must be installed: pip install librosa. Conda users may also have to install ffmpeg: conda install -c conda-forge ffmpeg"
        )

    # load model and processor
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    self.model = model.to(self.torch_device)
    self.processor = Wav2Vec2Processor.from_pretrained(model_name)

    if half:
        self.model = self.model.half()
def __init__(self, source, save_path, output_norm=True, freeze=True, pretrain=True):
    """Load a HuggingFace wav2vec2 model and configure how it is used.

    Args:
        source: Model identifier passed to ``from_pretrained``.
        save_path: Directory used as the HuggingFace ``cache_dir``.
        output_norm: Stored on the instance as ``output_norm``.
        freeze: If true the model is put in eval mode, otherwise in train
            mode.  Also stored as ``freeze``.
        pretrain: If false, ``reset_layer`` is applied to the loaded model.
    """
    super().__init__()

    # Fetch processor and model from HuggingFace (cached under save_path).
    # The processor is only consulted for its normalisation setting.
    hf_options = {"cache_dir": save_path}
    self.proc = Wav2Vec2Processor.from_pretrained(source, **hf_options)
    self.model = Wav2Vec2Model.from_pretrained(source, **hf_options)

    # Without pretraining the loaded layers are re-initialised.
    if not pretrain:
        self.reset_layer(self.model)

    # Whether inputs must be normalised, as dictated by the pretrained
    # feature extractor.
    self.normalize_wav = self.proc.feature_extractor.do_normalize

    self.freeze = freeze
    self.output_norm = output_norm

    if not self.freeze:
        self.model.train()
    else:
        self.model.eval()
def test_inference_ctc_batched(self):
    """Greedy-decode four samples with Data2VecAudio CTC and compare text.

    The processor comes from ``hf-internal-testing/tiny-random-wav2vec2``
    while the model is ``facebook/data2vec-audio-base-960h``.
    """
    model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")
    model = model.to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained(
        "hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True
    )

    speech = self._load_datasamples(4)
    encoded = processor(speech, return_tensors="pt", padding=True)
    input_values = encoded.input_values.to(torch_device)

    with torch.no_grad():
        logits = model(input_values).logits

    decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about",
        "his instant of panic was followed by a small sharp blow high on his chest",
    ]
    self.assertListEqual(decoded, expected)
def __init__(self, model_name='te'):
    """Download (if needed) and load the ASR model for ``model_name``.

    Args:
        model_name: Model key or alias.  It is lower-cased, every key of
            ``LANGUAGE_ALISASES`` occurring in it is replaced by the mapped
            value, and a name that is still an alias key is resolved once
            more before the lookup in ``MODEL_URLS``.

    Model files are stored in ``~/.IndicASR_<model_name>``; files that
    already exist there are not downloaded again.

    If the name cannot be resolved, a message listing the valid names is
    printed and the constructor returns early, leaving ``processor`` and
    ``model`` unset.
    """
    model_name = model_name.lower()
    for alias, replacement in LANGUAGE_ALISASES.items():
        model_name = model_name.replace(alias, replacement)

    # Resolve a remaining alias.  The original nested this lookup inside a
    # condition that made it unreachable, so an alias that was not itself a
    # MODEL_URLS key ended in a KeyError further down.
    if model_name not in MODEL_URLS:
        model_name = LANGUAGE_ALISASES.get(model_name, model_name)

    if model_name not in MODEL_URLS:
        print(f"model_name should be one of {list(MODEL_URLS.keys())}")
        # NOTE(review): kept as print + early return for compatibility;
        # the instance is unusable in this case.
        return None

    home = os.path.expanduser("~")
    lang_path = os.path.join(home, ".IndicASR_" + model_name)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(lang_path, exist_ok=True)

    for file_name, url in MODEL_URLS[model_name].items():
        file_path = os.path.join(lang_path, file_name)
        if os.path.exists(file_path):
            continue
        print(f"Downloading {file_name}")
        pydload.dload(url=url, save_to_path=file_path, max_time=None)

    self.processor = Wav2Vec2Processor.from_pretrained(lang_path)
    self.model = Wav2Vec2ForCTC.from_pretrained(lang_path)

    if torch.cuda.is_available():
        print("Using GPU")
        self.model = self.model.cuda()
def test_inference_ctc_robust_batched(self):
    """Greedy-decode four samples with TF Hubert CTC and compare text."""
    checkpoint = "facebook/hubert-large-ls960-ft"

    model = TFHubertForCTC.from_pretrained(checkpoint)
    processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

    speech = self._load_datasamples(4)
    encoded = processor(
        speech, return_tensors="tf", padding=True, sampling_rate=16000
    )

    logits = model(
        encoded.input_values, attention_mask=encoded.attention_mask
    ).logits

    decoded = processor.batch_decode(tf.argmax(logits, axis=-1))

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
        " him with the thousands of spectators were trivialities not worth thinking about",
        "his instant of panic was followed by a small sharp blow high on his chest",
    ]
    self.assertListEqual(decoded, expected)
def test_inference_ctc_batched(self):
    """Greedy-decode two samples with SEW-D CTC and compare text."""
    # TODO: enable this test once the finetuned models are available
    checkpoint = "asapp/sew-d-tiny-100k-ft-100h"

    model = SEWDForCTC.from_pretrained(checkpoint).to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained(checkpoint, do_lower_case=True)

    speech = self._load_datasamples(2)
    encoded = processor(speech, return_tensors="pt", padding=True)

    input_values = encoded.input_values.to(torch_device)
    attention_mask = encoded.attention_mask.to(torch_device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
    ]
    self.assertListEqual(decoded, expected)
def __init__(self, device="cuda"):
    """Load the wav2vec2-base encoder (eval mode) and its preprocessor.

    Args:
        device: Device the encoder is moved to; also stored as
            ``self.device``.
    """
    checkpoint = "facebook/wav2vec2-base"

    encoder = Wav2Vec2Model.from_pretrained(checkpoint)
    self.encoder = encoder
    self.encoder.eval()
    self.encoder = self.encoder.to(device)

    self.preprocessor = Wav2Vec2Processor.from_pretrained(checkpoint)
    # Attach the expected sample rate to the preprocessor object.
    self.preprocessor._sample_rate = 16000

    self.device = device
def load(self):
    """Load the processor (once) and, if requested, the CTC model.

    The processor is only loaded when the instance has no ``processor``
    attribute yet.  The model is loaded from ``recognizer_dir`` when
    ``load_model`` is true and moved to CUDA when ``use_cuda`` is true.
    """
    processor_missing = not hasattr(self, 'processor')
    if processor_missing:
        self.processor = Wav2Vec2Processor.from_pretrained(
            self.recognizer_dir)

    if not self.load_model:
        return

    self.model = Wav2Vec2ForCTC.from_pretrained(self.recognizer_dir)
    if self.use_cuda:
        self.model = self.model.to("cuda")
def load_processor(vocab_dir= vocab_dir, cache_dir = cache_dir, force = False):
    """Return the shared Wav2Vec2 processor, building it on first use.

    The processor is cached in the module-level ``processor`` global.

    Args:
        vocab_dir: Passed to ``load_tokenizer``; defaults to the
            module-level ``vocab_dir``.
        cache_dir: Passed to ``load_tokenizer``; defaults to the
            module-level ``cache_dir``.
        force: When true, rebuild the processor even if one is already
            cached.  (This flag was previously accepted but ignored.)

    Returns:
        The cached or newly built ``Wav2Vec2Processor``.
    """
    global processor

    # Reuse the cached instance unless the caller asked for a rebuild.
    if processor and not force:
        return processor

    tokenizer = load_tokenizer(vocab_dir, cache_dir)
    feature_extractor = load_feature_extractor()
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    return processor
def __init__(self):
    """Load the pretrained Wav2Vec2 processor and CTC model.

    Also records the sample rate (16000) in ``REQUIRED_SAMPLE_RATE``.
    """
    self.REQUIRED_SAMPLE_RATE = 16000

    # Use Facebook's pretrained Wav2Vec2 model
    # https://huggingface.co/facebook/wav2vec2-large-960h
    # NOTE(review): the link above names the "large" model while the
    # identifier below loads the "base" one -- confirm which is intended.
    checkpoint = 'facebook/wav2vec2-base-960h'

    self.processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    self.model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
def init(self):
    """Load processor, model and greedy decoder; return ``self`` for chaining.

    Asserts that the loaded feature extractor has ``do_normalize`` enabled.
    The decoder is built from the tokenizer's vocabulary keys, in the order
    the vocabulary dict yields them.
    """
    processor = Wav2Vec2Processor.from_pretrained(self.model_name)
    self.processor = processor
    assert self.processor.feature_extractor.do_normalize is True

    self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)

    vocab = self.processor.tokenizer.get_vocab()
    target_dictionary = list(vocab)
    print(f"target_dictionary: {target_dictionary}")

    self.decoder = GreedyDecoder(target_dictionary).init()
    return self
def evaluate_asr():
    """Evaluate the fine-tuned ASR checkpoint on the test set and report WER.

    Prints the predictions and references for the first 11 utterances, then
    decodes the whole test set in batches of 8 on CPU and computes the word
    error rate.

    Returns:
        str: The printed message, ``"WER: <percentage with two decimals>"``.
    """

    def read_txt(txt_path):
        """Read a ``path|sentence`` text file into a ``Dataset``."""
        # NOTE(review): newer pandas versions may reject '\n' as a delimiter
        # -- confirm against the pandas version in use.
        data = pd.read_csv(txt_path, delimiter='\n', header=None,
                           names=['path', 'sentence'])
        # NOTE(review): '|' is interpreted as a regular expression here and
        # therefore matches every row; left unchanged because restricting
        # the mask would turn rows without a '|' into missing values.
        has_colon = data['path'].str.contains('|')
        data[['path', 'sentence']] = (
            data.loc[has_colon, 'path'].str.split('|', expand=True))
        data = Dataset.from_pandas(data)
        return data

    def speech_file_to_array_fn(batch):
        """Load the audio file of one example and keep its first channel."""
        speech_array, _ = torchaudio.load('./data/' + batch["path"])
        batch["speech"] = speech_array[0].numpy()
        return batch

    test = read_txt('./data/speech-sme-asr/test_asr.txt')

    processor = Wav2Vec2Processor.from_pretrained(
        "asr_output/pretrained_processor")
    model = Wav2Vec2ForCTC.from_pretrained(
        "asr_output/checkpoint-27363").to("cpu")

    test_dataset = test.map(speech_file_to_array_fn)

    # Quick qualitative check on the first 11 utterances.
    input_dict = processor(test_dataset['speech'][:11], sampling_rate=16000,
                           return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_dict.input_values.to("cpu")).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    print("Prediction:", processor.batch_decode(predicted_ids))
    print("Reference:", test_dataset["sentence"][:11])

    wer = load_metric("wer")

    def evaluate_batch(batch):
        """Decode one batch and store the text under ``pred_strings``."""
        inputs = processor(batch["speech"], sampling_rate=16_000,
                           return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(
                inputs.input_values.to("cpu"),
                attention_mask=inputs.attention_mask.to("cpu")).logits
        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_strings"] = processor.batch_decode(pred_ids)
        return batch

    # batch_size=8 -> requires ~14.5GB GPU memory
    result = test_dataset.map(evaluate_batch, batched=True, batch_size=8)

    score = wer.compute(predictions=result["pred_strings"],
                        references=result["sentence"])
    # "{:2f}" only set a minimum width of 2; two decimals were intended.
    msg = "WER: {:.2f}".format(100 * score)
    print(msg)
    return msg
def _init_processor(self, config: EasyDict):
    """Build the processor described by ``config``, save it and keep it.

    ``config.common.vocab_file`` is first copied into the tokenizer section
    of the config (mutating ``config``).  The tokenizer and feature
    extractor are created from their config sections, combined into a
    processor that is saved to ``config.common.model_path`` and stored in
    ``self._processor``.
    """
    tokenizer_cfg = config.processor.tokenizer
    tokenizer_cfg.vocab_file = config.common.vocab_file

    ctc_tokenizer = Wav2Vec2CTCTokenizer(**tokenizer_cfg)
    extractor = Wav2Vec2FeatureExtractor(**config.processor.feature_extractor)

    built = Wav2Vec2Processor(feature_extractor=extractor,
                              tokenizer=ctc_tokenizer)
    built.save_pretrained(config.common.model_path)

    self._processor = built
def test_mask_time_prob_ctc(self):
    """Set up a UniSpeechSat CTC model in train mode with time masking.

    Builds the model (``mask_time_prob=0.2``, ``mask_time_length=2``), a
    processor that returns attention masks, and four random inputs of
    1, 3, 2 and 6 seconds at 16 kHz.  No assertions follow in this snippet.
    """
    checkpoint = "hf-internal-testing/tiny-random-unispeech-sat"

    model = UniSpeechSatForCTC.from_pretrained(
        checkpoint, mask_time_prob=0.2, mask_time_length=2
    )
    model.to(torch_device).train()

    processor = Wav2Vec2Processor.from_pretrained(
        checkpoint, return_attention_mask=True
    )

    batch_duration_in_seconds = [1, 3, 2, 6]
    input_features = [
        np.random.random(16_000 * seconds)
        for seconds in batch_duration_in_seconds
    ]
def __init__(self, hidden_size=512, num_classes=8, device='cpu', sr=16000):
    """Wav2Vec2 encoder followed by an LSTM and a linear classifier.

    Args:
        hidden_size: Hidden size of the LSTM and input size of the final
            linear layer.
        num_classes: Output size of the final linear layer.
        device: Stored as ``self.device``; nothing is moved here.
        sr: Stored as ``self.sr``.
    """
    super(Wav2VecClassifier, self).__init__()

    # Plain settings.
    self.hidden_size = hidden_size
    self.sr = sr
    self.device = device

    # Pretrained front end.
    checkpoint = "facebook/wav2vec2-base-960h"
    self.processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    self.model = Wav2Vec2Model.from_pretrained(checkpoint)

    # Classification head; the LSTM input size is fixed at 768.
    self.lstm = nn.LSTM(768, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, num_classes)
def test_mask_time_prob_ctc(self):
    """Set up a Data2VecAudio CTC model in train mode with time masking.

    Builds the model (``mask_time_prob=0.2``, ``mask_time_length=2``), a
    processor from ``hf-internal-testing/tiny-random-wav2vec2`` that returns
    attention masks, and four random inputs of 1, 3, 2 and 6 seconds at
    16 kHz.  No assertions follow in this snippet.
    """
    model = Data2VecAudioForCTC.from_pretrained(
        "facebook/data2vec-audio-base-960h",
        mask_time_prob=0.2,
        mask_time_length=2,
    )
    model.to(torch_device).train()

    processor = Wav2Vec2Processor.from_pretrained(
        "hf-internal-testing/tiny-random-wav2vec2",
        return_attention_mask=True,
    )

    batch_duration_in_seconds = [1, 3, 2, 6]
    input_features = [
        np.random.random(16_000 * seconds)
        for seconds in batch_duration_in_seconds
    ]