Example #1
 def _init_model(self, config: EasyDict):
     if not config.common.checkpoint_model:
         print(f"Loading pretrained model {config.common.pretrained_model}")
         config.model.pretrained_model_name_or_path = config.common.pretrained_model
         config.model.pad_token_id = self._processor.tokenizer.pad_token_id
         config.model.vocab_size = len(self._processor.tokenizer)
         self._model = Wav2Vec2ForCTC.from_pretrained(**config.model)
     else:
         print(f"Loading from checkpoint {config.common.checkpoint_model}")
         self._model = Wav2Vec2ForCTC.from_pretrained(
             config.common.checkpoint_model).to("cuda")
Example #2
def load_asr_model(device):
    """Load model"""
    print(f"[INFO]: Load the pre-trained ASR by {ASR_PRETRAINED_MODEL}.")
    model = Wav2Vec2ForCTC.from_pretrained(ASR_PRETRAINED_MODEL).to(device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(ASR_PRETRAINED_MODEL)
    models = {"model": model, "tokenizer": tokenizer}
    return models
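A minimal usage sketch for the dict returned above (a hedged illustration; it assumes the audio is already a mono 16 kHz float array, loaded e.g. with soundfile):

import torch

def transcribe(models, waveform, device):
    # tokenize the raw waveform, run the CTC head, and greedy-decode the argmax ids
    input_values = models["tokenizer"](waveform, return_tensors="pt").input_values.to(device)
    with torch.no_grad():
        logits = models["model"](input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return models["tokenizer"].batch_decode(predicted_ids)[0]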
Example #3
def main(input_pipe, output_pipe):
    source = ZeroMQSource(input_pipe)
    sink = ZeroMQSink(output_pipe)

    debug('[+] loading processor')
    processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
    debug('[+] loading model')
    model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')

    sink.signal_ready()
    try:
        while True:
            debug('>>> waiting for connection')
            # torchaudio seems to expect complete files so send small parts
            buf = io.BytesIO(source.recv())
            start = time.time()
            debug('[+] converting audio')
            wav = io.BytesIO(webm_to_wav(buf.read()))
            waveform, sample_rate = torchaudio.load(wav)
            waveform = waveform[0]  # Wav2Vec2Processor expects mono 16kHz audio
            debug('[+] input_values')
            input_values = processor(waveform, sampling_rate=sample_rate, return_tensors='pt').input_values
            debug('[+] logits')
            logits = model(input_values).logits
            debug('[+] predicted_ids')
            predicted_ids = torch.argmax(logits, dim=-1)
            debug('[+] transcription')
            transcription = processor.batch_decode(predicted_ids)[0]
            duration = time.time() - start
            sink.send(start, duration, transcription)
    finally:
        source.close()
        sink.close()
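webm_to_wav above is not shown; a plausible sketch (an assumption, not the original helper) shells out to ffmpeg to turn the WebM chunk into 16 kHz mono WAV bytes:

import subprocess

def webm_to_wav(webm_bytes):
    # hypothetical helper: pipe WebM bytes through ffmpeg, get WAV bytes back on stdout
    proc = subprocess.run(
        ['ffmpeg', '-i', 'pipe:0', '-ar', '16000', '-ac', '1', '-f', 'wav', 'pipe:1'],
        input=webm_bytes, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, check=True)
    return proc.stdout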
Example #4
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')
    # Initialize tokenizer and model from HuggingFace
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

    if config.getboolean('config', 'from_microphone'):
        # Record from microphone and transcript
        audio = record_from_mic(config)
        transcriptions = wav2vec2_inference(audio, tokenizer, model)
        print(f"Transcribed audio: {transcriptions}")
        if config.getboolean('config', 'save_transcriptions'):
            with open('mic_transcription.txt', 'w') as file:
                file.write(transcriptions)
            print(f"Transcribed audio stored in mic_transcription.txt")
    else:
        # Transcript files in configuration file
        audio_files = json.loads(config.get('config', 'audio_files'))
        for audio_file in audio_files:
            audio, _ = sf.read(audio_file, dtype='float32')
            transcriptions = wav2vec2_inference(audio, tokenizer, model)
            print(f"Transcribed audio: {transcriptions}")
            if config.getboolean('config', 'save_transcriptions'):
                with open(f'{Path(audio_file).stem}.txt', 'w') as file:
                    file.write(transcriptions)
                print(f"Transcribed audio stored in {Path(audio_file).stem}.txt")
Example #5
 def __init__(self):
     super(ASR_CTC, self).__init__()
     #self.wav2Vec2Tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base')
     #self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
     #self.nb_labels = len(self.wav2Vec2Tokenizer.get_vocab())
     #self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
     self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                           unk_token="<unk>",
                                           pad_token="<pad>",
                                           word_delimiter_token="|")
     self.feature_extractor = Wav2Vec2FeatureExtractor(
         feature_size=1,
         sampling_rate=16000,
         padding_value=0.0,
         do_normalize=True,
         return_attention_mask=True)
     self.processor = Wav2Vec2Processor(
         feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
     self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
         "facebook/wav2vec2-large-xlsr-53",
         attention_dropout=0.1,
         hidden_dropout=0.1,
         feat_proj_dropout=0.0,
         mask_time_prob=0.05,
         layerdrop=0.1,
         gradient_checkpointing=True,
         ctc_loss_reduction="mean",
         pad_token_id=self.processor.tokenizer.pad_token_id,
         vocab_size=len(self.processor.tokenizer))
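The Wav2Vec2CTCTokenizer above expects ./vocab.json to map characters to ids; a sketch of how such a file could be produced (the character inventory shown is illustrative only):

import json

# illustrative inventory; a real vocab is built from the training transcripts
vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4}
vocab.update({c: i + 5 for i, c in enumerate("abcdefghijklmnopqrstuvwxyz")})
with open("vocab.json", "w") as f:
    json.dump(vocab, f)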
Example #6
def load_model():
    model_name = "facebook/wav2vec2-large-xlsr-53-french"
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    resampler = torchaudio.transforms.Resample(orig_freq=16_000,
                                               new_freq=16_000)
    return model, processor, resampler
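A hedged usage sketch for the (model, processor, resampler) triple returned above (the input file and its sample rate are assumptions; note the resampler is 16 kHz -> 16 kHz as written):

import torch
import torchaudio

model, processor, resampler = load_model()
waveform, sr = torchaudio.load("sample.wav")      # hypothetical 16 kHz mono recording
speech = resampler(waveform).squeeze(0).numpy()
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1))[0])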
Example #7
    def test_simple_wav2vec2(self):
        import numpy as np
        from datasets import load_dataset

        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        tokenizer = AutoTokenizer.from_pretrained(
            "facebook/wav2vec2-base-960h")
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            "facebook/wav2vec2-base-960h")

        asr = AutomaticSpeechRecognitionPipeline(
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor)

        waveform = np.zeros((34000, ))
        output = asr(waveform)
        self.assertEqual(output, {"text": ""})

        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy",
                          "clean",
                          split="validation")
        filename = ds[0]["file"]
        output = asr(filename)
        self.assertEqual(output,
                         {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

        filename = ds[0]["file"]
        with open(filename, "rb") as f:
            data = f.read()
        output = asr(data)
        self.assertEqual(output,
                         {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
Example #8
    def __init__(self,
                 subtitle_lookup_path,
                 wav_dir,
                 wav2vec_checkpoint,
                 pretrained_chpt="facebook/wav2vec2-base-960h",
                 num_proc=4,
                 preprocess_batch_size=8):
        """
        Args:
            subtitle_lookup_path (NOTE this file must be compatible with pd.read_csv())
            wav_dir: directory of wav files e.g. './wav_data/'
            pretrained_chpt: pretrained checkpoints to load
            num_proc: number of processes allowed when doing dataset preprocessing
            preprocess_batch_size: this is ONLY used inside this dataset to preprocess wav files faster
        Returns:
            
        """
        self.subtitle_lookup_path = subtitle_lookup_path
        self.wav_dir = wav_dir
        self.num_proc = num_proc
        self.preprocess_batch_size = preprocess_batch_size
        self.processor = Wav2Vec2Processor.from_pretrained(pretrained_chpt)
        self.feature_extractor = Wav2Vec2ForCTC.from_pretrained(
            pretrained_chpt).wav2vec2.feature_extractor

        # self.knnw_prepared = dataset_dict.load_from_disk(wav2vec_checkpoint)
        # self.knnw_prepared.save_to_disk(wavdir)
        self.knnw_prepared = self.setup_dataset()
Example #9
    def check_training(self, config, input_values, *args):
        config.ctc_zero_infinity = True
        model = Wav2Vec2ForCTC(config=config)
        model.to(torch_device)
        model.train()

        # freeze feature encoder
        model.freeze_feature_extractor()

        input_values = input_values[:3]

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(
            torch.tensor(input_lengths))
        labels = ids_tensor(
            (input_values.shape[0], max(max_length_labels) - 2),
            model.config.vocab_size)

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i]:] = 0.0

            if max_length_labels[i] < labels.shape[-1]:
                # it's important that we make sure that target lengths are at least
                # one shorter than logit lengths to prevent -inf
                labels[i, max_length_labels[i] - 1:] = -100

        loss = model(input_values, labels=labels).loss
        self.parent.assertFalse(torch.isinf(loss).item())

        loss.backward()
Example #10
    def test_inference_ctc_normal_batched(self):
        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        model.to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-base-960h", do_lower_case=True)

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech,
                           return_tensors="pt",
                           padding=True,
                           truncation=True)

        input_values = inputs.input_values.to(torch_device)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #11
def get_predictions(test_dir_root: str, bs: int, extra_step: float, loading_step: float) -> None:

    device = torch.device("cuda:0") if torch.cuda.is_available() \
        else torch.device("cpu")

    # load model and tokenizer
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).eval().to(device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)

    test_dir_root = Path(test_dir_root)

    # iterate over the files in the correct order
    with open(test_dir_root / "FILE_ORDER", "r") as f:
        wav_file_order = f.read().splitlines()

    token_predictions = {}
    for wf in wav_file_order:
        wf = f"{wf}.wav"
        print(f"Generating token predictions for {wf}")
        path_to_wav = test_dir_root / "wavs" / wf
        token_predictions[wf] = get_preds_for_wav(model, tokenizer, device, bs,
            path_to_wav, extra_step, loading_step)

    test_dir_root.mkdir(parents = True, exist_ok = True)
    path_to_preds = test_dir_root / "token_predictions.json"
    with open(path_to_preds, "w") as f:
        json.dump(token_predictions, f)

    print(f"Wav2Vec predictions saved at {path_to_preds}")
Example #12
    def test_simple_wav2vec2(self):

        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        tokenizer = AutoTokenizer.from_pretrained(
            "facebook/wav2vec2-base-960h")
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            "facebook/wav2vec2-base-960h")

        asr = AutomaticSpeechRecognitionPipeline(
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor)

        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
        output = asr(waveform)
        self.assertEqual(output, {"text": ""})

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy",
                          "clean",
                          split="validation").sort("id")
        filename = ds[40]["file"]
        output = asr(filename)
        self.assertEqual(output,
                         {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

        filename = ds[40]["file"]
        with open(filename, "rb") as f:
            data = f.read()
        output = asr(data)
        self.assertEqual(output,
                         {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
Example #13
    def __init__(self,
                 model_name="facebook/wav2vec2-base-960h",
                 device=None,
                 half=False):
        """
        ```
        basic wrapper speech transcription

        Args:
          model_name(str): name of a pretrained Wav2Vec2 model on the Hugging Face hub
          device(str): device to use (e.g., 'cuda', 'cpu')
          half(bool): If True, use half precision.
        ```
        """
        if not TORCH:
            raise ImportError('Transcriber requires PyTorch to be installed.')
        self.torch_device = device
        if self.torch_device is None:
            self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        #if not SOUNDFILE:
        #raise ImportError("SoundFile library not installed or libsndfile not found: pip install soundfile")
        if not LIBROSA:
            raise ImportError(
                "librosa library must be installed: pip install librosa. Conda users may also have to install ffmpeg: conda install -c conda-forge ffmpeg"
            )

        # load model and processor
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(
            self.torch_device)
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        if half: self.model = self.model.half()
Example #14
def _get_model(model_id):
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
    tokenizer = Wav2Vec2Processor.from_pretrained(model_id).tokenizer
    labels = [k for k, v in sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1])]
    original = Wav2Vec2ForCTC.from_pretrained(model_id)
    model = import_huggingface_model(original)
    return model.eval(), labels
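A short sketch of how the converted torchaudio model and labels could be used for a greedy CTC decode (the input file is an assumption):

import torch
import torchaudio

model, labels = _get_model("facebook/wav2vec2-base-960h")
waveform, sr = torchaudio.load("sample.wav")       # assumed 16 kHz mono input
with torch.no_grad():
    emissions, _ = model(waveform)                  # torchaudio model returns (logits, lengths)
ids = torch.argmax(emissions[0], dim=-1).tolist()
# greedy CTC: collapse repeats, drop the blank (<pad>), map "|" back to spaces
tokens = [labels[i] for i, prev in zip(ids, [-1] + ids[:-1]) if i != prev]
print("".join(t for t in tokens if t != "<pad>").replace("|", " ").strip())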
Example #15
def convert_to_onnx(model_id_or_path, onnx_model_name):
    print(f"Converting {model_id_or_path} to onnx")
    model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
    audio_len = 250000

    x = torch.randn(1, audio_len, requires_grad=True)

    torch.onnx.export(
        model,  # model being run
        x,  # model input (or a tuple for multiple inputs)
        onnx_model_name,  # where to save the model (can be a file or file-like object)
        export_params=True,  # store the trained parameter weights inside the model file
        opset_version=11,  # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=['input'],  # the model's input names
        output_names=['output'],  # the model's output names
        dynamic_axes={
            'input': {1: 'audio_len'},  # variable length axes
            'output': {1: 'audio_len'}
        })
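A hedged sketch of running the exported graph with onnxruntime (package availability and the output file name are assumptions):

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("wav2vec2.onnx")               # hypothetical onnx_model_name
dummy_audio = np.random.randn(1, 16_000).astype(np.float32)   # one second of fake 16 kHz audio
logits = session.run(None, {"input": dummy_audio})[0]         # 'input' matches input_names above
print(logits.shape)                                           # (1, frames, vocab_size)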
Example #16
    def test_chunking(self):
        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        tokenizer = AutoTokenizer.from_pretrained(
            "facebook/wav2vec2-base-960h")
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            "facebook/wav2vec2-base-960h")
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor,
            framework="pt",
            chunk_length_s=10.0,
        )

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy",
                          "clean",
                          split="validation").sort("id")
        audio = ds[40]["audio"]["array"]

        n_repeats = 10
        audio = np.tile(audio, n_repeats)
        output = speech_recognizer([audio], batch_size=2)
        expected_text = "A MAN SAID TO THE UNIVERSE SIR I EXIST " * n_repeats
        expected = [{"text": expected_text.strip()}]
        self.assertEqual(output, expected)
Example #17
    def __init__(self,
                 path="facebook/wav2vec2-base-960h",
                 quantize=False,
                 gpu=True,
                 batch=64):
        """
        Constructs a new transcription pipeline.

        Args:
            path: optional path to model, accepts Hugging Face model hub id or local path,
                  uses default model for task if not provided
            quantize: if model should be quantized, defaults to False
            gpu: True/False if GPU should be enabled, also supports a GPU device id
            batch: batch size used to incrementally process content
        """

        # Call parent constructor
        super().__init__(path, quantize, gpu, batch)

        if not SOUNDFILE:
            raise ImportError(
                "SoundFile library not installed or libsndfile not found")

        # load model and processor
        self.model = Wav2Vec2ForCTC.from_pretrained(self.path)
        self.processor = Wav2Vec2Processor.from_pretrained(self.path)

        # Move model to device
        self.model.to(self.device)
Example #18
def get_model(tokenizer, n_langs=2):
    """Constructs the model with asr and language identification,
    from the base Wav2Vec2 model by modifying the last lm_head layer.
    Args:
        tokenizer: The tokenizer whose length is all the alphabets that 
                   the model can predict.
        n_langs: The number of different languages the model needs to distinguish between.
    Returns:
        The constructed model, having len(tokenizer)+n_langs+1 outputs in the last layer.
    """
    model = Wav2Vec2ForCTC.from_pretrained(config.model)

    pt_wts = model.lm_head.weight
    pt_bias = model.lm_head.bias

    new_lm_head = nn.Linear(
        pt_wts.shape[1],
        len(tokenizer) + (0 if n_langs <= 1 else n_langs + 1))

    init_wts = new_lm_head.weight.clone().detach()
    init_bs = new_lm_head.bias.clone().detach()
    init_wts[:pt_wts.shape[0], :] = pt_wts.clone().detach()
    init_wts[pt_wts.shape[0]:, :] = torch.mean(pt_wts.clone().detach(), dim=0)
    init_bs[:pt_bias.shape[0]] = pt_bias.clone().detach()
    init_bs[pt_wts.shape[0]:] = torch.mean(pt_bias.clone().detach(), dim=0)

    with torch.no_grad():
        new_lm_head.weight = nn.Parameter(init_wts)
        new_lm_head.bias = nn.Parameter(init_bs)

    model.lm_head = new_lm_head

    return model.to(config.device)
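At inference time the widened lm_head's outputs would presumably be split back into CTC token logits and language-id logits; a sketch under that assumption:

import torch

def split_logits(logits, n_tokens, n_langs=2):
    # assumption: the first len(tokenizer) outputs score CTC tokens,
    # the remaining n_langs + 1 outputs score the spoken language
    ctc_logits = logits[..., :n_tokens]
    lang_logits = logits[..., n_tokens:]
    lang_id = torch.argmax(lang_logits.mean(dim=1), dim=-1)  # pool over time frames
    return ctc_logits, lang_id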
Example #19
    def __init__(self, model_name='te'):
        model_name = model_name.lower()
        for x, y in LANGUAGE_ALISASES.items():
            model_name = model_name.replace(x, y)

        if model_name not in MODEL_URLS:
            # aliases were already resolved above, so anything left must be a known model key
            print(f"model_name should be one of {list(MODEL_URLS.keys())}")
            return None

        home = os.path.expanduser("~")
        lang_path = os.path.join(home, ".IndicASR_" + model_name)
        if not os.path.exists(lang_path):
            os.mkdir(lang_path)

        for file_name, url in MODEL_URLS[model_name].items():
            file_path = os.path.join(lang_path, file_name)
            if os.path.exists(file_path):
                continue
            print(f"Downloading {file_name}")
            pydload.dload(url=url, save_to_path=file_path, max_time=None)

        self.processor = Wav2Vec2Processor.from_pretrained(lang_path)
        self.model = Wav2Vec2ForCTC.from_pretrained(lang_path)

        if torch.cuda.is_available():
            print(f"Using GPU")
            self.model = self.model.cuda()
Example #20
def convert_wav2vec2_checkpoint(checkpoint_path,
                                pytorch_dump_folder_path,
                                config_path=None,
                                dict_path=None,
                                is_finetuned=True):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = Wav2Vec2Config.from_pretrained(config_path)
    else:
        config = Wav2Vec2Config()

    if is_finetuned:
        hf_wav2vec = Wav2Vec2ForCTC(config)
    else:
        hf_wav2vec = Wav2Vec2Model(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": dict_path})
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
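Such conversion scripts are usually driven from the command line; a hedged sketch of the wiring (argument names are assumptions):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", required=True, help="fairseq checkpoint to convert")
    parser.add_argument("--pytorch_dump_folder_path", required=True, help="output folder for the HF model")
    parser.add_argument("--config_path", default=None, help="optional Wav2Vec2Config to use")
    parser.add_argument("--dict_path", default=None, help="fairseq dictionary for finetuned models")
    parser.add_argument("--not_finetuned", action="store_true", help="convert a pretraining-only checkpoint")
    args = parser.parse_args()

    convert_wav2vec2_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path,
                                args.config_path, args.dict_path, not args.not_finetuned)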
Example #21
    def test_inference_ctc_robust_batched(self):
        model = Wav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-large-960h-lv60-self").to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)

        input_speech = self._load_datasamples(4)

        inputs = processor(input_speech,
                           return_tensors="pt",
                           padding=True,
                           truncation=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(input_values, attention_mask=attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
            "his instant panic was followed by a small sharp blow high on his chest",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #22
    def check_ctc_loss(self, config, input_values, *args):
        model = Wav2Vec2ForCTC(config=config)
        model.to(torch_device)

        # make sure that dropout is disabled
        model.eval()

        input_values = input_values[:3]
        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i] :] = 0.0
            attention_mask[i, input_lengths[i] :] = 0

        model.config.ctc_loss_reduction = "sum"
        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        model.config.ctc_loss_reduction = "mean"
        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        self.parent.assertTrue(isinstance(sum_loss, float))
        self.parent.assertTrue(isinstance(mean_loss, float))
Example #23
 def load(self):
     if not hasattr(self, 'processor'):
         self.processor = Wav2Vec2Processor.from_pretrained(
             self.recognizer_dir)
     if self.load_model:
         self.model = Wav2Vec2ForCTC.from_pretrained(self.recognizer_dir)
         if self.use_cuda:
             self.model = self.model.to("cuda")
Example #24
    def __init__(self):
        self.REQUIRED_SAMPLE_RATE = 16000

        # Use Facebook's pretrained Wav2Vec2 model
        # https://huggingface.co/facebook/wav2vec2-base-960h
        PRETRAINED_MODEL = 'facebook/wav2vec2-base-960h'
        self.processor = Wav2Vec2Processor.from_pretrained(PRETRAINED_MODEL)
        self.model = Wav2Vec2ForCTC.from_pretrained(PRETRAINED_MODEL)
Example #25
 def init(self):
     self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
     assert self.processor.feature_extractor.do_normalize is True
     self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
     target_dictionary = list(self.processor.tokenizer.get_vocab().keys())
     print(f"target_dictionary: {target_dictionary}")
     self.decoder = GreedyDecoder(target_dictionary).init()
     return self
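GreedyDecoder is not defined in this example; a minimal sketch of a greedy CTC decoder over the printed target_dictionary (an assumption about its interface) could be:

import torch

class GreedyDecoder:
    def __init__(self, target_dictionary, blank="<pad>", word_delimiter="|"):
        self.labels = target_dictionary
        self.blank = blank
        self.word_delimiter = word_delimiter

    def init(self):
        return self

    def decode(self, logits):
        # argmax per frame, collapse repeats, drop blanks, map "|" back to spaces
        ids = torch.argmax(logits, dim=-1).tolist()
        out, prev = [], None
        for i in ids:
            if i != prev and self.labels[i] != self.blank:
                out.append(self.labels[i])
            prev = i
        return "".join(out).replace(self.word_delimiter, " ").strip()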
Example #26
def evaluate_asr():
    def read_txt(txt_path):
        data = pd.read_csv(txt_path, delimiter='\n', header=None, names=['path', 'sentence'])
        
        has_delim = data['path'].str.contains('|', regex=False)  # literal '|'; as a regex it would match every row
        data[['path', 'sentence']] = data.loc[has_delim, 'path'].str.split('|', expand=True)

        data = Dataset.from_pandas(data)
        return(data)

    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    processor = Wav2Vec2Processor.from_pretrained("asr_output/pretrained_processor")
    # print(processor.__dict__)
    # print(processor.tokenizer)

    # exit()
    model = Wav2Vec2ForCTC.from_pretrained("asr_output/checkpoint-27363").to("cpu")
    # print(model)
    # exit()
    # resampler = torchaudio.transforms.Resample(new_freq=16_000)

    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load('./data/'+ batch["path"])
        batch["speech"] = speech_array[0].numpy()
        return batch

    test_dataset = test.map(speech_file_to_array_fn)
    input_dict = processor(test_dataset['speech'][:11],sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_dict.input_values.to("cpu")).logits


    predicted_ids = torch.argmax(logits, dim=-1)

    print("Prediction:", processor.batch_decode(predicted_ids))
    print("Reference:", test_dataset["sentence"][:11])

    wer = load_metric("wer")

    resampler = torchaudio.transforms.Resample(48_000, 16_000)

    def evaluate_batch(batch):
        inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

        with torch.no_grad():
            logits = model(inputs.input_values.to("cpu"), attention_mask=inputs.attention_mask.to("cpu")).logits

        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_strings"] = processor.batch_decode(pred_ids)
        return batch

    result = test_dataset.map(evaluate_batch, batched=True, batch_size=8) # batch_size=8 -> requires ~14.5GB GPU memory

    msg = "WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"]))
    print(msg)
    return msg
Example #27
def process(PATH):
    audio, sampling_rate = librosa.load(PATH, sr=16000)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(
        "facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    input_values = tokenizer(audio, return_tensors='pt').input_values
    logits = model(input_values).logits
    prediction = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(prediction)[0]
    print(transcription)
    return
Example #28
    def test_mask_time_prob_ctc(self):
        model = Wav2Vec2ForCTC.from_pretrained(
            "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2
        )
        model.to(torch_device).train()
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
        )

        batch_duration_in_seconds = [1, 3, 2, 6]
        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
Example #29
class Transcription():
    """
    Simple class to upload the data in the sound file and transcribe it.
    """
    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    #initialize file names
    origin_file = 'audio.wav'
    destination_file = 'rec4.wav'

    file_name = 'rec4.wav'
    file_path = os.path.join('.', file_name)

#    def __init__(self, origin_file):
#        self.origin_file = origin_file


    def change_filename(self):
        "Change the audio file from .oga to .wav"

        if os.path.exists(self.destination_file):
            os.remove(self.destination_file)

        process = subprocess.run(['ffmpeg', '-hide_banner','-i', self.origin_file, self.destination_file])
        if process.returncode != 0:
            raise Exception("Something went wrong")


    def map_to_array(self):
        "Read file and convert to a format that the model can accept"

        self.speech, self.sampling_rate = torchaudio.load(self.origin_file)
        self.resample_rate = 16000
        self.speech = librosa.resample(np.asarray(self.speech).reshape(-1,), self.sampling_rate, self.resample_rate)
        self.speech = librosa.to_mono(self.speech)
        return self.speech, self.resample_rate


    def indicate_transcription(self):
        "Transcribe"

        #self.change_filename()
        self.speech, self.sampling_rate = self.map_to_array()
        input_values = self.tokenizer(self.speech, return_tensors="pt", padding="longest").input_values
        logits = self.model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.tokenizer.batch_decode(predicted_ids)
        transcription = ''.join(transcription)
        return transcription.lower()

    def __str__(self):
        return self.indicate_transcription()
Example #30
def decode_audio(audio_values, recognizer_dir = '', model = None, processor = None):
	if recognizer_dir:
		model = Wav2Vec2ForCTC.from_pretrained(recognizer_dir).to('cuda')
		processor = Wav2Vec2Processor.from_pretrained(recognizer_dir)
	if not model: 
		m = 'please provide model directory or model and processor'
		raise ValueError(m)
	input_dict = processor(audio_values, return_tensors='pt', padding = True,
		sampling_rate = 16_000)
	logits = model(input_dict.input_values.to('cuda')).logits
	labels = torch_argmax(logits, dim = -1)[0]
	return labels_to_letters(labels, processor=processor)
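labels_to_letters is referenced but not shown; a minimal sketch (an assumption about its behaviour) that turns the per-frame argmax ids back into text via the processor:

def labels_to_letters(labels, processor):
	# hypothetical helper: Wav2Vec2CTCTokenizer.decode collapses repeats and strips special tokens
	return processor.decode(labels)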