Example #1
    def _transform_audio(self):

        waveform, rate = load(os.path.join(data_dir, self.file))
        new_rate = rate / 100
        resampled = Resample(rate, new_rate)(waveform)
        self.stft = self._get_stft(resampled, new_rate)
        self.mfcc = self._get_mfcc(resampled, new_rate)
Example #2
 def __init__(self,
              input_sr,
              output_sr=None,
              melspec_buckets=80,
              hop_length=256,
              n_fft=1024,
              cut_silence=False):
     """
     The parameters are by default set up to do well
     on a 16kHz signal. A different frequency may
     require different hop_length and n_fft (e.g.
     doubling frequency --> doubling hop_length and
     doubling n_fft)
     """
     self.cut_silence = cut_silence
     self.sr = input_sr
     self.new_sr = output_sr
     self.hop_length = hop_length
     self.n_fft = n_fft
     self.mel_buckets = melspec_buckets
     self.vad = VoiceActivityDetection(
         sample_rate=input_sr
      )  # This needs heavy tweaking, depending on the data
     self.mu_encode = MuLawEncoding()
     self.mu_decode = MuLawDecoding()
     self.meter = pyln.Meter(input_sr)
     self.final_sr = input_sr
     if output_sr is not None and output_sr != input_sr:
         self.resample = Resample(orig_freq=input_sr, new_freq=output_sr)
         self.final_sr = output_sr
     else:
         self.resample = lambda x: x
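The docstring above ties hop_length and n_fft to the input sample rate; a minimal sketch of that scaling rule under the 16 kHz defaults shown above (the helper name is hypothetical):

def scaled_stft_params(input_sr, base_sr=16000, base_hop=256, base_n_fft=1024):
    # Scale hop_length and n_fft proportionally to the input sample rate,
    # e.g. doubling the sample rate (32 kHz) doubles both values.
    factor = input_sr / base_sr
    return int(base_hop * factor), int(base_n_fft * factor)

hop_length, n_fft = scaled_stft_params(32000)  # (512, 2048)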
Example #3
 def tokenize(self, audio_path):
     audio_input, sampling_rate = sf.read(audio_path)
     resampler = Resample(sampling_rate)
     audio_input = resampler(th.tensor(audio_input))
     input_values = self.tokenizer(audio_input,
                                   return_tensors="pt").input_values
     return input_values
Example #4
 def from_path(self, path: str, return_two=False, return_mfcc=False):
     logger.info(f"Processing {path} to tensor of spectrogram")
     # This comes in -1, 1 normalized
     waveform, inp_freq = torchaudio.load(path)
     waveform = waveform.mean(dim=0, keepdims=True)
     waveform = Resample(inp_freq, self.sample_rate)(waveform)
     n_samples = waveform.shape[1]
     min_samples = self.min_samples * 2 if return_two else self.min_samples
     if n_samples < min_samples:
         raise AudioTooShortError(
             f"Input must be at least {self.seconds * 2 if return_two else self.seconds} seconds long"
         )
     first_idx = torch.randint(0, n_samples - min_samples, (1,))
     first_waveform = waveform[:, first_idx : (first_idx + self.min_samples)]  # type: ignore
     first_sgram = self.forward(first_waveform)
     if return_mfcc:
         first_mfcc = self.get_mfcc(first_waveform)
     if return_two:
         second_idx = torch.randint(first_idx.item() + self.min_samples, n_samples - self.min_samples, (1,))  # type: ignore
         second_waveform = waveform[:, second_idx : (second_idx + self.min_samples)]
         second_sgram = self.forward(second_waveform)
         if return_mfcc:
             second_mfcc = self.get_mfcc(second_waveform)
             return first_sgram, first_mfcc, second_sgram, second_mfcc
         return first_sgram, second_sgram
     if return_mfcc:
         return first_sgram, first_mfcc
     return first_sgram
Example #5
class AudioFile:
    def __init__(self, filename: str, transcription: str,
            pronunciation_dictionary: PronunciationDictionary,
            fileobj: Optional[BinaryIO] = None,
            wavobj: Optional[Tuple[Tensor, int]] = None,
            offset: int = 0):

        self.filename = filename
        self.pronunciation_dictionary = pronunciation_dictionary
        self.offset = offset
        self.load_audio(fileobj, wavobj)

        self.transcription, self.words = pronunciation_dictionary.spell_sentence(transcription, return_words=True)
        self.tensor_transcription = torch.tensor([self.pronunciation_dictionary.phonemic_mapping[x] \
                                                    for x in self.transcription])


    def load_audio(self, fileobj: Optional[BinaryIO] = None, wavobj = None):
        if fileobj is not None:
            self.wav, sr = torchaudio.load(fileobj)
        elif wavobj is not None:
            self.wav, sr = wavobj
        else:
            self.wav, sr = torchaudio.load(self.filename)
        if self.wav.shape[0] != 1:
            self.wav = torch.mean(self.wav, dim=0).unsqueeze(0)

        if sr != 16000:
            self.wav = Resample(sr, 16000)(self.wav)

    def move_to_device(self, device:str):
        self.wav = self.wav.to(device)
        self.tensor_transcription = self.tensor_transcription.to(device)
Example #6
def load(filename, sample_rate):
    y, source_rate = torchaudio.load(filename)
    if source_rate != sample_rate:
        resample = Resample(source_rate, sample_rate)
        y = resample(y)

    return y
Example #7
	def __init__(self, sr: int, n_steps: float, bins_per_octave: int = 12, p: float = 1.0):
		super().__init__(p=p)
		self.sr = sr
		self.n_steps = n_steps
		self.bins_per_octave = bins_per_octave

		self.resample = Resample()
		self.crop = Crop(0)
Example #8
 def predict(self, audio_path, features_path):
     audio_input, sampling_rate = sf.read(audio_path)
     resampler = Resample(sampling_rate)
     audio_input = resampler(th.tensor(audio_input))
     input_values = self.tokenizer(
         audio_input, return_tensors="pt").input_values.to(self.device)
     hidden_state = self.raw_model(input_values).last_hidden_state
     hidden_state = hidden_state.flatten()  # this shouldn't be here
     hidden_state = hidden_state.cpu().detach().numpy()
     np.save(features_path, hidden_state)
Example #9
def preprocess(mp3):
    sample_rate = 16000
    root_dir = Path("/home/nlpmaster/ssd-1t/corpus/TaiBible/PKL")
    new_dir = Path("/home/nlpmaster/ssd-1t/corpus/TaiBible/PKL_wav")
    y, sr = torchaudio.load(str(root_dir.joinpath(mp3)))
    resample = Resample(orig_freq=sr, new_freq=sample_rate)
    resampled_y = resample(y)
    wavfile = new_dir.joinpath(mp3)
    wavfile.parent.mkdir(exist_ok=True)
    torchaudio.save(str(wavfile), resampled_y, sample_rate=sample_rate)
    return str(wavfile)
Example #10
def main():
    wav, sr = sf.read(args.recording)
    target_sr = 16000
    if USE_TORCHAUDIO_RESAMPLING:
        resampling_transform = Resample(orig_freq=sr, new_freq=target_sr)

        inputs = resampling_transform(torch.Tensor([wav])).squeeze()
    else:
        inputs = resample(wav, num=int(len(wav) * target_sr / sr))

    print(wavenet.transcribe(inputs))
Example #11
def make_train_dataset(dataset_dir, speakers=None):
    """Make the training dataset for MVAE from the VCC2018 dataset.

    Args:
        dataset_dir (str): Path of the VCC2018 dataset.
        speakers (List[str]): Speakers to be used.

    Returns:
        List[Tuple[torch.Tensor, torch.Tensor]]:
            List of spectrogram and speaker label.
    """
    training_dir = os.path.join(dataset_dir, 'vcc2018_training')
    evaluation_dir = os.path.join(dataset_dir, 'vcc2018_evaluation')
    if speakers is None:
        speakers = [
            speaker for speaker in os.listdir(training_dir)
            if speaker.startswith('VCC2')
            and os.path.isdir(os.path.join(training_dir, speaker))
        ]

    resample = Resample(22050, 16000)
    create_spectrogram = Spectrogram(n_fft=N_FFT, hop_length=HOP_LEN)

    dataset = []
    with torch.no_grad():
        for c, speaker in enumerate(speakers):
            speaker_dir = os.path.join(training_dir, speaker)
            wav_files = [
                os.path.join(speaker_dir, wav_file)
                for wav_file in os.listdir(speaker_dir)
                if os.path.splitext(wav_file)[1] == '.wav'
            ]
            speaker_dir = os.path.join(evaluation_dir, speaker)
            wav_files.extend([
                os.path.join(speaker_dir, wav_file)
                for wav_file in os.listdir(speaker_dir)
                if os.path.splitext(wav_file)[1] == '.wav'
            ])
            spectrogram = []
            for wav_file in wav_files:
                sound, _ = torchaudio.load(wav_file)
                sound = resample(sound)
                spectrogram.append(create_spectrogram(sound).squeeze(0))
            spectrogram = torch.cat(spectrogram, dim=1)

            hop_length = DATA_LEN // 4
            for n in range((spectrogram.size(1) - DATA_LEN) // hop_length + 1):
                start = n * hop_length
                data = spectrogram[:, start:start + DATA_LEN]
                label = torch.zeros(len(speakers))
                label[c] = 1
                dataset.append((data, label))

    return dataset
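A hypothetical invocation of make_train_dataset; the dataset path and speaker IDs below are placeholders, not part of the original example:

dataset = make_train_dataset('/path/to/vcc2018', speakers=['VCC2SF1', 'VCC2SM1'])
# Each item is a (spectrogram, one-hot speaker label) pair, ready for a DataLoader.
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)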
Example #12
    def _load(self, path, mfcc=True):
        try:
            waveform, ori_sr = torchaudio.load(path)
            waveform = waveform.mean(0, keepdims=True)
        except RuntimeError:
            raise Exception(f"Error loading {path}")
        _resample = Resample(ori_sr, self.sr)
        audio = _resample(waveform)

        if mfcc:
            audio = self._mfcc(audio)
        return audio
Example #13
def main(meta_dir: str, pretrained_path: str, model_name: str = 'generator_mb'):
    # load model
    gen = load_model(model_name, pretrained_path).cuda()

    print(gen)
    print(f'Numb. Parameters : {sum(p.numel() for p in gen.parameters() if p.requires_grad)}')

    # make mel func
    mel_func = LogMelSpectrogram(
        settings.SAMPLE_RATE, settings.MEL_SIZE, settings.WIN_LENGTH, settings.WIN_LENGTH, settings.HOP_LENGTH,
        float(settings.MEL_MIN), float(settings.MEL_MAX)
    ).cuda()

    pqmf_func = PQMF().cuda()

    # get datasets
    _, valid_loader = get_datasets(
        meta_dir, batch_size=1, num_workers=1, crop_length=0, random_seed=1234
    )

    resample_func = Resample(22050, 16000).cuda()

    # score
    score_list = []

    for wav, _ in tqdm(valid_loader):
        wav = wav.cuda()

        # to mel
        mel = mel_func(wav)

        with torch.no_grad():
            pred_subbands = gen(mel)
            pred = pqmf_func.synthesis(pred_subbands)
        pred, wav = match_dim(pred, wav)

        # resample
        pred = resample_func(pred)
        wav = resample_func(wav)

        # to cpu
        wav = wav.cpu().numpy().squeeze()
        pred = pred.detach().cpu().numpy().squeeze()

        # compute PESQ score (wideband)
        item_score = pesq(16000, wav, pred.clip(-1., 1.), 'wb')
        score_list.append(item_score)

    print(
        f'mean : {np.mean(score_list)}, std : {np.std(score_list)}, '
        f'min : {np.min(score_list)}, max : {np.max(score_list)}'
    )
Example #14
 def __init__(self, data_dir, meta_path, pre_load=True):
     self.data_dir = data_dir
     self.pre_load = pre_load
     with open(meta_path, 'r') as f:
         self.data = json.load(f)
     self.class_dict = self.data['labels']
     self.class_num = len(self.class_dict)
     self.meta_data = self.data['meta_data']
     _, origin_sr = torchaudio.load(
         path_join(self.data_dir, self.meta_data[0]['path']))
     self.resampler = Resample(origin_sr, SAMPLE_RATE)
     if self.pre_load:
         self.wavs = self._load_all()
Example #15
 def __init__(self,
              augs_list,
              cap=3,
              resample=True,
              osr=44100,
              nsr=16000,
              sec=5,
              stretch_p=0.5):
     self.augs_list = augs_list
     self.cap = cap
     self.resampler = Resample(orig_freq=osr, new_freq=nsr)
     self.sampler = SameSize(sec * nsr)
     self.wav_stretcher = Stretcher(p=stretch_p)
Example #16
def spectrogram_from_audio(audio: Tensor, sample_rate: int, resample_rate: int,
                           mel_filters: int, seconds: int) -> Tensor:
    resampled_audio = Resample(orig_freq=sample_rate,
                               new_freq=resample_rate)(audio)
    mono_audio = mean(resampled_audio, dim=0, keepdim=True)
    mel_transform = MelSpectrogram(sample_rate=resample_rate,
                                   n_mels=mel_filters)
    spectrogram = mel_transform(mono_audio)
    log_spectrogram = AmplitudeToDB()(spectrogram)
    original_length = log_spectrogram.shape[2]
    length = seconds * (resample_rate // mel_transform.hop_length)
    return pad(log_spectrogram, (0, length - original_length)) if original_length < length \
        else log_spectrogram[:, :, :length]
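A hedged usage sketch of spectrogram_from_audio; the input file and parameter values are assumptions:

waveform, sr = torchaudio.load("clip.wav")  # hypothetical input file
# Log-mel spectrogram padded or trimmed to a fixed 5-second window at 16 kHz.
log_mel = spectrogram_from_audio(waveform, sample_rate=sr, resample_rate=16000,
                                 mel_filters=64, seconds=5)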
Example #17
def preprocess_as_spec(path_wav: Path, id: ItemIdJSSS, dir_dataset: Path, new_sr: Optional[int] = None) -> None:
    """Transform JSSS corpus contents into spectrogram Tensor.

    Before this preprocessing, corpus contents should be deployed.
    """
    
    waveform, _sr_orig = load_wav(path_wav)
    if new_sr is not None:
        waveform = Resample(_sr_orig, new_sr)(waveform)
    # :: [1, Length] -> [Length,]
    waveform: Tensor = waveform[0, :]
    # defaults: hop_length = win_length // 2, window_fn = torch.hann_window, power = 2
    spec: Tensor = Spectrogram(254)(waveform)
    path_spec = get_dataset_spec_path(dir_dataset, id)
    path_spec.parent.mkdir(parents=True, exist_ok=True)
    save(spec, path_spec)
Example #18
def resample_wav(
        input_path: Path,
        output_path: Path,
        stereo_to_mono: bool = True,
        sampling_rate: int = 22050
):
    waveform, original_sampling_rate = torchaudio.load(input_path)

    waveform = Resample(original_sampling_rate, sampling_rate)(waveform)
    if stereo_to_mono and len(waveform.shape) == 2:
        # waveform.shape==(channels, time) - we have to trim to 1 channel
        # note: we could also take mean from 2 channels but this is not guaranteed to work:
        # https://dsp.stackexchange.com/questions/2484/converting-from-stereo-to-mono-by-averaging
        waveform = waveform[0].unsqueeze(0)

    torchaudio.save(str(output_path), waveform, sampling_rate)
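A minimal usage sketch; the paths are placeholders:

# Convert an arbitrary-rate (possibly stereo) file to a 22.05 kHz mono file on disk.
resample_wav(Path("input.wav"), Path("output_22050.wav"),
             stereo_to_mono=True, sampling_rate=22050)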
Example #19
 def __init__(self, sample_rate, n_fft, top_db, max_perc):
     super().__init__()
     self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft//2+1)
     self.stft = Spectrogram(n_fft=n_fft, power=None)
     self.com_norm = ComplexNorm(power=2.)
     self.fm = FrequencyMasking(50)
     self.tm = TimeMasking(50)
     self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
     self.AtoDB= AmplitudeToDB(top_db=top_db)
     self.max_perc = max_perc
     self.sample_rate = sample_rate
     self.resamples = [
             Resample(sample_rate, sample_rate*0.6),
             Resample(sample_rate, sample_rate*0.7),
             Resample(sample_rate, sample_rate*0.8),
             Resample(sample_rate, sample_rate*0.9),
             Resample(sample_rate, sample_rate*1),
             Resample(sample_rate, sample_rate*1.1),
             Resample(sample_rate, sample_rate*1.2),
             Resample(sample_rate, sample_rate*1.3),
             Resample(sample_rate, sample_rate*1.4)
         ]
Example #20
def preprocess_as_wave(path_wav: Path,
                       id: ItemIdJSSS,
                       dir_dataset: Path,
                       new_sr: Optional[int] = None) -> None:
    """Transform JSSS corpus contents into waveform Tensor.
    
    Before this preprocessing, corpus contents should be deployed.
    """

    waveform, _sr_orig = load_wav(path_wav)
    if new_sr is not None:
        waveform = Resample(_sr_orig, new_sr)(waveform)
    # :: [1, Length] -> [Length,]
    waveform: Tensor = waveform[0, :]
    path_wave = get_dataset_wave_path(dir_dataset, id)
    path_wave.parent.mkdir(parents=True, exist_ok=True)
    save(waveform, path_wave)
Example #21
def postprocess(feats, curr_sample_rate, normalize=True):
    if args.sample_rate != curr_sample_rate:
        feats = Resample(curr_sample_rate, args.sample_rate)(feats)

    if feats.dim() == 2:
        feats = feats.mean(-1)

    assert feats.dim() == 1, feats.dim()

    if normalize:
        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
    return feats
Example #22
    def _load(self, line, mfcc=True, wav_name=None):
        if wav_name:
            waveform, ori_sr = torchaudio.load(wav_name)
            waveform = waveform.mean(0, keepdims=True)
        else:
            try:
                waveform, ori_sr = torchaudio.load(
                    line.audio_fn,
                    frame_offset=line.start_frame,
                    num_frames=line.nframes)
                waveform = waveform.mean(0, keepdims=True)
            except RuntimeError:
                raise Exception(f"Error loading {line.audio_fn}")
        _resample = Resample(ori_sr, self.sr)
        audio = _resample(waveform)
        # print('audio',audio.shape)

        if mfcc:
            audio = self._mfcc(audio)
        return audio
Example #23
    def text_to_instance(self, data: Tuple[str,
                                           str]) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        wav_file, text = data
        if callable(wav_file):
            y = wav_file()
        else:
            y, orig_freq = torchaudio.load(wav_file)
            if orig_freq != self._sample_rate:
                resample = Resample(orig_freq=orig_freq, new_freq=self._sample_rate)
                y = resample(y)

        source_array = torchaudio.compliance.kaldi.fbank(
            y, num_mel_bins=80, use_energy=True).detach()
        #source_array = self._mel_spectrogram(y).detach()
        source_array, src_len = pad_and_stack(source_array,
                                              self.input_stack_rate,
                                              self.model_stack_rate,
                                              pad_mode=self._pad_mode)
        source_length_field = LabelField(src_len, skip_indexing=True)
        source_field = TensorField(source_array)

        if text is not None:
            target = self._target_tokenizer.tokenize(text)
            if self._target_add_start_end_token:
                target.insert(0, Token(START_SYMBOL))
                target.append(Token(END_SYMBOL))

            target_field = TextField(target, self._target_token_indexers)
            return Instance({
                "source_features": source_field,
                "target_tokens": target_field,
                "source_lengths": source_length_field
            })
        else:
            return Instance({
                "source_features": source_field,
                "source_lengths": source_length_field
            })
Example #24
    def __init__(self,
                 root: str,
                 training: bool = True,
                 frequency: int = 16000,
                 max_length: int = 280,
                 transform=None,
                 return_length: bool = False):
        self.data = []
        self.return_length = return_length
        if transform is None:
            self.transform = MFCC(frequency)
        else:
            self.transform = transform

        self.training = training
        self.filenames = []
        self.max_length = max_length
        if frequency != 16000:
            self.resampler = Resample(orig_freq=16000, new_freq=frequency)

        if training:
            df_labels = pd.read_csv(root + "train_label.csv")
            root = root + "Train/"
            self.labels = []
        else:
            root = root + "Public_Test/"

        for filename in os.listdir(root):
            if filename.endswith(".wav"):
                self.filenames.append(filename)
                input_audio, sample_rate = load_wav(root + filename)
                if frequency != 16000:
                    input_audio = self.resampler(input_audio)

                self.data.append(input_audio)
                if training:
                    self.labels.append(
                        df_labels.loc[df_labels["File"] == filename,
                                      "Label"].values.item())
Example #25
 def _audio_transform(self):
     """
     This function contains example transforms using both PyTorchVideo and TorchAudio
     in the same Callable.
     """
     args = self.args
     n_fft = int(
         float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size
     )
     hop_length = int(
         float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size
     )
     eps = 1e-10
     return ApplyTransformToKey(
         key="audio",
         transform=Compose(
             [
                 Resample(
                     orig_freq=args.audio_raw_sample_rate,
                     new_freq=args.audio_resampled_rate,
                 ),
                 MelSpectrogram(
                     sample_rate=args.audio_resampled_rate,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     n_mels=args.audio_num_mels,
                     center=False,
                 ),
                 Lambda(lambda x: x.clamp(min=eps)),
                 Lambda(torch.log),
                 UniformTemporalSubsample(args.audio_mel_num_subsample),
                 Lambda(lambda x: x.transpose(1, 0)),  # (F, T) -> (T, F)
                 Lambda(
                     lambda x: x.view(1, x.size(0), 1, x.size(1))
                 ),  # (T, F) -> (1, T, 1, F)
                 Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)),
             ]
         ),
     )
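A sketch of how the returned Callable might be applied from within the same class, assuming a PyTorchVideo-style clip dict whose "audio" entry is a 1-D waveform tensor (the synthetic clip below is an assumption):

clip = {"audio": torch.randn(self.args.audio_raw_sample_rate * 2)}  # ~2 s of noise
clip = self._audio_transform()(clip)
# Per the comments above, clip["audio"] comes out shaped (1, T, 1, F) and normalized.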
Example #26
def download_vctk(destination, tmp_dir=None, device="cpu"):
    """Download dataset and perform resample to 16000 Hz.

    Arguments
    ---------
    destination : str
        Place to put final zipped dataset.
    tmp_dir : str
        Location to store temporary files. Will use `tempfile` if not provided.
    device : str
        Passed directly to pytorch's ``.to()`` method. Used for resampling.
    """
    dataset_name = "noisy-vctk-16k"
    if tmp_dir is None:
        tmp_dir = tempfile.gettempdir()
    final_dir = os.path.join(tmp_dir, dataset_name)

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    if not os.path.isdir(final_dir):
        os.mkdir(final_dir)

    prefix = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/"
    noisy_vctk_urls = [
        prefix + "clean_testset_wav.zip",
        prefix + "noisy_testset_wav.zip",
        prefix + "testset_txt.zip",
        prefix + "clean_trainset_28spk_wav.zip",
        prefix + "noisy_trainset_28spk_wav.zip",
        prefix + "trainset_28spk_txt.zip",
    ]

    zip_files = []
    for url in noisy_vctk_urls:
        filename = os.path.join(tmp_dir, url.split("/")[-1])
        zip_files.append(filename)
        if not os.path.isfile(filename):
            logger.info("Downloading " + url)
            with urllib.request.urlopen(url) as response:
                with open(filename, "wb") as tmp_file:
                    logger.info("... to " + tmp_file.name)
                    shutil.copyfileobj(response, tmp_file)

    # Unzip
    for zip_file in zip_files:
        logger.info("Unzipping " + zip_file)
        shutil.unpack_archive(zip_file, tmp_dir, "zip")
        os.remove(zip_file)

    # Move transcripts to final dir
    shutil.move(os.path.join(tmp_dir, "testset_txt"), final_dir)
    shutil.move(os.path.join(tmp_dir, "trainset_28spk_txt"), final_dir)

    # Downsample
    dirs = [
        "noisy_testset_wav",
        "clean_testset_wav",
        "noisy_trainset_28spk_wav",
        "clean_trainset_28spk_wav",
    ]

    downsampler = Resample(orig_freq=48000, new_freq=16000)

    for directory in dirs:
        logger.info("Resampling " + directory)
        dirname = os.path.join(tmp_dir, directory)

        # Make directory to store downsampled files
        dirname_16k = os.path.join(final_dir, directory + "_16k")
        if not os.path.isdir(dirname_16k):
            os.mkdir(dirname_16k)

        # Load files and downsample
        for filename in get_all_files(dirname, match_and=[".wav"]):
            signal, rate = torchaudio.load(filename)
            downsampled_signal = downsampler(signal.view(1, -1).to(device))

            # Save downsampled file
            torchaudio.save(
                os.path.join(dirname_16k, filename[-12:]),
                downsampled_signal[0].cpu(),
                sample_rate=16000,
                channels_first=False,
            )

            # Remove old file
            os.remove(filename)

        # Remove old directory
        os.rmdir(dirname)

    logger.info("Zipping " + final_dir)
    final_zip = shutil.make_archive(
        base_name=final_dir,
        format="zip",
        root_dir=os.path.dirname(final_dir),
        base_dir=os.path.basename(final_dir),
    )

    logger.info(f"Moving {final_zip} to {destination}")
    shutil.move(final_zip, os.path.join(destination, dataset_name + ".zip"))
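A hypothetical call; destination and tmp_dir are placeholders, and resampling runs on the GPU when device="cuda":

# Downloads, resamples to 16 kHz, and writes noisy-vctk-16k.zip into /data/corpora.
download_vctk("/data/corpora", tmp_dir="/scratch/vctk_tmp", device="cuda")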
Example #27
    async def __resample_file(self, array, original_sr, target_sr):
        resampling_transform = Resample(orig_freq=original_sr,
                                        new_freq=target_sr)

        sample = resampling_transform(torch.Tensor([array])).squeeze()
        return sample
Example #28
def decode_base64(encoded):
    return base64.urlsafe_b64decode(encoded[21:])

def create_tmp_file(binary_obj):
    with open('templates/tmp.webm', 'wb') as f:
        f.write(binary_obj)

def convert_tmp_file():
    cmd = 'ffmpeg -y -i templates/tmp.webm -vn templates/tmp.wav'
    subprocess.call(cmd.split())

def load_as_tensor(transform):
    wf, sampling_rate = torchaudio.load('templates/tmp.wav')
    wf = transform(wf)
    return wf

resample = Resample(48000, 16000)

def pipeline(encoded):
    create_tmp_file(decode_base64(encoded))
    convert_tmp_file()
    return load_as_tensor(resample)

# ============================================================
class Model:
    def __init__(self, classifier_config_path):
        clf_cfg = Hparam(classifier_config_path)
        cpc_cfg = Hparam(clf_cfg.model.cpc_config_path)
        self.device = clf_cfg.train.device

        speakers_bank = pickle.load(open('templates/mean_speakers_vecs_dict.pkl', 'rb'))
        self.speakers, self.mean_vecs = list(speakers_bank.keys()), torch.stack(list(speakers_bank.values()), dim=0)
Example #29
def process_utterance(in_dir, out_dir, spker, basename):
    wav_path = os.path.join(in_dir, 'wav48', spker, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', spker,
                           '{}.TextGrid'.format(basename))

    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    if start >= end:
        return None

    # Read and trim wav files
    sr, wav = read(wav_path)
    wav = torch.tensor(wav.astype(np.float32))
    if sr != hp.sampling_rate:
        wav = Resample(orig_freq=sr, new_freq=hp.sampling_rate)(wav)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)]

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.numpy().astype(np.float64),
                   hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav)
    mel_spectrogram = mel_spectrogram.cpu().numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # if the shape is not right, you can check get_alignment function
    try:
        assert (f0.shape[0] == energy.shape[0] == mel_spectrogram.shape[1])
    except AssertionError as e:
        print("duration problem: {}".format(wav_path))
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename),
            energy,
            allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    try:
        return '|'.join([basename, text]), max(f0), min([
            f for f in f0 if f != 0
        ]), max(energy), min(energy), mel_spectrogram.shape[1]
    except:
        print(basename)
        return None
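A hedged usage sketch of process_utterance; the directories and utterance ID are placeholders, and hp is the project's hyperparameter module:

info = process_utterance('/data/VCTK-Corpus', '/data/preprocessed', 'p225', 'p225_001')
if info is not None:
    meta_line, f0_max, f0_min_nonzero, energy_max, energy_min, n_frames = info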
Example #30
import numpy as np
import librosa
import math

import torch
import torchaudio
from torchaudio.transforms import Spectrogram, MelSpectrogram, AmplitudeToDB, ComplexNorm, Resample
from torchaudio.functional import lowpass_biquad, highpass_biquad

from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
from multiprocessing import Pool, cpu_count

def cov_pk(info):
    path, resample, rate = info
    
    waveform = torch.load(path)
    waveform = resample(waveform)
    ebird_code = path.parent.name
    torch.save(waveform, f'../../dataset/tensor_audio/{ebird_code}/re{rate}-{path.stem}.tensor')

NUM_WORKERS = cpu_count()
sr = 32_000

for i in [0.8, 0.9, 1.1, 1.2]:
    resample = Resample(sr, sr*i)
    for directory in tqdm(Path('../../dataset/tensor_audio').iterdir()):
        file_paths = list(directory.iterdir())
        with Pool(5) as p:
            p.map(cov_pk, [(path, resample, i) for path in file_paths])