Example #1
 def __init__(self, n_audio_channel, training_files, segment_length,
              filter_length, hop_length, win_length, sampling_rate,
              mel_fmin, mel_fmax):
     # self.audio_files = files_to_list(training_files)
     # random.seed(1234)
     # random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin,
                              mel_fmax=mel_fmax,
                              n_group=n_audio_channel)
Example #2
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, n_audio_channel, training_files, segment_length,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        # self.audio_files = files_to_list(training_files)
        # random.seed(1234)
        # random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        # self.segment_length = segment_length
        # self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        # at this step, audio_norm is the same as the torchaudio.load output in our repo
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
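
The comment in get_mel above states that audio_norm matches the torchaudio.load output; a quick sanity-check sketch is below. It assumes torchaudio is installed, a 16-bit PCM file named example.wav (a hypothetical path), and that torchaudio's default normalization divides int16 samples by 32768.

# Hypothetical sanity check for the comment above (not from the original repo):
# scipy's int16 samples divided by MAX_WAV_VALUE should match torchaudio's
# default float output for 16-bit PCM wavs.
import torch
import torchaudio
from scipy.io.wavfile import read

sr, data = read('example.wav')                       # int16 samples
manual = torch.from_numpy(data).float() / 32768.0    # MAX_WAV_VALUE
ta_audio, ta_sr = torchaudio.load('example.wav')     # float32 in [-1, 1], shape (channels, frames)
print(torch.allclose(manual, ta_audio[0], atol=1e-4), sr == ta_sr)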
Example #3
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, n_audio_channel, training_files, segment_length,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
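
The docstring describes Mel2Samp as returning (mel, audio) pairs, so it drops straight into a PyTorch DataLoader. A minimal usage sketch follows; the constructor values and the training_files path are placeholders, not taken from the original configuration.

# Hypothetical usage sketch: wiring Mel2Samp into a DataLoader.
# All argument values below are placeholders.
import torch
from torch.utils.data import DataLoader

dataset = Mel2Samp(n_audio_channel=128,
                   training_files='train_files.txt',  # assumed list-of-wav-paths file
                   segment_length=16000,
                   filter_length=1024,
                   hop_length=256,
                   win_length=1024,
                   sampling_rate=22050,
                   mel_fmin=0.0,
                   mel_fmax=8000.0)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

for mel, audio in loader:
    # mel: (batch, n_mel_channels, frames), audio: (batch, segment_length)
    print(mel.shape, audio.shape)
    break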
Example #4
import torch
from scipy.io.wavfile import read
import numpy as np

from TacotronSTFT import TacotronSTFT

from timeit import default_timer as timer

## RTF is the real-time factor which tells how many seconds of speech are generated in 1 second of wall time

MAX_WAV_VALUE = 32768.0

n_audio_channel = 128

stft = TacotronSTFT(filter_length=1024,
                    hop_length=256,
                    win_length=1024,
                    sampling_rate=22050,
                    mel_fmin=0.0, mel_fmax=8000.0,
                    n_group=n_audio_channel)

def load_wav_to_torch(full_path):
    """
    Loads wavdata into torch array
    """
    sampling_rate, data = read(full_path)
    return torch.from_numpy(data).float(), sampling_rate

def get_mel(audio):
    audio_norm = audio / MAX_WAV_VALUE
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
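
The RTF comment above can be turned into a measurement with the default_timer that is already imported. A minimal sketch, assuming a generate(mel) vocoder call that returns a 1-D waveform tensor at 22050 Hz (that function is an assumption, not part of this snippet):

# Hypothetical RTF measurement sketch; generate(mel) is an assumed vocoder call.
def measure_rtf(mel, generate, sampling_rate=22050):
    start = timer()
    audio = generate(mel)                       # wall time is spent here
    wall_time = timer() - start
    speech_seconds = audio.numel() / sampling_rate
    # Per the comment above: seconds of speech generated per second of wall time
    return speech_seconds / wall_time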
Example #5
    def __init__(self,
                 n_audio_channel,
                 path_in,
                 split,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 stride,
                 temp_jitter=False,
                 store_in_ram=False,
                 seed=0,
                 split_utterances=True,
                 pc_split_utterances=0.1,
                 split_speakers=False,
                 pc_split_speakers=0.1,
                 frame_energy_thres=0.025,
                 do_audio_load=True,
                 trim=None,
                 select_speaker=None,
                 select_file=None,
                 verbose=True):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

        # temp_jitter may be True to enable temporal-jitter data augmentation during training
        self.path_in = path_in
        self.split = split
        self.lchunk = segment_length
        self.stride = stride
        self.temp_jitter = temp_jitter
        self.store_in_ram = store_in_ram
        if trim is None or trim <= 0:
            trim = np.inf

        # Get filenames in folder and subfolders
        self.filenames = []
        for dirpath, dirnames, filenames in os.walk(self.path_in):
            for fn in filenames:
                if not fn.endswith(EXTENSION): continue
                new_fn = os.path.join(dirpath, fn)
                new_fn = os.path.relpath(new_fn, self.path_in)
                self.filenames.append(new_fn)
        self.filenames.sort()
        random.seed(seed)
        random.shuffle(self.filenames)

        # Get speakers & utterances
        self.speakers = {}
        self.utterances = {}
        for fullfn in self.filenames:
            spk, ut = self.filename_split(fullfn)
            if spk not in self.speakers:
                self.speakers[spk] = len(self.speakers)
            if ut not in self.utterances:
                self.utterances[ut] = len(self.utterances)
        self.n_max_speakers = len(self.speakers)

        # Split
        lutterances = list(self.utterances.keys())
        lutterances.sort()
        random.shuffle(lutterances)
        lspeakers = list(self.speakers.keys())
        lspeakers.sort()
        random.shuffle(lspeakers)
        isplit_ut = int(len(lutterances) * pc_split_utterances)
        isplit_spk = int(len(lspeakers) * pc_split_speakers)
        if split == 'train':
            spk_del = lspeakers[-2 * isplit_spk:]
            ut_del = lutterances[-2 * isplit_ut:]
        elif split == 'valid':
            spk_del = lspeakers[:-2 * isplit_spk] + lspeakers[-isplit_spk:]
            ut_del = lutterances[:-2 * isplit_ut] + lutterances[-isplit_ut:]
        elif split == 'train+valid':
            spk_del = lspeakers[-isplit_spk:]
            ut_del = lutterances[-isplit_ut:]
        elif split == 'test':
            spk_del = lspeakers[:-isplit_spk]
            ut_del = lutterances[:-isplit_ut]
        else:
            print('Not implemented split', split)
            sys.exit()
        if split_speakers:
            for spk in spk_del:
                del self.speakers[spk]
        if split_utterances:
            for ut in ut_del:
                del self.utterances[ut]

        # Filter filenames by speaker and utterance
        filenames_new = []
        for filename in self.filenames:
            spk, ut = self.filename_split(filename)
            if spk in self.speakers and ut in self.utterances:
                filenames_new.append(filename)
        self.filenames = filenames_new

        # Select speaker
        if select_speaker is not None:
            select_speaker = select_speaker.split(',')
            filenames_new = []
            for filename in self.filenames:
                spk, ut = self.filename_split(filename)
                if spk in select_speaker and spk in self.speakers:
                    filenames_new.append(filename)
            if len(filenames_new) == 0:
                print('\nERROR: Selected an invalid speaker. Options are:',
                      list(self.speakers.keys()))
                sys.exit()
            self.filenames = filenames_new

        # Select specific file
        if select_file is not None:
            select_file = select_file.split(',')
            filenames_new = []
            for filename in self.filenames:
                _, file = os.path.split(filename[:-len(EXTENSION)])
                if file in select_file:
                    filenames_new.append(filename)
            if len(filenames_new) == 0:
                print('\nERROR: Selected an invalid file. Options are:',
                      self.filenames[:int(np.min([50, len(self.filenames)]))],
                      '... (without folder and without extension)')
                sys.exit()
            self.filenames = filenames_new

        # Indices!
        self.audios = [None] * len(self.filenames)
        self.indices = []
        duration = {}
        start = time.time()
        if do_audio_load:
            for i, filename in enumerate(self.filenames):
                if verbose:
                    if i % 1000 == 0:
                        print('Read {} out of {} files'.format(
                            i + 1, len(self.filenames)))
                    # print('\rRead audio {:5.1f}%'.format(
                    #     100 * (i + 1) / len(self.filenames)), end='')
                # Info
                spk, ut = self.filename_split(filename)
                ispk, iut = self.speakers[spk], self.utterances[ut]
                # Load
                if spk not in duration:
                    duration[spk] = 0
                if duration[spk] >= trim:
                    continue
                x = torch.load(os.path.join(self.path_in, filename))
                if self.store_in_ram:
                    self.audios[i] = x.clone()
                x = x.float()
                # Process
                for j in range(0, len(x), stride):
                    if j + self.lchunk >= len(x):
                        continue
                    else:
                        xx = x[j:j + self.lchunk]
                    if (xx.pow(2).sum() /
                            self.lchunk).sqrt().item() >= frame_energy_thres:
                        info = [i, j, 0, ispk, iut]
                        self.indices.append(torch.LongTensor(info))
                    duration[spk] += stride / sampling_rate
                    if duration[spk] >= trim:
                        break
                self.indices[-1][2] = 1
            if verbose:
                print()
            self.indices = torch.stack(self.indices)
        print("Time elapsed: {}".format(time.time() - start))

        # Print
        if verbose:
            totalduration = 0
            for key in duration.keys():
                totalduration += duration[key]
            print(
                'Loaded {:s}: {:.1f} h, {:d} spk, {:d} ut, {:d} files, {:d} frames (fet={:.1e},'
                .format(split, totalduration / 3600, len(self.speakers),
                        len(self.utterances), len(self.filenames),
                        len(self.indices), frame_energy_thres),
                end='')
            if trim is None or trim > 1e12:
                print(' no trim)')
            else:
                print(' trim={:.1f}s)'.format(trim))
            if select_speaker is not None:
                print('Selected speaker(s):', select_speaker)
            if select_file is not None:
                print('Selected file(s):', select_file)
Example #6
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 n_audio_channel,
                 path_in,
                 split,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 stride,
                 temp_jitter=False,
                 store_in_ram=False,
                 seed=0,
                 split_utterances=True,
                 pc_split_utterances=0.1,
                 split_speakers=False,
                 pc_split_speakers=0.1,
                 frame_energy_thres=0.025,
                 do_audio_load=True,
                 trim=None,
                 select_speaker=None,
                 select_file=None,
                 verbose=True):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

        # temp_jitter may be True to enable temporal-jitter data augmentation during training
        self.path_in = path_in
        self.split = split
        self.lchunk = segment_length
        self.stride = stride
        self.temp_jitter = temp_jitter
        self.store_in_ram = store_in_ram
        if trim is None or trim <= 0:
            trim = np.inf

        # Get filenames in folder and subfolders
        self.filenames = []
        for dirpath, dirnames, filenames in os.walk(self.path_in):
            for fn in filenames:
                if not fn.endswith(EXTENSION): continue
                new_fn = os.path.join(dirpath, fn)
                new_fn = os.path.relpath(new_fn, self.path_in)
                self.filenames.append(new_fn)
        self.filenames.sort()
        random.seed(seed)
        random.shuffle(self.filenames)

        # Get speakers & utterances
        self.speakers = {}
        self.utterances = {}
        for fullfn in self.filenames:
            spk, ut = self.filename_split(fullfn)
            if spk not in self.speakers:
                self.speakers[spk] = len(self.speakers)
            if ut not in self.utterances:
                self.utterances[ut] = len(self.utterances)
        self.n_max_speakers = len(self.speakers)

        # Split
        lutterances = list(self.utterances.keys())
        lutterances.sort()
        random.shuffle(lutterances)
        lspeakers = list(self.speakers.keys())
        lspeakers.sort()
        random.shuffle(lspeakers)
        isplit_ut = int(len(lutterances) * pc_split_utterances)
        isplit_spk = int(len(lspeakers) * pc_split_speakers)
        if split == 'train':
            spk_del = lspeakers[-2 * isplit_spk:]
            ut_del = lutterances[-2 * isplit_ut:]
        elif split == 'valid':
            spk_del = lspeakers[:-2 * isplit_spk] + lspeakers[-isplit_spk:]
            ut_del = lutterances[:-2 * isplit_ut] + lutterances[-isplit_ut:]
        elif split == 'train+valid':
            spk_del = lspeakers[-isplit_spk:]
            ut_del = lutterances[-isplit_ut:]
        elif split == 'test':
            spk_del = lspeakers[:-isplit_spk]
            ut_del = lutterances[:-isplit_ut]
        else:
            print('Not implemented split', split)
            sys.exit()
        if split_speakers:
            for spk in spk_del:
                del self.speakers[spk]
        if split_utterances:
            for ut in ut_del:
                del self.utterances[ut]

        # Filter filenames by speaker and utterance
        filenames_new = []
        for filename in self.filenames:
            spk, ut = self.filename_split(filename)
            if spk in self.speakers and ut in self.utterances:
                filenames_new.append(filename)
        self.filenames = filenames_new

        # Select speaker
        if select_speaker is not None:
            select_speaker = select_speaker.split(',')
            filenames_new = []
            for filename in self.filenames:
                spk, ut = self.filename_split(filename)
                if spk in select_speaker and spk in self.speakers:
                    filenames_new.append(filename)
            if len(filenames_new) == 0:
                print('\nERROR: Selected an invalid speaker. Options are:',
                      list(self.speakers.keys()))
                sys.exit()
            self.filenames = filenames_new

        # Select specific file
        if select_file is not None:
            select_file = select_file.split(',')
            filenames_new = []
            for filename in self.filenames:
                _, file = os.path.split(filename[:-len(EXTENSION)])
                if file in select_file:
                    filenames_new.append(filename)
            if len(filenames_new) == 0:
                print('\nERROR: Selected an invalid file. Options are:',
                      self.filenames[:int(np.min([50, len(self.filenames)]))],
                      '... (without folder and without extension)')
                sys.exit()
            self.filenames = filenames_new

        # Indices!
        self.audios = [None] * len(self.filenames)
        self.indices = []
        duration = {}
        start = time.time()
        if do_audio_load:
            for i, filename in enumerate(self.filenames):
                if verbose:
                    if i % 1000 == 0:
                        print('Read {} out of {} files'.format(
                            i + 1, len(self.filenames)))
                    # print('\rRead audio {:5.1f}%'.format(
                    #     100 * (i + 1) / len(self.filenames)), end='')
                # Info
                spk, ut = self.filename_split(filename)
                ispk, iut = self.speakers[spk], self.utterances[ut]
                # Load
                if spk not in duration:
                    duration[spk] = 0
                if duration[spk] >= trim:
                    continue
                x = torch.load(os.path.join(self.path_in, filename))
                if self.store_in_ram:
                    self.audios[i] = x.clone()
                x = x.float()
                # Process
                for j in range(0, len(x), stride):
                    if j + self.lchunk >= len(x):
                        continue
                    else:
                        xx = x[j:j + self.lchunk]
                    if (xx.pow(2).sum() /
                            self.lchunk).sqrt().item() >= frame_energy_thres:
                        info = [i, j, 0, ispk, iut]
                        self.indices.append(torch.LongTensor(info))
                    duration[spk] += stride / sampling_rate
                    if duration[spk] >= trim:
                        break
                self.indices[-1][2] = 1
            if verbose:
                print()
            self.indices = torch.stack(self.indices)
        print("Time elapsed: {}".format(time.time() - start))

        # Print
        if verbose:
            totalduration = 0
            for key in duration.keys():
                totalduration += duration[key]
            print(
                'Loaded {:s}: {:.1f} h, {:d} spk, {:d} ut, {:d} files, {:d} frames (fet={:.1e},'
                .format(split, totalduration / 3600, len(self.speakers),
                        len(self.utterances), len(self.filenames),
                        len(self.indices), frame_energy_thres),
                end='')
            if trim is None or trim > 1e12:
                print(' no trim)')
            else:
                print(' trim={:.1f}s)'.format(trim))
            if select_speaker is not None:
                print('Selected speaker(s):', select_speaker)
            if select_file is not None:
                print('Selected file(s):', select_file)

    def filename_split(self, fullfn):
        aux = os.path.split(fullfn)[-1][:-len(EXTENSION)].split('_')
        return aux[0], aux[1]

    def get_mel(self, audio):
        audio = audio.unsqueeze(0)
        audio = torch.autograd.Variable(audio, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_whole_audio(self, idx):
        # Load file
        if self.store_in_ram:
            x = self.audios[idx]
        else:
            x = torch.load(os.path.join(self.path_in, self.filenames[idx]))
        assert x is not None
        x = x.float()
        # Info
        spk, ut = self.filename_split(self.filenames[idx])
        ispk, iut = self.speakers[spk], self.utterances[ut]
        y = torch.LongTensor([ispk])
        ichap = torch.LongTensor([iut])
        last = torch.LongTensor([1])
        return x, y, ichap, last

    def __getitem__(self, index):
        if self.split == 'test':
            return self.get_whole_audio(index)
        i, j, last, ispk, ichap = self.indices[index, :]
        # Load file
        if self.store_in_ram:
            tmp = self.audios[i]
        else:
            tmp = torch.load(os.path.join(self.path_in, self.filenames[i]))
        # Temporal jitter
        if self.temp_jitter:
            j = j + np.random.randint(-self.stride // 2, self.stride // 2)
            if j < 0:
                j = 0
            elif j > len(tmp) - self.lchunk:
                j = np.max([0, len(tmp) - self.lchunk])
        # Get frame
        if j + self.lchunk > len(tmp):
            x = tmp[j:].float()
            x = torch.cat([x, torch.zeros(self.lchunk - len(x))])
        else:
            x = tmp[j:j + self.lchunk].float()
        # Get info
        y = torch.LongTensor([ispk])

        # mel = self.get_mel(x)

        return x, y, ichap, last

    def __len__(self):
        if self.split == 'test':
            return len(self.filenames)
        return self.indices.size(0)
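
A minimal instantiation sketch for the split-aware variant above. It assumes path_in holds pre-processed waveform tensors saved with torch.save, named '<speaker>_<utterance>' plus EXTENSION (both inferred from filename_split and torch.load, so treat them as assumptions), and placeholder hyperparameters.

# Hypothetical setup sketch; all values below are placeholders.
common = dict(n_audio_channel=128, path_in='data/wav_pt', segment_length=16000,
              filter_length=1024, hop_length=256, win_length=1024,
              sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0, stride=8000)
train_set = Mel2Samp(split='train', temp_jitter=True, **common)
valid_set = Mel2Samp(split='valid', **common)

train_loader = torch.utils.data.DataLoader(train_set, batch_size=8, shuffle=True)
x, y, ichap, last = next(iter(train_loader))  # waveform chunk, speaker id, utterance id, last-chunk flag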
Example #7
    parser.add_argument("-d",
                        "--denoiser_strength",
                        default=0.0,
                        type=float,
                        help='Removes model bias. Start with 0.1 and adjust')

    args = parser.parse_args()
    with open(args.config) as f:
        data = f.read()
    config = json.loads(data)
    global data_config
    data_config = config["data_config"]
    data_config['split'] = 'train'
    global squeezewave_config
    squeezewave_config = config['squeezewave_config']

    stft = TacotronSTFT(filter_length=data_config['filter_length'],
                        hop_length=data_config['hop_length'],
                        win_length=data_config['win_length'],
                        sampling_rate=data_config['sampling_rate'],
                        mel_fmin=data_config['mel_fmin'],
                        mel_fmax=data_config['mel_fmax'],
                        n_group=squeezewave_config['n_audio_channel'])

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        os.chmod(args.output_dir, 0o775)

    main(args.squeezewave_path, args.sigma, args.output_dir, args.is_fp16,
         args.denoiser_strength)