Code Example #1
File: augment.py  Project: SJTMusicTeam/mellotron
def agumentation(arpabet_dict,
                 audio_paths,
                 target_spk_id_list,
                 output_path,
                 ljs=False):

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # Step1: Basic setup

    # Choose between the LibriTTS and LJ Speech checkpoints
    if not ljs:
        checkpoint_path = "mellotron_libritts.pt"
    else:
        checkpoint_path = "mellotron_ljs.pt"
    if torch.cuda.is_available():
        tacotron = load_model(hparams).cuda().eval()
    else:
        tacotron = load_model(hparams).eval()
    tacotron.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")['state_dict'])

    waveglow_path = 'waveglow_256channels_v4.pt'
    if torch.cuda.is_available():
        waveglow = torch.load(waveglow_path)['model'].cuda().eval()
        denoiser = Denoiser(waveglow).cuda().eval()
    else:
        waveglow = torch.load(waveglow_path,
                              map_location="cpu")['model'].eval().cpu()
        denoiser = Denoiser(waveglow).eval()

    arpabet_dict = cmudict.CMUDict(arpabet_dict)
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Step2: Load
    # Open the source list once; opening it with mode "w" inside the loop
    # would truncate the file on every iteration.
    source_scp = open(os.path.join(output_path, "source.scp"),
                      "w",
                      encoding="utf-8")
    for file_idx in range(len(dataloader)):

        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
        source_scp.write("{} {}\n".format(file_idx, audio_path))

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :]
        pitch_contour = dataloader[file_idx][3][None]
        if torch.cuda.is_available():
            text_encoded = text_encoded.cuda()
            pitch_contour = pitch_contour.cuda()
        mel = load_mel(audio_path)
        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

        # Step3: Perform speaker transfer
        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = tacotron.forward(
                x)
            rhythm = rhythm.permute(1, 0, 2)

        for spk_id in target_spk_id_list:
            speaker_id = torch.LongTensor([spk_id])

            if torch.cuda.is_available():
                speaker_id = speaker_id.cuda()

            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, _ = tacotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * 0.4,
                     rhythm))

            with torch.no_grad():
                audio = denoiser(
                    waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

            sf.write(
                os.path.join(output_path, "{}-{}.wav".format(file_idx,
                                                             spk_id)),
                audio.detach().cpu().numpy().T, hparams.sampling_rate)

    source_scp.close()
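
A minimal usage sketch for the function above (not from the original project): the dictionary path, filelist and speaker ids are placeholders, and the Mellotron/WaveGlow checkpoints referenced inside the function are assumed to sit in the working directory.

# Hypothetical invocation; all paths and ids below are illustrative.
if __name__ == "__main__":
    agumentation("data/cmu_dictionary",
                 "data/examples_filelist.txt",
                 target_spk_id_list=[0, 1, 2],
                 output_path="augmented_out",
                 ljs=False)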
Code Example #2
def inference(dirname, outdir, checkpoint_path, sentence_list, parallel=False):
    # Load Mellotron
    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    # Load the vocoder
    vocoder = get_vocoder()

    # Load the audio filelist
    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = f'data/{dirname}.txt'
    dataloader = TextMelLoader(audio_paths,
                               hparams,
                               speaker_ids=speaker_id_map)
    os.makedirs(f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}',
                exist_ok=True)

    with open('data/VCTK/speaker-dict.json') as f:
        speakers = json.load(f)

    new_filelist = []
    t0 = time.time()
    cnt = 0
    for file_idx in range(len(dataloader)):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
        if not parallel:
            for sent_txt in sentence_list:
                text = sent_txt

                # get audio path, encoded text, pitch contour and mel for gst
                text_encoded = torch.LongTensor(
                    text_to_sequence(text, hparams.text_cleaners,
                                     arpabet_dict))[None, :].cuda()
                pitch_contour = dataloader[file_idx][3][None].cuda()
                mel = load_mel(audio_path)
                print(audio_path, text)

                # Speaker id
                # speaker_name = os.path.basename(audio_path).split('_')[1]
                # speaker_id = speakers.index(speaker_name)
                speaker_id = int(sid)
                # speaker_id_mapped = speaker_id_map[speaker_id]
                speaker_id = torch.LongTensor([speaker_id]).cuda()

                # Mellotron synthesis
                with torch.no_grad():
                    mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference(
                        (text_encoded, mel, speaker_id, pitch_contour))

                    # Synthesize wav
                    text_save = text[:100] if len(text) > 100 else text
                    sample_name = f'{os.path.splitext(os.path.basename(audio_path))[0]}-{text_save}.wav'
                    vocoder_infer(
                        mel_outputs_postnet, vocoder,
                        f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}'
                    )

                new_filelist.append(
                    f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}\n'
                )
                cnt += 1
        else:
            # get audio path, encoded text, pitch contour and mel for gst
            text_encoded = torch.LongTensor(
                text_to_sequence(text, hparams.text_cleaners,
                                 arpabet_dict))[None, :].cuda()
            pitch_contour = dataloader[file_idx][3][None].cuda()
            mel = load_mel(audio_path)
            print(audio_path, text)

            # Speaker id
            # speaker_name = os.path.basename(audio_path).split('_')[1]
            # speaker_id = speakers.index(speaker_name)
            speaker_id = int(sid)
            speaker_id_mapped = speaker_id_map[speaker_id]
            speaker_id = torch.LongTensor([speaker_id_mapped]).cuda()

            # Mellotron synthesis
            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference(
                    (text_encoded, mel, speaker_id, pitch_contour))

                # Synthesize wav
                text_save = text[:10] if len(text) > 10 else text
                sample_name = f'{os.path.splitext(os.path.basename(audio_path))[0]}-{text_save}.wav'
                vocoder_infer(
                    mel_outputs_postnet, vocoder,
                    f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}'
                )

            new_filelist.append(
                f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}\n'
            )
            cnt += 1

    with open(f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}.txt',
              'w') as f:
        f.writelines(new_filelist)
    t1 = time.time()
    print(f'Average inference time: {(t1 - t0) / cnt:.6f}')
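
A hypothetical call to the routine above, for illustration only: 'valid' assumes a data/valid.txt filelist exists, and the checkpoint and output paths are placeholders.

# Hypothetical invocation; outputs land under synth_out/<checkpoint name>/valid/.
inference(dirname='valid',
          outdir='synth_out',
          checkpoint_path='checkpoints/checkpoint_50000',
          sentence_list=['The birch canoe slid on the smooth planks.'],
          parallel=False)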
Code Example #3
hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
# speaker = "fv02"
checkpoint_path = '/mnt/sdc1/pitchtron/grl_200224/checkpoint_291000'
f0s_meta_path = '/mnt/sdc1/pitchtron/single_init_200123/f0s_combined.txt'
# "models/pitchtron_libritts.pt"
pitchtron = load_model(hparams).cuda().eval()
pitchtron.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = '/home/admin/projects/pitchtron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_pfp_single_sample.txt'
test_set = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(test_set,
                        num_workers=1,
                        shuffle=False,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        drop_last=False,
                        collate_fn=datacollate)
speaker_ids = TextMelLoader(
    "filelists/wav_less_than_12s_158_speakers_train.txt", hparams).speaker_ids
# speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()

# Load mean f0
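
A short, hypothetical continuation (the original snippet goes on to load mean f0 values, which are not shown here): collate one item from the test set and hand it to the model, mirroring the pattern used in Code Example #1.

# Hypothetical sketch: collate a single utterance and parse it into model inputs.
x, y = pitchtron.parse_batch(datacollate([test_set[0]]))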
Code Example #4
    def __init__(self, coordinator, data_paths, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self.data_paths = data_paths
        self.data_path_to_id = {
            data_path: _id
            for _id, data_path in enumerate(data_paths)
        }
        prefixes_dict = {}
        offset_dict = {}
        for data_path in data_paths:
            prefixes = []
            with open(os.path.join(data_path, 'ids.train'), 'r') as fi:
                for line in fi:
                    line = line.strip()
                    if line:
                        prefixes.append(line)
            prefixes_dict[data_path] = prefixes
            offset_dict[data_path] = 0
        self._prefixes_dict = prefixes_dict
        self._offset_dict = offset_dict

        self._placeholders = [
            tf.placeholder(tf.float32, [None, None, hparams.num_labs],
                           'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets'),
            tf.placeholder(tf.string, [None], 'prefixes'),
            tf.placeholder(tf.int32, [None], 'speaker_ids'),
            tf.placeholder(tf.int32, [None], 'target_lengths')
        ]

        # Create queue for buffering data:
        queue = tf.FIFOQueue(8, [
            tf.float32, tf.int32, tf.float32, tf.float32, tf.string, tf.int32,
            tf.int32
        ],
                             name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets, self.prefixes, self.speaker_ids, self.target_lengths = queue.dequeue(
        )
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.linear_targets.set_shape(self._placeholders[3].shape)
        self.prefixes.set_shape(self._placeholders[4].shape)
        self.speaker_ids.set_shape(self._placeholders[5].shape)
        self.target_lengths.set_shape(self._placeholders[6].shape)

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                    % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None
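
The "Load CMUDict" comment above describes randomly substituting training words with their ARPABet transcriptions. A minimal sketch of such a substitution is shown below; maybe_get_arpabet is an assumed helper for illustration, not code from this file.

import random

def maybe_get_arpabet(cmu, word, p=0.5):
    # Swap a word for its ARPABet form about half the time, e.g.
    # 'turtle' -> '{T ER1 T AH0 L}'; curly braces mark ARPABet input to the model.
    pronunciations = cmu.lookup(word)  # list of pronunciations, or None
    if pronunciations is not None and random.random() < p:
        return '{%s}' % pronunciations[0]
    return word

# Example (assuming a local cmudict-0.7b file):
#   cmu = cmudict.CMUDict('cmudict-0.7b', keep_ambiguous=False)
#   text = ' '.join(maybe_get_arpabet(cmu, w) for w in 'the turtle swam'.split())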
Code Example #5
import re
import numpy as np
import music21 as m21
import torch
import torch.nn.functional as F
from text import text_to_sequence, get_arpabet, cmudict

CMUDICT_PATH = "data/cmu_dictionary"
CMUDICT = cmudict.CMUDict(CMUDICT_PATH)
PHONEME2GRAPHEME = {
    'AA': ['a', 'o', 'ah'],
    'AE': ['a', 'e'],
    'AH': ['u', 'e', 'a', 'h', 'o'],
    'AO': ['o', 'u', 'au'],
    'AW': ['ou', 'ow'],
    'AX': ['a'],
    'AXR': ['er'],
    'AY': ['i'],
    'EH': ['e', 'ae'],
    'EY': ['a', 'ai', 'ei', 'e', 'y'],
    'IH': ['i', 'e', 'y'],
    'IX': ['e', 'i'],
    'IY': ['ea', 'ey', 'y', 'i'],
    'OW': ['oa', 'o'],
    'OY': ['oy'],
    'UH': ['oo'],
    'UW': ['oo', 'u', 'o'],
    'UX': ['u'],
    'B': ['b'],
    'CH': ['ch', 'tch'],
    'D': ['d', 'e', 'de'],
Code Example #6
File: datafeeder.py  Project: wql7654/bigdata_exam
    def __init__(self, coordinator, metadata_filename, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._offset = 0

        # Load metadata:
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = []
            for line in f:
                sp = line.strip().split('|')
                if int(sp[2]) >= hparams.outputs_per_step * hparams.max_iters:
                    continue

                try:
                    text_to_sequence(sp[3], self._cleaner_names)
                except:
                    continue

                self._metadata.append(sp)
            # self._metadata = [line.strip().split('|') for line in f]
            hours = sum(
                (int(x[2])
                 for x in self._metadata)) * hparams.frame_shift_ms / (3600 *
                                                                       1000)
            log('Loaded metadata for %d examples (%.2f hours)' %
                (len(self._metadata), hours))

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets')
        ]

        # Create queue for buffering data:
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32],
                             name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue(
        )
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.linear_targets.set_shape(self._placeholders[3].shape)

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                    % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None
Code Example #7
  def __init__(self, coordinator, metadata_filename_pos, metadata_filename_neg, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata:
    # load data from both positive filename and negative filename
    self._datadir = os.path.dirname(metadata_filename_pos)
    #self._datadir_neg = os.path.dirname(metadata_filename_neg)

    with open(metadata_filename_pos, encoding='utf-16') as f:
      self._metadata_pos = [line.strip().split('|') for line in f]
      hours = sum((int(x[2]) for x in self._metadata_pos)) * hparams.frame_shift_ms / (3600 * 1000)
      log('Loaded positive metadata for %d examples (%.2f hours)' % (len(self._metadata_pos), hours))
    
    with open(metadata_filename_neg, encoding='utf-16') as f:
      self._metadata_neg = [line.strip().split('|') for line in f]
      hours = sum((int(x[2]) for x in self._metadata_neg)) * hparams.frame_shift_ms / (3600 * 1000)
      log('Loaded negative metadata for %d examples (%.2f hours)' % (len(self._metadata_neg), hours))
 


    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
      tf.placeholder(tf.int32, [None, None], 'inputs_pos'),
      tf.placeholder(tf.int32, [None], 'input_lengths_pos'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets_pos'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets_pos'),     
      tf.placeholder(tf.int32, [None, None], 'inputs_neg'),
      tf.placeholder(tf.int32, [None], 'input_lengths_neg'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets_neg'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets_neg'),
      tf.placeholder(tf.int32, [None, 4], 'pos_labels'),
      tf.placeholder(tf.int32, [None, 4], 'neg_labels')
    ]
     
    # Create queue for buffering data:
    queue = tf.FIFOQueue(16, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs_pos, self.input_lengths_pos, self.mel_targets_pos, self.linear_targets_pos, \
      self.inputs_neg, self.input_lengths_neg, self.mel_targets_neg, self.linear_targets_neg, \
      self.labels_pos, self.labels_neg = queue.dequeue()
    
    self.inputs_pos.set_shape(self._placeholders[0].shape)
    self.input_lengths_pos.set_shape(self._placeholders[1].shape)
    self.mel_targets_pos.set_shape(self._placeholders[2].shape)
    self.linear_targets_pos.set_shape(self._placeholders[3].shape)
    
    self.inputs_neg.set_shape(self._placeholders[4].shape)
    self.input_lengths_neg.set_shape(self._placeholders[5].shape)
    self.mel_targets_neg.set_shape(self._placeholders[6].shape)
    self.linear_targets_neg.set_shape(self._placeholders[7].shape)

    self.labels_pos.set_shape(self._placeholders[8].shape)
    self.labels_neg.set_shape(self._placeholders[9].shape)


    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
      cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
      if not os.path.isfile(cmudict_path):
        raise Exception('If use_cmudict=True, you must download cmu dictionary first. ' +
          'Run shell as:\n wget -P %s http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b'  % self._datadir)
      self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
      log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
      self._cmudict = None
Code Example #8
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)
    # else:
    #     torch.cuda.set_device('cuda:1')

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    waveglow_path = 'waveglow_256channels_universal_v5.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().float()
    # waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()

    # ---------------------- MELLOTRON CODE BLOCK --------------------------
    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'data/examples_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(hparams.n_frames_per_step)
    file_idx = 0
    audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    def load_mel(path):
        audio, sampling_rate = librosa.core.load(path,
                                                 sr=hparams.sampling_rate)
        audio = torch.from_numpy(audio)
        if sampling_rate != hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, stft.sampling_rate))
        audio_norm = audio.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    mel = load_mel(audio_path)
    print(audio_path, text)
    inference_batch = datacollate([dataloader[file_idx]])

    # ---------------------- MELLOTRON CODE BLOCK (END) --------------------------

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

                    # if not is_overflow and (iteration % 2 == 0):
                    log_audio(model, iteration, logger, waveglow,
                              inference_batch, text_encoded, mel)

            iteration += 1
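
A hypothetical single-GPU launch of the training function above; the directory names and group name are placeholders, not values from the original project.

# Hypothetical entry point; all paths and flags below are illustrative.
if __name__ == '__main__':
    hparams = create_hparams()
    train(output_directory='outdir',
          log_directory='logdir',
          checkpoint_path=None,
          warm_start=False,
          n_gpus=1,
          rank=0,
          group_name='group_name',
          hparams=hparams)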
Code Example #9
File: datafeeder.py  Project: ishine/n_tacotron
    def __init__(self, coordinator, training_path, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._offset = 0
        self._offset_person_id = 0
        self._batch_in_queue = 0
        self._datasets = hparams.datasets

        # Load metadata:
        #self._datadir = os.path.dirname(metadata_filename)
        #with open(metadata_filename, encoding='utf-8') as f:
        #  self._metadata = [line.strip().split('|') for line in f]
        #  hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
        #  log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))
        # self._metadata
        # self._datadir
        self._metadata = []
        global_person_id = 0
        for dataset in self._datasets:
            metadata_filename = os.path.join(training_path, dataset,
                                             'train.txt')
            datadir = os.path.dirname(metadata_filename)
            # exist_person_id maps each dataset-local person_id to a global person id
            exist_person_id = {}
            with open(metadata_filename, encoding='utf-8') as f:
                metadata = [line.strip().split('|') for line in f]
                hours = sum(
                    (int(x[2])
                     for x in metadata)) * hparams.frame_shift_ms / (3600 *
                                                                     1000)
                log('Loaded ' + dataset +
                    ' metadata for %d examples (%.2f hours)' %
                    (len(metadata), hours))
                for item in metadata:
                    #item=[vctk-spec-23918.npy,vctk-mel-23918.npy,329,They say that vital evidence was not heard in court.,60]
                    person_id = item[4]
                    item[0] = os.path.join(datadir, item[0])
                    item[1] = os.path.join(datadir, item[1])
                    if person_id not in exist_person_id:
                        exist_person_id[person_id] = global_person_id
                        global_person_id += 1
                        self._metadata.append([])
                    global_person_id_crrt = exist_person_id[person_id]
                    self._metadata[global_person_id_crrt].append(item)

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets')
        ]

        # Create queue for buffering data:
        queue = tf.FIFOQueue(100, [tf.int32, tf.int32, tf.float32, tf.float32],
                             name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue(
        )
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.linear_targets.set_shape(self._placeholders[3].shape)

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                    % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None
Code Example #10
def test_cmudict_no_keep_ambiguous():
    c = cmudict.CMUDict(io.StringIO(test_data), keep_ambiguous=False)
    assert len(c) == 5
    assert c.lookup('adversity') == ['AE0 D V ER1 S IH0 T IY2']
    assert c.lookup('adverse') is None
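
The test above depends on a test_data fixture that is not shown here. A hypothetical fixture consistent with the assertions (five unambiguous entries plus an ambiguous ADVERSE that is dropped when keep_ambiguous=False) could look like this:

# Hypothetical fixture in cmudict-0.7b format (word, two spaces, phones);
# the real test_data is defined elsewhere in the test module.
test_data = '''ADVERSE  AE0 D V ER1 S
ADVERSE(1)  AE2 D V ER1 S
ADVERSITY  AE0 D V ER1 S IH0 T IY2
BARBECUE  B AA1 R B IH0 K Y UW2
CANOE  K AH0 N UW1
DICTIONARY  D IH1 K SH AH0 N EH2 R IY0
TURTLE  T ER1 T AH0 L
'''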
Code Example #11
    def __init__(self, coordinator, metadata_filename, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self.train_offset = 0
        self.test_offset = 0

        # Load metadata:
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename),
                                     'mels')
        self._datadir = os.path.dirname(metadata_filename)
        self._linear_dir = os.path.join(os.path.dirname(metadata_filename),
                                        'linear')
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = sum(
                (int(x[4]) for x in self._metadata)) * frame_shift_ms / 3600
            log('Loaded metadata for %d examples (%.2f hours)' %
                (len(self._metadata), hours))

        #Train test split
        if hparams.gst_test_size is None:
            assert hparams.gst_test_batches is not None

        test_size = (hparams.gst_test_size if hparams.gst_test_size is not None
                     else hparams.gst_test_batches * hparams.batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(
            indices,
            test_size=test_size,
            random_state=hparams.gst_data_random_state)

        #Make sure test_indices is a multiple of batch_size else round up
        len_test_indices = _round_up(len(test_indices), hparams.batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])

        self.test_steps = len(self._test_meta) // hparams.batch_size

        if hparams.gst_test_size is None:
            assert hparams.gst_test_batches == self.test_steps

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32,
                           shape=(None, None),
                           name='token_targets'),
            tf.placeholder(tf.float32,
                           shape=(None, None, hparams.num_freq),
                           name='linear_targets'),
            tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'),
        ]

        # Create queue for buffering data:
        queue = tf.FIFOQueue(
            8,
            [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
            name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths = queue.dequeue(
        )

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)
        self.targets_lengths.set_shape(self._placeholders[5].shape)

        # Create eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(
            1,
            [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
            name='eval_queue')
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \
         self.eval_linear_targets, self.eval_targets_lengths = eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_linear_targets.set_shape(self._placeholders[4].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[5].shape)

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(os.path.dirname(metadata_filename),
                                        'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download cmu dictionary first. '
                    +
                    'Run shell as:\n wget -P %s http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b'
                    % self._datadir)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None