Code Example #1
File: ui.py Project: usnistgov/ocr-pipeline
def create_models(dataset_dir):
    """Initialize the app (available for localhost only)

    Parameters:
        dataset_dir (:func:`str`): Path to the training set
    """
    logger.debug("Creating models...")

    if not local_exec:
        logger.error("Models can only be generated locally")
        exit(1)

    # Modify the configuration for local execution
    app_config['root'] = os.environ['ROOT']

    # Generate inline models and train classifier
    denoiser = Denoiser(app_config)

    if not exists(dataset_dir) or not isdir(dataset_dir):
        logger.error(dataset_dir+" is not a valid directory")
        exit(2)

    dataset = [join(dataset_dir, f) for f in listdir(dataset_dir)]

    denoiser.generate_models(dataset)
    logger.info("Inline models generated")

    denoiser.train(dataset)
    logger.info("Classifier trained")
Code Example #2
    def __init__(self,
                 ds_name,
                 ds_path,
                 lr,
                 iterations,
                 batch_size,
                 print_freq,
                 k,
                 eps,
                 is_normalized,
                 adv_momentum,
                 store_adv=None,
                 load_adv_dir=None,
                 load_adv_name=None,
                 load_dir=None,
                 load_name=None,
                 save_dir=None):

        self.data_processor = Preprocessor(ds_name, ds_path, is_normalized)

        # Load Data
        self.train_data, self.test_data, self.N_train, self.N_test = \
            self.data_processor.datasets()
        self.train_loader = DataLoader(self.train_data,
                                       batch_size=batch_size,
                                       shuffle=True)
        self.test_loader = DataLoader(self.test_data, batch_size=batch_size)

        # Other Variables
        self.save_dir = save_dir
        self.store_adv = store_adv

        # Set Model Hyperparameters
        self.learning_rate = lr
        self.iterations = iterations
        self.print_freq = print_freq
        self.cuda = torch.cuda.is_available()

        # Load Model to Conduct Adversarial Training
        adversarial_model = self.load_model(self.cuda, load_adv_dir,
                                            load_adv_name, TEST)
        self.adversarial_generator = Attacks(adversarial_model, eps,
                                             self.N_train, self.N_test,
                                             self.data_processor.get_const(),
                                             adv_momentum, is_normalized,
                                             store_adv)

        # Load Target Model
        self.target_model = self.load_model(self.cuda, load_dir, load_name,
                                            TEST)

        # Load Denoiser
        self.denoiser = Denoiser(x_h=32, x_w=32)
        if self.cuda:  # move to GPU only when CUDA is available
            self.denoiser = self.denoiser.cuda()
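
A minimal instantiation sketch for the constructor above. The class name Trainer and every argument value are illustrative assumptions; only the parameter names come from the signature itself.

trainer = Trainer(ds_name='cifar10',          # hypothetical dataset and paths
                  ds_path='./data',
                  lr=1e-3,
                  iterations=10000,
                  batch_size=128,
                  print_freq=100,
                  k=10,                        # e.g. attack iterations
                  eps=8 / 255,                 # perturbation budget
                  is_normalized=True,
                  adv_momentum=0.9,
                  load_adv_dir='./checkpoints',
                  load_adv_name='source_model.pt',
                  load_dir='./checkpoints',
                  load_name='target_model.pt',
                  save_dir='./runs')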
Code Example #3
def test_reducing_by_stats(audio_file, out_lib):
    y, sr = sf.read(audio_file)
    y_power = Denoiser.reduce_noise_power(y, sr)
    y_cent_s = Denoiser.reduce_noise_centroid_s(y, sr)
    y_cent_mb = Denoiser.reduce_noise_centroid_mb(y, sr)
    y_mfcc_d = Denoiser.reduce_noise_mfcc_down(y, sr)
    y_mfcc_u = Denoiser.reduce_noise_mfcc_up(y, sr)
    sf.write(out_lib + '/power.wav', y_power, sr)
    sf.write(out_lib + '/cent_s.wav', y_cent_s, sr)
    sf.write(out_lib + '/cent_mb.wav', y_cent_mb, sr)
    sf.write(out_lib + '/mfcc_d.wav', y_mfcc_d, sr)
    sf.write(out_lib + '/mfcc_u.wav', y_mfcc_u, sr)
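
A possible invocation of the test above; both paths are placeholder assumptions, and out_lib must be an existing directory:

test_reducing_by_stats('samples/noisy_speech.wav', 'out')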
Code Example #4
def reduce_by_example_to_mp3(audio_file, noise_file, out_file):
    # data1, rate1 = sf.read(audio_file)
    # noise_data1, _ = sf.read(noise_file)

    audio = AudioSegment.from_wav(audio_file)
    noise_audio = AudioSegment.from_wav(noise_file)

    data = Denoiser.seg_to_numpy(audio)
    noise_data = Denoiser.seg_to_numpy(noise_audio)
    rate = audio.frame_rate

    denoised_data = Denoiser.reduce_by_example(data, noise_data, rate)

    denoised_audio = Denoiser.numpy_to_seg_like_seg(denoised_data, audio)
    denoised_audio.export(out_file, format='mp3')
Code Example #5
def main(waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(glob.glob('*.npy')):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.from_numpy(np.load(file_path))
        mel = torch.unsqueeze(mel, 0).cuda()
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze().cpu().numpy()
        audio_path = os.path.join(output_dir, f'waveglow_{file_name}.wav')
        write(audio_path, sampling_rate, audio.astype('int16'))
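
A plausible command-line wrapper for the function above. The flag names are assumptions patterned on typical WaveGlow inference scripts, not confirmed from this project:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--waveglow_path', required=True)
    parser.add_argument('-s', '--sigma', default=1.0, type=float)
    parser.add_argument('-o', '--output_dir', required=True)
    parser.add_argument('--sampling_rate', default=22050, type=int)
    parser.add_argument('--is_fp16', action='store_true')
    parser.add_argument('-d', '--denoiser_strength', default=0.0, type=float,
                        help='Removes model bias; try 0.1 and adjust')
    args = parser.parse_args()

    main(args.waveglow_path, args.sigma, args.output_dir,
         args.sampling_rate, args.is_fp16, args.denoiser_strength)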
Code Example #6
def test_reducing_by_example(audio_file, noise_file, out_file):
    data, rate = sf.read(audio_file)
    noise_data, _ = sf.read(noise_file)

    denoised_audio = Denoiser.reduce_by_example(data, noise_data, rate)

    sf.write(out_file, denoised_audio, rate)
Code Example #7
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
Code Example #8
File: inference.py Project: zwjgit/VocGAN
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = ModifiedGenerator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                              ratios=hp.model.generator_ratio, mult=hp.model.mult,
                              out_band=hp.model.out_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        audio = model.inference(mel)

        audio = audio.squeeze(0)  # collapse all dimension except time axis
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.01)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length*10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

        out_path = args.input.replace('.npy', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
        write(out_path, hp.audio.sampling_rate, audio)
Code Example #9
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()

    model.load_state_dict(checkpoint['model_g'])
    model.eval()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        audio = model(mel)
        # For multi-band inference
        print(audio.shape)
        audio = audio.squeeze(0)  # collapse all dimension except time axis
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.1)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length * 10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

        out_path = args.input.replace(
            '.npy', '_hifi_GAN_epoch%04d.wav' % checkpoint['epoch'])
        write(out_path, hp.audio.sampling_rate, audio)
Code Example #10
def load_waveglow(chk_pt_path):
    waveglow = torch.load(chk_pt_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
    return waveglow, denoiser
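
A short usage sketch for the loader above. The checkpoint name is a placeholder, and mel is assumed to be a 1 x n_mels x T spectrogram tensor; since the model was converted with .half(), the input must be half precision too:

waveglow, denoiser = load_waveglow('waveglow_256channels.pt')
with torch.no_grad():
    audio = waveglow.infer(mel.cuda().half(), sigma=0.666)
    audio = denoiser(audio, 0.01)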
Code Example #11
    def load_model(self):

        ####TODO#### 1. Load the trained model
        # Load the checkpoint path of the trained tacotron model,
        # then load the hparams and state dict into the model
        checkpoint_path = "/home/ubuntu/test/TTS/checkpoint_28000"
        self.model = train.load_model(self.hparams)
        self.model.load_state_dict(
            torch.load(checkpoint_path,
                       map_location=torch.device("cpu"))['state_dict'])

        ####TODO####
        # _ = self.model.cpu().eval().half()
        _ = self.model.cpu().eval()

        # load the waveglow model
        # waveglow_path = "/home/multicam/checkpoints/waveglow.pt"
        waveglow_path = "/home/ubuntu/test/TTS/waveglow.pt"
        self.waveglow = torch.load(waveglow_path,
                                   map_location=torch.device("cpu"))['model']
        self.waveglow.cpu().eval()
        #self.waveglow.cpu().eval().half()
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
Code Example #12
File: txtdenoiser.py Project: mansaj/ocr-pipeline
class TXTDenoiser(Command):
    """Command to clean TXT files
    """
    def __init__(self, filename, logger, config):
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

        self.logger.debug("Denoiser initialized")

    def execute(self):
        """Execute the command
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            # super(TXTDenoiser, self).get_file()

            txt_dir = join(self.unzipped, "txt")
            txt_files = [
                join(txt_dir, f) for f in listdir(txt_dir)
                if isfile(join(txt_dir, f)) and f.endswith(".txt")
            ]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                self.finalize()
                return -1

            text_data = self.denoiser.cleanse(txt_files[0], False)

            # Writing classified lines
            base_filename = splitext(basename(txt_files[0]))[0]
            clean_filename = join(txt_dir, base_filename + ".clean.txt")
            garbage_filename = join(txt_dir, base_filename + ".grbge.txt")
            unclassified_filename = join(txt_dir,
                                         base_filename + ".unclss.txt")

            with codecs.open(clean_filename, "wb",
                             encoding="utf-8") as clean_file:
                for line in text_data.get_clean_lines():
                    clean_file.write(line + "\n")

            with codecs.open(garbage_filename, "wb",
                             encoding="utf-8") as garbage_file:
                for line in text_data.get_garbage_lines():
                    garbage_file.write(line + "\n")

            if len(text_data.get_unclassified_lines()) > 0:
                with codecs.open(unclassified_filename, "wb",
                                 encoding="utf-8") as unclassified_file:
                    for line in text_data.get_unclassified_lines():
                        unclassified_file.write(line + "\n")
        except Exception as e:
            print(e)

            self.logger.error("Cleaner has stopped unexpectedly: " + str(e))
            self.finalize()
            return -2

        self.finalize()
        return 0
Code Example #13
File: time_check.py Project: OSLL/web_speech_trainer
def check_exec_time(data, noise, rate, out_file):
    start_time = time.time()

    denoised = Denoiser.reduce_by_example(data, noise, rate)

    sf.write(out_file, denoised, rate)
    return (len(denoised) / rate, time.time() - start_time)
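
A possible driver for the timing helper above; the file paths are assumptions:

import soundfile as sf

data, rate = sf.read('samples/noisy.wav')
noise, _ = sf.read('samples/noise_profile.wav')
audio_sec, exec_sec = check_exec_time(data, noise, rate, 'denoised.wav')
print('{:.1f}s of audio denoised in {:.1f}s'.format(audio_sec, exec_sec))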
Code Example #14
def generate_from_file(tacotron2_path, waveglow_path, text_file, output_directory):

  # Make synthesis paths

  if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print("Creating directory " + output_directory + "...")

  hparams = create_hparams()
  hparams.sampling_rate = 22050

  print("Loading models...")
  model = load_model(hparams)
  model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
  _ = model.cuda().eval().half()

  waveglow = torch.load(waveglow_path)['model']
  waveglow.cuda().eval().half()
  for k in waveglow.convinv:
      k.float()
  denoiser = Denoiser(waveglow)

  genlist = []
  with open(text_file) as file:
    for line in file:
      genlist.append(line.strip())

  for entry in genlist:
    wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"

    epi = epitran.Epitran('eng-Latn', ligatures=True)
    if hparams.preprocessing == "ipa":
      entry = ipa.convert(english_cleaners(entry))
      foreign_words = re.findall(r"[^ ]{0,}\*", entry)
      for word in foreign_words:
        entry = entry.replace(word, epi.transliterate(word[0:len(word)-1]))
    if hparams.preprocessing == "arpabet":
      entry = make_arpabet(entry)

    # Text sequencer
    if hparams.preprocessing is not None:
      sequence = np.array(text_to_sequence(entry, None))[None, :]
    else:
      sequence = np.array(text_to_sequence(entry, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
      torch.from_numpy(sequence)).cuda().long()

    # Synthesis
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    with torch.no_grad():
      audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    audio_denoised = denoiser(audio, strength=0.01)[:, 0]

    # Save audio
    print ("Saving " + wav_name)
    write(os.path.join(output_directory, wav_name), hparams.sampling_rate, audio_denoised[0].data.cpu().numpy())
Code Example #15
def test_reducing_by_length(audio_file, noise_file, out_lib):
    data, rate = sf.read(audio_file)
    noise, _ = sf.read(noise_file)
    length = noise.shape[0]

    # halve the noise example on each iteration
    for i in range(3):
        cropped_noise = noise[:length >> i]
        denoised = Denoiser.reduce_by_example(data, cropped_noise, rate)
        print(cropped_noise.shape[0] / rate)
        sf.write(out_lib + 'denoised{}.wav'.format(i), denoised, rate)
Code Example #16
File: ui.py Project: williammo2016/ocr-pipeline
def create_models(dataset_dir):
    """Initialize the app (available for localhost only)

    Parameters:
        dataset_dir (:func:`str`): Path to the training set
    """
    logger.debug("Creating models...")

    if not local_exec:
        logger.error("Models can only be generated locally")
        exit(1)

    # Modify the configuration for local execution
    app_config['root'] = os.environ['ROOT']

    # Generate inline models and train classifier
    denoiser = Denoiser(app_config)

    if not exists(dataset_dir) or not isdir(dataset_dir):
        logger.error(dataset_dir + " is not a valid directory")
        exit(2)

    dataset = [join(dataset_dir, f) for f in listdir(dataset_dir)]

    denoiser.generate_models(dataset)
    logger.info("Inline models generated")

    denoiser.train(dataset)
    logger.info("Classifier trained")
Code Example #17
File: txtdenoiser.py Project: pdessauw/ocr-pipeline
class TXTDenoiser(Command):
    """Command to clean TXT files
    """

    def __init__(self, filename, logger, config):
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

    def execute(self):
        """Execute the command
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            super(TXTDenoiser, self).get_file()

            txt_dir = join(self.unzipped, "txt")
            txt_files = [join(txt_dir, f) for f in listdir(txt_dir) if isfile(join(txt_dir, f)) and f.endswith(".txt")]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                self.finalize()
                return -1

            text_data = self.denoiser.cleanse(txt_files[0], False)

            # Writing classified lines
            base_filename = splitext(basename(txt_files[0]))[0]
            clean_filename = join(txt_dir, base_filename+".clean.txt")
            garbage_filename = join(txt_dir, base_filename+".grbge.txt")
            unclassified_filename = join(txt_dir, base_filename+".unclss.txt")

            with codecs.open(clean_filename, "wb", encoding="utf-8") as clean_file:
                for line in text_data.get_clean_lines():
                    clean_file.write(line+"\n")

            with codecs.open(garbage_filename, "wb", encoding="utf-8") as garbage_file:
                for line in text_data.get_garbage_lines():
                    garbage_file.write(line+"\n")

            if len(text_data.get_unclassified_lines()) > 0:
                with codecs.open(unclassified_filename, "wb", encoding="utf-8") as unclassified_file:
                    for line in text_data.get_unclassified_lines():
                        unclassified_file.write(line+"\n")
        except Exception as e:
            print(e)

            self.logger.error("Cleaner has stopped unexpectedly: " + str(e))
            self.finalize()
            return -2

        self.finalize()
        return 0
Code Example #18
    def __init__(self, lang):
        tacotron2 = load_and_setup_model('Tacotron2',
                                         parser,
                                         args.tacotron2,
                                         args.amp_run,
                                         args.cpu_run,
                                         forward_is_infer=True)
        waveglow = load_and_setup_model('WaveGlow',
                                        parser,
                                        args.waveglow,
                                        args.amp_run,
                                        args.cpu_run,
                                        forward_is_infer=True)

        if args.cpu_run:
            denoiser = Denoiser(waveglow, args.cpu_run)
        else:
            denoiser = Denoiser(waveglow, args.cpu_run).cuda()

        jitted_tacotron2 = torch.jit.script(tacotron2)

        self.language = lang
Code Example #19
    def __init__(self):
        hparams = create_hparams()
        hparams.sampling_rate = 22050
        checkpoint_path = constants.TACOTRON_PT
        self.model = load_model(hparams)
        self.model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
        _ = self.model.cuda().eval().half()
        waveglow_path = constants.WAVEGLOW_PT
        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda().eval().half()
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
Code Example #20
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    for m in waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    k = []  # peak amplitude of each generated clip

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        print(torch.min(mel), torch.max(mel))
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)

            k.append(abs(audio).max().item())
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis_sig0.7_d_0.1.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
Code Example #21
File: inference.py Project: SomeUserName1/tacotron2
def main(text):
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.gate_threshold = 0.1
    hparams.max_decoder_steps = 5000

    # #### Load model from checkpoint
    checkpoint_path = "tacotron2_statedict.pt"
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    # #### Load WaveGlow for mel2audio synthesis and denoiser
    waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()

    for m in waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')

    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    # #### Prepare text input
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # #### Decode text input and plot results
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T))

    # #### Synthesize audio from spectrogram using WaveGlow
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    # #### (Optional) Remove WaveGlow bias
    audio_denoised = denoiser(audio, strength=0.01)[:, 0]

    # save
    if os.path.isfile("out.wav"):
        x, sr = librosa.load("out.wav")
        out = np.append(x, audio[0].data.cpu().numpy().astype(np.float32))
    else:
        out = audio[0].data.cpu().numpy().astype(np.float32)

    librosa.output.write_wav('./out.wav', out, 22050)
Code Example #22
File: web_app.py Project: jireh-father/tacotron2
def init_model():
    print("init model!!!!")
    global tacotron2_model
    global waveglow_model
    global denoiser

    tacotron2_path = "outdir_finetune/checkpoint_62500"
    #    tacotron2_path = "outdir_korean/checkpoint_8800"
    #    tacotron2_path = "../models/tacotron2/outdir_korean/checkpoint_25000"
    #    tacotron2_path = "../tacotron2-pytorch/outdir/checkpoint_15000"
    #    tacotron2_path = "../models/tacotron2/outdir_korean/checkpoint_15000"
    #    tacotron2_path = "outdir_lj_korean/checkpoint_5000"
    #    tacotron2_path = "outdir_longtrain/checkpoint_439500"
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    #   waveglow_path = "../waveglow/checkpoints/waveglow_335000"
    # waveglow_path = "../waveglow-fix/checkpoints_longtrain/waveglow_484000"
    sampling_rate = 22050
    denoiser_strength = 0.0
    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    tacotron2_model = load_model(hparams)
    tacotron2_model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = tacotron2_model.cuda().eval().half()

    # with open("waveglow/config.json") as f:
    #     data = f.read()
    # import json
    # config = json.loads(data)
    # waveglow_config = config["waveglow_config"]
    #
    # waveglow_model = glow.WaveGlow(**waveglow_config)
    #
    # checkpoint_dict = torch.load(waveglow_path, map_location='cpu')
    # model_for_loading = checkpoint_dict['model']
    # waveglow_model.load_state_dict(model_for_loading.state_dict())
    #
    # # waveglow_model.load_state_dict(torch.load(waveglow_path)['state_dict'])
    # waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    # waveglow_model.cuda().eval().half()

    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    for k in waveglow_model.convinv:
        k.float()
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow_model)
Code Example #23
File: text2speech.py Project: Harishgeth/cookietts
    def load_waveglow(self, vocoder_path, config_fpath):
        # Load config file
        with open(config_fpath) as f:
            data = f.read()
        config = json.loads(data)
        train_config = config["train_config"]
        data_config = config["data_config"]
        dist_config = config["dist_config"]
        vocoder_config = {
            **config["waveglow_config"],
            'win_length': data_config['win_length'],
            'hop_length': data_config['hop_length']
        }
        print(vocoder_config)
        print(f"Config File from '{config_fpath}' successfully loaded.")

        # import the correct model core
        if self.is_ax(vocoder_config):
            from efficient_model_ax import WaveGlow
        else:
            if vocoder_config["yoyo"]:
                from efficient_model import WaveGlow
            else:
                from glow import WaveGlow

        # initialize model
        print(f"intializing WaveGlow model... ", end="")
        waveglow = WaveGlow(**vocoder_config).cuda()
        print(f"Done!")

        # load checkpoint from file
        print(f"loading WaveGlow checkpoint... ", end="")
        checkpoint = torch.load(vocoder_path)
        waveglow.load_state_dict(
            checkpoint['model']
        )  # and overwrite initialized weights with checkpointed weights
        waveglow.cuda().eval().half(
        )  # move to GPU and convert to half precision
        print(f"Done!")

        print(f"initializing Denoiser... ", end="")
        denoiser = Denoiser(waveglow)
        print(f"Done!")
        vocoder_iters = checkpoint['iteration']
        print(f"WaveGlow trained for {vocoder_iters} iterations")
        speaker_lookup = checkpoint['speaker_lookup']  # ids lookup
        training_sigma = train_config['sigma']

        return waveglow, denoiser, training_sigma, speaker_lookup
Code Example #24
File: inference.py Project: Welsun/chinese_tacotron
def text2audio(waveglow_path, sigma, output_dir, sampling_rate, mel):
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()

    denoiser = Denoiser(waveglow).cuda()

    with torch.no_grad():
        audio = waveglow.infer(mel.cuda(), sigma=sigma)
        # if denoiser_strength > 0:
        #     audio = denoiser(audio, denoiser_strength)
        #audio = audio * MAX_WAV_VALUE
    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    sf.write(os.path.join(output_dir, "pred2.wav"), audio, sampling_rate)
Code Example #25
    def tacotron2_init(self):
        self.plot_wav_data = False
        # set parameters
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        # load tacotron2
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(TACOTRON_CHECKPOINT_FILE)['state_dict'])
        _ = self.model.cuda().eval().half()
        # load waveglow
        self.waveglow = torch.load(WAVEGLOW_CHECKPOINT_FILE)['model']
        self.waveglow.cuda().eval().half()
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
Code Example #26
File: inference.py Project: dodohow1011/waveglow_2
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()

    test_loader = DataLoader(testset,
                             num_workers=0,
                             shuffle=False,
                             sampler=None,
                             batch_size=1,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=collate_fn)
    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
            batch)
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))
        # mel = torch.autograd.Variable(mel.cuda())
        # mel = torch.unsqueeze(mel, 0)
        # mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
            '''if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE'''
        # audio = audio.squeeze()
        # mel = mel.cpu().numpy()
        # audio = audio.astype('int16')
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
Code Example #27
File: inference.py Project: yhgon/SqueezeWave
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    tic_prepare = time.time()
    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path)['model']
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.cuda().eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).cuda()

    toc_prepare = time.time()
    dur_prepare = toc_prepare - tic_prepare
    print("prepare model {:3.2f}sec".format(dur_prepare))

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        tic = time.time()

        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        toc = time.time()
        dur = toc - tic

        audio = audio.squeeze()
        audio = audio.cpu().numpy()

        len_wav = len(audio)
        sec_wav = len_wav / sampling_rate
        samples_sec = len_wav / dur
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_s{}.wav".format(file_name, sigma))
        write(audio_path, sampling_rate, audio)
        print("{} took {:4.3f}sec for {:4.3f}sec of 22kHz audio ({:4.2f}K samples): RTF {:4.3f}, {:4.3f}x realtime, {:4.2f}K samples/sec"
              .format(audio_path, dur, sec_wav, len_wav / 1000, dur / sec_wav, sec_wav / dur, samples_sec / 1000))
Code Example #28
File: inference.py Project: ruaruaruabick/waveglow
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)  # list of test-set mel spectrograms
    waveglow = torch.load(waveglow_path)['model']  # load the model
    waveglow = waveglow.remove_weightnorm(waveglow)  # remove weight normalization
    waveglow.cuda().eval()  # cuda() copies to the GPU; eval() switches to test mode (dropout and BN behave differently than in training)
    # apex acceleration
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")
    # denoiser_strength=0
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()
    for i, file_path in enumerate(mel_files):
        # file_name of the corresponding wav
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # load the mel features (80 filter banks)
        mel = torch.load(file_path)
        # mel={key:mel[key].cuda() for key in mel}
        # wrap the data
        mel = torch.autograd.Variable(mel.cuda())
        # 80 x 375 -> 1 x 80 x 375
        mel = torch.unsqueeze(mel, 0)
        # cast to fp16 so apex can accelerate
        mel = mel.half() if is_fp16 else mel
        # no gradients are tracked during inference
        with torch.no_grad():
            # generates a 1 x 96000 tensor; x is the raw audio, z the mel spectrogram
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            # scale to wav range
            audio = audio * MAX_WAV_VALUE
        # flatten to 1-D
        audio = audio.squeeze()
        # move to the cpu and convert to numpy
        audio = audio.cpu().numpy()
        # cast the dtype
        audio = audio.astype('int16')
        # build the output path
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        # write the audio
        write(audio_path, sampling_rate, audio)
        print(audio_path)
Code Example #29
File: plc_exam.py Project: AugggRush/newPLC
def inference_plc(mel, waveglow, sigma, is_fp16, denoiser_strength):

    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()
    mel = torch.autograd.Variable(mel.cuda())
    mel = mel.half() if is_fp16 else mel
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=sigma)
        if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * MAX_WAV_VALUE
    audio = audio.squeeze()

    return audio
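
A hedged usage sketch, assuming waveglow is already loaded on the GPU, mel is a 1 x n_mels x T tensor, and write is scipy.io.wavfile.write as in the other examples:

audio = inference_plc(mel, waveglow, sigma=0.7, is_fp16=False,
                      denoiser_strength=0.1)
write('plc_out.wav', 22050, audio.cpu().numpy().astype('int16'))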
Code Example #30
    def __init__(self):
        for module_path in './waveglow/', './waveglow/tacotron2':
            if module_path not in sys.path:
                sys.path.insert(0, module_path)

        # Disable deprecation warnings
        import warnings
        warnings.simplefilter('ignore')

        self.waveglow = torch.load('waveglow_256channels_ljs_v2.pt')['model']
        self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
        self.waveglow.cuda().eval()

        from denoiser import Denoiser
        self.denoiser = Denoiser(self.waveglow).cuda()

        # Re-enable warnings
        warnings.resetwarnings()
Code Example #31
def main(tacotron2_path, waveglow_path, sigma, output_dir, sampling_rate,
         denoiser_strength, text, file_idx, inference_name, zip_file, hparams):
    hparams.sampling_rate = sampling_rate

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    random.seed(hparams.seed)

    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow)

    sequence = np.array(text_to_sequence(
        text, ['transliteration_cleaners']))[None, :]
    print(sequence)
    # sequence2 = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
    # sequence3 = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
    # print(np.array_equal(sequence, sequence2))
    # print(np.array_equal(sequence, sequence3))
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    mel_outputs, mel_outputs_postnet2, _, alignments = model.inference(
        sequence)

    MAX_WAV_VALUE = 32768.0
    print(mel_outputs_postnet.cpu().data.numpy()[0][0][:30])
    print(mel_outputs_postnet2.cpu().data.numpy()[0][0][:30])
    if np.array_equal(mel_outputs_postnet.cpu().data.numpy(),
                      mel_outputs_postnet2.cpu().data.numpy()):
        print("same!!")
    else:
        print("different!!")
Code Example #32
    def __init__(self, lang):
        self.language = lang
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        with open('config.json', 'r') as f:
            self.config = json.load(f)

        self.waveglow_path = self.config.get('model').get('waveglow')
        self.waveglow = torch.load(self.waveglow_path)['model']
        self.waveglow.cuda().eval().half()

        for m in self.waveglow.modules():
            if 'Conv' in str(type(m)):
                setattr(m, 'padding_mode', 'zeros')

        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
        self.update_model(lang)
Code Example #33
File: txtdenoiser.py Project: pdessauw/ocr-pipeline
    def __init__(self, filename, logger, config):
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)