def __init__(self, wspecifier, filetype='mat',
             write_num_frames=None, compress=False, compression_method=2):
    self.writer_scp = None  # Used for writing scp
    self.filename = None
    self.filetype = filetype
    self.kwargs = {}

    if filetype == 'mat':
        if compress:
            self.writer = kaldiio.WriteHelper(
                wspecifier, compression_method=compression_method)
        else:
            self.writer = kaldiio.WriteHelper(wspecifier)

    elif filetype in ['hdf5', 'sound.hdf5']:
        # ark,scp:out.ark,out.scp -> {'ark': 'out.ark', 'scp': 'out.scp'}
        ark_scp, filepath = wspecifier.split(':', 1)
        if ark_scp not in ['ark', 'scp,ark', 'ark,scp']:
            raise ValueError(
                '{} is not allowed: {}'.format(ark_scp, wspecifier))
        ark_scps = ark_scp.split(',')
        filepaths = filepath.split(',')
        if len(ark_scps) != len(filepaths):
            raise ValueError(
                'Mismatch: {} and {}'.format(ark_scp, filepath))
        spec_dict = dict(zip(ark_scps, filepaths))

        if filetype == 'sound.hdf5':
            self.writer = SoundHDF5File(spec_dict['ark'], 'w')
        else:
            self.writer = h5py.File(spec_dict['ark'], 'w')
        self.filename = spec_dict['ark']
        if 'scp' in spec_dict:
            self.writer_scp = io.open(
                spec_dict['scp'], 'w', encoding='utf-8')
    else:
        raise ValueError('Not supporting: filetype={}'.format(filetype))

    if write_num_frames is not None:
        if ':' not in write_num_frames:
            raise ValueError('Must include ":", write_num_frames={}'
                             .format(write_num_frames))
        nframes_type, nframes_file = write_num_frames.split(':', 1)
        if nframes_type != 'ark,t':
            raise ValueError(
                'Only supporting text mode. '
                'e.g. --write-num-frames=ark,t:foo.txt :'
                '{}'.format(nframes_type))
        self.writer_nframe = io.open(nframes_file, 'w', encoding='utf-8')
    else:
        self.writer_nframe = None
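# For reference, a standalone illustration of the wspecifier parsing above
# (the 'ark,scp:...' string is a hypothetical example): the comma-separated
# types pair up positionally with the comma-separated file paths.
wspecifier = 'ark,scp:out.ark,out.scp'
ark_scp, filepath = wspecifier.split(':', 1)
spec_dict = dict(zip(ark_scp.split(','), filepath.split(',')))
print(spec_dict)  # -> {'ark': 'out.ark', 'scp': 'out.scp'}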
def __init__(
    self, wspecifier, write_num_frames=None, compress=False, compression_method=2
):
    if compress:
        self.writer = kaldiio.WriteHelper(
            wspecifier, compression_method=compression_method
        )
    else:
        self.writer = kaldiio.WriteHelper(wspecifier)
    self.writer_scp = None
    if write_num_frames is not None:
        self.writer_nframe = get_num_frames_writer(write_num_frames)
    else:
        self.writer_nframe = None
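# A minimal round-trip sketch of the kaldiio calls used above (file names are
# hypothetical). An 'ark,scp:' wspecifier writes the binary archive and its
# scp index in one pass; compression_method=2 matches the compress branch.
import numpy as np
import kaldiio

with kaldiio.WriteHelper('ark,scp:feats.ark,feats.scp',
                         compression_method=2) as writer:
    writer['utt1'] = np.random.randn(100, 40).astype(np.float32)
    writer['utt2'] = np.random.randn(80, 40).astype(np.float32)

with kaldiio.ReadHelper('scp:feats.scp') as reader:
    for key, mat in reader:
        print(key, mat.shape)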
def test_load_inputs_and_targets_legacy_format(tmpdir):
    # batch = [("F01_050C0101_PED_REAL",
    #           {"input": [{"feat": "some/path.ark:123"}],
    #            "output": [{"tokenid": "1 2 3 4"}],
    ark = str(tmpdir.join('test.ark'))
    scp = str(tmpdir.join('test.scp'))

    desire_xs = []
    desire_ys = []
    with kaldiio.WriteHelper('ark,scp:{},{}'.format(ark, scp)) as f:
        for i in range(10):
            x = np.random.random((100, 100)).astype(np.float32)
            uttid = 'uttid{}'.format(i)
            f[uttid] = x
            desire_xs.append(x)
            desire_ys.append(np.array([1, 2, 3, 4]))

    batch = []
    with open(scp, 'r') as f:
        for line in f:
            uttid, path = line.strip().split()
            batch.append((uttid,
                          {'input': [{'feat': path,
                                      'name': 'input1'}],
                           'output': [{'tokenid': '1 2 3 4',
                                       'name': 'target1'}]}))

    load_inputs_and_targets = LoadInputsAndTargets()
    xs, ys = load_inputs_and_targets(batch)
    for x, xd in zip(xs, desire_xs):
        np.testing.assert_array_equal(x, xd)
    for y, yd in zip(ys, desire_ys):
        np.testing.assert_array_equal(y, yd)
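# Note on the commented batch example above: a feat path like
# "some/path.ark:123" is an archive path plus a byte offset, which is exactly
# what WriteHelper puts into the scp index. A single entry can be loaded
# directly (the path here is hypothetical):
import kaldiio

mat = kaldiio.load_mat('some/path.ark:123')  # one matrix from that offset
print(mat.shape)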
def feats_scp(tmp_path):
    p = tmp_path / "feats.scp"
    p2 = tmp_path / "feats.ark"
    with kaldiio.WriteHelper(f"ark,scp:{p2},{p}") as w:
        w["a"] = np.random.randn(100, 80)
        w["b"] = np.random.randn(150, 80)
    return str(p)
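# A sketch of a test consuming this fixture (the test name is hypothetical):
# kaldiio.load_scp returns a lazy dict-like view over the scp index.
import kaldiio

def test_feats_scp_shapes(feats_scp):
    feats = kaldiio.load_scp(feats_scp)
    assert feats['a'].shape == (100, 80)
    assert feats['b'].shape == (150, 80)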
def generate_json_data(config, mode, nexamples):
    """Generate Json data for test."""
    # pylint: disable=too-many-locals
    tmpdir = Path(tempfile.mkdtemp())
    ark = str(tmpdir.joinpath('test.ark'))
    scp = str(tmpdir.joinpath('test.scp'))
    ilens = 100
    nfeat = 40

    desire_xs = []
    desire_ilens = []
    desire_ys = []
    desire_olens = []
    with kaldiio.WriteHelper('ark,scp:{},{}'.format(ark, scp)) as out_f:
        for i in range(nexamples):
            # pylint: disable=invalid-name
            x = np.random.random((ilens, nfeat)).astype(np.float32)
            uttid = 'uttid{}'.format(i)
            out_f[uttid] = x
            desire_xs.append(x)
            desire_ilens.append(ilens)
            desire_ys.append(np.array([1, 2, 3, 10]))
            desire_olens.append(4)

    dummy_json = {}
    dummy_json['utts'] = {}
    with open(scp, 'r') as out_f:
        for line in out_f:
            uttid, path = line.strip().split()
            dummy_json['utts'][uttid] = {
                'input': [{
                    'feat': path,
                    'name': 'input1',
                    'shape': [ilens, nfeat]
                }],
                'output': [{
                    'tokenid': '1 2 3 10',
                    'name': 'output1',
                    'shape': [4, 10]
                }]
            }

    path = tmpdir.joinpath('{}.json'.format(mode))
    path.touch(exist_ok=True)
    path = str(path.resolve())
    with open(path, 'w') as out_f:
        json.dump(dummy_json, out_f, cls=JsonNumpyEncoder)
    config['data'][mode]['paths'] = [path]
    return desire_xs, desire_ilens, desire_ys, desire_olens
def main(args):
    if args['--datadir']:
        data_dir = args['--datadir']
    else:
        data_dir = hp.data.eval_path
    device = torch.device(hp.device)
    print('[INFO] device: %s' % device)
    dataset_name = os.path.basename(os.path.normpath(data_dir))
    print('[INFO] dataset: %s' % dataset_name)

    # Load model
    embed_net = SpeechEmbedder().to(device)
    embed_net.load_state_dict(torch.load(hp.model.model_path))
    embed_net.eval()

    # Features
    eval_gen = DL.ARKUtteranceGenerator(data_dir, apply_vad=True)
    eval_loader = DataLoader(eval_gen, batch_size=hp.test.M, shuffle=False,
                             num_workers=hp.test.num_workers, drop_last=False)

    dwriter = kaldiio.WriteHelper('ark,scp:%s_dvecs.ark,%s_dvecs.scp'
                                  % (dataset_name, dataset_name))
    cnt = 0
    processed = []
    for key_bt, feat_bt in eval_loader:
        feat_bt = feat_bt.to(device)
        t_start = time.time()
        # feat dim [M_files, n_chunks_in_file, frames, n_mels]
        mf, nchunks, frames, nmels = feat_bt.shape
        print(feat_bt.shape)
        stack_shape = (mf * nchunks, frames, nmels)
        feat_stack = torch.reshape(feat_bt, stack_shape)
        dvec_stack = embed_net(feat_stack)
        dvec_bt = torch.reshape(
            dvec_stack, (mf, dvec_stack.size(0) // mf, dvec_stack.size(1)))
        for key, dvec in zip(key_bt, dvec_bt):
            mean_dvec = torch.mean(dvec, dim=0).detach()
            mean_dvec = mean_dvec.cpu().numpy()
            dwriter(key, mean_dvec)
            processed.append(key)
            print('%d. Processed: %s' % (cnt, key))
            cnt += 1
        t_end = time.time()
        print('Elapsed: %.4f' % (t_end - t_start))
def predict(self, input_source, input_reference):
    """Compute prediction"""
    # inference
    out_dir = Path(tempfile.mkdtemp())
    out_path = out_dir / Path(
        os.path.basename(str(input_source)).split(".")[0] + "_converted_gen.wav"
    )
    src_wav_path = input_source
    ref_wav_path = input_reference

    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir) + "/feats.1")
    )
    src_mel, src_lf0 = extract_logmel(src_wav_path, self.mean, self.std)
    ref_mel, _ = extract_logmel(ref_wav_path, self.mean, self.std)
    src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(self.device)
    src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(self.device)
    ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(self.device)
    out_filename = os.path.basename(src_wav_path).split(".")[0]
    with torch.no_grad():
        z, _, _, _ = self.encoder.encode(src_mel)
        lf0_embs = self.encoder_lf0(src_lf0)
        spk_emb = self.encoder_spk(ref_mel)
        output = self.decoder(z, lf0_embs, spk_emb)
        feat_writer[out_filename + "_converted"] = output.squeeze(0).cpu().numpy()
        feat_writer[out_filename + "_source"] = src_mel.squeeze(0).cpu().numpy().T
        feat_writer[out_filename + "_reference"] = (
            ref_mel.squeeze(0).cpu().numpy().T
        )
    feat_writer.close()

    print("synthesize waveform...")
    cmd = [
        "parallel-wavegan-decode",
        "--checkpoint",
        "./vocoder/checkpoint-3000000steps.pkl",
        "--feats-scp",
        f"{str(out_dir)}/feats.1.scp",
        "--outdir",
        str(out_dir),
    ]
    subprocess.call(cmd)
    return out_path
def __init__(
    self,
    storage_path: Pathlike,
    compression_method: Optional[int] = None,
    *args,
    **kwargs,
):
    if not is_module_available("kaldiio"):
        raise ValueError(
            "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
    import kaldiio

    super().__init__()
    self.storage_dir = Path(storage_path)
    self.storage_dir.mkdir(parents=True, exist_ok=True)
    self.storage_path_ = str(self.storage_dir / "feats.scp")
    self.storage = kaldiio.WriteHelper(
        f"ark,scp:{self.storage_dir}/feats.ark,{self.storage_dir}/feats.scp",
        compression_method=compression_method,
    )
def main(config, args):
    device = set_device(1)

    # g-vector extractor
    model = get_instance(module_model, config['model'])
    chkpt = torch.load(args.resume)
    try:
        # checkpoint may wrap the state dict under the 'model' key
        model.load_state_dict(chkpt['model'])
    except Exception:
        model.load_state_dict(chkpt)
    model = model.to(device)

    config['dataset']['args']['wav_scp'] = os.path.join(args.data, 'wav.scp')
    config['dataset']['args']['utt2spk'] = None
    testset = get_instance(dataset, config['dataset'])
    testloader = DataLoader(testset, batch_size=1, shuffle=False,
                            num_workers=4, drop_last=False)

    model.eval()
    utt2embd = {}
    for i, (utt, data) in enumerate(tqdm(testloader, ncols=80)):
        utt = utt[0]
        data = data.float().to(device)
        with torch.no_grad():
            embd = model.extractor(data)
        embd = embd.squeeze(0).cpu().numpy()
        utt2embd[utt] = embd

    embd_wfile = 'ark,scp:{0}/embedding.ark,{0}/embedding.scp'.format(
        args.data)
    with kaldiio.WriteHelper(embd_wfile) as writer:
        for utt, embd in utt2embd.items():
            writer(utt, embd)
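# Note the call form writer(utt, embd) above: per the kaldiio README,
# WriteHelper supports both calling and item assignment, so the loop could
# equivalently use writer[utt] = embd. A minimal sketch (paths hypothetical):
import numpy as np
import kaldiio

embd = np.random.randn(512).astype(np.float32)
with kaldiio.WriteHelper('ark,scp:embedding.ark,embedding.scp') as writer:
    writer('utt1', embd)    # call form, as in the loop above
    writer['utt2'] = embd   # equivalent item-assignment form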
def recog(args):
    """Decode with the given args.

    Args:
        args (namespace): The program arguments.
    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr', load_output=True, sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})

    import kaldiio
    import time
    with torch.no_grad(), \
            kaldiio.WriteHelper('ark,scp:{o}.ark,{o}.scp'.format(o=args.out)) as f:
        ys = []
        xs = []
        for idx, utt_id in enumerate(js.keys()):
            logging.info('(%d/%d) decoding ' + utt_id, idx, len(js.keys()))
            batch = [(utt_id, js[utt_id])]
            data = load_inputs_and_targets(batch)
            feat = data[0][0]
            ys.append(data[1][0])

            # decode and write
            start_time = time.time()
            # include the inference here
            # have the layer specification here
            # skeleton: model.inference(x, args, layer)
            scores, outs = model.inference(feat, ys, args, train_args.char_list)
            xs.append(scores)
            logging.info("inference speed = %s msec / frame." % (
                (time.time() - start_time) * 1000 / int(outs.size(0))))
            logging.info('(%d/%d) %s (size:%d->%d)' % (
                idx + 1, len(js.keys()), utt_id, len(feat), outs.size(0)))
            f[utt_id] = outs.cpu().numpy()

    from espnet.nets.pytorch_backend.nets_utils import th_accuracy
    preds = torch.stack(xs).view(len(xs), -1)
    labels = torch.LongTensor(ys).view(len(xs), 1)
    acc = th_accuracy(preds, labels, -1)
    logging.warning("Final acc is (%.2f)" % (acc * 100))
def decode(args):
    """Decode with the given args.

    :param Namespace args: The program arguments
    """
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('args: ' + key + ': ' + str(vars(args)[key]))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='tts', load_input=False, sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf)

    with torch.no_grad(), kaldiio.WriteHelper(
            'ark,scp:{o}.ark,{o}.scp'.format(o=args.out)) as f:
        for idx, utt_id in enumerate(js.keys()):
            batch = [(utt_id, js[utt_id])]
            with using_transform_config({'train': False}):
                data = load_inputs_and_targets(batch)
            if train_args.use_speaker_embedding:
                spemb = data[1][0]
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None
            x = data[0][0]
            x = torch.LongTensor(x).to(device)

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)."
                                % utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)'
                % (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            f[utt_id] = outs.cpu().numpy()
def main(argv):
    """Extract utterance-level and speaker-level x-vectors."""
    parser = get_parser()
    args = parser.parse_args(argv)

    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    if torch.cuda.is_available() and ("cuda" in args.device):
        device = args.device
    else:
        device = "cpu"

    if args.toolkit == "speechbrain":
        from speechbrain.dataio.preprocess import AudioNormalizer
        from speechbrain.pretrained import EncoderClassifier

        # Prepare spk2utt for mean x-vector
        spk2utt = dict()
        with open(os.path.join(args.in_folder, "spk2utt"), "r") as reader:
            for line in reader:
                details = line.split()
                spk2utt[details[0]] = details[1:]

        # TODO(nelson): The model inference can be moved into a function.
        classifier = EncoderClassifier.from_hparams(
            source=args.pretrained_model, run_opts={"device": device})
        audio_norm = AudioNormalizer()

        wav_scp = SoundScpReader(os.path.join(args.in_folder, "wav.scp"))
        os.makedirs(args.out_folder, exist_ok=True)
        writer_utt = kaldiio.WriteHelper(
            "ark,scp:{0}/xvector.ark,{0}/xvector.scp".format(args.out_folder))
        writer_spk = kaldiio.WriteHelper(
            "ark,scp:{0}/spk_xvector.ark,{0}/spk_xvector.scp".format(
                args.out_folder))

        for speaker in tqdm(spk2utt):
            xvectors = list()
            for utt in spk2utt[speaker]:
                in_sr, wav = wav_scp[utt]
                # Amplitude normalization to -1 ~ 1
                amax = np.amax(np.absolute(wav))
                wav = wav.astype(np.float32) / amax
                # Frequency normalization
                wav = audio_norm(torch.from_numpy(wav), in_sr).to(device)
                # X-vector embedding
                embeds = classifier.encode_batch(wav).detach().cpu().numpy()[0]
                writer_utt[utt] = np.squeeze(embeds)
                xvectors.append(embeds)
            # Speaker-level mean over the utterance x-vectors
            embeds = np.mean(np.stack(xvectors, 0), 0)
            writer_spk[speaker] = embeds
        writer_utt.close()
        writer_spk.close()
    elif args.toolkit == "espnet":
        raise NotImplementedError(
            "Follow details at: https://github.com/espnet/espnet/issues/3040")
    else:
        raise ValueError(
            "Unknown type of toolkit. Only supported: speechbrain, espnet, kaldi")
def decode(args):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('args: ' + key + ': ' + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='tts', load_input=False, sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )

    # define function for plot prob and att_ws
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt

        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights,
            # whose shape is (#layers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]),
                       dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Support only from 1D to 4D array.")
        plt.tight_layout()
        if not os.path.exists(os.path.dirname(figname)):
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(os.path.dirname(figname), exist_ok=True)
        plt.savefig(figname)
        plt.close()

    with torch.no_grad(), \
            kaldiio.WriteHelper('ark,scp:{o}.ark,{o}.scp'.format(o=args.out)) as f:
        for idx, utt_id in enumerate(js.keys()):
            batch = [(utt_id, js[utt_id])]
            data = load_inputs_and_targets(batch)
            if train_args.use_speaker_embedding:
                spemb = data[1][0]
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None
            x = data[0][0]
            x = torch.LongTensor(x).to(device)

            # decode and write
            start_time = time.time()
            outs, probs, att_ws = model.inference(x, args, spemb=spemb)
            logging.info("inference speed = %s msec / frame." % (
                (time.time() - start_time) * 1000 / int(outs.size(0))))
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)."
                                % utt_id)
            logging.info('(%d/%d) %s (size:%d->%d)' % (
                idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            f[utt_id] = outs.cpu().numpy()

            # plot prob and att_ws
            if probs is not None:
                _plot_and_save(probs.cpu().numpy(),
                               os.path.dirname(args.out)
                               + "/probs/%s_prob.png" % utt_id)
            if att_ws is not None:
                _plot_and_save(att_ws.cpu().numpy(),
                               os.path.dirname(args.out)
                               + "/att_ws/%s_att_ws.png" % utt_id)
"""Example of loading a pre-trained APC model.""" import torch from apc_model import APCModel from utils import PrenetConfig, RNNConfig # added by Sameer import kaldiio import sys feats_scp = sys.argv[1] segments = sys.argv[2] scp_file = sys.argv[3] ark_file = scp_file.replace('.scp', '.ark') writer = kaldiio.WriteHelper('ark,scp:%s,%s' % (ark_file, scp_file)) if segments: reader = kaldiio.ReadHelper('scp:%s' % feats_scp, segments=segments) else: reader = kaldiio.ReadHelper('scp:%s' % feats_scp) def main(): prenet_config = None rnn_config = RNNConfig(input_size=80, hidden_size=512, num_layers=3, dropout=0., residual=True) # Sameer Added residual=True pretrained_apc = APCModel(mel_dim=80,
def decode(args, teacher_args):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    train_args.encoder_resume = None

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args, args, teacher_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    if args.use_amp:
        checkpoint = torch.load(args.model)
        model.load_state_dict(checkpoint['model'])
    else:
        torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    from io_utils_fcl import LoadInputsAndTargets
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="tts",
        load_input=False,
        sort_in_input_length=False,
        use_second_target=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
        pad_eos=args.pad_eos,
    )

    os.makedirs(os.path.dirname(args.out), exist_ok=True)

    # define writer instance
    feat_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(o=args.out))

    inference_speeds = []
    # start decoding
    for idx, utt_id in enumerate(js.keys()):
        # setup inputs
        batch = [(utt_id, js[utt_id])]
        data = load_inputs_and_targets(batch)
        x = torch.LongTensor(data[0][0]).to(device)
        spemb = None
        if train_args.use_speaker_embedding:
            spemb = torch.FloatTensor(data[1][0]).to(device)

        # decode and write
        start_time = time.time()
        outs = model.inference(x, args, spemb=spemb)
        inference_speed = int(outs.size(0)) / (time.time() - start_time)
        inference_speeds.append(inference_speed)
        logging.info("inference speed = %.1f frames / sec." % inference_speed)
        feat_writer[utt_id] = outs.cpu().numpy()

    avg_infer_speed = sum(inference_speeds) / (idx + 1)
    logging.info("average inference speed = %.1f frames / sec." % avg_infer_speed)
    exp_name = args.model.split('/')[-3]
    with open(f'{exp_name}.txt', 'w') as fp:
        fp.write(str(avg_infer_speed))

    # close file object
    feat_writer.close()
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file, device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if "n_fft" in vocoder_conf and "n_shift" in vocoder_conf and "fs" in vocoder_conf:
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")

    # 5. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)

    # FIXME(kamo): I think we shouldn't depend on kaldi-format any more.
    #  How about numpy or HDF5?
    # >>> with NpyScpWriter() as f:
    with kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=output_dir / "norm/feats")
    ) as f, kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=output_dir / "denorm/feats")
    ) as g:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            batch = to_device(batch, device)
            key = keys[0]
            # Change to single sequence and remove *_length
            # because inference() requires 1-seq, not mini-batch.
            _data = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
            start_time = time.perf_counter()

            # TODO(kamo): Now att_ws is not used.
            outs, probs, att_ws = tts.inference(
                **_data,
                threshold=threshold,
                maxlenratio=maxlenratio,
                minlenratio=minlenratio,
            )
            outs_denorm = normalize.inverse(outs[None])[0][0]
            insize = next(iter(_data.values())).size(0)
            logging.info(
                "inference speed = {} msec / frame.".format(
                    (time.perf_counter() - start_time) * 1000 / int(outs.size(0))
                )
            )
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")
            f[key] = outs.cpu().numpy()
            g[key] = outs_denorm.cpu().numpy()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs, "PCM_16")
def convert(args):
    src_wav_path = args.source_wav
    ref_wav_path = args.reference_wav
    out_dir = args.converted_wav_path
    os.makedirs(out_dir, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(in_channels=80, channels=512, n_embeddings=512,
                      z_dim=64, c_dim=256)
    encoder_lf0 = Encoder_lf0()
    encoder_spk = Encoder_spk()
    decoder = Decoder_ac(dim_neck=64)
    encoder.to(device)
    encoder_lf0.to(device)
    encoder_spk.to(device)
    decoder.to(device)

    checkpoint_path = args.model_path
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    encoder_spk.load_state_dict(checkpoint["encoder_spk"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    encoder_spk.eval()
    decoder.eval()

    mel_stats = np.load('./mel_stats/stats.npy')
    mean = mel_stats[0]
    std = mel_stats[1]
    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir) + '/feats.1'))

    src_mel, src_lf0 = extract_logmel(src_wav_path, mean, std)
    ref_mel, _ = extract_logmel(ref_wav_path, mean, std)
    src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(device)
    src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(device)
    ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device)
    out_filename = os.path.basename(src_wav_path).split('.')[0]
    with torch.no_grad():
        z, _, _, _ = encoder.encode(src_mel)
        lf0_embs = encoder_lf0(src_lf0)
        spk_emb = encoder_spk(ref_mel)
        output = decoder(z, lf0_embs, spk_emb)
        feat_writer[out_filename + '_converted'] = output.squeeze(0).cpu().numpy()
        feat_writer[out_filename + '_source'] = src_mel.squeeze(0).cpu().numpy().T
        feat_writer[out_filename + '_reference'] = ref_mel.squeeze(0).cpu().numpy().T
    feat_writer.close()

    print('synthesize waveform...')
    cmd = ['parallel-wavegan-decode',
           '--checkpoint', './vocoder/checkpoint-3000000steps.pkl',
           '--feats-scp', f'{str(out_dir)}/feats.1.scp',
           '--outdir', str(out_dir)]
    subprocess.call(cmd)
def convert(cfg):
    src_wav_paths = glob(
        '/Dataset/VCTK-Corpus/wav48_silence_trimmed/p225/*mic1.flac'
    )  # modified to absolute wavs path; can select any unseen speakers
    src_wav_paths = select_wavs(src_wav_paths)
    tar1_wav_paths = glob(
        '/Dataset/VCTK-Corpus/wav48_silence_trimmed/p231/*mic1.flac'
    )  # can select any unseen speakers
    tar2_wav_paths = glob(
        '/Dataset/VCTK-Corpus/wav48_silence_trimmed/p243/*mic1.flac'
    )  # can select any unseen speakers
    # tar1_wav_paths = select_wavs(tar1_wav_paths)
    # tar2_wav_paths = select_wavs(tar2_wav_paths)
    tar1_wav_paths = [sorted(tar1_wav_paths)[0]]
    tar2_wav_paths = [sorted(tar2_wav_paths)[0]]
    print('len(src):', len(src_wav_paths),
          'len(tar1):', len(tar1_wav_paths),
          'len(tar2):', len(tar2_wav_paths))

    tmp = cfg.checkpoint.split('/')
    steps = tmp[-1].split('-')[-1].split('.')[0]
    out_dir = f'test/{tmp[-3]}-{tmp[-2]}-{steps}'
    out_dir = Path(utils.to_absolute_path(out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(**cfg.model.encoder)
    encoder_lf0 = Encoder_lf0()
    encoder_spk = Encoder_spk()
    decoder = Decoder_ac(dim_neck=64)
    encoder.to(device)
    encoder_lf0.to(device)
    encoder_spk.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    encoder_spk.load_state_dict(checkpoint["encoder_spk"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    encoder_spk.eval()
    decoder.eval()

    mel_stats = np.load('./data/mel_stats.npy')
    mean = mel_stats[0]
    std = mel_stats[1]
    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=str(out_dir) + '/feats.1'))
    for i, src_wav_path in tqdm(enumerate(src_wav_paths, 1)):
        if i > 10:
            break
        mel, lf0 = extract_logmel(src_wav_path, mean, std)
        if i % 2 == 1:
            ref_wav_path = random.choice(tar2_wav_paths)
            tar = 'tarMale_'
        else:
            ref_wav_path = random.choice(tar1_wav_paths)
            tar = 'tarFemale_'
        ref_mel, _ = extract_logmel(ref_wav_path, mean, std)
        mel = torch.FloatTensor(mel.T).unsqueeze(0).to(device)
        lf0 = torch.FloatTensor(lf0).unsqueeze(0).to(device)
        ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device)
        out_filename = os.path.basename(src_wav_path).split('.')[0]
        with torch.no_grad():
            z, _, _, _ = encoder.encode(mel)
            lf0_embs = encoder_lf0(lf0)
            spk_embs = encoder_spk(ref_mel)
            output = decoder(z, lf0_embs, spk_embs)
            logmel = output.squeeze(0).cpu().numpy()
            feat_writer[out_filename] = logmel
            feat_writer[out_filename + '_src'] = mel.squeeze(0).cpu().numpy().T
            feat_writer[out_filename + '_ref'] = ref_mel.squeeze(0).cpu().numpy().T
        subprocess.call(['cp', src_wav_path, out_dir])

    feat_writer.close()
    print('synthesize waveform...')
    cmd = ['parallel-wavegan-decode',
           '--checkpoint', '/vocoder/checkpoint-3000000steps.pkl',
           '--feats-scp', f'{str(out_dir)}/feats.1.scp',
           '--outdir', str(out_dir)]
    subprocess.call(cmd)
def test_load_inputs_and_targets_legacy_format_multi_inputs(tmpdir):
    # batch = [("F01_050C0101_PED_REAL",
    #           {"input": [{"feat": "some/path1.ark:123",
    #                       "name": "input1"},
    #                      {"feat": "some/path2.ark:123",
    #                       "name": "input2"}],
    #            "output": [{"tokenid": "1 2 3 4"}],
    ark_1 = str(tmpdir.join("test_1.ark"))
    scp_1 = str(tmpdir.join("test_1.scp"))
    ark_2 = str(tmpdir.join("test_2.ark"))
    scp_2 = str(tmpdir.join("test_2.scp"))

    desire_xs_1 = []
    desire_xs_2 = []
    desire_ys = []
    with kaldiio.WriteHelper("ark,scp:{},{}".format(ark_1, scp_1)) as f:
        for i in range(10):
            x = np.random.random((100, 100)).astype(np.float32)
            uttid = "uttid{}".format(i)
            f[uttid] = x
            desire_xs_1.append(x)
            desire_ys.append(np.array([1, 2, 3, 4]))
    with kaldiio.WriteHelper("ark,scp:{},{}".format(ark_2, scp_2)) as f:
        for i in range(10):
            x = np.random.random((100, 100)).astype(np.float32)
            uttid = "uttid{}".format(i)
            f[uttid] = x
            desire_xs_2.append(x)
            desire_ys.append(np.array([1, 2, 3, 4]))

    batch = []
    with open(scp_1, "r") as f:
        lines_1 = f.readlines()
    with open(scp_2, "r") as f:
        lines_2 = f.readlines()
    for line_1, line_2 in zip(lines_1, lines_2):
        uttid, path_1 = line_1.strip().split()
        uttid, path_2 = line_2.strip().split()
        batch.append((
            uttid,
            {
                "input": [
                    {"feat": path_1, "name": "input1"},
                    {"feat": path_2, "name": "input2"},
                ],
                "output": [{"tokenid": "1 2 3 4", "name": "target1"}],
            },
        ))

    load_inputs_and_targets = LoadInputsAndTargets()
    xs_1, xs_2, ys = load_inputs_and_targets(batch)
    for x, xd in zip(xs_1, desire_xs_1):
        np.testing.assert_array_equal(x, xd)
    for x, xd in zip(xs_2, desire_xs_2):
        np.testing.assert_array_equal(x, xd)
    for y, yd in zip(ys, desire_ys):
        np.testing.assert_array_equal(y, yd)
def gta_inference(args):
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    use_sortagrad = train_args.sortagrad == -1 or train_args.sortagrad > 0
    if use_sortagrad:
        train_args.batch_sort_key = "input"
    if args.batch_size is not None:
        assert args.batch_size > 0
    batch_size = args.batch_size

    # make minibatch list (variable length)
    train_batchset = make_batchset(
        js,
        batch_size,
        train_args.maxlen_in,
        train_args.maxlen_out,
        train_args.minibatches,
        batch_sort_key=train_args.batch_sort_key,
        min_batch_size=train_args.ngpu if train_args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=train_args.batch_count,
        batch_bins=train_args.batch_bins,
        batch_frames_in=train_args.batch_frames_in,
        batch_frames_out=train_args.batch_frames_out,
        batch_frames_inout=train_args.batch_frames_inout,
        swap_io=True,
        iaxis=0,
        oaxis=0,
    )
    load_tr = LoadInputsAndTargets(
        mode="tts",
        use_speaker_embedding=train_args.use_speaker_embedding,
        use_second_target=train_args.use_second_target,
        use_character_embedding=train_args.use_character_embedding,
        use_intonation_type=train_args.use_intonation_type,
        preprocess_conf=train_args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=train_args.keep_all_data_on_mem,
    )
    converter = CustomConverter()

    # hack to make the batchsize argument 1
    # (the actual batchsize is included in a list)
    def transform(data, loader, converter):
        batch, utt_list = loader(data, return_uttid=True)
        batch = converter([batch])
        return batch, utt_list

    train_dataset = TransformDataset(
        train_batchset, lambda data: transform(data, load_tr, converter))

    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=args.out))
    for batch, utt_list in train_dataset:
        x = batch
        for key in x.keys():
            x[key] = x[key].to(device)
        outputs = model.gta_inference(**x)
        olens = x['olens']
        batch_size = olens.shape[0]
        for i in range(batch_size):
            utt_id = utt_list[i]
            mlspec = outputs[i]
            ol = olens[i]
            feat_writer[utt_id] = mlspec[:ol].cpu().numpy()
    feat_writer.close()
def __init__(self, wspecifier, filetype='mat',
             write_num_frames=None, compress=False, compression_method=2,
             pcm_format='wav'):
    self.writer_scp = None  # Used for writing scp
    self.filename = None
    self.filetype = filetype
    # Used for filetype='sound' or 'sound.hdf5'
    self.pcm_format = pcm_format
    self.kwargs = {}

    if filetype == 'mat':
        if compress:
            self.writer = kaldiio.WriteHelper(
                wspecifier, compression_method=compression_method)
        else:
            self.writer = kaldiio.WriteHelper(wspecifier)

    elif filetype in ['hdf5', 'sound.hdf5', 'sound']:
        # 1. Create spec_dict
        # e.g.
        #   ark,scp:out.ark,out.scp -> {'ark': 'out.ark', 'scp': 'out.scp'}
        ark_scp, filepath = wspecifier.split(':', 1)
        if ark_scp not in ['ark', 'scp,ark', 'ark,scp']:
            raise ValueError(
                '{} is not allowed: {}'.format(ark_scp, wspecifier))
        ark_scps = ark_scp.split(',')
        filepaths = filepath.split(',')
        if len(ark_scps) != len(filepaths):
            raise ValueError(
                'Mismatch: {} and {}'.format(ark_scp, filepath))
        spec_dict = dict(zip(ark_scps, filepaths))

        # 2. Set writer
        self.filename = spec_dict['ark']
        if filetype == 'sound.hdf5':
            self.writer = SoundHDF5File(spec_dict['ark'], 'w',
                                        format=self.pcm_format)
        elif filetype == 'hdf5':
            self.writer = h5py.File(spec_dict['ark'], 'w')
        elif filetype == 'sound':
            # Use the "ark" value as a directory to save wav files,
            # e.g. ark,scp:dirname,wav.scp
            # -> The wave files are found in dirname/*.wav
            wavdir = spec_dict['ark']
            if not os.path.exists(wavdir):
                os.makedirs(wavdir)
            self.writer = None
        else:
            # Cannot reach
            raise RuntimeError

        # 3. Set writer_scp
        if 'scp' in spec_dict:
            self.writer_scp = io.open(
                spec_dict['scp'], 'w', encoding='utf-8')
    else:
        raise ValueError('Not supporting: filetype={}'.format(filetype))

    if write_num_frames is not None:
        if ':' not in write_num_frames:
            raise ValueError('Must include ":", write_num_frames={}'
                             .format(write_num_frames))
        nframes_type, nframes_file = write_num_frames.split(':', 1)
        if nframes_type != 'ark,t':
            raise ValueError(
                'Only supporting text mode. '
                'e.g. --write-num-frames=ark,t:foo.txt :'
                '{}'.format(nframes_type))
        self.writer_nframe = io.open(nframes_file, 'w', encoding='utf-8')
    else:
        self.writer_nframe = None
# read from feats.scp
# add feats scp direc
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "feats_file",
        help="path to the feats.scp we're going to use as example")
    parser.add_argument(
        "target_file",
        help="path to target file (without the scp extension)")
    parser.add_argument(
        "--filler", type=int, default=1,
        help="value to fill the matrix with (1 or 0)")
    args, leftovers = parser.parse_known_args()

    with kaldiio.ReadHelper('scp:{}'.format(args.feats_file)) as reader:
        feats = {}
        for key, numpy_array in reader:
            feats[key] = numpy_array

    with kaldiio.WriteHelper('ark,scp:{}.ark,{}.scp'.format(
            args.target_file, args.target_file)) as writer:
        for key, value in feats.items():
            vec = np.full(len(value), args.filler, dtype=np.float32)
            writer(key, vec)
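# To sanity-check the script's output, the archive can be iterated back with
# kaldiio.load_ark; a minimal sketch assuming it was run with
# target_file=target, so that target.ark exists:
import kaldiio

for key, vec in kaldiio.load_ark('target.ark'):
    print(key, vec.shape, vec[:5])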
def decode(args):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="tts",
        load_input=False,
        sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )

    # define function for plot prob and att_ws
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt

        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights,
            # whose shape is (#layers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]),
                       dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Support only from 1D to 4D array.")
        plt.tight_layout()
        if not os.path.exists(os.path.dirname(figname)):
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(os.path.dirname(figname), exist_ok=True)
        plt.savefig(figname)
        plt.close()

    # define function to calculate focus rate
    # (see section 3.3 in https://arxiv.org/abs/1905.09263)
    def _calculate_focus_rate(att_ws):
        if att_ws is None:
            # fastspeech case -> None
            return 1.0
        elif len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            return float(att_ws.max(dim=-1)[0].mean())
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            return float(att_ws.max(dim=-1)[0].mean(dim=-1).max())
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")

    # define function to convert attention to duration
    def _convert_att_to_duration(att_ws):
        if len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            pass
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            # get the most diagonal head according to focus rate
            att_ws = torch.cat(
                [att_w for att_w in att_ws], dim=0)  # (#heads * #layers, L, T)
            diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1)  # (#heads * #layers,)
            diagonal_head_idx = diagonal_scores.argmax()
            att_ws = att_ws[diagonal_head_idx]  # (L, T)
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")
        # calculate duration from 2d attention weight
        durations = torch.stack(
            [att_ws.argmax(-1).eq(i).sum() for i in range(att_ws.shape[1])])
        return durations.view(-1, 1).float()

    # define writer instances
    feat_writer = kaldiio.WriteHelper(
        "ark,scp:{o}.ark,{o}.scp".format(o=args.out))
    if args.save_durations:
        dur_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
            o=args.out.replace("feats", "durations")))
    if args.save_focus_rates:
        fr_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
            o=args.out.replace("feats", "focus_rates")))

    # start decoding
    for idx, utt_id in enumerate(js.keys()):
        # setup inputs
        batch = [(utt_id, js[utt_id])]
        data = load_inputs_and_targets(batch)
        x = torch.LongTensor(data[0][0]).to(device)
        spemb = None
        if train_args.use_speaker_embedding:
            spemb = torch.FloatTensor(data[1][0]).to(device)

        # decode and write
        start_time = time.time()
        outs, probs, att_ws = model.inference(x, args, spemb=spemb)
        logging.info("inference speed = %.1f frames / sec."
                     % (int(outs.size(0)) / (time.time() - start_time)))
        if outs.size(0) == x.size(0) * args.maxlenratio:
            logging.warning("output length reaches maximum length (%s)."
                            % utt_id)
        focus_rate = _calculate_focus_rate(att_ws)
        logging.info("(%d/%d) %s (size: %d->%d, focus rate: %.3f)"
                     % (idx + 1, len(js.keys()), utt_id,
                        x.size(0), outs.size(0), focus_rate))
        feat_writer[utt_id] = outs.cpu().numpy()
        if args.save_durations:
            ds = _convert_att_to_duration(att_ws)
            dur_writer[utt_id] = ds.cpu().numpy()
        if args.save_focus_rates:
            fr_writer[utt_id] = np.array(focus_rate).reshape(1, 1)

        # plot and save prob and att_ws
        if probs is not None:
            _plot_and_save(
                probs.cpu().numpy(),
                os.path.dirname(args.out) + "/probs/%s_prob.png" % utt_id,
            )
        if att_ws is not None:
            _plot_and_save(
                att_ws.cpu().numpy(),
                os.path.dirname(args.out) + "/att_ws/%s_att_ws.png" % utt_id,
            )

    # close file objects
    feat_writer.close()
    if args.save_durations:
        dur_writer.close()
    if args.save_focus_rates:
        fr_writer.close()
def main(cmd_args):
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
    else:
        logging.basicConfig(
            level=logging.WARN,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
        logging.warning('Skip DEBUG/INFO messages')

    # display PYTHONPATH
    logging.info('python path = ' + os.environ.get('PYTHONPATH', '(None)'))

    # set random seed
    logging.info('random seed = %d' % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    set_deterministic_pytorch(args)

    logging.info("total speaker is %d" % args.nClasses)
    spk_model = SpeakerNet(nClasses=args.nClasses,
                           nPerSpeaker=args.nPerSpeaker,
                           trainfunc=args.trainfunc,
                           nOut=512)
    if args.spk_model is not None:
        spk_model.loadParameters(args.spk_model)
    else:
        spk_model = None
    spk_model.eval()

    # hard-coded accumulated per-dimension statistics: `mean` holds the
    # accumulated sums and `var` the accumulated squared sums over `num_sum`
    # frames (see the commented-out normalization in the loop below)
    mean = np.array([[
        -1.7101e+08, -1.727767e+08, -1.654258e+08, -1.568423e+08, -1.47768e+08,
        -1.355978e+08, -1.337955e+08, -1.290715e+08, -1.292888e+08, -1.333105e+08,
        -1.380836e+08, -1.388845e+08, -1.445241e+08, -1.438754e+08, -1.428372e+08,
        -1.428697e+08, -1.417773e+08, -1.400568e+08, -1.448087e+08, -1.459874e+08,
        -1.47229e+08, -1.490556e+08, -1.499799e+08, -1.522063e+08, -1.590756e+08,
        -1.618226e+08, -1.651485e+08, -1.684847e+08, -1.692581e+08, -1.714363e+08,
        -1.763494e+08, -1.776152e+08, -1.789162e+08, -1.805202e+08, -1.798933e+08,
        -1.818852e+08, -1.852947e+08, -1.860893e+08, -1.873477e+08, -1.889484e+08,
        -1.873008e+08, -1.891793e+08, -1.917609e+08, -1.932594e+08, -1.934982e+08,
        -1.90069e+08, -1.967007e+08, -1.955583e+08, -1.932292e+08, -2.001965e+08,
        -1.926799e+08, -2.013976e+08, -1.932717e+08, -1.997551e+08, -1.955731e+08,
        -1.958617e+08, -1.967825e+08, -1.952326e+08, -1.931164e+08, -1.947601e+08,
        -1.94064e+08, -1.937533e+08, -1.93948e+08, -1.940927e+08, -1.945755e+08,
        -1.955468e+08, -1.96344e+08, -1.963595e+08, -1.971519e+08, -1.991344e+08,
        -1.989762e+08, -2.000582e+08, -2.019397e+08, -2.019519e+08, -2.024301e+08,
        -2.031892e+08, -2.029932e+08, -2.029679e+08, -2.033156e+08, -2.033823e+08,
        -2.03208e+08, -2.036384e+08, -2.03879e+08, -2.04647e+08, -2.06028e+08,
        -2.060116e+08, -2.070609e+08, -2.071168e+08, -2.083309e+08, -2.092469e+08,
        -2.103796e+08, -2.122868e+08, -2.135678e+08, -2.144521e+08, -2.158103e+08,
        -2.171439e+08, -2.176665e+08, -2.191257e+08, -2.193856e+08, -2.21079e+08,
        -2.226874e+08, -2.247855e+08, -2.267768e+08, -2.286809e+08, -2.311216e+08,
        -2.33142e+08, -2.352095e+08, -2.373178e+08, -2.393992e+08, -2.415607e+08,
        -2.436022e+08, -2.450806e+08, -2.462217e+08, -2.47608e+08, -2.483978e+08,
        -2.495429e+08, -2.495807e+08, -2.501201e+08, -2.504308e+08, -2.506836e+08,
        -2.518955e+08, -2.528667e+08, -2.538843e+08, -2.553601e+08, -2.571577e+08,
        -2.592016e+08, -2.737314e+08, -3.25694e+08
    ]])
    var = np.array([[
        3.875797e+08, 3.972777e+08, 3.76892e+08, 3.590407e+08, 3.36797e+08,
        2.982351e+08, 2.993923e+08, 2.900205e+08, 2.903182e+08, 3.00258e+08,
        3.139445e+08, 3.133095e+08, 3.316776e+08, 3.290742e+08, 3.259625e+08,
        3.292938e+08, 3.253266e+08, 3.20113e+08, 3.353506e+08, 3.40549e+08,
        3.424283e+08, 3.454718e+08, 3.482779e+08, 3.577333e+08, 3.827005e+08,
        3.899876e+08, 4.01662e+08, 4.141465e+08, 4.154033e+08, 4.238292e+08,
        4.437099e+08, 4.463138e+08, 4.495017e+08, 4.545714e+08, 4.517053e+08,
        4.601415e+08, 4.730579e+08, 4.755685e+08, 4.813327e+08, 4.884872e+08,
        4.809006e+08, 4.883675e+08, 5.00223e+08, 5.064776e+08, 5.080264e+08,
        4.91717e+08, 5.215152e+08, 5.169479e+08, 5.060737e+08, 5.381505e+08,
        5.023963e+08, 5.430141e+08, 5.040811e+08, 5.339064e+08, 5.142676e+08,
        5.158492e+08, 5.202875e+08, 5.131353e+08, 5.043084e+08, 5.129934e+08,
        5.087678e+08, 5.064136e+08, 5.083315e+08, 5.083852e+08, 5.09834e+08,
        5.150194e+08, 5.177091e+08, 5.167306e+08, 5.197394e+08, 5.282414e+08,
        5.270312e+08, 5.324564e+08, 5.408028e+08, 5.407178e+08, 5.426285e+08,
        5.456758e+08, 5.454526e+08, 5.462478e+08, 5.481372e+08, 5.508704e+08,
        5.496423e+08, 5.518889e+08, 5.532486e+08, 5.56079e+08, 5.627578e+08,
        5.617894e+08, 5.666932e+08, 5.67652e+08, 5.73079e+08, 5.768822e+08,
        5.817027e+08, 5.912957e+08, 5.977753e+08, 6.0268e+08, 6.094717e+08,
        6.166043e+08, 6.196362e+08, 6.269311e+08, 6.276106e+08, 6.369116e+08,
        6.44361e+08, 6.551513e+08, 6.656342e+08, 6.762929e+08, 6.899264e+08,
        7.008929e+08, 7.117181e+08, 7.238042e+08, 7.350025e+08, 7.47482e+08,
        7.59422e+08, 7.681328e+08, 7.75756e+08, 7.834833e+08, 7.868992e+08,
        7.938968e+08, 7.929719e+08, 7.966068e+08, 7.983973e+08, 7.993377e+08,
        8.061261e+08, 8.111478e+08, 8.169364e+08, 8.25449e+08, 8.366562e+08,
        8.486715e+08, 9.377093e+08, 1.289456e+09
    ]])
    num_sum = 8.478675e+07

    with kaldiio.ReadHelper("scp:%s" % args.read_file) as reader, \
            kaldiio.WriteHelper('ark,scp:%s.ark,%s.scp'
                                % (args.write_file, args.write_file)) as writer:
        for key, numpy_array in reader:
            with torch.no_grad():
                length = len(numpy_array)
                numpy_array = numpy_array[20:-20]
                # numpy_array = numpy_array[20:]
                # numpy_array = numpy_array[:-20]
                # numpy_array = numpy_array - mean / num_sum
                # numpy_array = numpy_array / (var / num_sum - (mean / num_sum) ** 2)
                torch_array = torch.from_numpy(numpy_array).unsqueeze(0).float()
                logging.info(torch_array.size())
                writer[key] = spk_model(torch_array).squeeze(0).numpy()