def test1():
    """Smoke-test ``SpeakerEncoder.similarity_matrix`` on synthetic clustered data.

    Builds ``n_speakers`` tight clusters of ``x_size`` random vectors each,
    prints the cosine similarity of every vector to every cluster mean as a
    hand-computed reference, then prints and plots the similarity matrix the
    encoder produces for the same batch.
    """
    x_size = 10      # vectors per cluster
    n_speakers = 10  # number of clusters
    feat_dim = 12    # dimensionality of each vector

    # One random centre per cluster, repeated x_size times, plus small noise.
    # The randn calls happen in the same order as in the unrolled version.
    clusters = [
        torch.randn(1, feat_dim).repeat(x_size, 1)
        + torch.randn(x_size, feat_dim) * 0.05
        for _ in range(n_speakers)
    ]
    y = torch.cat(clusters)  # (n_speakers * x_size, feat_dim)

    # Reference: cosine similarity of every vector against every cluster mean.
    centroids = torch.stack(y.split([x_size] * n_speakers), 0).mean(1, keepdim=True)
    cc = centroids.repeat(1, x_size * n_speakers, 1).permute(1, 0, 2)
    yy = y.unsqueeze(0).repeat(n_speakers, 1, 1).permute(1, 0, 2)
    xx = F.cosine_similarity(cc, yy, dim=-1)  # (n_speakers * x_size, n_speakers)
    print(xx)

    # NOTE(review): argument meaning assumed to be (input size, speakers,
    # utterances per speaker); the values are the original 12, 10, 10.
    se = SpeakerEncoder(feat_dim, n_speakers, x_size)

    # Compute the matrix once so the printed and plotted values are the same.
    sim = se.similarity_matrix(y)
    print(sim)

    plt.figure(figsize=(8, 4))
    plt.imshow(sim.data, cmap='gray', interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.title('similarity_matrix.png')
    plt.show()
def setup_model(self):
    """Create the speaker encoder and its Adam optimizer and attach both to ``self``.

    Reads ``self.config`` and ``self.parallel``. Sets ``self.model`` (wrapped
    in ``DataParallel`` when ``self.parallel`` is truthy), ``self.model_core``
    (always the unwrapped encoder) and ``self.optimizer``.
    """
    cfg = self.config

    # NOTE(review): this reads ``data.n_mel`` while load_model() in this file
    # reads ``data.n_mels`` -- confirm which key the config actually defines.
    encoder = SpeakerEncoder(
        cfg.data.n_mel,
        cfg.model.num_layers,
        cfg.model.hidden_size,
        cfg.model.embedding_size,
    )

    adam = Adam(
        cfg.training.learning_rate_init,
        parameters=encoder.parameters(),
        grad_clip=ClipGradByGlobalNorm(3),
    )

    if self.parallel:
        self.model = DataParallel(encoder)
    else:
        self.model = encoder
    self.model_core = encoder
    self.optimizer = adam
def __init__(self, dataset):
    """Build the encoder/decoder pair selected by ``args.model`` plus training utilities.

    Sets ``self.tok``, ``self.feature_size``, ``self.encoder``,
    ``self.decoder``, optionally ``self.critic`` (when ``args.baseline`` is
    ``'linear'``), ``self.optim``, ``self.output``, ``self.writer`` and
    ``self.softmax_loss``. All modules are moved to CUDA.

    :param dataset: object exposing a ``tok`` tokenizer; its ``vocab_size``
        and ``pad_id`` are read here.
    :raises ValueError: if ``args.model`` is not one of the supported names.
        Previously this fell through silently and failed later with an
        ``AttributeError`` on ``self.encoder``.
    """
    self.tok = dataset.tok
    self.feature_size = 2048

    # Build up the model
    model_name = args.model
    vocab_size = self.tok.vocab_size

    if model_name == 'init':
        # This variant uses the raw feature size as the decoder context width.
        self.encoder = SpeakerEncoder(self.feature_size).cuda()
        self.decoder = SpeakerDecoderTran(vocab_size, self.feature_size, heads=1).cuda()
    else:
        # Pick the encoder; the class names are only looked up on the branch
        # that is actually taken, as in the original if/elif chain.
        if model_name in ('heads', 'newheads', 'dynamic', 'dynamicmh', 'dmc'):
            encoder_cls = MultiCtxEncoder
        elif model_name == 'crossatt':
            encoder_cls = CrossAttEncoder
        elif model_name == 'newcross':
            encoder_cls = NewAttEncoder
        else:
            raise ValueError(
                "Unknown args.model {!r}; expected one of 'init', 'heads', "
                "'crossatt', 'newheads', 'newcross', 'dynamic', 'dynamicmh', "
                "'dmc'".format(model_name)
            )
        self.encoder = encoder_cls(self.feature_size).cuda()
        ctx_size = self.encoder.ctx_dim

        # Pick the decoder that pairs with the chosen model name.
        if model_name in ('heads', 'crossatt'):
            decoder = MultiCtxDecoder(vocab_size, ctx_size, heads=2)
        elif model_name in ('newheads', 'newcross'):
            decoder = NewCtxDecoder(vocab_size, ctx_size, heads=2)
        elif model_name == 'dynamic':
            decoder = DynamicDecoderFC(vocab_size, ctx_size)
        elif model_name == 'dynamicmh':
            decoder = DynamicDecoderMH(vocab_size, ctx_size)
        else:  # 'dmc'
            decoder = DynamicDecoderMHC(vocab_size, ctx_size)
        self.decoder = decoder.cuda()

    if args.baseline == 'linear':
        # NOTE(review): the critic's parameters are not handed to self.optim
        # below -- confirm it is optimized elsewhere.
        self.critic = LinearAct(args.hid_dim, 1).cuda()

    # Optimizer
    self.optim = args.optimizer(
        list(self.encoder.parameters()) + list(self.decoder.parameters()),
        lr=args.lr,
    )

    # Logs
    self.output = args.output
    os.makedirs(self.output, exist_ok=True)
    self.writer = SummaryWriter(log_dir=self.output)  # Tensorboard summary writer

    # Loss
    self.softmax_loss = torch.nn.CrossEntropyLoss(ignore_index=self.tok.pad_id)
def clone_as_averaged_model(model, ema):
    """Return a new ``SpeakerEncoder`` holding ``model``'s weights with EMA values applied.

    A fresh encoder is built (and moved to CUDA when ``use_cuda`` is set),
    loaded with ``model.state_dict()``, and then every parameter whose name
    appears in ``ema.shadow`` is replaced by a clone of that shadow tensor.

    :param model: the trained model whose state dict seeds the copy.
    :param ema: object with a ``shadow`` mapping from parameter name to
        tensor; must not be ``None``.
    :return: the newly built, averaged encoder.
    """
    assert ema is not None

    clone = SpeakerEncoder(input_size=hparams.num_mels)
    if use_cuda:
        clone = clone.cuda()
    clone.load_state_dict(model.state_dict())

    for param_name, parameter in clone.named_parameters():
        if param_name not in ema.shadow:
            continue  # no averaged value tracked for this parameter
        parameter.data = ema.shadow[param_name].clone()

    return clone
def test_audio_speaker_encoder(sr=8000):
    """Run an untrained ``SpeakerEncoder`` on real audio and plot its similarity matrix.

    Loads the four module-level speaker files, trims silence, cuts each into
    segments, keeps the first ``n_utter`` mel segments per speaker, feeds the
    stacked batch through a fresh encoder and shows the similarity matrix.

    :param sr: sample rate passed to ``librosa.load``.
    """
    n_utter = 6  # mel segments kept per speaker
    # Segment length handed to split_audio; constant, so computed once.
    seg_length = (160 * 0.01 + 0.025) / 4

    paths = [speaker1_path, speaker2_path, speaker3_path, speaker4_path]
    mels = []
    for audio_path in paths:
        # ``sr`` is rebound to the rate librosa reports, as in the original.
        x, sr = librosa.load(audio_path, sr=sr)
        # top_db must be passed by keyword: it is keyword-only in librosa >= 0.10.
        x, _ = librosa.effects.trim(x, top_db=10)
        audios = split_audio(x, sr=sr, seg_length=seg_length)
        mels += get_split_mels(audios, sr=sr)[:n_utter]
    # NOTE(review): the original indentation was lost; this debug print is
    # assumed to run once after the loop -- confirm.
    print(mels)

    mels = np.stack(mels, axis=0)
    mels = np.transpose(mels, [0, 2, 1])  # swap the last two axes
    mels = mels.transpose(1, 0, 2)        # then swap the first two axes

    # 40 is the encoder input size; the other two arguments match the number
    # of speakers and the segments kept per speaker (originally 4 and 6).
    speaker_encoder = SpeakerEncoder(40, len(paths), n_utter)
    mels = torch.from_numpy(mels).float()
    d_vector, sim_matrix = speaker_encoder(mels)

    plt.figure(figsize=(8, 4))
    plt.imshow(sim_matrix.data, interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.title('similarity_matrix.png')
    plt.show()
def load_model(config, weights_fpath: Path):
    """
    Loads the speaker encoder into the module-level ``_model`` and puts it in eval mode.

    :param config: configuration object; ``config.data.n_mels``,
        ``config.model.num_layers``, ``config.model.hidden_size`` and
        ``config.model.embedding_size`` are read to build the encoder.
    :param weights_fpath: path to the saved model weights *without* the
        ``.pdparams`` suffix, which is appended here. May be a ``str`` or a
        ``Path``.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    #   was saved on. Worth investigating.
    global _model
    _model = SpeakerEncoder(
        config.data.n_mels,
        config.model.num_layers,
        config.model.hidden_size,
        config.model.embedding_size,
    )

    # ``Path + str`` raises TypeError, so convert to str before appending the
    # suffix; this keeps plain-string callers working unchanged.
    params_path = str(weights_fpath) + ".pdparams"
    model_state_dict = paddle.load(params_path)
    _model.set_state_dict(model_state_dict)
    _model.eval()
    print(f"Loaded encoder {weights_fpath}")
import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch

from data_preprocess import get_feature
from model import SpeakerEncoder
from train import load_checkpoint
from util import random_batch, get_split_mels, split_audio, test_random_batch

# Batch layout the checkpointed encoder is built with.
speaker = 30
utter = 10

speaker_encoder = SpeakerEncoder(40, n=speaker, m=utter)
# The encoder above lives on the CPU, so map the checkpoint there as well;
# without map_location a checkpoint saved from a GPU cannot be loaded on a
# CPU-only machine.
checkpoint = torch.load(
    '/home/zeng/work/mywork/GE2E/checkpoints/checkpoint_step000112000_ema.pth',
    map_location='cpu',
)

# Sample utterances used by the tests in this file.
audio_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p364/p364_001.wav'
speaker1_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p230/p230_008.wav'
speaker2_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p225/p225_011.wav'
speaker3_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p226/p226_022.wav'
speaker4_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p345/p345_018.wav'
wav_path = '/home/zeng/work/data/VCTK-Corpus/wav48/'

speaker_encoder.load_state_dict(checkpoint["state_dict"])


def test_tisv():
    """Feed a random evaluation batch through the loaded encoder and draw its similarity matrix.

    Prints the batch shape and the resulting d-vectors, then renders the
    similarity matrix with a colour bar.
    """
    # NOTE(review): the encoder was built with n=30 speakers but this batch
    # asks for 10 -- confirm the model tolerates the mismatch.
    x = random_batch(speaker_num=10, utter_num=10, train=False)
    print(x.shape)
    x = torch.from_numpy(x).float()
    d_vector, sim_matrix = speaker_encoder(x)
    print(d_vector)
    plt.figure(figsize=(8, 4))
    plt.imshow(sim_matrix.data, interpolation='nearest', aspect='auto')
    plt.colorbar()
# Apply command-line hyper-parameter overrides, then show the effective values.
hparams.parse(args["--hparams"])
print(hparams_debug_string())

# Presets
# A non-empty preset name selects an entry of hparams.presets; it is applied
# after the command-line overrides above, by round-tripping through JSON.
if hparams.preset is not None and hparams.preset != "":
    preset = hparams.presets[hparams.preset]
    import json
    hparams.parse_json(json.dumps(preset))
    print("Override hyper parameters with preset \"{}\": {}".format(
        hparams.preset, json.dumps(preset, indent=4)))

os.makedirs(checkpoint_dir, exist_ok=True)

# Model
model = SpeakerEncoder(input_size=hparams.num_mels)
print(model)
if use_cuda:
    model = model.cuda()

# Adam with a fixed initial learning rate; StepLR's step size comes from hparams.steps.
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=hparams.steps)

# Optional partial restore; runs before the full checkpoint load below.
if checkpoint_restore_parts is not None:
    restore_parts(checkpoint_restore_parts, model)

# Load checkpoints
if checkpoint_path is not None:
    load_checkpoint(checkpoint_path, model, optimizer, reset_optimizer)

# Setup summary writer for tensorboard