Example #1
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# SpeakerEncoder comes from the surrounding project (see Example #7:
# "from model import SpeakerEncoder").


def test1():
    x_size = 10  # utterances per speaker
    # Ten synthetic speakers: each group is one fixed 12-dim centroid plus
    # small per-utterance noise.
    groups = [
        torch.randn(1, 12).repeat(x_size, 1) + torch.randn(x_size, 12) * 0.05
        for _ in range(10)
    ]
    y = torch.cat(groups)                                             # (100, 12)

    # Reference computation: cosine similarity of every utterance to every
    # speaker centroid.
    yy = y.unsqueeze(0).repeat(10, 1, 1)                              # (10, 100, 12)
    c = torch.stack(y.split([x_size] * 10), 0).mean(1, keepdim=True)  # (10, 1, 12)
    cc = c.repeat(1, x_size * 10, 1)                                  # (10, 100, 12)
    cc = cc.permute(1, 0, 2)                                          # (100, 10, 12)
    yy = yy.permute(1, 0, 2)                                          # (100, 10, 12)
    xx = F.cosine_similarity(cc, yy, dim=-1)                          # (100, 10)
    print(xx)

    # Compare against the model's own similarity matrix
    # (constructor assumed: feature dim 12, 10 speakers, 10 utterances each).
    se = SpeakerEncoder(12, 10, 10)
    x = se.similarity_matrix(y)
    print(x)
    plt.figure(figsize=(8, 4))
    plt.imshow(x.data, cmap='gray', interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.title('similarity_matrix.png')
    plt.show()
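The manual block in test1 mirrors what a GE2E-style similarity_matrix is generally expected to compute: the cosine similarity of every utterance embedding to every speaker centroid. A minimal sketch under that assumption (the full GE2E formulation additionally excludes each utterance from its own speaker's centroid, which this sketch omits; the function name and signature are illustrative, not the project's actual code):

import torch
import torch.nn.functional as F


def similarity_matrix(embeds: torch.Tensor, n: int, m: int) -> torch.Tensor:
    """Cosine similarity of n*m utterance embeddings to the n speaker centroids.

    embeds: (n * m, d), utterances grouped by speaker.
    Returns: (n * m, n).
    """
    d = embeds.shape[-1]
    centroids = embeds.reshape(n, m, d).mean(dim=1)     # (n, d)
    return F.cosine_similarity(
        embeds.unsqueeze(1).expand(-1, n, -1),          # (n*m, n, d)
        centroids.unsqueeze(0).expand(n * m, -1, -1),   # (n*m, n, d)
        dim=-1)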
Example #2
    def setup_model(self):
        config = self.config
        model = SpeakerEncoder(
            config.data.n_mel,
            config.model.num_layers,
            config.model.hidden_size,
            config.model.embedding_size)
        # PaddlePaddle-style Adam: the first positional argument is the
        # learning rate; gradients are clipped by global norm.
        optimizer = Adam(
            config.training.learning_rate_init,
            parameters=model.parameters(),
            grad_clip=ClipGradByGlobalNorm(3))
        self.model = DataParallel(model) if self.parallel else model
        self.model_core = model
        self.optimizer = optimizer
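setup_model only reads a handful of leaves from the configuration object. For a quick local test, a stand-in can be built from plain namespaces; the attribute names below are exactly the ones the method accesses, while all concrete values are assumptions:

from types import SimpleNamespace

config = SimpleNamespace(
    data=SimpleNamespace(n_mel=40),
    model=SimpleNamespace(num_layers=3, hidden_size=256, embedding_size=256),
    training=SimpleNamespace(learning_rate_init=1e-4),
)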
Example #3
    def __init__(self, dataset):
        # Build up the model
        self.tok = dataset.tok
        self.feature_size = 2048

        if args.model == 'init':
            self.encoder = SpeakerEncoder(self.feature_size).cuda()
            ctx_size = self.feature_size
            self.decoder = SpeakerDecoderTran(self.tok.vocab_size, ctx_size, heads=1).cuda()
        elif args.model == 'heads':
            self.encoder = MultiCtxEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = MultiCtxDecoder(self.tok.vocab_size, ctx_size, heads=2).cuda()
        elif args.model == 'crossatt':
            self.encoder = CrossAttEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = MultiCtxDecoder(self.tok.vocab_size, ctx_size, heads=2).cuda()
        elif args.model == 'newheads':
            self.encoder = MultiCtxEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = NewCtxDecoder(self.tok.vocab_size, ctx_size, heads=2).cuda()
        elif args.model == 'newcross':
            self.encoder = NewAttEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = NewCtxDecoder(self.tok.vocab_size, ctx_size, heads=2).cuda()
        elif args.model == 'dynamic':
            self.encoder = MultiCtxEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = DynamicDecoderFC(self.tok.vocab_size, ctx_size).cuda()
        elif args.model == 'dynamicmh':
            self.encoder = MultiCtxEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = DynamicDecoderMH(self.tok.vocab_size, ctx_size).cuda()
        elif args.model == 'dmc':
            self.encoder = MultiCtxEncoder(self.feature_size).cuda()
            ctx_size = self.encoder.ctx_dim
            self.decoder = DynamicDecoderMHC(self.tok.vocab_size, ctx_size).cuda()

        if args.baseline == 'linear':
            self.critic = LinearAct(args.hid_dim, 1).cuda()

        # Optimizer
        self.optim = args.optimizer(list(self.encoder.parameters()) + list(self.decoder.parameters()),
                                    lr=args.lr)

        # Logs
        self.output = args.output
        os.makedirs(self.output, exist_ok=True)
        self.writer = SummaryWriter(log_dir=self.output)     # Tensorboard summary writer

        # Loss
        self.softmax_loss = torch.nn.CrossEntropyLoss(ignore_index=self.tok.pad_id)
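Note that args.optimizer is called like a constructor, so it presumably holds the optimizer class itself rather than an instance. A hedged sketch of the args namespace this __init__ relies on (every concrete value is a placeholder):

import torch

class Args:
    model = 'init'                 # selects one of the encoder/decoder pairs above
    baseline = 'linear'
    hid_dim = 512
    lr = 1e-4
    optimizer = torch.optim.Adam   # stored as a class, instantiated in __init__
    output = 'snap/speaker'

args = Args()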
Example #4
def clone_as_averaged_model(model, ema):
    """Return a copy of `model` whose weights are the EMA shadow values."""
    assert ema is not None
    averaged_model = SpeakerEncoder(input_size=hparams.num_mels)
    if use_cuda:
        averaged_model = averaged_model.cuda()
    averaged_model.load_state_dict(model.state_dict())
    # Overwrite every tracked parameter with its exponential moving average.
    for name, param in averaged_model.named_parameters():
        if name in ema.shadow:
            param.data = ema.shadow[name].clone()
    return averaged_model
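The ema object only needs to expose a shadow dict mapping parameter names to their averaged tensors. A minimal sketch of such a helper (class name and decay value are assumptions; only the shadow attribute is implied by the example):

import torch

class ExponentialMovingAverage:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        # One averaged copy per trainable parameter, keyed by name.
        self.shadow = {name: p.data.clone()
                       for name, p in model.named_parameters()
                       if p.requires_grad}

    def update(self, model):
        # shadow <- decay * shadow + (1 - decay) * current weights
        for name, p in model.named_parameters():
            if name in self.shadow:
                self.shadow[name].mul_(self.decay).add_(p.data, alpha=1 - self.decay)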
Example #5
def test_audio_speaker_encoder(sr=8000):
    mels = []
    paths = [speaker1_path, speaker2_path, speaker3_path, speaker4_path]
    for audio_path in paths:
        x, sr = librosa.load(audio_path, sr=sr)
        x, index = librosa.effects.trim(x, top_db=10)
        # Segment length in seconds: 160 frames of 10 ms hop plus one 25 ms
        # window, divided into quarters (~0.406 s).
        a = (160 * 0.01 + 0.025) / 4
        audios = split_audio(x, sr=sr, seg_length=a)
        mels += get_split_mels(audios, sr=sr)[:6]  # keep 6 utterances per speaker

    print(mels)
    # Assuming each mel is (n_mels, time):
    mels = np.stack(mels, axis=0)         # (n_utterances, n_mels, time)
    mels = np.transpose(mels, [0, 2, 1])  # (n_utterances, time, n_mels)
    mels = mels.transpose(1, 0, 2)        # (time, n_utterances, n_mels), time-major

    speaker_encoder = SpeakerEncoder(40, 4, 6)  # 40 mel bins, 4 speakers, 6 utterances each
    mels = torch.from_numpy(mels).float()
    d_vector, sim_matrix = speaker_encoder(mels)
    plt.figure(figsize=(8, 4))
    plt.imshow(sim_matrix.data, interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.title('similarity_matrix.png')
    plt.show()
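split_audio and get_split_mels come from the project's util module and are not shown here. Purely as an assumption for readers without the repository, split_audio plausibly chops the waveform into consecutive fixed-length segments, along these lines:

def split_audio(x, sr, seg_length):
    """Chop a 1-D signal into consecutive seg_length-second segments (assumed behavior)."""
    seg = int(seg_length * sr)
    return [x[i:i + seg] for i in range(0, len(x) - seg + 1, seg)]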
Example #6
def load_model(config, weights_fpath: Path):
    """
    Loads the model in memory. If this function is not explicitely called, it will be run on the 
    first call to embed_frames() with the default weights file.
    
    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The 
    model will be loaded and will run on this device. Outputs will however always be on the cpu. 
    If None, will default to your GPU if it"s available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    #   was saved on. Worth investigating.
    global _model
    _model = SpeakerEncoder(config.data.n_mels, config.model.num_layers,
                            config.model.hidden_size,
                            config.model.embedding_size)
    # Convert the Path to str before appending the Paddle weights suffix.
    model_state_dict = paddle.load(str(weights_fpath) + ".pdparams")
    _model.set_state_dict(model_state_dict)
    _model.eval()
    print(f"Loaded encoder {weights_fpath}")
Example #7
from model import SpeakerEncoder
from util import random_batch, get_split_mels, split_audio, test_random_batch
from data_preprocess import get_feature
import librosa
import numpy as np
from train import load_checkpoint
import torch
import matplotlib.pyplot as plt

speaker = 30
utter = 10
speaker_encoder = SpeakerEncoder(40, n=speaker, m=utter)
checkpoint = torch.load(
    '/home/zeng/work/mywork/GE2E/checkpoints/checkpoint_step000112000_ema.pth',
    map_location='cpu')  # load onto CPU; the encoder stays on CPU in this script
audio_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p364/p364_001.wav'
speaker1_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p230/p230_008.wav'
speaker2_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p225/p225_011.wav'
speaker3_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p226/p226_022.wav'
speaker4_path = '/home/zeng/work/data/VCTK-Corpus/wav48/p345/p345_018.wav'
wav_path = '/home/zeng/work/data/VCTK-Corpus/wav48/'
speaker_encoder.load_state_dict(checkpoint["state_dict"])


def test_tisv():
    # Text-independent speaker verification check: 10 speakers x 10 utterances.
    x = random_batch(speaker_num=10, utter_num=10, train=False)
    print(x.shape)
    x = torch.from_numpy(x).float()
    d_vector, sim_matrix = speaker_encoder(x)
    print(d_vector)
    plt.figure(figsize=(8, 4))
    plt.imshow(sim_matrix.data, interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.show()
Example #8
    hparams.parse(args["--hparams"])
    print(hparams_debug_string())

    # Presets
    if hparams.preset is not None and hparams.preset != "":
        preset = hparams.presets[hparams.preset]
        import json

        hparams.parse_json(json.dumps(preset))
        print("Override hyper parameters with preset \"{}\": {}".format(
            hparams.preset, json.dumps(preset, indent=4)))

    os.makedirs(checkpoint_dir, exist_ok=True)

    # Model
    model = SpeakerEncoder(input_size=hparams.num_mels)
    print(model)
    if use_cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=hparams.steps)

    if checkpoint_restore_parts is not None:
        restore_parts(checkpoint_restore_parts, model)

    # Load checkpoints
    if checkpoint_path is not None:
        load_checkpoint(checkpoint_path, model, optimizer, reset_optimizer)

    # Setup summary writer for tensorboard
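load_checkpoint and restore_parts are defined elsewhere in this training script. A minimal sketch of what a load_checkpoint with this signature typically does; the "state_dict" key matches the checkpoint layout seen in Example #7, the other keys are assumptions:

def load_checkpoint(path, model, optimizer, reset_optimizer):
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint["state_dict"])
    if not reset_optimizer and checkpoint.get("optimizer") is not None:
        optimizer.load_state_dict(checkpoint["optimizer"])
    return checkpoint.get("global_step", 0)  # assumed key for resuming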