def test_in_out(self):
    """Check SpeakerEncoder output shapes, tensor type and L2 normalization.

    Runs ``forward``, ``inference`` and ``compute_embedding`` on random
    inputs and asserts the resulting embedding dimensions. ``T`` is
    presumably the file's alias for ``torch`` (it is used as ``T.rand`` and
    ``T.nn.functional.normalize``) -- confirm against the file's imports.
    """
    dummy_input = T.rand(4, 20, 80)  # B x T x D
    model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)

    # computing d vectors
    output = model.forward(dummy_input)
    assert output.shape[0] == 4
    assert output.shape[1] == 256
    output = model.inference(dummy_input)
    assert output.shape[0] == 4
    assert output.shape[1] == 256

    # check normalization: re-normalizing the output must leave it unchanged.
    # Compare with the largest absolute element-wise deviation; a signed sum
    # lets positive and negative errors cancel and can hide a wrong output.
    output_norm = T.nn.functional.normalize(output, dim=1, p=2)
    assert_diff = (output_norm - output).abs().max().item()
    assert output.type() == "torch.FloatTensor"
    assert assert_diff < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"

    # compute d for a given batch
    dummy_input = T.rand(1, 240, 80)  # B x T x D
    output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
    assert output.shape[0] == 1
    assert output.shape[1] == 256
    assert len(output.shape) == 2
def main(args):  # pylint: disable=redefined-outer-name
    """Build the speaker encoder, optionally restore a checkpoint, and train.

    Reads the module-level config ``c`` for audio, model, loss and
    optimizer settings, and stores the loaded metadata in the globals
    ``meta_data_train`` / ``meta_data_eval``.

    Args:
        args: parsed command-line arguments. ``args.restore_path`` is read;
            ``args.restore_step`` is set here (checkpoint step, or 0 when no
            checkpoint is restored).

    Raises:
        Exception: if ``c.loss`` is neither ``"ge2e"`` nor ``"angleproto"``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(
        input_dim=c.model["input_dim"],
        proj_dim=c.model["proj_dim"],
        lstm_dim=c.model["lstm_dim"],
        num_lstm_layers=c.model["num_lstm_layers"],
    )
    optimizer = RAdam(model.parameters(), lr=c.lr)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("The %s  not is a loss supported" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except (KeyError, RuntimeError):
            # RuntimeError must be caught as well: it is what the explicit
            # raise above uses, and what load_state_dict raises on
            # missing/unexpected keys or shape mismatches. Catching only
            # KeyError made this fallback unreachable in those cases.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            # NOTE(review): the whole checkpoint dict is passed here, not
            # checkpoint["model"] -- confirm this is what set_init_dict expects.
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        # The optimizer state is not restored, so enforce the configured lr.
        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    if c.lr_decay:
        # last_epoch is restore_step - 1 so the schedule resumes at the restored step.
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step)
def setup_model(c):
    """Instantiate a SpeakerEncoder from the ``model_params`` entries of ``c``.

    The four values are passed positionally in the order input_dim,
    proj_dim, lstm_dim, num_lstm_layers.
    """
    params = c.model_params
    arg_order = ("input_dim", "proj_dim", "lstm_dim", "num_lstm_layers")
    return SpeakerEncoder(*[params[key] for key in arg_order])
def test_speaker_embedding():
    """Check x-vector computation from a mel spectrogram, one clip and a clip list.

    A dummy speaker encoder checkpoint is written first and loaded back
    through SpeakerManager. The checkpoint file is removed in a ``finally``
    clause so a failing assertion does not leave it on disk.
    """
    # load config
    config = load_config(encoder_config_path)
    config.audio.resample = True

    # create a dummy speaker encoder
    model = SpeakerEncoder(**config.model_params)
    save_checkpoint(model, None, None, get_tests_input_path(), 0)

    try:
        # load audio processor and speaker encoder
        ap = AudioProcessor(**config.audio)
        manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

        # load a sample audio and compute embedding
        waveform = ap.load_wav(sample_wav_path)
        mel = ap.melspectrogram(waveform)
        x_vector = manager.compute_x_vector(mel.T)
        assert x_vector.shape[1] == 256

        # compute x_vector directly from an input file
        x_vector = manager.compute_x_vector_from_clip(sample_wav_path)
        x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path)
        x_vector = torch.FloatTensor(x_vector)
        x_vector2 = torch.FloatTensor(x_vector2)
        assert x_vector.shape[0] == 256
        # the same clip processed twice must give the same embedding
        assert (x_vector - x_vector2).sum() == 0.0

        # compute x_vector from a list of wav files.
        x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2])
        x_vector3 = torch.FloatTensor(x_vector3)
        assert x_vector3.shape[0] == 256
        # adding a second clip must change the resulting embedding
        assert (x_vector - x_vector3).sum() != 0.0
    finally:
        # remove dummy model
        os.remove(encoder_model_path)
#print(f'wav_file: {wav_file}') if os.path.exists(wav_file): wav_files.append(wav_file) print(f'Count of wavs imported: {len(wav_files)}') else: # Parse all wav files in data_path wav_path = data_path wav_files = glob.glob(data_path + '/**/*.wav', recursive=True) output_files = [ wav_file.replace(wav_path, args.output_path).replace('.wav', '.npy') for wav_file in wav_files ] for output_file in output_files: os.makedirs(os.path.dirname(output_file), exist_ok=True) model = SpeakerEncoder(**c.model) model.load_state_dict(torch.load(args.model_path)['model']) model.eval() if args.use_cuda: model.cuda() for idx, wav_file in enumerate(tqdm(wav_files)): mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T mel_spec = torch.FloatTensor(mel_spec[None, :, :]) if args.use_cuda: mel_spec = mel_spec.cuda() embedd = model.compute_embedding(mel_spec) np.save(output_files[idx], embedd.detach().cpu().numpy())
def setup_model(c):
    """Create a SpeakerEncoder configured from the ``model`` section of ``c``.

    Each setting is read from ``c.model`` by key and handed to the
    constructor positionally.
    """
    input_dim = c.model['input_dim']
    proj_dim = c.model['proj_dim']
    lstm_dim = c.model['lstm_dim']
    num_lstm_layers = c.model['num_lstm_layers']
    return SpeakerEncoder(input_dim, proj_dim, lstm_dim, num_lstm_layers)