def inference_embeddings(audio_reader, speaker_id):
    """Compute and print the mean embedding of one unseen speaker.

    Args:
        audio_reader: project audio reader used for feature extraction.
        speaker_id: identifier of the target (unseen) speaker.

    Side effects: prints the model summary (via load_model), logs the
    per-utterance embedding shape, and prints the averaged embedding.
    """
    speaker_feat = generate_features_for_unseen_speakers(audio_reader, target_speaker=speaker_id)
    # Reuse the shared build-and-restore path instead of duplicating the
    # model construction / checkpoint loading code (was copy-pasted from
    # load_model).
    m = load_model()
    # predict() returns [embeddings, softmax]; index 0 keeps the embeddings.
    emb_sp1 = m.predict(np.vstack(speaker_feat))[0]
    logger.info('Emb1.shape = {}'.format(emb_sp1.shape))
    np.set_printoptions(suppress=True)
    # Average the per-utterance embeddings into one speaker-level embedding.
    emb1 = np.mean(emb_sp1, axis=0)
    print('*' * 80)
    print(emb1)
    print('*' * 80)
def inference_unseen_speakers(audio_reader, sp1, sp2):
    """Log the cosine distance between two unseen speakers' mean embeddings.

    Args:
        audio_reader: project audio reader used for feature extraction.
        sp1: identifier of the first (unseen) speaker.
        sp2: identifier of the second (unseen) speaker.

    Side effects: prints the model summary (via load_model) and logs the
    embedding norms, shapes and the final cosine distance.
    """
    sp1_feat = generate_features_for_unseen_speakers(audio_reader, target_speaker=sp1)
    sp2_feat = generate_features_for_unseen_speakers(audio_reader, target_speaker=sp2)
    # Reuse the shared build-and-restore path instead of duplicating the
    # model construction / checkpoint loading code (was copy-pasted from
    # load_model).
    m = load_model()
    # predict() returns [embeddings, softmax]; index 0 keeps the embeddings.
    emb_sp1 = m.predict(np.vstack(sp1_feat))[0]
    emb_sp2 = m.predict(np.vstack(sp2_feat))[0]
    logger.info('Checking that L2 norm is 1.')
    logger.info(np.mean(np.linalg.norm(emb_sp1, axis=1)))
    logger.info(np.mean(np.linalg.norm(emb_sp2, axis=1)))
    from scipy.spatial.distance import cosine
    # NOTE: embeddings are sigmoid-ed, so each component lies in (0, 1);
    # a hypersphere would be defined on tanh outputs instead.
    logger.info('Emb1.shape = {}'.format(emb_sp1.shape))
    logger.info('Emb2.shape = {}'.format(emb_sp2.shape))
    emb1 = np.mean(emb_sp1, axis=0)
    emb2 = np.mean(emb_sp2, axis=0)
    logger.info('Cosine = {}'.format(cosine(emb1, emb2)))
def load_model():
    """Build the triplet-softmax model and restore the latest checkpoint.

    Returns:
        The Keras model, with weights restored from the most recent
        'checkpoints/*.h5' file when one exists; otherwise the model is
        returned with freshly initialized weights.

    Side effects: prints the model summary and logs the checkpoint/epoch
    being restored.
    """
    m = triplet_softmax_model(
        num_speakers_softmax=len(c.AUDIO.SPEAKERS_TRAINING_SET),
        emb_trainable=False,
        normalize_embeddings=True,
        batch_size=None)  # batch_size=None => variable batch size for inference.
    checkpoints = natsorted(glob('checkpoints/*.h5'))
    # summary() prints itself and returns None; wrapping it in print() would
    # emit a spurious 'None' line.
    m.summary()
    if checkpoints:
        checkpoint_file = checkpoints[-1]  # natsorted => last is the latest.
        # Checkpoint files are named ..._<epoch>.h5; parse the epoch with
        # os.path so Windows backslash paths work too (split('/') does not).
        import os
        stem = os.path.splitext(os.path.basename(checkpoint_file))[0]
        initial_epoch = int(stem.split('_')[-1])
        logger.info('Initial epoch is {}.'.format(initial_epoch))
        logger.info('Loading checkpoint: {}.'.format(checkpoint_file))
        m.load_weights(checkpoint_file)  # latest one.
    return m