Esempio n. 1
0
def inference_embeddings(audio_reader, speaker_id):
    """Print the mean embedding of one speaker.

    Builds the triplet-softmax model with inference settings, loads the
    last ``checkpoints/*.h5`` file (natural sort order) when one exists,
    runs the model on the speaker's features and prints the prediction
    averaged over its first axis.

    Args:
        audio_reader: Forwarded to
            ``generate_features_for_unseen_speakers``.
        speaker_id: Forwarded to the same helper as ``target_speaker``.

    Returns:
        None. The averaged embedding is only printed to stdout.
    """
    speaker_feat = generate_features_for_unseen_speakers(
        audio_reader, target_speaker=speaker_id)

    # batch_size => None (for inference).
    m = triplet_softmax_model(
        num_speakers_softmax=len(c.AUDIO.SPEAKERS_TRAINING_SET),
        emb_trainable=False,
        normalize_embeddings=True,
        batch_size=None)

    checkpoints = natsorted(glob('checkpoints/*.h5'))

    # NOTE: Keras Model.summary() prints the table itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    m.summary()

    if checkpoints:
        checkpoint_file = checkpoints[-1]  # latest one.
        # The epoch is the token after the last '_' in the file stem; it is
        # only used for logging here.
        file_stem = checkpoint_file.split('/')[-1].split('.')[0]
        initial_epoch = int(file_stem.split('_')[-1])
        logger.info('Initial epoch is {}.'.format(initial_epoch))
        logger.info('Loading checkpoint: {}.'.format(checkpoint_file))
        m.load_weights(checkpoint_file)
    else:
        # Make the fallback visible instead of silently predicting with
        # weights that were never loaded from disk.
        logger.warning('No checkpoint found in checkpoints/. '
                       'Predicting with the initial model weights.')

    # Only the first element of the prediction is kept; presumably the
    # embedding output of a multi-output model -- TODO confirm.
    emb_sp1 = m.predict(np.vstack(speaker_feat))[0]

    logger.info('Emb1.shape = {}'.format(emb_sp1.shape))

    # Changes numpy's global print options (no scientific notation for
    # small values); this persists after the function returns.
    np.set_printoptions(suppress=True)
    emb1 = np.mean(emb_sp1, axis=0)

    print('*' * 80)
    print(emb1)
    print('*' * 80)
Esempio n. 2
0
def inference_unseen_speakers(audio_reader, sp1, sp2):
    """Log the cosine distance between the mean embeddings of two speakers.

    Builds the triplet-softmax model with inference settings, loads the
    last ``checkpoints/*.h5`` file (natural sort order) when one exists,
    predicts on each speaker's features, averages each prediction over its
    first axis and logs ``scipy.spatial.distance.cosine`` of the two means.

    Note that SciPy's ``cosine`` is a *distance* (1 - cosine similarity),
    so smaller values mean more similar embeddings.

    Args:
        audio_reader: Forwarded to
            ``generate_features_for_unseen_speakers``.
        sp1: First speaker, forwarded as ``target_speaker``.
        sp2: Second speaker, forwarded as ``target_speaker``.

    Returns:
        None. All results are written to the logger.
    """
    from scipy.spatial.distance import cosine

    sp1_feat = generate_features_for_unseen_speakers(audio_reader,
                                                     target_speaker=sp1)
    sp2_feat = generate_features_for_unseen_speakers(audio_reader,
                                                     target_speaker=sp2)

    # batch_size => None (for inference).
    m = triplet_softmax_model(
        num_speakers_softmax=len(c.AUDIO.SPEAKERS_TRAINING_SET),
        emb_trainable=False,
        normalize_embeddings=True,
        batch_size=None)

    checkpoints = natsorted(glob('checkpoints/*.h5'))

    # NOTE: Keras Model.summary() prints the table itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    m.summary()

    if checkpoints:
        checkpoint_file = checkpoints[-1]  # latest one.
        # The epoch is the token after the last '_' in the file stem; it is
        # only used for logging here.
        file_stem = checkpoint_file.split('/')[-1].split('.')[0]
        initial_epoch = int(file_stem.split('_')[-1])
        logger.info('Initial epoch is {}.'.format(initial_epoch))
        logger.info('Loading checkpoint: {}.'.format(checkpoint_file))
        m.load_weights(checkpoint_file)
    else:
        # Make the fallback visible instead of silently predicting with
        # weights that were never loaded from disk.
        logger.warning('No checkpoint found in checkpoints/. '
                       'Predicting with the initial model weights.')

    # Only the first element of each prediction is kept; presumably the
    # embedding output of a multi-output model -- TODO confirm.
    emb_sp1 = m.predict(np.vstack(sp1_feat))[0]
    emb_sp2 = m.predict(np.vstack(sp2_feat))[0]

    # Sanity check: mean per-row L2 norm, expected to be 1 because the
    # model was built with normalize_embeddings=True.
    logger.info('Checking that L2 norm is 1.')
    logger.info(np.mean(np.linalg.norm(emb_sp1, axis=1)))
    logger.info(np.mean(np.linalg.norm(emb_sp2, axis=1)))

    # Original author's note to self (not verifiable from this function):
    # embeddings are sigmoid-ed, so they are between 0 and 1.
    # A hypersphere is defined on tanh.

    logger.info('Emb1.shape = {}'.format(emb_sp1.shape))
    logger.info('Emb2.shape = {}'.format(emb_sp2.shape))

    emb1 = np.mean(emb_sp1, axis=0)
    emb2 = np.mean(emb_sp2, axis=0)

    logger.info('Cosine = {}'.format(cosine(emb1, emb2)))
Esempio n. 3
0
def load_model():
    """Build the inference model and load the latest checkpoint, if any.

    The model is created with ``emb_trainable=False``,
    ``normalize_embeddings=True`` and ``batch_size=None``. Weights are
    loaded from the last ``checkpoints/*.h5`` file in natural sort order;
    when no such file exists a warning is logged and the model is returned
    as built.

    Returns:
        The model object returned by ``triplet_softmax_model``.
    """
    m = triplet_softmax_model(
        num_speakers_softmax=len(c.AUDIO.SPEAKERS_TRAINING_SET),
        emb_trainable=False,
        normalize_embeddings=True,
        batch_size=None)

    checkpoints = natsorted(glob('checkpoints/*.h5'))

    # NOTE: Keras Model.summary() prints the table itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    m.summary()

    if checkpoints:
        checkpoint_file = checkpoints[-1]  # latest one.
        # The epoch is the token after the last '_' in the file stem; it is
        # only used for logging here.
        file_stem = checkpoint_file.split('/')[-1].split('.')[0]
        initial_epoch = int(file_stem.split('_')[-1])
        logger.info('Initial epoch is {}.'.format(initial_epoch))
        logger.info('Loading checkpoint: {}.'.format(checkpoint_file))
        m.load_weights(checkpoint_file)
    else:
        # Make the fallback visible: callers would otherwise receive a
        # model whose weights were never loaded from disk.
        logger.warning('No checkpoint found in checkpoints/. '
                       'Returning the model with its initial weights.')
    return m