Example #1
def main(libri_dir=DATASET_DIR):
    libri = read_librispeech_structure(libri_dir)
    batch = stochastic_mini_batch(libri, batch_size=BATCH_NUM_TRIPLETS)
    batch_size = BATCH_NUM_TRIPLETS * 3  # A triplet has 3 parts.
    x, y = batch.to_inputs()
    b = x[0]
    num_frames = b.shape[0]
    print('num_frames = ', num_frames)

    model = convolutional_model(batch_input_shape=[batch_size * num_frames] + list(b.shape[1:]),
                                batch_size=batch_size, num_frames=num_frames)
    model.compile(optimizer='adam', loss=deep_speaker_loss)

    model.summary()
    grad_steps = 0
    orig_time = time()
    while True:
        batch = stochastic_mini_batch(libri, batch_size=BATCH_NUM_TRIPLETS)
        x, _ = batch.to_inputs()

        # x.shape is something like (3, 383, 32, 32, 3),
        # i.e. (batch_size, num_frames, width, height, channels).
        x = np.reshape(x, (batch_size * num_frames, b.shape[1], b.shape[2], b.shape[3]))
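        # e.g. (3, 383, 32, 32, 3) becomes (3 * 383, 32, 32, 3) == (1149, 32, 32, 3)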

        # The real targets y are not needed: by convention the batch is stacked as
        # [anchors, positive examples, negative examples], so the loss can tell from
        # position alone which role each embedding plays (a sketch of such a loss
        # follows this example).
        stub_targets = np.random.uniform(size=(x.shape[0], 1))
        # result = model.predict(x, batch_size=x.shape[0])
        # print(result.shape)
        # np.set_printoptions(precision=2)
        # print(result[0:20, 0:5])
        loss = model.train_on_batch(x, stub_targets)
        print('batch #{0} processed in {1:.2f}s, training loss = {2}.'.format(grad_steps, time() - orig_time, loss))
        grad_steps += 1
        orig_time = time()
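
The comments above rely purely on an ordering convention: each mini-batch is stacked as [anchors, positives, negatives], so the loss can recover the role of every embedding from its position. As a rough illustration, a triplet loss built on that convention could look like the sketch below. This is not the project's deep_speaker_loss; the name triplet_cosine_loss_sketch, the margin ALPHA and the cosine-similarity hinge formulation are assumptions made for illustration only.

import tensorflow as tf

ALPHA = 0.1  # assumed margin, not taken from the project

def triplet_cosine_loss_sketch(y_true, y_pred):
    # y_true is the stub target array and is ignored; only the ordering of y_pred matters.
    # y_pred is assumed to hold L2-normalised embeddings stacked as
    # [anchors, positives, negatives] along axis 0.
    split = tf.shape(y_pred)[0] // 3
    anchors = y_pred[:split]
    positives = y_pred[split:2 * split]
    negatives = y_pred[2 * split:]
    # For unit-length embeddings, cosine similarity reduces to a dot product.
    sap = tf.reduce_sum(anchors * positives, axis=1)  # anchor-positive similarity
    san = tf.reduce_sum(anchors * negatives, axis=1)  # anchor-negative similarity
    # Hinge on the similarity gap: positives should beat negatives by at least ALPHA.
    return tf.reduce_mean(tf.maximum(san - sap + ALPHA, 0.0))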
Example #2
def main(libri_dir=c.DATASET_DIR):
    logging.info('Looking for audio [wav] files in {}.'.format(libri_dir))
    libri = read_librispeech_structure(libri_dir)

    if len(libri) == 0:
        logging.warning(
            'No wav files found. Have you converted the flac files to wav? '
            'If not, run audio/convert_flac_2_wav.sh.'
        )
        exit(1)

    batch = stochastic_mini_batch(libri, batch_size=c.BATCH_NUM_TRIPLETS)
    batch_size = c.BATCH_NUM_TRIPLETS * 3  # A triplet has 3 parts.
    x, y = batch.to_inputs()
    b = x[0]
    num_frames = b.shape[0]
    logging.info('num_frames = {}'.format(num_frames))

    batch_shape = [batch_size * num_frames] + list(b.shape[1:])
    logging.info('batch shape: {}'.format(batch_shape))
    logging.info('batch size: {}'.format(batch_size))
    model = convolutional_model(batch_input_shape=batch_shape,
                                batch_size=batch_size,
                                num_frames=num_frames)
    model.summary(print_fn=logging.info)

    logging.info('Compiling the model...')
    model.compile(optimizer='adam', loss=deep_speaker_loss)
    logging.info('[DONE]')

    grad_steps = 0
    last_checkpoint = get_last_checkpoint_if_any(c.CHECKPOINT_FOLDER)
    if last_checkpoint is not None:
        logging.info('Found checkpoint [{}]. Resume from here...'.format(
            last_checkpoint))
        model.load_weights(last_checkpoint)
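        # Checkpoint names follow the save format used below, e.g. model_120_0.04321.h5,
        # so the second-to-last '_'-separated field is the gradient step counter.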
        grad_steps = int(last_checkpoint.split('_')[-2])
        logging.info('[DONE]')

    logging.info('Starting training...')
    orig_time = time()

    while True:
        grad_steps += 1
        batch = stochastic_mini_batch(libri, batch_size=c.BATCH_NUM_TRIPLETS)
        x, _ = batch.to_inputs()

        # x.shape is something like (3, 383, 32, 32, 3),
        # i.e. (batch_size, num_frames, width, height, channels).
        logging.info('x.shape before reshape: {}'.format(x.shape))
        x = np.reshape(
            x, (batch_size * num_frames, b.shape[1], b.shape[2], b.shape[3]))
        logging.info('x.shape after  reshape: {}'.format(x.shape))

        # The real targets y are not needed: by convention the batch is stacked as
        # [anchors, positive examples, negative examples], so the loss can tell from
        # position alone which role each embedding plays.
        stub_targets = np.random.uniform(size=(x.shape[0], 1))
        # result = model.predict(x, batch_size=x.shape[0])
        # logging.info(result.shape)
        # np.set_printoptions(precision=2)
        # logging.info(result[0:20, 0:5])

        logging.info('-' * 80)
        logging.info('== Presenting batch #{0}'.format(grad_steps))
        logging.info(batch.libri_batch)
        loss = model.train_on_batch(x, stub_targets)
        logging.info(
            '== Processed in {0:.2f}s by the network, training loss = {1}.'.
            format(time() - orig_time, loss))
        orig_time = time()

        # record training loss
        with open(c.LOSS_FILE, "a") as f:
            f.write("{0},{1}\n".format(grad_steps, loss))

        # checkpoints are really heavy so let's just keep the last one.
        create_dir_and_delete_content(c.CHECKPOINT_FOLDER)
        model.save_weights('{0}/model_{1}_{2:.5f}.h5'.format(
            c.CHECKPOINT_FOLDER, grad_steps, loss))
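
For completeness: given the model_<grad_steps>_<loss>.h5 naming scheme used above, the two checkpoint helpers could be implemented roughly as follows. This is an illustrative sketch under that naming assumption, not the project's actual get_last_checkpoint_if_any and create_dir_and_delete_content.

import glob
import os

def get_last_checkpoint_if_any(checkpoint_folder):
    # Return the checkpoint with the highest gradient-step count, or None if there is none.
    checkpoints = glob.glob(os.path.join(checkpoint_folder, 'model_*.h5'))
    if not checkpoints:
        return None
    return max(checkpoints, key=lambda p: int(os.path.basename(p).split('_')[-2]))

def create_dir_and_delete_content(directory):
    # Make sure the checkpoint folder exists, then remove previous checkpoints so that
    # only the one saved immediately after this call is kept.
    os.makedirs(directory, exist_ok=True)
    for path in glob.glob(os.path.join(directory, 'model_*.h5')):
        os.remove(path)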