Example #1
def start_training(working_dir):
    ensures_dir(CHECKPOINTS_MTL_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    logger.info('Started training.')
    kc = KerasFormatConverter(working_dir)

    num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
    logger.info(f'categorical_speakers: {kc.categorical_speakers.speaker_ids}')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False, num_speakers_softmax=num_speakers_softmax)
    base_model = dsm.m
    # Shared trunk followed by two task heads: speaker identification and gender.
    x = base_model.output
    x = Dense(1024, name='shared')(x)
    y = Dense(1024, name='speaker_task')(x)
    speaker_out = Dense(num_speakers_softmax, activation='softmax', name='speaker_pred')(y)
    gender_out = Dense(1, activation='sigmoid', name='gender_pred')(x)
    model = Model(inputs=base_model.input, outputs=[speaker_out, gender_out])

    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
                  metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'})
    training_checkpoint = load_best_checkpoint(CHECKPOINTS_MTL_DIR)
    if training_checkpoint is not None:
        # Checkpoint filenames end with the epoch number, e.g. ..._123.h5.
        initial_epoch = int(training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
        logger.info(f'Initial epoch is {initial_epoch}.')
        logger.info(f'Loading MTL checkpoint: {training_checkpoint}.')
        model.load_weights(training_checkpoint)  # latest one.
    else:
        initial_epoch = 0
    fit_model_mtl(model, kc.kx_train, kc.ky_train, kc.kg_train, kc.kx_test, kc.ky_test, kc.kg_test,
                  initial_epoch=initial_epoch)
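A minimal calling sketch (the directory is a placeholder; CHECKPOINTS_MTL_DIR, logger and fit_model_mtl are assumed to be defined at module level, as in the snippet above):

start_training('/path/to/working_dir')  # expects the keras-inputs to already exist in the working dir.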
Example #2
def start_training(working_dir, pre_training_phase=True):
    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
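A sketch of the two-phase schedule implied by the pre_training_phase flag (the path is a placeholder):

working_dir = '/path/to/working_dir'
start_training(working_dir, pre_training_phase=True)   # softmax pre-training.
start_training(working_dir, pre_training_phase=False)  # triplet-loss training, warm-started from the softmax weights.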
Example #3
 def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAMPLE_RATE, ext='flac'):
     self.ext = ext
     self.cache_dir = os.path.join(cache_dir, 'audio-fbanks')
     ensures_dir(self.cache_dir)
     if audio_dir is not None:
         self.build_cache(os.path.expanduser(audio_dir), sample_rate)
     self.speakers_to_utterances = defaultdict(dict)
     for cache_file in find_files(self.cache_dir, ext='npy'):
         # /path/to/speaker_utterance.npy
         speaker_id, utterance_id = Path(cache_file).stem.split('_')
         self.speakers_to_utterances[speaker_id][utterance_id] = cache_file
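Given the <speaker>_<utterance>.npy cache layout above, the index can be walked directly; a sketch with placeholder paths:

audio = Audio(cache_dir='/path/to/working_dir', audio_dir='/path/to/LibriSpeech')
for speaker_id, utterances in audio.speakers_to_utterances.items():
    print(speaker_id, len(utterances))  # number of cached utterances per speaker.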
Example #4
def make_student_prediction(model):
    students = find_files(AUDIOBASE)
    ensures_dir(PREDICTED_BASE)
    for stud in students:
        # Extract MFCC features and crop/pad them to a fixed number of frames.
        mfcc = sample_from_mfcc(read_mfcc(stud, SAMPLE_RATE), NUM_FRAMES)
        prediction = model.m.predict(np.expand_dims(mfcc, axis=0))
        sp, utt = extract_speaker_and_utterance_ids(stud)
        utt = utt.split('.wav')[-2]  # strip the '.wav' extension.
        filename = os.path.join(PREDICTED_BASE, f'{sp}/{utt}.npy')
        ensure_dir_for_filename(filename)
        np.save(filename, prediction)
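make_student_prediction only needs an object exposing a Keras model as .m, e.g. the DeepSpeakerModel from the other examples; a sketch with a placeholder checkpoint path:

dsm = DeepSpeakerModel([None, NUM_FRAMES, NUM_FBANKS, 1], include_softmax=False)
dsm.m.load_weights('/path/to/checkpoint.h5', by_name=True)
make_student_prediction(dsm)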
Example #5
 def __init__(self, working_dir, load_test_only=False):
     self.working_dir = working_dir
     self.output_dir = os.path.join(self.working_dir, 'keras-inputs')
     ensures_dir(self.output_dir)
     self.categorical_speakers = load_pickle(os.path.join(self.output_dir, 'categorical_speakers.pkl'))  # None on the first run.
     if not load_test_only:
         self.kx_train = load_npy(os.path.join(self.output_dir, 'kx_train.npy'))
         self.ky_train = load_npy(os.path.join(self.output_dir, 'ky_train.npy'))
     self.kx_test = load_npy(os.path.join(self.output_dir, 'kx_test.npy'))
     self.ky_test = load_npy(os.path.join(self.output_dir, 'ky_test.npy'))
     self.audio = Audio(cache_dir=self.working_dir, audio_dir=None)
     if self.categorical_speakers is None:
         self.categorical_speakers = SparseCategoricalSpeakers(self.audio.speaker_ids)
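A short usage sketch, assuming the keras-inputs files were previously written by KerasFormatConverter.generate/persist_to_disk (see Example #6); the path is a placeholder:

kc = KerasFormatConverter('/path/to/working_dir')
print(kc.kx_train.shape, kc.ky_train.shape)      # features and sparse speaker labels.
print(len(kc.categorical_speakers.speaker_ids))  # number of distinct speakers.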
Example #6
def main(args):
    ensures_dir(args.working_dir)

    if args.preprocess:
        if args.audio_dir is None:
            args.audio_dir = os.path.join(args.working_dir, 'LibriSpeech')
        # Building the Audio object fills the filterbank cache as a side effect.
        Audio(cache_dir=args.working_dir, audio_dir=args.audio_dir, sample_rate=args.sample_rate)
    if args.build_keras_inputs:
        counts_per_speaker = [int(b) for b in args.counts_per_speaker.split(',')]
        kc = KerasFormatConverter(args.working_dir)
        kc.generate(max_length=NUM_FRAMES, counts_per_speaker=counts_per_speaker)
        kc.persist_to_disk()

    if args.train_embedding:
        if args.pre_training_phase:
            start_training(args.working_dir, pre_training_phase=args.pre_training_phase, epochs=args.epochs_pretrain)
        start_training(args.working_dir, pre_training_phase=False, epochs=args.epochs_triplet)
    if args.train_classifier:
        start_training(args.working_dir, pre_training_phase=False, classify=True, epochs=args.epochs_classifier)
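main only reads attributes off args, so the argparse wiring can be inferred from the code; a sketch where every flag name comes from the snippet above and all defaults are assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--working_dir', required=True)
parser.add_argument('--preprocess', action='store_true')
parser.add_argument('--audio_dir', default=None)
parser.add_argument('--sample_rate', type=int, default=16000)
parser.add_argument('--build_keras_inputs', action='store_true')
parser.add_argument('--counts_per_speaker', default='600,100')
parser.add_argument('--train_embedding', action='store_true')
parser.add_argument('--pre_training_phase', action='store_true')
parser.add_argument('--epochs_pretrain', type=int, default=50)
parser.add_argument('--epochs_triplet', type=int, default=50)
parser.add_argument('--train_classifier', action='store_true')
parser.add_argument('--epochs_classifier', type=int, default=50)
main(parser.parse_args())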
Example #7
def build_audio_cache(working_dir, audio_dir, sample_rate):
    ensures_dir(working_dir)
    if audio_dir is None:
        audio_dir = os.path.join(working_dir, 'LibriSpeech')
    # Instantiating Audio builds the on-disk filterbank cache as a side effect.
    Audio(cache_dir=working_dir, audio_dir=audio_dir, sample_rate=sample_rate)
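Example call (placeholder path; the 16 kHz rate is an assumption): passing audio_dir=None falls back to <working_dir>/LibriSpeech, as shown above.

build_audio_cache('/path/to/working_dir', audio_dir=None, sample_rate=16000)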