def start_training(working_dir):
    """Train the two-headed (speaker + gender) model on the Keras inputs of `working_dir`.

    A `DeepSpeakerModel` is built without its softmax head and its output is
    fed through a shared `Dense(1024)` layer. Two heads branch off that layer:

    * ``speaker_pred`` -- an extra `Dense(1024)` followed by a softmax over
      `num_speakers_softmax` classes;
    * ``gender_pred`` -- a single sigmoid unit.

    If `load_best_checkpoint(CHECKPOINTS_MTL_DIR)` returns a checkpoint, its
    weights are loaded and training resumes from the epoch number parsed out
    of the checkpoint file name; otherwise training starts at epoch 0.

    :param working_dir: directory handed to `KerasFormatConverter`, which
        supplies the train/test arrays (`kx_*`, `ky_*`, `kg_*`) used here.
    """
    ensures_dir(CHECKPOINTS_MTL_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    logger.info('Started training.')

    kc = KerasFormatConverter(working_dir)
    num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
    logger.info(f'categorical_speakers: {kc.categorical_speakers.speaker_ids}')

    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False,
                           num_speakers_softmax=num_speakers_softmax)
    base_model = dsm.m

    # Shared trunk on top of the base model output; both heads read from it.
    x = base_model.output
    x = Dense(1024, name='shared')(x)

    # Speaker head gets one additional task-specific layer before its softmax.
    y = Dense(1024, name='speaker_task')(x)
    speaker_out = Dense(num_speakers_softmax, activation='softmax', name='speaker_pred')(y)
    # Gender head is attached directly to the shared layer.
    gender_out = Dense(1, activation='sigmoid', name='gender_pred')(x)

    model = Model(inputs=base_model.input, outputs=[speaker_out, gender_out])
    # The loss list is positional: it follows the order of `outputs` above
    # (speaker first, gender second).
    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
                  metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'})

    training_checkpoint = load_best_checkpoint(CHECKPOINTS_MTL_DIR)
    if training_checkpoint is not None:
        # The epoch is the text after the last '_' of the file name, taken
        # before the first '.', e.g. '<dir>/<prefix>_<epoch>.<ext>'.
        initial_epoch = int(training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
        logger.info(f'Initial epoch is {initial_epoch}.')
        # NOTE(review): the message says "softmax" although the checkpoint
        # comes from CHECKPOINTS_MTL_DIR -- text kept as-is, confirm intent.
        logger.info(f'Loading softmax checkpoint: {training_checkpoint}.')
        model.load_weights(training_checkpoint)  # latest one.
    else:
        initial_epoch = 0

    fit_model_mtl(model, kc.kx_train, kc.ky_train, kc.kg_train,
                  kc.kx_test, kc.ky_test, kc.kg_test,
                  initial_epoch=initial_epoch)
def start_training(working_dir, pre_training_phase=True):
    """Run one training phase of the speaker model.

    When `pre_training_phase` is truthy, the softmax variant of
    `DeepSpeakerModel` is compiled with Adam and sparse categorical
    cross-entropy and trained through `fit_model_softmax`. It resumes from the
    checkpoint returned by `load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)`
    when there is one.

    Otherwise the model is built without the softmax head, compiled with
    `SGD()` and `deep_speaker_loss`, and trained through `fit_model`. Weights
    come from a triplet checkpoint if one exists, else from a softmax
    checkpoint (loaded by layer name), else the model starts untouched.

    :param working_dir: project working directory passed on to the data
        loaders / training routine.
    :param pre_training_phase: selects the softmax phase (truthy) or the
        triplet phase (falsy).
    """
    for checkpoint_dir in (CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR):
        ensures_dir(checkpoint_dir)
    input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]

    if not pre_training_phase:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(input_shape, include_softmax=False)
        triplet_ckpt = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        softmax_ckpt = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_ckpt is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_ckpt}.')
            dsm.m.load_weights(triplet_ckpt)
        elif softmax_ckpt is not None:
            logger.info(f'Loading pre-training checkpoint: {softmax_ckpt}.')
            # by_name=True restricts loading to layers whose names match,
            # which is what allows reusing softmax-phase weights in a model
            # whose layer set differs (no softmax head here).
            dsm.m.load_weights(softmax_ckpt, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
        return

    logger.info('Softmax pre-training.')
    kc = KerasFormatConverter(working_dir)
    speaker_count = len(kc.categorical_speakers.speaker_ids)
    dsm = DeepSpeakerModel(input_shape, include_softmax=True, num_speakers_softmax=speaker_count)
    dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    start_epoch = 0
    softmax_ckpt = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
    if softmax_ckpt is not None:
        # File name -> part before the first '.' -> token after the last '_'.
        file_name = softmax_ckpt.split('/')[-1]
        name_without_ext = file_name.split('.')[0]
        start_epoch = int(name_without_ext.split('_')[-1])
        logger.info(f'Initial epoch is {start_epoch}.')
        logger.info(f'Loading softmax checkpoint: {softmax_ckpt}.')
        dsm.m.load_weights(softmax_ckpt)  # resume from it.

    fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test,
                      initial_epoch=start_epoch)
def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAMPLE_RATE, ext='flac'):
    """Set up the feature cache and index the cached files by speaker.

    The cache lives in ``<cache_dir>/audio-fbanks`` and is created if needed.
    When `audio_dir` is given, `build_cache` is run on it (with ``~``
    expanded) before indexing. Afterwards every ``.npy`` file found in the
    cache directory is registered in `speakers_to_utterances`, a mapping
    ``speaker_id -> {utterance_id: cache_file}``.

    :param cache_dir: parent directory of the ``audio-fbanks`` cache folder.
    :param audio_dir: optional directory of raw audio to cache first.
    :param sample_rate: sample rate forwarded to `build_cache`.
    :param ext: audio file extension, stored on the instance as `self.ext`.
    :raises ValueError: if a cached file stem does not split into exactly
        two parts on ``'_'``.
    """
    self.ext = ext
    self.cache_dir = os.path.join(cache_dir, 'audio-fbanks')
    ensures_dir(self.cache_dir)

    if audio_dir is not None:
        expanded_audio_dir = os.path.expanduser(audio_dir)
        self.build_cache(expanded_audio_dir, sample_rate)

    self.speakers_to_utterances = defaultdict(dict)
    for npy_path in find_files(self.cache_dir, ext='npy'):
        # Cached files are named /path/to/<speaker>_<utterance>.npy
        stem_parts = Path(npy_path).stem.split('_')
        speaker_id, utterance_id = stem_parts
        utterances_of_speaker = self.speakers_to_utterances[speaker_id]
        utterances_of_speaker[utterance_id] = npy_path
def make_student_prediction(model):
    """Run `model` on every file found under AUDIOBASE and save each output.

    For each file the features from `read_mfcc` are passed through
    `sample_from_mfcc` with NUM_FRAMES, a leading axis is added, and
    `model.m.predict` is called on the result. The prediction is written with
    `np.save` to ``PREDICTED_BASE/<speaker>/<utterance>.npy``.

    :param model: object exposing the Keras model as `model.m`.
    """
    audio_paths = find_files(AUDIOBASE)
    ensures_dir(PREDICTED_BASE)

    for audio_path in audio_paths:
        ensure_dir_for_filename(audio_path)

        features = read_mfcc(audio_path, SAMPLE_RATE)
        sample = sample_from_mfcc(features, NUM_FRAMES)
        batch = np.expand_dims(sample, axis=0)  # prepend an axis of length 1
        prediction = model.m.predict(batch)

        speaker_id, utterance_id = extract_speaker_and_utterance_ids(audio_path)
        # Keep the piece that precedes the last '.wav' occurrence.
        utterance_id = utterance_id.split('.wav')[-2]

        out_path = os.path.join(PREDICTED_BASE, f'{speaker_id}/{utterance_id}.npy')
        ensure_dir_for_filename(out_path)
        np.save(out_path, prediction)
def __init__(self, working_dir, load_test_only=False):
    """Load the persisted Keras inputs stored under ``<working_dir>/keras-inputs``.

    The output directory is created if needed. `categorical_speakers` is read
    from ``categorical_speakers.pkl``; the test arrays are always loaded and
    the train arrays only when `load_test_only` is falsy. An `Audio` object
    is then created on `working_dir` (without an audio directory), and if no
    pickled speakers were obtained a `SparseCategoricalSpeakers` is built
    from `self.audio.speaker_ids`.

    :param working_dir: project working directory.
    :param load_test_only: when truthy, `kx_train` / `ky_train` are not
        loaded (and the attributes are not set).
    """

    def in_output_dir(file_name):
        # Path of `file_name` inside the keras-inputs directory.
        return os.path.join(self.output_dir, file_name)

    self.working_dir = working_dir
    self.output_dir = os.path.join(self.working_dir, 'keras-inputs')
    ensures_dir(self.output_dir)

    self.categorical_speakers = load_pickle(in_output_dir('categorical_speakers.pkl'))

    if not load_test_only:
        self.kx_train = load_npy(in_output_dir('kx_train.npy'))
        self.ky_train = load_npy(in_output_dir('ky_train.npy'))
    self.kx_test = load_npy(in_output_dir('kx_test.npy'))
    self.ky_test = load_npy(in_output_dir('ky_test.npy'))

    self.audio = Audio(cache_dir=self.working_dir, audio_dir=None)

    # Fall back to speakers derived from the audio cache when nothing was
    # loaded from the pickle.
    if self.categorical_speakers is None:
        self.categorical_speakers = SparseCategoricalSpeakers(self.audio.speaker_ids)
def main(args):
    """Run the pipeline stages selected by the flags on `args`.

    Stages, in order, each guarded by its own flag:

    * ``preprocess`` -- build the `Audio` cache from `args.audio_dir`; if
      that directory is None the function returns immediately and no later
      stage runs.
    * ``build_keras_inputs`` -- generate and persist Keras inputs using the
      comma-separated integers in `args.counts_per_speaker`.
    * ``train_embedding`` -- optional pre-training call (when
      `args.pre_training_phase` is truthy) followed by a call with
      ``pre_training_phase=False``.
    * ``train_classifier`` -- a call with ``classify=True``.

    :param args: parsed command-line namespace carrying the flags and
        settings read above.
    """
    working_dir = args.working_dir
    ensures_dir(working_dir)

    if args.preprocess:
        if args.audio_dir is None:
            return
        Audio(cache_dir=working_dir, audio_dir=args.audio_dir, sample_rate=args.sample_rate)

    if args.build_keras_inputs:
        counts = [int(token) for token in args.counts_per_speaker.split(',')]
        converter = KerasFormatConverter(working_dir)
        converter.generate(max_length=NUM_FRAMES, counts_per_speaker=counts)
        converter.persist_to_disk()

    if args.train_embedding:
        pre_training = args.pre_training_phase
        if pre_training:
            start_training(working_dir, pre_training_phase=pre_training, epochs=args.epochs_pretrain)
        start_training(working_dir, pre_training_phase=False, epochs=args.epochs_triplet)

    if args.train_classifier:
        start_training(working_dir, pre_training_phase=False, classify=True,
                       epochs=args.epochs_classifier)
def build_audio_cache(working_dir, audio_dir, sample_rate):
    """Create `working_dir` if needed and build the `Audio` cache in it.

    :param working_dir: directory used as the `Audio` cache directory.
    :param audio_dir: directory of source audio; when None,
        ``<working_dir>/LibriSpeech`` is used instead.
    :param sample_rate: sample rate forwarded to `Audio`.
    """
    ensures_dir(working_dir)
    source_dir = audio_dir if audio_dir is not None else os.path.join(working_dir, 'LibriSpeech')
    Audio(cache_dir=working_dir, audio_dir=source_dir, sample_rate=sample_rate)