def create_net(self):
    from .vgg_thin import resnet_2D_v1
    import keras
    import wandb
    from keras import backend
    from keras.layers import Conv2D, AveragePooling2D, Flatten
    from keras.models import Model, Sequential

    bottleneck_dim = 512
    l2_regularization = 0.01

    # Thin ResNet34 front-end: returns the input tensor and its output feature map
    inputs, x = resnet_2D_v1(self.input, mode='train')

    backend.set_image_data_format('channels_first')

    # Bottleneck convolution (defined here, but the pooling path below continues from x, not x_fc)
    x_fc = Conv2D(bottleneck_dim, (7, 1),
                  strides=(1, 1),
                  activation='relu',
                  kernel_initializer='orthogonal',
                  use_bias=True, trainable=True,
                  padding='same',
                  kernel_regularizer=keras.regularizers.l2(l2_regularization),
                  bias_regularizer=keras.regularizers.l2(l2_regularization),
                  name='x_fc')(x)

    # Average pooling, flatten and fully connected bottleneck layer
    x = AveragePooling2D((1, 5), strides=(1, 1), name='avg_pool')(x)
    x = Flatten()(x)
    x = keras.layers.Dense(bottleneck_dim, activation='relu',
                           kernel_initializer='orthogonal',
                           use_bias=True, trainable=True,
                           kernel_regularizer=keras.regularizers.l2(l2_regularization),
                           bias_regularizer=keras.regularizers.l2(l2_regularization),
                           name='fc6')(x)

    # Final (embedding/classification) layers from the shared helper
    dense_model = Sequential()
    add_final_layers(dense_model, self.config)
    x = dense_model(x)

    model = Model(inputs, x)

    adam = keras.optimizers.Adam(
        lr=wandb.config.learning_rate,  # 0.0001 @ VGG
        beta_1=wandb.config.beta_1,
        beta_2=wandb.config.beta_2,
        epsilon=wandb.config.epsilon,
        decay=wandb.config.decay
    )

    loss_function = get_loss(self.config)
    model.compile(loss=loss_function, optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model

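
# Hypothetical usage sketch (not part of the original code): the Adam parameters above are read
# from wandb.config, so a Weights & Biases run has to be initialised with those keys before
# create_net() is called. The key names come from the code; the project name and values below
# are placeholders, not the project's actual settings.
import wandb

wandb.init(project='speaker-embeddings',   # hypothetical project name
           config={'learning_rate': 1e-4,  # "0.0001 @ VGG" per the comment in create_net()
                   'beta_1': 0.9,
                   'beta_2': 0.999,
                   'epsilon': 1e-8,
                   'decay': 0.0})
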
def create_net(self):
    model = Sequential()

    # Stacked bidirectional LSTM front-end
    model.add(Bidirectional(LSTM(self.n_hidden1, return_sequences=True), input_shape=self.input))
    model.add(Dropout(0.50))
    model.add(Bidirectional(LSTM(self.n_hidden2)))

    # Dense classification back-end, sized relative to the number of speakers
    model.add(Dense(self.n_speakers * 10))
    model.add(Dropout(0.25))
    model.add(Dense(self.n_speakers * 5))
    add_final_layers(model, self.config)

    loss_function = get_loss(self.config)
    adam = keras.optimizers.Adam(self.adam_lr, self.adam_beta_1, self.adam_beta_2,
                                 self.adam_epsilon, self.adam_decay)
    model.compile(loss=loss_function, optimizer=adam, metrics=['accuracy'])
    return model

def create_embeddings(config, checkpoints, x_list, y_list, out_layer=7, seg_size=100):
    # Prepare return values
    set_of_embeddings = []
    set_of_speakers = []
    set_of_num_embeddings = []
    set_of_total_times = []

    # Values that stay constant across the loop
    metrics = ['accuracy']
    loss = get_loss(config)
    custom_objects = get_custom_objects(config)
    optimizer = 'adadelta'

    for checkpoint in checkpoints:
        logger.info('Run checkpoint: ' + checkpoint)

        # Load and compile the trained network
        network_file = get_experiment_nets(checkpoint)
        model_full = load_model(network_file, custom_objects=custom_objects)
        model_full.compile(loss=loss, optimizer=optimizer, metrics=metrics)

        # Get a model with the embedding layer as output and predict
        model_partial = Model(inputs=model_full.input, outputs=model_full.layers[out_layer].output)

        x_cluster_list = []
        y_cluster_list = []
        for x, y in zip(x_list, y_list):
            x_cluster = np.asarray(model_partial.predict(x))
            x_cluster_list.append(x_cluster)
            y_cluster_list.append(y)

        embeddings, speakers, num_embeddings = \
            generate_embeddings(x_cluster_list, y_cluster_list, x_cluster_list[0].shape[1])

        # Fill return values
        set_of_embeddings.append(embeddings)
        set_of_speakers.append(speakers)
        set_of_num_embeddings.append(num_embeddings)

        # Calculate the time per utterance
        time = TimeCalculator.calc_time_all_utterances(y_cluster_list, seg_size)
        set_of_total_times.append(time)

    return checkpoints, set_of_embeddings, set_of_speakers, set_of_num_embeddings, set_of_total_times

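
# Hypothetical call sketch (not part of the original code): create_embeddings() expects one entry
# in x_list/y_list per data partition (e.g. train and test segments plus their speaker labels) and
# one checkpoint name per trained network; out_layer selects which layer of the loaded model is
# taken as the embedding output. All variable and file names below are placeholders.
#
# checkpoints = ['pairwise_kldiv_best.h5']          # resolved to a path via get_experiment_nets()
# x_list = [x_train_segments, x_test_segments]      # np.ndarray of spectrogram segments each
# y_list = [y_train_speakers, y_test_speakers]      # matching speaker labels
#
# checkpoints, embeddings, speakers, num_embeddings, times = \
#     create_embeddings(config, checkpoints, x_list, y_list, out_layer=7, seg_size=100)
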
def create_network_n_speakers(num_speakers, config):
    # Read parameters from config
    seg_size = config.getint('pairwise_kldiv', 'seg_size')
    spectrogram_height = config.getint('pairwise_kldiv', 'spectrogram_height')
    lr = config.getfloat('pairwise_kldiv', 'adadelta_learning_rate')
    rho = config.getfloat('pairwise_kldiv', 'adadelta_rho')
    epsilon = config.getfloat('pairwise_kldiv', 'adadelta_epsilon')

    # Initialize model
    model = Sequential()

    # Convolution layer 1
    model.add(Conv2D(filters=32, kernel_size=(4, 4), activation='relu',
                     input_shape=(1, seg_size, spectrogram_height),
                     data_format='channels_first'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2, 2)))

    # Convolution layer 2
    model.add(Conv2D(filters=64, kernel_size=(4, 4), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2, 2)))

    # Dense layers
    model.add(Flatten())
    model.add(Dense(units=(num_speakers * 10), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(rate=0.5))
    model.add(Dense(units=(num_speakers * 5), activation='relu'))
    add_final_layers(model, config)

    loss_function = get_loss(config)

    # Create optimizer
    adadelta = Adadelta(lr=lr, rho=rho, epsilon=epsilon, decay=0.0)

    # Compile model
    model.compile(loss=loss_function, optimizer=adadelta, metrics=['accuracy'])
    return model

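
# Hypothetical config sketch (not part of the original code): create_network_n_speakers() reads
# its hyperparameters from a ConfigParser section named 'pairwise_kldiv'. The option names are
# taken from the code above; the values below are placeholders, not the project's actual settings.
from configparser import ConfigParser

config = ConfigParser()
config['pairwise_kldiv'] = {
    'seg_size': '100',                  # spectrogram frames per segment (placeholder)
    'spectrogram_height': '128',        # frequency bins per frame (placeholder)
    'adadelta_learning_rate': '1.0',    # Adadelta defaults, assumed for illustration
    'adadelta_rho': '0.95',
    'adadelta_epsilon': '1e-8',
}

# model = create_network_n_speakers(num_speakers=100, config=config)
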
def create_net__classification_component(self, model):
    # Dense classification layers on top of the recurrent front-end
    model.add(Dense(self.config.getint('pairwise_lstm', 'n_dense1')))
    model.add(Dropout(0.25))
    model.add(Dense(self.config.getint('pairwise_lstm', 'n_dense2')))

    # This adds the final (Dense) layer
    add_final_layers(model, self.config)

    loss_function = get_loss(self.config)
    adam = keras.optimizers.Adam(
        lr=self.config.getfloat('pairwise_lstm', 'adam_lr'),
        beta_1=self.config.getfloat('pairwise_lstm', 'adam_beta_1'),
        beta_2=self.config.getfloat('pairwise_lstm', 'adam_beta_2'),
        epsilon=self.config.getfloat('pairwise_lstm', 'adam_epsilon'),
        decay=self.config.getfloat('pairwise_lstm', 'adam_decay'))

    return model, loss_function, adam

def create_net(self):
    model = Sequential()

    # Stacked bidirectional LSTM front-end
    model.add(Bidirectional(LSTM(self.n_hidden1, return_sequences=True), input_shape=self.input))
    model.add(Dropout(0.50))
    model.add(Bidirectional(LSTM(self.n_hidden2)))

    # Dense classification back-end, sized relative to the dense factor
    model.add(Dense(self.dense_factor * 10))
    model.add(Dropout(0.25))
    model.add(Dense(self.dense_factor * 5))
    add_final_layers(model, self.config)

    loss_function = get_loss(self.config)
    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss=loss_function, optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model

def get_embeddings(self):
    short_utterance = self.config.getboolean('test', 'short_utterances')
    out_layer = self.config.getint('pairwise_lstm', 'out_layer')
    seg_size = self.config.getint('pairwise_lstm', 'seg_size')
    vec_size = self.config.getint('pairwise_lstm', 'vec_size')

    logger = get_logger('lstm', logging.INFO)
    logger.info('Run pairwise_lstm test')
    logger.info('out_layer -> ' + str(out_layer))
    logger.info('seg_size -> ' + str(seg_size))
    logger.info('vec_size -> ' + str(vec_size))

    # Load and prepare train/test data
    x_train, speakers_train, s_list_train = load_test_data(self.get_validation_train_data())
    x_test, speakers_test, s_list_test = load_test_data(self.get_validation_test_data())
    x_train, speakers_train = self.prepare_data(x_train, speakers_train)
    x_test, speakers_test = self.prepare_data(x_test, speakers_test)

    x_list, y_list, s_list = create_data_lists(short_utterance, x_train, x_test,
                                               speakers_train, speakers_test,
                                               s_list_train, s_list_test)

    # Prepare return values
    set_of_embeddings = []
    set_of_speakers = []
    speaker_numbers = []
    set_of_total_times = []

    if self.best:
        file_regex = self.name + r".*_best\.h5"
    else:
        file_regex = self.name + r".*\.h5"

    checkpoints = list_all_files(get_experiment_nets(), file_regex)

    # Values that stay constant across the loop
    metrics = ['accuracy', 'categorical_accuracy']
    loss = get_loss(self.config)
    custom_objects = get_custom_objects(self.config)
    optimizer = 'rmsprop'
    vector_size = vec_size

    # Fill return values
    for checkpoint in checkpoints:
        logger.info('Running checkpoint: ' + checkpoint)

        # Load and compile the trained network
        network_file = get_experiment_nets(checkpoint)
        model_full = load_model(network_file, custom_objects=custom_objects)
        model_full.compile(loss=loss, optimizer=optimizer, metrics=metrics)

        # Get a model with the embedding layer as output and predict
        model_partial = Model(inputs=model_full.input, outputs=model_full.layers[out_layer].output)

        x_cluster_list = []
        y_cluster_list = []
        for x, y, s in zip(x_list, y_list, s_list):
            x_cluster = np.asarray(model_partial.predict(x))
            x_cluster_list.append(x_cluster)
            y_cluster_list.append(y)

        embeddings, speakers, num_embeddings = generate_embeddings(
            x_cluster_list, y_cluster_list, vector_size)

        # Fill the embeddings and speakers into the arrays
        set_of_embeddings.append(embeddings)
        set_of_speakers.append(speakers)
        speaker_numbers.append(num_embeddings)

        # Calculate the time per utterance
        time = TimeCalculator.calc_time_all_utterances(y_cluster_list, seg_size)
        set_of_total_times.append(time)

    logger.info('Pairwise_lstm test done.')
    return checkpoints, set_of_embeddings, set_of_speakers, speaker_numbers, set_of_total_times