def create_model(self, args):
    window = self.window
    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    initial_layer = tf.layers.conv1d(window_with_channel, args.residual_channels, args.initial_filter_width, 1, args.initial_filter_padding, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

    skip_connections = []
    dilations = [2**x for x in range(int(np.log2(args.max_dilation)) + 1)] * args.stack_number
    print(dilations)
    current_layer = initial_layer

    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                conv_filter = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                conv_gate = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)

                skip = tf.layers.conv1d(out, args.skip_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                transformed = tf.layers.conv1d(out, args.residual_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

                if args.dilation_layer_dropout:
                    transformed = tf.layers.dropout(transformed, args.dilation_layer_dropout, training=self.is_training)

                current_layer = transformed + current_layer

                skip_connections.append(skip)
                print(skip)

    with tf.name_scope('postprocessing'):
        skip_sum = tf.math.add_n(skip_connections)
        skip = tf.nn.relu(skip_sum)

        if args.skip_layer_dropout:
            skip = tf.layers.dropout(skip, args.skip_layer_dropout, training=self.is_training)

        # skip = tf.layers.average_pooling1d(skip, 93, 93, "valid")
        # skip = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=tf.nn.relu, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # output_layer = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

        output_layer = common.add_layers_from_string(skip, args.postprocessing)

        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.nn.relu(skip_sum)

    print(output_layer.shape)
    if output_layer.shape.as_list() != [None, self.annotations_per_window, self.bin_count]:
        print("shape not compatible, adding FC layer")
        output_layer = tf.nn.relu(output_layer)
        output_layer = tf.layers.flatten(output_layer)
        output_layer = tf.layers.dense(output_layer, self.annotations_per_window * self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.note_logits = output_layer

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
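
# Illustration (not part of the model): the dilation schedule built above doubles the dilation
# factor from 1 up to args.max_dilation and repeats the pattern args.stack_number times. The
# argument values below are hypothetical example values, not project defaults.
def _example_dilation_schedule(max_dilation=8, stack_number=2):
    dilations = [2**x for x in range(int(np.log2(max_dilation)) + 1)] * stack_number
    return dilations  # -> [1, 2, 4, 8, 1, 2, 4, 8]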
def create_model(self, args):
    spectrogram = self.spectrogram

    with tf.name_scope('model'):
        layer = spectrogram

        if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
            layer = common.harmonic_stacking(self, layer, args.spectrogram_undertone_stacking, args.spectrogram_overtone_stacking)

        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.layers.conv2d(layer, 2 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=tf.nn.relu)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        if args.undertone_stacking > 0 or args.overtone_stacking > 1:
            layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)

        layer = tf.layers.conv2d(layer, args.capacity_multiplier, (5, 5), (1, 1), "same", activation=tf.nn.relu)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        if args.undertone_stacking > 0 or args.overtone_stacking > 1:
            layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)

        layer = tf.layers.conv2d(layer, args.capacity_multiplier, (3, 3), (1, 1), "same", activation=tf.nn.relu)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        if args.undertone_stacking > 0 or args.overtone_stacking > 1:
            layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)

        layer = tf.layers.conv2d(layer, args.capacity_multiplier, (3, 3), (1, 1), "same", activation=tf.nn.relu)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        if args.undertone_stacking > 0 or args.overtone_stacking > 1:
            layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)

        layer = tf.layers.conv2d(layer, max(2, args.capacity_multiplier // 8), (3, 70), (1, 1), "same", activation=tf.nn.relu)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        if args.undertone_stacking > 0 or args.overtone_stacking > 1:
            layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)

        layer = tf.layers.conv2d(layer, 1, (1, 1), (1, 1), "same", activation=tf.nn.sigmoid)

        note_output = tf.squeeze(layer, -1)
        self.note_logits = note_output
        self.note_probabilities = note_output

    annotations = self.annotations[:, :, 0] - args.min_note
    voicing_ref = tf.cast(tf.greater(annotations, 0), tf.float32)

    note_ref = tf.tile(tf.reshape(annotations, [-1, self.annotations_per_window, 1]), [1, 1, self.bin_count])
    ref_probabilities = tf.exp(-(note_ref - self.note_bins)**2 / (args.annotation_smoothing**2))

    voicing_weights = tf.tile(tf.expand_dims(voicing_ref, -1), [1, 1, self.bin_count])
    ref_probabilities *= voicing_weights

    def bkld(y_true, y_pred):
        """KL divergence where both y_true and y_pred are probabilities."""
        epsilon = tf.constant(1e-07)
        y_true = tf.clip_by_value(y_true, epsilon, 1.0 - epsilon)
        y_pred = tf.clip_by_value(y_pred, epsilon, 1.0 - epsilon)
        return tf.math.reduce_mean(-1.0 * y_true * tf.log(y_pred) - (1.0 - y_true) * tf.log(1.0 - y_pred))

    self.voicing_threshold = tf.Variable(0.15, trainable=False)
    tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = bkld(ref_probabilities, self.note_probabilities)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
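
# Illustration (not part of the model): how the Gaussian-blurred reference above spreads a single
# annotated pitch over the frequency bins. The concrete values are hypothetical; only numpy is
# assumed, and the formula mirrors the ref_probabilities line in create_model.
def _example_reference_probabilities(annotation=24.0, bin_count=360, bins_per_semitone=5, smoothing=0.18):
    import numpy as np
    note_bins = np.arange(bin_count) / bins_per_semitone  # bin centers in semitones above min_note
    return np.exp(-(annotation - note_bins)**2 / smoothing**2)  # peaks at the annotated pitch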
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        # for spectrograms starting at a lower note than the output
        # spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(self, self.spectrogram, args.spectrogram_undertone_stacking, args.spectrogram_overtone_stacking)
    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # if args.specaugment_prob:
    #     in_shape = tf.shape(spectrogram)
    #     batch_size = in_shape[0]
    #     freq_shape = (batch_size, self.bin_count)
    #     drop_batch = tf.random.uniform((batch_size, 1))
    #     drop_freq_bands = tf.random.uniform((batch_size, 1), maxval=self.bin_count)
    #     band_size = tf.random.uniform((batch_size, 1), minval=5, maxval=15)
    #     masking_fn = tf.where(np.abs(tf.tile(tf.expand_dims(tf.range(self.bin_count, dtype=tf.float32), 0), [batch_size, 1])-drop_freq_bands) < band_size, tf.zeros(freq_shape), tf.ones(freq_shape))
    #     mask = tf.where(tf.tile(tf.greater(drop_batch, args.specaugment_prob), [1, self.bin_count]), tf.ones(freq_shape), masking_fn)
    #     mask = tf.tile(mask[:, tf.newaxis, :, tf.newaxis], [1, in_shape[1], 1, in_shape[3]])
    #     tf.summary.image("spectrogram", spectrogram[:, :, :, 1:2])
    #     tf.summary.image("spec_mask", mask[:, :, :, :1])
    #     spectrogram = spectrogram*tf.cond(self.is_training, lambda: mask, lambda: tf.ones_like(spectrogram))
    #     tf.summary.image("spectrogram_masked", spectrogram[:, :, :, :1])

    print("spectrogram shape", spectrogram.shape)

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
            # Prepare kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([args_ctx[i] if i < len(args_ctx) else args_ctx[-1] for i in range(args.stacks)])
            dils = np.array([args_dils[i] if i < len(args_dils) else args_dils[-1] for i in range(args.stacks)])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)

            # Cut the unnecessary context
            needed_context_size = int(np.sum(np.ceil((ctxs - 1) / 2)) + np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context", needed_context_size)
            if args_context_size < needed_context_size:
                print("Warning: provided context is shorter than the needed context field of the network")
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(layer.shape), end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)
                print("add conv2d {} filters, {} kernel".format(args.filters, kernel))
                layer = tf.layers.conv2d(layer, args.filters, kernel, (1, 1), "same", activation=None, dilation_rate=(dil, 1))
                layer = activation(layer)

                if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                    print("harmonic stacking {} --> ".format(layer.shape), end="")
                    layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)
                    print(layer.shape)

                layer = common.regularization(layer, args, training=self.is_training)

                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer

            layer = tf.layers.conv2d(layer, 1, args.last_conv_kernel, (1, 1), "same", activation=None)

            if actual_context_size > 0:
                layer = layer[:, actual_context_size:-actual_context_size, :, :]

        self.note_logits = tf.squeeze(layer, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            # Cut the unnecessary context
            voicing_layer = spectrogram
            if args_context_size > 0:
                voicing_layer = spectrogram[:, args_context_size:-args_context_size, :, :]

            if args.voicing_input == "only_salience":
                voicing_layer = tf.stop_gradient(layer)
            if args.voicing_input == "spectrogram_salience":
                voicing_layer = tf.concat([tf.stop_gradient(layer), voicing_layer], axis=-1)
            if args.voicing_input == "spectrogram_salience_train":
                voicing_layer = tf.concat([layer, voicing_layer], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            # print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
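
# Illustration (not part of the model): the receptive-field computation used above. With
# hypothetical per-stack time kernels (7, 5, 3, 3) and a last convolution kernel of height 9,
# the stack needs ceil((7-1)/2) + ceil((5-1)/2) + ceil((3-1)/2) + ceil((3-1)/2) + ceil((9-1)/2)
# = 3 + 2 + 1 + 1 + 4 = 11 context frames on each side; any extra provided context can be cut.
def _example_needed_context(ctxs=(7, 5, 3, 3), last_conv_kernel_height=9):
    ctxs = np.array(ctxs)
    return int(np.sum(np.ceil((ctxs - 1) / 2)) + np.ceil((last_conv_kernel_height - 1) / 2))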
def create_model(self, args):
    spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)

    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        # offset = args.min_note - spectrogram_min_note
        spectrogram = harmonic_stacking(self, self.spectrogram, args.undertone_stacking, args.overtone_stacking)
    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))
    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture == "bittner_improved":
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual = layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 70), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer

            layer = residual

            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=False)

            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture == "bittnerlike":
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual = layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer

            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 70), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx * 2 + 1, 1), (1, 1), "same", activation=None)

            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_simple"):
            residual = None
            for i in range(args.stacks):
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx, args.conv_range), (1, 1), "same", activation=None)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx + 1, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_smooth"):
            residual = None
            ctx_end = 1
            dilations_start = 5
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end or i >= dilations_start else 1
                dil_rate = (1, 1) if i < dilations_start else (2**(i - dilations_start), 1)
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (conv_ctx, args.conv_range), (1, 1), "same", activation=None, dilation_rate=dil_rate)
                print(i, "kernel", (conv_ctx, args.conv_range), "dilation", dil_rate)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        self.note_logits = tf.squeeze(layer_cut, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
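
# Illustration (not part of the model): the "note" and "octave" kernel widths used by the voicing
# branch above are derived from the frequency-axis size, which spans six octaves. For example,
# with 360 bins: octave = 360 / 6 = 60 bins and note = 360 / 6 / 12 = 5 bins (the bins-per-semitone
# resolution), so the (1, note) and (1, octave) strides pool to one value per semitone or per octave.
def _example_note_octave_kernels(freq_bins=360):
    octave = freq_bins // 6
    note = freq_bins // (6 * 12)
    return note, octave  # -> (5, 60)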
def create_model(self, args):
    spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)

    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(self, self.spectrogram, args.undertone_stacking, args.overtone_stacking)
    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture.startswith("deep_simple"):
            residual = None
            for i in range(args.stacks):
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx, args.conv_range), (1, 1), "same", activation=None)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = common.harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx + 1, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]
            self.note_logits = tf.squeeze(layer_cut, -1)

        if args.architecture.startswith("deep_smooth"):
            residual = None
            ctx_end = 1
            dilations_start = 5
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end or i >= dilations_start else 1
                dil_rate = (1, 1) if i < dilations_start else (2**(i - dilations_start), 1)
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (conv_ctx, args.conv_range), (1, 1), "same", activation=None, dilation_rate=dil_rate)
                print(i, "kernel", (conv_ctx, args.conv_range), "dilation", dil_rate)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = common.harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]
            self.note_logits = tf.squeeze(layer_cut, -1)

        if args.architecture.startswith("deep_lstm"):
            residual = None
            ctx_end = 1
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end else 1
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (conv_ctx, args.conv_range), (1, 1), "same", activation=None)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = common.harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

            # https://www.tensorflow.org/api_docs/python/tf/contrib/cudnn_rnn/CudnnLSTM
            # cell = tf.contrib.cudnn_rnn.CudnnLSTM(1, 128)
            # tf.nn.static_rnn(
            #     cell,
            #     inputs,
            #     initial_state=None,
            #     dtype=None,
            #     sequence_length=None,
            #     scope=None
            # )
            # lstm_sizes = [128, 128]
            # lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
            # # Add dropout to the cell
            # keep_prob_ = 0.9
            # drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_) for lstm in lstms]
            # # Stack up multiple LSTM layers, for deep learning
            # cell = tf.contrib.rnn.MultiRNNCell(drops)

            # self.note_logits = tf.squeeze(layer_cut, -1)
            print("!!!!!!!")
            print(layer_cut.shape)
            # layer_cut = tf.squeeze(layer_cut, 3)
            layer_cut = spectrogram[:, context_size:-context_size, :, 0]
            print(layer_cut.shape)
            cell = tf.nn.rnn_cell.BasicRNNCell(16)
            # seq_length = tf.fill(tf.shape(layer_cut)[:1], self.annotations_per_window)
            # print(seq_length)
            outputs, _ = tf.nn.dynamic_rnn(cell, layer_cut, dtype=tf.float32)
            # outputs = tf.Print(outputs, [outputs, layer_cut])
            print(outputs.shape)
            # outputs = layer_cut
            outputs = tf.layers.dense(outputs, self.bin_count, activation=None)
            self.note_logits = outputs

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    # multif0 loss ---------
    with tf.name_scope("losses"):
        annotations = self.annotations - args.min_note
        voicing_ref = tf.cast(tf.greater(annotations[:, :, 0], 0), tf.float32)

        loss_names = []
        losses = []
        if self.note_logits is not None:
            if args.annotation_smoothing > 0:
                self.note_probabilities = tf.nn.sigmoid(self.note_logits)
                annotations_per_frame = tf.shape(annotations)[-1]
                note_bins = tf.tile(tf.expand_dims(self.note_bins, 2), [1, 1, annotations_per_frame, 1])
                note_ref = tf.tile(tf.reshape(annotations, [-1, self.annotations_per_window, annotations_per_frame, 1]), [1, 1, 1, self.bin_count])
                ref_probabilities = tf.exp(-(note_ref - note_bins)**2 / (2 * args.annotation_smoothing**2))
                ref_probabilities = tf.concat([ref_probabilities[:, :, :1, :], ref_probabilities[:, :, 1:, :] * args.miss_weight], axis=2)
                ref_probabilities = tf.reduce_sum(ref_probabilities, axis=2)
                # self.note_probabilities = ref_probabilities
                # print(ref_probabilities.eval(), ref_probabilities.shape)

                voicing_weights = tf.tile(tf.expand_dims(voicing_ref, -1), [1, 1, self.bin_count])

                if args.architecture.startswith("deep_simple_focal"):
                    note_loss = focal_loss(self.note_logits, ref_probabilities, weights=voicing_weights)
                else:
                    note_loss = tf.losses.sigmoid_cross_entropy(ref_probabilities, self.note_logits, weights=voicing_weights)

                loss_names.append("note_loss")
                losses.append(note_loss)

        # Melody input, not compatible with multif0 input
        # annotations = self.annotations[:, :, 0] - args.min_note
        # voicing_ref = tf.cast(tf.greater(annotations, 0), tf.float32)
        # loss_names = []
        # losses = []
        # if self.note_logits is not None:
        #     if args.annotation_smoothing > 0:
        #         self.note_probabilities = tf.nn.sigmoid(self.note_logits)
        #         note_ref = tf.tile(tf.reshape(annotations, [-1, self.annotations_per_window, 1]), [1, 1, self.bin_count])
        #         ref_probabilities = tf.exp(-(note_ref-self.note_bins)**2/(2*args.annotation_smoothing**2))
        #         voicing_weights = tf.tile(tf.expand_dims(voicing_ref, -1), [1, 1, self.bin_count])
        #         # miss weights
        #         peak_ref = tf.cast(tf.abs(tf.tile(tf.reshape(annotations, [-1, self.annotations_per_window, 1]), [1, 1, self.bin_count]) - self.note_bins) < 0.5, tf.float32)
        #         miss_weights = tf.ones_like(voicing_weights)*args.miss_weight + peak_ref*(1-args.miss_weight)
        #         note_loss = tf.losses.sigmoid_cross_entropy(ref_probabilities, self.note_logits, weights=voicing_weights*miss_weights)
        #     else:
        #         self.note_probabilities = tf.nn.softmax(self.note_logits)
        #         ref_bins = tf.cast(tf.round(annotations * self.bins_per_semitone), tf.int32)
        #         note_loss = tf.losses.sparse_softmax_cross_entropy(ref_bins, self.note_logits, weights=voicing_ref)
        #     loss_names.append("note_loss")
        #     losses.append(note_loss)

        if args.l2_loss_weight > 0:
            reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.reduce_sum(tf.constant(args.l2_loss_weight) * reg_variables)
            loss_names.append("l2_loss")
            losses.append(l2_loss)

        if self.voicing_logits is not None:
            voicing_loss = tf.losses.sigmoid_cross_entropy(voicing_ref, self.voicing_logits)
            loss_names.append("voicing_loss")
            losses.append(voicing_loss)

    if len(losses) > 1:
        for name, loss in zip(loss_names, losses):
            tf.summary.scalar('metrics/train/' + name, loss)

    self.loss = tf.math.add_n(losses)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
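
# focal_loss used above is defined elsewhere in this project; a minimal sketch of a sigmoid
# focal loss with a matching call signature (logits, target probabilities, optional weights),
# following the standard formulation of Lin et al. The gamma/alpha defaults are hypothetical.
def focal_loss_sketch(logits, targets, weights=None, gamma=2.0, alpha=0.25):
    probs = tf.nn.sigmoid(logits)
    # plain sigmoid cross-entropy, numerically stable
    ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
    # probability assigned to the reference "side" of each bin
    p_t = targets * probs + (1.0 - targets) * (1.0 - probs)
    # class-balancing factor
    alpha_t = targets * alpha + (1.0 - targets) * (1.0 - alpha)
    # down-weight easy (well-classified) bins
    loss = alpha_t * tf.pow(1.0 - p_t, gamma) * ce
    if weights is not None:
        loss = loss * weights
    return tf.reduce_mean(loss)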
def create_model(self, args):
    layer = self.spectrogram
    print(layer.shape)

    context_size = int(args.context_width / self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual = layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 70), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = residual

        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=False)

        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_input = tf.concat([tf.stop_gradient(layer), self.spectrogram], axis=-1)
            # voicing_input = spectrogram
            print(voicing_input.shape)

            voicing_layer = tf.layers.conv2d(voicing_input, 64, (5, 5), (1, 1), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (5, 70), (1, 5), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (5, 12), (1, 12), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (15, 3), (1, 1), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            print(voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (1, 6), (1, 1), "valid", activation=None, use_bias=True)

            cut_layer = voicing_layer[:, context_size:-context_size, :, :]
            print(cut_layer.shape)

            self.voicing_logits = tf.squeeze(cut_layer)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        spectrogram_windows = []
        print("stacking the spectrogram")
        for i in [1 / (x + 2) for x in range(args.undertone_stacking)] + list(range(1, args.overtone_stacking + 1)):
            f_ref = 440  # arbitrary reference frequency
            hz = f_ref * i
            interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
            int_bins = int(round(interval * self.bins_per_semitone))
            spec_layer = self.spectrogram[:, :, max(int_bins, 0):self.bin_count + int_bins, :]
            print(i, "offset", int_bins, "end", self.bin_count + int_bins, "shape", spec_layer.shape)
            if int_bins < 0:
                spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (-int_bins, 0), (0, 0)))
            spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count - spec_layer.shape[2]), (0, 0)))
            print("padded shape", spec_layer.shape)
            spectrogram_windows.append(spec_layer)
        spectrogram = tf.concat(spectrogram_windows, axis=-1)
    else:
        spectrogram = self.spectrogram[:, :, :360, :]

    # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))
    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual = layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 70), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = residual

        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx * 2 + 1, 1), (1, 1), "same", activation=None, use_bias=False)

        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    window = self.window[:, :-1]
    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    capacity_multiplier = args.capacity_multiplier

    if args.multiresolution_convolution:
        first_layer = []
        for i in range(args.multiresolution_convolution):
            width = 2**(9 - i)
            capacity = (32 * capacity_multiplier * args.first_layer_capacity) // args.multiresolution_convolution
            # bug in capacity computation
            # capacity = 32//args.multiresolution_convolution*args.first_layer_capacity*capacity_multiplier
            l = common.bn_conv(window_with_channel, capacity, width, 4, "same", activation=tf.nn.relu, training=self.is_training)
            print(l.shape, width)
            first_layer.append(l)

        audio_net = tf.concat(first_layer, 2)
    else:
        if args.variable_stride:
            first_layer = []
            # print(window_with_channel.shape)
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 0:512 * 1, :], 32 * capacity_multiplier, 512, 64, "valid", activation=tf.nn.relu, reuse=None, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 1:512 * 2, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 2:512 * 3, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 3:512 * 4, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 4:512 * 5, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 5:512 * 6, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 6:512 * 7, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 7:512 * 9, :], 32 * capacity_multiplier, 512, 4, "same", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 9:512 * 10, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 10:512 * 11, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 11:512 * 12, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 12:512 * 13, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 13:512 * 14, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 14:512 * 15, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 15:512 * 16, :], 32 * capacity_multiplier, 512, 64, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            print(first_layer)
            audio_net = tf.concat(first_layer, 1)
        else:
            audio_net = common.bn_conv(window_with_channel, 32 * capacity_multiplier, 512, 4, "same", activation=tf.nn.relu, training=self.is_training)

    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 4 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 4 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 4 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 8 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 16 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = tf.layers.flatten(audio_net)

    output_layer = tf.layers.dense(audio_net, self.annotations_per_window * self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
    self.note_logits = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.voicing_threshold = tf.Variable(0.15, trainable=False)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
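
# common.bn_conv is implemented in the project's shared module; a minimal sketch of what such a
# helper plausibly does (1-D convolution followed by batch normalization), assuming the positional
# parameter order used in the calls above (filters, kernel size, stride, padding). The variable
# scope name and the reuse handling are illustrative assumptions, not the project's actual code.
def bn_conv_sketch(inputs, filters, kernel_size, strides, padding, activation=None, training=False, reuse=None):
    with tf.variable_scope("bn_conv", reuse=reuse):
        layer = tf.layers.conv1d(inputs, filters, kernel_size, strides, padding, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=training)
        if activation is not None:
            layer = activation(layer)
        return layer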
def create_model(self, args): if args.spectrogram == "cqt": spec_bin_count = 360 spec_bins_per_semitone = 5 if args.spectrogram == "YunNingHung_cqt": spec_bin_count = 88 spec_bins_per_semitone = 1 if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1: spectrogram = common.harmonic_stacking( self, self.spectrogram, args.spectrogram_undertone_stacking, args.spectrogram_overtone_stacking, bin_count=spec_bin_count, bins_per_semitone=spec_bins_per_semitone) else: spectrogram = self.spectrogram if args.spectrogram == "cqt": spectrogram = self.spectrogram[:, :, :spec_bin_count, :] args_context_size = int(self.context_width / self.spectrogram_hop_size) if args.activation is not None: activation = getattr(tf.nn, args.activation) with tf.name_scope('model_pitch'): layer = spectrogram print("self.spectrogram shape", self.spectrogram.shape) print("spectrogram shape", spectrogram.shape) if args.architecture.startswith("baseline"): # layer = tf.layers.conv2d(layer, args.filters, (args.conv_ctx[0], args.conv_range), strides=(1, 5), padding="same", activation=None) #layer = activation(layer) #layer = tf.layers.average_pooling2d(layer, (5, 1), (5, 1)) layer = tf.layers.flatten(layer) layer = tf.layers.dense(layer, 100, use_bias=(not args.batchnorm)) if args.batchnorm: layer = tf.layers.batch_normalization( layer, training=self.is_training) layer = activation(layer) layer = tf.layers.dense( layer, args.note_range * args.annotations_per_window) layer = tf.reshape( layer, (-1, args.annotations_per_window, args.note_range)) self.note_logits = layer # layer_cut = layer[:, args_context_size:-args_context_size, :, :] # self.note_logits = tf.squeeze(layer_cut, -1) print("note_logits shape", self.note_logits.shape) if args.architecture.startswith("LY"): # batch_size, annotations_per_wind, time, freq def conv_block(self, layer, args, channels, kernel, time_padding): layer = tf.pad(layer, ((0, 0), (time_padding, time_padding), (0, 0), (0, 0))) layer = tf.layers.conv2d(layer, channels, kernel, padding="valid", activation=None, use_bias=False) if args.batchnorm: layer = tf.layers.batch_normalization( layer, training=self.is_training) layer = activation(layer) return layer print(layer.shape) layer = conv_block(self, layer, args, args.filters, (7, spectrogram.shape[2]), 3) print(layer.shape) layer = tf.layers.max_pooling2d(layer, (3, 1), (3, 1)) print(layer.shape) layer = conv_block(self, layer, args, args.filters, (7, 1), 3) print(layer.shape) layer = tf.layers.max_pooling2d(layer, (3, 1), (3, 1)) print(layer.shape) layer = conv_block(self, layer, args, 16 * args.filters, (1, 1), 0) print(layer.shape) layer = conv_block(self, layer, args, 16 * args.filters, (1, 1), 0) print(layer.shape) layer = tf.layers.conv2d(layer, self.note_range, (1, 1), padding="valid", activation=None) print(layer.shape) # squeeze frequency dimension layer = tf.squeeze(layer, 2) self.note_logits = layer print("note_logits shape", self.note_logits.shape) if args.architecture.startswith("deep_hcnn"): assert len(args.conv_ctx) <= args.stacks # Prepare kernel sizes (time axis = audio context) args_ctx = np.abs(args.conv_ctx) args_dils = np.abs(args.dilations) ctxs = np.array([ args_ctx[i] if i < len(args_ctx) else args_ctx[-1] for i in range(args.stacks) ]) dils = np.array([ args_dils[i] if i < len(args_dils) else args_dils[-1] for i in range(args.stacks) ]) if args.conv_ctx[0] < 0: ctxs = np.array(list(reversed(ctxs))) if args.dilations[0] < 0: dils = np.array(list(reversed(dils))) print(ctxs) # Cut the unnecessary 
context needed_context_size = int( np.sum(np.ceil((ctxs - 1) / 2)) + np.ceil((args.last_conv_kernel[0] - 1) / 2)) actual_context_size = args_context_size print("input context", args_context_size, "actual needed context", needed_context_size) if args_context_size < needed_context_size: print( "Warning: provided context is shorter than the needed context field of the network" ) elif args_context_size > needed_context_size: if args.cut_context: print("Cutting the unnecessary context {} --> ".format( layer.shape), end="") diff = args_context_size - needed_context_size layer = layer[:, diff:-diff, :, :] actual_context_size -= diff print(layer.shape, "context now:", actual_context_size) skip = None for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils): kernel = (conv_ctx, args.conv_range) if i > 0 and args.faster_hcnn: print("add hconv2d {} filters, {} kernel".format( args.filters, kernel)) layer = common.hconv2d( layer, args.filters, kernel, args.undertone_stacking, args.overtone_stacking, 60, # bins per octave padding="same", activation=None, dilation_rate=(dil, 1), use_bias=bool(args.use_bias)) print(layer.shape) else: print("add conv2d {} filters, {} kernel".format( args.filters, kernel)) layer = tf.layers.conv2d(layer, args.filters, kernel, (1, 1), padding="same", activation=None, dilation_rate=(dil, 1), use_bias=bool(args.use_bias)) print(layer.shape) layer = common.regularization(layer, args, training=self.is_training) layer = activation(layer) if (not args.faster_hcnn) and (args.undertone_stacking > 0 or args.overtone_stacking > 1 ) and i < args.stacking_until: print("harmonic stacking {} --> ".format(layer.shape), end="") layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=360, bins_per_semitone=5) print(layer.shape) if i < args.stacks - args.residual_end and i % args.residual_hop == 0: if skip is None: print(".- begin residual connection") else: if args.residual_op == "add": print("|- adding residual connection") layer += skip if args.residual_op == "concat": print("|- concatenating residual connection") layer = tf.concat([skip, layer], -1) skip = layer if args.last_pooling == "globalavg": layer = tf.layers.average_pooling2d(layer, (1, 360), (1, 360)) if args.last_pooling == "avg": layer = tf.layers.average_pooling2d(layer, (1, 5), (1, 5)) if args.last_pooling == "max": layer = tf.layers.max_pooling2d(layer, (1, 5), (1, 5)) if args.last_pooling == "maxoct": layer = tf.layers.max_pooling2d(layer, (1, 60), (1, 60)) if args.faster_hcnn: print("add last hconv2d {} filters, {} kernel".format( args.filters, kernel)) layer = common.hconv2d( layer, args.note_range, args.last_conv_kernel, args.undertone_stacking, args.overtone_stacking, 12, # bins per semitone padding="valid", activation=None, use_bias=bool(args.use_bias)) print(layer.shape) else: print("add last conv2d {} filters, {} kernel".format( args.filters, kernel)) layer = tf.layers.conv2d(layer, args.note_range, args.last_conv_kernel, (1, 1), padding="valid", activation=None, use_bias=bool(args.use_bias)) print(layer.shape) if actual_context_size > 0: layer = layer[:, actual_context_size:-actual_context_size, :, :] self.note_logits = tf.squeeze(layer, 2) print("note_logits shape", self.note_logits.shape) if args.class_weighting: weights = self.class_weights else: weights = None self.loss = common.loss_mir(self, args, weights=weights) self.est_notes = tf.constant(0) # placeholder, we compute est_notes on cpu self.training = common.optimizer(self, args)
def create_model(self, args):
    context_size = int(self.context_width/self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        self.note_logits = None
        self.note_probabilities = self.spectrogram[:, context_size:-context_size, :360, 0]

    with tf.name_scope('model_voicing'):
        # voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)
        if args.harmonic_stacking > 1:
            spectrogram_windows = []
            print("stacking the spectrogram")
            for i in range(args.harmonic_stacking):
                f_ref = 440  # arbitrary reference frequency
                hz = f_ref*(i+1)
                interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
                int_bins = int(round(interval*self.bins_per_semitone))
                spec_layer = self.spectrogram[:, :, int_bins:self.bin_count+int_bins, :]
                print(i+1, "offset", int_bins, "end", self.bin_count+int_bins, "shape", spec_layer.shape)
                spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count-spec_layer.shape[2]), (0, 0)))
                print("padded shape", spec_layer.shape)
                spectrogram_windows.append(spec_layer)
            voicing_layer = tf.concat(spectrogram_windows, axis=-1)
        else:
            voicing_layer = self.spectrogram[:, :, :360, :]

        if args.first_pool_type == "avg":
            voicing_layer = tf.layers.average_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
        if args.first_pool_type == "max":
            voicing_layer = tf.layers.max_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")

        print("after pooling", voicing_layer.shape)

        # the frequency axis covers six octaves, so shape[2]/6 gives the number of bins
        # per octave and a further /12 gives the number of bins per semitone
        octave = int(int(voicing_layer.shape[2])/6)
        note = int(int(voicing_layer.shape[2])/6/12)

        if args.activation is not None:
            activation = getattr(tf.nn, args.activation)

        if args.architecture == "full_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_dilated":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "dilated_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave_fix":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave_octave_temporal":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*7+1, 3), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.last_layer == "conv":
            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.last_conv_ctx, args.last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.last_conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
        if args.last_layer == "dense":
            voicing_layer = tf.layers.flatten(voicing_layer)
            self.voicing_logits = tf.layers.dense(voicing_layer, args.annotations_per_window)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    receptive_field = args.stack_number * (args.max_dilation * 2) - (args.stack_number - 1)
    receptive_field_ms = (receptive_field * 1000) / args.samplerate
    context_width = self.context_width
    print("receptive field: {} samples, {:.4f} ms".format(receptive_field, receptive_field_ms))

    if self.context_width > receptive_field:
        context_width = receptive_field
        diff = self.context_width - receptive_field
        window = self.window[:, diff:-diff]
        print("cutting window {}->{}".format(self.window.shape, window.shape))
    else:
        window = self.window
        print("warning: receptive field larger than context width")

    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    initial_layer = window_with_channel
    if args.initial_filter_width > 0:
        initial_layer = tf.layers.conv1d(initial_layer, args.residual_channels, args.initial_filter_width, 1, args.initial_filter_padding, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

    skip_connections = []
    dilations = [2**x for x in range(int(np.log2(args.max_dilation)) + 1)] * args.stack_number
    print(dilations)
    current_layer = initial_layer

    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                conv_filter = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                conv_gate = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                # gated activation unit: filter branch (tanh) modulated by gate branch (sigmoid)
                out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)
                # 1x1 convolutions producing the skip output and the residual output
                skip = tf.layers.conv1d(out, args.skip_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                transformed = tf.layers.conv1d(out, args.residual_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                if args.dilation_layer_dropout:
                    transformed = tf.layers.dropout(transformed, args.dilation_layer_dropout, training=self.is_training)
                # residual connection around the dilated block
                current_layer = transformed + current_layer
                skip_connections.append(skip)
                print(skip)

    with tf.name_scope('postprocessing'):
        if args.skip == "add":
            skip_sum = tf.math.add_n(skip_connections)
        elif args.skip == "concat":
            skip_sum = tf.concat(skip_connections, -1)
        elif args.skip == "last":
            skip_sum = skip_connections[-1]

        if context_width:
            skip_sum = skip_sum[:, context_width:-context_width, :]
        print("skip output", skip_sum.shape)

        skip = tf.nn.relu(skip_sum)
        if args.skip_layer_dropout:
            skip = tf.layers.dropout(skip, args.skip_layer_dropout, training=self.is_training)

        # skip = tf.layers.average_pooling1d(skip, 93, 93, "valid")
        # skip = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=tf.nn.relu, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # output_layer = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = common.add_layers_from_string(self, skip, args.postprocessing)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.nn.relu(skip_sum)

    print("after skip output processing", output_layer.shape)

    if output_layer.shape.as_list() != [None, self.annotations_per_window, self.bin_count]:
        print("shape not compatible, adding FC layer")
        output_layer = tf.nn.relu(output_layer)
        output_layer = tf.layers.flatten(output_layer)
        output_layer = tf.layers.dense(output_layer, self.annotations_per_window * self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.note_logits = output_layer
    self.voicing_threshold = tf.Variable(0.15, trainable=False)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
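The receptive-field bookkeeping at the top of this model can be checked in isolation; the sketch below simply re-evaluates the same formula the model prints, for a few illustrative max_dilation / stack_number settings (the 44100 Hz samplerate is an example value, not taken from the code).

    # Re-evaluating the receptive-field formula used above for a few example settings.
    def receptive_field_samples(max_dilation, stack_number):
        return stack_number * (max_dilation * 2) - (stack_number - 1)

    for max_dilation, stacks in [(256, 1), (512, 2), (1024, 2)]:
        rf = receptive_field_samples(max_dilation, stacks)
        print("max_dilation={} stacks={}: {} samples, {:.1f} ms".format(max_dilation, stacks, rf, rf * 1000 / 44100))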
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        # for spectrograms where the min. frequency doesn't correspond to the output min. note
        # spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(self, self.spectrogram, args.spectrogram_undertone_stacking, args.spectrogram_overtone_stacking, bin_count=360, bins_per_semitone=5)
    # else:
    #     spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram
        print("self.spectrogram shape", self.spectrogram.shape)
        print("spectrogram shape", spectrogram.shape)

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
            # Prepare the kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([args_ctx[i] if i < len(args_ctx) else args_ctx[-1] for i in range(args.stacks)])
            dils = np.array([args_dils[i] if i < len(args_dils) else args_dils[-1] for i in range(args.stacks)])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)

            # Cut the unnecessary context
            needed_context_size = int(np.sum(np.ceil((ctxs - 1) / 2)) + np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context", needed_context_size)
            if args_context_size < needed_context_size:
                print("Warning: provided context is shorter than the needed context field of the network")
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(layer.shape), end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)

                if i > 0 and args.faster_hcnn:
                    print("add hconv2d {} filters, {} kernel".format(args.filters, kernel))
                    layer = common.hconv2d(layer, args.filters, kernel, args.undertone_stacking, args.overtone_stacking, 60,  # bins per semitone
                                           padding="same", activation=None, dilation_rate=(dil, 1), use_bias=bool(args.use_bias))
                    print(layer.shape)
                else:
                    print("add conv2d {} filters, {} kernel".format(args.filters, kernel))
                    layer = tf.layers.conv2d(layer, args.filters, kernel, (1, 1), padding="same", activation=None, dilation_rate=(dil, 1), use_bias=bool(args.use_bias))
                    print(layer.shape)

                layer = common.regularization(layer, args, training=self.is_training)
                layer = activation(layer)

                if (not args.faster_hcnn) and (args.undertone_stacking > 0 or args.overtone_stacking > 1):
                    print("harmonic stacking {} --> ".format(layer.shape), end="")
                    layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=360, bins_per_semitone=5)
                    print(layer.shape)

                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer

            layer = tf.layers.average_pooling2d(layer, (1, 5), (1, 5))

            if args.faster_hcnn:
                print("add last hconv2d {} filters, {} kernel".format(args.filters, kernel))
                layer = common.hconv2d(layer, 1, args.last_conv_kernel, args.undertone_stacking, args.overtone_stacking, 12,  # bins per semitone
                                       padding="same", activation=None, use_bias=bool(args.use_bias))
                print(layer.shape)
            else:
                print("add last conv2d {} filters, {} kernel".format(args.filters, kernel))
                layer = tf.layers.conv2d(layer, 1, args.last_conv_kernel, (1, 1), padding="same", activation=None, use_bias=bool(args.use_bias))
                print(layer.shape)

            if actual_context_size > 0:
                layer = layer[:, actual_context_size:-actual_context_size, :, :]

        self.note_logits = tf.squeeze(layer, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        raise NotImplementedError
    else:
        self.voicing_threshold = tf.Variable(0.5, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss_mf0(self, args)
    self.est_notes = tf.constant(0)  # placeholder, we compute est_notes on cpu
    self.training = common.optimizer(self, args)
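common.harmonic_stacking is not shown in this listing. As a rough mental model (an assumption, not the actual implementation), it can be thought of as shifting the log-frequency axis by each harmonic interval and stacking the shifted copies as extra channels, similar to the inline stacking loop in the voicing model earlier in this section. The ratios-based interface below is illustrative only.

    # Hedged numpy sketch of the shift-and-stack idea behind harmonic stacking.
    import numpy as np

    def stack_harmonics(spec, ratios, bins_per_semitone):
        """spec: [batch, time, bins, channels]; ratios e.g. [0.5, 1, 2, 3]."""
        bins = spec.shape[2]
        stacked = []
        for ratio in ratios:
            # shift (in bins) corresponding to the interval of this harmonic ratio
            offset = int(round(12 * np.log2(ratio) * bins_per_semitone))
            shifted = np.zeros_like(spec)
            if offset >= 0:
                shifted[:, :, :bins - offset, :] = spec[:, :, offset:, :]
            else:
                shifted[:, :, -offset:, :] = spec[:, :, :bins + offset, :]
            stacked.append(shifted)
        return np.concatenate(stacked, axis=-1)

    x = np.random.rand(1, 10, 360, 1).astype(np.float32)
    print(stack_harmonics(x, [0.5, 1, 2, 3], bins_per_semitone=5).shape)  # (1, 10, 360, 4)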
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        spectrogram = common.harmonic_stacking(self, self.spectrogram, args.spectrogram_undertone_stacking, args.spectrogram_overtone_stacking, bin_count=229, bins_per_semitone=4)
    else:
        spectrogram = self.spectrogram[:, :, :229, :]

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    print("self.spectrogram shape", self.spectrogram.shape)
    print("spectrogram shape", spectrogram.shape)

    with tf.name_scope('model'):
        if args.architecture == "allconv":
            layer = spectrogram
            layer = tf.layers.conv2d(layer, args.filters, (3, 3), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=227, bins_per_semitone=4)
            layer = tf.layers.conv2d(layer, args.filters, (3, 3), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=225, bins_per_semitone=4)
            layer = tf.layers.max_pooling2d(layer, (1, 2), (1, 2))
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            layer = tf.layers.conv2d(layer, args.filters, (1, 3), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=110, bins_per_semitone=2)
            layer = tf.layers.conv2d(layer, args.filters, (1, 3), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=108, bins_per_semitone=2)
            layer = tf.layers.max_pooling2d(layer, (1, 2), (1, 2))
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            layer = tf.layers.conv2d(layer, args.filters * 2, (1, 25), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.conv2d(layer, args.filters * 4, (1, 25), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.conv2d(layer, args.note_range, (1, 1), (1, 1), "valid", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = tf.layers.average_pooling2d(layer, (1, 6), (1, 6))
            # layer.shape (?, 1, 1, 88) = (batch_size, annot_per_window, freq, channels)
            layer = tf.squeeze(layer, 2)
            self.note_logits = layer

        if args.architecture == "vggnet":
            layer = spectrogram
            print("spectrogram", layer.shape)
            layer = tf.layers.conv2d(layer, args.filters, (3, 3), (1, 1), "same", activation=None, use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            print("conv", layer.shape)
            if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=229, bins_per_semitone=4)
            layer = tf.layers.conv2d(layer, args.filters, (3, 3), (1, 1), "valid", activation=None, use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            print("conv2", layer.shape)
            if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking, bin_count=227, bins_per_semitone=4)
            layer = tf.layers.max_pooling2d(layer, (1, 2), (1, 2))
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            print("maxpool", layer.shape)
            layer = tf.layers.conv2d(layer, args.filters * 2, (3, 3), (1, 1), "valid", activation=None, use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.max_pooling2d(layer, (1, 2), (1, 2))
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            print("conv3", layer.shape)
            layer = tf.layers.flatten(layer)
            print("flatten", layer.shape)
            # in the implementation in the framewise_2016 repository I left 512 hidden units fixed
            layer = tf.layers.dense(layer, args.filters * 16, use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            print("fc1", layer.shape)
            layer = tf.layers.dropout(layer, 0.5, training=self.is_training)
            layer = tf.layers.dense(layer, args.note_range * args.annotations_per_window, kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))
            print("fc2", layer.shape)
            layer = tf.reshape(layer, [-1, args.annotations_per_window, args.note_range])
            self.note_logits = layer

        print("note_logits shape", self.note_logits.shape)

    self.voicing_threshold = tf.Variable(0.5, trainable=False)
    self.loss = common.loss_mf0(self, args)
    self.est_notes = tf.constant(0)  # placeholder, we compute est_notes on cpu
    self.training = common.optimizer(self, args)
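Both branches end with note_logits of shape [batch, annotations_per_window, note_range]; the loss is common.loss_mf0 and the note estimates are computed outside the graph. As an illustration of the shapes only (the real decoding in the common module is not shown here, and min_note=21 is an assumed value), a frame-wise thresholding of the logits could look like:

    # Hedged sketch: decode multi-f0 logits into active MIDI notes per frame.
    import numpy as np

    def decode_multif0(note_logits, voicing_threshold=0.5, min_note=21):
        probabilities = 1.0 / (1.0 + np.exp(-note_logits))    # sigmoid
        active = probabilities >= voicing_threshold           # [batch, frames, notes]
        # MIDI numbers of the notes predicted active in every frame
        return [[np.flatnonzero(frame) + min_note for frame in window] for window in active]

    logits = np.random.randn(2, 5, 88)  # e.g. note_range=88, 5 frames per window
    notes = decode_multif0(logits)
    print(notes[0][0])  # MIDI pitches predicted as active in the first frame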