def create_model(self, args):
    window = self.window
    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    initial_layer = tf.layers.conv1d(window_with_channel, args.residual_channels, args.initial_filter_width, 1, args.initial_filter_padding, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

    skip_connections = []
    dilations = [2**x for x in range(int(np.log2(args.max_dilation)) + 1)] * args.stack_number
    print(dilations)
    current_layer = initial_layer

    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                conv_filter = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                conv_gate = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                # WaveNet-style gated activation unit
                out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)

                skip = tf.layers.conv1d(out, args.skip_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                transformed = tf.layers.conv1d(out, args.residual_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

                if args.dilation_layer_dropout:
                    transformed = tf.layers.dropout(transformed, args.dilation_layer_dropout, training=self.is_training)

                # residual connection
                current_layer = transformed + current_layer

                skip_connections.append(skip)
                print(skip)

    with tf.name_scope('postprocessing'):
        skip_sum = tf.math.add_n(skip_connections)
        skip = tf.nn.relu(skip_sum)

        if args.skip_layer_dropout:
            skip = tf.layers.dropout(skip, args.skip_layer_dropout, training=self.is_training)

        # skip = tf.layers.average_pooling1d(skip, 93, 93, "valid")
        # skip = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=tf.nn.relu, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # output_layer = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = common.add_layers_from_string(skip, args.postprocessing)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.nn.relu(skip_sum)

    print(output_layer.shape)

    if output_layer.shape.as_list() != [None, self.annotations_per_window, self.bin_count]:
        print("shape not compatible, adding FC layer")
        output_layer = tf.nn.relu(output_layer)
        output_layer = tf.layers.flatten(output_layer)
        output_layer = tf.layers.dense(output_layer, self.annotations_per_window * self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.note_logits = output_layer

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
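# Worked example (illustrative only; the values of max_dilation and stack_number
# are assumed, not taken from any configuration in this repository): the dilation
# schedule built above expands the powers of two up to max_dilation and repeats
# the whole sequence once per stack.
import numpy as np
max_dilation, stack_number = 8, 2   # hypothetical hyperparameters
dilations = [2**x for x in range(int(np.log2(max_dilation)) + 1)] * stack_number
assert dilations == [1, 2, 4, 8, 1, 2, 4, 8]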
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        # for spectrograms starting at a lower note than the output
        # spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
        # offset = args.min_note - spectrogram_min_note
        spectrogram = common.harmonic_stacking(self, self.spectrogram, args.spectrogram_undertone_stacking, args.spectrogram_overtone_stacking)
    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # if args.specaugment_prob:
    #     in_shape = tf.shape(spectrogram)
    #     batch_size = in_shape[0]
    #     freq_shape = (batch_size, self.bin_count)
    #     drop_batch = tf.random.uniform((batch_size, 1))
    #     drop_freq_bands = tf.random.uniform((batch_size, 1), maxval=self.bin_count)
    #     band_size = tf.random.uniform((batch_size, 1), minval=5, maxval=15)
    #     masking_fn = tf.where(np.abs(tf.tile(tf.expand_dims(tf.range(self.bin_count, dtype=tf.float32), 0), [batch_size, 1]) - drop_freq_bands) < band_size, tf.zeros(freq_shape), tf.ones(freq_shape))
    #     mask = tf.where(tf.tile(tf.greater(drop_batch, args.specaugment_prob), [1, self.bin_count]), tf.ones(freq_shape), masking_fn)
    #     mask = tf.tile(mask[:, tf.newaxis, :, tf.newaxis], [1, in_shape[1], 1, in_shape[3]])
    #     tf.summary.image("spectrogram", spectrogram[:, :, :, 1:2])
    #     tf.summary.image("spec_mask", mask[:, :, :, :1])
    #     spectrogram = spectrogram * tf.cond(self.is_training, lambda: mask, lambda: tf.ones_like(spectrogram))
    #     tf.summary.image("spectrogram_masked", spectrogram[:, :, :, :1])

    print("spectrogram shape", spectrogram.shape)

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
            # Prepare kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([args_ctx[i] if i < len(args_ctx) else args_ctx[-1] for i in range(args.stacks)])
            dils = np.array([args_dils[i] if i < len(args_dils) else args_dils[-1] for i in range(args.stacks)])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)

            # Cut the unnecessary context
            needed_context_size = int(np.sum(np.ceil((ctxs - 1) / 2)) + np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context", needed_context_size)
            if args_context_size < needed_context_size:
                print("Warning: provided context is shorter than the needed context field of the network")
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(layer.shape), end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)
                print("add conv2d {} filters, {} kernel".format(args.filters, kernel))
                layer = tf.layers.conv2d(layer, args.filters, kernel, (1, 1), "same", activation=None, dilation_rate=(dil, 1))
                layer = activation(layer)

                if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                    print("harmonic stacking {} --> ".format(layer.shape), end="")
                    layer = common.harmonic_stacking(self, layer, args.undertone_stacking, args.overtone_stacking)
                    print(layer.shape)

                layer = common.regularization(layer, args, training=self.is_training)

                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer

            layer = tf.layers.conv2d(layer, 1, args.last_conv_kernel, (1, 1), "same", activation=None)
            if actual_context_size > 0:
                layer = layer[:, actual_context_size:-actual_context_size, :, :]

        self.note_logits = tf.squeeze(layer, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            # Cut the unnecessary context
            voicing_layer = spectrogram
            if args_context_size > 0:
                voicing_layer = spectrogram[:, args_context_size:-args_context_size, :, :]

            if args.voicing_input == "only_salience":
                voicing_layer = tf.stop_gradient(layer)
            if args.voicing_input == "spectrogram_salience":
                voicing_layer = tf.concat([tf.stop_gradient(layer), voicing_layer], axis=-1)
            if args.voicing_input == "spectrogram_salience_train":
                voicing_layer = tf.concat([layer, voicing_layer], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)
            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            # print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
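# Worked example (illustrative only; the kernel sizes below are assumed): the
# per-side context needed by the deep_hcnn stack is the sum of the half-widths
# of all time-axis kernels plus the half-width of the last convolution, which is
# how needed_context_size is computed above.
import numpy as np
ctxs = np.array([3, 3, 3, 3])   # hypothetical time-axis kernel sizes, one per stack
last_conv_kernel = (3, 1)       # hypothetical last convolution kernel
needed_context_size = int(np.sum(np.ceil((ctxs - 1) / 2)) + np.ceil((last_conv_kernel[0] - 1) / 2))
assert needed_context_size == 5  # 4 frames from the stacks + 1 frame from the last conv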
def create_model(self, args):
    spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)

    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        # offset = args.min_note - spectrogram_min_note
        spectrogram = harmonic_stacking(self, self.spectrogram, args.undertone_stacking, args.overtone_stacking)
    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]
        # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))

    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture == "bittner_improved":
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual = layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 70), (1, 1), "same", activation=None, use_bias=False)
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer, args.dropout, training=self.is_training)
            residual += layer
            layer = residual
            layer = tf.layers.batch_normalization(layer, training=self.is_training)
            layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=False)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture == "bittnerlike":
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual = layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 70), (1, 1), "same", activation=activation)
            layer = common.regularization(layer, args, training=self.is_training)
            residual += layer
            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx * 2 + 1, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_simple"):
            residual = None
            for i in range(args.stacks):
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx, args.conv_range), (1, 1), "same", activation=None)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer
            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx + 1, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_smooth"):
            residual = None
            ctx_end = 1
            dilations_start = 5
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end or i >= dilations_start else 1
                dil_rate = (1, 1) if i < dilations_start else (2**(i - dilations_start), 1)
                layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (conv_ctx, args.conv_range), (1, 1), "same", activation=None, dilation_rate=dil_rate)
                print(i, "kernel", (conv_ctx, args.conv_range), "dilation", dil_rate)
                layer = activation(layer)
                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer, args.harmonic_stacking, args.harmonic_stacking + 1)
                layer = common.regularization(layer, args, training=self.is_training)
                if residual is None:
                    residual = layer
                else:
                    residual += layer
            layer = residual
            layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx, 1), (1, 1), "same", activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        self.note_logits = tf.squeeze(layer_cut, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    layer = self.spectrogram
    print(layer.shape)

    context_size = int(args.context_width / self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual = layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 5), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (9, 3), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (5, 70), (1, 1), "same", activation=None, use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = residual
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=False)
        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_input = tf.concat([tf.stop_gradient(layer), self.spectrogram], axis=-1)
            # voicing_input = spectrogram
            print(voicing_input.shape)

            voicing_layer = tf.layers.conv2d(voicing_input, 64, (5, 5), (1, 1), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (5, 70), (1, 5), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (5, 12), (1, 12), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 64, (15, 3), (1, 1), "same", activation=tf.nn.relu, use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer, 0.25, training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(voicing_layer, training=self.is_training)

            print(voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (1, 6), (1, 1), "valid", activation=None, use_bias=True)
            cut_layer = voicing_layer[:, context_size:-context_size, :, :]
            print(cut_layer.shape)
            self.voicing_logits = tf.squeeze(cut_layer)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        spectrogram_windows = []
        print("stacking the spectrogram")
        # shift the spectrogram along the log-frequency axis by the interval of
        # each undertone/overtone and pad back to bin_count bins
        for i in [1 / (x + 2) for x in range(args.undertone_stacking)] + list(range(1, args.overtone_stacking + 1)):
            f_ref = 440  # arbitrary reference frequency
            hz = f_ref * i
            interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
            int_bins = int(round(interval * self.bins_per_semitone))
            spec_layer = self.spectrogram[:, :, max(int_bins, 0):self.bin_count + int_bins, :]
            print(i, "offset", int_bins, "end", self.bin_count + int_bins, "shape", spec_layer.shape)
            if int_bins < 0:
                spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (-int_bins, 0), (0, 0)))
            spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count - spec_layer.shape[2]), (0, 0)))
            print("padded shape", spec_layer.shape)
            spectrogram_windows.append(spec_layer)
        spectrogram = tf.concat(spectrogram_windows, axis=-1)
    else:
        spectrogram = self.spectrogram[:, :, :360, :]
        # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))

    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual = layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 5), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 3), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, 70), (1, 1), "same", activation=None, use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = residual
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.layers.conv2d(layer, 1, (args.last_conv_ctx * 2 + 1, 1), (1, 1), "same", activation=None, use_bias=False)
        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.voicing_capacity_multiplier, (args.voicing_conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    window = self.window[:, :-1]
    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    capacity_multiplier = args.capacity_multiplier

    if args.multiresolution_convolution:
        first_layer = []
        for i in range(args.multiresolution_convolution):
            width = 2**(9 - i)
            capacity = (32 * capacity_multiplier * args.first_layer_capacity) // args.multiresolution_convolution
            # bug in capacity computation
            # capacity = 32//args.multiresolution_convolution*args.first_layer_capacity*capacity_multiplier
            l = common.bn_conv(window_with_channel, capacity, width, 4, "same", activation=tf.nn.relu, training=self.is_training)
            print(l.shape, width)
            first_layer.append(l)
        audio_net = tf.concat(first_layer, 2)
    else:
        if args.variable_stride:
            first_layer = []
            # print(window_with_channel.shape)
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 0:512 * 1, :], 32 * capacity_multiplier, 512, 64, "valid", activation=tf.nn.relu, reuse=None, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 1:512 * 2, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 2:512 * 3, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 3:512 * 4, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 4:512 * 5, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 5:512 * 6, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 6:512 * 7, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 7:512 * 9, :], 32 * capacity_multiplier, 512, 4, "same", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 9:512 * 10, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 10:512 * 11, :], 32 * capacity_multiplier, 512, 8, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 11:512 * 12, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 12:512 * 13, :], 32 * capacity_multiplier, 512, 16, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 13:512 * 14, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 14:512 * 15, :], 32 * capacity_multiplier, 512, 32, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            first_layer.append(common.bn_conv(window_with_channel[:, 512 * 15:512 * 16, :], 32 * capacity_multiplier, 512, 64, "valid", activation=tf.nn.relu, reuse=True, training=self.is_training))
            print(first_layer)
            audio_net = tf.concat(first_layer, 1)
        else:
            audio_net = common.bn_conv(window_with_channel, 32 * capacity_multiplier, 512, 4, "same", activation=tf.nn.relu, training=self.is_training)

    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 4 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 4 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 4 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 8 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net, 16 * capacity_multiplier, 64, 1, "same", activation=tf.nn.relu, training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = tf.layers.flatten(audio_net)

    output_layer = tf.layers.dense(audio_net, self.annotations_per_window * self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
    self.note_logits = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.voicing_threshold = tf.Variable(0.15, trainable=False)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
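# Worked example (illustrative only; the hyperparameter values are assumed):
# the multiresolution front end runs several parallel first-layer convolutions
# whose filter widths halve from 512 samples down, and splits the first-layer
# capacity evenly between them, as in the loop above.
multiresolution_convolution = 8  # hypothetical number of parallel resolutions
capacity_multiplier, first_layer_capacity = 4, 1  # hypothetical capacities
widths = [2**(9 - i) for i in range(multiresolution_convolution)]
capacity = (32 * capacity_multiplier * first_layer_capacity) // multiresolution_convolution
assert widths == [512, 256, 128, 64, 32, 16, 8, 4]
assert capacity == 16  # filters per resolution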
def create_model(self, args):
    context_size = int(self.context_width / self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        self.note_logits = None
        self.note_probabilities = self.spectrogram[:, context_size:-context_size, :360, 0]

    with tf.name_scope('model_voicing'):
        # voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)
        if args.harmonic_stacking > 1:
            spectrogram_windows = []
            print("stacking the spectrogram")
            for i in range(args.harmonic_stacking):
                f_ref = 440  # arbitrary reference frequency
                hz = f_ref * (i + 1)
                interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
                int_bins = int(round(interval * self.bins_per_semitone))
                spec_layer = self.spectrogram[:, :, int_bins:self.bin_count + int_bins, :]
                print(i + 1, "offset", int_bins, "end", self.bin_count + int_bins, "shape", spec_layer.shape)
                spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count - spec_layer.shape[2]), (0, 0)))
                print("padded shape", spec_layer.shape)
                spectrogram_windows.append(spec_layer)
            voicing_layer = tf.concat(spectrogram_windows, axis=-1)
        else:
            voicing_layer = self.spectrogram[:, :, :360, :]

        if args.first_pool_type == "avg":
            voicing_layer = tf.layers.average_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
        if args.first_pool_type == "max":
            voicing_layer = tf.layers.max_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")

        print("after pooling", voicing_layer.shape)

        octave = int(int(voicing_layer.shape[2]) / 6)
        note = int(int(voicing_layer.shape[2]) / 6 / 12)

        if args.activation is not None:
            activation = getattr(tf.nn, args.activation)

        if args.architecture == "full_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_dilated":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "dilated_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave_fix":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave_octave_temporal":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 2 + 1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8 * args.capacity_multiplier, (args.conv_ctx * 7 + 1, 3), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.last_layer == "conv":
            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.last_conv_ctx, args.last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.last_conv_ctx * 2 + 1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
        if args.last_layer == "dense":
            voicing_layer = tf.layers.flatten(voicing_layer)
            self.voicing_logits = tf.layers.dense(voicing_layer, args.annotations_per_window)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
def create_model(self, args):
    receptive_field = args.stack_number * (args.max_dilation * 2) - (args.stack_number - 1)
    receptive_field_ms = (receptive_field * 1000) / args.samplerate
    context_width = self.context_width
    print("receptive field: {} samples, {:.4f} ms".format(receptive_field, receptive_field_ms))

    if self.context_width > receptive_field:
        context_width = receptive_field
        diff = self.context_width - receptive_field
        window = self.window[:, diff:-diff]
        print("cutting window {}->{}".format(self.window.shape, window.shape))
    else:
        window = self.window
        print("warning: receptive field larger than context width")

    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    initial_layer = window_with_channel
    if args.initial_filter_width > 0:
        initial_layer = tf.layers.conv1d(initial_layer, args.residual_channels, args.initial_filter_width, 1, args.initial_filter_padding, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

    skip_connections = []
    dilations = [2**x for x in range(int(np.log2(args.max_dilation)) + 1)] * args.stack_number
    print(dilations)
    current_layer = initial_layer

    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                conv_filter = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                conv_gate = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)

                skip = tf.layers.conv1d(out, args.skip_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                transformed = tf.layers.conv1d(out, args.residual_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

                if args.dilation_layer_dropout:
                    transformed = tf.layers.dropout(transformed, args.dilation_layer_dropout, training=self.is_training)

                current_layer = transformed + current_layer

                skip_connections.append(skip)
                print(skip)

    with tf.name_scope('postprocessing'):
        if args.skip == "add":
            skip_sum = tf.math.add_n(skip_connections)
        elif args.skip == "concat":
            skip_sum = tf.concat(skip_connections, -1)
        elif args.skip == "last":
            skip_sum = skip_connections[-1]

        if context_width:
            skip_sum = skip_sum[:, context_width:-context_width, :]
        print("skip output", skip_sum.shape)

        skip = tf.nn.relu(skip_sum)

        if args.skip_layer_dropout:
            skip = tf.layers.dropout(skip, args.skip_layer_dropout, training=self.is_training)

        # skip = tf.layers.average_pooling1d(skip, 93, 93, "valid")
        # skip = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=tf.nn.relu, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # output_layer = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = common.add_layers_from_string(self, skip, args.postprocessing)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.nn.relu(skip_sum)

    print("after skip output processing", output_layer.shape)

    if output_layer.shape.as_list() != [None, self.annotations_per_window, self.bin_count]:
        print("shape not compatible, adding FC layer")
        output_layer = tf.nn.relu(output_layer)
        output_layer = tf.layers.flatten(output_layer)
        output_layer = tf.layers.dense(output_layer, self.annotations_per_window * self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.note_logits = output_layer
    self.voicing_threshold = tf.Variable(0.15, trainable=False)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
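# Worked example (illustrative only; the hyperparameter values are assumed):
# the receptive field of the dilated stack as computed above, and its duration
# at a 44.1 kHz sample rate.
stack_number, max_dilation, samplerate = 2, 512, 44100  # hypothetical values
receptive_field = stack_number * (max_dilation * 2) - (stack_number - 1)
receptive_field_ms = (receptive_field * 1000) / samplerate
assert receptive_field == 2047
print("{:.2f} ms".format(receptive_field_ms))  # ~46.42 ms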