Example #1
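# NOTE: these snippets are methods of a model class and assume the module-level
# imports used throughout: tensorflow 1.x as tf, numpy as np, librosa, and a
# project-local `common` helper module (some snippets also use a directly
# imported `harmonic_stacking`).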
def create_model(self, args):
    window = self.window
    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    initial_layer = tf.layers.conv1d(window_with_channel, args.residual_channels, args.initial_filter_width, 1, args.initial_filter_padding,
                                     activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

    skip_connections = []
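    # Dilation schedule: powers of two up to max_dilation, repeated stack_number times (WaveNet-style).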
    dilations = [2**x for x in range(int(np.log2(args.max_dilation))+1)]*args.stack_number
    print(dilations)
    current_layer = initial_layer
    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                conv_filter = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation,
                                               use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                conv_gate = tf.layers.conv1d(current_layer, args.residual_channels, args.filter_width, 1, "same", dilation_rate=dilation,
                                             use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
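                # Gated activation unit as in WaveNet: tanh filter modulated by a sigmoid gate.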
                out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)
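                # 1x1 convolutions: one branch feeds the skip connections, the other the residual path.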
                skip = tf.layers.conv1d(out, args.skip_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                transformed = tf.layers.conv1d(out, args.residual_channels, 1, 1, "same", use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
                if args.dilation_layer_dropout:
                    transformed = tf.layers.dropout(transformed, args.dilation_layer_dropout, training=self.is_training)
                current_layer = transformed + current_layer

                skip_connections.append(skip)
                print(skip)

    with tf.name_scope('postprocessing'):
        skip_sum = tf.math.add_n(skip_connections)
        skip = tf.nn.relu(skip_sum)
        if args.skip_layer_dropout:
            skip = tf.layers.dropout(skip, args.skip_layer_dropout, training=self.is_training)

        # skip = tf.layers.average_pooling1d(skip, 93, 93, "valid")
        # skip = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=tf.nn.relu, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # output_layer = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

        output_layer = common.add_layers_from_string(skip, args.postprocessing)

        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.nn.relu(skip_sum)
        print(output_layer.shape)

    if output_layer.shape.as_list() != [None, self.annotations_per_window, self.bin_count]:
        print("shape not compatible, adding FC layer")
        output_layer = tf.nn.relu(output_layer)
        output_layer = tf.layers.flatten(output_layer)
        output_layer = tf.layers.dense(output_layer, self.annotations_per_window*self.bin_count, activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        output_layer = tf.reshape(output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.note_logits = output_layer

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example #2
def create_model(self, args):
    if args.spectrogram_undertone_stacking > 0 or args.spectrogram_overtone_stacking > 1:
        # for spectrograms starting at a lower note than the output
        # spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
        # offset = args.min_note - spectrogram_min_note
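        # Stack pitch-shifted copies of the spectrogram along the channel axis,
        # aligning each harmonic/undertone with its fundamental's bin.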
        spectrogram = common.harmonic_stacking(
            self, self.spectrogram, args.spectrogram_undertone_stacking,
            args.spectrogram_overtone_stacking)

    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # if args.specaugment_prob:
    # in_shape = tf.shape(spectrogram)
    # batch_size = in_shape[0]
    # freq_shape = (batch_size, self.bin_count)
    # drop_batch = tf.random.uniform((batch_size, 1))
    # drop_freq_bands = tf.random.uniform((batch_size, 1), maxval=self.bin_count)

    # band_size = tf.random.uniform((batch_size, 1), minval=5, maxval=15)

    # masking_fn = tf.where(np.abs(tf.tile(tf.expand_dims(tf.range(self.bin_count, dtype=tf.float32), 0), [
    #                         batch_size, 1])-drop_freq_bands) < band_size, tf.zeros(freq_shape), tf.ones(freq_shape))

    # mask = tf.where(tf.tile(tf.greater(drop_batch, args.specaugment_prob), [1, self.bin_count]), tf.ones(freq_shape), masking_fn)
    # mask = tf.tile(mask[:, tf.newaxis, :, tf.newaxis], [1, in_shape[1], 1, in_shape[3]])

    # tf.summary.image("spectrogram", spectrogram[:,:,:,1:2])
    # tf.summary.image("spec_mask", mask[:,:,:,:1])
    # spectrogram = spectrogram*tf.cond(self.is_training, lambda: mask, lambda: tf.ones_like(spectrogram))
    # tf.summary.image("spectrogram_masked", spectrogram[:,:,:,:1])

    print("spectrogram shape", spectrogram.shape)

    args_context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)  # e.g. "relu" -> tf.nn.relu; later layers assume this is set

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture.startswith("deep_hcnn"):
            assert len(args.conv_ctx) <= args.stacks
            # Prepare kernel sizes (time axis = audio context)
            args_ctx = np.abs(args.conv_ctx)
            args_dils = np.abs(args.dilations)
            ctxs = np.array([
                args_ctx[i] if i < len(args_ctx) else args_ctx[-1]
                for i in range(args.stacks)
            ])
            dils = np.array([
                args_dils[i] if i < len(args_dils) else args_dils[-1]
                for i in range(args.stacks)
            ])
            if args.conv_ctx[0] < 0:
                ctxs = np.array(list(reversed(ctxs)))
            if args.dilations[0] < 0:
                dils = np.array(list(reversed(dils)))
            print(ctxs)

            # Cut the unnecessary context
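            # Time-axis receptive field: half of each stack kernel plus half of the final kernel.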
            needed_context_size = int(
                np.sum(np.ceil((ctxs - 1) / 2)) +
                np.ceil((args.last_conv_kernel[0] - 1) / 2))
            actual_context_size = args_context_size
            print("input context", args_context_size, "actual needed context",
                  needed_context_size)
            if args_context_size < needed_context_size:
                print(
                    "Warning: the provided context is shorter than the network's receptive field"
                )
            elif args_context_size > needed_context_size:
                if args.cut_context:
                    print("Cutting the unnecessary context {} --> ".format(
                        layer.shape),
                          end="")
                    diff = args_context_size - needed_context_size
                    layer = layer[:, diff:-diff, :, :]
                    actual_context_size -= diff
                    print(layer.shape, "context now:", actual_context_size)

            skip = None
            for i, conv_ctx, dil in zip(range(args.stacks), ctxs, dils):
                kernel = (conv_ctx, args.conv_range)
                print("add conv2d {} filters, {} kernel".format(
                    args.filters, kernel))
                layer = tf.layers.conv2d(layer,
                                         args.filters,
                                         kernel, (1, 1),
                                         "same",
                                         activation=None,
                                         dilation_rate=(dil, 1))

                layer = activation(layer)

                if args.undertone_stacking > 0 or args.overtone_stacking > 1:
                    print("harmonic stacking {} --> ".format(layer.shape),
                          end="")
                    layer = common.harmonic_stacking(self, layer,
                                                     args.undertone_stacking,
                                                     args.overtone_stacking)
                    print(layer.shape)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if i < args.stacks - args.residual_end and i % args.residual_hop == 0:
                    if skip is None:
                        print(".- begin residual connection")
                    else:
                        if args.residual_op == "add":
                            print("|- adding residual connection")
                            layer += skip
                        if args.residual_op == "concat":
                            print("|- concatenating residual connection")
                            layer = tf.concat([skip, layer], -1)
                    skip = layer

            layer = tf.layers.conv2d(layer,
                                     1,
                                     args.last_conv_kernel, (1, 1),
                                     "same",
                                     activation=None)
            if actual_context_size > 0:
                layer = layer[:,
                              actual_context_size:-actual_context_size, :, :]

        self.note_logits = tf.squeeze(layer, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            # Cut the unnecessary context
            voicing_layer = spectrogram
            if args_context_size > 0:
                voicing_layer = spectrogram[:, args_context_size:
                                            -args_context_size, :, :]

            if args.voicing_input == "only_salience":
                voicing_layer = tf.stop_gradient(layer)
            if args.voicing_input == "spectrogram_salience":
                voicing_layer = tf.concat(
                    [tf.stop_gradient(layer), voicing_layer], axis=-1)
            if args.voicing_input == "spectrogram_salience_train":
                voicing_layer = tf.concat([layer, voicing_layer], axis=-1)

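            # Bin width of one semitone, assuming the frequency axis spans 6 octaves of 12 semitones.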
            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, note), (1, 1),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, note), (1, note),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, octave), (1, 1),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (1, octave), (1, octave),
                                             "same",
                                             activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             1, (1, voicing_layer.shape[2]),
                                             (1, 1),
                                             "valid",
                                             activation=None,
                                             use_bias=True)
            print("last conv output", voicing_layer.shape)
            # print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example #3
def create_model(self, args):
    spectrogram_min_note = librosa.core.hz_to_midi(self.spectrogram_fmin)
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        # offset = args.min_note - spectrogram_min_note
        spectrogram = harmonic_stacking(self, self.spectrogram,
                                        args.undertone_stacking,
                                        args.overtone_stacking)

    else:
        spectrogram = self.spectrogram[:, :, :self.bin_count, :]

    # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))
    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        if args.architecture == "bittner_improved":
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (5, 5),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual = layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (5, 5),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (9, 3),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (9, 3),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier, (5, 70),
                                     (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = activation(layer)
            layer = tf.layers.dropout(layer,
                                      args.dropout,
                                      training=self.is_training)
            residual += layer

            layer = residual

            layer = tf.layers.batch_normalization(layer,
                                                  training=self.is_training)
            layer = tf.layers.conv2d(layer,
                                     1, (10, 1), (1, 1),
                                     "same",
                                     activation=None,
                                     use_bias=False)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture == "bittnerlike":

            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 5), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual = layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 5), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 3), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 3), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer
            layer = tf.layers.conv2d(layer,
                                     8 * args.capacity_multiplier,
                                     (args.conv_ctx * 2 + 1, 70), (1, 1),
                                     "same",
                                     activation=activation)
            layer = common.regularization(layer,
                                          args,
                                          training=self.is_training)
            residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx * 2 + 1, 1),
                                     (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_simple"):
            residual = None
            for i in range(args.stacks):
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (args.conv_ctx, args.conv_range),
                                         (1, 1),
                                         "same",
                                         activation=None)

                layer = activation(layer)

                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer,
                                              args.harmonic_stacking,
                                              args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx + 1, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        if args.architecture.startswith("deep_smooth"):
            residual = None
            ctx_end = 1
            dilations_start = 5
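            # Layer 0 and the dilated layers (i >= 5) use the full time kernel; layers 1-4 use kernel 1.
            # Time-axis dilation starts at layer 5 and doubles with each subsequent layer.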
            for i in range(args.stacks):
                conv_ctx = args.conv_ctx if i < ctx_end or i >= dilations_start else 1
                dil_rate = (1, 1) if i < dilations_start else (2**(
                    i - dilations_start), 1)
                layer = tf.layers.conv2d(layer,
                                         8 * args.capacity_multiplier,
                                         (conv_ctx, args.conv_range), (1, 1),
                                         "same",
                                         activation=None,
                                         dilation_rate=dil_rate)
                print(i, "kernel", (conv_ctx, args.conv_range), "dilation",
                      dil_rate)

                layer = activation(layer)

                if args.harmonic_stacking:
                    layer = harmonic_stacking(self, layer,
                                              args.harmonic_stacking,
                                              args.harmonic_stacking + 1)

                layer = common.regularization(layer,
                                              args,
                                              training=self.is_training)

                if residual is None:
                    residual = layer
                else:
                    residual += layer

            layer = residual

            layer = tf.layers.conv2d(layer,
                                     1, (args.last_conv_ctx, 1), (1, 1),
                                     "same",
                                     activation=None)
            layer_cut = layer[:, context_size:-context_size, :, :]

        self.note_logits = tf.squeeze(layer_cut, -1)
        print("note_logits shape", self.note_logits.shape)

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram],
                                      axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, note),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, octave),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(
                    voicing_layer,
                    ((0, 0),
                     (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx),
                     (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                1,
                (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]),
                (1, 1),
                "valid",
                activation=None,
                use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example #4
def create_model(self, args):
    layer = self.spectrogram

    print(layer.shape)

    context_size = int(args.context_width / self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier, (5, 5), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
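        # Keep a running residual sum across the five convolutional blocks.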
        residual = layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier, (5, 5), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier, (9, 3), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier, (9, 3), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier, (5, 70), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = residual

        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.layers.conv2d(layer,
                                 1, (10, 1), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_input = tf.concat(
                [tf.stop_gradient(layer), self.spectrogram], axis=-1)
            # voicing_input = spectrogram
            print(voicing_input.shape)
            voicing_layer = tf.layers.conv2d(voicing_input,
                                             64, (5, 5), (1, 1),
                                             "same",
                                             activation=tf.nn.relu,
                                             use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer,
                                              0.25,
                                              training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(
                voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (5, 70), (1, 5),
                                             "same",
                                             activation=tf.nn.relu,
                                             use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer,
                                              0.25,
                                              training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(
                voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (5, 12), (1, 12),
                                             "same",
                                             activation=tf.nn.relu,
                                             use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer,
                                              0.25,
                                              training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(
                voicing_layer, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             64, (15, 3), (1, 1),
                                             "same",
                                             activation=tf.nn.relu,
                                             use_bias=False)
            voicing_layer = tf.layers.dropout(voicing_layer,
                                              0.25,
                                              training=self.is_training)
            voicing_layer = tf.layers.batch_normalization(
                voicing_layer, training=self.is_training)

            print(voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer,
                                             1, (1, 6), (1, 1),
                                             "valid",
                                             activation=None,
                                             use_bias=True)
            cut_layer = voicing_layer[:, context_size:-context_size, :, :]
            print(cut_layer.shape)
            self.voicing_logits = tf.squeeze(cut_layer)
    else:
        self.voicing_threshold = tf.Variable(0.15, trainable=False)
        tf.summary.scalar("model/voicing_threshold", self.voicing_threshold)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example #5
def create_model(self, args):
    if args.overtone_stacking > 0 or args.undertone_stacking > 0:
        spectrogram_windows = []
        print("stacking the spectrogram")
        for i in [1 / (x + 2) for x in range(args.undertone_stacking)] + list(
                range(1, args.overtone_stacking + 1)):
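            # Bin offset for harmonic ratio i: hz_to_midi(f_ref*i) - hz_to_midi(f_ref) = 12*log2(i) semitones.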
            f_ref = 440  # arbitrary reference frequency
            hz = f_ref * i
            interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(
                f_ref)
            int_bins = int(round(interval * self.bins_per_semitone))
            spec_layer = self.spectrogram[:, :,
                                          max(int_bins, 0):self.bin_count +
                                          int_bins, :]
            print(i, "offset", int_bins, "end", self.bin_count + int_bins,
                  "shape", spec_layer.shape)
            if int_bins < 0:
                spec_layer = tf.pad(spec_layer,
                                    ((0, 0), (0, 0), (-int_bins, 0), (0, 0)))

            spec_layer = tf.pad(spec_layer,
                                ((0, 0), (0, 0),
                                 (0, self.bin_count - spec_layer.shape[2]),
                                 (0, 0)))
            print("padded shape", spec_layer.shape)
            spectrogram_windows.append(spec_layer)
        spectrogram = tf.concat(spectrogram_windows, axis=-1)

    else:
        spectrogram = self.spectrogram[:, :, :360, :]

    # layer = tf.pad(layer, ((0, 0), (0, 0), (41, 41), (0, 0)))
    print(spectrogram.shape)

    context_size = int(self.context_width / self.spectrogram_hop_size)

    if args.activation is not None:
        activation = getattr(tf.nn, args.activation)

    with tf.name_scope('model_pitch'):
        layer = spectrogram

        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 5), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)

        residual = layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 5), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 3), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 3), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer
        layer = tf.layers.conv2d(layer,
                                 8 * args.capacity_multiplier,
                                 (args.conv_ctx * 2 + 1, 70), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        # layer = common.regularization(layer, args, training=self.is_training)
        layer = tf.layers.batch_normalization(layer, training=self.is_training)
        layer = tf.nn.relu(layer)
        layer = tf.layers.dropout(layer, 0.25, training=self.is_training)
        residual += layer

        layer = residual
        layer = tf.layers.batch_normalization(layer, training=self.is_training)

        layer = tf.layers.conv2d(layer,
                                 1, (args.last_conv_ctx * 2 + 1, 1), (1, 1),
                                 "same",
                                 activation=None,
                                 use_bias=False)
        layer_cut = layer[:, context_size:-context_size, :, :]
        # layer = tf.layers.conv2d(layer, 1, (10, 1), (1, 1), "same", activation=None, use_bias=True)

        note_output = tf.squeeze(layer_cut, -1)
        print(note_output.shape)
        self.note_logits = note_output

    if args.voicing:
        with tf.name_scope('model_voicing'):
            voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram],
                                      axis=-1)

            note = int(int(voicing_layer.shape[2]) / 6 / 12)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, note), (1, note),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            octave = int(int(voicing_layer.shape[2]) / 6)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, 1),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                8 * args.voicing_capacity_multiplier,
                (args.voicing_conv_ctx * 2 + 1, octave), (1, octave),
                "same",
                activation=activation)
            voicing_layer = common.regularization(voicing_layer,
                                                  args,
                                                  training=self.is_training)

            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.voicing_last_conv_ctx:
                voicing_layer = tf.pad(
                    voicing_layer,
                    ((0, 0),
                     (args.voicing_last_conv_ctx, args.voicing_last_conv_ctx),
                     (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(
                voicing_layer,
                1,
                (args.voicing_last_conv_ctx * 2 + 1, voicing_layer.shape[2]),
                (1, 1),
                "valid",
                activation=None,
                use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", voicing_layer.shape)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example #6
def create_model(self, args):
    window = self.window[:, :-1]
    window = common.input_normalization(window, args)
    window_with_channel = tf.expand_dims(window, axis=2)

    capacity_multiplier = args.capacity_multiplier

    if args.multiresolution_convolution:
        first_layer = []

        for i in range(args.multiresolution_convolution):
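            # Kernel widths halve with each resolution: 512, 256, 128, ... samples.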
            width = 2**(9 - i)
            capacity = (32 * capacity_multiplier * args.first_layer_capacity
                        ) // args.multiresolution_convolution
            # the original capacity computation below had a bug:
            # capacity = 32//args.multiresolution_convolution*args.first_layer_capacity*capacity_multiplier
            l = common.bn_conv(window_with_channel,
                               capacity,
                               width,
                               4,
                               "same",
                               activation=tf.nn.relu,
                               training=self.is_training)
            print(l.shape, width)
            first_layer.append(l)

        audio_net = tf.concat(first_layer, 2)
    else:
        if args.variable_stride:
            first_layer = []
            # print(window_with_channel.shape)
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 0:512 * 1, :],
                               32 * capacity_multiplier,
                               512,
                               64,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=None,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 1:512 * 2, :],
                               32 * capacity_multiplier,
                               512,
                               32,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 2:512 * 3, :],
                               32 * capacity_multiplier,
                               512,
                               32,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 3:512 * 4, :],
                               32 * capacity_multiplier,
                               512,
                               16,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 4:512 * 5, :],
                               32 * capacity_multiplier,
                               512,
                               16,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 5:512 * 6, :],
                               32 * capacity_multiplier,
                               512,
                               8,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 6:512 * 7, :],
                               32 * capacity_multiplier,
                               512,
                               8,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))

            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 7:512 * 9, :],
                               32 * capacity_multiplier,
                               512,
                               4,
                               "same",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))

            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 9:512 * 10, :],
                               32 * capacity_multiplier,
                               512,
                               8,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 10:512 * 11, :],
                               32 * capacity_multiplier,
                               512,
                               8,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 11:512 * 12, :],
                               32 * capacity_multiplier,
                               512,
                               16,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 12:512 * 13, :],
                               32 * capacity_multiplier,
                               512,
                               16,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 13:512 * 14, :],
                               32 * capacity_multiplier,
                               512,
                               32,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 14:512 * 15, :],
                               32 * capacity_multiplier,
                               512,
                               32,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            first_layer.append(
                common.bn_conv(window_with_channel[:, 512 * 15:512 * 16, :],
                               32 * capacity_multiplier,
                               512,
                               64,
                               "valid",
                               activation=tf.nn.relu,
                               reuse=True,
                               training=self.is_training))
            print(first_layer)
            audio_net = tf.concat(first_layer, 1)
        else:
            audio_net = common.bn_conv(window_with_channel,
                                       32 * capacity_multiplier,
                                       512,
                                       4,
                                       "same",
                                       activation=tf.nn.relu,
                                       training=self.is_training)

    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net,
                               4 * capacity_multiplier,
                               64,
                               1,
                               "same",
                               activation=tf.nn.relu,
                               training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net,
                               4 * capacity_multiplier,
                               64,
                               1,
                               "same",
                               activation=tf.nn.relu,
                               training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net,
                               4 * capacity_multiplier,
                               64,
                               1,
                               "same",
                               activation=tf.nn.relu,
                               training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net,
                               8 * capacity_multiplier,
                               64,
                               1,
                               "same",
                               activation=tf.nn.relu,
                               training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = common.bn_conv(audio_net,
                               16 * capacity_multiplier,
                               64,
                               1,
                               "same",
                               activation=tf.nn.relu,
                               training=self.is_training)
    audio_net = tf.layers.max_pooling1d(audio_net, 2, 2)
    audio_net = tf.layers.dropout(audio_net, 0.25, training=self.is_training)

    audio_net = tf.layers.flatten(audio_net)

    output_layer = tf.layers.dense(audio_net,
                                   self.annotations_per_window *
                                   self.bin_count,
                                   activation=None,
                                   bias_regularizer=tf.nn.l2_loss,
                                   kernel_regularizer=tf.nn.l2_loss)
    self.note_logits = tf.reshape(
        output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.voicing_threshold = tf.Variable(0.15, trainable=False)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)
Example #7
def create_model(self, args):
    context_size = int(self.context_width/self.spectrogram_hop_size)

    with tf.name_scope('model_pitch'):
        self.note_logits = None
        self.note_probabilities = self.spectrogram[:, context_size:-context_size, :360, 0]
    

    with tf.name_scope('model_voicing'):
        # voicing_layer = tf.concat([tf.stop_gradient(layer), spectrogram], axis=-1)

        if args.harmonic_stacking > 1:
            spectrogram_windows = []
            print("stacking the spectrogram")
            for i in range(args.harmonic_stacking):
                f_ref = 440 # arbitrary reference frequency
                hz = f_ref*(i+1)
                interval = librosa.core.hz_to_midi(hz) - librosa.core.hz_to_midi(f_ref)
                int_bins = int(round(interval*self.bins_per_semitone))
                spec_layer = self.spectrogram[:, :, int_bins:self.bin_count+int_bins, :]
                print(i+1, "offset", int_bins, "end", self.bin_count+int_bins, "shape", spec_layer.shape)
                spec_layer = tf.pad(spec_layer, ((0, 0), (0, 0), (0, self.bin_count-spec_layer.shape[2]), (0, 0)))
                print("padded shape", spec_layer.shape)
                spectrogram_windows.append(spec_layer)
            voicing_layer = tf.concat(spectrogram_windows, axis=-1)

        else:
            voicing_layer = self.spectrogram[:, :, :360, :]

        if args.first_pool_type == "avg":
            voicing_layer = tf.layers.average_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
        if args.first_pool_type == "max":
            voicing_layer = tf.layers.max_pooling2d(voicing_layer, args.first_pool_size, args.first_pool_stride, padding="same")
        
        print("after pooling", voicing_layer.shape)

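        # Octave and semitone widths in bins, assuming a 6-octave frequency axis.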
        octave = int(int(voicing_layer.shape[2])/6)
        note = int(int(voicing_layer.shape[2])/6/12)

        if args.activation is not None:
            activation = getattr(tf.nn, args.activation)

        if args.architecture == "full_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        if args.architecture == "octave_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        if args.architecture == "note_1layer":
            if args.conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.conv_ctx, args.conv_ctx), (0, 0), (0, 0)))
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "valid", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "note_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "note_dilated":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "dilated_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (1, 6), (1, 1), "same", activation=activation, dilation_rate=(1, octave))
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "octave_note":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        

        if args.architecture == "note_octave_fix":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
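            # recompute the octave size: the previous convolution strides by
            # one semitone and shrinks the frequency axis by a factor of note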
            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.architecture == "note_note_note_octave_octave":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)
        
        if args.architecture == "note_octave_octave_temporal":
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, note), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, note), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            octave = int(int(voicing_layer.shape[2])/6)
            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*2+1, octave), (1, octave), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

            voicing_layer = tf.layers.conv2d(voicing_layer, 8*args.capacity_multiplier, (args.conv_ctx*7+1, 3), (1, 1), "same", activation=activation)
            voicing_layer = common.regularization(voicing_layer, args, training=self.is_training)

        if args.last_layer == "conv":
            print("adding last conv valid layer")
            print("model output", voicing_layer.shape)
            if args.last_conv_ctx:
                voicing_layer = tf.pad(voicing_layer, ((0, 0), (args.last_conv_ctx, args.last_conv_ctx), (0, 0), (0, 0)))
                print("padded", voicing_layer.shape)
            voicing_layer = tf.layers.conv2d(voicing_layer, 1, (args.last_conv_ctx*2+1, voicing_layer.shape[2]), (1, 1), "valid", activation=None, use_bias=True)
            print("last conv output", voicing_layer.shape)
            voicing_layer = voicing_layer[:, context_size:-context_size, :, :]
            print("cut context", voicing_layer.shape)
            self.voicing_logits = tf.squeeze(voicing_layer)
            print("squeeze", self.voicing_logits.shape)
        if args.last_layer == "dense":
            voicing_layer = tf.layers.flatten(voicing_layer)
            self.voicing_logits = tf.layers.dense(voicing_layer, args.annotations_per_window)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)

def create_model(self, args):
    # receptive field of the dilated convolution stack, in samples
    receptive_field = args.stack_number * (args.max_dilation * 2) - (
        args.stack_number - 1)
    receptive_field_ms = (receptive_field * 1000) / args.samplerate

    context_width = self.context_width

    print("receptive field: {} samples, {:.4f} ms".format(
        receptive_field, receptive_field_ms))
    if self.context_width > receptive_field:
        # the network cannot use more context than its receptive field, so cut the window
        context_width = receptive_field
        diff = self.context_width - receptive_field
        window = self.window[:, diff:-diff]
        print("cutting window {}->{}".format(self.window.shape, window.shape))
    else:
        window = self.window
        if self.context_width < receptive_field:
            print("warning: receptive field larger than context width")

    window = common.input_normalization(window, args)
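    # conv1d expects a channel axis: (batch, samples) -> (batch, samples, 1)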
    window_with_channel = tf.expand_dims(window, axis=2)

    initial_layer = window_with_channel
    if args.initial_filter_width > 0:
        initial_layer = tf.layers.conv1d(initial_layer,
                                         args.residual_channels,
                                         args.initial_filter_width,
                                         1,
                                         args.initial_filter_padding,
                                         activation=None,
                                         bias_regularizer=tf.nn.l2_loss,
                                         kernel_regularizer=tf.nn.l2_loss)

    skip_connections = []
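    # dilation factors 1, 2, 4, ..., max_dilation, repeated for each stack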
    dilations = [2**x for x in range(int(np.log2(args.max_dilation)) + 1)
                 ] * args.stack_number
    print(dilations)
    current_layer = initial_layer
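    # WaveNet-style stack: gated dilated convolutions with residual connections
    # and per-layer skip outputs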
    with tf.name_scope('dilated_stack'):
        for layer_index, dilation in enumerate(dilations):
            with tf.name_scope('layer{}'.format(layer_index)):
                conv_filter = tf.layers.conv1d(
                    current_layer,
                    args.residual_channels,
                    args.filter_width,
                    1,
                    "same",
                    dilation_rate=dilation,
                    use_bias=args.use_biases,
                    bias_regularizer=tf.nn.l2_loss,
                    kernel_regularizer=tf.nn.l2_loss)
                conv_gate = tf.layers.conv1d(current_layer,
                                             args.residual_channels,
                                             args.filter_width,
                                             1,
                                             "same",
                                             dilation_rate=dilation,
                                             use_bias=args.use_biases,
                                             bias_regularizer=tf.nn.l2_loss,
                                             kernel_regularizer=tf.nn.l2_loss)
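                # gated activation unit: tanh filter modulated by a sigmoid gate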
                out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)
                skip = tf.layers.conv1d(out,
                                        args.skip_channels,
                                        1,
                                        1,
                                        "same",
                                        use_bias=args.use_biases,
                                        bias_regularizer=tf.nn.l2_loss,
                                        kernel_regularizer=tf.nn.l2_loss)
                transformed = tf.layers.conv1d(
                    out,
                    args.residual_channels,
                    1,
                    1,
                    "same",
                    use_bias=args.use_biases,
                    bias_regularizer=tf.nn.l2_loss,
                    kernel_regularizer=tf.nn.l2_loss)
                if args.dilation_layer_dropout:
                    transformed = tf.layers.dropout(
                        transformed,
                        args.dilation_layer_dropout,
                        training=self.is_training)
                current_layer = transformed + current_layer

                skip_connections.append(skip)
                print(skip)

    with tf.name_scope('postprocessing'):
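        # combine the per-layer skip outputs according to args.skip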
        if args.skip == "add":
            skip_sum = tf.math.add_n(skip_connections)
        elif args.skip == "concat":
            skip_sum = tf.concat(skip_connections, -1)
        elif args.skip == "last":
            skip_sum = skip_connections[-1]

        if context_width:
            # crop the context so that only the annotated window remains
            skip_sum = skip_sum[:, context_width:-context_width, :]

        print("skip output", skip_sum.shape)
        skip = tf.nn.relu(skip_sum)
        if args.skip_layer_dropout:
            skip = tf.layers.dropout(skip,
                                     args.skip_layer_dropout,
                                     training=self.is_training)

        # skip = tf.layers.average_pooling1d(skip, 93, 93, "valid")
        # skip = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=tf.nn.relu, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # output_layer = tf.layers.conv1d(skip, self.bin_count, 3, 1, "same", activation=None, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)

        # build the postprocessing layers described by the args.postprocessing string
        output_layer = common.add_layers_from_string(self, skip,
                                                     args.postprocessing)

        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.layers.conv1d(skip, 256, 16, 8, "same", activation=tf.nn.relu, use_bias=args.use_biases, bias_regularizer=tf.nn.l2_loss, kernel_regularizer=tf.nn.l2_loss)
        # skip = tf.nn.relu(skip_sum)
        print("after skip output processing", output_layer.shape)

    if output_layer.shape.as_list() != [
            None, self.annotations_per_window, self.bin_count
    ]:
        print("shape not compatible, adding FC layer")
        output_layer = tf.nn.relu(output_layer)
        output_layer = tf.layers.flatten(output_layer)
        output_layer = tf.layers.dense(output_layer,
                                       self.annotations_per_window *
                                       self.bin_count,
                                       activation=None,
                                       bias_regularizer=tf.nn.l2_loss,
                                       kernel_regularizer=tf.nn.l2_loss)
        output_layer = tf.reshape(
            output_layer, [-1, self.annotations_per_window, self.bin_count])

    self.note_logits = output_layer

    self.voicing_threshold = tf.Variable(0.15, trainable=False)

    self.loss = common.loss(self, args)
    self.est_notes = common.est_notes(self, args)
    self.training = common.optimizer(self, args)