Beispiel #1
0
def vggish(model_path):
    with tf.Graph().as_default() as default_grapth:
        sess = tf.Session()
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, model_path)
        features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

    return sess, embedding_tensor, embedding_tensor, features_tensor
Beispiel #2
0
def model(learning_rate=vggish_params.LEARNING_RATE):
    graph = tf.Graph()

    with graph.as_default():
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        with tf.variable_scope("mymodel"):
            # Add a fully connected layer with 100 units.
            num_units = 100

            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(fc,
                                          params.NUM_CLASSES,
                                          activation_fn=None,
                                          scope='logits')

            prediction = tf.argmax(logits, name='prediction')

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(0,
                                          name='global_step',
                                          trainable=False,
                                          collections=[
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              tf.GraphKeys.GLOBAL_STEP
                                          ])

                # Labels are assumed to be fed as a batch multi-hot vectors, with
                # a 1 in the position of each positive class label, and 0 elsewhere.
                labels = tf.placeholder(tf.float32,
                                        shape=(None, params.NUM_CLASSES),
                                        name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=logits, labels=labels, name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)
                variable_summaries(loss)

                # Calculate accuracy
                #accuracy = tf.metrics.accuracy(labels=labels, predictions=logits, name="acc")

                #tf.summary.scalar('accuracy', accuracy)
                #variable_summaries(accuracy)

                # We use the same optimizer and hyperparameters as used to train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(loss,
                                   global_step=global_step,
                                   name='train_op')

    return graph, prediction
def create_vggish_network(sess, config):
    """
    Define VGGish model, load the checkpoint, and return a dictionary that points
    to the different tensors defined by the model.
    """
    vggish_slim.define_vggish_slim(training=False)
    vggish_params.EXAMPLE_HOP_SECONDS = config.vggish_hop_size

    vggish_slim.load_vggish_slim_checkpoint(
        sess, config.vggish_model_checkpoint_path)

    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    return {'features': features_tensor, 'embedding': embedding_tensor}
    def __init__(self, ckpt_fname, add_classifier=False):
        self.add_classifier = add_classifier
        num_time_samples = 3
        spec_ph = tf.placeholder(tf.float32, (num_time_samples, 96, 64))
        embeddings = vggish_slim.define_vggish_slim(spec_ph, training=True)
        self.spec_ph, self.embeddings = spec_ph, embeddings
        if add_classifier:
            with tf.variable_scope('mymodel'):
                num_units = 100
                num_classes = 527
                fc = slim.fully_connected(embeddings, num_units)
                self.logits = slim.fully_connected(
                    fc, num_classes, activation_fn=None, scope='logits'
                )

        self.sess = tf.Session()
        tf.train.Saver().restore(self.sess, ckpt_fname)
        count += 1
        if count % 100 == 0:
            print("At File ", count, "/", N)

print("Done!")

print("Computing Tensorflow Embeddings...")
# Prepare a postprocessor to munge the model embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params)

output_sequences = []

with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    count = 0
    for batch in batches:
        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        output_sequences.append(postprocessed_batch)
        count += 1
        if count % 100 == 0:
def model(learning_rate=vggish_params.LEARNING_RATE,
          training=FLAGS.train_vggish):
    graph = tf.Graph()

    with graph.as_default():
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(training)

        with tf.variable_scope("mymodel"):
            # Add a fully connected layer with 100 units.
            num_units = 100

            conv1 = slim.conv2d(embeddings,
                                1024,
                                scope="conv1",
                                kernel_size=[3, 3],
                                stride=1,
                                padding='SAME')

            pool1 = slim.avg_pool2d(conv1,
                                    scope='pool1',
                                    kernel_size=[2, 2],
                                    stride=2,
                                    padding='SAME')

            pool1 = slim.flatten(pool1)

            fc1 = tf.contrib.layers.fully_connected(inputs=pool1,
                                                    num_outputs=512,
                                                    activation_fn=None,
                                                    scope="fc1")

            bn1 = tf.layers.batch_normalization(fc1, 1, name="batch_norm_1")

            fc2 = tf.contrib.layers.fully_connected(
                inputs=bn1,
                num_outputs=vggish_params.EMBEDDING_SIZE,
                activation_fn=tf.nn.relu,
                scope="fc2")

            bn2 = tf.layers.batch_normalization(fc2, 1, name="batch_norm_2")

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.

            logits = tf.contrib.layers.fully_connected(bn2,
                                                       params.NUM_CLASSES,
                                                       activation_fn=None,
                                                       scope='logits')

            prediction = tf.argmax(logits, axis=1, name='prediction')

            softmax_prediction = tf.nn.softmax(logits,
                                               axis=1,
                                               name="softmax_prediction")
            softmax_prediction = tf.nn.top_k(softmax_prediction, k=5)

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(0,
                                          name='global_step',
                                          trainable=False,
                                          collections=[
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              tf.GraphKeys.GLOBAL_STEP
                                          ])

                # Labels are assumed to be fed as a batch multi-hot vectors, with
                # a 1 in the position of each positive class label, and 0 elsewhere.
                labels = tf.placeholder(tf.float32,
                                        shape=(None, params.NUM_CLASSES),
                                        name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                               labels=labels,
                                                               name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)
                variable_summaries(loss)

                # Calculate accuracy
                # accuracy = tf.metrics.accuracy(labels=labels, predictions=logits, name="acc")

                # tf.summary.scalar('accuracy', accuracy)
                # variable_summaries(accuracy)

                # We use the same optimizer and hyperparameters as used to train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(loss,
                                   global_step=global_step,
                                   name='train_op')

            with tf.variable_scope("accuracy"):
                with tf.variable_scope("correct_prediction"):
                    correct_prediction = tf.equal(prediction,
                                                  tf.argmax(labels, 1))
                with tf.variable_scope("accuracy"):
                    accuracy = tf.reduce_mean(
                        tf.cast(correct_prediction, "float"))

            tf.summary.scalar("accuracy", accuracy)

    return graph, accuracy, softmax_prediction
def make_extract_vggish_embedding(frame_duration,
                                  hop_duration,
                                  input_op_name='vggish/input_features',
                                  output_op_name='vggish/embedding',
                                  embedding_size=128,
                                  resources_dir=None):
    """
    Creates a coroutine generator for extracting and saving VGGish embeddings

    Parameters
    ----------
    frame_duration
    hop_duration
    input_op_name
    output_op_name
    embedding_size
    resources_dir

    Returns
    -------
    coroutine

    """
    params = {
        'frame_win_sec': frame_duration,
        'frame_hop_sec': hop_duration,
        'embedding_size': embedding_size
    }

    if not resources_dir:
        resources_dir = os.path.join(os.path.dirname(__file__),
                                     'vggish/resources')

    pca_params_path = os.path.join(resources_dir, 'vggish_pca_params.npz')
    model_path = os.path.join(resources_dir, 'vggish_model.ckpt')

    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False, **params)
            vggish_slim.load_vggish_slim_checkpoint(sess, model_path, **params)

            while True:
                # We use a coroutine to more easily keep open the Tensorflow contexts
                # without having to constantly reload the model
                audio_path, output_path = (yield)

                if os.path.exists(output_path):
                    continue

                try:
                    examples_batch = vggish_input.wavfile_to_examples(
                        audio_path, **params)
                except ValueError:
                    print("Error opening {}. Skipping...".format(audio_path))
                    continue

                # Prepare a postprocessor to munge the model embeddings.
                pproc = vggish_postprocess.Postprocessor(
                    pca_params_path, **params)

                input_tensor_name = input_op_name + ':0'
                output_tensor_name = output_op_name + ':0'

                features_tensor = sess.graph.get_tensor_by_name(
                    input_tensor_name)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    output_tensor_name)

                # Run inference and postprocessing.
                [embedding_batch
                 ] = sess.run([embedding_tensor],
                              feed_dict={features_tensor: examples_batch})

                emb = pproc.postprocess(embedding_batch,
                                        **params).astype(np.float32)

                with gzip.open(output_path, 'wb') as f:
                    emb.dump(f)

    except GeneratorExit:
        pass
Beispiel #8
0
def main(_):
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on top
        # of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(fc,
                                          _NUM_CLASSES,
                                          activation_fn=None,
                                          scope='logits')
            tf.sigmoid(logits, name='prediction')

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(0,
                                          name='global_step',
                                          trainable=False,
                                          collections=[
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              tf.GraphKeys.GLOBAL_STEP
                                          ])

                # Labels are assumed to be fed as a batch multi-hot vectors, with
                # a 1 in the position of each positive class label, and 0 elsewhere.
                labels = tf.placeholder(tf.float32,
                                        shape=(None, _NUM_CLASSES),
                                        name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=logits, labels=labels, name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)

                # We use the same optimizer and hyperparameters as used to train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(loss,
                                   global_step=global_step,
                                   name='train_op')

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

        # The training loop.
        for _ in range(FLAGS.num_batches):
            (features, labels) = _get_examples_batch()
            [num_steps, loss,
             _] = sess.run([global_step_tensor, loss_tensor, train_op],
                           feed_dict={
                               features_tensor: features,
                               labels_tensor: labels
                           })
            print('Step %d: loss %g' % (num_steps, loss))
def main(_):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()