예제 #1
0
def extract_vggish_embedding(audio_data, fs):
    """Return postprocessed VGGish embeddings for an in-memory waveform.

    Args:
        audio_data: Waveform handed to ``vggish_input.waveform_to_examples``.
        fs: Second argument of ``waveform_to_examples`` (presumably the
            sample rate of ``audio_data`` -- confirm against vggish_input).

    Returns:
        The output of ``Postprocessor.postprocess`` applied to the embeddings
        the model produces for the whole batch of examples.
    """
    model_inputs = vggish_input.waveform_to_examples(audio_data, fs)

    # Postprocessor configured from the PCA parameters at PCA_PARAMS_PATH.
    postprocessor = vggish_postprocess.Postprocessor(PCA_PARAMS_PATH)

    # A fresh graph and session are built on every call, so the checkpoint at
    # MODEL_PATH is reloaded each time this function runs.
    with tf.Graph().as_default(), tf.Session() as session:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(session, MODEL_PATH)

        graph = session.graph
        input_node = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        output_node = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        raw_embeddings = session.run(
            [output_node], feed_dict={input_node: model_inputs})[0]
        result = postprocessor.postprocess(raw_embeddings)

    return result
    def __init__(self, tfrecord_file=None):
        """Build the VGGish inference graph and open a session bound to it.

        The model is defined inside this instance's own ``tf.Graph`` and the
        session is created on that graph. Previously ``self.graph`` was
        created but never used: the session and the model ended up in the
        process-wide default graph, so ``self.graph`` was empty and a second
        instance would collide with the first one's variables.

        Args:
            tfrecord_file: Optional path. When truthy, a ``TFRecordWriter`` for
                it is opened and stored in ``self.writer``; otherwise
                ``self.writer`` is ``None``.
        """
        # Postprocessor configured from the PCA parameters at PCA_PARAMS.
        self.pproc = vggish_postprocess.Postprocessor(PCA_PARAMS)

        # Record writer is only opened when a destination was given.
        self.writer = (tf.python_io.TFRecordWriter(tfrecord_file)
                       if tfrecord_file else None)

        self.graph = tf.Graph()
        # Both model definition and checkpoint loading must run with
        # self.graph as the default graph so the variables land in it.
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            sess = self.sess

            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, VGG_CHECKPOINT)
            self.features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            self.embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
from __future__ import print_function

import tensorflow as tf
import ausioset.vggish_params as vggish_params
import audioset.vggish_slim as vggish_slim

from tensorflow.python.tools import freeze_graph
from baselines.TFS.transform_pb_to_server_model import *

print('\nTesting your install of VGGish\n')
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

# Paths to downloaded VGGish files.
checkpoint_path = 'vggish_model.ckpt'
pca_params_path = 'vggish_pca_params.npz'

with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim()
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    save_server_models(sess, features_tensor, embedding_tensor)
예제 #4
0
def _class_label_from_path(wav):
    """Return the capitalized class label encoded in the path string *wav*."""
    if FLAGS.ff:
        # Flat-file mode (--ff): the label is the text following the first
        # "(" in the path, up to the closing ")".
        print("parsing flat file(s)...")
        match = re.search(r"\(([^)]+)", wav)
        if match is None:
            raise ValueError(
                "no '(label)' part found in file name: {}".format(wav))
        class_label = match.group(1).capitalize()
    else:
        # Otherwise the label is the name of the directory holding the file.
        class_label = str(wav.split("/")[-2].capitalize())
    print("CLASS LABEL: " + class_label)
    return class_label


def _resolve_label_id(labels_file, class_label):
    """Return the id of *class_label*, registering it in the CSV if missing."""
    # Text mode with newline="" is what the csv module expects on Python 3.
    with open(labels_file, newline="") as fd:
        for row in csv.reader(fd, delimiter=","):
            # Rows with fewer than three columns (e.g. blank lines) are skipped.
            if len(row) > 2 and class_label in row[2]:
                print(row)
                return int(row[0])

    # Label is not in the file yet: append it with the next free id.
    print("Label is still 0. Will append new entry in labels CSV file.")
    last_row = get_last_row(labels_file)
    new_id = int(last_row[0]) + 1
    with open(labels_file, "a", newline="") as fd:
        csv.writer(fd).writerow([new_id, "/m/t3st/", class_label])
    return new_id


def _sequence_example(postprocessed_batch, context=None):
    """Pack the embedding rows into a SequenceExample, optionally with context."""
    # Each row of the batch becomes one bytes-valued feature in the list.
    feature_lists = tf.train.FeatureLists(
        feature_list={
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: tf.train.FeatureList(
                feature=[
                    tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[row.tobytes()])
                    )
                    for row in postprocessed_batch
                ]
            )
        }
    )
    kwargs = {"feature_lists": feature_lists}
    if context is not None:
        kwargs["context"] = context
    return tf.train.SequenceExample(**kwargs)


def embedding(wav, tf_record_filename):
    """Run VGGish on one WAV input and write the result as a TFRecord.

    Best-effort: any exception is reported on stdout and swallowed so a batch
    run can continue with the next file.

    Changes relative to the previous implementation:
      * The labels CSV is read in text mode and closed after use.
      * A class that had to be appended to the labels CSV is written with its
        newly assigned id instead of 0.
      * A non-str ``wav`` (e.g. a file object) skips label handling instead of
        failing in the path-parsing code.
      * The record writer is always closed, and the error report includes the
        exception and no longer fails for a non-str ``wav``.

    Args:
        wav: Path of the WAV file (str), or another object accepted by
            ``vggish_input.wavfile_to_examples``.
        tf_record_filename: Path of the TFRecord file to create.

    Returns:
        None.
    """
    writer = None
    try:
        print(wav)
        # Appends a bare newline to csvfile.csv on every call (kept as-is).
        with open("csvfile.csv", "a") as f:
            f.write("\n")

        is_path = isinstance(wav, str)
        wav_filename = wav.rsplit("/", 1)[-1] if is_path else wav

        label_id = 0
        if is_path:
            class_label = _class_label_from_path(wav)
            if FLAGS.labels_file:
                label_id = _resolve_label_id(FLAGS.labels_file, class_label)

        batch = vggish_input.wavfile_to_examples(wav)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # Both branches of the former FLAGS.tfrecord_file check opened the same
        # writer, so it is created unconditionally.
        writer = tf.python_io.TFRecordWriter(tf_record_filename)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME
            )
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME
            )

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor], feed_dict={features_tensor: batch}
            )
            postprocessed_batch = pproc.postprocess(embedding_batch)

        # File name and label id are only attached as context when the input
        # was a path and a labels file was configured.
        context = None
        if is_path and FLAGS.labels_file:
            context = tf.train.Features(
                feature={
                    "video_id": tf.train.Feature(
                        bytes_list=tf.train.BytesList(
                            value=[wav_filename.encode()]
                        )
                    ),
                    "labels": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[label_id])
                    ),
                }
            )

        seq_example = _sequence_example(postprocessed_batch, context)
        print(seq_example)
        writer.write(seq_example.SerializeToString())
    except Exception as err:
        # Deliberate best-effort boundary: report and move on.
        print("Error on: {} ({!r})".format(wav, err))
    finally:
        if writer is not None:
            writer.close()
def main(_):
    """Run one audio clip through VGGish and optionally store the embeddings.

    Uses ``FLAGS.wav_file`` when set; otherwise a 5-second 1 kHz sine wave is
    synthesized into an in-memory WAV. The input examples, the raw embeddings,
    the postprocessed embeddings and the resulting SequenceExample are
    printed. When ``FLAGS.tfrecord_file`` is set, the SequenceExample is also
    written to that file.
    """
    wav_file = FLAGS.wav_file
    if not wav_file:
        # No input given: synthesize a sine tone into an in-memory WAV file.
        duration_s, tone_hz, sample_rate = 5, 1000, 44100
        timeline = np.linspace(0, duration_s, int(duration_s * sample_rate))
        tone = np.sin(2 * np.pi * tone_hz * timeline)
        # Scale to the signed 16-bit range, clipping the positive peak.
        pcm16 = np.clip(tone * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sample_rate, pcm16)
        wav_file.seek(0)

    model_inputs = vggish_input.wavfile_to_examples(wav_file)
    print(model_inputs)

    postprocessor = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # The record writer only exists when an output file was requested.
    writer = None
    if FLAGS.tfrecord_file:
        writer = tf.python_io.TFRecordWriter(FLAGS.tfrecord_file)

    with tf.Graph().as_default(), tf.Session() as session:
        # Inference-mode model, restored from the checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(session, FLAGS.checkpoint)

        graph = session.graph
        input_node = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        output_node = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        raw_embeddings = session.run(
            [output_node], feed_dict={input_node: model_inputs})[0]
        print(raw_embeddings)
        final_embeddings = postprocessor.postprocess(raw_embeddings)
        print(final_embeddings)

        # One bytes-valued feature per row of the postprocessed batch, stored
        # under the audio-embedding feature name.
        per_row_features = [
            tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[row.tobytes()]))
            for row in final_embeddings
        ]
        feature_list = tf.train.FeatureList(feature=per_row_features)
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: feature_list,
            }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
예제 #6
0
 def __init__(self):
     """Create a session on a fresh graph and load the VGGish checkpoint.

     The session is stored in ``self.sess``; the weights are restored from
     ``self.CHECKPOINT_PATH``.
     """
     model_graph = tf.Graph()
     # The session is created while model_graph is the default graph, so it
     # is bound to that graph and stays usable after the block exits.
     with model_graph.as_default():
         self.sess = tf.Session()
         vggish_slim.define_vggish_slim()
         vggish_slim.load_vggish_slim_checkpoint(
             self.sess, self.CHECKPOINT_PATH)
def main(_):
  """Train a small classifier head on top of VGGish for a few batches.

  Builds VGGish (trainable according to ``FLAGS.train_vggish``), adds a
  100-unit fully connected layer and a per-class logistic output layer,
  restores the VGGish weights from ``FLAGS.checkpoint`` and runs
  ``FLAGS.num_batches`` training steps, printing the loss after each one.
  """
  with tf.Graph().as_default(), tf.Session() as sess:
    vggish_embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

    # Everything added on top of VGGish lives under the 'mymodel' scope.
    with tf.variable_scope('mymodel'):
      hidden_units = 100
      hidden = slim.fully_connected(vggish_embeddings, hidden_units)

      # One independent logistic classifier per class; the sigmoid op is only
      # created so it can be looked up by name as 'prediction'.
      class_logits = slim.fully_connected(
          hidden, _NUM_CLASSES, activation_fn=None, scope='logits')
      tf.sigmoid(class_logits, name='prediction')

      with tf.variable_scope('train'):
        step_var = tf.Variable(
            0,
            name='global_step',
            trainable=False,
            collections=[
                tf.GraphKeys.GLOBAL_VARIABLES,
                tf.GraphKeys.GLOBAL_STEP,
            ])

        # Placeholder for a float batch with one column per class.
        label_input = tf.placeholder(
            tf.float32, shape=(None, _NUM_CLASSES), name='labels')

        # Mean sigmoid cross-entropy over the batch.
        per_label_xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=class_logits, labels=label_input, name='xent')
        mean_loss = tf.reduce_mean(per_label_xent, name='loss_op')
        tf.summary.scalar('loss', mean_loss)

        # Adam configured from the constants in vggish_params.
        adam = tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON)
        adam.minimize(mean_loss, global_step=step_var, name='train_op')

    # Initialize every variable first, then overwrite the VGGish ones with
    # the pre-trained checkpoint values.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

    # Fetch the graph elements needed by the loop via their names.
    graph = sess.graph
    features_tensor = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = graph.get_tensor_by_name('mymodel/train/labels:0')
    global_step_tensor = graph.get_tensor_by_name(
        'mymodel/train/global_step:0')
    loss_tensor = graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = graph.get_operation_by_name('mymodel/train/train_op')

    fetches = [global_step_tensor, loss_tensor, train_op]
    for _ in range(FLAGS.num_batches):
      batch_features, batch_labels = _get_examples_batch()
      step_count, batch_loss, _ = sess.run(
          fetches,
          feed_dict={
              features_tensor: batch_features,
              labels_tensor: batch_labels,
          })
      print('Step %d: loss %g' % (step_count, batch_loss))
def extract_vggish_embeddings(input_filepaths,
                              output_file,
                              xdim=XDIM,
                              ydim=YDIM,
                              start_index=0):
    """Compute VGGish embeddings for many inputs and store them in HDF5.

    A dataset named ``'features'`` with one record per input path is created
    in ``output_file``. Each record holds the file's base name
    (``identifier``), the raw embedding as float32 (``features``) and the
    postprocessed embedding as uint8 (``features_z``), both of shape
    ``(xdim, ydim)``. Embeddings with more than ``xdim`` rows are truncated;
    shorter ones are padded (NaN for the float data, 0 for the uint8 data).

    Fixes relative to the previous implementation:
      * The progress interval is clamped to at least 1; fewer than five input
        files used to give an interval of 0 and a ``ZeroDivisionError``.
      * The uint8 array is padded with 0 instead of NaN, which uint8 cannot
        represent.

    Args:
        input_filepaths: Sequence of paths passed one by one to ``load_input``.
        output_file: Path of the HDF5 file; opened with mode ``'w'``.
        xdim: Number of rows of each stored embedding.
        ydim: Number of columns of each stored embedding.
        start_index: Index of the first path to process.
            NOTE(review): the file is opened with ``'w'``, so records before
            ``start_index`` are not carried over from an earlier run --
            confirm this is intended.

    Returns:
        None.
    """
    pproc = vggish_postprocess.Postprocessor(PCA_PARAMS)

    with tf.Graph().as_default(), tf.Session() as sess, tqdm.tqdm(
            total=len(input_filepaths)) as pbar, h5py.File(output_file,
                                                           'w') as h5:
        # One compound record per input file.
        d = h5.create_dataset('features', (len(input_filepaths), ),
                              dtype=[('identifier', 'S32'),
                                     ('features', 'f4', (xdim, ydim)),
                                     ('features_z', 'u1', (xdim, ydim))])

        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, MODEL_PARAMS)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Refresh the progress bar roughly five times per run. Clamped to 1 so
        # the modulo below is defined for fewer than five inputs.
        update_interval = max(1, len(input_filepaths) // 5)
        idx = start_index
        for input_filepath in input_filepaths[start_index:]:
            input_data = load_input(input_filepath)

            [embedding] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: input_data})

            emb_pca = pproc.postprocess(embedding)

            identifier = os.path.split(input_filepath)[1]
            try:
                d[idx] = (identifier, embedding.astype('f4'),
                          emb_pca.astype('u1'))
            except ValueError as e:
                # The first attempt failed; resize to xdim rows and retry once.
                print(idx, e)
                if embedding.shape[0] > xdim:
                    print(
                        'Too much data. Only using first {} output frames. {}'.
                        format(xdim, identifier))
                    embedding = embedding[:xdim, :]
                    emb_pca = emb_pca[:xdim, :]
                else:
                    # Pad to size. The float embedding is filled with NaN;
                    # uint8 can't represent NaN, so the quantized copy is
                    # zero-filled and must be masked using the float data.
                    print('Too little data. Padding with nan. {}'.format(
                        identifier))
                    embedding = np.pad(embedding,
                                       ((0, xdim - embedding.shape[0]),
                                        (0, 0)),
                                       'constant',
                                       constant_values=np.nan)
                    emb_pca = np.pad(emb_pca,
                                     ((0, xdim - emb_pca.shape[0]), (0, 0)),
                                     'constant',
                                     constant_values=0)

                d[idx] = (identifier, embedding.astype('f4'),
                          emb_pca.astype('u1'))

            idx += 1
            if (idx % update_interval) == 0:
                pbar.update(update_interval)
def main():
    """Port the TensorFlow VGGish checkpoint to PyTorch and verify the result.

    Loads ``vggish_model.ckpt`` into a TensorFlow graph, copies every variable
    into a freshly constructed ``VGGish`` PyTorch model, runs both models on
    the same synthetic sine-wave input, checks that the outputs agree and
    match the expected mean/std, and saves the PyTorch weights to
    ``pytorch_vggish.pth``.

    Changes relative to the previous implementation:
      * ``tf.global_variables()`` replaces the deprecated
        ``tf.all_variables()``.
      * The conv/FC split is taken from the ordered variable-name list rather
        than from dict key order.
      * The output-distance check raises explicitly, so it is not stripped
        when Python runs with ``-O``.

    Raises:
        AssertionError: If the two models disagree or the smoke test fails.
    """
    with tf.Graph().as_default(), tf.Session() as sess:
        # -------------------
        # Step 1
        # -------------------
        # Load the model.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

        # Fetch every variable's value (sess.run accepts the tensor names)
        # and map variable name -> value.
        variable_names = [v.name for v in tf.global_variables()]
        variable_values = sess.run(variable_names)
        variable_dict = dict(zip(variable_names, variable_values))

        # State dictionaries of the new PyTorch model that will receive the
        # TensorFlow weights.
        pytorch_model = VGGish()
        pytorch_feature_dict = pytorch_model.features.state_dict()
        pytorch_fc_dict = pytorch_model.fc.state_dict()

        # -------------------
        # Step 2
        # -------------------
        # The last six variables are treated as the FC parameters and
        # everything before them as the convolution parameters. The pairing
        # with the PyTorch keys below is purely positional.
        tf_feature_names = variable_names[:-6]
        tf_fc_names = variable_names[-6:]

        def to_pytorch_tensor(weights):
            """Reorder axes to PyTorch layout and convert to a float tensor."""
            if len(weights.shape) == 4:
                # 4-D kernels: move the last two axes to the front.
                tensor = torch.from_numpy(weights.transpose(3, 2, 0,
                                                            1)).float()
            else:
                tensor = torch.from_numpy(weights.T).float()
            return tensor

        # Convert the weights for the convolution layers.
        for tf_name, pytorch_name in zip(tf_feature_names,
                                         pytorch_feature_dict.keys()):
            print(
                f'Converting [{tf_name}] ---------->  [feature.{pytorch_name}]'
            )
            pytorch_feature_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # Convert the weights for the FC layers.
        for tf_name, pytorch_name in zip(tf_fc_names, pytorch_fc_dict.keys()):
            print(f'Converting [{tf_name}] ---------->  [fc.{pytorch_name}]')
            pytorch_fc_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # -------------------
        # Step 3
        # -------------------
        # Load the new state dictionaries into the PyTorch model.
        pytorch_model.features.load_state_dict(pytorch_feature_dict)
        pytorch_model.fc.load_state_dict(pytorch_fc_dict)

        # -------------------
        # Step 4
        # -------------------
        # Generate a sample input (as in the AudioSet repo smoke test).
        num_secs = 3
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)

        # Produce a batch of log mel spectrogram examples.
        input_batch = vggish_input.waveform_to_examples(x, sr)

        # Run inference on the TensorFlow model.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [tf_output] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})

        # Run on the PyTorch model; an extra axis is inserted at dim 1 of the
        # input batch before the forward pass.
        pytorch_model = pytorch_model.to('cpu')
        pytorch_output = pytorch_model(
            torch.from_numpy(input_batch).unsqueeze(dim=1).float())
        pytorch_output = pytorch_output.detach().numpy()

        # -------------------
        # Step 5
        # -------------------
        # Compare the outputs via the squared norm of their difference.
        diff = np.linalg.norm(pytorch_output - tf_output)**2
        print(f'Distance between TensorFlow and PyTorch outputs: [{diff}]')
        # Written as "not <" so that a NaN distance also fails the check.
        if not diff < 1e-6:
            raise AssertionError(
                f'TensorFlow and PyTorch outputs differ: [{diff}]')

        # Run a smoke test.
        expected_embedding_mean = 0.131
        expected_embedding_std = 0.238

        # Verify the TF output.
        np.testing.assert_allclose(
            [np.mean(tf_output), np.std(tf_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # Verify the PyTorch output.
        np.testing.assert_allclose(
            [np.mean(pytorch_output),
             np.std(pytorch_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # -------------------
        # Step 6
        # -------------------
        print(
            'Smoke test passed! Saving PyTorch weights to "pytorch_vggish.pth".'
        )
        torch.save(pytorch_model.state_dict(), 'pytorch_vggish.pth')