def extract_vggish_embedding(audio_data, fs):
    # Convert the input waveform into a batch of log mel spectrogram examples.
    examples_batch = vggish_input.waveform_to_examples(audio_data, fs)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(PCA_PARAMS_PATH)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, MODEL_PATH)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)

        # Each row of the returned batch corresponds to roughly a second of
        # audio (96 10ms frames) and holds the 128 bytes of the whitened,
        # quantized embedding.
        return postprocessed_batch
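# A minimal usage sketch (not part of the original): it feeds a synthetic
# 1 kHz sine wave through extract_vggish_embedding defined above, assuming
# numpy is imported as np and that PCA_PARAMS_PATH / MODEL_PATH point at the
# downloaded VGGish files.
if __name__ == '__main__':
    sample_rate = 16000
    t = np.linspace(0, 3, 3 * sample_rate)          # 3 seconds of audio
    waveform = 0.5 * np.sin(2 * np.pi * 1000 * t)   # 1 kHz tone in [-1, 1]
    embeddings = extract_vggish_embedding(waveform, sample_rate)
    # One 128-byte quantized embedding per ~0.96 s of audio.
    print(embeddings.shape)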
def __init__(self, tfrecord_file=None):
    # Prepare a postprocessor to munge the model embeddings.
    self.pproc = vggish_postprocess.Postprocessor(PCA_PARAMS)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    self.writer = tf.python_io.TFRecordWriter(
        tfrecord_file) if tfrecord_file else None

    # Build VGGish in its own graph and keep a persistent session so the
    # checkpoint only has to be loaded once.
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.Session()
        sess = self.sess

        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, VGG_CHECKPOINT)
        self.features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        self.embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
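# A hedged companion-method sketch for the same class (not in the original
# snippet): it assumes the __init__ above has run, that vggish_input is
# importable, and it reuses the persistent session so inference can be called
# repeatedly without reloading the checkpoint.
def extract(self, audio_data, fs):
    # Convert the raw waveform into VGGish input examples.
    examples_batch = vggish_input.waveform_to_examples(audio_data, fs)
    # Run the network, then whiten and quantize the raw embeddings.
    [embedding_batch] = self.sess.run(
        [self.embedding_tensor],
        feed_dict={self.features_tensor: examples_batch})
    return self.pproc.postprocess(embedding_batch)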
from __future__ import print_function

import os

import tensorflow as tf

import audioset.vggish_params as vggish_params
import audioset.vggish_slim as vggish_slim
from tensorflow.python.tools import freeze_graph
from baselines.TFS.transform_pb_to_server_model import *

print('\nTesting your install of VGGish\n')

os.environ["CUDA_VISIBLE_DEVICES"] = '1'

# Paths to downloaded VGGish files.
checkpoint_path = 'vggish_model.ckpt'
pca_params_path = 'vggish_pca_params.npz'

with tf.Graph().as_default(), tf.Session() as sess:
    # Load the VGGish model and locate its input and output tensors.
    vggish_slim.define_vggish_slim()
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Export the loaded graph in a TensorFlow Serving friendly format.
    save_server_models(sess, features_tensor, embedding_tensor)
def embedding(wav, tf_record_filename):
    try:
        print(wav)

        f = open("csvfile.csv", "a")
        f.write("\n")  # Give your csv text here. Python will convert \n to os.linesep.
        f.close()

        label_id = 0
        exist_in_csv = "no"

        # WAV filename.
        if type(wav) == str:
            wav_filename = wav.rsplit("/", 1)[-1]
        else:
            wav_filename = wav

        if FLAGS.ff:
            # With the flat-files (--ff) argument, the class label is retrieved
            # from the file name.
            print("parsing flat file(s)...")
            class_label = (re.search(r"\(([^)]+)", wav).group(1)).capitalize()
            print("CLASS LABEL: " + class_label)
        else:
            # Otherwise the class label is the name of the parent subdirectory.
            class_label = str((wav.split("/")[-2]).capitalize())
            print("CLASS LABEL: " + class_label)

        # Acquire the class label id from the labels CSV file.
        if FLAGS.labels_file:
            csv_file = csv.reader(open(FLAGS.labels_file, "rb"), delimiter=",")
            for row in csv_file:
                if class_label in row[2]:
                    print(row)
                    label_id = int(row[0])
                    exist_in_csv = "yes"
                    break

        # If the label was not found, append a new entry to the labels CSV file.
        if label_id == 0 and exist_in_csv == "no":
            print("Label is still 0. Will append new entry in labels CSV file.")
            last_row = get_last_row(FLAGS.labels_file)
            row = [int(last_row[0]) + 1, "/m/t3st/", class_label]
            with open(FLAGS.labels_file, "a") as fd:
                writer = csv.writer(fd)
                writer.writerow(row)

        # Convert the WAV file into a batch of log mel spectrogram examples.
        batch = vggish_input.wavfile_to_examples(wav)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # Prepare a record writer to store the postprocessed embeddings
        # (always written to tf_record_filename, regardless of flags).
        writer = tf.python_io.TFRecordWriter(tf_record_filename)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor], feed_dict={features_tensor: batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Write the postprocessed embeddings as a SequenceExample, in a
            # similar format as the features released in AudioSet. Each row of
            # the batch of embeddings corresponds to roughly a second of audio
            # (96 10ms frames), and the rows are written as a sequence of
            # bytes-valued features, where each feature value contains the 128
            # bytes of the whitened quantized embedding.
            if type(wav) == str and FLAGS.labels_file:
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            "video_id": tf.train.Feature(
                                bytes_list=tf.train.BytesList(
                                    value=[wav_filename.encode()])),
                            "labels": tf.train.Feature(
                                int64_list=tf.train.Int64List(value=[label_id])),
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in postprocessed_batch
                                ])
                        }))
                print(seq_example)
                if writer:
                    writer.write(seq_example.SerializeToString())
            else:
                seq_example = tf.train.SequenceExample(
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in postprocessed_batch
                                ])
                        }))
                print(seq_example)
                if writer:
                    writer.write(seq_example.SerializeToString())

        if writer:
            writer.close()

    except Exception:
        print("Error on: " + wav)
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames),
        # and the rows are written as a sequence of bytes-valued features, where
        # each feature value contains the 128 bytes of the whitened quantized
        # embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
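# A hedged sketch (not part of the original demo) of reading the embeddings
# back out of the TFRecord written above. It assumes a TF 1.x runtime, that
# FLAGS.tfrecord_file was set when main() ran, and that numpy is imported
# as np.
def read_embeddings(tfrecord_path):
    for record in tf.python_io.tf_record_iterator(tfrecord_path):
        seq_example = tf.train.SequenceExample.FromString(record)
        feature_list = seq_example.feature_lists.feature_list[
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME]
        # Each feature holds one 128-byte quantized embedding (~0.96 s of audio).
        embeddings = [np.frombuffer(f.bytes_list.value[0], dtype=np.uint8)
                      for f in feature_list.feature]
        yield np.stack(embeddings)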
def __init__(self):
    with tf.Graph().as_default():
        self.sess = tf.Session()
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(self.sess, self.CHECKPOINT_PATH)
def main(_):
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on
        # top of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(
                fc, _NUM_CLASSES, activation_fn=None, scope='logits')
            tf.sigmoid(logits, name='prediction')

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(
                    0, name='global_step', trainable=False,
                    collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                                 tf.GraphKeys.GLOBAL_STEP])

                # Labels are assumed to be fed as a batch of multi-hot vectors,
                # with a 1 in the position of each positive class label, and 0
                # elsewhere.
                labels = tf.placeholder(
                    tf.float32, shape=(None, _NUM_CLASSES), name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=logits, labels=labels, name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)

                # We use the same optimizer and hyperparameters as used to
                # train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(loss, global_step=global_step, name='train_op')

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

        # The training loop.
        for _ in range(FLAGS.num_batches):
            (features, labels) = _get_examples_batch()
            [num_steps, loss, _] = sess.run(
                [global_step_tensor, loss_tensor, train_op],
                feed_dict={features_tensor: features, labels_tensor: labels})
            print('Step %d: loss %g' % (num_steps, loss))
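# A hedged sketch of the _get_examples_batch() helper referenced in the
# training loop above (its real definition is not shown here): it fabricates a
# random batch of log mel examples with multi-hot labels purely so the loop
# has data to consume. Shapes assume the NUM_FRAMES / NUM_BANDS constants from
# vggish_params and the _NUM_CLASSES constant used above, with numpy as np.
def _get_examples_batch(batch_size=32):
    features = np.random.rand(
        batch_size, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS
    ).astype(np.float32)
    labels = (np.random.rand(batch_size, _NUM_CLASSES) > 0.5).astype(np.float32)
    return (features, labels)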
def extract_vggish_embeddings(input_filepaths, output_file,
                              xdim=XDIM, ydim=YDIM, start_index=0):
    pproc = vggish_postprocess.Postprocessor(PCA_PARAMS)

    with tf.Graph().as_default(), tf.Session() as sess, tqdm.tqdm(
            total=len(input_filepaths)) as pbar, h5py.File(output_file, 'w') as h5:

        # Create the output dataset: one row per input file, holding the file
        # identifier, the raw float embeddings and the whitened/quantized ones.
        d = h5.create_dataset(
            'features', (len(input_filepaths), ),
            dtype=[('identifier', 'S32'),
                   ('features', 'f4', (xdim, ydim)),
                   ('features_z', 'u1', (xdim, ydim))])

        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, MODEL_PARAMS)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        update_interval = int(len(input_filepaths) / 5.)

        idx = start_index
        for input_filepath in input_filepaths[start_index:]:
            input_data = load_input(input_filepath)

            # Run inference and postprocessing.
            [embedding] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: input_data})
            emb_pca = pproc.postprocess(embedding)

            identifier = os.path.split(input_filepath)[1]
            try:
                d[idx] = (identifier, embedding.astype('f4'),
                          emb_pca.astype('u1'))
            except ValueError as e:
                print(idx, e)
                if embedding.shape[0] > xdim:
                    print('Too much data. Only using first {} output frames. {}'
                          .format(xdim, identifier))
                    embedding = embedding[:xdim, :]
                    emb_pca = emb_pca[:xdim, :]
                else:
                    # Pad to size, using NaN as fill.
                    # NOTE: uint8 can't represent NaN, so you'll have to mask
                    # from the float embedding.
                    print('Too little data. Padding with nan. {}'.format(
                        identifier))
                    embedding = np.pad(embedding,
                                       ((0, xdim - embedding.shape[0]), (0, 0)),
                                       'constant', constant_values=np.nan)
                    emb_pca = np.pad(emb_pca,
                                     ((0, xdim - emb_pca.shape[0]), (0, 0)),
                                     'constant', constant_values=np.nan)
                d[idx] = (identifier, embedding.astype('f4'),
                          emb_pca.astype('u1'))

            idx += 1
            if (idx % update_interval) == 0:
                pbar.update(update_interval)
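# A hedged usage sketch (not part of the original): reading rows back out of
# the HDF5 file written by extract_vggish_embeddings above. The path argument
# is whatever was passed as output_file; the field names match the compound
# dtype declared when the dataset was created.
def read_embeddings_h5(h5_path):
    with h5py.File(h5_path, 'r') as h5:
        for row in h5['features']:
            identifier = row['identifier'].decode()  # e.g. the input filename
            raw = row['features']          # float32 embeddings, shape (xdim, ydim)
            quantized = row['features_z']  # uint8 whitened/quantized embeddings
            yield identifier, raw, quantized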
def main():
    with tf.Graph().as_default(), tf.Session() as sess:
        # -------------------
        # Step 1
        # -------------------
        # Load the model.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

        # Get all of the variables, and use this to construct a dictionary
        # which maps the name of each variable to its value.
        variables = tf.all_variables()
        variables = [x.name for x in variables]
        variable_values = sess.run(variables)
        variable_dict = dict(zip(variables, variable_values))

        # Create a new state dictionary which maps the TensorFlow version of
        # the weights to those in the new PyTorch model.
        pytorch_model = VGGish()
        pytorch_feature_dict = pytorch_model.features.state_dict()
        pytorch_fc_dict = pytorch_model.fc.state_dict()

        # -------------------
        # Step 2
        # -------------------
        # There is a bias and weight vector for each convolution layer. The
        # weights are not necessarily stored in the same format and order
        # between the two frameworks; in the TensorFlow model, the 12 vectors
        # for the convolution layers come first, followed by the 6 for the
        # fully connected layers.
        tf_feature_names = list(variable_dict.keys())[:-6]
        tf_fc_names = list(variable_dict.keys())[-6:]

        def to_pytorch_tensor(weights):
            if len(weights.shape) == 4:
                # Convolution kernels: HWIO (TensorFlow) -> OIHW (PyTorch).
                tensor = torch.from_numpy(weights.transpose(3, 2, 0, 1)).float()
            else:
                tensor = torch.from_numpy(weights.T).float()
            return tensor

        # Convert the weights for the convolution layers.
        for tf_name, pytorch_name in zip(tf_feature_names,
                                         pytorch_feature_dict.keys()):
            print(f'Converting [{tf_name}] ----------> [feature.{pytorch_name}]')
            pytorch_feature_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # Convert the weights for the FC layers.
        for tf_name, pytorch_name in zip(tf_fc_names, pytorch_fc_dict.keys()):
            print(f'Converting [{tf_name}] ----------> [fc.{pytorch_name}]')
            pytorch_fc_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # -------------------
        # Step 3
        # -------------------
        # Load the new state dictionaries into the PyTorch model.
        pytorch_model.features.load_state_dict(pytorch_feature_dict)
        pytorch_model.fc.load_state_dict(pytorch_fc_dict)

        # -------------------
        # Step 4
        # -------------------
        # Generate a sample input (as in the AudioSet repo smoke test).
        num_secs = 3
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)

        # Produce a batch of log mel spectrogram examples.
        input_batch = vggish_input.waveform_to_examples(x, sr)

        # Run inference on the TensorFlow model.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [tf_output] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})

        # Run on the PyTorch model.
        pytorch_model = pytorch_model.to('cpu')
        pytorch_output = pytorch_model(
            torch.from_numpy(input_batch).unsqueeze(dim=1).float())
        pytorch_output = pytorch_output.detach().numpy()

        # -------------------
        # Step 5
        # -------------------
        # Compare the difference between the outputs.
        diff = np.linalg.norm(pytorch_output - tf_output) ** 2
        print(f'Distance between TensorFlow and PyTorch outputs: [{diff}]')
        assert diff < 1e-6

        # Run a smoke test.
        expected_embedding_mean = 0.131
        expected_embedding_std = 0.238

        # Verify the TF output.
        np.testing.assert_allclose(
            [np.mean(tf_output), np.std(tf_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # Verify the PyTorch output.
        np.testing.assert_allclose(
            [np.mean(pytorch_output), np.std(pytorch_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # -------------------
        # Step 6
        # -------------------
        print('Smoke test passed! Saving PyTorch weights to "pytorch_vggish.pth".')
        torch.save(pytorch_model.state_dict(), 'pytorch_vggish.pth')
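# A minimal usage sketch (not part of the original script): once the
# conversion above has written pytorch_vggish.pth, the weights can be loaded
# back into the same VGGish class for inference. The VGGish class is assumed
# to be importable exactly as in the conversion code.
def load_converted_model(weights_path='pytorch_vggish.pth'):
    model = VGGish()
    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    model.eval()  # Inference mode: disables dropout, etc.
    return model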