def main(_):
    with open(FLAGS.wav_files) as f:
        files_list = [line.replace('\n', '') for line in f]
    n_files = len(files_list)
    output_embedding = np.zeros((n_files, 128))
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    processed_fnames = []
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        for n_file, wav_file in enumerate(files_list):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            print(n_file, '/', n_files)
            if examples_batch.shape[0] == 0:
                # Log files too short to yield a single example.
                with open('bad_files.log', 'a') as logf:
                    logf.write(wav_file + '\n')
            else:
                processed_fnames.append(wav_file)
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)
                # Average the per-second embeddings into one 128-D vector.
                postprocessed_batch_mean = np.mean(postprocessed_batch, axis=0)
                output_embedding[n_file, :] = postprocessed_batch_mean
    np.save(FLAGS.npy_file, output_embedding)
def extract_vggish_features(wav_path):
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(wav_path)
    if input_batch.shape[0] < 1:
        print('{}: Audio sample shorter than 1 second. Ignoring ...'.format(
            os.path.basename(wav_path)))
        return None

    # Define VGGish, load the checkpoint, and run the batch through the
    # model to produce embeddings.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: input_batch})

    # Postprocess the results to produce whitened quantized embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch
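# A minimal usage sketch for extract_vggish_features above. It assumes the
# module-level names the function relies on (checkpoint_path, pca_params_path,
# and the vggish_* modules) are defined; 'example.wav' is a placeholder path,
# not from the original code.
features = extract_vggish_features('example.wav')
if features is not None:
    # One 128-D uint8 row per ~0.96 s example of audio.
    print(features.shape)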
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model.
    wav_file = FLAGS.wav_file
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        np.save("/postprocessed_batch.npy", postprocessed_batch)
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file,
                      train_dir, output_file):
    print("Input file: " + input_wav_file)
    if os.path.isfile(input_wav_file):
        examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
        pproc = vggish_postprocess.Postprocessor(pca_params)
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Pack the per-second embeddings into a fixed-size (1, 300, 128)
            # batch, dequantized back to floats, for the classifier.
            num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                            dtype=np.int32)
            video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
            video_batch_val[0, 0:postprocessed_batch.shape[0], :] = \
                utils.Dequantize(postprocessed_batch.astype(float), 2, -2)
            predicted_class = inference(video_batch_val, num_frames_batch_val,
                                        checkpoint_file, train_dir,
                                        output_file)
        # Reset the graph before returning (this call was unreachable after
        # the return in the original).
        tf.reset_default_graph()
        return predicted_class
def main(wav_file):
    """Extract embeddings for a downloaded or recorded audio file and run
    the classifier; the pickle path is derived from the wav path."""
    if wav_file:
        pkl = wav_file[:-4] + '.pkl'
        print(pkl)
        examples_batch = vggish_input.wavfile_to_examples(wav_file)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)
            predict_prob, predictions = model_function.predictions_wavfile(
                postprocessed_batch)
            K.clear_session()
            return predict_prob, predictions
def input_sound():
    # Compute VGGish features for every .wav file in DIRNAME and stack them
    # into one feature matrix.
    files = get_file_paths(DIRNAME)  # get the file paths
    File_names = []
    full_feature_vector = np.empty([0, 128])
    for file in sorted(files):  # loop to access each file
        (filepath, ext) = os.path.splitext(file)  # get extension of the file
        file_name = os.path.basename(file)  # get the file name
        if ext == '.wav':
            File_names.append(file_name)
            y, sr = librosa.load(file, sr=None)
            print(sr)
            examples_batch = waveform_to_examples(y, sr)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint,
                # and locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)
                print(np.shape(postprocessed_batch))
                full_feature_vector = np.concatenate(
                    (full_feature_vector, postprocessed_batch), axis=0)
    print(np.shape(full_feature_vector))
    return full_feature_vector
def main(_):
    opt = parse_opt()
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        keys = ['train', 'val', 'test']
        values = [opt.train_range, opt.val_range, opt.test_range]
        # Initialize the writer so the final close is safe even if no wav
        # file was found (it was referenced before assignment otherwise).
        writer = None
        for i in range(3):
            h5_path = (opt.feat_h5 + '2016' + '_' + keys[i] + '_' +
                       opt.type + '.h5')
            if os.path.exists(h5_path):
                os.remove(h5_path)
            h5 = h5py.File(h5_path, 'w')
            dataset_feats = h5.create_dataset(
                'feats', (values[i][1] - values[i][0] + 1, opt.feat_size),
                dtype='float32')
            for audio_id in range(values[i][0], values[i][1] + 1):
                wav_file = (opt.video_root + 'video' + str(audio_id) +
                            '.mp4.wav')
                if os.path.isfile(wav_file):
                    examples_batch = vggish_input.wavfile_to_examples(wav_file)
                    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
                    writer = (tf.python_io.TFRecordWriter(FLAGS.tfrecord_file)
                              if FLAGS.tfrecord_file else None)
                    [embedding_batch] = sess.run(
                        [embedding_tensor],
                        feed_dict={features_tensor: examples_batch})
                    # Mean-pool the per-second embeddings into one vector.
                    embedding_batch = embedding_batch.mean(0)
                    dataset_feats[audio_id - values[i][0]] = embedding_batch
            h5.close()
        if writer:
            writer.close()
def extract_and_predict(wav):
    wav_file = wav
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor()

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        # One embedding row per ~second of audio.
        postprocessed_batch = list(postprocessed_batch)
        pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
        return str(pred_each_n_seconds)
def extract(wav_file):
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    pproc = vggish_postprocess.Postprocessor(
        '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_pca_params.npz'
    )
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess,
            '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_model.ckpt'
        )
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        return postprocessed_batch
def embed(wavform_slice, rate):
    norm_wavform_slice = preprocessing.normalize(wavform_slice)
    examples_batch = vggish_input.waveform_to_examples(norm_wavform_slice,
                                                       rate)
    print('examples_batch len: ' + str(len(examples_batch)))

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print('postprocessed_batch: ')
        print(postprocessed_batch)
        print(postprocessed_batch.shape)
        return postprocessed_batch
def main(_):
    # Run the examples from a single audio file through the model and write
    # the postprocessed embeddings as a tfrecord.
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)
    # Overlap consecutive examples by args.overlap of a window.
    vggish_params.EXAMPLE_HOP_SECONDS = (
        1 - args.overlap) * vggish_params.EXAMPLE_WINDOW_SECONDS

    with tf.Graph().as_default(), tf.Session(config=config) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        movie_id = args.wav_file[args.wav_file.rfind('/') + 1:
                                 args.wav_file.rfind('.')]
        examples_batch = vggish_input.wavfile_to_examples(args.wav_file)
        # Split the examples into at most 100 chunks of at least 10 examples
        # each to bound the memory used by a single sess.run call.
        num_splits = min(int(examples_batch.shape[0] / 10), 100)
        num_splits = max(1, num_splits)
        examples_batch = np.array_split(examples_batch, num_splits)
        embedding_batch = []
        for i in range(num_splits):
            [batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch[i]})
            embedding_batch.extend(batch)
        postprocessed_batch = pproc.postprocess(np.array(embedding_batch))

        # Write the postprocessed embeddings as a SequenceExample, in a
        # similar format as the features released in AudioSet. Each row of
        # the batch of embeddings corresponds to roughly a second of audio
        # (96 10ms frames), and the rows are written as a sequence of
        # bytes-valued features, where each feature value contains the 128
        # bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'movie_id': tf.train.Feature(
                        bytes_list=tf.train.BytesList(
                            # BytesList needs bytes, not str, in Python 3.
                            value=[movie_id.encode()]))
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        writer = tf.python_io.TFRecordWriter(
            os.path.join(args.write_dir, movie_id + '.tfrecord'))
        writer.write(seq_example.SerializeToString())
        writer.close()
def extract_audioset_embedding():
    """Extract a log mel spectrogram and the corresponding VGGish embedding
    for a single audio file."""
    # Arguments & parameters
    mel_bins = vggish_params.NUM_BANDS
    sample_rate = vggish_params.SAMPLE_RATE
    input_len = vggish_params.NUM_FRAMES
    embedding_size = vggish_params.EMBEDDING_SIZE

    # You may modify EXAMPLE_HOP_SECONDS in vggish_params.py to change the
    # hop size.

    # Paths
    audio_path = 'appendixes/01.wav'
    checkpoint_path = os.path.join('vggish_model.ckpt')
    pca_params_path = os.path.join('vggish_pca_params.npz')

    if not os.path.isfile(checkpoint_path):
        raise Exception(
            'Please download vggish_model.ckpt from '
            'https://storage.googleapis.com/audioset/vggish_model.ckpt '
            'and put it in the root of this codebase.')
    if not os.path.isfile(pca_params_path):
        raise Exception(
            'Please download vggish_pca_params.npz from '
            'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
            'and put it in the root of this codebase.')

    # Load model
    sess = tf.Session()
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    pproc = vggish_postprocess.Postprocessor(pca_params_path)

    # Read audio
    (audio, _) = read_audio(audio_path, target_fs=sample_rate)

    # Extract log mel feature
    logmel = vggish_input.waveform_to_examples(audio, sample_rate)

    # Extract embedding feature
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: logmel})

    # PCA
    postprocessed_batch = pproc.postprocess(embedding_batch)

    print('Audio length: {}'.format(len(audio)))
    print('Log mel shape: {}'.format(logmel.shape))
    print('Embedding feature shape: {}'.format(postprocessed_batch.shape))
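# Hedged illustration of the hop-size note in the docstring above: setting
# EXAMPLE_HOP_SECONDS below EXAMPLE_WINDOW_SECONDS (0.96 s) yields
# overlapping examples. The 50% overlap value here is illustrative, not from
# the original code.
vggish_params.EXAMPLE_HOP_SECONDS = 0.5 * vggish_params.EXAMPLE_WINDOW_SECONDS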
def readDirectory(dirname, label):
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
    for wav_file in glob.glob(dirname + "*.wav"):
        print(wav_file)
        try:
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
        except Exception:
            continue
        writer = tf.python_io.TFRecordWriter(wav_file[:-3] + "tfrecord")
        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            try:
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
            except Exception:
                continue
            postprocessed_batch = pproc.postprocess(embedding_batch)
            # Emit one SequenceExample per 10 embeddings (integer division,
            # so range() receives an int under Python 3).
            nBatches = len(postprocessed_batch)
            if nBatches < 10:
                nBatches = 1
            else:
                nBatches = nBatches // 10
            for i in range(nBatches):
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            "labels": tf.train.Feature(
                                int64_list=tf.train.Int64List(value=[label]))
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in
                                    postprocessed_batch[i * 10:i * 10 + 10]
                                ])
                        }))
                if writer:
                    writer.write(seq_example.SerializeToString())
        if writer:
            writer.close()
def main(_):
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        return "No wav file"
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed
    # embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a
        # similar format as the features released in AudioSet. Each row of
        # the batch of embeddings corresponds to roughly a second of audio
        # (96 10ms frames), and the rows are written as a sequence of
        # bytes-valued features, where each feature value contains the 128
        # bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())
    if writer:
        writer.close()
def main(wav_file, npz_path):
    # Run the examples from a single audio file through the model. If no
    # file is provided, generate a synthetic sine-wave input (the original
    # gated this on a dead `if 1:`).
    if not wav_file:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed
    # embeddings.
    writer = tf.python_io.TFRecordWriter(tfrecord_file)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        if 0 in embedding_batch.shape:
            print('NO')
            return 0
        np.savez_compressed(npz_path, postprocessed_batch)
        return 1
def main(_):
    ontology_lookup = {}
    with open(ONTROLOGY, 'r') as f:
        label_json = json.load(f)
    for entry in label_json:
        label_id = entry['id'].replace('/', '_')
        assert label_id not in ontology_lookup.keys()
        ontology_lookup[label_id] = entry
    wav_paths = glob.glob(os.path.join(AUDIO_CHUNKS, '*', '*.wav'))

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    audio_tsv = []
    label_tsv = []
    emb_tsv = []
    for wavfile in tqdm(wav_paths):
        label = Path(Path(wavfile).parent).stem
        filename = Path(wavfile).name
        examples_batch = vggish_input.wavfile_to_examples(wavfile)
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            # Mean-pool the per-second embeddings into one vector per clip.
            emb = np.mean(embedding_batch, axis=0).tolist()
            label_tsv.append([ontology_lookup[label]['name']])
            audio_tsv.append([f'{label}/{filename}'])
            emb_tsv.append(emb)
            assert len(emb_tsv[0]) == len(emb)

    with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
        for emb in emb_tsv:
            csv.writer(f, delimiter='\t').writerow(emb)
    with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
        for label in label_tsv:
            csv.writer(f, delimiter='\t').writerow(label)
    with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
        for audio_path in audio_tsv:
            csv.writer(f, delimiter='\t').writerow(audio_path)
def post_init(self):
    self.to_device()
    import tensorflow as tf
    tf.compat.v1.disable_eager_execution()
    self.sess = tf.compat.v1.Session()
    vggish_slim.define_vggish_slim()
    vggish_slim.load_vggish_slim_checkpoint(self.sess, self.model_path)
    self.feature_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    self.embedding_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    self.post_processor = vggish_postprocess.Postprocessor(self.pca_path)
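# A companion sketch (not part of the original class): how the session and
# tensors prepared in post_init above could be used to encode a batch of
# log-mel examples. The method name 'encode_examples' is hypothetical.
def encode_examples(self, examples_batch):
    [embedding_batch] = self.sess.run(
        [self.embedding_tensor],
        feed_dict={self.feature_tensor: examples_batch})
    return self.post_processor.postprocess(embedding_batch)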
def _build_model(self):
    # Restore the VGGish model trained on the YouTube8M dataset and retrieve
    # PCA embeddings of the bottleneck features.
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(self.sess, model_checkpoint)
    self.features_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    self.embedding_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Prepare a postprocessor to munge the model embeddings.
    self.pproc = vggish_postprocess.Postprocessor(pca_params)
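# A hedged sketch of an inference method pairing with _build_model above;
# the method name 'extract' and the use of vggish_input are assumptions, not
# the original class's API.
def extract(self, wav_file):
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    [embedding_batch] = self.sess.run(
        [self.embedding_tensor],
        feed_dict={self.features_tensor: examples_batch})
    return self.pproc.postprocess(embedding_batch)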
def OutputAudioEmbeddings(pathIn, row):
    video_id = row['video_id']
    video_path = row['video_path']
    split = row['split']
    full_path = os.path.join(pathIn, video_path)
    # Output file of the downloader path.
    full_path = full_path.replace("%(ext)s", "wav")
    if split == 'train':
        full_path_cut = full_path.replace("train", "train/cut")
    elif split == 'test':
        full_path_cut = full_path.replace("test", "test/cut")

    # Run the examples from a single audio file through the model.
    if os.path.isfile(full_path_cut):
        wav_file = full_path_cut
        examples_batch = vggish_input.wavfile_to_examples(wav_file)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # If needed, prepare a record writer to store the postprocessed
        # embeddings.
        writer = tf.python_io.TFRecordWriter(
            FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)
            np.save(
                '/lfs01/workdirs/shams010/shams010u1/code/audio_features/' +
                split + '/' + video_id, postprocessed_batch)
def processList(filelist):
    # For each file in the list, load audio (padding to at least one second),
    # extract VGGish embeddings, and write them to a text file.
    out_file_list = []
    input_list = open(filelist)
    input_lists = input_list.readlines()
    for i, infl in tqdm(enumerate(input_lists)):
        infl_new = audio_root + '/' + infl.replace("\ ", " ")
        y, sr = librosa.load(infl_new.strip(), sr=None)
        if len(y) < sr:
            # Pad clips shorter than one second by wrapping.
            y = np.pad(y, (0, sr - len(y)), 'wrap')

        # Produce a batch of log mel spectrogram examples.
        input_batch = vggish_input.waveform_to_examples(y, sr)
        print('Log Mel Spectrogram example: ', input_batch[0])

        # Define VGGish, load the checkpoint, and run the batch through the
        # model to produce embeddings.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:
            vggish_slim.define_vggish_slim()
            vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: input_batch})
            print('VGGish embedding: done ', i)

            # Postprocess the results to produce whitened quantized
            # embeddings.
            pproc = vggish_postprocess.Postprocessor(pca_params_path)
            postprocessed_batch = pproc.postprocess(embedding_batch)

        infl_list = infl_new.strip().split("/")
        file_name = infl_list[-1].strip()
        out_dir = output_root + "/" + infl_list[-2]
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        featfile = str(out_dir) + '/' + str(file_name) + '.txt'
        out_file_list.append(featfile.strip())
        np.savetxt(featfile, postprocessed_batch.astype(int), fmt='%i',
                   delimiter=",")
    create_file(output_file, out_file_list)
def main(_):
    # We run the examples from audio files from the input path through the
    # model. If none is provided, we generate a synthetic input.
    if FLAGS.input_path:
        wav_files = os.listdir(FLAGS.input_path)
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        # The original indexed into an undefined list here; initialize it.
        wav_files = [six.BytesIO()]
        wavfile.write(wav_files[0], sr, samples)
        wav_files[0].seek(0)
    examples_batch = [
        vggish_input.wavfile_to_examples(FLAGS.input_path + wav_file)
        for wav_file in wav_files
    ]
    print("data sample", examples_batch[0])

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing, one file at a time.
        for i in range(len(examples_batch)):
            print("Batch number: ", i)
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch[i]})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            np.savetxt(FLAGS.output_path + wav_files[i][:-4] + ".csv",
                       postprocessed_batch, fmt='%i', delimiter=",")
def main():
    num_secs = 3
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
        print(wav_file)
    snore_path = "/home/grodri/bacelar/apnea_detection/data/audio/snore_3s/"
    bg_path = "/home/grodri/bacelar/apnea_detection/data/audio/bg_3s/"
    # Load datasets to dictionary
    snore_IDs = glob.glob(snore_path + '*.wav')
    bg_IDs = glob.glob(bg_path + '*.wav')
    for c, class_file in enumerate(['bg', 'snore']):
        result = list()
        # (The original skipped the 'snore' class with a stray `continue`,
        # leaving the assignment below unreachable; restored here.)
        if class_file == 'snore':
            file_ids = snore_IDs
        else:
            file_ids = bg_IDs
        for wav_file in tqdm(file_ids):
            filename = wav_file.split('/')[-1]
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            # Prepare a postprocessor to munge the model embeddings.
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
            # Call a SageMaker endpoint that serves the VGGish features.
            client = boto3.client('runtime.sagemaker', region_name='eu-west-1')
            data = np.expand_dims(examples_batch, axis=-1).tolist()
            endpoint_feat_extract = 'vggish-features'
            response = client.invoke_endpoint(
                EndpointName=endpoint_feat_extract, Body=json.dumps(data))
            body = response['Body'].read().decode('utf-8')
            embedding_sound = np.array(
                json.loads(body)['outputs']['vgg_features']
                ['floatVal']).reshape(-1, vggish_params.EMBEDDING_SIZE)
            if len(embedding_sound.shape) == 2:
                postprocessed_batch_keras = pproc.postprocess_single_sample(
                    embedding_sound, num_secs)
                postprocessed_batch_keras = uint8_to_float32(
                    postprocessed_batch_keras).reshape(num_secs, -1)
            else:
                postprocessed_batch_keras = pproc.postprocess(embedding_sound)
            result.append({
                'filename': filename,
                'embedding': postprocessed_batch_keras,
                'label': c
            })
        with open('./dataset/features_' + class_file + '.pickle',
                  'wb') as handle:
            pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
def get_vggish_params():
    graph = tf.get_default_graph()
    sess = tf.Session()
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, r'vggish_model.ckpt')
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    pproc = vggish_postprocess.Postprocessor(r'vggish_pca_params.npz')
    print("done loading the models")
    return graph, sess, features_tensor, embedding_tensor, pproc
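# Minimal usage sketch for get_vggish_params above; 'some.wav' is a
# placeholder path. Reusing the returned session across files avoids
# reloading the checkpoint on every call.
graph, sess, features_tensor, embedding_tensor, pproc = get_vggish_params()
examples = vggish_input.wavfile_to_examples('some.wav')
[embeddings] = sess.run([embedding_tensor],
                        feed_dict={features_tensor: examples})
whitened = pproc.postprocess(embeddings)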
def get_embed(input_wav, sr=None, sess=None):
    '''Accepts raw wav data and produces an embedding for it, treating the
    entire wav data as one sequence of audio.

    input_wav: raw wav data as a numpy ndarray
    sr: sample rate of input_wav
    sess: an existing TensorFlow session to reuse, or None to create one

    return: postprocessed_batch (the embeddings) and sess, the tf session
    used, so that it can be reused; the returned sess must be handled
    appropriately by the caller.
    '''
    examples_batch = vggish_input.waveform_to_examples(input_wav, sr)

    # Load the postprocessor (a PCA model).
    pproc = vggish_postprocess.Postprocessor('../vggish/vggish_pca_params.npz')

    if sess is None:
        sess = tf.Session()
        # Note: without a `with` block, as_default() does not actually
        # change the default graph; the model is defined in the current one.
        tf.Graph().as_default()
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess, '../vggish/vggish_model.ckpt')

    features_tensor = sess.graph.get_tensor_by_name('vggish/input_features:0')
    embedding_tensor = sess.graph.get_tensor_by_name('vggish/embedding:0')

    # Compute and postprocess the embeddings.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch, sess
def __init__(self, checkpoint_path='vggish_model.ckpt',
             pcm_params_path='vggish_pca_params.npz'):
    checkpoint_path = os.path.join(checkpoint_path)
    pcm_params_path = os.path.join(pcm_params_path)

    # Load model
    vggish_slim.define_vggish_slim(training=False)
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(self.sess, checkpoint_path)
    self.features_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    self.embedding_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    self.pproc = vggish_postprocess.Postprocessor(pcm_params_path)
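# Usage sketch for the class initialized above. The class name
# 'VGGishWrapper' is a placeholder (the snippet only shows __init__), and
# 'clip.wav' is a placeholder path.
model = VGGishWrapper()
examples = vggish_input.wavfile_to_examples('clip.wav')
[emb] = model.sess.run([model.embedding_tensor],
                       feed_dict={model.features_tensor: examples})
feats = model.pproc.postprocess(emb)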
def main(_):
    if FLAGS.dataset_path:
        subfolders = list_folders(FLAGS.dataset_path)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for subfolder in subfolders:
            # Format the wave file name.
            wav_file = os.path.join(subfolder,
                                    '{}.wav'.format(subfolder.split('/')[-1]))
            if not os.path.exists(wav_file):
                print('Skipping {}!'.format(wav_file))
                os.removedirs(subfolder)
                print('Remove dir {}!'.format(subfolder))
                continue
            print('Processing {}!'.format(wav_file))

            # Transform wav_file.
            examples_batch = vggish_input.wavfile_to_examples(wav_file)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            feats = pproc.postprocess(embedding_batch)

            # Write audio features (at most the first 120 seconds) into a
            # numpy file.
            np.save('{}.npy'.format(os.path.join(subfolder, 'afeat')),
                    feats[:120, :])
    print('Audio feature extraction is finished!')
def ProcessWithVGGish_file(vgg, file):
    '''Run the VGGish model on the contents of a wav file. Return a
    whitened version of the embeddings. Sound must be scaled to be floats
    between -1 and +1.'''
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(file)

    [embedding_batch] = sess.run([vgg['embedding']],
                                 feed_dict={vgg['features']: input_batch})

    # Postprocess the results to produce whitened quantized embeddings.
    pca_params_path = 'vggish_pca_params.npz'
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch[0]
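# A hedged sketch of how the 'vgg' handle used by ProcessWithVGGish_file
# might be constructed; the dict layout is inferred from the keys referenced
# above ('features' and 'embedding'), not taken from the original code.
def CreateVGGishNetwork(sess, checkpoint_path='vggish_model.ckpt'):
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    return {
        'features': sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME),
        'embedding': sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME),
    }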
def __init__(self, checkpoint, pca_params, input_tensor_name,
             output_tensor_name):
    """Create a new Graph and a new Session for every VGGishExtractor
    object."""
    super(VGGishExtractor, self).__init__()

    self.graph = tf.Graph()
    with self.graph.as_default():
        vggish_slim.define_vggish_slim(training=False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    self.sess = tf.Session(graph=self.graph, config=sess_config)
    vggish_slim.load_defined_vggish_slim_checkpoint(self.sess, checkpoint)

    # Use the self.sess to init others.
    self.input_tensor = self.graph.get_tensor_by_name(input_tensor_name)
    self.output_tensor = self.graph.get_tensor_by_name(output_tensor_name)

    # Postprocessor
    self.postprocess = vggish_postprocess.Postprocessor(pca_params)
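# Minimal usage sketch for VGGishExtractor above; the tensor names follow
# the defaults in vggish_params, and 'clip.wav' is a placeholder path.
extractor = VGGishExtractor(
    checkpoint='vggish_model.ckpt',
    pca_params='vggish_pca_params.npz',
    input_tensor_name=vggish_params.INPUT_TENSOR_NAME,
    output_tensor_name=vggish_params.OUTPUT_TENSOR_NAME)
examples = vggish_input.wavfile_to_examples('clip.wav')
[embedding_batch] = extractor.sess.run(
    [extractor.output_tensor],
    feed_dict={extractor.input_tensor: examples})
features = extractor.postprocess.postprocess(embedding_batch)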
def getAudioSetFeatures(fname):
    pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
    mels = getMelSpecGram(fname)
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: mels})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        # Dequantize the uint8 embeddings back to floats.
        sample = unit8_to_float32(postprocessed_batch)
        return sample
def main(unused_argv):
    print("Input file: " + FLAGS.input_video_label)
    for wav_file, st_time, end_time, label in csv.reader(
            open(FLAGS.input_video_label), delimiter='\t'):
        print(wav_file, st_time, end_time, label)
        if os.path.isfile(wav_file):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint,
                # and locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)

                # Pack the per-second embeddings into a fixed-size
                # (1, 300, 128) float batch for the classifier.
                num_frames_batch_val = np.array(
                    [postprocessed_batch.shape[0]], dtype=np.int32)
                video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
                video_batch_val[0, 0:postprocessed_batch.shape[0], :] = \
                    utils.Dequantize(postprocessed_batch.astype(float), 2, -2)
                inference(video_batch_val, num_frames_batch_val,
                          FLAGS.checkpoint_file, FLAGS.train_dir,
                          FLAGS.output_file)
            tf.reset_default_graph()