Example #1
0
def main(_):
    """Embed every WAV listed in FLAGS.wav_files and save one row per file.

    Each line of FLAGS.wav_files is taken as a WAV path. For every file the
    post-processed VGGish embeddings are averaged over the file's examples
    and stored in the matching row of an (n_files, 128) array, which is
    saved to FLAGS.npy_file. Files that yield no examples are appended to
    'bad_files.log' and their row is left as zeros.
    """
    with open(FLAGS.wav_files) as list_file:
        files_list = [entry.replace('\n', '') for entry in list_file]

    n_files = len(files_list)
    output_emedding = np.zeros((n_files, 128))
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    processed_fnames = []

    with tf.Graph().as_default(), tf.Session() as sess:
        # Build the inference graph once, restore the weights, and look up
        # the input/output tensors by name.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for row, wav_file in enumerate(files_list):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            print(row, '/', n_files)

            # No examples: record the file name and move on.
            if examples_batch.shape[0] == 0:
                with open('bad_files.log', 'a') as logf:
                    logf.write(wav_file + '\n')
                continue

            processed_fnames.append(wav_file)
            fetched = sess.run([embedding_tensor],
                               feed_dict={features_tensor: examples_batch})
            [embedding_batch] = fetched
            postprocessed_batch = pproc.postprocess(embedding_batch)
            # Collapse the per-example embeddings into a single vector.
            output_emedding[row, :] = np.mean(postprocessed_batch, axis=0)

        np.save(FLAGS.npy_file, output_emedding)
def extract_vggish_features(wav_path):
    """Return the post-processed VGGish embeddings of one WAV file.

    The checkpoint and PCA parameter locations are read from the names
    `checkpoint_path` and `pca_params_path`, which are resolved outside
    this function.

    Args:
        wav_path: Path handed to `vggish_input.wavfile_to_examples`.

    Returns:
        The result of `Postprocessor.postprocess` on the file's embeddings,
        or None when the file produces no examples.
    """
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(wav_path)
    if input_batch.shape[0] < 1:
        # The placeholder must be filled with str.format; passing the name
        # as a second print() argument would print the literal '{}'.
        print('{}: Audio sample shorter than 1 second. Ignoring ...'.format(
            os.path.basename(wav_path)))
        return None

    # Define VGGish, load the checkpoint, and run the batch through the
    # model to produce embeddings.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: input_batch})

    # Postprocess the raw embeddings with the PCA parameters.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    return pproc.postprocess(embedding_batch)
Example #3
0
def main(_):
    """Embed FLAGS.wav_file with VGGish, print the result and save it.

    The post-processed embeddings are printed and written to
    "/postprocessed_batch.npy".
    """
    examples_batch = vggish_input.wavfile_to_examples(FLAGS.wav_file)

    # Postprocessor that is applied to the raw model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model, restored from the checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Inference followed by postprocessing.
        fetched = sess.run([embedding_tensor],
                           feed_dict={features_tensor: examples_batch})
        [embedding_batch] = fetched

        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        np.save("/postprocessed_batch.npy", postprocessed_batch)
Example #4
0
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file, train_dir, output_file):
    """Extract VGGish features from a WAV file and classify them.

    Args:
        input_wav_file: Path of the WAV file to process.
        pca_params: Argument forwarded to `vggish_postprocess.Postprocessor`.
        checkpoint: VGGish checkpoint passed to
            `vggish_slim.load_vggish_slim_checkpoint`.
        checkpoint_file: Forwarded unchanged to `inference`.
        train_dir: Forwarded unchanged to `inference`.
        output_file: Forwarded unchanged to `inference`.

    Returns:
        Whatever `inference` returns, or None when `input_wav_file` is not
        an existing file.
    """
    print("Input file: " +input_wav_file)

    if not os.path.isfile(input_wav_file):
        return None

    # Length of the zero-padded frame axis handed to inference().
    max_frames = 300

    examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
    pproc = vggish_postprocess.Postprocessor(pca_params)

    # A fresh graph is created for every call, so no global graph reset is
    # needed afterwards.
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)

        # More rows than max_frames cannot be assigned into the padded
        # buffer below (the shapes would not match), so keep the leading
        # max_frames rows only.
        postprocessed_batch = postprocessed_batch[:max_frames]
        n_frames = postprocessed_batch.shape[0]
        num_frames_batch_val = np.array([n_frames], dtype=np.int32)

        video_batch_val = np.zeros((1, max_frames, 128), dtype=np.float32)
        video_batch_val[0, 0:n_frames, :] = utils.Dequantize(
            postprocessed_batch.astype(float), 2, -2)

        # inference() is deliberately still called inside the graph/session
        # context, exactly where the original call was made.
        predicted_class = inference(video_batch_val, num_frames_batch_val,
                                    checkpoint_file, train_dir, output_file)
        return predicted_class
Example #5
0
def main(wav_file):
    """Embed a WAV file with VGGish and run the prediction model on it.

    When `wav_file` is truthy, the path with its last four characters
    replaced by '.pkl' is printed. The post-processed embeddings are printed
    and then passed to `model_function.predictions_wavfile`.

    Returns:
        The `(predict_prob, predictions)` pair produced by
        `model_function.predictions_wavfile`.
    """
    if wav_file:
        print(wav_file[:-4] + '.pkl')

    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Postprocessor applied to the raw model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model restored from the checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Inference followed by postprocessing.
        fetched = sess.run([embedding_tensor],
                           feed_dict={features_tensor: examples_batch})
        [embedding_batch] = fetched
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

    predict_prob, predictions = model_function.predictions_wavfile(
        postprocessed_batch)
    K.clear_session()
    return predict_prob, predictions
Example #6
0
def input_sound():
    """Return the stacked VGGish embeddings of every '.wav' file in DIRNAME.

    File paths come from `get_file_paths(DIRNAME)` and are processed in
    sorted order. The model, checkpoint and postprocessor are loaded once
    and reused for all files.

    Returns:
        A float array with 128 columns holding the post-processed embeddings
        of all files concatenated along axis 0 (zero rows when no '.wav'
        file is found).
    """
    wav_paths = [path for path in sorted(get_file_paths(DIRNAME))
                 if os.path.splitext(path)[1] == '.wav']

    # Seed with an empty float array so the result keeps the same dtype and
    # width even when there is nothing to concatenate.
    feature_blocks = [np.empty([0, 128])]
    if not wav_paths:
        return feature_blocks[0]

    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    total_rows = 0
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors -- once for all files.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for path in wav_paths:
            # sr=None keeps the file's native sampling rate.
            y, sr = librosa.load(path, sr=None)
            print(sr)
            examples_batch = waveform_to_examples(y, sr)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(np.shape(postprocessed_batch))

            feature_blocks.append(postprocessed_batch)
            total_rows += np.shape(postprocessed_batch)[0]
            # Shape the accumulated feature matrix has after this file.
            print((total_rows, 128))

    # Single concatenation instead of growing the array file by file.
    return np.concatenate(feature_blocks, axis=0)
def main(_):
    """Write mean VGGish embeddings per clip into train/val/test HDF5 files.

    For each of the three splits an HDF5 file named
    `opt.feat_h5 + '2016_' + <split> + '_' + opt.type + '.h5'` is
    (re)created with a float32 dataset 'feats' holding one row per clip id
    in the split's inclusive id range. For every id whose WAV file exists,
    the raw model embeddings (no PCA postprocessing is applied) are averaged
    over axis 0 and stored in that clip's row; rows of missing files are
    left unwritten.
    """
    opt = parse_opt()
    keys = ['train', 'val', 'test']
    values = [opt.train_range, opt.val_range, opt.test_range]

    # Open the optional record writer once, up front, so it is always bound
    # and always closed, even when no WAV file is found.
    # NOTE(review): nothing is written to this writer -- confirm it is
    # still needed.
    writer = (tf.python_io.TFRecordWriter(FLAGS.tfrecord_file)
              if FLAGS.tfrecord_file else None)
    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            for key, id_range in zip(keys, values):
                first_id, last_id = id_range[0], id_range[1]
                h5_path = opt.feat_h5 + '2016' + '_' + key + '_' + opt.type + '.h5'
                if os.path.exists(h5_path):
                    os.remove(h5_path)

                # The context manager closes the file after each split.
                with h5py.File(h5_path, 'w') as h5:
                    dataset_feats = h5.create_dataset(
                        'feats',
                        (last_id - first_id + 1, opt.feat_size),
                        dtype='float32')

                    for audio_id in range(first_id, last_id + 1):
                        wav_file = opt.video_root + 'video' + str(audio_id) + '.mp4.wav'
                        if not os.path.isfile(wav_file):
                            continue

                        examples_batch = vggish_input.wavfile_to_examples(wav_file)
                        [embedding_batch] = sess.run(
                            [embedding_tensor],
                            feed_dict={features_tensor: examples_batch})
                        # One averaged vector per clip, at the row matching
                        # its offset in the id range.
                        dataset_feats[audio_id - first_id] = embedding_batch.mean(0)
    finally:
        if writer:
            writer.close()
Example #8
0
def extract_and_predict(wav):
    """Embed a WAV file with VGGish and return the prediction as a string.

    Progress messages are printed along the way. The post-processed
    embeddings are converted to a list of rows and passed to
    `predict_with_saved_model`; its result is returned via `str()`.
    """
    print("Boom from PYTHON!!!")

    examples_batch = vggish_input.wavfile_to_examples(wav)
    print("Jerry audio_to_prediction.py: after wavfile_to_examples")

    # Postprocessor built with its default arguments.
    pproc = vggish_postprocess.Postprocessor()
    print("Jerry audio_to_prediction.py: after pproc")

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model restored from the checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        print("Jerry audio_to_prediction.py: after load vggish_slim")

        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Inference followed by postprocessing.
        fetched = sess.run([embedding_tensor],
                           feed_dict={features_tensor: examples_batch})
        [embedding_batch] = fetched
        batch = pproc.postprocess(embedding_batch)
        # Split the batch into a plain list with one entry per row.
        postprocessed_batch = []
        for index in range(len(batch)):
            postprocessed_batch.append(batch[index])

    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    return str(pred_each_n_seconds)
def extract(
        wav_file,
        pca_params_path='/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_pca_params.npz',
        checkpoint_path='/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_model.ckpt'):
    """Return the post-processed VGGish embeddings of a WAV file.

    Both the raw and the post-processed embeddings are printed.

    Args:
        wav_file: Path handed to `vggish_input.wavfile_to_examples`.
        pca_params_path: PCA parameter file for the postprocessor. Defaults
            to the location that used to be hard-coded in this function.
        checkpoint_path: VGGish checkpoint to restore. Defaults to the
            location that used to be hard-coded in this function.

    Returns:
        The result of `Postprocessor.postprocess` on the file's embeddings.
    """
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    pproc = vggish_postprocess.Postprocessor(pca_params_path)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

    return postprocessed_batch
def embed(wavform_slice, rate):
    """Normalize a waveform slice and return its VGGish embeddings.

    Args:
        wavform_slice: Waveform data passed through
            `preprocessing.normalize` before feature extraction.
        rate: Sampling rate forwarded to
            `vggish_input.waveform_to_examples`.

    Returns:
        The result of `Postprocessor.postprocess` on the embeddings; the
        batch and its shape are also printed.
    """
    normalized = preprocessing.normalize(wavform_slice)
    examples_batch = vggish_input.waveform_to_examples(normalized, rate)
    print('examples_batch len: ' + str(len(examples_batch)))

    # Postprocessor applied to the raw model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model restored from the checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Inference followed by postprocessing.
        fetched = sess.run([embedding_tensor],
                           feed_dict={features_tensor: examples_batch})
        [embedding_batch] = fetched
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print('postprocessed_batch: ')
        print(postprocessed_batch)
        print(postprocessed_batch.shape)

    return postprocessed_batch
def main(_):
    """Embed `args.wav_file` with VGGish and write it as a TFRecord.

    The names `pca_params`, `args`, `config` and `checkpoint` are resolved
    outside this function. `vggish_params.EXAMPLE_HOP_SECONDS` is overwritten
    from `args.overlap` before the examples are computed. The examples are
    run through the model in chunks, postprocessed, and written as a single
    SequenceExample to `<args.write_dir>/<movie_id>.tfrecord`, where
    `movie_id` is the WAV file's base name without its extension.
    """
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)
    vggish_params.EXAMPLE_HOP_SECONDS = (
        1 - args.overlap) * vggish_params.EXAMPLE_WINDOW_SECONDS

    # Base name without the final extension; unlike manual rfind() slicing
    # this also behaves for paths that contain no '.' in the file name.
    movie_id = os.path.splitext(os.path.basename(args.wav_file))[0]

    with tf.Graph().as_default(), tf.Session(config=config) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        examples_batch = vggish_input.wavfile_to_examples(args.wav_file)
        # Roughly ten examples per chunk, between 1 and 100 chunks.
        num_splits = max(1, min(examples_batch.shape[0] // 10, 100))

        embedding_batch = []
        for chunk in np.array_split(examples_batch, num_splits):
            [batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: chunk})
            embedding_batch.extend(batch)

        postprocessed_batch = pproc.postprocess(np.array(embedding_batch))

        # One bytes-valued feature per row of the postprocessed batch, plus
        # the movie id as context. BytesList only accepts bytes, so the id
        # is converted explicitly.
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'movie_id':
                    tf.train.Feature(bytes_list=tf.train.BytesList(
                        value=[tf.compat.as_bytes(movie_id)]))
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
                }))

        # The context manager closes the writer even if the write fails.
        record_path = os.path.join(args.write_dir, movie_id + '.tfrecord')
        with tf.python_io.TFRecordWriter(record_path) as writer:
            writer.write(seq_example.SerializeToString())
Example #12
0
def extract_audioset_embedding():
    """Compute VGGish embeddings for 'appendixes/01.wav' and print shapes.

    Expects 'vggish_model.ckpt' and 'vggish_pca_params.npz' in the current
    working directory. Prints the audio length, the log mel example shape
    and the shape of the post-processed embeddings.

    Raises:
        Exception: If the checkpoint or the PCA parameter file is missing.
    """
    sample_rate = vggish_params.SAMPLE_RATE
    # The hop size between examples can be changed through
    # EXAMPLE_HOP_SECONDS in vggish_params.py.

    # Paths
    audio_path = 'appendixes/01.wav'
    checkpoint_path = os.path.join('vggish_model.ckpt')
    pcm_params_path = os.path.join('vggish_pca_params.npz')

    if not os.path.isfile(checkpoint_path):
        raise Exception(
            'Please download vggish_model.ckpt from '
            'https://storage.googleapis.com/audioset/vggish_model.ckpt '
            'and put it in the root of this codebase. ')

    if not os.path.isfile(pcm_params_path):
        raise Exception(
            'Please download vggish_pca_params.npz from '
            'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
            'and put it in the root of this codebase. ')

    # The session is used as a context manager so it is closed when the
    # embeddings have been computed (or on error).
    with tf.Session() as sess:
        # Load model
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        pproc = vggish_postprocess.Postprocessor(pcm_params_path)

        # Read audio
        (audio, _) = read_audio(audio_path, target_fs=sample_rate)

        # Extract log mel feature
        logmel = vggish_input.waveform_to_examples(audio, sample_rate)

        # Extract embedding feature
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: logmel})

    # PCA
    postprocessed_batch = pproc.postprocess(embedding_batch)

    print('Audio length: {}'.format(len(audio)))
    print('Log mel shape: {}'.format(logmel.shape))
    print('Embedding feature shape: {}'.format(postprocessed_batch.shape))
def readDirectory(dirname, label):
    """Write labelled VGGish embeddings for every WAV matching dirname*.wav.

    For each WAV file a TFRecord file with the same name (extension replaced
    by 'tfrecord') is written. The post-processed embeddings are grouped
    into SequenceExamples of 10 rows each; leftover rows that do not fill a
    group are not written. Files with fewer than 10 rows produce a single
    SequenceExample holding all rows. Files that cannot be read or embedded
    are reported and skipped.

    Args:
        dirname: Prefix that is concatenated with "*.wav" to form the glob
            pattern (include a trailing separator for a directory).
        label: Integer label stored in each SequenceExample's context.
    """
    rows_per_example = 10
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")

    for wav_file in glob.glob(dirname + "*.wav"):
        print(wav_file)
        try:
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
        except Exception as err:
            print('Skipping {}: {}'.format(wav_file, err))
            continue

        # Both the writer and the session are context-managed so that they
        # are released even when the file is skipped part-way through.
        with tf.python_io.TFRecordWriter(wav_file[:-3] + "tfrecord") as writer:
            with tf.Graph().as_default(), tf.Session() as sess:
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)
                try:
                    [embedding_batch] = sess.run(
                        [embedding_tensor],
                        feed_dict={features_tensor: examples_batch})
                except Exception as err:
                    print('Skipping {}: {}'.format(wav_file, err))
                    continue
                postprocessed_batch = pproc.postprocess(embedding_batch)

                n_rows = len(postprocessed_batch)
                # Integer division: range() below needs an int.
                if n_rows < rows_per_example:
                    n_examples = 1
                else:
                    n_examples = n_rows // rows_per_example

                for i in range(n_examples):
                    start = i * rows_per_example
                    rows = postprocessed_batch[start:start + rows_per_example]
                    seq_example = tf.train.SequenceExample(
                        context=tf.train.Features(
                            feature={
                                "labels":
                                tf.train.Feature(int64_list=tf.train.Int64List(
                                    value=[label]))
                            }),
                        feature_lists=tf.train.FeatureLists(
                            feature_list={
                                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(bytes_list=tf.train.BytesList(
                                        value=[embedding.tobytes()]))
                                    for embedding in rows
                                ])
                            }))
                    writer.write(seq_example.SerializeToString())
Example #14
0
def main(_):
    """Embed FLAGS.wav_file with VGGish and optionally write a TFRecord.

    Returns the string "No wav file" when FLAGS.wav_file is not set.
    Otherwise the raw embeddings, the post-processed embeddings and the
    resulting SequenceExample are printed, and the SequenceExample is
    written to FLAGS.tfrecord_file when that flag is set.
    """
    if not FLAGS.wav_file:
        return "No wav file"
    wav_file = FLAGS.wav_file

    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # Postprocessor applied to the raw model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # Record writer is only created when an output file was requested.
    if FLAGS.tfrecord_file:
        writer = tf.python_io.TFRecordWriter(FLAGS.tfrecord_file)
    else:
        writer = None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model restored from the checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Inference followed by postprocessing.
        fetched = sess.run([embedding_tensor],
                           feed_dict={features_tensor: examples_batch})
        [embedding_batch] = fetched
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # One bytes-valued feature per row of the postprocessed batch,
        # collected in a single feature list.
        row_features = []
        for embedding in postprocessed_batch:
            row_bytes = tf.train.BytesList(value=[embedding.tobytes()])
            row_features.append(tf.train.Feature(bytes_list=row_bytes))

        feature_lists = tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=row_features)
            })
        seq_example = tf.train.SequenceExample(feature_lists=feature_lists)
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
Example #15
0
def main(wav_file, npz_path):
    """Embed a WAV file with VGGish and save the result as compressed .npz.

    Args:
        wav_file: Input handed to `vggish_input.wavfile_to_examples`.
        npz_path: Destination passed to `np.savez_compressed`.

    Returns:
        1 when embeddings were saved; 0 (after printing 'NO') when the
        embedding batch has a zero-sized dimension and nothing was saved.
    """
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # `tfrecord_file` is resolved outside this function.
    # NOTE(review): nothing is ever written to this writer; it is kept so
    # the record file is still created as before -- confirm it is needed.
    writer = tf.python_io.TFRecordWriter(tfrecord_file)
    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})

            # Bail out before postprocessing when there is nothing to
            # postprocess or save.
            if 0 in embedding_batch.shape:
                print('NO')
                return 0

            postprocessed_batch = pproc.postprocess(embedding_batch)
            np.savez_compressed(npz_path, postprocessed_batch)
        return 1
    finally:
        # Always release the writer, including on the early return above.
        writer.close()
Example #16
0
def main(_):
    """Export mean VGGish embeddings, labels and paths as three TSV files.

    The ontology JSON at ONTROLOGY is indexed by entry id with '/' replaced
    by '_'. Every WAV matching AUDIO_CHUNKS/*/*.wav is embedded; the raw
    model embeddings (no PCA postprocessing) are averaged over axis 0. The
    name of a file's parent directory is used as its label id. Results are
    written to emb.tsv, label.tsv and audio.tsv in OUTPUTDIR, one row per
    file and in the same order.
    """
    ontology_lookup = {}
    with open(ONTROLOGY, 'r') as f:
        label_json = json.load(f)
    for entry in label_json:
        label_id = entry['id'].replace('/', '_')
        assert label_id not in ontology_lookup
        ontology_lookup[label_id] = entry
    wav_paths = glob.glob(os.path.join(AUDIO_CHUNKS, '*', '*.wav'))

    audio_tsv = []
    label_tsv = []
    emb_tsv = []

    # The model is defined and restored once and reused for every file.
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for wav_path in tqdm(wav_paths):
            label = Path(Path(wav_path).parent).stem
            filename = Path(wav_path).name
            examples_batch = vggish_input.wavfile_to_examples(wav_path)

            # Run inference and average the embeddings of the file.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            emb = np.mean(embedding_batch, axis=0).tolist()

            label_tsv.append([ontology_lookup[label]['name']])
            audio_tsv.append([f'{label}/{filename}'])
            emb_tsv.append(emb)
            # Every row must have the same width as the first one.
            assert len(emb_tsv[0]) == len(emb)

    with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
        csv.writer(f, delimiter='\t').writerows(emb_tsv)
    with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
        csv.writer(f, delimiter='\t').writerows(label_tsv)
    with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
        csv.writer(f, delimiter='\t').writerows(audio_tsv)
Example #17
0
 def post_init(self):
     """Create the TF session, load VGGish and cache the tensors used later.

     After `self.to_device()`, eager execution is disabled, a v1 session is
     stored in `self.sess`, the model is defined and restored from
     `self.model_path`, and the input/output tensors plus a postprocessor
     built from `self.pca_path` are stored on the instance.
     """
     self.to_device()
     import tensorflow as tf
     tf.compat.v1.disable_eager_execution()

     session = tf.compat.v1.Session()
     self.sess = session

     vggish_slim.define_vggish_slim()
     vggish_slim.load_vggish_slim_checkpoint(session, self.model_path)

     # Look up the model's input and output tensors by name.
     graph = session.graph
     self.feature_tensor = graph.get_tensor_by_name(
         vggish_params.INPUT_TENSOR_NAME)
     self.embedding_tensor = graph.get_tensor_by_name(
         vggish_params.OUTPUT_TENSOR_NAME)

     self.post_processor = vggish_postprocess.Postprocessor(self.pca_path)
Example #18
0
 def _build_model(self):
     """Define VGGish, restore its weights into `self.sess`, cache handles.

     The checkpoint and PCA parameter locations are read from the names
     `model_checkpoint` and `pca_params`, which are resolved outside this
     method. The input/output tensors and the postprocessor are stored on
     the instance.
     """
     # Inference-mode model restored from the checkpoint.
     vggish_slim.define_vggish_slim(training=False)
     vggish_slim.load_vggish_slim_checkpoint(self.sess, model_checkpoint)

     # Input and output tensors are looked up by name in the session graph.
     graph = self.sess.graph
     self.features_tensor = graph.get_tensor_by_name(
         vggish_params.INPUT_TENSOR_NAME)
     self.embedding_tensor = graph.get_tensor_by_name(
         vggish_params.OUTPUT_TENSOR_NAME)

     # Postprocessor applied to the raw model embeddings.
     self.pproc = vggish_postprocess.Postprocessor(pca_params)
def OutputAudioEmbeddings(pathIn, row):
    """Save the post-processed VGGish embeddings of one dataset row.

    The WAV path is built from `pathIn` and `row['video_path']` with the
    "%(ext)s" placeholder replaced by "wav" and the split name replaced by
    "<split>/cut". When that file exists, its embeddings are printed and
    saved with `np.save` under the hard-coded audio_features directory as
    `<split>/<video_id>`. Rows whose split is neither 'train' nor 'test',
    or whose WAV file is missing, are skipped.

    Args:
        pathIn: Root directory joined with the row's video path.
        row: Mapping with the keys 'video_id', 'video_path' and 'split'.
    """
    video_id = row['video_id']
    video_path = row['video_path']
    split = row['split']
    # "%(ext)s" is the placeholder left in the downloader's output path.
    full_path = os.path.join(pathIn, video_path).replace("%(ext)s", "wav")

    # Only these two splits have a "cut" directory mapping; anything else
    # is skipped explicitly.
    if split not in ('train', 'test'):
        print('Skipping {}: unsupported split {!r}'.format(video_id, split))
        return
    full_path_cut = full_path.replace(split, split + "/cut")

    if not os.path.isfile(full_path_cut):
        return

    examples_batch = vggish_input.wavfile_to_examples(full_path_cut)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # NOTE(review): nothing is written to this writer; it is kept so the
    # record file is still created when the flag is set -- confirm it is
    # needed.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)
            np.save(
                '/lfs01/workdirs/shams010/shams010u1/code/audio_features/' +
                split + '/' + video_id, postprocessed_batch)
    finally:
        # Release the writer even when inference or saving fails.
        if writer:
            writer.close()
Example #20
0
def processList(filelist):
    """Extract postprocessed VGGish embeddings for every audio file in a list.

    Each non-blank line of ``filelist`` is treated as a path relative to the
    module-level ``audio_root``.  The clip is loaded at its native sample
    rate, padded in ``'wrap'`` mode up to ``sr`` samples when shorter, run
    through VGGish and the PCA postprocessor, and written as comma-separated
    integers to ``<output_root>/<parent dir>/<file name>.txt``.  The list of
    written feature files is finally passed to
    ``create_file(output_file, ...)``.

    Args:
        filelist: Path of a text file with one audio path per line.
    """
    # Read the whole list up front so the handle is closed deterministically.
    with open(filelist) as list_file:
        input_lists = list_file.readlines()

    out_file_list = []

    # The postprocessor, graph, session and checkpoint are the same for every
    # clip, so they are built once rather than once per file.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Wrapping the list (not the enumerate object) lets tqdm show a total.
        for i, infl in enumerate(tqdm(input_lists)):
            if not infl.strip():
                # A blank line would otherwise resolve to a directory.
                continue

            # List entries may carry shell-escaped spaces ("\ ").
            audio_path = (audio_root + '/' +
                          infl.replace("\\ ", " ")).strip()
            y, sr = librosa.load(audio_path, sr=None)
            if len(y) < sr:
                # Repeat the clip so it holds at least ``sr`` samples.
                y = np.pad(y, (0, sr - len(y)), 'wrap')

            # Produce a batch of log mel spectrogram examples.
            input_batch = vggish_input.waveform_to_examples(y, sr)
            print('Log Mel Spectrogram example: ', input_batch[0])

            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: input_batch})
            print('VGGish embedding: done ', i)

            # Postprocess the results to produce whitened quantized embeddings.
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Mirror the clip's parent directory under ``output_root``.
            path_parts = audio_path.split("/")
            file_name = path_parts[-1].strip()
            out_dir = output_root + "/" + path_parts[-2]
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            featfile = str(out_dir) + '/' + str(file_name) + '.txt'
            out_file_list.append(featfile.strip())
            np.savetxt(featfile,
                       postprocessed_batch.astype(int),
                       fmt='%i',
                       delimiter=",")
    create_file(output_file, out_file_list)
def main(_):
    """Write postprocessed VGGish embeddings for a directory of audio files.

    When ``FLAGS.input_path`` is set, every entry returned by
    ``os.listdir`` for that directory is converted to examples and embedded.
    Otherwise a synthetic 5-second, 1 kHz sine wave sampled at 44.1 kHz is
    used as the single input.  Each result is saved as comma-separated
    integers to ``FLAGS.output_path + <name without last 4 chars> + ".csv"``.

    Args:
        _: Unused positional argument.
    """
    if FLAGS.input_path:
        wav_names = os.listdir(FLAGS.input_path)
        # os.path.join works whether or not input_path ends with a separator.
        wav_inputs = [
            os.path.join(FLAGS.input_path, name) for name in wav_names
        ]
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        synthetic_wav = six.BytesIO()
        wavfile.write(synthetic_wav, sr, samples)
        synthetic_wav.seek(0)
        # The in-memory file has no name, so give it one for the output CSV.
        wav_names = ['synthetic_sine.wav']
        wav_inputs = [synthetic_wav]

    examples_batch = [
        vggish_input.wavfile_to_examples(wav_input)
        for wav_input in wav_inputs
    ]
    if not examples_batch:
        print("No input files found in", FLAGS.input_path)
        return
    print("data sample", examples_batch[0])

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing, one input file at a time.
        for i, (wav_name, examples) in enumerate(
                zip(wav_names, examples_batch)):
            print("Batch number: ", i)
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            # [:-4] drops a four-character extension such as ".wav".
            np.savetxt(FLAGS.output_path + wav_name[:-4] + ".csv",
                       postprocessed_batch,
                       fmt='%i',
                       delimiter=",")
def main():
    """Embed the 3-second background clips through a SageMaker endpoint.

    Globs ``*.wav`` under the hard-coded snore and background directories,
    sends each background clip's examples to the ``vggish-features``
    endpoint, postprocesses the returned embeddings and pickles a list of
    ``{'filename', 'embedding', 'label'}`` dicts to
    ``./dataset/features_<class>.pickle``.

    The ``snore`` class is skipped entirely, so only ``features_bg.pickle``
    is written.
    """
    num_secs = 3
    if FLAGS.wav_file:
        # The flag is only echoed; the clips processed below come from glob.
        print(FLAGS.wav_file)

    snore_path = "/home/grodri/bacelar/apnea_detection/data/audio/snore_3s/"
    bg_path = "/home/grodri/bacelar/apnea_detection/data/audio/bg_3s/"
    # Load datasets to dictionary
    ids_by_class = {
        'snore': glob.glob(snore_path + '*.wav'),
        'bg': glob.glob(bg_path + '*.wav'),
    }

    # These do not depend on the clip, so create them once.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    client = boto3.client('runtime.sagemaker', region_name='eu-west-1')
    endpoint_feat_extract = 'vggish-features'

    for c, class_file in enumerate(['bg', 'snore']):
        if class_file == 'snore':
            # NOTE(review): the snore class is deliberately left skipped, as
            # in the original; remove this guard to process it as well.
            continue

        result = []
        for wav_file in tqdm(ids_by_class[class_file]):
            filename = wav_file.split('/')[-1]
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            data = np.expand_dims(examples_batch, axis=-1).tolist()
            response = client.invoke_endpoint(
                EndpointName=endpoint_feat_extract, Body=json.dumps(data))
            body = response['Body'].read().decode('utf-8')
            # reshape(-1, N) always yields a 2-D array, so the single-sample
            # postprocessing path is the only one that can run.
            embedding_sound = np.array(
                json.loads(body)['outputs']['vgg_features']
                ['floatVal']).reshape(-1, vggish_params.EMBEDDING_SIZE)
            postprocessed_batch_keras = pproc.postprocess_single_sample(
                embedding_sound, num_secs)
            postprocessed_batch_keras = uint8_to_float32(
                postprocessed_batch_keras).reshape(num_secs, -1)
            result.append({
                'filename': filename,
                'embedding': postprocessed_batch_keras,
                'label': c
            })

        with open('./dataset/features_' + class_file + '.pickle',
                  'wb') as handle:
            pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #23
0
def get_vggish_params():
    """Load VGGish into the default graph and return the inference handles.

    Opens a new session, defines the model in inference mode, restores
    ``vggish_model.ckpt`` and builds a postprocessor from
    ``vggish_pca_params.npz``.

    Returns:
        Tuple ``(graph, sess, features_tensor, embedding_tensor, pproc)``.
        The session is left open for the caller to use.
    """
    default_graph = tf.get_default_graph()
    session = tf.Session()

    # Build the inference-mode model, then restore its weights.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(session, r'vggish_model.ckpt')

    # Look up the input and output tensors by their configured names.
    find_tensor = session.graph.get_tensor_by_name
    model_input = find_tensor(vggish_params.INPUT_TENSOR_NAME)
    model_output = find_tensor(vggish_params.OUTPUT_TENSOR_NAME)

    postprocessor = vggish_postprocess.Postprocessor(r'vggish_pca_params.npz')
    print("done loading the models")
    return default_graph, session, model_input, model_output, postprocessor
def get_embed(input_wav, sr=None, sess=None):
    '''
    accepts an input of raw wav data and produces an embedding for it treating
    the entire wav data as one sequence of audio
    ---
        input_wav: raw wav data as an numpy ndarray
        sr: sample rate of input_wav; forwarded unchanged to
            vggish_input.waveform_to_examples
        sess: an existing tensorflow session that already holds the loaded
              VGGish model, or None to create one and load the checkpoint

        return: postprocessed_batch (the embeddings), and sess, the tf session
                used so that it can be reused. note that returned sess must be
                handled appropriately by the user
    '''
    examples_batch = vggish_input.waveform_to_examples(input_wav, sr)

    # load models and postprocessor (a PCA model)
    pproc = vggish_postprocess.Postprocessor('../vggish/vggish_pca_params.npz')
    if sess is None:
        # The model is defined in the default graph, which is the graph this
        # new session is bound to.
        sess = tf.Session()
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess,
                                                '../vggish/vggish_model.ckpt')

    features_tensor = sess.graph.get_tensor_by_name('vggish/input_features:0')
    embedding_tensor = sess.graph.get_tensor_by_name('vggish/embedding:0')

    # Compute embeddings, then whiten and quantize them.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)

    return postprocessed_batch, sess
    def __init__(self,
                 checkpoint_path='vggish_model.ckpt',
                 pcm_params_path='vggish_pca_params.npz'):
        """Define VGGish, open a session, restore weights and set up PCA.

        Args:
            checkpoint_path: Path of the VGGish checkpoint to restore.
            pcm_params_path: Path of the PCA parameter file handed to the
                postprocessor.
        """
        ckpt_file = os.path.join(checkpoint_path)
        pca_file = os.path.join(pcm_params_path)

        # Define the model in inference mode in the current default graph.
        vggish_slim.define_vggish_slim(training=False)

        # Initialize the variables first, then restore them from the checkpoint.
        self.sess = session = tf.Session()
        session.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(session, ckpt_file)

        # Keep handles to the model's input and output tensors.
        self.features_tensor = session.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        self.embedding_tensor = session.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        self.pproc = vggish_postprocess.Postprocessor(pca_file)
def main(_):
    """Extract VGGish audio features for every sub-folder of a dataset.

    For each folder returned by ``list_folders(FLAGS.dataset_path)`` the file
    ``<folder>/<folder name>.wav`` is embedded and the first 120 rows of the
    postprocessed embeddings are saved to ``<folder>/afeat.npy``.  A folder
    without that wav file is skipped and removed with ``os.removedirs``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_path`` is not set.
    """
    if not FLAGS.dataset_path:
        # Fail before the model is loaded instead of hitting a NameError on
        # the unbound folder list afterwards.
        raise ValueError('FLAGS.dataset_path must be set')
    subfolders = list_folders(FLAGS.dataset_path)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for subfolder in subfolders:
            # The wav file is named after its containing folder.
            wav_file = os.path.join(subfolder,
                                    '{}.wav'.format(subfolder.split('/')[-1]))
            if not os.path.exists(wav_file):
                print('Skipping {}!'.format(wav_file))
                try:
                    # removedirs only deletes empty directories and raises
                    # OSError otherwise; do not let that abort the whole run.
                    os.removedirs(subfolder)
                except OSError as err:
                    print('Could not remove dir {}: {}'.format(
                        subfolder, err))
                else:
                    print('Remove dir {}!'.format(subfolder))
                continue

            print('Processing {}!'.format(wav_file))
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            # Run inference and postprocessing.
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})
            feats = pproc.postprocess(embedding_batch)

            # Write at most the first 120 embedding rows into a numpy file.
            np.save('{}.npy'.format(os.path.join(subfolder, 'afeat')),
                    feats[:120])
        print('Audio feature extraction is finished!')
def ProcessWithVGGish_file(vgg, file):
  '''Run VGGish on a wav file and return its first postprocessed embedding.

  vgg is a mapping whose 'features' and 'embedding' entries are the model's
  input and output tensors; file is passed to
  vggish_input.wavfile_to_examples. Inference runs in the module-level
  session `sess`.'''

  # Turn the file into a batch of examples for the model.
  examples = vggish_input.wavfile_to_examples(file)

  fetches = [vgg['embedding']]
  feeds = {vgg['features']: examples}
  [raw_embeddings] = sess.run(fetches, feed_dict=feeds)

  # Postprocess the results to produce whitened quantized embeddings.
  postprocessor = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
  whitened = postprocessor.postprocess(raw_embeddings)

  # Only the first row of the batch is returned.
  return whitened[0]
Example #28
0
    def __init__(self, checkpoint, pca_params, input_tensor_name, output_tensor_name):
        """Create a dedicated Graph and Session for this VGGishExtractor.

        Args:
            checkpoint: Checkpoint passed to
                ``vggish_slim.load_defined_vggish_slim_checkpoint``.
            pca_params: PCA parameters handed to the postprocessor.
            input_tensor_name: Name of the model's input tensor in the graph.
            output_tensor_name: Name of the model's output tensor in the graph.
        """
        super(VGGishExtractor, self).__init__()

        # Define the inference-mode model inside a graph owned by this object.
        graph = tf.Graph()
        self.graph = graph
        with graph.as_default():
            vggish_slim.define_vggish_slim(training=False)

        # Soft placement plus on-demand GPU memory growth for the session.
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(graph=graph, config=config)
        vggish_slim.load_defined_vggish_slim_checkpoint(self.sess, checkpoint)

        # Resolve the input and output tensors by name.
        self.input_tensor = graph.get_tensor_by_name(input_tensor_name)
        self.output_tensor = graph.get_tensor_by_name(output_tensor_name)

        # postprocessor
        self.postprocess = vggish_postprocess.Postprocessor(pca_params)
Example #29
0
def getAudioSetFeatures(fname):
    """Return the postprocessed VGGish embeddings of ``fname`` as floats.

    The input examples come from ``getMelSpecGram(fname)``. A fresh graph
    and session are created on every call, the model is restored from
    ``vggish_model.ckpt``, and the postprocessed embeddings are converted
    with ``unit8_to_float32`` before being returned.
    """
    postprocessor = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
    mel_examples = getMelSpecGram(fname)

    with tf.Graph().as_default(), tf.Session() as session:
        # Build the inference-mode model and restore its weights.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(session, 'vggish_model.ckpt')

        find_tensor = session.graph.get_tensor_by_name
        model_input = find_tensor(vggish_params.INPUT_TENSOR_NAME)
        model_output = find_tensor(vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [raw_embeddings] = session.run([model_output],
                                       feed_dict={model_input: mel_examples})
        quantized = postprocessor.postprocess(raw_embeddings)

        return unit8_to_float32(quantized)
Example #30
0
def main(unused_argv):
    """Embed each wav listed in a label file and pass it to ``inference``.

    ``FLAGS.input_video_label`` is read as a tab-separated file whose rows
    unpack into (wav_file, st_time, end_time, label).  Rows whose wav file
    does not exist are only printed.  For every existing file the VGGish
    embeddings are postprocessed, dequantized into a zero-padded
    (1, 300, 128) float32 batch and handed to ``inference`` together with
    the number of embedding rows.

    Args:
        unused_argv: Unused positional argument.
    """
    print("Input file: " + FLAGS.input_video_label)

    # NOTE(review): the handle opened here is anonymous and never explicitly
    # closed in the visible code.
    for wav_file, st_time, end_time, label in csv.reader(open(
            FLAGS.input_video_label),
                                                         delimiter='\t'):
        # st_time, end_time and label are only echoed here.
        print(wav_file, st_time, end_time, label)
        if (os.path.isfile(wav_file)):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            #print(examples_batch)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

            # A new graph and session are built, and the checkpoint reloaded,
            # for every row of the label file.
            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch
                 ] = sess.run([embedding_tensor],
                              feed_dict={features_tensor: examples_batch})
                #print(embedding_batch)
                postprocessed_batch = pproc.postprocess(embedding_batch)
                #print(postprocessed_batch)
                # Number of embedding rows, as a one-element int32 array.
                num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                                dtype=np.int32)

                # Zero-padded batch holding a single clip; the leading rows
                # receive the dequantized embeddings.
                # NOTE(review): a clip with more than 300 embedding rows would
                # not fit this buffer - confirm inputs are bounded.
                video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
                video_batch_val[
                    0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
                        postprocessed_batch.astype(float), 2, -2)

                inference(video_batch_val, num_frames_batch_val,
                          FLAGS.checkpoint_file, FLAGS.train_dir,
                          FLAGS.output_file)

            # Clear the default graph before the next row is processed.
            tf.reset_default_graph()