Esempio n. 1
0
def main(_):
  """Embed every WAV file listed in FLAGS.wav_files and save per-file means.

  One path per line is read from FLAGS.wav_files. Each file is run through
  the VGGish model restored from FLAGS.checkpoint, post-processed with the
  parameters in FLAGS.pca_params, and averaged over the first axis. The
  averages are stored row by row in an (n_files, 128) array that is written
  to FLAGS.npy_file. Files that yield zero examples are appended to
  'bad_files.log'; their rows in the saved array stay all-zero.

  Args:
    _: Unused positional argument.
  """
  with open(FLAGS.wav_files) as listing:
    files_list = [entry.replace('\n', '') for entry in listing]

  n_files = len(files_list)
  output_emedding = np.zeros((n_files, 128))
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  # Paths that produced at least one example (collected, not persisted).
  processed_fnames = []

  with tf.Graph().as_default(), tf.Session() as sess:
    # Build the model in inference mode, restore weights, and look up the
    # input/output tensors once for the whole run.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    graph = sess.graph
    features_tensor = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    for row, wav_path in enumerate(files_list):
      examples = vggish_input.wavfile_to_examples(wav_path)
      print(row, '/', n_files)

      # Nothing to embed: record the file and leave its row untouched.
      if examples.shape[0] == 0:
        with open('bad_files.log', 'a') as bad_log:
          bad_log.write(wav_path + '\n')
        continue

      processed_fnames.append(wav_path)
      embeddings = sess.run(embedding_tensor,
                            feed_dict={features_tensor: examples})
      output_emedding[row, :] = np.mean(pproc.postprocess(embeddings), axis=0)

    np.save(FLAGS.npy_file, output_emedding)
def extract_vggish_features(wav_path):
    """Return post-processed VGGish embeddings for one WAV file.

    The model checkpoint and PCA parameters are read from the names
    `checkpoint_path` and `pca_params_path`, which are resolved outside this
    function.

    Args:
        wav_path: Path of the WAV file to embed.

    Returns:
        The result of `Postprocessor.postprocess` on the model embeddings, or
        None when the file yields no examples.
    """
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(wav_path)
    if input_batch.shape[0] < 1:
        # The placeholder must be filled with str.format; passing the name as
        # a second print() argument left the literal '{}' in the output.
        print('{}: Audio sample shorter than 1 second. Ignoring ...'.format(
            os.path.basename(wav_path)))
        return None

    # Define VGGish, load the checkpoint, and run the batch through the model
    # to produce embeddings.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: input_batch})

    # Postprocess the raw embeddings with the stored PCA parameters.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    return pproc.postprocess(embedding_batch)
Esempio n. 3
0
def main(_):
  """Embed the audio file named by FLAGS.wav_file and save the result.

  The post-processed embeddings are printed and written to
  "/postprocessed_batch.npy".

  Args:
    _: Unused positional argument.
  """
  examples_batch = vggish_input.wavfile_to_examples(FLAGS.wav_file)

  # Postprocessor applied to the raw model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  with tf.Graph().as_default(), tf.Session() as sess:
    # Inference-mode model, restored weights, input/output tensor lookup.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    graph = sess.graph
    features_tensor = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference, then postprocess, report, and persist.
    raw_embeddings = sess.run(embedding_tensor,
                              feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(raw_embeddings)
    print(postprocessed_batch)
    np.save("/postprocessed_batch.npy", postprocessed_batch)
Esempio n. 4
0
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file, train_dir, output_file):
    """Embed a WAV file with VGGish and pass the result to `inference`.

    The post-processed embeddings are dequantized with
    `utils.Dequantize(..., 2, -2)` and copied into the leading rows of a
    zero-filled (1, 300, 128) float32 array before being handed to
    `inference`.

    Args:
        input_wav_file: Path of the WAV file to classify.
        pca_params: Path handed to `vggish_postprocess.Postprocessor`.
        checkpoint: VGGish checkpoint path.
        checkpoint_file: Forwarded unchanged to `inference`.
        train_dir: Forwarded unchanged to `inference`.
        output_file: Forwarded unchanged to `inference`.

    Returns:
        Whatever `inference` returns, or None when `input_wav_file` is not an
        existing file.

    Raises:
        ValueError: If the file produces more than 300 embedding rows, which
            do not fit into the fixed-size batch.
    """
    print("Input file: " +input_wav_file)

    if not os.path.isfile(input_wav_file):
        return None

    max_frames = 300
    examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
    pproc = vggish_postprocess.Postprocessor(pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)

        num_frames = postprocessed_batch.shape[0]
        if num_frames > max_frames:
            # Previously surfaced as an opaque NumPy broadcasting ValueError.
            raise ValueError(
                'Audio yields %d embedding frames; at most %d are supported'
                % (num_frames, max_frames))
        num_frames_batch_val = np.array([num_frames], dtype=np.int32)

        video_batch_val = np.zeros((1, max_frames, 128), dtype=np.float32)
        video_batch_val[0, 0:num_frames, :] = utils.Dequantize(
            postprocessed_batch.astype(float), 2, -2)

        # inference() is still invoked inside this graph/session context,
        # exactly as before; the statement that used to follow the return was
        # unreachable and has been removed.
        predicted_class = inference(video_batch_val, num_frames_batch_val,
                                    checkpoint_file, train_dir, output_file)
        return predicted_class
    def setup(self):
        """Fetch the VGGish files if missing and open a session on the model.

        Sets `checkpoint_path`, `pca_params_path` and `batch_size`, downloads
        the checkpoint and PCA parameter files when they are not present on
        disk, then creates `self.sess`, defines VGGish, restores the
        checkpoint and stores the input/output tensors as
        `self.features_tensor` and `self.embedding_tensor`.
        """
        # urlretrieve lives in urllib.request on Python 3 and directly in
        # urllib on Python 2; support both.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve

        # Paths to downloaded VGGish files.
        self.checkpoint_path = 'vggish_model.ckpt'
        self.pca_params_path = 'vggish_pca_params.npz'
        self.batch_size = 60

        # If we can't find the trained model files, download them.
        downloads = (
            ('model', self.checkpoint_path,
             'https://storage.googleapis.com/audioset/vggish_model.ckpt'),
            ('params', self.pca_params_path,
             'https://storage.googleapis.com/audioset/vggish_pca_params.npz'),
        )
        for kind, local_path, url in downloads:
            if not os.path.exists(local_path):
                print(
                    'AudiosetAnalysis: Downloading {} file {} (please wait - this may take a while)'
                    .format(kind, local_path))
                urlretrieve(url, local_path)

        # Define VGGish. The session is created on the current default graph;
        # the former `tf.Graph().as_default()` assignment only stored a
        # never-entered context manager and was overwritten immediately.
        config = tf.ConfigProto(device_count={'CPU': 4})
        self.sess = tf.Session(config=config)

        # Load the checkpoint and locate the tensors used for inference.
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(self.sess,
                                                self.checkpoint_path)
        self.features_tensor = self.sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        self.embedding_tensor = self.sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
Esempio n. 6
0
def input_sound():
    """Embed every '.wav' file under DIRNAME and stack the results.

    Files returned by `get_file_paths(DIRNAME)` are visited in sorted order.
    Each '.wav' file is loaded with librosa at its native sample rate,
    converted to examples, run through VGGish and post-processed. The model
    is built and restored once and reused for all files; if there is no
    '.wav' file the model is not loaded at all.

    Returns:
        An array with 128 columns holding the post-processed rows of all
        files concatenated along axis 0 (zero rows when no '.wav' file was
        found).
    """
    wav_paths = [
        candidate for candidate in sorted(get_file_paths(DIRNAME))
        if os.path.splitext(candidate)[1] == '.wav'
    ]
    full_feature_vector = np.empty([0, 128])
    if not wav_paths:
        return full_feature_vector

    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors -- once, not once per file.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for wav_path in wav_paths:
            y, sr = librosa.load(wav_path, sr=None)
            print(sr)
            examples_batch = waveform_to_examples(y, sr)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(np.shape(postprocessed_batch))
            full_feature_vector = np.concatenate(
                (full_feature_vector, postprocessed_batch), axis=0)
            print(np.shape(full_feature_vector))
    return full_feature_vector
Esempio n. 7
0
def main(wav_file):
    """Embed one audio file with VGGish and classify the embeddings.

    Args:
        wav_file: Path of the audio file. When truthy, the path with its last
            four characters replaced by '.pkl' is printed.

    Returns:
        The `(predict_prob, predictions)` pair produced by
        `model_function.predictions_wavfile` for the post-processed
        embeddings.
    """
    if wav_file:
        pkl = wav_file[:-4] + '.pkl'
        print(pkl)

    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Postprocessor applied to the raw model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model, restored weights, tensor lookup.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        raw_embeddings = sess.run(
            embedding_tensor, feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(raw_embeddings)
        print(postprocessed_batch)

    result = model_function.predictions_wavfile(postprocessed_batch)
    predict_prob, predictions = result
    K.clear_session()
    return predict_prob, predictions
Esempio n. 8
0
    def main(_):
        """Save raw VGGish embeddings for every file in `audio_path`.

        For each directory entry, the embeddings are written to
        `dest_path + <name up to the first '.'> + '.npy'`. Entries whose
        output file already exists are skipped. A file that fails is reported
        and skipped so the remaining files are still processed.

        Args:
            _: Unused positional argument.
        """
        audio_files = os.listdir(audio_path)
        for each_file in tqdm.tqdm(audio_files):
            file_nm = dest_path + each_file.split('.')[0] + '.npy'
            if path.exists(file_nm):
                continue
            try:
                wav_file = audio_path + each_file
                examples_batch = vggish_input.wavfile_to_examples(wav_file)

                with tf.Graph().as_default(), tf.Session() as sess:
                    vggish_slim.define_vggish_slim(training=False)
                    vggish_slim.load_vggish_slim_checkpoint(
                        sess, FLAGS.checkpoint)
                    features_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.INPUT_TENSOR_NAME)
                    embedding_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.OUTPUT_TENSOR_NAME)
                    [embedding_batch] = sess.run(
                        [embedding_tensor],
                        feed_dict={features_tensor: examples_batch})
                    # The embeddings are stored as produced by the model; no
                    # postprocessing is applied here.
                    np.save(file_nm, embedding_batch)
            except Exception as err:
                # Catch Exception rather than everything so Ctrl-C and
                # SystemExit still stop the run, and say what failed.
                print('Failed to extract embeddings for {}: {}'.format(
                    each_file, err))
                continue
Esempio n. 9
0
def extract(wav_file):
    """Return post-processed VGGish embeddings for `wav_file`.

    The checkpoint and PCA parameter files are read from fixed absolute
    paths. Both the raw and the post-processed embeddings are printed.

    Args:
        wav_file: Audio file passed to `vggish_input.wavfile_to_examples`.

    Returns:
        The output of `Postprocessor.postprocess` for the file's embeddings.
    """
    pca_params_file = '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_pca_params.npz'
    checkpoint_file = '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_model.ckpt'

    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    pproc = vggish_postprocess.Postprocessor(pca_params_file)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model, restored weights, tensor lookup.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_file)
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        raw_embeddings = sess.run(
            embedding_tensor, feed_dict={features_tensor: examples_batch})
        print(raw_embeddings)
        postprocessed_batch = pproc.postprocess(raw_embeddings)
        print(postprocessed_batch)

    return postprocessed_batch
def extract_audioset_features(ids, id2audio_path, id2label):
    """Embed the audio of several ids in a single VGGish forward pass.

    For every id, the examples of its audio file are computed and the id and
    its label are repeated once per example (via `np.repeat(..., axis=0)`),
    so the three outputs stay aligned example by example. The model weights
    are read from 'vggish_model.ckpt'.

    Args:
        ids: Iterable of identifiers to process, in order.
        id2audio_path: Mapping from id to the path of its WAV file.
        id2label: Mapping from id to its label.

    Returns:
        A list `[feature, ground_truth, identifiers]` where `feature` is the
        squeezed model output for all examples.

    Raises:
        ValueError: If `ids` is empty (previously an UnboundLocalError).
    """
    example_chunks = []
    label_chunks = []
    id_chunks = []
    for i in ids:
        examples = vggish_input.wavfile_to_examples(id2audio_path[i])
        n_examples = examples.shape[0]
        example_chunks.append(examples)
        label_chunks.append(np.repeat(id2label[i], n_examples, axis=0))
        id_chunks.append(np.repeat(i, n_examples, axis=0))

    if not example_chunks:
        raise ValueError('ids must contain at least one identifier')

    # Concatenate once at the end instead of re-copying the growing arrays
    # on every iteration.
    input_data = np.concatenate(example_chunks, axis=0)
    ground_truth = np.concatenate(label_chunks, axis=0)
    identifiers = np.concatenate(id_chunks, axis=0)

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))

    return [feature, ground_truth, identifiers]
Esempio n. 11
0
def inference(file_path,checkpoint_dir,checkpoint_path):
    '''
    Predict class scores for one audio file.

    Two graphs are used: the classifier graph (restored from
    `checkpoint_path`) and a separate VGGish graph (restored from
    `checkpoint_dir + "vggish_model.ckpt"`) that turns the audio into
    embeddings, which are post-processed with
    `checkpoint_dir + "vggish_pca_params.npz"` and fed to the classifier.

    Args:
        file_path: Audio file to read with `sf.read`.
        checkpoint_dir: Prefix of the VGGish checkpoint and PCA files; it is
            concatenated directly, so it must end with a path separator.
        checkpoint_path: Checkpoint of the classifier to restore.

    Returns:
        The value of the 'mymodel/prediction_inf:0' tensor, or None when
        reading or processing the file fails.
    '''
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:

        logits_inf = vggish_slim.define_audio_slim(training=False,is_reuse=None)

        with tf.compat.v1.variable_scope('mymodel'):
            # Created for its graph op; looked up by name below.
            tf.sigmoid(logits_inf, name='prediction_inf')

            # Add inference ops. The variable is created so that it is part
            # of the graph the Saver restores.
            with tf.compat.v1.variable_scope('train'):
                tf.Variable(
                    0, name='global_step', trainable=False,
                    collections=[tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
                                 tf.compat.v1.GraphKeys.GLOBAL_STEP])

        # Initialize all variables in the model.
        sess.run(tf.compat.v1.global_variables_initializer())

        # Restore the model.
        saver = tf.compat.v1.train.Saver()
        saver.restore(sess, checkpoint_path)

        # Locate all the tensors and ops we need for the inference loop.
        features_tensor_inf = sess.graph.get_tensor_by_name(
            'audio/audio_input_features:0')
        prediction_tensor_inf = sess.graph.get_tensor_by_name('mymodel/prediction_inf:0')

        # Separate graph and session for the VGGish feature extractor.
        graph = tf.Graph()
        with graph.as_default():
            vggish_slim.define_vggish_slim(training=False)
        sess_ext = tf.compat.v1.Session(graph=graph)
        try:
            with graph.as_default():
                vggish_slim.load_vggish_slim_checkpoint(sess_ext, checkpoint_dir + "vggish_model.ckpt")
            input_tensor = graph.get_tensor_by_name('vggish/input_features:0')
            output_tensor = graph.get_tensor_by_name('vggish/embedding:0')
            pproc = Postprocessor.Postprocessor(checkpoint_dir + "vggish_pca_params.npz")

            print('\n###################')
            print('#  Inference loop  #')
            print('###################')

            try:
                data, sampleratde = sf.read(Path(file_path))
                wave_array_example_pre = data_transformation.waveform_to_examples(data,sampleratde,display=0)

                [embedding_batch] = sess_ext.run([output_tensor],
                    feed_dict={input_tensor: wave_array_example_pre})

                wave_arrays = pproc.postprocess(embedding_batch)

                return sess.run(prediction_tensor_inf, feed_dict={features_tensor_inf: wave_arrays})
            except Exception as err:
                # Exception (not a bare except) so Ctrl-C still interrupts;
                # the underlying error is shown because the generic message
                # alone hides the real cause.
                print('This program does not support the input file format or file does not found ')
                print(err)
                return None
        finally:
            # The extractor session was previously never closed.
            sess_ext.close()
def extract_and_predict(wav):
    """Embed `wav` with VGGish, classify the embeddings, and print the result.

    Args:
        wav: Audio file passed to `vggish_input.wavfile_to_examples`.

    Returns:
        None; the string form of the prediction is printed.
    """
    examples_batch = vggish_input.wavfile_to_examples(wav)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model, restored weights, tensor lookup.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        raw_embeddings = sess.run(
            embedding_tensor, feed_dict={features_tensor: examples_batch})
        # NOTE(review): this calls a module-level `postprocess` on
        # vggish_postprocess rather than a Postprocessor instance -- confirm
        # that function exists in this project's module.
        processed = vggish_postprocess.postprocess(raw_embeddings)
        # Turn the batch into a plain list of its rows.
        postprocessed_batch = [
            processed[row] for row in range(len(processed))
        ]

    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    print(str(pred_each_n_seconds))
Esempio n. 13
0
def main(_):
    """Compute raw VGGish embeddings and pickle them.

    When FLAGS.wav_file is set, that file is embedded and the result is
    pickled next to it with the extension replaced by '.pk'. Otherwise a
    synthetic 5-second, 1 kHz sine wave sampled at 44.1 kHz is embedded and
    the result is pickled to 'synthetic.pk' in the working directory (the
    in-memory input has no path to derive a name from; previously this
    branch crashed in `path.splitext`).

    Args:
        _: Unused positional argument.
    """
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
        output_path = path.splitext(wav_file)[0] + '.pk'
    else:
        # Write a 16-bit sine wave into an in-memory WAV file.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
        output_path = 'synthetic.pk'

    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})

    # embedding_batch holds the extracted result.
    with open(output_path, "wb") as f:
        pickle.dump(embedding_batch, f)
def create_vggish_frozen_graph():
    """Create the VGGish frozen graph.

    Clones the tensorflow/models repository and downloads the VGGish
    checkpoint into the working directory, freezes the graph up to the
    'vggish/fc2/BiasAdd' node into '/tmp/mediapipe/vggish_new.pb', and then
    removes the clone and the checkpoint again. The cleanup now also runs
    when freezing fails.
    """
    import shutil

    ckpt_path = 'vggish_model.ckpt'
    try:
        os.system('git clone https://github.com/tensorflow/models.git')
        sys.path.append('models/research/audioset/vggish/')

        # Only importable once the clone above is on sys.path.
        import vggish_slim
        os.system(
            'curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt')

        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, ckpt_path)

            saver = tf.train.Saver(tf.all_variables())

            freeze_graph.freeze_graph_with_def_protos(
                sess.graph_def,
                saver.as_saver_def(),
                ckpt_path,
                'vggish/fc2/BiasAdd',
                restore_op_name=None,
                filename_tensor_name=None,
                output_graph='/tmp/mediapipe/vggish_new.pb',
                clear_devices=True,
                initializer_nodes=None)
    finally:
        # Standard-library cleanup instead of shelling out to `rm`; missing
        # paths are tolerated, as they were with `rm -rf`.
        shutil.rmtree('models/', ignore_errors=True)
        if os.path.exists(ckpt_path):
            os.remove(ckpt_path)
Esempio n. 15
0
def extract_and_predict(wav):
    """Embed `wav` with VGGish and return the prediction as a string.

    Progress messages are printed after each major step.

    Args:
        wav: Audio file passed to `vggish_input.wavfile_to_examples`.

    Returns:
        `str()` of the value returned by `predict_with_saved_model` for the
        list of post-processed embedding rows.
    """
    print("Boom from PYTHON!!!")

    examples_batch = vggish_input.wavfile_to_examples(wav)
    print("Jerry audio_to_prediction.py: after wavfile_to_examples")

    # Postprocessor applied to the raw model embeddings.
    pproc = vggish_postprocess.Postprocessor()
    print("Jerry audio_to_prediction.py: after pproc")

    with tf.Graph().as_default(), tf.Session() as sess:
        # Inference-mode model, restored weights, tensor lookup.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        print("Jerry audio_to_prediction.py: after load vggish_slim")
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        raw_embeddings = sess.run(
            embedding_tensor, feed_dict={features_tensor: examples_batch})
        processed = pproc.postprocess(raw_embeddings)
        # Turn the batch into a plain list of its rows.
        postprocessed_batch = [
            processed[row] for row in range(len(processed))
        ]

    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    return str(pred_each_n_seconds)
Esempio n. 16
0
def main(_):
    """Train a small classifier on top of VGGish for FLAGS.num_batches steps.

    A 100-unit fully connected layer and a `_NUM_CLASSES`-way logits layer
    are stacked on the VGGish embeddings under the 'mymodel' scope. The loss
    is the mean sigmoid cross-entropy, minimised with Adam using the learning
    rate and epsilon from `vggish_params`. VGGish weights are restored from
    FLAGS.checkpoint after all variables have been initialised, and whether
    VGGish itself is trained is controlled by FLAGS.train_vggish. The step
    number and loss are printed after every batch.

    Args:
        _: Unused positional argument.
    """
    with tf.Graph().as_default(), tf.Session() as sess:
        # VGGish feature extractor.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Classifier head on top of the embeddings.
        with tf.variable_scope('mymodel'):
            hidden_units = 100
            hidden = slim.fully_connected(embeddings, hidden_units)
            logits = slim.fully_connected(
                hidden, _NUM_CLASSES, activation_fn=None, scope='logits')
            tf.sigmoid(logits, name='prediction')

            # Training ops; they are fetched by name further down.
            with tf.variable_scope('train'):
                step_var = tf.Variable(
                    0,
                    name='global_step',
                    trainable=False,
                    collections=[
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        tf.GraphKeys.GLOBAL_STEP
                    ])

                labels_ph = tf.placeholder(
                    tf.float32, shape=(None, _NUM_CLASSES), name='labels')

                per_example_xent = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=logits, labels=labels_ph, name='xent')
                mean_loss = tf.reduce_mean(per_example_xent, name='loss_op')
                tf.summary.scalar('loss', mean_loss)

                adam = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                adam.minimize(
                    mean_loss, global_step=step_var, name='train_op')

        # Initialise everything, then overwrite VGGish with the checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Look up the tensors and the op used by the training loop.
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = graph.get_operation_by_name('mymodel/train/train_op')

        fetches = [global_step_tensor, loss_tensor, train_op]
        for _ in range(FLAGS.num_batches):
            batch_features, batch_labels = _get_examples_batch()
            step, loss_value, _ = sess.run(
                fetches,
                feed_dict={
                    features_tensor: batch_features,
                    labels_tensor: batch_labels
                })
            print('Step %d: loss %g' % (step, loss_value))
def main(_):
    """Write mean VGGish embeddings per video into train/val/test HDF5 files.

    For each split, an HDF5 file named
    `opt.feat_h5 + '2016_' + <split> + '_' + opt.type + '.h5'` is (re)created
    with a float32 dataset 'feats' that has one row per id in the split's
    inclusive id range and `opt.feat_size` columns. For every id whose file
    `opt.video_root + 'video<id>.mp4.wav'` exists, the raw (not
    post-processed) embeddings are averaged over the first axis and stored
    in that id's row; rows of missing files are left unwritten.

    If FLAGS.tfrecord_file is set, a TFRecordWriter is opened on it once and
    closed at the end. No records are written to it; it is opened only to
    keep the file-creating side effect of the earlier code, which re-opened
    the writer for every file and failed with an unbound `writer` when no
    wav file existed.

    Args:
        _: Unused positional argument.
    """
    opt = parse_opt()
    writer = (tf.python_io.TFRecordWriter(FLAGS.tfrecord_file)
              if FLAGS.tfrecord_file else None)
    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            splits = zip(['train', 'val', 'test'],
                         [opt.train_range, opt.val_range, opt.test_range])
            for split_name, id_range in splits:
                first_id, last_id = id_range[0], id_range[1]
                h5_path = (opt.feat_h5 + '2016' + '_' + split_name + '_' +
                           opt.type + '.h5')
                if os.path.exists(h5_path):
                    os.remove(h5_path)

                # The context manager guarantees the file is flushed and
                # closed, which the previous code never did.
                with h5py.File(h5_path, 'w') as h5:
                    dataset_feats = h5.create_dataset(
                        'feats', ((last_id - first_id + 1), opt.feat_size),
                        dtype='float32')
                    for audio_id in range(first_id, last_id + 1):
                        wav_file = (opt.video_root + 'video' + str(audio_id) +
                                    '.mp4.wav')
                        if not os.path.isfile(wav_file):
                            continue
                        examples_batch = vggish_input.wavfile_to_examples(
                            wav_file)
                        [embedding_batch] = sess.run(
                            [embedding_tensor],
                            feed_dict={features_tensor: examples_batch})
                        dataset_feats[audio_id - first_id] = (
                            embedding_batch.mean(0))
    finally:
        if writer:
            writer.close()
def embed(wavform_slice, rate):
  """Return post-processed VGGish embeddings for a waveform slice.

  The slice is passed through `preprocessing.normalize` before being
  converted to examples. The number of examples and the post-processed
  batch with its shape are printed.

  Args:
    wavform_slice: Waveform data handed to `preprocessing.normalize`.
    rate: Sample rate passed to `vggish_input.waveform_to_examples`.

  Returns:
    The output of `Postprocessor.postprocess` for the slice's embeddings.
  """
  normalized = preprocessing.normalize(wavform_slice)
  examples_batch = vggish_input.waveform_to_examples(normalized, rate)
  print('examples_batch len: ' + str(len(examples_batch)))

  # Postprocessor applied to the raw model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  with tf.Graph().as_default(), tf.Session() as sess:
    # Inference-mode model, restored weights, tensor lookup.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    graph = sess.graph
    features_tensor = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    raw_embeddings = sess.run(embedding_tensor,
                              feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(raw_embeddings)
    print('postprocessed_batch: ')
    print(postprocessed_batch)
    print(postprocessed_batch.shape)
  return postprocessed_batch
def main(_):
    """Embed `args.wav_file` and write the result as one SequenceExample.

    The names `pca_params`, `checkpoint`, `config` and `args` are resolved
    outside this function. The example hop in `vggish_params` is overwritten
    from `args.overlap` before the examples are computed. Inference runs in
    chunks (one chunk per ten examples, between 1 and 100 chunks) and the
    post-processed embeddings are written to
    `<args.write_dir>/<movie_id>.tfrecord`, where `movie_id` is the file name
    of `args.wav_file` without directory and extension.

    Args:
        _: Unused positional argument.
    """
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)
    vggish_params.EXAMPLE_HOP_SECONDS = (
        1 - args.overlap) * vggish_params.EXAMPLE_WINDOW_SECONDS

    # os.path handles names without an extension, which the former rfind()
    # slicing truncated or emptied.
    movie_id = os.path.splitext(os.path.basename(args.wav_file))[0]
    # BytesList only accepts bytes; encode text so this also works where
    # str is not bytes.
    movie_id_bytes = (movie_id if isinstance(movie_id, bytes)
                      else movie_id.encode('utf-8'))

    with tf.Graph().as_default(), tf.Session(config=config) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        examples_batch = vggish_input.wavfile_to_examples(args.wav_file)
        num_splits = max(1, min(examples_batch.shape[0] // 10, 100))

        embedding_batch = []
        for chunk in np.array_split(examples_batch, num_splits):
            [batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: chunk})
            embedding_batch.extend(batch)

    postprocessed_batch = pproc.postprocess(np.array(embedding_batch))

    # Write the postprocessed embeddings as a SequenceExample: the movie id
    # goes into the context and each embedding row becomes one bytes-valued
    # feature in the feature list.
    seq_example = tf.train.SequenceExample(
        context=tf.train.Features(
            feature={
                'movie_id':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[movie_id_bytes]))
            }),
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                tf.train.FeatureList(feature=[
                    tf.train.Feature(bytes_list=tf.train.BytesList(
                        value=[embedding.tobytes()]))
                    for embedding in postprocessed_batch
                ])
            }))

    # The context manager closes the writer even if the write fails.
    record_path = os.path.join(args.write_dir, movie_id + '.tfrecord')
    with tf.python_io.TFRecordWriter(record_path) as writer:
        writer.write(seq_example.SerializeToString())
 def initialize_classifier(self):
     """Create the model directory if needed, load VGGish, and save.

     The directory `self.model_dir` is resolved against the current working
     directory. The pre-trained VGGish weights from `self.vggish_checkpoint`
     are loaded into `self.sess`, after which `self.save_variables()` is
     called.
     """
     model_path = os.path.join(os.getcwd(), self.model_dir)
     directory_missing = not os.path.exists(model_path)
     if directory_missing:
         os.mkdir(model_path)

     # Load pre-trained VGGish, then write the model checkpoint.
     vggish_slim.load_vggish_slim_checkpoint(
         self.sess, self.vggish_checkpoint)
     self.save_variables()
Esempio n. 21
0
def readDirectory(dirname, label):
    """Write VGGish embeddings of every WAV file in a directory as TFRecords.

    For each ``<dirname>*.wav`` file a sibling ``.tfrecord`` file is written.
    The postprocessed embeddings are split into consecutive chunks of 10 and
    each chunk becomes one SequenceExample whose context carries ``label``.
    Files with fewer than 10 embeddings yield a single SequenceExample holding
    all of them; for longer files, trailing embeddings that do not fill a
    whole chunk are dropped (this matches the original chunking).

    Args:
        dirname: Prefix that is concatenated directly with ``"*.wav"``, so it
            should end with a path separator.
        label: Integer class label stored in the ``"labels"`` context feature.
    """
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")

    wav_files = glob.glob(dirname + "*.wav")
    if not wav_files:
        # Nothing to do; avoid loading the checkpoint for an empty directory.
        return

    # The model is identical for every file, so build the graph and restore
    # the checkpoint once instead of once per WAV file.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for wav_file in wav_files:
            print(wav_file)
            try:
                examples_batch = vggish_input.wavfile_to_examples(wav_file)
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
            except Exception as err:
                # Best effort: an unreadable or unprocessable file is skipped,
                # but Ctrl-C / SystemExit are no longer swallowed.
                print("Skipping {}: {}".format(wav_file, err))
                continue
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Integer division: a float here makes range() fail on Python 3.
            num_chunks = max(1, len(postprocessed_batch) // 10)

            # Open the writer only once there is something to write, and
            # always close it, so a failure never leaks a handle or leaves an
            # empty record file behind.
            writer = tf.python_io.TFRecordWriter(wav_file[:-3] + "tfrecord")
            try:
                for i in range(num_chunks):
                    chunk = postprocessed_batch[i * 10:i * 10 + 10]
                    seq_example = tf.train.SequenceExample(
                        context=tf.train.Features(
                            feature={
                                "labels":
                                tf.train.Feature(
                                    int64_list=tf.train.Int64List(
                                        value=[label]))
                            }),
                        feature_lists=tf.train.FeatureLists(
                            feature_list={
                                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in chunk
                                ])
                            }))
                    writer.write(seq_example.SerializeToString())
            finally:
                writer.close()
Esempio n. 22
0
def extract_audioset_embedding():
    """Compute and report VGGish embeddings for ``appendixes/01.wav``.

    Reads the audio at ``vggish_params.SAMPLE_RATE``, converts it to log mel
    examples, runs them through VGGish, applies the PCA postprocessor, and
    prints the audio length, the log mel shape and the embedding shape.

    You may modify EXAMPLE_HOP_SECONDS in vggish_params.py to change the hop
    size.

    Raises:
        Exception: If ``vggish_model.ckpt`` or ``vggish_pca_params.npz`` is
            not present in the current directory.
    """
    sample_rate = vggish_params.SAMPLE_RATE

    # Paths
    audio_path = 'appendixes/01.wav'
    checkpoint_path = os.path.join('vggish_model.ckpt')
    pca_params_path = os.path.join('vggish_pca_params.npz')

    if not os.path.isfile(checkpoint_path):
        raise Exception(
            'Please download vggish_model.ckpt from '
            'https://storage.googleapis.com/audioset/vggish_model.ckpt '
            'and put it in the root of this codebase. ')

    if not os.path.isfile(pca_params_path):
        # The message now names the file to download rather than the
        # variable that holds its path.
        raise Exception(
            'Please download vggish_pca_params.npz from '
            'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
            'and put it in the root of this codebase. ')

    # Build the model in its own graph and close the session on exit, so the
    # function can be called repeatedly without clashing with variables left
    # in the default graph and without leaking the session.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        pproc = vggish_postprocess.Postprocessor(pca_params_path)

        # Read audio
        (audio, _) = read_audio(audio_path, target_fs=sample_rate)

        # Extract log mel feature
        logmel = vggish_input.waveform_to_examples(audio, sample_rate)

        # Extract embedding feature
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: logmel})

    # PCA
    postprocessed_batch = pproc.postprocess(embedding_batch)

    print('Audio length: {}'.format(len(audio)))
    print('Log mel shape: {}'.format(logmel.shape))
    print('Embedding feature shape: {}'.format(postprocessed_batch.shape))
Esempio n. 23
0
def main(_):
    """Embed ``FLAGS.wav_file`` with VGGish and optionally write a TFRecord.

    The raw and postprocessed embeddings and the resulting SequenceExample
    are printed. When ``FLAGS.tfrecord_file`` is set, the serialized
    SequenceExample is also written to that file.

    Returns:
        The string ``"No wav file"`` when ``FLAGS.wav_file`` is not set,
        otherwise ``None``.
    """
    if not FLAGS.wav_file:
        return "No wav file"
    wav_file = FLAGS.wav_file

    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    # try/finally guarantees the writer is closed even when model loading,
    # inference or serialization raises.
    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)

            # Write the postprocessed embeddings as a SequenceExample, in a
            # similar format as the features released in AudioSet. Each row
            # of the batch of embeddings corresponds to roughly a second of
            # audio (96 10ms frames), and the rows are written as a sequence
            # of bytes-valued features, where each feature value contains
            # the 128 bytes of the whitened quantized embedding.
            seq_example = tf.train.SequenceExample(
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(
                                feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in postprocessed_batch
                                ]
                            )
                    }
                )
            )
            print(seq_example)
            if writer:
                writer.write(seq_example.SerializeToString())
    finally:
        if writer:
            writer.close()
Esempio n. 24
0
def main(_):
    """Export mean VGGish embeddings of labelled audio chunks as TSV files.

    Every ``<AUDIO_CHUNKS>/<label>/<file>.wav`` is embedded with VGGish and
    the raw (not PCA-postprocessed) embeddings are averaged over the examples
    of the file. Three row-aligned files are written into ``OUTPUTDIR``:
    ``emb.tsv`` (one embedding per row), ``label.tsv`` (ontology name of the
    parent directory's label id) and ``audio.tsv`` (``<label>/<filename>``).

    Raises:
        AssertionError: If the ontology contains a duplicate id, or if the
            embeddings do not all have the same length.
        KeyError: If a directory name is not an id in the ontology.
    """
    # Map ontology ids (with '/' replaced by '_', matching directory names)
    # to their full ontology entries.
    ontology_lookup = {}
    with open(ONTROLOGY, 'r') as f:
        label_json = json.load(f)
    for entry in label_json:
        label_id = entry['id'].replace('/', '_')
        assert label_id not in ontology_lookup
        ontology_lookup[label_id] = entry
    wav_paths = glob.glob(os.path.join(AUDIO_CHUNKS, '*', '*.wav'))

    # Prepare a postprocessor to munge the model embeddings.
    # NOTE(review): it is constructed but never applied in this function.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    audio_tsv = []
    label_tsv = []
    emb_tsv = []
    if wav_paths:
        # The model does not depend on the file being processed: define it
        # and restore the checkpoint once rather than once per wav file.
        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            for wavfile in tqdm(wav_paths):
                wav_path = Path(wavfile)
                label = wav_path.parent.stem
                filename = wav_path.name
                examples_batch = vggish_input.wavfile_to_examples(wavfile)

                # Run inference and average over the examples of the file.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                emb = np.mean(embedding_batch, axis=0).tolist()

                label_tsv.append([ontology_lookup[label]['name']])
                audio_tsv.append([f'{label}/{filename}'])
                emb_tsv.append(emb)
                assert len(emb_tsv[0]) == len(emb)

    # newline='' is what the csv module expects of files it writes to; one
    # writer per file replaces one writer per row.
    with open(f'{OUTPUTDIR}/emb.tsv', 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(emb_tsv)
    with open(f'{OUTPUTDIR}/label.tsv', 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(label_tsv)
    with open(f'{OUTPUTDIR}/audio.tsv', 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(audio_tsv)
Esempio n. 25
0
def main(wav_file, npz_path):
    """Save the postprocessed VGGish embeddings of one WAV file as ``.npz``.

    Args:
        wav_file: WAV input handed to ``vggish_input.wavfile_to_examples``.
        npz_path: Destination passed to ``np.savez_compressed``; the array is
            passed positionally, so it is stored under NumPy's default key.

    Returns:
        1 when embeddings were written, 0 (after printing ``'NO'``) when the
        input produced no examples and nothing was saved.
    """
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # An empty example batch yields an empty embedding batch, so bail out
    # before paying for model construction and checkpoint loading.
    if 0 in examples_batch.shape:
        print('NO')
        return 0

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # NOTE: the original opened a TFRecordWriter on `tfrecord_file` here, a
    # name that is not defined in this function, and never wrote to or closed
    # it. It has been removed; the only output is the .npz file below.

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})

    if 0 in embedding_batch.shape:
        # Defensive: kept from the original in case the model returns an
        # empty batch for a non-empty input.
        print('NO')
        return 0

    postprocessed_batch = pproc.postprocess(embedding_batch)
    np.savez_compressed(npz_path, postprocessed_batch)
    return 1
Esempio n. 26
0
 def post_init(self):
     """Create the TF1-style session and load VGGish plus its postprocessor.

     After this call the instance exposes ``sess``, ``feature_tensor``,
     ``embedding_tensor`` and ``post_processor``.
     """
     self.to_device()
     # TensorFlow is imported only after to_device() has run; keep this order.
     import tensorflow as tf

     tf.compat.v1.disable_eager_execution()
     session = tf.compat.v1.Session()
     self.sess = session

     # Build the model and restore its weights from self.model_path.
     vggish_slim.define_vggish_slim()
     vggish_slim.load_vggish_slim_checkpoint(session, self.model_path)

     # Look up the input and output tensors by their well-known names.
     graph = session.graph
     self.feature_tensor = graph.get_tensor_by_name(
         vggish_params.INPUT_TENSOR_NAME)
     self.embedding_tensor = graph.get_tensor_by_name(
         vggish_params.OUTPUT_TENSOR_NAME)

     self.post_processor = vggish_postprocess.Postprocessor(self.pca_path)
Esempio n. 27
0
 def __init__(self):
     """Build a private graph/session pair holding the VGGish model.

     The model is defined in inference mode inside ``self.graph``, its
     weights are restored from ``FLAGS.checkpoint``, and the input and
     output tensors are stored as ``features_tensor`` and
     ``embedding_tensor``.
     """
     graph = tf.Graph()
     self.graph = graph
     with graph.as_default():
         session = tf.Session(graph=graph)
         self.sess = session

         # Inference-mode model definition followed by weight restoration.
         vggish_slim.define_vggish_slim(training=False)
         vggish_slim.load_vggish_slim_checkpoint(session, FLAGS.checkpoint)

         # Input / output tensors, looked up by name in the session's graph.
         session_graph = session.graph
         self.features_tensor = session_graph.get_tensor_by_name(
             vggish_params.INPUT_TENSOR_NAME)
         self.embedding_tensor = session_graph.get_tensor_by_name(
             vggish_params.OUTPUT_TENSOR_NAME)
Esempio n. 28
0
 def _build_model(self):
     """Load VGGish into ``self.sess`` and set up the embedding postprocessor.

     Defines the model in inference mode, restores the weights from
     ``model_checkpoint``, stores the input and output tensors on the
     instance, and creates a ``Postprocessor`` from ``pca_params``.
     """
     session = self.sess

     # Model definition and checkpoint restoration.
     vggish_slim.define_vggish_slim(training=False)
     vggish_slim.load_vggish_slim_checkpoint(session, model_checkpoint)

     # Input and output tensors, looked up by name.
     graph = session.graph
     self.features_tensor = graph.get_tensor_by_name(
         vggish_params.INPUT_TENSOR_NAME)
     self.embedding_tensor = graph.get_tensor_by_name(
         vggish_params.OUTPUT_TENSOR_NAME)

     # Postprocessor used to munge the model embeddings.
     self.pproc = vggish_postprocess.Postprocessor(pca_params)
Esempio n. 29
0
def processList(filelist):
    """Extract VGGish embeddings for every audio file named in a list file.

    Each non-blank line of ``filelist`` is a path relative to ``audio_root``
    (a backslash-escaped space is turned into a plain space). The audio is
    loaded at its native sampling rate, wrapped-padded to at least one second,
    embedded with VGGish and postprocessed. The result is saved as
    comma-separated integers in
    ``<output_root>/<parent dir>/<file name>.txt``. Finally the list of
    written feature files is handed to ``create_file(output_file, ...)``.

    Args:
        filelist: Path of a text file with one audio path per line.
    """
    # Read the list up front so the file handle is closed right away.
    with open(filelist) as input_list:
        input_lists = input_list.readlines()

    out_file_list = []
    if input_lists:
        # Neither the postprocessor nor the model depends on the audio file:
        # create them once instead of once per file.
        pproc = vggish_postprocess.Postprocessor(pca_params_path)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:
            vggish_slim.define_vggish_slim()
            vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # enumerate(tqdm(...)) lets tqdm see the list length.
            for i, infl in enumerate(tqdm(input_lists)):
                if not infl.strip():
                    # A blank line (e.g. a trailing newline) names no file.
                    continue
                # "\\ " is a literal backslash followed by a space.
                infl_new = audio_root + '/' + infl.replace("\\ ", " ")
                y, sr = librosa.load(infl_new.strip(), sr=None)
                if len(y) < sr:
                    # Repeat the clip until it is one second long.
                    y = np.pad(y, (0, sr - len(y)), 'wrap')

                # Produce a batch of log mel spectrogram examples.
                input_batch = vggish_input.waveform_to_examples(y, sr)
                print('Log Mel Spectrogram example: ', input_batch[0])

                # Run the batch through the model to produce embeddings.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: input_batch})
                print('VGGish embedding: done ', i)

                # Postprocess the results to produce whitened quantized
                # embeddings.
                postprocessed_batch = pproc.postprocess(embedding_batch)

                infl_list = infl_new.strip().split("/")
                file_name = infl_list[-1].strip()
                out_dir = output_root + "/" + infl_list[-2]
                # exist_ok avoids the check-then-create race.
                os.makedirs(out_dir, exist_ok=True)
                featfile = str(out_dir) + '/' + str(file_name) + '.txt'
                out_file_list.append(featfile.strip())
                np.savetxt(featfile,
                           postprocessed_batch.astype(int),
                           fmt='%i',
                           delimiter=",")
    create_file(output_file, out_file_list)
def OutputAudioEmbeddings(
        pathIn,
        row,
        output_root='/lfs01/workdirs/shams010/shams010u1/code/audio_features/'
):
    """Save the postprocessed VGGish embeddings of one video's audio track.

    The WAV path is derived from ``row['video_path']`` (with the downloader
    placeholder ``%(ext)s`` replaced by ``wav``) and redirected into the
    ``cut`` sub-directory of its split. If that file exists, its embeddings
    are printed and saved with ``np.save`` to
    ``output_root + split + '/' + video_id``; if not, nothing happens.

    Args:
        pathIn: Root directory that ``row['video_path']`` is relative to.
        row: Mapping with the keys ``'video_id'``, ``'video_path'`` and
            ``'split'`` (``'train'`` or ``'test'``).
        output_root: Prefix of the output location, concatenated as-is, so it
            should end with a path separator. Defaults to the location that
            was previously hard-coded.

    Raises:
        ValueError: If ``row['split']`` is neither ``'train'`` nor ``'test'``
            (previously this surfaced as an UnboundLocalError).
    """
    video_id = row['video_id']
    video_path = row['video_path']
    split = row['split']
    if split not in ('train', 'test'):
        raise ValueError(
            "Unsupported split {!r}; expected 'train' or 'test'".format(split))

    full_path = os.path.join(pathIn, video_path)
    full_path = full_path.replace("%(ext)s",
                                  "wav")  # output file of the downloader path
    # NOTE(review): this replaces every occurrence of the split name in the
    # path, not only the directory component. Confirm that is intended.
    full_path_cut = full_path.replace(split, split + "/cut")

    if not os.path.isfile(full_path_cut):
        return

    examples_batch = vggish_input.wavfile_to_examples(full_path_cut)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # NOTE: the original also opened a TFRecordWriter on FLAGS.tfrecord_file
    # here but never wrote to it or closed it. It has been removed.

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        np.save(output_root + split + '/' + video_id, postprocessed_batch)
Esempio n. 31
0
def main(_):
  """Train a small classifier head on top of VGGish for a few batches.

  Builds VGGish plus a 100-unit hidden layer and a ``_NUM_CLASSES``-way
  sigmoid classifier under the ``mymodel`` scope, restores the pre-trained
  VGGish weights from ``FLAGS.checkpoint``, and runs ``FLAGS.num_batches``
  training steps on batches from ``_get_examples_batch()``, printing the
  step count and loss after each one.
  """
  with tf.Graph().as_default(), tf.Session() as sess:
    # VGGish itself; FLAGS.train_vggish is forwarded to the model definition.
    vggish_embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

    # Shallow classification model and its training ops on top of VGGish.
    with tf.variable_scope('mymodel'):
      # One hidden fully connected layer with 100 units.
      hidden = slim.fully_connected(vggish_embeddings, 100)

      # Parallel logistic classifiers, one per class, so that several
      # classes can be active at once.
      class_logits = slim.fully_connected(
          hidden, _NUM_CLASSES, activation_fn=None, scope='logits')
      tf.sigmoid(class_logits, name='prediction')

      with tf.variable_scope('train'):
        step_var = tf.Variable(
            0,
            name='global_step',
            trainable=False,
            collections=[
                tf.GraphKeys.GLOBAL_VARIABLES,
                tf.GraphKeys.GLOBAL_STEP,
            ])

        # Labels arrive as a batch of multi-hot vectors: 1 at the position
        # of each positive class, 0 everywhere else.
        label_input = tf.placeholder(
            tf.float32, shape=(None, _NUM_CLASSES), name='labels')

        # Mean sigmoid cross-entropy over the batch.
        mean_xent = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=class_logits, labels=label_input, name='xent'),
            name='loss_op')
        tf.summary.scalar('loss', mean_xent)

        # Same optimizer and hyperparameters as used to train VGGish.
        tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON,
        ).minimize(mean_xent, global_step=step_var, name='train_op')

    # Initialize every variable first, then overwrite the VGGish ones with
    # the pre-trained checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

    # Fetch the tensors and ops the training loop needs, by name.
    graph = sess.graph
    features_tensor = graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = graph.get_tensor_by_name('mymodel/train/labels:0')
    global_step_tensor = graph.get_tensor_by_name(
        'mymodel/train/global_step:0')
    loss_tensor = graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = graph.get_operation_by_name('mymodel/train/train_op')
    fetches = [global_step_tensor, loss_tensor, train_op]

    # The training loop.
    for _batch_index in range(FLAGS.num_batches):
      batch_features, batch_labels = _get_examples_batch()
      step_count, loss_value, _unused = sess.run(
          fetches,
          feed_dict={
              features_tensor: batch_features,
              labels_tensor: batch_labels,
          })
      print('Step %d: loss %g' % (step_count, loss_value))
Esempio n. 32
0
def main(_):
  """Run VGGish on one audio input and optionally write a TFRecord.

  The input is ``FLAGS.wav_file`` when set; otherwise a 5-second 1 kHz sine
  wave sampled at 44.1 kHz is synthesized into an in-memory WAV file. The
  example batch, the raw and postprocessed embeddings and the resulting
  SequenceExample are printed. When ``FLAGS.tfrecord_file`` is set, the
  serialized SequenceExample is also written to that file.
  """
  # In this simple example, we run the examples from a single audio file
  # through the model. If none is provided, we generate a synthetic input.
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
  else:
    # Write a WAV of a sine wav into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  print(examples_batch)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  # If needed, prepare a record writer to store the postprocessed embeddings.
  writer = tf.python_io.TFRecordWriter(
      FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

  # try/finally guarantees the writer is closed even when model loading,
  # inference or serialization raises.
  try:
    with tf.Graph().as_default(), tf.Session() as sess:
      # Define the model in inference mode, load the checkpoint, and
      # locate input and output tensors.
      vggish_slim.define_vggish_slim(training=False)
      vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)

      # Run inference and postprocessing.
      [embedding_batch] = sess.run(
          [embedding_tensor],
          feed_dict={features_tensor: examples_batch})
      print(embedding_batch)
      postprocessed_batch = pproc.postprocess(embedding_batch)
      print(postprocessed_batch)

      # Write the postprocessed embeddings as a SequenceExample, in a
      # similar format as the features released in AudioSet. Each row of
      # the batch of embeddings corresponds to roughly a second of audio
      # (96 10ms frames), and the rows are written as a sequence of
      # bytes-valued features, where each feature value contains the 128
      # bytes of the whitened quantized embedding.
      seq_example = tf.train.SequenceExample(
          feature_lists=tf.train.FeatureLists(
              feature_list={
                  vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                      tf.train.FeatureList(
                          feature=[
                              tf.train.Feature(
                                  bytes_list=tf.train.BytesList(
                                      value=[embedding.tobytes()]))
                              for embedding in postprocessed_batch
                          ]
                      )
              }
          )
      )
      print(seq_example)
      if writer:
        writer.write(seq_example.SerializeToString())
  finally:
    if writer:
      writer.close()
Esempio n. 33
0
# Script fragment: checks the VGGish input pipeline and model on a synthetic
# sine wave.
# NOTE(review): `num_secs`, `freq`, `rel_error` and `checkpoint_path` are not
# defined in this fragment; they are presumably set earlier in the script.

# Synthesize `num_secs` seconds of a `freq` Hz sine wave sampled at `sr` Hz.
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
# Expect one example per second of audio, each NUM_FRAMES x NUM_BANDS.
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  # Locate the model's input and output tensors by name.
  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  [embedding_batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})
  print('VGGish embedding: ', embedding_batch[0])
  # Reference mean and standard deviation of the embedding batch; the check
  # below allows a relative deviation of `rel_error`.
  expected_embedding_mean = 0.131
  expected_embedding_std = 0.238
  np.testing.assert_allclose(
      [np.mean(embedding_batch), np.std(embedding_batch)],
      [expected_embedding_mean, expected_embedding_std],
      rtol=rel_error)