Code example #1
File: vggish_train_demo.py  Project: 812864539/models
def _get_examples_batch():
  """Returns a shuffled batch of examples of all audio classes.

  Note that this is just a toy function because this is a simple demo intended
  to illustrate how the training code might work.

  Returns:
    a tuple (features, labels) where features is a NumPy array of shape
    [batch_size, num_frames, num_bands] where the batch_size is variable and
    each row is a log mel spectrogram patch of shape [num_frames, num_bands]
    suitable for feeding VGGish, while labels is a NumPy array of shape
    [batch_size, num_classes] where each row is a multi-hot label vector that
    provides the labels for corresponding rows in features.
  """
  # Make a waveform for each class.
  num_seconds = 5
  sr = 44100  # Sampling rate.
  t = np.linspace(0, num_seconds, int(num_seconds * sr))  # Time axis.
  # Random sine wave.
  freq = np.random.uniform(100, 1000)
  sine = np.sin(2 * np.pi * freq * t)
  # Random constant-slope ramp (the demo's "const" class).
  magnitude = np.random.uniform(-1, 1)
  const = magnitude * t
  # Gaussian noise (drawn with mean -1 and std 1 in this demo).
  noise = np.random.normal(-1, 1, size=t.shape)

  # Make examples of each signal and corresponding labels.
  # Sine is class index 0, Const class index 1, Noise class index 2.
  sine_examples = vggish_input.waveform_to_examples(sine, sr)
  sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0])
  const_examples = vggish_input.waveform_to_examples(const, sr)
  const_labels = np.array([[0, 1, 0]] * const_examples.shape[0])
  noise_examples = vggish_input.waveform_to_examples(noise, sr)
  noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0])

  # Shuffle (example, label) pairs across all classes.
  all_examples = np.concatenate((sine_examples, const_examples, noise_examples))
  all_labels = np.concatenate((sine_labels, const_labels, noise_labels))
  labeled_examples = list(zip(all_examples, all_labels))
  shuffle(labeled_examples)

  # Separate and return the features and labels.
  features = [example for (example, _) in labeled_examples]
  labels = [label for (_, label) in labeled_examples]
  return (features, labels)
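The returned features and labels are plain Python lists of per-example arrays. As a quick sanity check, here is a minimal sketch of stacking them into NumPy batches and verifying the shapes the docstring promises; the 96 × 64 patch size assumes the default vggish_params, and the three label columns correspond to the toy sine/const/noise classes:

import numpy as np

features, labels = _get_examples_batch()
features = np.stack(features)  # [batch_size, num_frames, num_bands]
labels = np.stack(labels)      # [batch_size, num_classes]
assert features.shape[1:] == (96, 64)          # default vggish_params.NUM_FRAMES, NUM_BANDS
assert labels.shape == (features.shape[0], 3)  # one label row per example
print(features.shape, labels.shape)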
Code example #2
File: vggish_smoke_test.py  Project: 812864539/models
checkpoint_path = 'vggish_model.ckpt'
pca_params_path = 'vggish_pca_params.npz'

# Relative tolerance of errors in mean and standard deviation of embeddings.
rel_error = 0.1  # Up to 10%

# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate
# to test resampling to 16 kHz during feature extraction).
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
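The excerpt stops after locating the input and output tensors. A hedged sketch of how the smoke test might continue, feeding the batch through the model; the real test then compares the embedding mean and standard deviation against checkpoint-specific reference constants (omitted here) with rel_error as the tolerance:

  # Run the batch of log mel examples through VGGish to get embeddings.
  [embedding_batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})
  print('VGGish embedding: ', embedding_batch[0])
  np.testing.assert_equal(
      embedding_batch.shape,
      (input_batch.shape[0], vggish_params.EMBEDDING_SIZE))

  # The released smoke test checks these statistics with
  # np.testing.assert_allclose(..., rtol=rel_error) against its reference values.
  print('Embedding mean/std:', np.mean(embedding_batch), np.std(embedding_batch))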
Code example #3
parser.add_argument('--output_path', default='/home/ICT2000/jondras/dvra_datasets/mimicry/audio_features')
args = parser.parse_args()

start_time = time.time()
cnt = 0
for audio_filepath in sorted(glob.glob(f'{args.input_path}/*.wav')):

    audio_basename = audio_filepath.split('/')[-1].split('.')[0]
    print(f'Extracting features for {audio_basename}')

    wav_rate, wav_samples = wavfile.read(audio_filepath)
    print(f'\t{wav_rate} {len(wav_samples)}')
    if wav_samples.dtype == np.int16:
        wav_samples = wav_samples / 32768.0  # waveform_to_examples expects floats in [-1.0, +1.0]
    if len(wav_samples) < wav_rate:
        wav_samples = np.pad(wav_samples, (0, wav_rate - len(wav_samples)), 'constant')

    samples = vggish_input.waveform_to_examples(wav_samples, wav_rate)

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, args.model_file)
    
        samples_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        features_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

        print(time.time() - start_time)
        [features] = sess.run([features_tensor], feed_dict={samples_tensor: samples})
        print(time.time() - start_time)

    print(f'\t{features.shape}')
    # print(f'\t{features}')
    np.save(f'{args.output_path}/{audio_basename}.npy', features)
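Rebuilding the graph and reloading the checkpoint for every file is the slowest part of this loop. A sketch of hoisting the session setup out of the loop, assuming the same imports and the input_path/model_file arguments from the truncated argparse section above:

# Build the graph and load the checkpoint once, then reuse the session per file.
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, args.model_file)
    samples_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    features_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

    for audio_filepath in sorted(glob.glob(f'{args.input_path}/*.wav')):
        audio_basename = audio_filepath.split('/')[-1].split('.')[0]
        wav_rate, wav_samples = wavfile.read(audio_filepath)
        if wav_samples.dtype == np.int16:
            wav_samples = wav_samples / 32768.0  # rescale 16-bit PCM to [-1.0, +1.0]
        if len(wav_samples) < wav_rate:
            wav_samples = np.pad(wav_samples, (0, wav_rate - len(wav_samples)), 'constant')
        samples = vggish_input.waveform_to_examples(wav_samples, wav_rate)
        [features] = sess.run([features_tensor], feed_dict={samples_tensor: samples})
        np.save(f'{args.output_path}/{audio_basename}.npy', features)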
Code example #4
File: audio_tagging.py  Project: ZhicunXu/AudioTagger
def prepareAudio(WAV_FILE):
    fs, data = wavfile.read(WAV_FILE)
    #data = data[0]
    data = truncateAudioFs(data, fs)
    data = data / 32768.0  # rescale 16-bit PCM samples to [-1.0, +1.0]
    return vggish_input.waveform_to_examples(data, fs)
Code example #5
def _get_examples_batch():
    """Returns a shuffled batch of examples of all audio classes.

    Note that this is just a toy function because this is a simple demo intended
    to illustrate how the training code might work.

    Returns:
      a tuple (features, labels) where features is a NumPy array of shape
      [batch_size, num_frames, num_bands] where the batch_size is variable and
      each row is a log mel spectrogram patch of shape [num_frames, num_bands]
      suitable for feeding VGGish, while labels is a NumPy array of shape
      [batch_size, num_classes] where each row is a multi-hot label vector that
      provides the labels for corresponding rows in features.
    """
    # Make a waveform for each class.
    num_seconds = 5
    sr = 44100  # Sampling rate.
    t = np.linspace(0, num_seconds, int(num_seconds * sr))  # Time axis.
    # Random sine wave.
    freq = np.random.uniform(100, 1000)
    sine = np.sin(2 * np.pi * freq * t)
    # Random constant-slope ramp (the demo's "const" class).
    magnitude = np.random.uniform(-1, 1)
    const = magnitude * t
    # Gaussian noise (drawn with mean -1 and std 1 in this demo).
    noise = np.random.normal(-1, 1, size=t.shape)

    # Make examples of each signal and corresponding labels.
    # Sine is class index 0, Const class index 1, Noise class index 2.
    sine_examples = vggish_input.waveform_to_examples(sine, sr)
    sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0])
    const_examples = vggish_input.waveform_to_examples(const, sr)
    const_labels = np.array([[0, 1, 0]] * const_examples.shape[0])
    noise_examples = vggish_input.waveform_to_examples(noise, sr)
    noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0])

    # Shuffle (example, label) pairs across all classes.
    all_examples = np.concatenate(
        (sine_examples, const_examples, noise_examples))
    all_labels = np.concatenate((sine_labels, const_labels, noise_labels))
    labeled_examples = list(zip(all_examples, all_labels))
    shuffle(labeled_examples)

    # Separate and return the features and labels.
    features = [example for (example, _) in labeled_examples]
    labels = [label for (_, label) in labeled_examples]
    return (features, labels)
Code example #6
        # Slice audio according to segments
        slicer = essentia.standard.Slicer(startTimes=[s[0] for s in segments],
                                          endTimes=[s[1] for s in segments],
                                          timeUnits='samples',
                                          sampleRate=22050)

        sliced_audio = np.hstack(slicer(audio))

        sliced_duration = essentia.standard.Duration(
            sampleRate=22050)(sliced_audio)

        # Collate the active segments
        active_y = np.hstack(
            [audio[segment[0]:segment[1]] for segment in segments])

        input_batch = vggish_input.waveform_to_examples(active_y, sampleRate)
        predictions = np.argmax(model.predict(input_batch[:, :, :, None]),
                                axis=1)

        p_music = 0
        p_speech = 0
        p_sfx = 0
        sum_ = len(predictions)
        for n in range(len(predictions)):
            if predictions[n] == 0:
                p_music += 1 / sum_
            elif predictions[n] == 1:
                p_speech += 1 / sum_
            elif predictions[n] == 2:
                p_sfx += 1 / sum_
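The counting loop above can be collapsed into one vectorized pass; a brief equivalent sketch using the same class indices (0 music, 1 speech, 2 sfx):

        # Equivalent, vectorized: fraction of examples predicted as each class.
        counts = np.bincount(predictions, minlength=3)[:3]
        p_music, p_speech, p_sfx = counts / len(predictions)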
Code example #7
File: vggish_infer.py  Project: tombarkerml/models
#     t = np.arange(0, num_secs, 1 / sr)
#     x = np.sin(2 * np.pi * freq * t)
#     # Convert to signed 16-bit samples.
#     samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
#     wav_file = six.BytesIO()
#     soundfile.write(wav_file, samples, sr, format='WAV', subtype='PCM_16')
#     wav_file.seek(0)
#     examples_frames = vggish_input.wavfile_to_frames(wav_file) #reset the seek on the in memory wav object
#     wav_file.seek(0)
#
#
# examples_batch = vggish_input.wavfile_to_examples(wav_file)
# wav_file.seek(0)

example_waveform = generate_audio.gen_audio(3, 16000)
examples_batch = vggish_input.waveform_to_examples(example_waveform, 16000)
examples_frames = vggish_input.wavform_to_frames(example_waveform, 16000)

# Since there is no zero padding, the last frames from 'wavform_to_frames' are not
# aligned with any of the log mel features, so they need to be discarded.
num_frames_to_keep = examples_batch.shape[0] * examples_batch.shape[1]
examples_frames = examples_frames[0:num_frames_to_keep, :]

print(examples_batch)

# Prepare a postprocessor to munge the model embeddings.
pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

# If needed, prepare a record writer to store the postprocessed embeddings.
writer = tf.python_io.TFRecordWriter(
    FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
Code example #8
def main(_):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    #
    print("begin vggish_inference_demo file")
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wave into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)

    output_csv_dict, examples_batch = vggish_input.wavfile_to_examples(
        wav_file)
    print("after initializing wav_file")

    num_secs = 3
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)

    # Produce a batch of log mel spectrogram examples.
    _, input_smoke_test_batch = vggish_input.waveform_to_examples(
        x, sr, wav_file)

    print(examples_batch)

    print("Shape of examples batch:", examples_batch.shape,
          "Shape of smoke test batch: ", input_smoke_test_batch.shape)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    # Things to try
    # input noise tensor
    # example audio

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print("embedding_batch")
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print("postprocessed_batch")
        print(postprocessed_batch)

        dict_to_csv(output_csv_dict, postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
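For completeness, a hedged sketch of reading the postprocessed embeddings back out of the TFRecord written above, using the same TF1-style API (assumes the FLAGS, NumPy/TensorFlow imports, and vggish_params from this script):

# Each SequenceExample row holds one ~0.96 s embedding as 128 quantized uint8 bytes.
for record in tf.python_io.tf_record_iterator(FLAGS.tfrecord_file):
    seq = tf.train.SequenceExample.FromString(record)
    feature_list = seq.feature_lists.feature_list[
        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME]
    embeddings = np.array([
        np.frombuffer(f.bytes_list.value[0], dtype=np.uint8)
        for f in feature_list.feature
    ])
    print('Recovered embeddings with shape', embeddings.shape)  # [num_examples, 128]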
Code example #9
File: server.py  Project: tuyenbk/SoundWatch
def handle_source(json_data):
    data = str(json_data['data'])
    data = data[1:-1]
    global graph
    np_wav = np.fromstring(data, dtype=np.int16, sep=',') / \
        32768.0  # Convert to [-1.0, +1.0]
    # Compute RMS and convert to dB
    print('Successfully convert to NP rep', np_wav)
    rms = np.sqrt(np.mean(np_wav**2))
    db = dbFS(rms)
    print('Db...', db)
    # Make predictions
    print('Making prediction...')
    x = waveform_to_examples(np_wav, RATE)
    predictions = []
    # Run the model inside the shared graph; reshape to the [batch, 96, 64, 1]
    # input expected by the VGGish-style Keras model.
    with graph.as_default():
        if x.shape[0] != 0:
            x = x.reshape(len(x), 96, 64, 1)
            print('Successfully reshape x', x.shape)
            pred = model.predict(x)
            predictions.append(pred)

    for prediction in predictions:
        context_prediction = np.take(
            prediction[0], [homesounds.labels[x] for x in active_context])
        m = np.argmax(context_prediction)
        print('Max prediction', str(
            homesounds.to_human_labels[active_context[m]]), str(context_prediction[m]))
        if (context_prediction[m] > PREDICTION_THRES and db > DBLEVEL_THRES):
            socketio.emit('audio_label',
                          {'label': str(homesounds.to_human_labels[active_context[m]]),
                           'accuracy': str(context_prediction[m])})
            print("Prediction: %s (%0.2f)" % (
                homesounds.to_human_labels[active_context[m]], context_prediction[m]))
    # Note: this emits for every request, regardless of the threshold check above.
    socketio.emit('audio_label',
                  {
                      'label': 'Unrecognized Sound',
                      'accuracy': '1.0'
                  })


def background_thread():
    """Example of how to send server generated events to clients."""
    count = 0
    while True:
        socketio.sleep(10)
        count += 1
        socketio.emit('my_response',
                      {'data': 'Server generated event', 'count': count},
                      namespace='/test')


@app.route('/')
def index():
    return render_template('index.html',)

@socketio.on('send_message')
def handle_source(json_data):
    print('Receive message...' + str(json_data['message']))
    text = json_data['message'].encode('ascii', 'ignore')
    socketio.emit('echo', {'echo': 'Server Says: ' + str(text)})
    print('Sending message back..')

@socketio.on('disconnect_request', namespace='/test')
def disconnect_request():
    @copy_current_request_context
    def can_disconnect():
        disconnect()

    session['receive_count'] = session.get('receive_count', 0) + 1
    # for this emit we use a callback function
    # when the callback function is invoked we know that the message has been
    # received and it is safe to disconnect
    emit('my_response',
         {'data': 'Disconnected!', 'count': session['receive_count']},
         callback=can_disconnect)

@socketio.on('connect', namespace='/test')
def test_connect():
    global thread
    with thread_lock:
        if thread is None:
            thread = socketio.start_background_task(background_thread)
    emit('my_response', {'data': 'Connected', 'count': 0})


@socketio.on('disconnect', namespace='/test')
def test_disconnect():
    print('Client disconnected', request.sid)


if __name__ == '__main__':
    socketio.run(app, debug=True)
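handle_source above expects json_data['data'] to be a bracketed, comma-separated string of int16 samples (the brackets are stripped by data[1:-1]). A hedged sketch of a matching client-side payload; the waveform, the 16 kHz rate, and the event name are illustrative assumptions, not taken from the project:

import numpy as np

# One second of synthetic audio, serialized the way handle_source parses it.
waveform = np.random.uniform(-1, 1, size=16000)
samples = (waveform * 32767).astype(np.int16)
payload = {'data': str(samples.tolist())}  # e.g. "[12, -35, ...]"
# sio.emit('audio_data', payload)          # hypothetical event name / socketio client call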
Code example #10
    def wav2mel(self, blob, sample_rate):
        self.logger.debug(f'blob: {blob.shape}, sample_rate: {sample_rate}')
        mel_spec = vggish_input.waveform_to_examples(blob,
                                                     sample_rate).squeeze()
        self.logger.debug(f'mel_spec: {mel_spec.shape}')
        return mel_spec
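For orientation, a hedged sketch of what the trailing squeeze() does to the output shape; the waveform is synthetic and the 96 × 64 patch size assumes the default vggish_params:

import numpy as np
import vggish_input

blob = np.random.uniform(-1, 1, size=16000)  # ~1 s of mono audio at 16 kHz in [-1.0, +1.0]
print(vggish_input.waveform_to_examples(blob, 16000).squeeze().shape)               # (96, 64)
print(vggish_input.waveform_to_examples(np.tile(blob, 5), 16000).squeeze().shape)   # (5, 96, 64)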
Code example #11
df = pd.read_csv('data.csv')
df['File'] = df['Category'] + '/' + df['File']
idx, labels, events, files = df.index.values, df.Category.values, df.Event.values, df.File.values

df_eval = pd.read_csv('Logsheet_Evaluation.csv')

files_eval = df_eval.File.values
audio_path = '/home/tianxiangchen1/cssvp/Evaluation/16k/'
embedding_path = '/home/tianxiangchen1/cssvp/embeddings/from_file/Evaluation/'

X_eval = []
X_eval_2 = []
for f in files_eval:
    wav = read_audio(audio_path + f)
    Sxx = vggish_input.waveform_to_examples(wav, sample_rate=16000)
    Sxx = np.vstack(Sxx)
    X_eval.append(Sxx.reshape(1, Sxx.shape[0], Sxx.shape[1], 1))
    feat = np.load(embedding_path + f + '.npy')
    X_eval_2.append(feat.reshape(1, feat.shape[0], feat.shape[1]))
X_eval = np.vstack(X_eval)
X_eval_2 = np.vstack(X_eval_2)

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# Encoding main task
label_enc = LabelEncoder()
enc = OneHotEncoder(sparse=False)
y_int = label_enc.fit_transform(labels)
y_int = y_int.reshape(len(y_int), 1)
y_one_hot = enc.fit_transform(y_int)
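For reference, a hedged sketch of the same one-hot encoding on newer scikit-learn (assumes version >= 1.2, where OneHotEncoder accepts string labels directly and the sparse argument was renamed sparse_output):

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse_output=False)
y_one_hot = enc.fit_transform(labels.reshape(-1, 1))
# enc.categories_[0] lists the classes in sorted order, matching LabelEncoder above.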