Code example #1
from vad import VoiceActivityDetector
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Analyze input wave-file and save detected speech interval to json file.')
    # parser.add_argument('inputfile', metavar='INPUTWAVE',
    #                    help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    # Create a speech detection object and read in the audio data
    # (the input path is hardcoded: 'FILE_NAME.wav' is a placeholder, and the
    #  commented-out 'inputfile' argument above is unused)
    v = VoiceActivityDetector('FILE_NAME.wav')

    # Detect speech in the given data
    raw_detection = v.detect_speech()

    # Convert the detected speech signals to a readable format
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)

    # Save the converted and detected speech to a file
    save_to_file(speech_labels, args.outputfile)
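
The labels are written as plain JSON, so they can be read back with the standard library. A minimal sketch of consuming the output file, assuming (as example #4 below suggests) that each entry is a dict with 'speech_begin' and 'speech_end' keys given in seconds; the output path here is hypothetical:

import json

with open('speech_intervals.json') as fp:   # hypothetical output path
    intervals = json.load(fp)

for seg in intervals:
    print('speech from {:.2f}s to {:.2f}s'.format(seg['speech_begin'],
                                                  seg['speech_end']))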
Code example #2
# Snippet starts mid-script: the wave/numpy imports, FILE_FOLDER, FILE_NAME,
# fullFileName and x (assumed to be a pydub AudioSegment already downmixed to
# mono) are defined earlier in the original file.
x.export(FILE_FOLDER + FILE_NAME + '-mono.wav', format="wav")
waveFile = wave.open(FILE_FOLDER + FILE_NAME + '-mono.wav', 'r')

# Getting some variables from file
signal = waveFile.readframes(-1)
signal = np.frombuffer(signal, dtype=np.int16)  # np.fromstring is deprecated for binary data
frameRate = waveFile.getframerate()
Time = np.linspace(0, len(signal) / frameRate, num=len(signal))
totalFrames = waveFile.getnframes()
wavFileLengthSec = totalFrames / frameRate

#  VOICE ACTIVITY DETECTION (VAD)

vad = VoiceActivityDetector(fullFileName)
raw_detection = vad.detect_speech()
speech_labels, starters, enders = vad.convert_windows_to_readible_labels(
    raw_detection)

# list of beginnings of voice detected segments (in frames)
starterFrames = [int(i) for i in starters]
# list of endings of voice detected segments (in frames)
enderFrames = [int(i) for i in enders]

# if the last voice detected segment isn't closed, use the last frame as end of this segment
if len(starterFrames) > len(enderFrames):
    enderFrames.append(totalFrames)

# list of beginnings of voice detected segments (in milliseconds)
starterMs = [int((i / frameRate) * 1000) for i in starterFrames]
# list of endings of voice detected segments (in milliseconds)
enderMs = [int((i / frameRate) * 1000) for i in enderFrames]
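
Because pydub AudioSegments slice by milliseconds, the detected segments can be cut straight out of the audio. A hedged sketch, assuming x is the mono AudioSegment exported at the top of example #2:

for idx, (startMs, endMs) in enumerate(zip(starterMs, enderMs)):
    segment = x[startMs:endMs]  # pydub slicing uses millisecond indices
    segment.export(FILE_FOLDER + FILE_NAME + '-speech' + str(idx) + '.wav',
                   format="wav")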
Code example #3
File: main.py  Project: cpickard828/superSensor (Python 2 code)
    def run(self):
        #global my_mutex
        global numRecordings
        global audioNum
        global processNum

        megaArray = []
        currDay = 1
        fileChar = "A"
        wavFile = ""
        #while processNum < numRecordings:
        while goingFlag == True or processNum < audioNum:
            # Busy-wait until the recording thread has produced a new wav file.
            while processNum >= audioNum:
                y = 0
            #print processNum
            try:
                if processNum > 0:
                    subprocess.check_output(['rm', wavFile])
                #jsonFile = "/home/pi/results" + `processNum` + ".json"
                wavFile = dir_path + "/talking" + `processNum` + ".wav"
                print "Processing " + wavFile
                #foFile = "fo" + `processNum` + ".fo"
                #pmFile = "pm" + `processNum` + ".pm"
                timeEnd = q.get()
                tempDay = currDay
                currDay = qSoundJson.get()
                if tempDay != currDay:
                    megaArray = []
                soundName = dir_path + "/data/day" + str(
                    currDay) + "/" + fileChar + "/sound" + str(
                        fileNum) + ".json"
                if fileChar == "A":
                    fileChar = "B"
                else:
                    fileChar = "A"
                print timeEnd
                #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
                v = VoiceActivityDetector(wavFile)
                ampData = v._calculate_amplitude(v.data)

                if np.percentile(ampData, 75) < 200:
                    print "AUTO"
                    speech_labels = []
                    speech_dict = {}
                    speech_dict['percent_time_w_speech'] = 0.0
                    speech_dict['time_started'] = round(timeEnd, 3)
                    speech_dict['length_of_recording'] = round(
                        lengthRecording, 3)
                    speech_dict['mean_amp'] = round(
                        float(float(sum(ampData)) / len(ampData)), 3)
                    speech_dict['med_amp'] = round(np.median(ampData), 3)
                    speech_dict['25_amp'] = round(np.percentile(ampData, 25),
                                                  3)
                    speech_dict['75_amp'] = round(np.percentile(ampData, 75),
                                                  3)
                    speech_dict['max_amp'] = round(np.amax(ampData), 3)
                    speech_dict['processed'] = 0
                    speech_labels.append(speech_dict)
                    print "AUTO DONE"
                else:
                    print "MANUAL"
                    raw_detection = v.detect_speech()
                    #print raw_detection
                    speech_labels = v.convert_windows_to_readible_labels(
                        raw_detection, str(timeEnd))
                    print "MANUAL DONE"
                megaArray.append(speech_labels)
                save_to_file(megaArray, soundName)
                #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
                processNum = processNum + 1

            except subprocess.CalledProcessError as e:
                logging.exception("message")

        subprocess.check_output(['rm', wavFile])
        sense.clear()
        with open("processWav.txt", "w") as text_file:
            text_file.write("Total wavs processed: {}".format(str(processNum)))
        with open("createWav.txt", "w") as t2:
            t2.write("Total wavs supposed to be processed: {}".format(
                str(audioNum)))
        currTime = time.time()
        currTime = currTime - timeStart
        with open("timeTaken.txt", "w") as t3:
            t3.write("Time taken to process wavs: {}".format(str(currTime)))
Code example #4

def main():
    datadir = './dataset/train/audio/'
    labels = {
        'yes': 0,
        'no': 1,
        'up': 2,
        'down': 3,
        'left': 4,
        'right': 5,
        'on': 6,
        'off': 7,
        'go': 8,
        'stop': 9
    }
    labels_list = np.array([
        'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'go', 'stop'
    ])
    train_in = []
    train_out = []
    test_in = []
    test_out = []
    removal_thr = 0

    vadcount = 0
    for word, label in labels.items():
        print(word)
        dirpath = datadir + word
        files = os.listdir(dirpath)
        np.random.shuffle(files)

        for i, fn in enumerate(files):

            filepath = os.path.join(dirpath, fn)
            sample_rate, samples = wavfile.read(filepath)
            #print(filepath)
            # run voice activity detector
            v = VoiceActivityDetector(filepath)
            detected_windows = v.detect_speech()
            speechtime = v.convert_windows_to_readible_labels(detected_windows)

            silence_free = np.array([])
            if len(speechtime) != 0:
                for segment in speechtime:
                    n0 = int(segment['speech_begin'] * sample_rate)
                    n1 = int(segment['speech_end'] * sample_rate)
                    silence_free = np.append(silence_free, samples[n0:n1])
                #print(len(silence_free))
            # !!! IMPORTANT: check the quality of the silence removal;
            # bad ones will crash the spectrogram step
            if len(silence_free) >= 3200:
                vadcount += 1
                freqs, times, spectrogram = log_specgram(silence_free,
                                                         sample_rate,
                                                         window_size=10,
                                                         step_size=5)
            else:
                freqs, times, spectrogram = log_specgram(samples,
                                                         sample_rate,
                                                         window_size=10,
                                                         step_size=5)

            #plt.pcolormesh(freqs, times, spectrogram, cmap='coolwarm')
            #plt.show()
            # average spectrogram shape is (97.3, 161); note that cv2.resize
            # takes dsize as (width, height), so x and y are swapped below
            spectrogram = cv2.resize(spectrogram,
                                     dsize=(161, 100),
                                     interpolation=cv2.INTER_CUBIC)
            spectrogram = spectrogram.reshape(100, 161, 1)

            if i < len(files) / 5:
                test_in.append(spectrogram)
                test_out.append(one_hot_encode(label))
            else:
                train_in.append(spectrogram)
                train_out.append(one_hot_encode(label))
    train_in = np.array(train_in, dtype=np.float32)
    train_out = np.array(train_out, dtype=np.int32)
    test_in = np.array(test_in, dtype=np.float32)
    test_out = np.array(test_out, dtype=np.int32)
    print(train_in.shape)
    print(train_out.shape)
    print(test_in.shape)
    print(test_out.shape)
    print(vadcount, ' audios were processed with VAD.')

    save_filename = './mini_speech_data2.npz'
    np.savez(save_filename,
             train_in=train_in,
             train_out=train_out,
             test_in=test_in,
             test_out=test_out,
             labels=labels_list)
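
Example #4 calls two helpers, log_specgram and one_hot_encode, that are defined elsewhere in its project. The sketches below are plausible stand-ins under the stated assumptions (window and step sizes in milliseconds, ten output classes); the original project's versions may differ.

import numpy as np
from scipy import signal


def log_specgram(audio, sample_rate, window_size=10, step_size=5, eps=1e-10):
    # Log-scaled spectrogram; window_size and step_size are assumed to be milliseconds.
    nperseg = int(round(window_size * sample_rate / 1000.0))
    noverlap = int(round((window_size - step_size) * sample_rate / 1000.0))
    freqs, times, spec = signal.spectrogram(audio.astype(np.float32),
                                            fs=sample_rate,
                                            nperseg=nperseg,
                                            noverlap=noverlap)
    # Transpose so rows are time steps and columns are frequency bins.
    return freqs, times, np.log(spec.T + eps)


def one_hot_encode(label, num_classes=10):
    # Turn an integer class label (0-9) into a one-hot vector.
    vec = np.zeros(num_classes, dtype=np.int32)
    vec[label] = 1
    return vec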
Code example #5
from vad import VoiceActivityDetector
import argparse
import json

def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    parser.add_argument('inputfile', metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    
    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    
    save_to_file(speech_labels, args.outputfile)