Code Example #1
from pyAudioAnalysis import audioSegmentation as aS


def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    # Run pyAudioAnalysis speaker diarization, with or without the LDA step
    if useLDA:
        aS.speaker_diarization(inputFile, numSpeakers, plot_res=True)
    else:
        # lda_dim=0 disables the LDA dimensionality-reduction step
        aS.speaker_diarization(inputFile,
                               numSpeakers,
                               lda_dim=0,
                               plot_res=True)
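A quick, hypothetical usage sketch for the wrapper above (the file name and speaker count are made up):

# Hypothetical call: diarize a recording with 4 expected speakers, skipping the LDA step
speakerDiarizationWrapper("meeting.wav", 4, useLDA=False)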
Code Example #2
    def diarize_sentences(self):
        if self.num_of_speakers != 1:
            from pyAudioAnalysis.audioSegmentation import speaker_diarization

            input_file = os.path.join(
                self.full_path,
                (DEFAULT_STT_INPUT_FILENAME + DEFAULT_STT_INPUT_FORMAT))
            output_file = os.path.join(REPO_PATH,
                                       PY_AUDIO_ANALYSIS_DATA_DIRECTORY,
                                       (DEFAULT_STT_INPUT_FILENAME + ".wav"))
            command = "ffmpeg -i " + input_file + " " + output_file
            os.system(command)
            input(
                "\a\nFFmpeg audio conversion for diarization complete. Press <enter> to continue"
            )

            diarized_speakers = speaker_diarization(output_file,
                                                    self.num_of_speakers)
            input("\a\nDiarization complete. Press <enter> to continue")

            # list of speakers... example -> [0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0]
            diarized_speakers = diarized_speakers.tolist()  # convert from numpy ndarray to a list
            # the number of speakers in the audio/video (this could have been dynamically determined)
            self.num_of_speakers = int(max(diarized_speakers) + 1)
            # sometimes the ML algo doesn't mark the original speaker as 0... this corrects that
            # by moving every speaker marker down by one (zeros circle back to the top)
            if int(diarized_speakers[0]) != 0:
                for i in range(len(diarized_speakers)):
                    if diarized_speakers[int(i)] > 0:
                        diarized_speakers[int(i)] -= 1
                    elif diarized_speakers[int(i)] == 0:
                        diarized_speakers[int(i)] = self.num_of_speakers - 1
            # parallel list to diarized_speakers; the time of each element in diarized_speakers
            speaker_timing = [
                round((i * 0.2 + 0.1), 1)
                for i in range(len(diarized_speakers))
            ]

            print(f"\a\n{self.num_of_speakers} unique speakers detected:")
            for i in range(self.num_of_speakers):
                self.speaker_genders[(i + 1)] = input(
                    f"\tSpeaker #{(i+1)} gender (male=1, female=2, unknown=3): "
                )

            for sentence in self.sentences:
                # finds sub list of valid speakers inside time interval of sentence
                start = sentence.start_time
                end = sentence.end_time
                valid_times = [
                    time for time in speaker_timing if start <= time <= end
                ]
                valid_speakers = [
                    diarized_speakers[speaker_timing.index(time)]
                    for time in valid_times
                ]
                # set the sentence's speaker and gender based on the most prevalent speaker in the interval
                sentence.speaker = int(
                    max(set(valid_speakers), key=valid_speakers.count)) + 1
                sentence.gender = int(self.speaker_genders[sentence.speaker])
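The method above keeps per-window speaker labels plus a parallel list of timestamps. If contiguous segments are more convenient, here is a minimal sketch (not part of the project above) that collapses runs of identical labels into (start, end, speaker) tuples, assuming each label covers one fixed 0.2 s step as in the speaker_timing computation:

def labels_to_segments(labels, step=0.2):
    # Collapse per-window speaker labels into (start_sec, end_sec, speaker) segments.
    # `step` is assumed to equal the diarization mid-term step (0.2 s above).
    segments = []
    seg_start = 0.0
    for i in range(1, len(labels) + 1):
        # close the current segment when the speaker changes or the list ends
        if i == len(labels) or labels[i] != labels[i - 1]:
            segments.append((seg_start, i * step, int(labels[i - 1])))
            seg_start = i * step
    return segments

# labels_to_segments([0, 0, 1, 1, 1, 0]) -> [(0.0, 0.4, 0), (0.4, 1.0, 1), (1.0, 1.2, 0)]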
Code Example #3
def pipeline(args):
    start_time = time.time()
    args.filename, to_delete = convert(args.filename)
    t = time.time()
    print("diarization...")
    diarization = np.array(
        speaker_diarization(args.filename,
                            n_speakers=args.n_speakers,
                            mid_step=args.chunk_size,
                            short_window=args.short_window,
                            mid_window=args.mid_window,
                            lda_dim=args.lda_dim)).astype("int")
    diarization = make_diarization_chronological(diarization)
    print("Done!")
    print("Took {} seconds".format(int(time.time() - t)))
    print("number of chunks: {}".format(len(diarization)))
    print("Found {} speakers".format(len(np.unique(diarization))))
    print("building segments...")
    segment_list, speaker_list = sound_to_segments(diarization,
                                                   args.chunk_size)
    if args.show_speakers:
        print("showing speakers...")
        show_speakers(args.filename, segment_list, speaker_list)
        print("choose a speed for each speaker (separated by spaces)")
        args.speeds = np.array(input().split(" ")).astype(float)
    if args.auto:
        print("Automatically finding speakers speeds...")
        speeds = find_speaker_speeds(args.filename,
                                     segment_list,
                                     speaker_list,
                                     max_length=60,
                                     min_length=350)
        print("Speaker speeds (syllables / minute):")
        print(list(60 * np.array(speeds)))  # convert to syllables / minute
        args.speeds = [max(speeds) / speed for speed in speeds]
        print("Going to speed up the speakers by:")
        print(args.speeds)
    print("speeding up...")
    speed_up(segment_list,
             speaker_list,
             args.speeds,
             args.filename,
             output_file="temp_folder/audio_speedup.wav")
    print("adding intro...")
    add_intro("temp_folder/audio_speedup.wav", args.save_file)
    if to_delete:
        os.remove(args.filename)
    print("Done in {} seconds! Saved the result to {}".format(
        int(time.time() - start_time), args.save_file))
    return args.speeds
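For a quick test, the args object consumed by pipeline() could be stubbed out as below; the field names are taken from the attributes read in the function above, but the values (and the use of SimpleNamespace instead of argparse) are hypothetical:

from types import SimpleNamespace

# Hypothetical argument object; the real script presumably builds this with argparse
args = SimpleNamespace(
    filename="podcast.wav",   # input audio file (hypothetical path)
    n_speakers=2,             # expected number of speakers
    chunk_size=0.2,           # diarization mid-term step, in seconds
    short_window=0.05,
    mid_window=2.0,
    lda_dim=0,                # 0 disables the LDA step
    show_speakers=False,
    auto=True,                # let the pipeline estimate per-speaker speeds
    speeds=None,
    save_file="sped_up.wav",
)
pipeline(args)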
Code Example #4
while (time.process_time() - time_start < 60):
    print("Checking...")
    try:
        message = pop_message(client, queue_url)
        print(message)
        if message == "Listening":
            myrecording = sd.rec(int(seconds * fs), samplerate=fs,
                                 channels=1)  #num channels
            sd.wait()  # Wait until recording is finished
            write('output.wav', fs, myrecording)  # Save as WAV file
            mood = aT.file_regression("output.wav", [
                "data/svmSpeechEmotion_valence",
                "data/svmSpeechEmotion_valenceMEANS",
                "data/svmSpeechEmotion_arousal",
                "data/svmSpeechEmotion_arousalMEANS"
            ], "svm")
            # n_speakers=0 asks pyAudioAnalysis to estimate the number of speakers
            people = audioSegmentation.speaker_diarization("output.wav",
                                                           0,
                                                           mid_window=2.0,
                                                           mid_step=0.2,
                                                           short_window=0.05,
                                                           lda_dim=35,
                                                           plot_res=False)
            result = GenrePicker(mood[0][0], mood[0][1], people)
            print(result)
        elif message == "Responding":
            print(result)
            post_message(client, result, queue_url)
    except Exception as exc:
        # a bare "except: pass" would hide real errors (e.g. `result` being unbound
        # if a "Responding" message arrives before any "Listening" message)
        print("Polling iteration failed:", exc)
Code Example #5
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            F = MidTermFeatures.short_term_feature_extraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("short-term feature extraction: {0:.1f} x realtime".format(
                perTime1))
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aT.file_classification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Mid-term feature extraction + classification \t {0:.1f} x realtime".format(
                perTime1))
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [flagsInd, classesAll,
             acc] = aS.mid_term_file_classification("diarizationExample.wav",
                                                    "svmSM", "svm", False, '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Fixed-size classification - segmentation \t {0:.1f} x realtime".format(
                perTime1))
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aS.hmm_segmentation('diarizationExample.wav', 'hmmRadioSM', False,
                                '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("HMM-based classification - segmentation \t {0:.1f} x realtime".format(
                perTime1))
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            segments = aS.silence_removal(x,
                                          Fs,
                                          0.050,
                                          0.050,
                                          smooth_window=1.0,
                                          weight=0.3,
                                          plot=False)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Silence removal \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            [A1, A2, B1, B2,
             Smatrix] = aS.music_thumbnailing(x1, Fs1, 1.0, 1.0,
                                              15.0)  # find thumbnail endpoints
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Thumbnail \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav",
                                   4,
                                   lda_dim=0,
                                   plot_res=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav", 4, plot_res=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
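A usage note (hedged): the dispatcher above selects a benchmark from its first command-line argument, so assuming the snippet is saved as a script (say main.py) next to the referenced WAV files, it would be invoked along the lines of:

# Hypothetical invocations:
#   python main.py -diarization-noLDA
#   python main.py -silenceRemoval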
Code Example #6
        tempAudioData = np.empty_like(data)
        tempAudioData[:] = data
        step_size = 0.1
        frameset_to_take = int(framerate * step_size)
        ctr = np.zeros(10)
        amp_sum = np.zeros(10)
        amp_cnt = np.zeros(10)
        amp_mean = np.zeros(10)
        counter = 0
        try:
            # Speaker diarization of the audio file (unsupervised clustering of feature windows into 2 speakers)
            temp = aseg.speaker_diarization("c.wav",
                                            2,
                                            mid_window=0.1,
                                            mid_step=0.1,
                                            short_window=0.1,
                                            lda_dim=0,
                                            plot_res=False)
            # Finding mean amplitude of each identified speaker
            for k in range(temp.size):
                for l in range(frameset_to_take):
                    if counter < data.size:
                        if data.ndim == 2:
                            amp_sum[int(temp[k])] += abs(data[int(counter), 0])
                        else:
                            amp_sum[int(temp[k])] += abs(data[int(counter)])
                        amp_cnt[int(temp[k])] += 1
                        counter += 1
            for m in range(10):
                if amp_cnt[m] != 0:
Code Example #7
def test_speaker_diarization():
    labels, purity_cluster_m, purity_speaker_m = \
        aS.speaker_diarization("test_data/diarizationExample.wav",
                                4, plot_res=False)
    assert purity_cluster_m > 0.9, "Diarization cluster purity is low"
    assert purity_speaker_m > 0.9, "Diarization speaker purity is low"
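This is a plain pytest-style test; assuming pyAudioAnalysis is installed and test_data/diarizationExample.wav is present, it can be selected by name with, for example:

#   pytest -k test_speaker_diarization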
Code Example #8
File: main.py Project: CornSnak3/SummerPractice
def pyAudioDiar():
    duration, result = aS.speaker_diarization(labelFileNameSound.get(),
                                              int(labelNumberOfSpeakers.get()),
                                              lda_dim=0,
                                              plot_res=False)
    show = 'diarizationExample'
    input_show = labelFileNameSound.get()
    input_sad = None
    win_size = 250
    thr_l = 2
    thr_h = 3
    thr_vit = -250
    wdir = os.path.join('out', show)
    if not os.path.exists(wdir):
        os.makedirs(wdir)
    fs = get_feature_server(input_show, feature_server_type='basic')
    cep, _ = fs.load(show)
    cep.shape

    if input_sad is not None:
        init_diar = Diar.read_seg(input_sad)
        init_diar.pack(50)
    else:
        init_diar = segmentation.init_seg(cep, show)

    seg_diar = segmentation.segmentation(cep, init_diar, win_size)

    bicl_diar = segmentation.bic_linear(cep, seg_diar, thr_l, sr=False)

    bic = hac_bic.HAC_BIC(cep, bicl_diar, thr_h, sr=False)
    bich_diar = bic.perform(to_the_end=True)

    vit_diar = viterbi.viterbi_decoding(cep, bich_diar, thr_vit)
    resList = []
    currentPosition = 0
    for row in vit_diar:
        speakerValue = int(row[1][1:])
        while currentPosition < (row[3] + row[4]):
            resList.append(speakerValue)
            currentPosition += 20

    currentPosition = 0
    realityList = []
    realityFile = pd.read_csv(labelFileNameSegment.get(),
                              delimiter='\t',
                              encoding='utf-8',
                              names=['start', 'end', 'speaker'])
    for index, row in realityFile.iterrows():
        speakerValue = int(row['speaker'][1:])
        while currentPosition < row['end']:
            realityList.append(int(speakerValue))
            currentPosition += 0.2

    plot.subplot(3, 1, 2)
    plot.title("s4d:")
    plot.plot(np.arange(0, duration, duration / len(resList)), resList, 'ro')
    plot.subplot(3, 1, 1)
    plot.title("Реальность:")  # "Reality": the ground-truth segmentation
    plot.plot(np.arange(0, duration, duration / len(realityList)), realityList,
              'bo')
    plot.subplot(3, 1, 3)
    plot.title("pyPlotAudio:")
    plot.plot(np.arange(0, duration, duration / len(result)), result, 'go')
    plot.show()