Code example #1
def get_embedding(audio_path):
    # Split the audio into voiced segments (WebRTC VAD, aggressiveness 2)
    times, segs = VAD_chunk(2, audio_path)
    if not segs:
        print('No voice activity detected')
        return None
    # Merge adjacent segments and extract log-mel STFT frames
    concat_seg = concat_segs(times, segs)
    STFT_frames = get_STFTs(concat_seg)
    # Stack into a (batch, time, mel) tensor and run the embedder
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
    embeddings = encoder(STFT_frames)  # `encoder`: pre-trained speaker embedding model loaded elsewhere
    return embeddings
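
The frame-level embeddings returned by get_embedding can be collapsed into a single utterance-level d-vector by averaging and L2-normalizing, which is essentially what code example #2 below does (that version also passes the frames through align_embeddings first). A minimal sketch of that step, assuming numpy is imported as np and get_embedding is in scope; the helper name utterance_dvector is illustrative:

def utterance_dvector(audio_path):
    # Average the frame embeddings and L2-normalize into a single d-vector
    embeddings = get_embedding(audio_path)
    if embeddings is None:
        return None
    emb = embeddings.detach().numpy()
    dvector = emb.mean(axis=0)
    return dvector / np.linalg.norm(dvector, 2)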
Code example #2
def dvector_make(file, embedder_net):
    times, segs = VAD_chunk(2, file)
    if not segs:
        print('No voice activity detected')
        return None
    concat_seg = concat_segs(times, segs)
    STFT_frames = get_STFTs(concat_seg)
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
    embeddings = embedder_net(STFT_frames)
    aligned_embeddings = np.array(align_embeddings(
        embeddings.detach().numpy()))
    # Average the aligned frame embeddings and L2-normalize to get one d-vector
    dvector = np.mean(aligned_embeddings, axis=0)
    dvector = dvector / np.linalg.norm(dvector, 2)
    return dvector
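
Because dvector_make returns a unit-length vector, two utterances can be compared with a plain dot product (cosine similarity). A sketch of a simple verification check; the function name and the 0.7 threshold are illustrative assumptions, not taken from the original code:

def same_speaker(file_a, file_b, embedder_net, threshold=0.7):
    dvec_a = dvector_make(file_a, embedder_net)
    dvec_b = dvector_make(file_b, embedder_net)
    if dvec_a is None or dvec_b is None:
        return False
    # Cosine similarity of two L2-normalized d-vectors is just their dot product
    return float(np.dot(dvec_a, dvec_b)) >= threshold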
Code example #3
def feature_embeddings(model_path):
    # These accumulators/counters are module-level state in the related snippets
    # (see code example #4); declare them global so the in-place updates work.
    # NOTE: model_path is unused here; the embedder_net global is used instead.
    global train_sequence, train_cluster_id, label, count, train_saved
    for i, folder in enumerate(audio_path):
        print('Processing folder:', folder)
        for file in os.listdir(folder):
            print(file)
            if file.endswith('.wav'):
                times, segs = VAD_chunk(2, folder + '/' + file)
                if not segs:
                    print('No voice activity detected')
                    continue
                concat_seg = concat_segs(times, segs)
                STFT_frames = get_STFTs(concat_seg)
                STFT_frames = np.stack(STFT_frames, axis=2)
                STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
                embeddings = embedder_net(STFT_frames)
                aligned_embeddings = align_embeddings(embeddings.detach().numpy())
                train_sequence.append(aligned_embeddings)
                for embedding in aligned_embeddings:
                    train_cluster_id.append(str(label))
                count += 1
                if count % 100 == 0:
                    print('Processed {0}/{1} files'.format(count, len(audio_path)))
        label += 1

        if not train_saved and i > train_speaker_num:
            train_sequence = np.concatenate(train_sequence, axis=0)
            train_cluster_id = np.asarray(train_cluster_id)
            np.save('train_sequence', train_sequence)
            np.save('train_cluster_id', train_cluster_id)
            train_saved = True
            train_sequence = []
            train_cluster_id = []
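
Once written, the arrays can be read back with np.load (np.save appends the .npy extension automatically), for example:

train_sequence = np.load('train_sequence.npy')
train_cluster_id = np.load('train_cluster_id.npy')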
Code example #4
                     10) * 9  # split total data 90% train and 10% test

embedder_net = SpeechEmbedder()
embedder_net.load_state_dict(torch.load(hp.model.model_path))
embedder_net.eval()

train_sequence = []
train_cluster_id = []
label = 0
count = 0
train_saved = False
for i, folder in enumerate(audio_path):
    for file in os.listdir(folder):
        print(folder)
        if file.endswith('.WAV'):
            times, segs = VAD_chunk(2, folder + '/' + file)
            if not segs:
                print('No voice activity detected')
                continue
            # Merge adjacent VAD segments and extract log-mel STFT frames
            concat_seg = concat_segs(times, segs)
            STFT_frames = get_STFTs(concat_seg)
            STFT_frames = np.stack(STFT_frames, axis=2)
            STFT_frames = torch.tensor(
                np.transpose(STFT_frames, axes=(2, 1, 0)))
            embeddings = embedder_net(STFT_frames)
            aligned_embeddings = align_embeddings(embeddings.detach().numpy())
            train_sequence.append(aligned_embeddings)
            for embedding in aligned_embeddings:
                train_cluster_id.append(str(label))
            count += 1
            if count % 100 == 0:
                print('Processed {0}/{1} files'.format(count, len(audio_path)))
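
This snippet is cut off before the collected features are written out. Judging from the save step shown in code example #3, the remainder most likely concatenates the per-file sequences and stores them as .npy files, roughly as follows (a sketch, not the original continuation):

train_sequence = np.concatenate(train_sequence, axis=0)
train_cluster_id = np.asarray(train_cluster_id)
np.save('train_sequence', train_sequence)
np.save('train_cluster_id', train_cluster_id)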
Code example #5
                all_speakers.append(speaker)

            # Get duration
            if debug:
                print('\n==== DEBUG ====')
                print(fpath)
            try:
                duration = str(timedelta(seconds=get_duration(filename=fpath)))
                if debug:
                    print('File duration: {}'.format(duration))
            except Exception:
                print('UNABLE TO GET DURATION')
                raise

            # Chunk into segments with speech audio
            times, segs = VAD_chunk(2, fpath)
            if not segs:
                print('No voice activity detected')
                continue
            if debug:
                print('{} - {:,} segments'.format(datetime.now() - start,
                                                  len(segs)))

            # Short-term Fourier Transform
            concat_seg = concat_segs(times, segs)
            if debug:
                print('{} - Concatenated segments'.format(datetime.now() -
                                                          start))
            STFT_frames = get_STFTs(concat_seg)
            if debug:
                print('{} - Got STFT frames'.format(datetime.now() - start))
Code example #6
embedder_net.eval()
embedder_net = embedder_net.cuda()

train_sequence = []
train_cluster_id = []
label = 0
count = 0
ns = 0
nf = 0
train_saved = False

for i, folder in enumerate(tqdm.tqdm(audio_path, desc="meeting", position=0)):
    for file in tqdm.tqdm(os.listdir(folder), desc="Segment", position=1):
        if file.split('.')[-1] == 'wav':
            # VAD aggressiveness: 0 is least aggressive about filtering out non-speech, 3 is the most
            times, segs = VAD_chunk(hp.data.aggressiveness, os.path.join(folder, file))
            if not segs:
                ns += 1  # count files with no detected speech
                continue
            concat_seg = concat_segs(times, segs)
            STFT_frames = get_STFTs(concat_seg)
            if len(STFT_frames) == 0:
                nf += 1  # count files with no usable STFT frames
                continue
            STFT_frames = np.stack(STFT_frames, axis=2)
            STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
            embeddings = embedder_net(STFT_frames.cuda())
            aligned_embeddings = align_embeddings(embeddings.cpu().detach().numpy())
            train_sequence.append(aligned_embeddings)
            for embedding in aligned_embeddings:
                train_cluster_id.append(str(label))
Code example #7
                use_label = judge_dict[spkr_name[:-15]]
            else:
                judge_dict[spkr_name[:-15]] = cnt
                use_label = judge_dict[spkr_name[:-15]]
                cnt += 1
        else:
            use_label = label
            label += 1

        spkr_file_lst = []
        spkr_sequence = []
        spkr_cluster_lst = []

        for file in os.listdir(folder + '/' + spkr_name):
            if file.endswith('.wav'):
                times, segs = VAD_chunk(2,
                                        folder + '/' + spkr_name + '/' + file)

                # Bad .wav detection
                if not segs:
                    rm_pthlst.append(folder + '/' + spkr_name + '/' + file)
                    continue

                concat_seg = concat_segs(times, segs)
                if len(concat_seg) < min_va:
                    rm_pthlst.append(folder + '/' + spkr_name + '/' + file)
                    continue

                STFT_frames = get_STFTs(concat_seg)
                STFT_frames = np.stack(STFT_frames, axis=2)
                STFT_frames = torch.tensor(
                    np.transpose(STFT_frames, axes=(2, 1, 0)))
Code example #8
def two_person_diarization(call_file):
    """ Diarization of a call of 2 speakers.
        Input: path to a call
        Output: a dictionary of two np arrays of timestamps in seconds - one per speaker.
    """
    # create embeddings and cluster them
    seg_times, speech_segs = VAD_chunk(2, call_file)
    assert speech_segs != [], "No voice apctivity detected."

    all_embeddings = []
    embedding_times = []
    for speech_seg, seg_time in zip(speech_segs, seg_times):
        STFT_frames = signal_processing.split_segment_to_frames(speech_seg)
        if not STFT_frames:
            # not enough frames, continue to next segment
            continue
        STFT_frames = np.stack(STFT_frames, axis=1)
        embeddings = speaker_verification_lstm_model.extract_embedding(
            STFT_frames)
        # calculate time stamps for each embedding
        delta_t = (seg_time[1] - seg_time[0]) / embeddings.shape[0]
        times_start = np.linspace(seg_time[0],
                                  seg_time[1],
                                  num=embeddings.shape[0])
        times_end = np.linspace(seg_time[0] + delta_t,
                                seg_time[1] + delta_t,
                                num=embeddings.shape[0])
        for idx, embedding in enumerate(embeddings):
            all_embeddings.append(embedding)
            embedding_times.append([times_start[idx], times_end[idx]])

    # Using K-Means to separate the two speakers embeddings
    kmeans_emb = KMeans(n_clusters=2, random_state=0).fit(all_embeddings)
    # Getting the cluster labels
    labels = kmeans_emb.predict(all_embeddings)
    # Taking only the embeddings that are close to the centers
    distances_from_centers = [[], []]
    for embedding, label in zip(all_embeddings, labels):
        distances_from_centers[label].append(
            scipy.spatial.distance.euclidean(
                embedding, kmeans_emb.cluster_centers_[label]))
    # Keep only the most certain segments (closer to the centroid than the median distance)
    median_dist = [statistics.median(d) for d in distances_from_centers]
    # Create list of times and embedding per speaker
    first_speaker_label = labels[0]
    speaker0_times = []
    speaker1_times = []
    speaker0_embeddings = []
    speaker1_embeddings = []
    for idx, label in enumerate(labels):
        if scipy.spatial.distance.euclidean(
                all_embeddings[idx],
                kmeans_emb.cluster_centers_[label]) < median_dist[label]:
            if label == first_speaker_label:
                speaker0_times.append(embedding_times[idx])
                speaker0_embeddings.append(all_embeddings[idx])
            else:
                speaker1_times.append(embedding_times[idx])
                speaker1_embeddings.append(all_embeddings[idx])

    # build dictionary to return
    return {
        "FIRST SPEAKER": speaker0_times,
        "SECOND SPEAKER": speaker1_times
    }, {
        "FIRST SPEAKER": speaker0_embeddings,
        "SECOND SPEAKER": speaker1_embeddings
    }
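
A possible way to call two_person_diarization and inspect the result; the file name call.wav is a placeholder:

if __name__ == '__main__':
    speaker_times, speaker_embeddings = two_person_diarization('call.wav')
    for speaker, segments in speaker_times.items():
        for start, end in segments:
            print('{}: {:.2f}s - {:.2f}s'.format(speaker, start, end))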