def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = arguments.parse_arguments()
    model_args.observation_dim = 512
    diarization_Model = UISRNN(model_args)
    diarization_Model.load(SAVED_MODEL_NAME)
    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    feats = []

    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = diarization_Model.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
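    # For example, with the defaults embedding_per_second=1.0 and overlap_rate=0.5:
    #   time_spec_rate  = 1000 * (1 / 1.0) * (1 - 0.5) = 500 ms between embeddings
    #   center_duration = int(1000 * (1 / 1.0) // 2)   = 500 ms (half a window)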
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
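            # Walk the keys in order until one passes the segment's start (and
            # then its stop); the corresponding time in the original wav is the
            # mapped value of the previous key plus the remaining offset within
            # that interval.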
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice,
                 wav=wav_path,
                 gui=True,
                 pick=True,
                 size=(25, 6))
    p.draw()
    p.plot.show()
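This main() relies on module-level globals that the snippet does not define: an argparse namespace args (consumed by toolkits.initialize_GPU, spkModel.vggvox_resnet2d_icassp and load_weights via args.resume) and SAVED_MODEL_NAME, the path to a trained UIS-RNN checkpoint. A minimal sketch of how they might be set up follows; the flag names and default paths are illustrative assumptions, not the original project's exact values.

import argparse

parser = argparse.ArgumentParser()
# Path to pretrained VGGVox speaker-embedding weights (assumed location).
parser.add_argument('--resume', default='pretrained/weights.h5', type=str)
parser.add_argument('--gpu', default='0', type=str)
# Network hyper-parameters read by spkModel.vggvox_resnet2d_icassp (assumed names).
parser.add_argument('--net', default='resnet34s', type=str)
parser.add_argument('--ghost_cluster', default=2, type=int)
parser.add_argument('--vlad_cluster', default=8, type=int)
parser.add_argument('--bottleneck_dim', default=512, type=int)
parser.add_argument('--aggregation_mode', default='gvlad', type=str)
parser.add_argument('--loss', default='softmax', type=str)
args = parser.parse_args()

# Trained UIS-RNN checkpoint loaded by diarization_Model.load() (assumed path).
SAVED_MODEL_NAME = 'pretrained/saved_model.uisrnn_benchmark'

if __name__ == '__main__':
    main(r'wavs/example.wav', embedding_per_second=1.2, overlap_rate=0.4)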
Example #2
def runDiarization(showName, config):

    wav_path = config['PATH']['audio'] + showName + '.wav'

    t0 = time.time()
    print('showName\t\t', showName)
    print('Extracting features')

    if config.getint('GENERAL', 'performFeatureExtraction'):
        allData = extractFeatures(
            config['PATH']['audio'] + showName + config['EXTENSION']['audio'],
            config.getfloat('FEATURES', 'framelength'),
            config.getfloat('FEATURES', 'frameshift'),
            config.getint('FEATURES', 'nfilters'),
            config.getint('FEATURES', 'ncoeff'))
    else:
        allData = getFeatures(config['PATH']['features'] + showName +
                              config['EXTENSION']['features'])
    nFeatures = allData.shape[0]
    print('shape of features:', allData.shape)
    print('Initial number of features\t', nFeatures)

    t1 = time.time()
    feature_t = t1 - t0
    print("Time used for extracting features:", feature_t)

    if os.path.isfile(config['PATH']['UEM'] + showName +
                      config['EXTENSION']['UEM']):
        maskUEM = readUEMfile(config['PATH']['UEM'], showName,
                              config['EXTENSION']['UEM'], nFeatures,
                              config.getfloat('FEATURES', 'frameshift'))
    else:
        print(
            'UEM file does not exist. The complete audio content is considered.'
        )
        maskUEM = np.ones([1, nFeatures])

    if os.path.isfile(config['PATH']['SAD'] + showName +
                      config['EXTENSION']['SAD']) and not (config.getint(
                          'GENERAL', 'performVAD')):
        maskSAD = readSADfile(config['PATH']['SAD'], showName,
                              config['EXTENSION']['SAD'], nFeatures,
                              config.getfloat('FEATURES', 'frameshift'),
                              config['GENERAL']['SADformat'])
    else:
        print(
            'SAD file does not exist or automatic VAD is enabled in the config. VAD is applied and saved at %s.\n'
            % (config['PATH']['SAD'] + showName + '.lab'))
        maskSAD = getSADfile(config, showName, nFeatures)

    t2 = time.time()
    SAD_t = t2 - t1
    print("Time used for SAD: ", SAD_t)

    print('shape of SAD mask', maskSAD.shape)

    mask = np.logical_and(maskUEM, maskSAD)
    mask = mask[0][0:nFeatures]

    nSpeechFeatures = np.sum(mask)
    speechMapping = np.zeros(nFeatures)
    # Start the mapping at 1 and end it at the actual number of speech features,
    # regardless of indexing style, so that no features are lost along the way.
    speechMapping[np.nonzero(mask)] = np.arange(1, nSpeechFeatures + 1)
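    # e.g. mask = [0, 1, 1, 0, 1]  ->  speechMapping = [0., 1., 2., 0., 3.]:
    # speech frames get consecutive 1-based indices, non-speech frames stay 0.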
    data = allData[np.where(mask == 1)]
    del allData
    segmentTable = getSegmentTable(mask, speechMapping,
                                   config.getint('SEGMENT', 'length'),
                                   config.getint('SEGMENT', 'increment'),
                                   config.getint('SEGMENT', 'rate'))
    numberOfSegments = np.size(segmentTable, 0)
    print('Number of speech features\t', nSpeechFeatures)
    print('Number of segments\t', numberOfSegments)

    print(segmentTable[0])
    #create the KBM
    print('Training the KBM... ')

    #set the window rate in order to obtain "minimumNumberOfInitialGaussians" gaussians
    if np.floor((nSpeechFeatures - config.getint('KBM', 'windowLength')) /
                config.getint(
                    'KBM', 'minimumNumberOfInitialGaussians')) < config.getint(
                        'KBM', 'maximumKBMWindowRate'):
        windowRate = int(
            np.floor(
                (np.size(data, 0) - config.getint('KBM', 'windowLength')) /
                config.getint('KBM', 'minimumNumberOfInitialGaussians')))
    else:
        windowRate = int(config.getint('KBM', 'maximumKBMWindowRate'))
    print('KBM window rate:', windowRate)

    poolSize = np.floor(
        (nSpeechFeatures - config.getint('KBM', 'windowLength')) / windowRate)
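    # e.g. with 60000 speech frames, windowLength=200 and windowRate=50, the
    # pool holds floor((60000 - 200) / 50) = 1196 Gaussians, from which
    # kbmSize components are selected below.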
    if config.getint('KBM', 'useRelativeKBMsize'):
        kbmSize = int(np.floor(poolSize *
                               config.getfloat('KBM', 'relKBMsize')))
    else:
        kbmSize = int(config.getint('KBM', 'kbmSize'))
    print('Training pool of', int(poolSize), 'gaussians with a rate of',
          int(windowRate), 'frames')
    kbm, gmPool = trainKBM(data, config.getint('KBM', 'windowLength'),
                           windowRate, kbmSize)
    print('Selected', kbmSize, 'gaussians from the pool')

    Vg = getVgMatrix(data, gmPool, kbm,
                     config.getint('BINARY_KEY', 'topGaussiansPerFrame'))
    print(Vg[0])
    print('Vg shape:', Vg.shape)

    t3 = time.time()
    KBM_t = t3 - t2
    print("Time used for traing KBM: ", KBM_t)

    print('Computing binary keys for all segments... ')
    segmentBKTable, segmentCVTable = getSegmentBKs(
        segmentTable, kbmSize, Vg,
        config.getfloat('BINARY_KEY', 'bitsPerSegmentFactor'), speechMapping)

    print(segmentBKTable.shape)
    print(segmentCVTable.shape)

    t4 = time.time()
    BKCV_t = t4 - t3
    print("Time used to cal BK, CV: ", BKCV_t)

    print('Performing initial clustering... ')
    initialClustering = np.digitize(
        np.arange(numberOfSegments),
        np.arange(0, numberOfSegments,
                  numberOfSegments / config.getint('CLUSTERING', 'N_init')))
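    # e.g. with numberOfSegments=6 and N_init=3 the bin edges are [0, 2, 4], so
    # np.digitize(np.arange(6), [0, 2, 4]) -> [1, 1, 2, 2, 3, 3]: the segments
    # are split into N_init uniform initial clusters.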

    print('initial clustering:', initialClustering.size)
    #print('initial clustering:', initialClustering)

    print('done')
    print('Performing agglomerative clustering... ')
    if config.getint('CLUSTERING', 'linkage'):
        finalClusteringTable, k = performClusteringLinkage(
            segmentBKTable, segmentCVTable,
            config.getint('CLUSTERING',
                          'N_init'), config['CLUSTERING']['linkageCriterion'],
            config['CLUSTERING']['metric'])
    else:
        finalClusteringTable, k = performClustering(
            speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg,
            config.getfloat('BINARY_KEY', 'bitsPerSegmentFactor'), kbmSize,
            config.getint('CLUSTERING', 'N_init'), initialClustering,
            config['CLUSTERING']['metric'])

    t5 = time.time()
    clustering_t = t5 - t4
    print("Time used for clustering: ", clustering_t)

    print('Selecting best clustering...')
    if config['CLUSTERING_SELECTION']['bestClusteringCriterion'] == 'elbow':
        bestClusteringID = getBestClustering(
            config['CLUSTERING_SELECTION']['metric_clusteringSelection'],
            segmentBKTable, segmentCVTable, finalClusteringTable, k)
    elif config['CLUSTERING_SELECTION'][
            'bestClusteringCriterion'] == 'spectral':
        bestClusteringID = getSpectralClustering(
            config['CLUSTERING_SELECTION']['metric_clusteringSelection'],
            finalClusteringTable, config.getint(
                'CLUSTERING', 'N_init'), segmentBKTable, segmentCVTable, k,
            config.getint('CLUSTERING_SELECTION', 'sigma'),
            config.getint('CLUSTERING_SELECTION', 'percentile'),
            config.getint('CLUSTERING_SELECTION', 'maxNrSpeakers')) + 1
    print('Best clustering:\t', bestClusteringID.astype(int))
    print(
        'Number of clusters:\t',
        np.size(
            np.unique(finalClusteringTable[:,
                                           bestClusteringID.astype(int) - 1]),
            0))

    print(np.unique(finalClusteringTable))
    print(finalClusteringTable.shape)
    print(np.unique(finalClusteringTable[:, bestClusteringID.astype(int) - 1]))
    #print('best clustering results:')
    #print(finalClusteringTable[:,bestClusteringID.astype(int)-1])

    t6 = time.time()
    best_clustering_t = t6 - t5
    print("Time used for best clustering: ", best_clustering_t)

    final_clustering = finalClusteringTable[:,
                                            bestClusteringID.astype(int) - 1]

    if config.getint('RESEGMENTATION', 'resegmentation') and np.size(
            np.unique(finalClusteringTable[:,
                                           bestClusteringID.astype(int) - 1]),
            0) > 1:

        print(final_clustering.shape)
        print(final_clustering)

        print(segmentTable.shape)

        print('Performing GMM-ML resegmentation...')
        finalClusteringTableResegmentation, finalSegmentTable = performResegmentation(
            data, speechMapping, mask,
            finalClusteringTable[:, bestClusteringID.astype(int) - 1],
            segmentTable, config.getint('RESEGMENTATION', 'modelSize'),
            config.getint('RESEGMENTATION', 'nbIter'),
            config.getint('RESEGMENTATION', 'smoothWin'), nSpeechFeatures)

        print(finalClusteringTableResegmentation.shape)
        print(finalClusteringTableResegmentation)
        print(finalSegmentTable.shape)

        print('done')

        print(finalClusteringTableResegmentation.shape)
        print(finalSegmentTable.shape)
        print(segmentTable.shape)
        '''
        for i in range(0, 19):
            print(finalClusteringTableResegmentation[i],finalSegmentTable[i],finalSegmentTable[i][0]-finalSegmentTable[i][2])
        
        '''

        t7 = time.time()
        reseg_t = t7 - t6
        print("Time used for resegmentation: ", reseg_t)

        tu = t7 - t0
        print('Total time used:', tu)

        getSegmentationFile(config['OUTPUT']['format'],
                            config.getfloat('FEATURES',
                                            'frameshift'), finalSegmentTable,
                            np.squeeze(finalClusteringTableResegmentation),
                            showName, config['EXPERIMENT']['name'],
                            config['PATH']['output'],
                            config['EXTENSION']['output'])

        t1 = time.time()
        y, sr = librosa.load(wav_path, sr=None)
        audio_duration = librosa.get_duration(y=y, sr=sr)
        print('load data: ', time.time() - t1)

        print('audio duration: ', audio_duration)
        print('real-time factor: ', tu / audio_duration)

        print(wav_path)

        print(feature_t, SAD_t, KBM_t, clustering_t, reseg_t, tu)

        #wav_path = './audio_test/2.wav'
        #print(config['PATH']['audio']+showName+'.wav')

        speakerSlice = getSegResultForPlot(
            config.getfloat('FEATURES', 'frameshift'), finalSegmentTable,
            np.squeeze(finalClusteringTableResegmentation))

    else:
        clustering = rearrangeClusterID(final_clustering)
        #getSegmentationFile(config['OUTPUT']['format'],config.getfloat('FEATURES','frameshift'),segmentTable, finalClusteringTable[:,bestClusteringID.astype(int)-1], showName, config['EXPERIMENT']['name'], config['PATH']['output'], config['EXTENSION']['output'])
        speakerSlice = getSegResultForPlot(
            config.getfloat('FEATURES', 'frameshift'), segmentTable,
            clustering)

    p = PlotDiar(map=speakerSlice,
                 wav=wav_path,
                 title='Binary key diarization: ' + wav_path +
                 ', number of speakers: ' + str(len(speakerSlice)),
                 gui=True,
                 pick=True,
                 size=(25, 6))

    wm = p.plot.get_current_fig_manager()
    wm.window.state('zoomed')

    p.draw()
    p.plot.show()

    if config.getint('OUTPUT', 'returnAllPartialSolutions'):
        if not os.path.isdir(config['PATH']['output']):
            os.mkdir(config['PATH']['output'])
        outputPathInd = config['PATH']['output'] + config['EXPERIMENT'][
            'name'] + '/' + showName + '/'
        if not os.path.isdir(config['PATH']['output'] +
                             config['EXPERIMENT']['name']):
            os.mkdir(config['PATH']['output'] + config['EXPERIMENT']['name'])
        if not os.path.isdir(outputPathInd):
            os.mkdir(outputPathInd)
        for i in np.arange(k):
            getSegmentationFile(
                config['OUTPUT']['format'],
                config.getfloat('FEATURES', 'frameshift'), segmentTable,
                finalClusteringTable[:, i], showName, showName + '_' +
                str(np.size(np.unique(finalClusteringTable[:, i]), 0)) +
                '_spk', outputPathInd, config['EXTENSION']['output'])

    print('\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
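runDiarization() pulls all of its settings from a configparser-style config object. A minimal sketch of such a config is shown below; the sections and keys mirror the lookups made above, but every value is an illustrative assumption and would need tuning for a real setup.

import configparser

config = configparser.ConfigParser()
config.read_dict({
    'PATH': {'audio': './audio/', 'features': './features/', 'UEM': './uem/',
             'SAD': './sad/', 'output': './out/'},
    'EXTENSION': {'audio': '.wav', 'features': '.fea', 'UEM': '.uem',
                  'SAD': '.lab', 'output': '.rttm'},
    'GENERAL': {'performFeatureExtraction': '1', 'performVAD': '1',
                'SADformat': 'LBL'},
    'EXPERIMENT': {'name': 'demo'},
    'FEATURES': {'framelength': '0.025', 'frameshift': '0.01',
                 'nfilters': '30', 'ncoeff': '30'},
    'SEGMENT': {'length': '100', 'increment': '100', 'rate': '100'},
    'KBM': {'windowLength': '200', 'minimumNumberOfInitialGaussians': '1024',
            'maximumKBMWindowRate': '50', 'useRelativeKBMsize': '1',
            'relKBMsize': '0.3', 'kbmSize': '320'},
    'BINARY_KEY': {'topGaussiansPerFrame': '5', 'bitsPerSegmentFactor': '0.2'},
    'CLUSTERING': {'N_init': '16', 'linkage': '0',
                   'linkageCriterion': 'average', 'metric': 'cosine'},
    'CLUSTERING_SELECTION': {'bestClusteringCriterion': 'elbow',
                             'metric_clusteringSelection': 'cosine',
                             'sigma': '1', 'percentile': '80',
                             'maxNrSpeakers': '10'},
    'RESEGMENTATION': {'resegmentation': '1', 'modelSize': '6',
                       'nbIter': '10', 'smoothWin': '100'},
    'OUTPUT': {'format': 'rttm', 'returnAllPartialSolutions': '0'},
})
runDiarization('example_show', config)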
from viewer import PlotDiar

# speakerSlice maps each speaker label to a list of segments; times are in milliseconds.
speakerSlice = {'1': [{'start': 175, 'stop': 200}, {'start': 30, 'stop': 120}],
                '2': [{'start': 90, 'stop': 130}]}
p = PlotDiar(map=speakerSlice, wav=r'example.wav', gui=True, size=(25, 6))
p.draw()
p.plot.show()
Example #4
def main(wav_path,
         embedding_per_second=1.0,
         n_classes=5994,
         overlap_rate=0.5,
         plot_results=True):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)
    #

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))
    # print('mapTable, keys', mapTable, keys)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        # print('v',v.shape)
        #print('feats', feats.shape)

        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)
    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

                print('i,s,e')
                print(i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(
                start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
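Unlike the other variants, this main() returns its intermediate results, so it can be driven from another script. A small sketch of consuming the return value (the argument values are arbitrary):

feats, labels, intervals, assignments, spec_rate_ms = main(
    r'wavs/example.wav',
    embedding_per_second=1.2,
    overlap_rate=0.4,
    plot_results=False)

# Each assignment is (formatted start, formatted stop, speaker label, wav path).
for start, end, speaker, path in assignments:
    print(f'{speaker}: {start} ==> {end} ({path})')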
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)

    #specs1,interval1 = load_data(r'wavs/REC20190716140159.wav', embedding_per_second=1.2, overlap_rate=0.4)
    #mapTable1,keys1 =genMap(interval1)
    mapTable, keys = genMap(intervals)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]


# =============================================================================
#     for spec1 in specs1:
#         spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
#         v = network_eval.predict(spec1)
#         feats += [v]
# =============================================================================
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    #print(len(feats),'00000000')
    #predicted_label = uisrnnModel.predict(feats, inference_args)

    # silhouette score
    # =============================================================================
    #     sli=[]
    #     fromsel=[]
    #     li=[]
    #     knum=[]
    #     for i in range(10):
    #         li=[]
    #         range_n_clusters = list (range(2,5))
    #         for n_clusters in range_n_clusters:
    #             clusterer = KMeans(n_clusters=n_clusters)
    #             preds = clusterer.fit_predict(feats)
    #             centers = clusterer.cluster_centers_
    #
    #             score = silhouette_score (feats, preds, metric='euclidean')
    #             print ("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))
    #             li.append([n_clusters,score,clusterer,centers])
    #     # =============================================================================
    #     #     print([float(str(i[1])[:4]) for i in li])
    #     #     kvalue=(max([float(str(i[1])[:4]) for i in li]))
    #     #     for i in range(len(li)):
    #     #         if kvalue==float(str(li[i][1])[:4]):
    #     #             true_k=li[i][0]
    #     #             break
    #     # =============================================================================
    #         maxi=li[0][1]
    #         for i in range(1,len(li)):
    #             if li[i][1]-maxi>=0.005:
    #                 maxi=li[i][1]
    #         for i in li:
    #             if i[1]==maxi:
    #                 true_k=i[0]
    #     # =============================================================================
    #     #     maxi=max([i[1] for i in li])
    #     #     for i in li:
    #     #         if i[1]==maxi:
    #     #             true_k=i[0]
    #     # =============================================================================
    #         fromsel.append(li[true_k-2])
    #         print(true_k)
    #         knum.append(true_k)
    #     kval=(max(set(knum), key=knum.count))
    #     print(kval)
    # =============================================================================
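    # Spectral clustering replaces UIS-RNN in this variant. As assumed from the
    # spectralcluster package's refinement options: p_percentile sets the
    # row-wise thresholding of the affinity matrix and gaussian_blur_sigma the
    # blur applied before thresholding, while min_clusters/max_clusters bound
    # the number of speakers; the values below are this example's own choices,
    # not universal defaults.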

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    # =============================================================================
    #     clusters = KMeans(n_clusters=40, init='k-means++', max_iter=100, n_init=1, random_state = 0)
    #     clusters.fit(feats)
    #     tsne = TSNEVisualizer()
    #     tsne.fit(feats, ["c{}".format(c) for c in clusters.labels_])
    #     tsne.poof()
    # =============================================================================

    global no_speakers
    no_speakers = len(set(predicted_label))
    #print(predicted_label,'**************************')
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
Example #6
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         retain_audio_clip=False):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            get_transcript(str(spk), s, e)

    result = print_transcipt()
    try:
        # Open the transcript file once; the context manager closes it even if
        # writing fails part-way through.
        with open(os.path.join(dir_name, 'FinalTranscript.txt'), 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = f"{start} ==> {end}: [Speaker : {item[0]}] : {item[3]}"
                print(transcription)
                file.write(transcription + '\n')
    except Exception as exp:
        print(f"Failed in main() while writing to file with exception {exp}")

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(
            f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()

    return result