def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = arguments.parse_arguments()
    model_args.observation_dim = 512
    diarization_Model = UISRNN(model_args)
    diarization_Model.load(SAVED_MODEL_NAME)
    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    feats = []

    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = diarization_Model.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
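    # For example, with the defaults embedding_per_second=1.0 and overlap_rate=0.5:
    #   time_spec_rate  = 1000 * (1 / 1.0) * (1 - 0.5) = 500 ms between embeddings
    #   center_duration = int(1000 * (1 / 1.0) // 2)   = 500 ms (half a window)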
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
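            # Walk the keys in order until one passes the segment's start (and
            # then its stop); the corresponding time in the original wav is the
            # mapped value of the previous key plus the remaining offset within
            # that interval.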
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice,
                 wav=wav_path,
                 gui=True,
                 pick=True,
                 size=(25, 6))
    p.draw()
    p.plot.show()
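This main() relies on module-level globals that the snippet does not define: an argparse namespace args (consumed by toolkits.initialize_GPU, spkModel.vggvox_resnet2d_icassp and load_weights via args.resume) and SAVED_MODEL_NAME, the path to a trained UIS-RNN checkpoint. A minimal sketch of how they might be set up follows; the flag names and default paths are illustrative assumptions, not the original project's exact values.

import argparse

parser = argparse.ArgumentParser()
# Path to pretrained VGGVox speaker-embedding weights (assumed location).
parser.add_argument('--resume', default='pretrained/weights.h5', type=str)
parser.add_argument('--gpu', default='0', type=str)
# Network hyper-parameters read by spkModel.vggvox_resnet2d_icassp (assumed names).
parser.add_argument('--net', default='resnet34s', type=str)
parser.add_argument('--ghost_cluster', default=2, type=int)
parser.add_argument('--vlad_cluster', default=8, type=int)
parser.add_argument('--bottleneck_dim', default=512, type=int)
parser.add_argument('--aggregation_mode', default='gvlad', type=str)
parser.add_argument('--loss', default='softmax', type=str)
args = parser.parse_args()

# Trained UIS-RNN checkpoint loaded by diarization_Model.load() (assumed path).
SAVED_MODEL_NAME = 'pretrained/saved_model.uisrnn_benchmark'

if __name__ == '__main__':
    main(r'wavs/example.wav', embedding_per_second=1.2, overlap_rate=0.4)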
Example #2
def runDiarization(showName, config):

    wav_path = config['PATH']['audio'] + showName + '.wav'

    t0 = time.time()
    print('showName\t\t', showName)
    print('Extracting features')

    if config.getint('GENERAL', 'performFeatureExtraction'):
        allData = extractFeatures(
            config['PATH']['audio'] + showName + config['EXTENSION']['audio'],
            config.getfloat('FEATURES', 'framelength'),
            config.getfloat('FEATURES', 'frameshift'),
            config.getint('FEATURES', 'nfilters'),
            config.getint('FEATURES', 'ncoeff'))
    else:
        allData = getFeatures(config['PATH']['features'] + showName +
                              config['EXTENSION']['features'])
    nFeatures = allData.shape[0]
    print('shape of features:', allData.shape)
    print('Initial number of features\t', nFeatures)

    t1 = time.time()
    feature_t = t1 - t0
    print("Time used for extracting features:", feature_t)

    if os.path.isfile(config['PATH']['UEM'] + showName +
                      config['EXTENSION']['UEM']):
        maskUEM = readUEMfile(config['PATH']['UEM'], showName,
                              config['EXTENSION']['UEM'], nFeatures,
                              config.getfloat('FEATURES', 'frameshift'))
    else:
        print(
            'UEM file does not exist. The complete audio content is considered.'
        )
        maskUEM = np.ones([1, nFeatures])

    if os.path.isfile(config['PATH']['SAD'] + showName +
                      config['EXTENSION']['SAD']) and not (config.getint(
                          'GENERAL', 'performVAD')):
        maskSAD = readSADfile(config['PATH']['SAD'], showName,
                              config['EXTENSION']['SAD'], nFeatures,
                              config.getfloat('FEATURES', 'frameshift'),
                              config['GENERAL']['SADformat'])
    else:
        print(
            'SAD file does not exist or automatic VAD is enabled in the config. VAD is applied and saved at %s.\n'
            % (config['PATH']['SAD'] + showName + '.lab'))
        maskSAD = getSADfile(config, showName, nFeatures)

    t2 = time.time()
    SAD_t = t2 - t1
    print("Time used for SAD: ", SAD_t)

    print('shape of SAD mask', maskSAD.shape)

    mask = np.logical_and(maskUEM, maskSAD)
    mask = mask[0][0:nFeatures]

    nSpeechFeatures = np.sum(mask)
    speechMapping = np.zeros(nFeatures)
    # Start the mapping at 1 and end it at the actual number of speech features,
    # regardless of indexing style, so that no features are lost along the way.
    speechMapping[np.nonzero(mask)] = np.arange(1, nSpeechFeatures + 1)
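    # e.g. mask = [0, 1, 1, 0, 1]  ->  speechMapping = [0., 1., 2., 0., 3.]:
    # speech frames get consecutive 1-based indices, non-speech frames stay 0.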
    data = allData[np.where(mask == 1)]
    del allData
    segmentTable = getSegmentTable(mask, speechMapping,
                                   config.getint('SEGMENT', 'length'),
                                   config.getint('SEGMENT', 'increment'),
                                   config.getint('SEGMENT', 'rate'))
    numberOfSegments = np.size(segmentTable, 0)
    print('Number of speech features\t', nSpeechFeatures)
    print('Number of segments\t', numberOfSegments)

    print(segmentTable[0])
    #create the KBM
    print('Training the KBM... ')

    #set the window rate in order to obtain "minimumNumberOfInitialGaussians" gaussians
    if np.floor((nSpeechFeatures - config.getint('KBM', 'windowLength')) /
                config.getint(
                    'KBM', 'minimumNumberOfInitialGaussians')) < config.getint(
                        'KBM', 'maximumKBMWindowRate'):
        windowRate = int(
            np.floor(
                (np.size(data, 0) - config.getint('KBM', 'windowLength')) /
                config.getint('KBM', 'minimumNumberOfInitialGaussians')))
    else:
        windowRate = int(config.getint('KBM', 'maximumKBMWindowRate'))
    print('KBM window rate:', windowRate)

    poolSize = np.floor(
        (nSpeechFeatures - config.getint('KBM', 'windowLength')) / windowRate)
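    # e.g. with 60000 speech frames, windowLength=200 and windowRate=50, the
    # pool holds floor((60000 - 200) / 50) = 1196 Gaussians, from which
    # kbmSize components are selected below.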
    if config.getint('KBM', 'useRelativeKBMsize'):
        kbmSize = int(np.floor(poolSize *
                               config.getfloat('KBM', 'relKBMsize')))
    else:
        kbmSize = int(config.getint('KBM', 'kbmSize'))
    print('Training pool of', int(poolSize), 'gaussians with a rate of',
          int(windowRate), 'frames')
    kbm, gmPool = trainKBM(data, config.getint('KBM', 'windowLength'),
                           windowRate, kbmSize)
    print('Selected', kbmSize, 'gaussians from the pool')

    Vg = getVgMatrix(data, gmPool, kbm,
                     config.getint('BINARY_KEY', 'topGaussiansPerFrame'))
    print(Vg[0])
    print('Vg shape:', Vg.shape)

    t3 = time.time()
    KBM_t = t3 - t2
    print("Time used for traing KBM: ", KBM_t)

    print('Computing binary keys for all segments... ')
    segmentBKTable, segmentCVTable = getSegmentBKs(
        segmentTable, kbmSize, Vg,
        config.getfloat('BINARY_KEY', 'bitsPerSegmentFactor'), speechMapping)

    print(segmentBKTable.shape)
    print(segmentCVTable.shape)

    t4 = time.time()
    BKCV_t = t4 - t3
    print("Time used to cal BK, CV: ", BKCV_t)

    print('Performing initial clustering... ')
    initialClustering = np.digitize(
        np.arange(numberOfSegments),
        np.arange(0, numberOfSegments,
                  numberOfSegments / config.getint('CLUSTERING', 'N_init')))
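    # e.g. with numberOfSegments=6 and N_init=3 the bin edges are [0, 2, 4], so
    # np.digitize(np.arange(6), [0, 2, 4]) -> [1, 1, 2, 2, 3, 3]: the segments
    # are split into N_init uniform initial clusters.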

    print('initial clustering:', initialClustering.size)
    #print('initial clustering:', initialClustering)

    print('done')
    print('Performing agglomerative clustering... ')
    if config.getint('CLUSTERING', 'linkage'):
        finalClusteringTable, k = performClusteringLinkage(
            segmentBKTable, segmentCVTable,
            config.getint('CLUSTERING',
                          'N_init'), config['CLUSTERING']['linkageCriterion'],
            config['CLUSTERING']['metric'])
    else:
        finalClusteringTable, k = performClustering(
            speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg,
            config.getfloat('BINARY_KEY', 'bitsPerSegmentFactor'), kbmSize,
            config.getint('CLUSTERING', 'N_init'), initialClustering,
            config['CLUSTERING']['metric'])

    t5 = time.time()
    clustering_t = t5 - t4
    print("Time used for clustering: ", clustering_t)

    print('Selecting best clustering...')
    if config['CLUSTERING_SELECTION']['bestClusteringCriterion'] == 'elbow':
        bestClusteringID = getBestClustering(
            config['CLUSTERING_SELECTION']['metric_clusteringSelection'],
            segmentBKTable, segmentCVTable, finalClusteringTable, k)
    elif config['CLUSTERING_SELECTION'][
            'bestClusteringCriterion'] == 'spectral':
        bestClusteringID = getSpectralClustering(
            config['CLUSTERING_SELECTION']['metric_clusteringSelection'],
            finalClusteringTable, config.getint(
                'CLUSTERING', 'N_init'), segmentBKTable, segmentCVTable, k,
            config.getint('CLUSTERING_SELECTION', 'sigma'),
            config.getint('CLUSTERING_SELECTION', 'percentile'),
            config.getint('CLUSTERING_SELECTION', 'maxNrSpeakers')) + 1
    print('Best clustering:\t', bestClusteringID.astype(int))
    print(
        'Number of clusters:\t',
        np.size(
            np.unique(finalClusteringTable[:,
                                           bestClusteringID.astype(int) - 1]),
            0))

    print(np.unique(finalClusteringTable))
    print(finalClusteringTable.shape)
    print(np.unique(finalClusteringTable[:, bestClusteringID.astype(int) - 1]))
    #print('best clustering results:')
    #print(finalClusteringTable[:,bestClusteringID.astype(int)-1])

    t6 = time.time()
    best_clustering_t = t6 - t5
    print("Time used for best clustering: ", best_clustering_t)

    final_clustering = finalClusteringTable[:,
                                            bestClusteringID.astype(int) - 1]

    if config.getint('RESEGMENTATION', 'resegmentation') and np.size(
            np.unique(finalClusteringTable[:,
                                           bestClusteringID.astype(int) - 1]),
            0) > 1:

        print(final_clustering.shape)
        print(final_clustering)

        print(segmentTable.shape)

        print('Performing GMM-ML resegmentation...')
        finalClusteringTableResegmentation, finalSegmentTable = performResegmentation(
            data, speechMapping, mask,
            finalClusteringTable[:, bestClusteringID.astype(int) - 1],
            segmentTable, config.getint('RESEGMENTATION', 'modelSize'),
            config.getint('RESEGMENTATION', 'nbIter'),
            config.getint('RESEGMENTATION', 'smoothWin'), nSpeechFeatures)

        print(finalClusteringTableResegmentation.shape)
        print(finalClusteringTableResegmentation)
        print(finalSegmentTable.shape)

        print('done')

        print(finalClusteringTableResegmentation.shape)
        print(finalSegmentTable.shape)
        print(segmentTable.shape)
        '''
        for i in range(0, 19):
            print(finalClusteringTableResegmentation[i],finalSegmentTable[i],finalSegmentTable[i][0]-finalSegmentTable[i][2])
        
        '''

        t7 = time.time()
        reseg_t = t7 - t6
        print("Time used for resegmentation: ", reseg_t)

        tu = t7 - t0
        print('Total time used:', tu)

        getSegmentationFile(config['OUTPUT']['format'],
                            config.getfloat('FEATURES',
                                            'frameshift'), finalSegmentTable,
                            np.squeeze(finalClusteringTableResegmentation),
                            showName, config['EXPERIMENT']['name'],
                            config['PATH']['output'],
                            config['EXTENSION']['output'])

        t1 = time.time()
        y, sr = librosa.load(wav_path, sr=None)
        audio_duration = librosa.get_duration(y=y, sr=sr)
        print('load data: ', time.time() - t1)

        print('audio duration: ', audio_duration)
        print('real-time factor: ', tu / audio_duration)

        print(wav_path)

        print(feature_t, SAD_t, KBM_t, clustering_t, reseg_t, tu)

        #wav_path = './audio_test/2.wav'
        #print(config['PATH']['audio']+showName+'.wav')

        speakerSlice = getSegResultForPlot(
            config.getfloat('FEATURES', 'frameshift'), finalSegmentTable,
            np.squeeze(finalClusteringTableResegmentation))

    else:
        clustering = rearrangeClusterID(final_clustering)
        #getSegmentationFile(config['OUTPUT']['format'],config.getfloat('FEATURES','frameshift'),segmentTable, finalClusteringTable[:,bestClusteringID.astype(int)-1], showName, config['EXPERIMENT']['name'], config['PATH']['output'], config['EXTENSION']['output'])
        speakerSlice = getSegResultForPlot(
            config.getfloat('FEATURES', 'frameshift'), segmentTable,
            clustering)

    p = PlotDiar(map=speakerSlice,
                 wav=wav_path,
                 title='Binary key diarization: ' + wav_path +
                 ', number of speakers: ' + str(len(speakerSlice)),
                 gui=True,
                 pick=True,
                 size=(25, 6))

    wm = p.plot.get_current_fig_manager()
    wm.window.state('zoomed')

    p.draw()
    p.plot.show()

    if config.getint('OUTPUT', 'returnAllPartialSolutions'):
        if not os.path.isdir(config['PATH']['output']):
            os.mkdir(config['PATH']['output'])
        outputPathInd = config['PATH']['output'] + config['EXPERIMENT'][
            'name'] + '/' + showName + '/'
        if not os.path.isdir(config['PATH']['output'] +
                             config['EXPERIMENT']['name']):
            os.mkdir(config['PATH']['output'] + config['EXPERIMENT']['name'])
        if not os.path.isdir(outputPathInd):
            os.mkdir(outputPathInd)
        for i in np.arange(k):
            getSegmentationFile(
                config['OUTPUT']['format'],
                config.getfloat('FEATURES', 'frameshift'), segmentTable,
                finalClusteringTable[:, i], showName, showName + '_' +
                str(np.size(np.unique(finalClusteringTable[:, i]), 0)) +
                '_spk', outputPathInd, config['EXTENSION']['output'])

    print('\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
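runDiarization() pulls all of its settings from a configparser-style config object. A minimal sketch of such a config is shown below; the sections and keys mirror the lookups made above, but every value is an illustrative assumption and would need tuning for a real setup.

import configparser

config = configparser.ConfigParser()
config.read_dict({
    'PATH': {'audio': './audio/', 'features': './features/', 'UEM': './uem/',
             'SAD': './sad/', 'output': './out/'},
    'EXTENSION': {'audio': '.wav', 'features': '.fea', 'UEM': '.uem',
                  'SAD': '.lab', 'output': '.rttm'},
    'GENERAL': {'performFeatureExtraction': '1', 'performVAD': '1',
                'SADformat': 'LBL'},
    'EXPERIMENT': {'name': 'demo'},
    'FEATURES': {'framelength': '0.025', 'frameshift': '0.01',
                 'nfilters': '30', 'ncoeff': '30'},
    'SEGMENT': {'length': '100', 'increment': '100', 'rate': '100'},
    'KBM': {'windowLength': '200', 'minimumNumberOfInitialGaussians': '1024',
            'maximumKBMWindowRate': '50', 'useRelativeKBMsize': '1',
            'relKBMsize': '0.3', 'kbmSize': '320'},
    'BINARY_KEY': {'topGaussiansPerFrame': '5', 'bitsPerSegmentFactor': '0.2'},
    'CLUSTERING': {'N_init': '16', 'linkage': '0',
                   'linkageCriterion': 'average', 'metric': 'cosine'},
    'CLUSTERING_SELECTION': {'bestClusteringCriterion': 'elbow',
                             'metric_clusteringSelection': 'cosine',
                             'sigma': '1', 'percentile': '80',
                             'maxNrSpeakers': '10'},
    'RESEGMENTATION': {'resegmentation': '1', 'modelSize': '6',
                       'nbIter': '10', 'smoothWin': '100'},
    'OUTPUT': {'format': 'rttm', 'returnAllPartialSolutions': '0'},
})
runDiarization('example_show', config)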
from viewer import PlotDiar

# speakerSlice maps each speaker label to a list of segments; times are in milliseconds.
speakerSlice = {'1': [{'start': 175, 'stop': 200}, {'start': 30, 'stop': 120}],
                '2': [{'start': 90, 'stop': 130}]}
p = PlotDiar(map=speakerSlice, wav=r'example.wav', gui=True, size=(25, 6))
p.draw()
p.plot.show()
Example #4
def main(wav_path,
         embedding_per_second=1.0,
         n_classes=5994,
         overlap_rate=0.5,
         plot_results=True):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)
    #

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))
    # print('mapTable, keys', mapTable, keys)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        # print('v',v.shape)
        #print('feats', feats.shape)

        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)
    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

                print('i,s,e')
                print(i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(
                start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
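Unlike the other variants, this main() returns its intermediate results, so it can be driven from another script. A small sketch of consuming the return value (the argument values are arbitrary):

feats, labels, intervals, assignments, spec_rate_ms = main(
    r'wavs/example.wav',
    embedding_per_second=1.2,
    overlap_rate=0.4,
    plot_results=False)

# Each assignment is (formatted start, formatted stop, speaker label, wav path).
for start, end, speaker, path in assignments:
    print(f'{speaker}: {start} ==> {end} ({path})')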
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)

    #specs1,interval1 = load_data(r'wavs/REC20190716140159.wav', embedding_per_second=1.2, overlap_rate=0.4)
    #mapTable1,keys1 =genMap(interval1)
    mapTable, keys = genMap(intervals)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]


# =============================================================================
#     for spec1 in specs1:
#         spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
#         v = network_eval.predict(spec1)
#         feats += [v]
# =============================================================================
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    #print(len(feats),'00000000')
    #predicted_label = uisrnnModel.predict(feats, inference_args)

    # silhouette score
    # =============================================================================
    #     sli=[]
    #     fromsel=[]
    #     li=[]
    #     knum=[]
    #     for i in range(10):
    #         li=[]
    #         range_n_clusters = list (range(2,5))
    #         for n_clusters in range_n_clusters:
    #             clusterer = KMeans(n_clusters=n_clusters)
    #             preds = clusterer.fit_predict(feats)
    #             centers = clusterer.cluster_centers_
    #
    #             score = silhouette_score (feats, preds, metric='euclidean')
    #             print ("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))
    #             li.append([n_clusters,score,clusterer,centers])
    #     # =============================================================================
    #     #     print([float(str(i[1])[:4]) for i in li])
    #     #     kvalue=(max([float(str(i[1])[:4]) for i in li]))
    #     #     for i in range(len(li)):
    #     #         if kvalue==float(str(li[i][1])[:4]):
    #     #             true_k=li[i][0]
    #     #             break
    #     # =============================================================================
    #         maxi=li[0][1]
    #         for i in range(1,len(li)):
    #             if li[i][1]-maxi>=0.005:
    #                 maxi=li[i][1]
    #         for i in li:
    #             if i[1]==maxi:
    #                 true_k=i[0]
    #     # =============================================================================
    #     #     maxi=max([i[1] for i in li])
    #     #     for i in li:
    #     #         if i[1]==maxi:
    #     #             true_k=i[0]
    #     # =============================================================================
    #         fromsel.append(li[true_k-2])
    #         print(true_k)
    #         knum.append(true_k)
    #     kval=(max(set(knum), key=knum.count))
    #     print(kval)
    # =============================================================================
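    # Spectral clustering replaces UIS-RNN in this variant. As assumed from the
    # spectralcluster package's refinement options: p_percentile sets the
    # row-wise thresholding of the affinity matrix and gaussian_blur_sigma the
    # blur applied before thresholding, while min_clusters/max_clusters bound
    # the number of speakers; the values below are this example's own choices,
    # not universal defaults.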

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    # =============================================================================
    #     clusters = KMeans(n_clusters=40, init='k-means++', max_iter=100, n_init=1, random_state = 0)
    #     clusters.fit(feats)
    #     tsne = TSNEVisualizer()
    #     tsne.fit(feats, ["c{}".format(c) for c in clusters.labels_])
    #     tsne.poof()
    # =============================================================================

    global no_speakers
    no_speakers = len(set(predicted_label))
    #print(predicted_label,'**************************')
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
Example #6
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         retain_audio_clip=False):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # milliseconds between successive speaker embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # map times back to the original wav (which includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            get_transcript(str(spk), s, e)

    result = print_transcipt()
    try:
        # Open the transcript file once; the context manager closes it even if
        # writing fails part-way through.
        with open(os.path.join(dir_name, 'FinalTranscript.txt'), 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = f"{start} ==> {end}: [Speaker : {item[0]}] : {item[3]}"
                print(transcription)
                file.write(transcription + '\n')
    except Exception as exp:
        print(f"Failed in main() while writing to file with exception {exp}")

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(
            f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()

    return result