Beispiel #1
0
 def test_length_one_seq(self, numPairs=100):
     """Check DTW between two single-element sequences.

     For each of `numPairs` random scalar pairs, the cost returned by
     `dtw.dtw` must be close to the absolute difference of the two values
     and the returned path must be exactly `[(0, 0)]`.
     """
     for _ in range(numPairs):
         a, b = randn(), randn()
         expectedCost = abs(a - b)
         cost, warpPath = dtw.dtw([a], [b], eucCost)
         assert_allclose(cost, expectedCost)
         assert warpPath == [(0, 0)]
Beispiel #2
0
 def test_one_length_one_seq(self, numPairs=100):
     """Check DTW of a random sequence against a single-element sequence.

     For each of `numPairs` trials the expected cost is the sum of absolute
     differences between every element of the sequence and the single
     value, and the expected path pairs every index with index 0.
     """
     for _ in range(numPairs):
         seq = randn(randint(1, 10))
         point = randn()
         cost, warpPath = dtw.dtw(seq, [point], eucCost)
         expectedCost = sum(abs(elem - point) for elem in seq)
         expectedPath = [(idx, 0) for idx, _elem in enumerate(seq)]
         assert_allclose(cost, expectedCost)
         assert warpPath == expectedPath
Beispiel #3
0
 def test_one_length_one_seq(self, numPairs=100):
     """Align a random-length sequence with a one-element sequence.

     Runs `numPairs` trials. In each, every element of the longer sequence
     must be matched to the single element: the cost is checked against the
     summed absolute differences and the path against `(i, 0)` for every
     index `i`.
     """
     for _ in range(numPairs):
         values = randn(randint(1, 10))
         single = randn()
         observedCost, observedPath = dtw.dtw(values, [single], eucCost)
         distances = (abs(value - single) for value in values)
         assert_allclose(observedCost, sum(distances))
         assert observedPath == [(i, 0) for i, _value in enumerate(values)]
Beispiel #4
0
 def test_length_one_seq(self, numPairs=100):
     """Align two one-element sequences and check cost and path.

     Repeated `numPairs` times with fresh random scalars: the cost must be
     close to `abs(x - y)` and the only path node must be `(0, 0)`.
     """
     onlyPath = [(0, 0)]
     for _ in range(numPairs):
         first = randn()
         second = randn()
         observedCost, observedPath = dtw.dtw([first], [second], eucCost)
         assert_allclose(observedCost, abs(first - second))
         assert observedPath == onlyPath
Beispiel #5
0
    def test_universal_properties(self, numPairs=100, numPathsPerPair=20):
        """Check several properties of `dtw.dtw` on random sequence pairs.

        For each of `numPairs` random pairs this checks that:
        the returned cost matches the cost accumulated along the returned
        path; swapping the inputs gives the same cost and the swapped path;
        the path is valid and ends at the last index pair; both pieces of
        the path split at a random node are optimal for the corresponding
        sub-sequences; the cost does not exceed that of `numPathsPerPair`
        random paths; and the cost of a sequence against itself is zero.
        """
        for _pair in range(numPairs):
            if randBool():
                dim = randint(0, 3)
            else:
                dim = randint(0, 10)
            xs = randSeq(dim=dim, minLength=1)
            ys = randSeq(dim=dim, minLength=1)
            minCost, path = dtw.dtw(xs, ys, eucCost)

            # the reported minimum must equal the cost summed along the path
            assert_allclose(minCost, getPathCost(path, xs, ys, eucCost))

            # swapping the two sequences
            swappedCost, swappedPath = dtw.dtw(ys, xs, eucCost)
            assert_allclose(swappedCost, minCost)
            # N.B. path equality is not a universal property but will almost
            #   always hold for the random float sequences generated here.
            assert swappedPath == dtw.swapPath(path)

            # the path must be valid and finish at the final index pair
            assert dtw.isValidPath(path)
            lastNode = (len(xs) - 1, len(ys) - 1)
            assert path[-1] == lastNode

            # optimal subpaths: cut the path at a random node (shared by both
            # pieces) and compare each piece with a fresh DTW run
            cutIndex = randint(len(path))
            iCut, jCut = path[cutIndex]
            headPath = path[:cutIndex + 1]
            tailPath = path[cutIndex:]
            headCost = getPathCost(headPath, xs, ys, eucCost)
            tailCost = getPathCost(tailPath, xs, ys, eucCost)
            headMinCost, _ = dtw.dtw(xs[:iCut + 1], ys[:jCut + 1], eucCost)
            tailMinCost, _ = dtw.dtw(xs[iCut:], ys[jCut:], eucCost)
            assert_allclose(headCost, headMinCost)
            assert_allclose(tailCost, tailMinCost)

            # minCost must not exceed the cost of randomly generated paths
            childrenDict, startNode = getDtwDag(len(xs), len(ys))
            for _ in range(numPathsPerPair):
                randomPath = getRandomDagPath(childrenDict, startNode)
                randomCost = getPathCost(randomPath, xs, ys, eucCost)
                assert minCost <= randomCost or np.allclose(minCost, randomCost)

            # a sequence aligned with itself must cost exactly zero
            selfCost = dtw.dtw(xs, xs, eucCost)[0]
            assert selfCost == 0.0
Beispiel #6
0
    def test_universal_properties(self, numPairs=100, numPathsPerPair=20):
        """Exercise properties `dtw.dtw` should satisfy on random inputs.

        `numPairs` random sequence pairs are generated. For each pair the
        test checks cost/path agreement, behaviour under swapping the two
        inputs, path validity and end point, optimality of both pieces of
        the path around a random cut, a comparison against
        `numPathsPerPair` random paths, and zero self-distance.
        """
        for _pair in range(numPairs):
            if randBool():
                dim = randint(0, 3)
            else:
                dim = randint(0, 10)
            xs = randSeq(dim=dim, minLength=1)
            ys = randSeq(dim=dim, minLength=1)
            bestCost, bestPath = dtw.dtw(xs, ys, eucCost)

            # cost along the returned path agrees with the returned cost
            assert_allclose(bestCost, getPathCost(bestPath, xs, ys, eucCost))

            # transpose: same cost, mirrored path
            transposedCost, transposedPath = dtw.dtw(ys, xs, eucCost)
            assert_allclose(transposedCost, bestCost)
            # N.B. the path comparison is not a universal property but will
            #   almost always be true for the random floats generated here.
            assert transposedPath == dtw.swapPath(bestPath)

            # returned path is valid and reaches the last index pair
            assert dtw.isValidPath(bestPath)
            endNode = (len(xs) - 1, len(ys) - 1)
            assert bestPath[-1] == endNode

            # optimal subpaths property; the cut node belongs to both pieces
            cutIndex = randint(len(bestPath))
            iCut, jCut = bestPath[cutIndex]
            prefixPath = bestPath[:cutIndex + 1]
            suffixPath = bestPath[cutIndex:]
            prefixCost = getPathCost(prefixPath, xs, ys, eucCost)
            suffixCost = getPathCost(suffixPath, xs, ys, eucCost)
            prefixBest, _ = dtw.dtw(xs[:iCut + 1], ys[:jCut + 1], eucCost)
            suffixBest, _ = dtw.dtw(xs[iCut:], ys[jCut:], eucCost)
            assert_allclose(prefixCost, prefixBest)
            assert_allclose(suffixCost, suffixBest)

            # bestCost <= cost of several randomly generated paths
            childrenDict, startNode = getDtwDag(len(xs), len(ys))
            for _ in range(numPathsPerPair):
                candidatePath = getRandomDagPath(childrenDict, startNode)
                candidateCost = getPathCost(candidatePath, xs, ys, eucCost)
                assert (bestCost <= candidateCost
                        or np.allclose(bestCost, candidateCost))

            # distance from a sequence to itself should be zero
            selfCost = dtw.dtw(xs, xs, eucCost)[0]
            assert selfCost == 0.0
def compute_dtw_error(references, predictions, distance=mt.logSpecDbDist):
    """Return the summed DTW cost divided by the summed reference length.

    Each reference/prediction pair is cast to float64 and aligned with
    `dtw.dtw` using `distance`. The result is printed and returned.

    :param references: iterable of arrays (must support `astype` and `len`)
    :param predictions: iterable of arrays, paired with `references` by zip
    :param distance: cost function handed to `dtw.dtw`
    :return: total minimum cost divided by total number of reference frames
    """
    total_cost, total_frames = 0.0, 0
    for reference, prediction in tqdm(zip(references, predictions)):
        reference = reference.astype('float64')
        prediction = prediction.astype('float64')
        pair_cost, _ = dtw.dtw(reference, prediction, distance)
        total_cost += pair_cost
        total_frames += len(reference)

    mean_score = total_cost / total_frames
    summary = 'overall score = %f (%s frames nat/synth)' % (mean_score,
                                                            total_frames)
    print(summary)
    return mean_score
Beispiel #8
0
    def test_brute_force_small(self, numPairs=100):
        """Compare `dtw.dtw` with an exhaustive search over all DAG paths.

        For each of `numPairs` random pairs (generated with
        `ensureShort=True`), the minimum cost over every path yielded by
        `getDagPathIterator` must be close to the cost from `dtw.dtw`.
        """
        for _ in range(numPairs):
            if randBool():
                dim = randint(0, 3)
            else:
                dim = randint(0, 10)
            xs = randSeq(dim=dim, minLength=1, ensureShort=True)
            ys = randSeq(dim=dim, minLength=1, ensureShort=True)

            childrenDict, startNode = getDtwDag(len(xs), len(ys))
            allPaths = getDagPathIterator(childrenDict, startNode)
            bruteForceCost = min(
                getPathCost(candidate, xs, ys, eucCost)
                for candidate, _extra in allPaths
            )

            dtwCost, _ = dtw.dtw(xs, ys, eucCost)
            assert_allclose(dtwCost, bruteForceCost)
Beispiel #9
0
    def test_brute_force_small(self, numPairs=100):
        """Check `dtw.dtw` against the cheapest of all enumerated paths.

        Runs `numPairs` trials on random sequences requested with
        `ensureShort=True`. Every path produced by `getDagPathIterator` is
        costed and the smallest cost is used as the reference value.
        """
        for _ in range(numPairs):
            dim = randint(0, 3) if randBool() else randint(0, 10)
            xs = randSeq(dim=dim, minLength=1, ensureShort=True)
            ys = randSeq(dim=dim, minLength=1, ensureShort=True)

            childrenDict, startNode = getDtwDag(len(xs), len(ys))
            pathCosts = []
            for candidatePath, _extra in getDagPathIterator(childrenDict,
                                                            startNode):
                pathCosts.append(getPathCost(candidatePath, xs, ys, eucCost))
            referenceCost = min(pathCosts)

            observedCost, _ = dtw.dtw(xs, ys, eucCost)
            assert_allclose(observedCost, referenceCost)
Beispiel #10
0
def mel_cep_dtw_dist(target, converted):
    """
    Compute the DTW cost per target frame over paired feature sequences
    :param target: iterable of reference arrays (each supports `astype`/`len`)
    :param converted: iterable of synthesized arrays, paired with `target`
    :return: summed `mt.logSpecDbDist` DTW cost divided by the summed length
        of the target arrays (presumably in dB, going by the cost function's
        name -- confirm against the `mt` module)
    """
    cost_sum = 0
    frame_sum = 0
    for reference, synthesized in zip(target, converted):
        reference = reference.astype('float64')
        synthesized = synthesized.astype('float64')
        pair_cost, _path = dtw.dtw(reference, synthesized, mt.logSpecDbDist)
        cost_sum += pair_cost
        frame_sum += len(reference)

    return cost_sum / frame_sum
def dtw_mcd(cvt, trg):
    """Return the DTW cost between two feature arrays per target frame.

    Both inputs are cast to float64 and their first column is dropped
    before alignment (presumably the 0th cepstral coefficient -- confirm
    against callers). The target is passed to `dtw.dtw` as the first
    sequence, with `mt.logSpecDbDist` as the cost function.

    :param cvt: converted features, a 2-D array supporting `astype`
    :param trg: target features, a 2-D array supporting `astype`
    :return: minimum DTW cost divided by the number of target rows
    """
    cvt_mcep = cvt.astype('float64')[:, 1:]
    trg_mcep = trg.astype('float64')[:, 1:]

    cost, _ = dtw.dtw(trg_mcep, cvt_mcep, mt.logSpecDbDist)
    return cost / len(trg_mcep)
Beispiel #12
0
def main(rawArgs):
    """Warp synthetic speech parameter files to match natural timing.

    Parses `rawArgs` (argv-style; element 0 is skipped). For every utterance
    id, the first parameter stream of the natural and synthetic versions is
    aligned with DTW using `mt.logSpecDbDist`, ignoring column 0. The
    resulting index sequence is then applied to every synthetic parameter
    stream and the warped streams are written to OUTDIR. Per-utterance and
    overall MCD figures are printed.

    Exits through `parser.error` (SystemExit, status 2) when `--exts` and
    `--param_orders` do not have the same number of entries.
    """
    parser = argparse.ArgumentParser(
        description=(
            'Time-warps a speech parameter sequence based on a reference.'
            ' Dynamic time warping (DTW) is used to compute the time warping.'
            ' By default a speech parameter sequence consisting of three'
            ' portions (mgc,lf0,bap) is warped to match the timing of the'
            ' reference speech parameter sequence.'
            ' Only the first portion is used when computing the warping.'),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--exts',
        dest='exts',
        default='mgc,lf0,bap',
        metavar='EXTLIST',
        help=('file extensions added to uttId to get file containing speech'
              ' parameters'))
    parser.add_argument('--param_orders',
                        dest='paramOrders',
                        default='40,1,5',
                        metavar='ORDERLIST',
                        help='orders of the parameter files (mgc,lf0,bap)')
    parser.add_argument(dest='natDir',
                        metavar='NATDIR',
                        help='directory containing natural speech parameters')
    parser.add_argument(
        dest='synthDir',
        metavar='SYNTHDIR',
        help='directory containing synthetic speech parameters')
    parser.add_argument(dest='outDir',
                        metavar='OUTDIR',
                        help='directory to output warped speech parameters to')
    parser.add_argument(dest='uttIds',
                        metavar='UTTID',
                        nargs='+',
                        help='utterance ids (ext will be appended to these)')
    args = parser.parse_args(rawArgs[1:])

    # str.split always yields at least one item, so both lists are non-empty
    # (an empty order string makes int() raise ValueError).
    paramOrders = [
        int(paramOrderStr) for paramOrderStr in args.paramOrders.split(',')
    ]
    exts = args.exts.split(',')
    # Validate with parser.error rather than assert: asserts are stripped
    # under `python -O`, and the zip() over these lists further down would
    # then silently skip the unmatched entries.
    if len(exts) != len(paramOrders):
        parser.error(
            '--exts has %d entries but --param_orders has %d; they must match'
            % (len(exts), len(paramOrders)))
    mgcParamOrder = paramOrders[0]
    mgcExt = exts[0]

    costFn = mt.logSpecDbDist

    vecSeqIo = vsio.VecSeqIo(mgcParamOrder)
    getNatVecSeq = DirReader(vecSeqIo, args.natDir, mgcExt)
    getSynthVecSeq = DirReader(vecSeqIo, args.synthDir, mgcExt)

    minCostTot = 0.0
    framesTot = 0
    for uttId in args.uttIds:
        print('processing', uttId)
        nat = getNatVecSeq(uttId)
        synth = getSynthVecSeq(uttId)
        # ignore 0th cepstral component
        nat = nat[:, 1:]
        synth = synth[:, 1:]

        minCost, path = dtw.dtw(nat, synth, costFn)
        frames = len(nat)

        minCostTot += minCost
        framesTot += frames

        print('MCD = %f (%d frames)' % (minCost / frames, frames))

        # Turn the alignment path into one synthetic frame index per natural
        # frame, using the per-node costs along the path.
        pathCosts = [costFn(nat[i], synth[j]) for i, j in path]
        synthIndexSeq = dtw.projectPathBestCost(path, pathCosts)
        assert len(synthIndexSeq) == len(nat)

        uniqueFrames = len(set(synthIndexSeq))
        repeatedFrames = len(synthIndexSeq) - uniqueFrames
        droppedFrames = len(synth) - uniqueFrames
        assert len(synth) - droppedFrames + repeatedFrames == len(nat)
        print('warping %s frames -> %s frames (%s repeated, %s dropped)' %
              (len(synth), len(nat), repeatedFrames, droppedFrames))
        print()

        # Apply the same index sequence to every parameter stream, each read
        # and written with its own parameter order.
        for paramOrder, ext in zip(paramOrders, exts):
            vecSeqIo = vsio.VecSeqIo(paramOrder)

            synthFullFile = os.path.join(args.synthDir, uttId + '.' + ext)
            synthFull = vecSeqIo.readFile(synthFullFile)

            synthFullWarped = dtw.warpGeneral(synthFull, synthIndexSeq)

            synthFullWarpedFile = os.path.join(args.outDir, uttId + '.' + ext)
            vecSeqIo.writeFile(synthFullWarpedFile, synthFullWarped)

    print('overall MCD = %f (%d frames)' % (minCostTot / framesTot, framesTot))
Beispiel #13
0
def measure(output_directory, log_directory, checkpoint_path, warm_start,
            n_gpus, rank, group_name, hparams):
    """Handles all the validation scoring and printing.

    For every entry of the validation file list the utterance is synthesised
    three times with `mellotron.inference_noattention`:
    with the loaded mel and `with_tpse=False`, with the loaded mel and
    `with_tpse=True`, and with a random integer from `[0, 9)` in place of the
    mel and `with_tpse=False`. Each result is vocoded and denoised, then
    compared with the padded original audio via `dtw(..., eucCepDist)`. The
    log-scaled costs and their running mean/SD are printed after every file.

    Only `checkpoint_path` and `hparams` are read; the remaining parameters
    are accepted but unused here. Nothing is returned.
    """
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    # NOTE(review): machine-specific absolute path; consider making this
    # configurable.
    waveglow_path = '/media/arsh/New Volume/Models/speech/waveglow_256channels_v4.pt'
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()

    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Speaker-id mapping is taken from the training file list.
    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt",
        hparams).speaker_ids
    # Raw string: '\|' is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python',
                           header=None,
                           comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    # Speakers missing from the mapping get -1 and are filtered out below.
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())

    file_idx = 0
    MEL_DTW = []
    TPP_DTW = []
    RAND_DTW = []
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
    while file_idx < len(dataloader):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader[file_idx][3][None].cuda()
        mel = load_mel(audio_path, stft)
        _, audio = read(audio_path)

        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, _ = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2; the other five
            # outputs of the forward pass are not needed here
            _, _, _, rhythm, _, _ = mellotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)
        # pick a female or male target speaker at random
        speaker_id = next(female_speakers) if np.random.randint(2) else next(
            male_speakers)
        speaker_id = torch.LongTensor([speaker_id]).cuda()

        # 1) loaded mel, with_tpse=False
        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=False)
            audio_mel = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        # 2) loaded mel, with_tpse=True
        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=True)
            audio_tpp = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        # 3) a random integer in [0, 9) replaces the mel, with_tpse=False
        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = mellotron.inference_noattention(
                (text_encoded, np.random.randint(
                    0, 9), speaker_id, pitch_contour, rhythm),
                with_tpse=False)
            audio_rand = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
        # pad the original audio with 128 samples on each side before comparing
        audio = np.pad(audio, 128)

        MEL_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_mel.data.cpu().numpy(), audio, eucCepDist)[0]))
        TPP_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_tpp.data.cpu().numpy(), audio, eucCepDist)[0]))
        RAND_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_rand.data.cpu().numpy(), audio, eucCepDist)[0]))
        print(MEL_DTW[-1], TPP_DTW[-1], RAND_DTW[-1])
        print("MEL DTW, Mean: ", np.mean(MEL_DTW), " SD: ", np.std(MEL_DTW))
        print("TPP DTW, Mean: ", np.mean(TPP_DTW), " SD: ", np.std(TPP_DTW))
        print("RAND DTW, Mean: ", np.mean(RAND_DTW), " SD: ", np.std(RAND_DTW))
        file_idx += 1
Beispiel #14
0
def MCD(sp1, sp2, costFn):
    """Align `sp1` with `sp2` via `dtw.dtw` and report the result.

    :param sp1: first sequence; its length is reported as the frame count
    :param sp2: second sequence
    :param costFn: cost function handed to `dtw.dtw`
    :return: tuple `(min_cost, len(sp1), path)`
    """
    alignment = dtw.dtw(sp1, sp2, costFn)
    min_cost, path = alignment
    return min_cost, len(sp1), path
Beispiel #15
0
def main(rawArgs):
    """Print the overall MCD DTW figure for a set of utterances.

    Parses `rawArgs` (argv-style; element 0 is skipped). For each utterance
    id the natural and synthetic sequences are read, column 0 is dropped,
    and the pair is aligned with `dtw.dtw` using `mt.logSpecDbDist`. The
    summed cost divided by the summed natural frame count is printed.
    """
    description = (
        'Computes the MCD DTW metric for two sequences of mel cepstra.'
        ' Mel cepstral distortion (MCD) is a measure of the difference'
        ' between two sequences of mel cepstra.'
        ' This utility computes the MCD between two sequences allowing for'
        ' possible differences in timing.'
        ' Specifically it uses dynamic time warping (DTW) to compute the'
        ' minimum MCD that can be obtained by "aligning" the two sequences'
        ' subject to certain constraints on the form of the alignment.')
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # optional arguments
    extHelp = ('file extension added to uttId to get file containing speech'
               ' parameters')
    parser.add_argument('--ext', dest='ext', default='mgc', metavar='EXT',
                        help=extHelp)
    parser.add_argument('--param_order', dest='paramOrder', default=40,
                        type=int, metavar='ORDER',
                        help='parameter order of the cepstral files')

    # positional arguments
    parser.add_argument(dest='natDir', metavar='NATDIR',
                        help='directory containing natural speech parameters')
    parser.add_argument(dest='synthDir', metavar='SYNTHDIR',
                        help='directory containing synthetic speech parameters')
    parser.add_argument(dest='uttIds', metavar='UTTID', nargs='+',
                        help='utterance ids (ext will be appended to these)')

    args = parser.parse_args(rawArgs[1:])

    costFn = mt.logSpecDbDist

    vecSeqIo = vsio.VecSeqIo(args.paramOrder)
    readNat = DirReader(vecSeqIo, args.natDir, args.ext)
    readSynth = DirReader(vecSeqIo, args.synthDir, args.ext)

    totalCost = 0.0
    totalFrames = 0
    for uttId in args.uttIds:
        print('processing', uttId)
        nat, synth = readNat(uttId), readSynth(uttId)
        # ignore 0th cepstral component
        nat, synth = nat[:, 1:], synth[:, 1:]

        uttCost, _ = dtw.dtw(nat, synth, costFn)
        totalCost += uttCost
        totalFrames += len(nat)

    print('overall MCD = %f (%d frames)' % (totalCost / totalFrames,
                                            totalFrames))