def test_length_one_seq(self, numPairs=100):
    """Check DTW on pairs of single-element sequences.

    For each of ``numPairs`` randomly drawn scalar pairs, the minimum cost
    reported by ``dtw.dtw`` must match ``abs(x - y)`` and the returned path
    must be the single step ``[(0, 0)]``.
    """
    for _ in range(numPairs):
        first, second = randn(), randn()
        cost, alignment = dtw.dtw([first], [second], eucCost)
        # With one element on each side there is only one possible path.
        assert_allclose(cost, abs(first - second))
        assert alignment == [(0, 0)]
def test_one_length_one_seq(self, numPairs=100):
    """Check DTW when the second sequence has exactly one element.

    Every element of the first sequence has to be matched to the single
    element of the second, so the expected cost is the sum of the
    individual absolute differences and the expected path walks down
    column 0.
    """
    for _ in range(numPairs):
        seqLength = randint(1, 10)
        xs = randn(seqLength)
        y = randn()
        minCost, path = dtw.dtw(xs, [y], eucCost)

        expectedCost = sum(abs(x - y) for x in xs)
        assert_allclose(minCost, expectedCost)

        expectedPath = [(i, 0) for i, _x in enumerate(xs)]
        assert path == expectedPath
def test_one_length_one_seq(self, numPairs=100):
    """Check DTW of a random sequence against a one-element sequence.

    The only valid alignment pairs each index ``i`` of the longer sequence
    with index 0 of the other, so both the cost and the path are known in
    closed form and are compared against the output of ``dtw.dtw``.
    """
    for _ in range(numPairs):
        longSeq = randn(randint(1, 10))
        single = randn()
        cost, alignment = dtw.dtw(longSeq, [single], eucCost)

        # Accumulate the per-element distances in sequence order.
        perElement = [abs(value - single) for value in longSeq]
        assert_allclose(cost, sum(perElement))

        # Each row index is matched to column 0.
        rowIndices = range(len(longSeq))
        assert alignment == [(row, 0) for row in rowIndices]
def test_universal_properties(self, numPairs=100, numPathsPerPair=20):
    """Check properties expected of DTW on random sequence pairs.

    For each of ``numPairs`` random pairs this verifies that:

    * the reported minimum cost equals the cost summed along the path;
    * swapping the two sequences gives the same cost (and, for these random
      inputs, the swapped path);
    * the path is valid and ends at the last index of both sequences;
    * both halves of the path, split at a random node, are themselves
      optimal for the corresponding sub-sequences;
    * ``numPathsPerPair`` random paths through the DTW DAG never cost less
      than the reported minimum;
    * aligning a sequence with itself costs exactly zero.
    """
    for _ in range(numPairs):
        # randBool() is drawn first, then exactly one randint call.
        if randBool():
            dim = randint(0, 3)
        else:
            dim = randint(0, 10)
        xs = randSeq(dim=dim, minLength=1)
        ys = randSeq(dim=dim, minLength=1)
        minCost, path = dtw.dtw(xs, ys, eucCost)

        # test cost along path agrees with minimum cost
        costAlongPath = getPathCost(path, xs, ys, eucCost)
        assert_allclose(minCost, costAlongPath)

        # test transpose
        swappedCost, swappedPath = dtw.dtw(ys, xs, eucCost)
        assert_allclose(swappedCost, minCost)
        # N.B. this is not a universal property but will almost always be
        #   true for the random sequences of floats that we generate.
        assert swappedPath == dtw.swapPath(path)

        # test path is a valid path
        assert dtw.isValidPath(path)
        finalNode = (len(xs) - 1, len(ys) - 1)
        assert path[-1] == finalNode

        # test optimal subpaths property
        cutIndex = randint(len(path))
        iCut, jCut = path[cutIndex]
        # The cut node belongs to both halves.
        headPath = path[:cutIndex + 1]
        tailPath = path[cutIndex:]
        headCost = getPathCost(headPath, xs, ys, eucCost)
        tailCost = getPathCost(tailPath, xs, ys, eucCost)
        headMinCost, _ = dtw.dtw(xs[:iCut + 1], ys[:jCut + 1], eucCost)
        tailMinCost, _ = dtw.dtw(xs[iCut:], ys[jCut:], eucCost)
        assert_allclose(headCost, headMinCost)
        assert_allclose(tailCost, tailMinCost)

        # test minCost <= cost for several randomly generated paths
        childrenDict, startNode = getDtwDag(len(xs), len(ys))
        for _trial in range(numPathsPerPair):
            randomPath = getRandomDagPath(childrenDict, startNode)
            randomCost = getPathCost(randomPath, xs, ys, eucCost)
            # Allow for floating-point noise when the random path is optimal.
            assert minCost <= randomCost or np.allclose(minCost, randomCost)

        # minCost to itself should be zero
        selfCost = dtw.dtw(xs, xs, eucCost)[0]
        assert selfCost == 0.0
def compute_dtw_error(references, predictions, distance=mt.logSpecDbDist):
    """Return the DTW cost per reference frame over paired sequences.

    Each ``(reference, prediction)`` pair is cast to ``float64`` and aligned
    with ``dtw.dtw`` using ``distance`` as the frame-level cost.  The costs
    are summed over all pairs and divided by the total number of reference
    frames.  The result is also printed.

    Args:
        references: iterable of reference arrays (must support ``astype``
            and ``len``).
        predictions: iterable of predicted arrays, paired with
            ``references`` by position.
        distance: cost function handed to ``dtw.dtw``.

    Returns:
        The summed minimum cost divided by the summed reference length.

    Raises:
        ZeroDivisionError: if no pairs are supplied.
    """
    cost_sum = 0.0
    frame_count = 0
    paired = zip(references, predictions)
    for nat, synth in tqdm(paired):
        nat = nat.astype('float64')
        synth = synth.astype('float64')
        # Only the cost is needed here; the alignment path is discarded.
        pair_cost, _ = dtw.dtw(nat, synth, distance)
        cost_sum += pair_cost
        frame_count += len(nat)

    mean_score = cost_sum / frame_count
    summary = 'overall score = %f (%s frames nat/synth)' % (mean_score,
                                                           frame_count)
    print(summary)
    return mean_score
def test_brute_force_small(self, numPairs=100):
    """Compare ``dtw.dtw`` against exhaustive search on short sequences.

    For each random pair of short sequences, every path through the DTW DAG
    is enumerated and the cheapest one is taken as ground truth for the
    minimum cost returned by ``dtw.dtw``.
    """
    for _ in range(numPairs):
        # randBool() is drawn first, then exactly one randint call.
        if randBool():
            dim = randint(0, 3)
        else:
            dim = randint(0, 10)
        xs = randSeq(dim=dim, minLength=1, ensureShort=True)
        ys = randSeq(dim=dim, minLength=1, ensureShort=True)

        childrenDict, startNode = getDtwDag(len(xs), len(ys))
        # The iterator yields pairs; only the first item (the path) is used.
        bruteForceCost = min(
            getPathCost(candidate, xs, ys, eucCost)
            for candidate, _extra in getDagPathIterator(childrenDict,
                                                        startNode)
        )

        dtwCost, _ = dtw.dtw(xs, ys, eucCost)
        assert_allclose(dtwCost, bruteForceCost)
def mel_cep_dtw_dist(target, converted):
    """
    Compute the frame-averaged DTW cost over paired feature sequences.

    Items of ``target`` and ``converted`` are paired by position, cast to
    ``float64`` and aligned with ``dtw.dtw`` using ``mt.logSpecDbDist`` as
    the frame cost.

    :param target: iterable of reference arrays (each must support
        ``astype`` and ``len``)
    :param converted: iterable of synthesized arrays, paired with ``target``
    :return: summed DTW cost divided by the total number of target frames
        (presumably in dB, going by the cost function's name -- confirm)
    :raises ZeroDivisionError: if no pairs are supplied
    """
    cost_sum = 0
    frame_sum = 0
    for ref_seq, conv_seq in zip(target, converted):
        ref_seq = ref_seq.astype('float64')
        conv_seq = conv_seq.astype('float64')
        # The alignment path is not needed, only its cost.
        pair_cost, _ = dtw.dtw(ref_seq, conv_seq, mt.logSpecDbDist)
        cost_sum += pair_cost
        frame_sum += len(ref_seq)
    return cost_sum / frame_sum
def dtw_mcd(cvt, trg):
    """Return the DTW cost between two feature matrices per target frame.

    Both inputs are cast to ``float64`` and their first column is dropped
    (presumably the 0th cepstral coefficient -- confirm with the feature
    extractor) before alignment with ``dtw.dtw`` using
    ``mt.logSpecDbDist`` as the frame cost.

    Args:
        cvt: 2-D array of converted features; must support ``astype`` and
            ``[:, 1:]`` indexing.
        trg: 2-D array of target features, same requirements.

    Returns:
        The minimum DTW cost divided by the number of rows in ``trg``.
    """
    # Column 0 is excluded from the distance computation.
    cvt_mcep = cvt.astype('float64')[:, 1:]
    trg_mcep = trg.astype('float64')[:, 1:]

    # The target is the first argument, so the cost is normalized by the
    # target's frame count below.
    cost, _ = dtw.dtw(trg_mcep, cvt_mcep, mt.logSpecDbDist)
    return cost / len(trg_mcep)
def main(rawArgs):
    """Warp synthetic speech parameter files onto natural-speech timing.

    ``rawArgs[1:]`` is parsed as the command line.  For every utterance id,
    the first parameter stream (first entry of ``--exts`` / ``--param_orders``)
    is read from the natural and synthetic directories, column 0 is dropped,
    and DTW with ``mt.logSpecDbDist`` is used to find an alignment.  The
    alignment is projected to one synthetic frame index per natural frame,
    and every parameter stream of the synthetic utterance is then re-indexed
    with it and written to the output directory.  Per-utterance and overall
    MCD values are printed.
    """
    parser = argparse.ArgumentParser(
        description=(
            'Time-warps a speech parameter sequence based on a reference.'
            ' Dynamic time warping (DTW) is used to compute the time warping.'
            ' By default a speech parameter sequence consisting of three'
            ' portions (mgc,lf0,bap) is warped to match the timing of the'
            ' reference speech parameter sequence.'
            ' Only the first portion is used when computing the warping.'
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        '--exts',
        dest='exts',
        default='mgc,lf0,bap',
        metavar='EXTLIST',
        help=('file extensions added to uttId to get file containing speech'
              ' parameters'),
    )
    parser.add_argument(
        '--param_orders',
        dest='paramOrders',
        default='40,1,5',
        metavar='ORDERLIST',
        help='orders of the parameter files (mgc,lf0,bap)',
    )
    # The three directory positionals share the same shape; the order of
    # registration defines their order on the command line.
    dirArguments = (
        ('natDir', 'NATDIR',
         'directory containing natural speech parameters'),
        ('synthDir', 'SYNTHDIR',
         'directory containing synthetic speech parameters'),
        ('outDir', 'OUTDIR',
         'directory to output warped speech parameters to'),
    )
    for destName, metaName, helpText in dirArguments:
        parser.add_argument(dest=destName, metavar=metaName, help=helpText)
    parser.add_argument(
        dest='uttIds',
        metavar='UTTID',
        nargs='+',
        help='utterance ids (ext will be appended to these)',
    )
    args = parser.parse_args(rawArgs[1:])

    costFn = mt.logSpecDbDist

    paramOrders = [int(token) for token in args.paramOrders.split(',')]
    assert paramOrders
    exts = args.exts.split(',')
    assert len(exts) == len(paramOrders)

    # Only the first stream is used to compute the warping.
    mgcParamOrder, mgcExt = paramOrders[0], exts[0]
    mgcIo = vsio.VecSeqIo(mgcParamOrder)
    getNatVecSeq = DirReader(mgcIo, args.natDir, mgcExt)
    getSynthVecSeq = DirReader(mgcIo, args.synthDir, mgcExt)

    minCostTot = 0.0
    framesTot = 0
    for uttId in args.uttIds:
        print('processing', uttId)
        # ignore 0th cepstral component
        nat = getNatVecSeq(uttId)[:, 1:]
        synth = getSynthVecSeq(uttId)[:, 1:]

        minCost, path = dtw.dtw(nat, synth, costFn)
        frames = len(nat)
        minCostTot += minCost
        framesTot += frames

        print('MCD = %f (%d frames)' % (minCost / frames, frames))

        # Turn the alignment path into one synth index per natural frame.
        pathCosts = [costFn(nat[i], synth[j]) for i, j in path]
        synthIndexSeq = dtw.projectPathBestCost(path, pathCosts)
        assert len(synthIndexSeq) == len(nat)

        uniqueFrames = len(set(synthIndexSeq))
        repeatedFrames = len(synthIndexSeq) - uniqueFrames
        droppedFrames = len(synth) - uniqueFrames
        assert len(synth) - droppedFrames + repeatedFrames == len(nat)
        warpSummary = (
            'warping %s frames -> %s frames (%s repeated, %s dropped)'
            % (len(synth), len(nat), repeatedFrames, droppedFrames)
        )
        print(warpSummary)
        print()

        # Apply the same index sequence to every stream of this utterance.
        for paramOrder, ext in zip(paramOrders, exts):
            streamIo = vsio.VecSeqIo(paramOrder)
            fileName = uttId + '.' + ext

            synthFull = streamIo.readFile(
                os.path.join(args.synthDir, fileName))
            synthFullWarped = dtw.warpGeneral(synthFull, synthIndexSeq)
            streamIo.writeFile(
                os.path.join(args.outDir, fileName), synthFullWarped)

    print('overall MCD = %f (%d frames)' %
          (minCostTot / framesTot, framesTot))
def _synthesize_audio(mellotron, waveglow, denoiser, inputs, with_tpse):
    """Run no-attention Mellotron inference and vocode the result to audio.

    Only the post-net mel output of ``inference_noattention`` is used; it is
    passed through ``waveglow.infer`` (sigma 0.8) and the denoiser
    (strength 0.01), and channel 0 of the result is returned.
    """
    with torch.no_grad():
        _, mel_postnet, _, _ = mellotron.inference_noattention(
            inputs, with_tpse=with_tpse)
        return denoiser(waveglow.infer(mel_postnet, sigma=0.8), 0.01)[:, 0]


def _log_dtw_score(synth_audio, ref_audio, scale):
    """Return ``scale * log(cost)`` for the DTW cost of synth vs. reference."""
    cost = dtw(synth_audio.data.cpu().numpy(), ref_audio, eucCepDist)[0]
    return scale * np.log(cost)


def measure(output_directory, log_directory, checkpoint_path, warm_start,
            n_gpus, rank, group_name, hparams):
    """Score three synthesis variants against the reference audio with DTW.

    For every entry of the validation file list, the rhythm (alignment) is
    extracted with a teacher-forced forward pass, a random female or male
    speaker is picked, and audio is synthesized three ways:

    * reference mel as style input, ``with_tpse=False``  -> ``MEL_DTW``
    * reference mel as style input, ``with_tpse=True``   -> ``TPP_DTW``
    * a random integer as style input, ``with_tpse=False`` -> ``RAND_DTW``

    Each result is compared with the (zero-padded) reference waveform via
    ``dtw`` with ``eucCepDist``; the per-file scores and the running
    mean / standard deviation are printed after every file.

    Args:
        checkpoint_path: path of the Mellotron checkpoint to load.
        hparams: hyper-parameter object used for the STFT, model and loaders.
        output_directory, log_directory, warm_start, n_gpus, rank,
        group_name: accepted for interface compatibility; not used here.

    Note:
        Requires CUDA (all models and tensors are moved with ``.cuda()``)
        and relies on hard-coded file-list and WaveGlow checkpoint paths.
    """
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    # NOTE(review): machine-specific absolute path; consider making this
    # configurable.
    waveglow_path = '/media/arsh/New Volume/Models/speech/waveglow_256channels_v4.pt'
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()

    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Speaker-id mapping is taken from the training file list.
    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt",
        hparams).speaker_ids
    # Raw string: '\|' is not a valid escape in a normal string literal.
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python', header=None, comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    # Speakers absent from the mapping get -1 and are filtered out below.
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())

    MEL_DTW = []
    TPP_DTW = []
    RAND_DTW = []
    # Scale factor applied to the log of every DTW cost.
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)

    for file_idx in range(len(dataloader)):
        audio_path, text, _sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader[file_idx][3][None].cuda()
        mel = load_mel(audio_path, stft)
        _, audio = read(audio_path)

        # load source data to obtain rhythm using tacotron 2 as a forced
        # aligner
        x, _ = mellotron.parse_batch(datacollate([dataloader[file_idx]]))
        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            _, _, _, rhythm, _, _ = mellotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)

        # Pick a female or male target speaker with equal probability.
        speaker_id = next(female_speakers) if np.random.randint(2) else next(
            male_speakers)
        speaker_id = torch.LongTensor([speaker_id]).cuda()

        audio_mel = _synthesize_audio(
            mellotron, waveglow, denoiser,
            (text_encoded, mel, speaker_id, pitch_contour, rhythm),
            with_tpse=False)
        audio_tpp = _synthesize_audio(
            mellotron, waveglow, denoiser,
            (text_encoded, mel, speaker_id, pitch_contour, rhythm),
            with_tpse=True)
        # NOTE(review): a random integer in [0, 9) replaces the reference
        # mel here -- presumably selecting a random style token; confirm
        # against inference_noattention.
        audio_rand = _synthesize_audio(
            mellotron, waveglow, denoiser,
            (text_encoded, np.random.randint(0, 9), speaker_id,
             pitch_contour, rhythm),
            with_tpse=False)

        # Pad the reference waveform by 128 samples on both ends.
        audio = np.pad(audio, 128)

        MEL_DTW.append(_log_dtw_score(audio_mel, audio, logSpecDbConst))
        TPP_DTW.append(_log_dtw_score(audio_tpp, audio, logSpecDbConst))
        RAND_DTW.append(_log_dtw_score(audio_rand, audio, logSpecDbConst))

        # Per-file scores followed by running statistics over all files so
        # far.
        print(MEL_DTW[-1], TPP_DTW[-1], RAND_DTW[-1])
        print("MEL DTW, Mean: ", np.mean(MEL_DTW), " SD: ", np.std(MEL_DTW))
        print("TPP DTW, Mean: ", np.mean(TPP_DTW), " SD: ", np.std(TPP_DTW))
        print("RAND DTW, Mean: ", np.mean(RAND_DTW), " SD: ",
              np.std(RAND_DTW))
def MCD(sp1, sp2, costFn):
    """Align two sequences with DTW and report cost, frame count and path.

    Args:
        sp1: first sequence; its length is reported as the frame count.
        sp2: second sequence.
        costFn: frame-level cost function handed to ``dtw.dtw``.

    Returns:
        A ``(min_cost, frames, path)`` tuple, where ``min_cost`` and
        ``path`` come from ``dtw.dtw`` and ``frames`` is ``len(sp1)``.
    """
    alignment = dtw.dtw(sp1, sp2, costFn)
    min_cost, path = alignment
    return min_cost, len(sp1), path
def main(rawArgs):
    """Print the overall DTW-aligned MCD between natural and synthetic speech.

    ``rawArgs[1:]`` is parsed as the command line.  For every utterance id
    the cepstral sequences are read from the natural and synthetic
    directories, column 0 is dropped, and the minimum DTW cost under
    ``mt.logSpecDbDist`` is accumulated.  The total cost divided by the
    total number of natural frames is printed at the end.
    """
    parser = argparse.ArgumentParser(
        description=(
            'Computes the MCD DTW metric for two sequences of mel cepstra.'
            ' Mel cepstral distortion (MCD) is a measure of the difference'
            ' between two sequences of mel cepstra.'
            ' This utility computes the MCD between two sequences allowing for'
            ' possible differences in timing.'
            ' Specifically it uses dynamic time warping (DTW) to compute the'
            ' minimum MCD that can be obtained by "aligning" the two sequences'
            ' subject to certain constraints on the form of the alignment.'
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        '--ext',
        dest='ext',
        default='mgc',
        metavar='EXT',
        help=('file extension added to uttId to get file containing speech'
              ' parameters'),
    )
    parser.add_argument(
        '--param_order',
        dest='paramOrder',
        default=40,
        type=int,
        metavar='ORDER',
        help='parameter order of the cepstral files',
    )
    parser.add_argument(
        dest='natDir',
        metavar='NATDIR',
        help='directory containing natural speech parameters',
    )
    parser.add_argument(
        dest='synthDir',
        metavar='SYNTHDIR',
        help='directory containing synthetic speech parameters',
    )
    parser.add_argument(
        dest='uttIds',
        metavar='UTTID',
        nargs='+',
        help='utterance ids (ext will be appended to these)',
    )
    args = parser.parse_args(rawArgs[1:])

    costFn = mt.logSpecDbDist

    # One reader per directory, both sharing the same I/O settings.
    vecSeqIo = vsio.VecSeqIo(args.paramOrder)
    getNatVecSeq = DirReader(vecSeqIo, args.natDir, args.ext)
    getSynthVecSeq = DirReader(vecSeqIo, args.synthDir, args.ext)

    totalCost = 0.0
    totalFrames = 0
    for uttId in args.uttIds:
        print('processing', uttId)
        # ignore 0th cepstral component
        nat = getNatVecSeq(uttId)[:, 1:]
        synth = getSynthVecSeq(uttId)[:, 1:]

        # Only the cost is accumulated; the alignment path is not used.
        uttCost, _ = dtw.dtw(nat, synth, costFn)
        totalCost += uttCost
        totalFrames += len(nat)

    print('overall MCD = %f (%d frames)' %
          (totalCost / totalFrames, totalFrames))