def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to compute MFCC feature and CMVN statistics")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--useEnergy", abbr="-u", dtype=bool, default=False,
             discription="Whether to add energy to MFCC feature.")
    args.add("--parallel", abbr="-p", dtype=int, default=4, minV=1, maxV=10,
             discription="The number of parallel processes to compute feature of train dataset.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.print_args()  # print arguments to display
    argsLogFile = os.path.join(args.expDir, "conf", "compute_mfcc.args")
    args.save(argsLogFile)

    # ---------- Compute MFCC feature of train, dev and test datasets -----------
    if args.useEnergy:
        mfccConfig = {"--use-energy": "true"}
    else:
        mfccConfig = {"--use-energy": "false"}

    for Name in ["train", "dev", "test"]:
        print(f"Compute {Name} MFCC feature.")

        # 1. Compute feature
        if Name == "train" and args.parallel > 1:
            # use multiple processes
            wavFiles = exkaldi.utils.split_txt_file(
                os.path.join(args.expDir, "data", "train", "wav.scp"),
                chunks=args.parallel,
            )
            feats = exkaldi.compute_mfcc(
                wavFiles,
                config=mfccConfig,
                outFile=os.path.join(args.expDir, "mfcc", "train", "raw_mfcc.ark"),
            )
            feat = exkaldi.merge_archives(feats)
        else:
            feat = exkaldi.compute_mfcc(
                os.path.join(args.expDir, "data", Name, "wav.scp"),
                config=mfccConfig,
            )
            feat.save(os.path.join(args.expDir, "mfcc", Name, "raw_mfcc.ark"))
        print("Generate raw MFCC feature done.")

        # Compute CMVN
        cmvn = exkaldi.compute_cmvn_stats(
            feat=feat,
            spk2utt=os.path.join(args.expDir, "data", Name, "spk2utt"),
        )
        cmvn.save(os.path.join(args.expDir, "mfcc", Name, "cmvn.ark"))
        print("Generate CMVN statistics done.")

        # Apply CMVN
        feat = exkaldi.use_cmvn(
            feat=feat,
            cmvn=cmvn,
            utt2spk=os.path.join(args.expDir, "data", Name, "utt2spk"),
        )
        feat.save(os.path.join(args.expDir, "mfcc", Name, "mfcc_cmvn.ark"))
        print("Generate MFCC feature (applied CMVN) done.")

    print("Compute MFCC done.")
def output_probability():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to output DNN probability for realigning")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--dropout", abbr="-d", dtype=float, default=0.2,
             discription="Dropout.")
    args.add("--useCMVN", dtype=bool, default=False,
             discription="Whether to apply CMVN to fmllr feature.")
    args.add("--splice", dtype=int, default=10,
             discription="Splice how many frames to head and tail for fmllr feature.")
    args.add("--delta", dtype=int, default=2,
             discription="Whether to add delta to fmllr feature.")
    args.add("--normalizeFeat", dtype=bool, default=True,
             discription="Whether to normalize the chunk dataset.")
    args.add("--predictModel", abbr="-m", dtype=str, default="",
             discription="If not void, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()

    declare.is_file(args.predictModel)

    dims = exkaldi.load_list_table(f"{args.expDir}/train_dnn/data/dims")
    featDim = int(dims["fmllr"])
    pdfDim = int(dims["pdfs"])
    phoneDim = int(dims["phones"])

    # Initialize model
    if args.delta > 0:
        featDim *= (args.delta + 1)
    if args.splice > 0:
        featDim *= (2 * args.splice + 1)

    model = make_DNN_model(featDim, pdfDim, phoneDim)
    model.load_weights(args.predictModel)
    print(f"Restore model from: {args.predictModel}")

    for Name in ["train", "test", "dev"]:
        print(f"Processing: {Name} dataset")
        feat = exkaldi.load_feat(f"{args.expDir}/train_dnn/data/{Name}/fmllr.ark")

        if args.useCMVN:
            print("Apply CMVN")
            cmvn = exkaldi.load_cmvn(f"{args.expDir}/train_dnn/data/{Name}/cmvn_of_fmllr.ark")
            feat = exkaldi.use_cmvn(feat, cmvn, utt2spk=f"{args.expDir}/train_dnn/data/{Name}/utt2spk")
            del cmvn

        if args.delta > 0:
            print("Add delta to feature")
            feat = feat.add_delta(args.delta)

        if args.splice > 0:
            print("Splice feature")
            feat = feat.splice(args.splice)

        feat = feat.to_numpy()
        if args.normalizeFeat:
            print("Normalize")
            feat = feat.normalize(std=True)

        outProb = {}
        print("Forward model...")
        for utt, mat in feat.items():
            predPdf, predPhone = model(mat, training=False)
            outProb[utt] = exkaldi.nn.log_softmax(predPdf.numpy(), axis=1)

        #outProb = exkaldi.load_prob(outProb)
        #outProb.save(f"{args.expDir}/train_dnn/prob/{Name}.npy")
        outProb = exkaldi.load_prob(outProb).to_bytes()
        outProb.save(f"{args.expDir}/train_dnn/prob/{Name}.ark")
        print("Save done!")
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train triphone DNN acoustic model with Tensorflow")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--LDAsplice", dtype=int, default=3,
             discription="Splice how many frames to head and tail for LDA feature.")
    args.add("--randomSeed", dtype=int, default=1234,
             discription="Random seed.")
    args.add("--batchSize", abbr="-b", dtype=int, default=128,
             discription="Mini batch size.")
    args.add("--gpu", abbr="-g", dtype=str, default="all", choices=["all", "0", "1"],
             discription="Use GPU.")
    args.add("--epoch", dtype=int, default=30,
             discription="Epochs.")
    args.add("--testStartEpoch", dtype=int, default=5,
             discription="Start to evaluate test dataset.")
    args.add("--dropout", abbr="-d", dtype=float, default=0.2,
             discription="Dropout.")
    args.add("--useCMVN", dtype=bool, default=False,
             discription="Whether to apply CMVN to fmllr feature.")
    args.add("--splice", dtype=int, default=10,
             discription="Splice how many frames to head and tail for fmllr feature.")
    args.add("--delta", dtype=int, default=2,
             discription="Whether to add delta to fmllr feature.")
    args.add("--normalizeFeat", dtype=bool, default=True,
             discription="Whether to normalize the chunk dataset.")
    args.add("--normalizeAMP", dtype=bool, default=False,
             discription="Whether to normalize the post-probability.")
    args.add("--order", abbr="-o", dtype=int, default=6,
             discription="Language model order.")
    args.add("--beam", dtype=int, default=13,
             discription="Decode beam size.")
    args.add("--latBeam", dtype=int, default=6,
             discription="Lattice beam size.")
    args.add("--acwt", dtype=float, default=0.083333,
             discription="Acoustic model weight.")
    args.add("--predictModel", abbr="-m", dtype=str, default="",
             discription="If not void, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    argsLogFile = os.path.join(args.expDir, "conf", "train_dnn.args")
    args.save(argsLogFile)

    random.seed(args.randomSeed)
    np.random.seed(args.randomSeed)
    tf.random.set_seed(args.randomSeed)

    # ------------- Prepare data for DNN training ----------------------
    if not os.path.isfile(f"./{args.expDir}/train_dnn/data/dims"):
        prepare_DNN_data()

    # ------------- Prepare the output directory ----------------------
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    outDir = f"{args.expDir}/train_dnn/out_{stamp}"
    exkaldi.utils.make_dependent_dirs(outDir, pathIsFile=False)

    # ------------------------ Training and Validation -----------------------------
    dims = exkaldi.load_list_table(f"{args.expDir}/train_dnn/data/dims")
    featDim = int(dims["fmllr"])
    pdfDim = int(dims["pdfs"])
    phoneDim = int(dims["phones"])

    # Initialize model
    if args.delta > 0:
        featDim *= (args.delta + 1)
    if args.splice > 0:
        featDim *= (2 * args.splice + 1)

    if len(args.predictModel.strip()) == 0:

        print('Prepare Data Iterator...')
        # Prepare fMLLR feature files
        trainDataset = process_feat_ali(training=True)
        traindataLen = len(trainDataset)
        train_gen = tf.data.Dataset.from_generator(
            lambda: make_generator(trainDataset),
            (tf.float32, {"pdfID": tf.int32, "phoneID": tf.int32}),
        ).batch(args.batchSize).prefetch(3)
        steps_per_epoch = traindataLen // args.batchSize

        devDataset = process_feat_ali(training=False)
        devdataLen = len(devDataset)
        dev_gen = tf.data.Dataset.from_generator(
            lambda: make_generator(devDataset),
            (tf.float32, {"pdfID": tf.int32, "phoneID": tf.int32}),
        ).batch(args.batchSize).prefetch(3)
        validation_steps = devdataLen // args.batchSize

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)

        def train_step():
            model = make_DNN_model(featDim, pdfDim, phoneDim)
            model.summary()

            model.compile(
                loss={
                    "pdfID": keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    "phoneID": keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                },
                loss_weights={"pdfID": 1, "phoneID": 1},
                metrics={
                    "pdfID": keras.metrics.SparseCategoricalAccuracy(),
                    "phoneID": keras.metrics.SparseCategoricalAccuracy(),
                },
                optimizer=keras.optimizers.SGD(0.08, momentum=0.0),
            )

            def lrScheduler(epoch):
                if epoch > 25:
                    return 0.001
                elif epoch > 22:
                    return 0.0025
                elif epoch > 19:
                    return 0.005
                elif epoch > 17:
                    return 0.01
                elif epoch > 15:
                    return 0.02
                elif epoch > 10:
                    return 0.04
                else:
                    return 0.08

            model.fit(
                x=train_gen,
                steps_per_epoch=steps_per_epoch,
                epochs=args.epoch,
                validation_data=dev_gen,
                validation_steps=validation_steps,
                verbose=1,
                initial_epoch=0,
                callbacks=[
                    keras.callbacks.EarlyStopping(patience=5, verbose=1),
                    keras.callbacks.TensorBoard(log_dir=outDir),
                    keras.callbacks.LearningRateScheduler(lrScheduler),
                    EvaluateWER(model, testFeat, testBias, testTrans, outDir),
                    ModelSaver(model, outDir),
                ],
            )

        print("Using GPU: ", args.gpu)
        if args.gpu != "all":
            os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
            train_step()
        else:
            my_strategy = tf.distribute.MirroredStrategy()
            with my_strategy.scope():
                train_step()

    else:
        declare.is_file(args.predictModel)

        model = make_DNN_model(featDim, pdfDim, phoneDim)
        model.summary()
        model.load_weights(args.predictModel)

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)

        scorer = EvaluateWER(model, testFeat, testBias, testTrans, outDir)
        logs = {}
        scorer.on_epoch_end(5, logs)
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train triphone GMM-HMM model")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--splice", abbr="-c", dtype=int, default=3,
             discription="How many left-right frames to splice.")
    args.add("--numIters", abbr="-n", dtype=int, default=35,
             discription="How many iterations to train.")
    args.add("--maxIterInc", abbr="-m", dtype=int, default=25,
             discription="The final iteration of increasing gaussians.")
    args.add("--realignIter", abbr="-r", dtype=int, default=[10, 20, 30],
             discription="The iteration to realign feature.")
    args.add("--fmllrIter", abbr="-f", dtype=int, default=[2, 4, 6, 12],
             discription="The iteration to estimate fmllr matrix.")
    args.add("--order", abbr="-o", dtype=int, default=6,
             discription="Which N-grams model to use.")
    args.add("--beam", abbr="-b", dtype=int, default=13,
             discription="Decode beam size.")
    args.add("--latBeam", abbr="-l", dtype=int, default=6,
             discription="Lattice beam size.")
    args.add("--acwt", abbr="-a", dtype=float, default=0.083333,
             discription="Acoustic model weight.")
    args.add("--parallel", abbr="-p", dtype=int, default=4, minV=1, maxV=10,
             discription="The number of parallel processes to compute feature of train dataset.")
    args.add("--skipTrain", abbr="-s", dtype=bool, default=False,
             discription="If True, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    argsLogFile = os.path.join(args.expDir, "conf", "train_sat.args")
    args.save(argsLogFile)

    if not args.skipTrain:

        # ------------- Prepare feature and previous alignment for training ----------------------
        # 1. Load the feature for training
        print("Load MFCC+CMVN feature.")
        feat = exkaldi.load_index_table(os.path.join(args.expDir, "mfcc", "train", "mfcc_cmvn.ark"))
        print(f"Splice {args.splice} frames.")
        originalFeat = exkaldi.splice_feature(
            feat,
            left=args.splice,
            right=args.splice,
            outFile=os.path.join(args.expDir, "train_delta", "mfcc_cmvn_splice.ark"),
        )
        print("Transform LDA feature")
        ldaFeat = exkaldi.transform_feat(
            feat=originalFeat,
            matFile=os.path.join(args.expDir, "train_lda_mllt", "trans.mat"),
            outFile=os.path.join(args.expDir, "train_sat", "lda_feat.ark"),
        )
        del originalFeat

        # 2. Load previous alignment and lexicons
        ali = exkaldi.load_index_table(
            os.path.join(args.expDir, "train_lda_mllt", "*final.ali"),
            useSuffix="ark",
        )
        lexicons = exkaldi.load_lex(os.path.join(args.expDir, "dict", "lexicons.lex"))

        # 3. Estimate the primary fMLLR transform matrix
        print("Estimate the primary fMLLR transform matrices")
        fmllrTransMat = exkaldi.hmm.estimate_fMLLR_matrix(
            aliOrLat=ali,
            lexicons=lexicons,
            aliHmm=os.path.join(args.expDir, "train_lda_mllt", "final.mdl"),
            feat=ldaFeat,
            spk2utt=os.path.join(args.expDir, "data", "train", "spk2utt"),
            outFile=os.path.join(args.expDir, "train_sat", "trans.ark"),
        )
        print("Transform feature")
        fmllrFeat = exkaldi.use_fmllr(
            ldaFeat,
            fmllrTransMat,
            utt2spk=os.path.join(args.expDir, "data", "train", "utt2spk"),
            outFile=os.path.join(args.expDir, "train_sat", "fmllr_feat.ark"),
        )

        # -------------- Build the decision tree ------------------------
        print("Start to build a tree")
        tree = exkaldi.hmm.DecisionTree(lexicons=lexicons, contextWidth=3, centralPosition=1)
        tree.train(
            feat=fmllrFeat,
            hmm=os.path.join(args.expDir, "train_lda_mllt", "final.mdl"),
            ali=ali,
            topoFile=os.path.join(args.expDir, "dict", "topo"),
            numLeaves=2500,
            tempDir=os.path.join(args.expDir, "train_sat"),
        )
        tree.save(os.path.join(args.expDir, "train_sat", "tree"))
        print("Build tree done.")
        del fmllrFeat

        # ------------- Start training ----------------------
        # 1. Initialize a triphone HMM object
        print("Initialize a triphone HMM object")
        model = exkaldi.hmm.TriphoneHMM(lexicons=lexicons)
        model.initialize(
            tree=tree,
            topoFile=os.path.join(args.expDir, "dict", "topo"),
            treeStatsFile=os.path.join(args.expDir, "train_sat", "treeStats.acc"),
        )
        print(f"Initialized a triphone HMM-GMM model: {model.info}.")

        # 2. Convert the previous alignment
        print("Transform the alignment")
        newAli = exkaldi.hmm.convert_alignment(
            ali=ali,
            originHmm=os.path.join(args.expDir, "train_lda_mllt", "final.mdl"),
            targetHmm=model,
            tree=tree,
            outFile=os.path.join(args.expDir, "train_sat", "initial.ali"),
        )

        # 3. Split data for parallel training
        transcription = exkaldi.load_transcription(os.path.join(args.expDir, "data", "train", "text"))
        transcription = transcription.sort()
        if args.parallel > 1:
            # split feature
            ldaFeat = ldaFeat.sort(by="utt").subset(chunks=args.parallel)
            # split transcription, alignment and fMLLR matrices depending on utterance IDs of each feat
            tempTrans = []
            tempAli = []
            tempFmllrMat = []
            for f in ldaFeat:
                tempTrans.append(transcription.subset(keys=f.utts))
                tempAli.append(newAli.subset(keys=f.utts))
                spks = exkaldi.utt_to_spk(f.utts, utt2spk=os.path.join(args.expDir, "data", "train", "utt2spk"))
                tempFmllrMat.append(fmllrTransMat.subset(keys=spks))
            transcription = tempTrans
            newAli = tempAli
            fmllrTransMat = tempFmllrMat

        # 4. Train
        print("Train the triphone model")
        model.train(
            ldaFeat,
            transcription,
            os.path.join(args.expDir, "dict", "L.fst"),
            tree,
            tempDir=os.path.join(args.expDir, "train_sat"),
            initialAli=newAli,
            fmllrTransMat=fmllrTransMat,
            spk2utt=os.path.join(args.expDir, "data", "train", "spk2utt"),
            utt2spk=os.path.join(args.expDir, "data", "train", "utt2spk"),
            numIters=args.numIters,
            maxIterInc=args.maxIterInc,
            totgauss=15000,
            realignIter=args.realignIter,
            fmllrIter=args.fmllrIter,
            boostSilence=1.0,
            power=0.2,
            fmllrSilWt=0.0,
        )
        print(model.info)
        del ldaFeat
        del fmllrTransMat
        del newAli

    else:
        declare.is_file(os.path.join(args.expDir, "train_sat", "final.mdl"))
        declare.is_file(os.path.join(args.expDir, "train_sat", "tree"))

        model = exkaldi.load_hmm(os.path.join(args.expDir, "train_sat", "final.mdl"))
        tree = exkaldi.load_tree(os.path.join(args.expDir, "train_sat", "tree"))

    # ------------- Compile WFST decoding graph ----------------------
    # Make a WFST decoding graph
    make_WFST_graph(
        outDir=os.path.join(args.expDir, "train_sat", "graph"),
        hmm=model,
        tree=tree,
    )
    # Decode test data
    GMM_decode_fmllr_and_score(
        outDir=os.path.join(args.expDir, "train_sat", f"decode_{args.order}grams"),
        hmm=model,
        HCLGfile=os.path.join(args.expDir, "train_sat", "graph", f"HCLG.{args.order}.fst"),
        tansformMatFile=os.path.join(args.expDir, "train_lda_mllt", "trans.mat"),
    )
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train monophone GMM-HMM model")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--delta", abbr="-d", dtype=int, default=2,
             discription="Add n-order delta to feature.")
    args.add("--numIters", abbr="-n", dtype=int, default=40,
             discription="How many iterations to train.")
    args.add("--maxIterInc", abbr="-m", dtype=int, default=30,
             discription="The final iteration of increasing gaussians.")
    args.add("--realignIter", abbr="-r", dtype=int,
             default=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 38],
             discription="The iterations at which to realign feature.")
    args.add("--order", abbr="-o", dtype=int, default=6, minV=1, maxV=6,
             discription="Which N-grams model to use.")
    args.add("--beam", abbr="-b", dtype=int, default=13,
             discription="Decode beam size.")
    args.add("--latBeam", abbr="-l", dtype=int, default=6,
             discription="Lattice beam size.")
    args.add("--acwt", abbr="-a", dtype=float, default=0.083333,
             discription="Acoustic model weight.")
    args.add("--parallel", abbr="-p", dtype=int, default=4, minV=1, maxV=10,
             discription="The number of parallel processes to compute feature of train dataset.")
    args.add("--skipTrain", abbr="-s", dtype=bool, default=False,
             discription="If True, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.print_args()  # print arguments to display
    argsLogFile = os.path.join(args.expDir, "conf", "train_mono.args")
    args.save(argsLogFile)

    if not args.skipTrain:

        # ------------- Prepare feature for training ----------------------
        # 1. Load the feature for training (we use the index table format)
        feat = exkaldi.load_index_table(os.path.join(args.expDir, "mfcc", "train", "mfcc_cmvn.ark"))
        print("Load MFCC+CMVN feature.")
        feat = exkaldi.add_delta(
            feat,
            order=args.delta,
            outFile=os.path.join(args.expDir, "train_mono", "mfcc_cmvn_delta.ark"),
        )
        print(f"Add {args.delta}-order deltas.")
        # 2. Load lexicon bank
        lexicons = exkaldi.load_lex(os.path.join(args.expDir, "dict", "lexicons.lex"))
        print("Restore lexicon bank.")

        # ------------- Start training ----------------------
        # 1. Initialize a monophone HMM object
        model = exkaldi.hmm.MonophoneHMM(lexicons=lexicons, name="mono")
        model.initialize(feat=feat, topoFile=os.path.join(args.expDir, "dict", "topo"))
        print(f"Initialized a monophone HMM-GMM model: {model.info}.")

        # 2. Split data for parallel training
        transcription = exkaldi.load_transcription(os.path.join(args.expDir, "data", "train", "text"))
        transcription = transcription.sort()
        if args.parallel > 1:
            # split feature
            feat = feat.sort(by="utt").subset(chunks=args.parallel)
            # split transcription depending on utterance IDs of each feature
            temp = []
            for f in feat:
                temp.append(transcription.subset(keys=f.utts))
            transcription = temp

        # 3. Train
        model.train(
            feat,
            transcription,
            LFile=os.path.join(args.expDir, "dict", "L.fst"),
            tempDir=os.path.join(args.expDir, "train_mono"),
            numIters=args.numIters,
            maxIterInc=args.maxIterInc,
            totgauss=1000,
            realignIter=args.realignIter,
            boostSilence=1.0,
        )
        print(model.info)
        # Save the tree
        model.tree.save(os.path.join(args.expDir, "train_mono", "tree"))
        print("Tree has been saved.")

        # 4. Realign with boostSilence 1.25
        print("Realign the training feature (boost silence = 1.25)")
        trainGraphFiles = exkaldi.utils.list_files(os.path.join(args.expDir, "train_mono", "*train_graph"))
        model.align(
            feat,
            trainGraphFile=trainGraphFiles,  # train graphs have been generated in the training step
            boostSilence=1.25,  # 1.5
            outFile=os.path.join(args.expDir, "train_mono", "final.ali"),
        )
        del feat
        print("Save the new alignment done.")
        tree = model.tree

    else:
        declare.is_file(os.path.join(args.expDir, "train_mono", "final.mdl"))
        declare.is_file(os.path.join(args.expDir, "train_mono", "tree"))

        model = exkaldi.load_hmm(os.path.join(args.expDir, "train_mono", "final.mdl"))
        tree = exkaldi.load_tree(os.path.join(args.expDir, "train_mono", "tree"))

    # ------------- Compile WFST decoding graph ----------------------
    # Make a WFST decoding graph
    make_WFST_graph(
        outDir=os.path.join(args.expDir, "train_mono", "graph"),
        hmm=model,
        tree=tree,
    )
    # Decode test data
    GMM_decode_mfcc_and_score(
        outDir=os.path.join(args.expDir, "train_mono", f"decode_{args.order}grams"),
        hmm=model,
        HCLGfile=os.path.join(args.expDir, "train_mono", "graph", f"HCLG.{args.order}.fst"),
    )
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train triphone LSTM acoustic model with Tensorflow")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--LDAsplice", dtype=int, default=3,
             discription="Splice how many frames to head and tail for LDA feature.")
    args.add("--randomSeed", dtype=int, default=1234,
             discription="Random seed.")
    args.add("--batchSize", abbr="-b", dtype=int, default=8,
             discription="Mini batch size.")
    args.add("--gpu", abbr="-g", dtype=str, default="all", choices=["all", "0", "1"],
             discription="Use GPU.")
    args.add("--epoch", dtype=int, default=30,
             discription="Epochs.")
    args.add("--testStartEpoch", dtype=int, default=5,
             discription="Start to evaluate test dataset.")
    args.add("--dropout", abbr="-d", dtype=float, default=0.2,
             discription="Dropout.")
    args.add("--useCMVN", dtype=bool, default=False,
             discription="Whether to apply CMVN to fmllr feature.")
    args.add("--splice", dtype=int, default=0,
             discription="Splice how many frames to head and tail for fmllr feature.")
    args.add("--delta", dtype=int, default=2,
             discription="Whether to add delta to fmllr feature.")
    args.add("--normalizeFeat", dtype=bool, default=True,
             discription="Whether to normalize the chunk dataset.")
    args.add("--normalizeAMP", dtype=bool, default=False,
             discription="Whether to normalize the post-probability.")
    args.add("--order", abbr="-o", dtype=int, default=6,
             discription="Language model order.")
    args.add("--beam", dtype=int, default=13,
             discription="Decode beam size.")
    args.add("--latBeam", dtype=int, default=6,
             discription="Lattice beam size.")
    args.add("--acwt", dtype=float, default=0.083333,
             discription="Acoustic model weight.")
    args.add("--predictModel", abbr="-m", dtype=str, default="",
             discription="If not void, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.save(f"./{args.expDir}/conf/train_lstm.args")

    random.seed(args.randomSeed)
    np.random.seed(args.randomSeed)
    tf.random.set_seed(args.randomSeed)

    # ------------- Prepare data for LSTM training ----------------------
    if not os.path.isfile(f"./{args.expDir}/train_lstm/data/dims"):
        prepare_LSTM_data()

    # ------------- Prepare the output directory ----------------------
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    outDir = f"{args.expDir}/train_lstm/out_{stamp}"
    exkaldi.utils.make_dependent_dirs(outDir, pathIsFile=False)

    # ------------------------ Training and Validation -----------------------------
    dims = exkaldi.load_list_table(f"{args.expDir}/train_lstm/data/dims")
    featDim = int(dims["fmllr"])
    pdfDim = int(dims["pdfs"])
    phoneDim = int(dims["phones"])

    # Initialize model
    if args.delta > 0:
        featDim *= (args.delta + 1)
    if args.splice > 0:
        featDim *= (2 * args.splice + 1)

    if len(args.predictModel.strip()) == 0:

        print('Prepare Data Iterator...')
        # Prepare fMLLR feature files
        trainIterator = DataIterator(batchSize=args.batchSize, training=True)
        devIterator = DataIterator(batchSize=args.batchSize, training=False)

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)

        metris = {
            "train_loss": keras.metrics.Mean(name="train/loss", dtype=tf.float32),
            "train_pdfID_accuracy": keras.metrics.Mean(name="train/pdfID_accuracy", dtype=tf.float32),
            "train_phoneID_accuracy": keras.metrics.Mean(name="train/phoneID_accuracy", dtype=tf.float32),
            "dev_loss": keras.metrics.Mean(name="eval/loss", dtype=tf.float32),
            "dev_pdfID_accuracy": keras.metrics.Mean(name="eval/pdfID_accuracy", dtype=tf.float32),
            "dev_phoneID_accuracy": keras.metrics.Mean(name="eval/phoneID_accuracy", dtype=tf.float32),
        }

        def train_step(model, optimizer, batch):
            feat, pdfAli, phoneAli = batch
            with tf.GradientTape() as tape:
                pdfPred, phonePred = model(feat, training=True)
                L1 = keras.losses.sparse_categorical_crossentropy(pdfAli, pdfPred, from_logits=True)
                L2 = keras.losses.sparse_categorical_crossentropy(phoneAli, phonePred, from_logits=True)
                loss = L1 + L2
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            metris["train_loss"](loss)
            #pdfPred = tf.convert_to_tensor(pdfPred, np.float32)
            A1 = keras.metrics.sparse_categorical_accuracy(pdfAli, pdfPred)
            metris["train_pdfID_accuracy"](A1)
            #phonePred = tf.convert_to_tensor(phonePred, np.float32)
            A2 = keras.metrics.sparse_categorical_accuracy(phoneAli, phonePred)
            metris["train_phoneID_accuracy"](A2)

            return float(np.mean(L1.numpy())), float(np.mean(L2.numpy())), float(np.mean(A1.numpy())), float(np.mean(A2.numpy()))

        def dev_step(model, batch):
            feat, pdfAli, phoneAli = batch
            pdfPred, phonePred = model(feat, training=False)
            L1 = keras.losses.sparse_categorical_crossentropy(pdfAli, pdfPred, from_logits=True)
            L2 = keras.losses.sparse_categorical_crossentropy(phoneAli, phonePred, from_logits=True)
            loss = L1 + L2

            metris["dev_loss"](loss)
            #pdfPred = tf.convert_to_tensor(pdfPred, np.float32)
            A1 = keras.metrics.sparse_categorical_accuracy(pdfAli, pdfPred)
            metris["dev_pdfID_accuracy"](A1)
            #phonePred = tf.convert_to_tensor(phonePred, np.float32)
            A2 = keras.metrics.sparse_categorical_accuracy(phoneAli, phonePred)
            metris["dev_phoneID_accuracy"](A2)

            return float(np.mean(L1.numpy())), float(np.mean(L2.numpy())), float(np.mean(A1.numpy())), float(np.mean(A2.numpy()))

        def main_loop():
            model = make_LSTM_model(args, featDim, pdfDim, phoneDim)
            model.summary()

            optimizer = keras.optimizers.RMSprop(learning_rate=0.0004, rho=0.95, momentum=0.0, epsilon=1e-07)

            scorer = EvaluateWER(model, testFeat, testBias, testTrans, outDir)
            modelSaver = ModelSaver(model, outDir)

            for e in range(args.epoch):
                # Training
                startTime = time.time()
                for i in range(trainIterator.epochSize):
                    batch = trainIterator.next()
                    pdfLoss, phoneLoss, pdfAcc, phoneAcc = train_step(model, optimizer, batch)
                    tf.print(
                        f"\rtraining: {i}/{trainIterator.epochSize} pdfID loss {pdfLoss:.3f} phoneID loss {phoneLoss:.3f} pdfID accuracy {pdfAcc:.3f} phoneID accuracy {phoneAcc:.3f}",
                        end="\t",
                    )
                tf.print()
                # Evaluate
                for i in range(devIterator.epochSize):
                    batch = devIterator.next()
                    pdfLoss, phoneLoss, pdfAcc, phoneAcc = dev_step(model, batch)
                    tf.print(
                        f"\revaluate: {i}/{devIterator.epochSize} pdfID loss {pdfLoss:.3f} phoneID loss {phoneLoss:.3f} pdfID accuracy {pdfAcc:.3f} phoneID accuracy {phoneAcc:.3f}",
                        end="\t",
                    )
                tf.print()
                # Test
                tf.print("testing:", end=" ")
                testWER = scorer.test(e)
                tf.print()

                endTime = time.time()
                message = f"epoch {e} "
                for Name in metris.keys():
                    message += f"{Name} {float(metris[Name].result().numpy()):.3f} "
                    metris[Name].reset_states()
                message += f"test PER {testWER:.2f} time cost {int(endTime-startTime)}s"
                tf.print(message)

                modelSaver.save(e)

        main_loop()

    else:
        declare.is_file(args.predictModel)

        model = make_LSTM_model(args, featDim, pdfDim, phoneDim)
        model.summary()
        model.load_weights(args.predictModel)

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)

        scorer = EvaluateWER(model, testFeat, testBias, testTrans, outDir)
        scorer.test(0)
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to prepare TIMIT data.")
    # 2. Add some options
    args.add("--timitRoot", dtype=str, abbr="-t", default="/Corpus/TIMIT",
             discription="The root path of timit dataset.")
    args.add("--expDir", dtype=str, abbr="-e", default="exp",
             discription="The output path to save generated data.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.save(os.path.join(args.expDir, "conf", "prepare_data.args"))

    # ------------- Do some preparative work ----------------------
    # 1. Ensure Kaldi has existed
    declare.kaldi_existed()
    # 2. The sph2pipe tool will be used if the timit data is sph format.
    sph2pipeTool = os.path.join(info.KALDI_ROOT, "tools", "sph2pipe_v2.5", "sph2pipe")
    declare.is_file("sph2pipe tool", sph2pipeTool)

    # ------------- Check TIMIT data format -------------
    # 1. Get the directory name
    declare.is_dir("TIMIT root directory", args.timitRoot)
    dirNames = os.listdir(args.timitRoot)
    if "TRAIN" in dirNames and "TEST" in dirNames:
        uppercaseFlag = True
        trainResourceDir = "TRAIN"
        testResourceDir = "TEST"
        testWavFile = os.path.join(args.timitRoot, "TRAIN", "DR1", "FCJF0", "SA1.WAV")  # used to test the file format
        wavFileSuffix = "WAV"
        txtFileSuffix = "PHN"
    elif "train" in dirNames and "test" in dirNames:
        uppercaseFlag = False
        trainResourceDir = "train"
        testResourceDir = "test"
        testWavFile = os.path.join(args.timitRoot, "train", "dr1", "fcjf0", "sa1.wav")  # used to test the file format
        wavFileSuffix = "wav"
        txtFileSuffix = "phn"
    else:
        raise Exception("Wrong format of train or test data directories.")

    # 2. Check whether the wave files are sph format.
    formatCheckCmd = f"{sph2pipeTool} -f wav {testWavFile}"
    out, err, cod = exkaldi.utils.run_shell_command(formatCheckCmd, stderr="PIPE")
    if cod == 0:
        sphFlag = True
    else:
        sphFlag = False

    # --------- Generate phone-map dictionary --------
    # 1. Generate 60-48 categories and 48-39 categories mapping dictionaries
    phoneMap_60_to_48 = exkaldi.ListTable(name="60-48")
    phoneMap_48_to_39 = exkaldi.ListTable(name="48-39")
    mapFile = os.path.join(info.KALDI_ROOT, "egs", "timit", "s5", "conf", "phones.60-48-39.map")
    declare.is_file("60-48-39 phone map", mapFile)  # check whether it exists
    with open(mapFile, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        line = line.strip().split()
        if len(line) < 3:
            # phone "q" will be omitted temporarily.
            continue
        phoneMap_60_to_48[line[0]] = line[1]
        phoneMap_48_to_39[line[1]] = line[2]

    # 2. Save the 48-39 phone map for further use.
    phoneMap_48_to_39.save(os.path.join(args.expDir, "dict", "phones.48_to_39.map"))

    # --------- Generate train dataset --------
    wavs = glob.glob(os.path.join(args.timitRoot, trainResourceDir, "*", "*", f"*.{wavFileSuffix}"))
    out = os.path.join(args.expDir, "data", "train")
    generate_data(wavs, out, sphFlag, sph2pipeTool, txtFileSuffix, phoneMap_60_to_48)

    # --------- Generate dev and test datasets --------
    for Name in ["dev", "test"]:
        spkListFile = os.path.join(info.KALDI_ROOT, "egs", "timit", "s5", "conf", f"{Name}_spk.list")
        declare.is_file(f"speakers list for {Name}", spkListFile)  # check whether it exists
        with open(spkListFile, "r", encoding="utf-8") as fr:
            spkList = fr.readlines()
        wavs = []
        for spk in spkList:
            spk = spk.strip()
            if len(spk) == 0:
                continue
            if uppercaseFlag:
                spk = spk.upper()
            wavs.extend(glob.glob(os.path.join(args.timitRoot, testResourceDir, "*", spk, f"*.{wavFileSuffix}")))

        out = os.path.join(args.expDir, "data", Name)
        generate_data(wavs, out, sphFlag, sph2pipeTool, txtFileSuffix, phoneMap_60_to_48)
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train triphone GMM-HMM model")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp",
             discription="The data and output path of current experiment.")
    args.add("--delta", abbr="-d", dtype=int, default=2,
             discription="Add n-order delta to feature.")
    args.add("--numIters", abbr="-n", dtype=int, default=35,
             discription="How many iterations to train.")
    args.add("--maxIterInc", abbr="-m", dtype=int, default=25,
             discription="The final iteration of increasing gaussians.")
    args.add("--realignIter", abbr="-r", dtype=int, default=[10, 20, 30],
             discription="The iteration to realign feature.")
    args.add("--order", abbr="-o", dtype=int, default=6,
             discription="Which N-grams model to use.")
    args.add("--beam", abbr="-b", dtype=int, default=13,
             discription="Decode beam size.")
    args.add("--latBeam", abbr="-l", dtype=int, default=6,
             discription="Lattice beam size.")
    args.add("--acwt", abbr="-a", dtype=float, default=0.083333,
             discription="Acoustic model weight.")
    args.add("--parallel", abbr="-p", dtype=int, default=4, minV=1, maxV=10,
             discription="The number of parallel processes to compute feature of train dataset.")
    args.add("--skipTrain", abbr="-s", dtype=bool, default=False,
             discription="If True, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    argsLogFile = os.path.join(args.expDir, "conf", "train_delta.args")
    args.save(argsLogFile)

    if not args.skipTrain:

        # ------------- Prepare feature and previous alignment for training ----------------------
        # 1. Load the feature for training
        feat = exkaldi.load_index_table(os.path.join(args.expDir, "mfcc", "train", "mfcc_cmvn.ark"))
        print("Load MFCC+CMVN feature.")
        feat = exkaldi.add_delta(
            feat,
            order=args.delta,
            outFile=os.path.join(args.expDir, "train_delta", "mfcc_cmvn_delta.ark"),
        )
        print(f"Add {args.delta}-order deltas.")
        # 2. Load lexicon bank
        lexicons = exkaldi.load_lex(os.path.join(args.expDir, "dict", "lexicons.lex"))
        print("Restore lexicon bank.")
        # 3. Load previous alignment
        ali = exkaldi.load_index_table(os.path.join(args.expDir, "train_mono", "*final.ali"), useSuffix="ark")

        # -------------- Build the decision tree ------------------------
        print("Start to build a tree")
        tree = exkaldi.hmm.DecisionTree(lexicons=lexicons, contextWidth=3, centralPosition=1)
        tree.train(
            feat=feat,
            hmm=os.path.join(args.expDir, "train_mono", "final.mdl"),
            ali=ali,
            topoFile=os.path.join(args.expDir, "dict", "topo"),
            numLeaves=2500,
            tempDir=os.path.join(args.expDir, "train_delta"),
        )
        print("Build tree done.")

        # ------------- Start training ----------------------
        # 1. Initialize a triphone HMM object
        model = exkaldi.hmm.TriphoneHMM(lexicons=lexicons, name="mono")
        model.initialize(
            tree=tree,
            topoFile=os.path.join(args.expDir, "dict", "topo"),
            treeStatsFile=os.path.join(args.expDir, "train_delta", "treeStats.acc"),
        )
        print(f"Initialized a triphone HMM-GMM model: {model.info}.")

        # 2. Convert the previous alignment
        print("Transform the alignment")
        newAli = exkaldi.hmm.convert_alignment(
            ali=ali,
            originHmm=os.path.join(args.expDir, "train_mono", "final.mdl"),
            targetHmm=model,
            tree=tree,
            outFile=os.path.join(args.expDir, "train_delta", "initial.ali"),
        )

        # 3. Split data for parallel training
        transcription = exkaldi.load_transcription(os.path.join(args.expDir, "data", "train", "text"))
        transcription = transcription.sort()
        if args.parallel > 1:
            # split feature
            feat = feat.sort(by="utt").subset(chunks=args.parallel)
            # split transcription and alignment depending on utterance IDs of each feat
            tempTrans = []
            tempAli = []
            for f in feat:
                tempTrans.append(transcription.subset(keys=f.utts))
                tempAli.append(newAli.subset(keys=f.utts))
            transcription = tempTrans
            newAli = tempAli

        # 4. Train
        print("Train the triphone model")
        model.train(
            feat,
            transcription,
            os.path.join(args.expDir, "dict", "L.fst"),
            tree,
            tempDir=os.path.join(args.expDir, "train_delta"),
            initialAli=newAli,
            numIters=args.numIters,
            maxIterInc=args.maxIterInc,
            totgauss=15000,
            realignIter=args.realignIter,
            boostSilence=1.0,
        )
        print(model.info)
        # Save the tree
        model.tree.save(os.path.join(args.expDir, "train_delta", "tree"))
        print("Tree has been saved.")
        del feat

    else:
        declare.is_file(os.path.join(args.expDir, "train_delta", "final.mdl"))
        declare.is_file(os.path.join(args.expDir, "train_delta", "tree"))

        model = exkaldi.load_hmm(os.path.join(args.expDir, "train_delta", "final.mdl"))
        tree = exkaldi.load_tree(os.path.join(args.expDir, "train_delta", "tree"))

    # ------------- Compile WFST decoding graph ----------------------
    # Make a WFST decoding graph
    make_WFST_graph(
        outDir=os.path.join(args.expDir, "train_delta", "graph"),
        hmm=model,
        tree=tree,
    )
    # Decode test data
    GMM_decode_mfcc_and_score(
        outDir=os.path.join(args.expDir, "train_delta", f"decode_{args.order}grams"),
        hmm=model,
        HCLGfile=os.path.join(args.expDir, "train_delta", "graph", f"HCLG.{args.order}.fst"),
    )