Example 1
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe(
        "This program is used to compute MFCC features and CMVN statistics")
    # 2. Add options
    args.add("--expDir",
             abbr="-e",
             dtype=str,
             default="exp",
             discription="The data and output path of current experiment.")
    args.add("--useEnergy",
             abbr="-u",
             dtype=bool,
             default=False,
             discription="Whether add energy to MFCC feature.")
    args.add(
        "--parallel",
        abbr="-p",
        dtype=int,
        default=4,
        minV=1,
        maxV=10,
        discription=
        "The number of parallel process to compute train feature of train dataset."
    )
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.print_args()  # print arguments to display
    argsLogFile = os.path.join(args.expDir, "conf", "compute_mfcc.args")
    args.save(argsLogFile)

    # ---------- Compute mfcc feature of train, dev and test dataset -----------
    if args.useEnergy:
        mfccConfig = {"--use-energy": "true"}
    else:
        mfccConfig = {"--use-energy": "false"}

    for Name in ["train", "dev", "test"]:
        print(f"Compute {Name} MFCC feature.")

        # 1. compute feature
        if Name == "train" and args.parallel > 1:  # use mutiple processes
            wavFiles = exkaldi.utils.split_txt_file(
                os.path.join(args.expDir, "data", "train", "wav.scp"),
                chunks=args.parallel,
            )
            feats = exkaldi.compute_mfcc(wavFiles,
                                         config=mfccConfig,
                                         outFile=os.path.join(
                                             args.expDir, "mfcc", "train",
                                             "raw_mfcc.ark"))
            feat = exkaldi.merge_archives(feats)
        else:
            feat = exkaldi.compute_mfcc(
                os.path.join(args.expDir, "data", Name, "wav.scp"),
                config=mfccConfig,
            )
            feat.save(os.path.join(args.expDir, "mfcc", Name, "raw_mfcc.ark"))
        print(f"Generate raw MFCC feature done.")
        # Compute CMVN
        cmvn = exkaldi.compute_cmvn_stats(
            feat=feat,
            spk2utt=os.path.join(args.expDir, "data", Name, "spk2utt"),
        )
        cmvn.save(os.path.join(args.expDir, "mfcc", Name, "cmvn.ark"))
        print(f"Generate CMVN statistics done.")
        # Apply CMVN
        feat = exkaldi.use_cmvn(
            feat=feat,
            cmvn=cmvn,
            utt2spk=os.path.join(args.expDir, "data", Name, "utt2spk"),
        )
        feat.save(os.path.join(args.expDir, "mfcc", Name, "mfcc_cmvn.ark"))
        print(f"Generate MFCC feature (applied CMVN) done.")

    print("Compute MFCC done.")
Example 2
def output_probability():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe(
        "This program is used to output DNN probability for realigning")
    # 2. Add options
    args.add("--expDir",
             abbr="-e",
             dtype=str,
             default="exp",
             discription="The data and output path of current experiment.")
    args.add("--dropout",
             abbr="-d",
             dtype=float,
             default=0.2,
             discription="Dropout.")
    args.add("--useCMVN",
             dtype=bool,
             default=False,
             discription="Wether apply CMVN to fmllr feature.")
    args.add(
        "--splice",
        dtype=int,
        default=10,
        discription="Splice how many frames to head and tail for Fmllr feature."
    )
    args.add("--delta",
             dtype=int,
             default=2,
             discription="Wether add delta to fmllr feature.")
    args.add("--normalizeFeat",
             dtype=bool,
             default=True,
             discription="Wether normalize the chunk dataset.")
    args.add("--predictModel",
             abbr="-m",
             dtype=str,
             default="",
             discription="If not void, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()

    declare.is_file(args.predictModel)

    dims = exkaldi.load_list_table(f"{args.expDir}/train_dnn/data/dims")
    featDim = int(dims["fmllr"])
    pdfDim = int(dims["pdfs"])
    phoneDim = int(dims["phones"])

    # Initialize model
    if args.delta > 0:
        featDim *= (args.delta + 1)
    if args.splice > 0:
        featDim *= (2 * args.splice + 1)

    model = make_DNN_model(featDim, pdfDim, phoneDim)
    model.load_weights(args.predictModel)
    print(f"Restorage model from: {args.predictModel}")

    for Name in ["train", "test", "dev"]:
        print(f"Processing: {Name} dataset")
        feat = exkaldi.load_feat(
            f"{args.expDir}/train_dnn/data/{Name}/fmllr.ark")

        if args.useCMVN:
            print("Apply CMVN")
            cmvn = exkaldi.load_cmvn(
                f"{args.expDir}/train_dnn/data/{Name}/cmvn_of_fmllr.ark")
            feat = exkaldi.use_cmvn(
                feat,
                cmvn,
                utt2spk=f"{args.expDir}/train_dnn/data/{Name}/utt2spk")
            del cmvn

        if args.delta > 0:
            print("Add delta to feature")
            feat = feat.add_delta(args.delta)

        if args.splice > 0:
            print("Splice feature")
            feat = feat.splice(args.splice)

        feat = feat.to_numpy()
        if args.normalizeFeat:
            print("Normalize")
            feat = feat.normalize(std=True)

        outProb = {}
        print("Forward model...")
        for utt, mat in feat.items():
            predPdf, predPhone = model(mat, training=False)
            outProb[utt] = exkaldi.nn.log_softmax(predPdf.numpy(), axis=1)

        #outProb = exkaldi.load_prob(outProb)
        #outProb.save(f"{args.expDir}/train_dnn/prob/{Name}.npy")
        outProb = exkaldi.load_prob(outProb).to_bytes()
        outProb.save(f"{args.expDir}/train_dnn/prob/{Name}.ark")
        print("Save done!")
Example 3
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe(
        "This program is used to train triphone DNN acoustic model with Tensorflow"
    )
    # 2. Add options
    args.add("--expDir",
             abbr="-e",
             dtype=str,
             default="exp",
             discription="The data and output path of current experiment.")
    args.add(
        "--LDAsplice",
        dtype=int,
        default=3,
        discription="Splice how many frames to head and tail for LDA feature.")
    args.add("--randomSeed",
             dtype=int,
             default=1234,
             discription="Random seed.")
    args.add("--batchSize",
             abbr="-b",
             dtype=int,
             default=128,
             discription="Mini batch size.")
    args.add("--gpu",
             abbr="-g",
             dtype=str,
             default="all",
             choices=["all", "0", "1"],
             discription="Use GPU.")
    args.add("--epoch", dtype=int, default=30, discription="Epoches.")
    args.add("--testStartEpoch",
             dtype=int,
             default=5,
             discription="Start to evaluate test dataset.")
    args.add("--dropout",
             abbr="-d",
             dtype=float,
             default=0.2,
             discription="Dropout.")
    args.add("--useCMVN",
             dtype=bool,
             default=False,
             discription="Wether apply CMVN to fmllr feature.")
    args.add(
        "--splice",
        dtype=int,
        default=10,
        discription="Splice how many frames to head and tail for Fmllr feature."
    )
    args.add("--delta",
             dtype=int,
             default=2,
             discription="Wether add delta to fmllr feature.")
    args.add("--normalizeFeat",
             dtype=bool,
             default=True,
             discription="Wether normalize the chunk dataset.")
    args.add("--normalizeAMP",
             dtype=bool,
             default=False,
             discription="Wether normalize the post-probability.")
    args.add("--order",
             abbr="-o",
             dtype=int,
             default=6,
             discription="Language model order.")
    args.add("--beam", dtype=int, default=13, discription="Decode beam size.")
    args.add("--latBeam",
             dtype=int,
             default=6,
             discription="Lattice beam size.")
    args.add("--acwt",
             dtype=float,
             default=0.083333,
             discription="Acoustic model weight.")
    args.add("--predictModel",
             abbr="-m",
             dtype=str,
             default="",
             discription="If not void, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    argsLogFile = os.path.join(args.expDir, "conf", "train_dnn.args")
    args.save(argsLogFile)

    random.seed(args.randomSeed)
    np.random.seed(args.randomSeed)
    tf.random.set_seed(args.randomSeed)

    # ------------- Prepare data for dnn training ----------------------
    if not os.path.isfile(f"./{args.expDir}/train_dnn/data/dims"):
        prepare_DNN_data()

    # ------------- Prepare the output directory ----------------------
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    outDir = f"{args.expDir}/train_dnn/out_{stamp}"
    exkaldi.utils.make_dependent_dirs(outDir, pathIsFile=False)

    #------------------------ Training and Validation -----------------------------
    dims = exkaldi.load_list_table(f"{args.expDir}/train_dnn/data/dims")
    featDim = int(dims["fmllr"])
    pdfDim = int(dims["pdfs"])
    phoneDim = int(dims["phones"])

    # Initialize model
    if args.delta > 0:
        featDim *= (args.delta + 1)
    if args.splice > 0:
        featDim *= (2 * args.splice + 1)
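    # Worked example (base dimension assumed for illustration): with a 40-dim
    # fMLLR feature, delta=2 and splice=10 give featDim = 40 * 3 * 21 = 2520.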

    if len(args.predictModel.strip()) == 0:
        print('Prepare Data Iterator...')
        # Prepare fMLLR feature files
        trainDataset = process_feat_ali(training=True)
        traindataLen = len(trainDataset)
        train_gen = tf.data.Dataset.from_generator(
            lambda: make_generator(trainDataset), (tf.float32, {
                "pdfID": tf.int32,
                "phoneID": tf.int32
            })).batch(args.batchSize).prefetch(3)
        steps_per_epoch = traindataLen // args.batchSize

        devDataset = process_feat_ali(training=False)
        devdataLen = len(devDataset)
        dev_gen = tf.data.Dataset.from_generator(
            lambda: make_generator(devDataset), (tf.float32, {
                "pdfID": tf.int32,
                "phoneID": tf.int32
            })).batch(args.batchSize).prefetch(3)
        validation_steps = devdataLen // args.batchSize

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)

        def train_step():

            model = make_DNN_model(featDim, pdfDim, phoneDim)
            model.summary()

            model.compile(
                loss={
                    "pdfID":
                    keras.losses.SparseCategoricalCrossentropy(
                        from_logits=True),
                    "phoneID":
                    keras.losses.SparseCategoricalCrossentropy(
                        from_logits=True),
                },
                loss_weights={
                    "pdfID": 1,
                    "phoneID": 1
                },
                metrics={
                    "pdfID": keras.metrics.SparseCategoricalAccuracy(),
                    "phoneID": keras.metrics.SparseCategoricalAccuracy(),
                },
                optimizer=keras.optimizers.SGD(0.08, momentum=0.0),
            )

            def lrScheduler(epoch):
                if epoch > 25:
                    return 0.001
                elif epoch > 22:
                    return 0.0025
                elif epoch > 19:
                    return 0.005
                elif epoch > 17:
                    return 0.01
                elif epoch > 15:
                    return 0.02
                elif epoch > 10:
                    return 0.04
                else:
                    return 0.08
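            # Net effect of the step decay above: epochs 0-10 train at 0.08,
            # 11-15 at 0.04, 16-17 at 0.02, 18-19 at 0.01, 20-22 at 0.005,
            # 23-25 at 0.0025, and 26 onward at 0.001.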

            model.fit(
                x=train_gen,
                steps_per_epoch=steps_per_epoch,
                epochs=args.epoch,
                validation_data=dev_gen,
                validation_steps=validation_steps,
                verbose=1,
                initial_epoch=0,
                callbacks=[
                    keras.callbacks.EarlyStopping(patience=5, verbose=1),
                    keras.callbacks.TensorBoard(log_dir=outDir),
                    keras.callbacks.LearningRateScheduler(lrScheduler),
                    EvaluateWER(model, testFeat, testBias, testTrans, outDir),
                    ModelSaver(model, outDir),
                ],
            )

        print("Using GPU: ", args.gpu)
        if args.gpu != "all":
            os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
            train_step()

        else:
            my_strategy = tf.distribute.MirroredStrategy()
            with my_strategy.scope():
                train_step()

    else:
        declare.is_file(args.predictModel)

        model = make_DNN_model(featDim, pdfDim, phoneDim)
        model.summary()

        model.load_weights(args.predictModel)

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)
        scorer = EvaluateWER(model, testFeat, testBias, testTrans, outDir)

        logs = {}
        scorer.on_epoch_end(5, logs)
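
The GPU branch above follows the usual tf.distribute pattern: everything that creates model variables (construction and compilation) runs inside the strategy scope, so the variables are mirrored across all visible GPUs and keras fit() executes one synchronized step per batch on every replica. A minimal standalone sketch of the pattern (toy model, for illustration only):

import tensorflow as tf
from tensorflow import keras

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # Variables created here are replicated on every visible GPU.
    model = keras.Sequential([keras.layers.Dense(10)])
    model.compile(loss="mse", optimizer="sgd")
# model.fit(...) then trains with one synchronized step per batch.
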
Example 4
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train triphone GMM-HMM model")
    # 2. Add options
    args.add("--expDir",
             abbr="-e",
             dtype=str,
             default="exp",
             discription="The data and output path of current experiment.")
    args.add("--splice",
             abbr="-c",
             dtype=int,
             default=3,
             discription="How many left-right frames to splice.")
    args.add("--numIters",
             abbr="-n",
             dtype=int,
             default=35,
             discription="How many iterations to train.")
    args.add("--maxIterInc",
             abbr="-m",
             dtype=int,
             default=25,
             discription="The final iteration of increasing gaussians.")
    args.add("--realignIter",
             abbr="-r",
             dtype=int,
             default=[10, 20, 30],
             discription="The iteration to realign feature.")
    args.add("--fmllrIter",
             abbr="-f",
             dtype=int,
             default=[2, 4, 6, 12],
             discription="The iteration to estimate fmllr matrix.")
    args.add("--order",
             abbr="-o",
             dtype=int,
             default=6,
             discription="Which N-grams model to use.")
    args.add("--beam",
             abbr="-b",
             dtype=int,
             default=13,
             discription="Decode beam size.")
    args.add("--latBeam",
             abbr="-l",
             dtype=int,
             default=6,
             discription="Lattice beam size.")
    args.add("--acwt",
             abbr="-a",
             dtype=float,
             default=0.083333,
             discription="Acoustic model weight.")
    args.add(
        "--parallel",
        abbr="-p",
        dtype=int,
        default=4,
        minV=1,
        maxV=10,
        discription=
        "The number of parallel process to compute feature of train dataset.")
    args.add("--skipTrain",
             abbr="-s",
             dtype=bool,
             default=False,
             discription="If True, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    argsLogFile = os.path.join(args.expDir, "conf", "train_sat.args")
    args.save(argsLogFile)

    if not args.skipTrain:
        # ------------- Prepare feature and previous alignment for training ----------------------
        # 1. Load the feature for training
        print(f"Load MFCC+CMVN feature.")
        feat = exkaldi.load_index_table(
            os.path.join(args.expDir, "mfcc", "train", "mfcc_cmvn.ark"))
        print(f"Splice {args.splice} frames.")
        originalFeat = exkaldi.splice_feature(feat,
                                              left=args.splice,
                                              right=args.splice,
                                              outFile=os.path.join(
                                                  args.expDir, "train_delta",
                                                  "mfcc_cmvn_splice.ark"))
        print(f"Transform LDA feature")
        ldaFeat = exkaldi.transform_feat(
            feat=originalFeat,
            matFile=os.path.join(args.expDir, "train_lda_mllt", "trans.mat"),
            outFile=os.path.join(args.expDir, "train_sat", "lda_feat.ark"),
        )
        del originalFeat
        # 2. Load previous alignment and lexicons
        ali = exkaldi.load_index_table(os.path.join(args.expDir,
                                                    "train_lda_mllt",
                                                    "*final.ali"),
                                       useSuffix="ark")
        lexicons = exkaldi.load_lex(
            os.path.join(args.expDir, "dict", "lexicons.lex"))
        # 3. Estimate the primary fMLLR transform matrix
        print("Estiminate the primary fMLLR transform matrixs")
        fmllrTransMat = exkaldi.hmm.estimate_fMLLR_matrix(
            aliOrLat=ali,
            lexicons=lexicons,
            aliHmm=os.path.join(args.expDir, "train_lda_mllt", "final.mdl"),
            feat=ldaFeat,
            spk2utt=os.path.join(args.expDir, "data", "train", "spk2utt"),
            outFile=os.path.join(args.expDir, "train_sat", "trans.ark"),
        )
        print("Transform feature")
        fmllrFeat = exkaldi.use_fmllr(
            ldaFeat,
            fmllrTransMat,
            utt2spk=os.path.join("exp", "data", "train", "utt2spk"),
            outFile=os.path.join(args.expDir, "train_sat", "fmllr_feat.ark"),
        )
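        # fMLLR is speaker-adaptive: one transform matrix per speaker (hence
        # the spk2utt/utt2spk maps), estimated from the current alignment and
        # then applied to the LDA features before tree building and SAT training.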

        # -------------- Build the decision tree ------------------------
        print("Start build a tree")
        tree = exkaldi.hmm.DecisionTree(lexicons=lexicons,
                                        contextWidth=3,
                                        centralPosition=1)
        tree.train(
            feat=fmllrFeat,
            hmm=os.path.join(args.expDir, "train_lda_mllt", "final.mdl"),
            ali=ali,
            topoFile=os.path.join(args.expDir, "dict", "topo"),
            numLeaves=2500,
            tempDir=os.path.join(args.expDir, "train_sat"),
        )
        tree.save(os.path.join(args.expDir, "train_sat", "tree"))
        print(f"Build tree done.")
        del fmllrFeat

        # ------------- Start training ----------------------
        # 1. Initialize a triphone HMM object
        print("Initialize a triphone HMM object")
        model = exkaldi.hmm.TriphoneHMM(lexicons=lexicons)
        model.initialize(
            tree=tree,
            topoFile=os.path.join(args.expDir, "dict", "topo"),
            treeStatsFile=os.path.join(args.expDir, "train_sat",
                                       "treeStats.acc"),
        )
        print(f"Initialized a monophone HMM-GMM model: {model.info}.")

        # 2. Convert the previous alignment
        print("Transform the alignment")
        newAli = exkaldi.hmm.convert_alignment(
            ali=ali,
            originHmm=os.path.join(args.expDir, "train_lda_mllt", "final.mdl"),
            targetHmm=model,
            tree=tree,
            outFile=os.path.join(args.expDir, "train_sat", "initial.ali"),
        )

        # 3. Split data for parallel training
        transcription = exkaldi.load_transcription(
            os.path.join(args.expDir, "data", "train", "text"))
        transcription = transcription.sort()

        if args.parallel > 1:
            # split feature
            ldaFeat = ldaFeat.sort(by="utt").subset(chunks=args.parallel)
            # split transcription depending on utterance IDs of each feat
            tempTrans = []
            tempAli = []
            tempFmllrMat = []
            for f in ldaFeat:
                tempTrans.append(transcription.subset(keys=f.utts))
                tempAli.append(newAli.subset(keys=f.utts))
                spks = exkaldi.utt_to_spk(f.utts,
                                          utt2spk=os.path.join(
                                              args.expDir, "data", "train",
                                              "utt2spk"))
                tempFmllrMat.append(fmllrTransMat.subset(keys=spks))
            transcription = tempTrans
            newAli = tempAli
            fmllrTransMat = tempFmllrMat

        # 4. Train
        print("Train the triphone model")
        model.train(
            ldaFeat,
            transcription,
            os.path.join(args.expDir, "dict", "L.fst"),
            tree,
            tempDir=os.path.join(args.expDir, "train_sat"),
            initialAli=newAli,
            fmllrTransMat=fmllrTransMat,
            spk2utt=os.path.join(args.expDir, "data", "train", "spk2utt"),
            utt2spk=os.path.join(args.expDir, "data", "train", "utt2spk"),
            numIters=args.numIters,
            maxIterInc=args.maxIterInc,
            totgauss=15000,
            realignIter=args.realignIter,
            fmllrIter=args.fmllrIter,
            boostSilence=1.0,
            power=0.2,
            fmllrSilWt=0.0,
        )
        print(model.info)
        del ldaFeat
        del fmllrTransMat
        del newAli

    else:
        declare.is_file(os.path.join(args.expDir, "train_sat", "final.mdl"))
        declare.is_file(os.path.join(args.expDir, "train_sat", "tree"))
        model = exkaldi.load_hmm(
            os.path.join(args.expDir, "train_sat", "final.mdl"))
        tree = exkaldi.load_tree(os.path.join(args.expDir, "train_sat",
                                              "tree"))

    # ------------- Compile WFST training ----------------------
    # Make a WFST decoding graph
    make_WFST_graph(
        outDir=os.path.join(args.expDir, "train_sat", "graph"),
        hmm=model,
        tree=tree,
    )
    # Decode test data
    GMM_decode_fmllr_and_score(
        outDir=os.path.join(args.expDir, "train_sat",
                            f"decode_{args.order}grams"),
        hmm=model,
        HCLGfile=os.path.join(args.expDir, "train_sat", "graph",
                              f"HCLG.{args.order}.fst"),
        tansformMatFile=os.path.join(args.expDir, "train_lda_mllt",
                                     "trans.mat"),
    )
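
The parallel-training blocks above all follow the same pattern: sort the feature archive by utterance, split it into N chunks, then subset the transcription (and, for SAT, the alignment and fMLLR matrices) by each chunk's utterance IDs so every worker sees a consistent slice of the data. A sketch of the idea with plain dicts standing in for exkaldi archives (hypothetical data, for illustration only):

# Plain-dict stand-ins for exkaldi archives (hypothetical utterances).
feats = {"utt1": 0, "utt2": 1, "utt3": 2, "utt4": 3}
trans = {u: f"text of {u}" for u in feats}

def split_chunks(table, n):
    # Deterministic split: sort keys, then slice into n roughly equal chunks.
    keys = sorted(table)
    size = (len(keys) + n - 1) // n
    return [{k: table[k] for k in keys[i:i + size]}
            for i in range(0, len(keys), size)]

featChunks = split_chunks(feats, 2)
# Each worker gets the transcription subset matching its feature chunk.
transChunks = [{k: trans[k] for k in chunk} for chunk in featChunks]
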
Example 5
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train monophone GMM-HMM model")
    # 2. Add options
    args.add("--expDir",
             abbr="-e",
             dtype=str,
             default="exp",
             discription="The data and output path of current experiment.")
    args.add("--delta",
             abbr="-d",
             dtype=int,
             default=2,
             discription="Add n-order to feature.")
    args.add("--numIters",
             abbr="-n",
             dtype=int,
             default=40,
             discription="How many iterations to train.")
    args.add("--maxIterInc",
             abbr="-m",
             dtype=int,
             default=30,
             discription="The final iteration of increasing gaussians.")
    args.add("--realignIter",
             abbr="-r",
             dtype=int,
             default=[
                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29,
                 32, 35, 38
             ],
             discription="the iteration to realign feature.")
    args.add("--order",
             abbr="-o",
             dtype=int,
             default=6,
             minV=1,
             maxV=6,
             discription="Which N-grams model to use.")
    args.add("--beam",
             abbr="-b",
             dtype=int,
             default=13,
             discription="Decode beam size.")
    args.add("--latBeam",
             abbr="-l",
             dtype=int,
             default=6,
             discription="Lattice beam size.")
    args.add("--acwt",
             abbr="-a",
             dtype=float,
             default=0.083333,
             discription="Acoustic model weight.")
    args.add(
        "--parallel",
        abbr="-p",
        dtype=int,
        default=4,
        minV=1,
        maxV=10,
        discription=
        "The number of parallel process to compute feature of train dataset.")
    args.add("--skipTrain",
             abbr="-s",
             dtype=bool,
             default=False,
             discription="If True, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.print_args()  # print arguments to display
    argsLogFile = os.path.join(args.expDir, "conf", "train_mono.args")
    args.save(argsLogFile)

    if not args.skipTrain:
        # ------------- Prepare feature for training ----------------------
        # 1. Load the feature for training (We use the index table format)
        feat = exkaldi.load_index_table(
            os.path.join(args.expDir, "mfcc", "train", "mfcc_cmvn.ark"))
        print(f"Load MFCC+CMVN feature.")
        feat = exkaldi.add_delta(feat,
                                 order=args.delta,
                                 outFile=os.path.join(args.expDir,
                                                      "train_mono",
                                                      "mfcc_cmvn_delta.ark"))
        print(f"Add {args.delta}-order deltas.")
        # 2. Load lexicon bank
        lexicons = exkaldi.load_lex(
            os.path.join(args.expDir, "dict", "lexicons.lex"))
        print(f"Restorage lexicon bank.")

        # ------------- Start training ----------------------
        # 1. Initialize a monophone HMM object
        model = exkaldi.hmm.MonophoneHMM(lexicons=lexicons, name="mono")
        model.initialize(feat=feat,
                         topoFile=os.path.join(args.expDir, "dict", "topo"))
        print(f"Initialized a monophone HMM-GMM model: {model.info}.")

        # 2. Split data for parallel training
        transcription = exkaldi.load_transcription(
            os.path.join(args.expDir, "data", "train", "text"))
        transcription = transcription.sort()
        if args.parallel > 1:
            # split feature
            feat = feat.sort(by="utt").subset(chunks=args.parallel)
            # split transcription depending on utterance IDs of each feature
            temp = []
            for f in feat:
                temp.append(transcription.subset(keys=f.utts))
            transcription = temp

        # 3. Train
        model.train(
            feat,
            transcription,
            LFile=os.path.join(args.expDir, "dict", "L.fst"),
            tempDir=os.path.join(args.expDir, "train_mono"),
            numIters=args.numIters,
            maxIterInc=args.maxIterInc,
            totgauss=1000,
            realignIter=args.realignIter,
            boostSilence=1.0,
        )
        print(model.info)
        # Save the tree
        model.tree.save(os.path.join(args.expDir, "train_mono", "tree"))
        print(f"Tree has been saved.")

        # 4. Realign with boostSilence 1.25
        print("Realign the training feature (boost silence = 1.25)")
        trainGraphFiles = exkaldi.utils.list_files(
            os.path.join(args.expDir, "train_mono", "*train_graph"))
        model.align(
            feat,
            trainGraphFile=trainGraphFiles,  # train graphs were generated in the training step
            boostSilence=1.25,
            outFile=os.path.join(args.expDir, "train_mono", "final.ali"))
        del feat
        print("Save the new alignment done.")
        tree = model.tree

    else:
        declare.is_file(os.path.join(args.expDir, "train_mono", "final.mdl"))
        declare.is_file(os.path.join(args.expDir, "train_mono", "tree"))
        model = exkaldi.load_hmm(
            os.path.join(args.expDir, "train_mono", "final.mdl"))
        tree = exkaldi.load_tree(
            os.path.join(args.expDir, "train_mono", "tree"))

    # ------------- Compile WFST training ----------------------
    # Make a WFST decoding graph
    make_WFST_graph(
        outDir=os.path.join(args.expDir, "train_mono", "graph"),
        hmm=model,
        tree=tree,
    )

    # Decode test data
    GMM_decode_mfcc_and_score(
        outDir=os.path.join(args.expDir, "train_mono",
                            f"decode_{args.order}grams"),
        hmm=model,
        HCLGfile=os.path.join(args.expDir, "train_mono", "graph",
                              f"HCLG.{args.order}.fst"),
    )
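
On the maxIterInc option used above: in standard Kaldi GMM training (which exkaldi wraps), the number of Gaussians grows by a fixed increment on each iteration up to iteration maxIterInc, at which point it reaches totgauss. A rough sketch of that schedule (the initial Gaussian count is an assumption for illustration):

totgauss = 1000   # target, as passed to model.train above
maxIterInc = 30   # last iteration that adds Gaussians
numGauss = 300    # assumed initial count; the real value comes from the model
incPerIter = (totgauss - numGauss) // maxIterInc
for it in range(1, maxIterInc + 1):
    numGauss += incPerIter  # after iteration maxIterInc, numGauss roughly equals totgauss
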
Example 6
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe(
        "This program is used to train triphone LSTM scoustic model with Tensorflow"
    )
    # 2. Add options
    args.add("--expDir",
             abbr="-e",
             dtype=str,
             default="exp",
             discription="The data and output path of current experiment.")
    args.add(
        "--LDAsplice",
        dtype=int,
        default=3,
        discription="Splice how many frames to head and tail for LDA feature.")
    args.add("--randomSeed",
             dtype=int,
             default=1234,
             discription="Random seed.")
    args.add("--batchSize",
             abbr="-b",
             dtype=int,
             default=8,
             discription="Mini batch size.")
    args.add("--gpu",
             abbr="-g",
             dtype=str,
             default="all",
             choices=["all", "0", "1"],
             discription="Use GPU.")
    args.add("--epoch", dtype=int, default=30, discription="Epoches.")
    args.add("--testStartEpoch",
             dtype=int,
             default=5,
             discription="Start to evaluate test dataset.")
    args.add("--dropout",
             abbr="-d",
             dtype=float,
             default=0.2,
             discription="Dropout.")
    args.add("--useCMVN",
             dtype=bool,
             default=False,
             discription="Wether apply CMVN to fmllr feature.")
    args.add(
        "--splice",
        dtype=int,
        default=0,
        discription="Splice how many frames to head and tail for Fmllr feature."
    )
    args.add("--delta",
             dtype=int,
             default=2,
             discription="Wether add delta to fmllr feature.")
    args.add("--normalizeFeat",
             dtype=bool,
             default=True,
             discription="Wether normalize the chunk dataset.")
    args.add("--normalizeAMP",
             dtype=bool,
             default=False,
             discription="Wether normalize the post-probability.")
    args.add("--order",
             abbr="-o",
             dtype=int,
             default=6,
             discription="Language model order.")
    args.add("--beam", dtype=int, default=13, discription="Decode beam size.")
    args.add("--latBeam",
             dtype=int,
             default=6,
             discription="Lattice beam size.")
    args.add("--acwt",
             dtype=float,
             default=0.083333,
             discription="Acoustic model weight.")
    args.add("--predictModel",
             abbr="-m",
             dtype=str,
             default="",
             discription="If not void, skip training. Do decoding only.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.save(f"./{args.expDir}/conf/train_lstm.args")

    random.seed(args.randomSeed)
    np.random.seed(args.randomSeed)
    tf.random.set_seed(args.randomSeed)

    # ------------- Prepare data for lstm training ----------------------
    if not os.path.isfile(f"./{args.expDir}/train_lstm/data/dims"):
        prepare_LSTM_data()

    # ------------- Prepare the output directory ----------------------
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    outDir = f"{args.expDir}/train_lstm/out_{stamp}"
    exkaldi.utils.make_dependent_dirs(outDir, pathIsFile=False)

    #------------------------ Training and Validation -----------------------------
    dims = exkaldi.load_list_table(f"{args.expDir}/train_lstm/data/dims")
    featDim = int(dims["fmllr"])
    pdfDim = int(dims["pdfs"])
    phoneDim = int(dims["phones"])

    # Initialize model
    if args.delta > 0:
        featDim *= (args.delta + 1)
    if args.splice > 0:
        featDim *= (2 * args.splice + 1)

    if len(args.predictModel.strip()) == 0:

        print('Prepare Data Iterator...')
        # Prepare fMLLR feature files
        trainIterator = DataIterator(batchSize=args.batchSize, training=True)
        devIterator = DataIterator(batchSize=args.batchSize, training=False)

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)

        metrics = {
            "train_loss":
            keras.metrics.Mean(name="train/loss", dtype=tf.float32),
            "train_pdfID_accuracy":
            keras.metrics.Mean(name="train/pdfID_accuracy", dtype=tf.float32),
            "train_phoneID_accuracy":
            keras.metrics.Mean(name="train/phoneID_accuracy",
                               dtype=tf.float32),
            "dev_loss":
            keras.metrics.Mean(name="eval/loss", dtype=tf.float32),
            "dev_pdfID_accuracy":
            keras.metrics.Mean(name="eval/pdfID_accuracy", dtype=tf.float32),
            "dev_phoneID_accuracy":
            keras.metrics.Mean(name="eval/phoneID_accuracy", dtype=tf.float32),
        }

        def train_step(model, optimizer, batch):
            feat, pdfAli, phoneAli = batch
            with tf.GradientTape() as tape:
                pdfPred, phonePred = model(feat, training=True)
                L1 = keras.losses.sparse_categorical_crossentropy(
                    pdfAli, pdfPred, from_logits=True)
                L2 = keras.losses.sparse_categorical_crossentropy(
                    phoneAli, phonePred, from_logits=True)
                loss = L1 + L2
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))

            metris["train_loss"](loss)

            #pdfPred = tf.convert_to_tensor(pdfPred, np.float32)
            A1 = keras.metrics.sparse_categorical_accuracy(pdfAli, pdfPred)
            metris["train_pdfID_accuracy"](A1)

            #phonePred = tf.convert_to_tensor(phonePred, np.float32)
            A2 = keras.metrics.sparse_categorical_accuracy(phoneAli, phonePred)
            metris["train_phoneID_accuracy"](A2)

            return float(np.mean(L1.numpy())), float(np.mean(
                L2.numpy())), float(np.mean(A1.numpy())), float(
                    np.mean(A2.numpy()))
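
        # Note: keras.metrics.sparse_categorical_accuracy returns a per-frame
        # 0/1 indicator; the Mean metrics above average it across the epoch.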

        def dev_step(model, batch):
            feat, pdfAli, phoneAli = batch
            pdfPred, phonePred = model(feat, training=False)
            L1 = keras.losses.sparse_categorical_crossentropy(pdfAli,
                                                              pdfPred,
                                                              from_logits=True)
            L2 = keras.losses.sparse_categorical_crossentropy(phoneAli,
                                                              phonePred,
                                                              from_logits=True)
            loss = L1 + L2

            metris["dev_loss"](loss)

            #pdfPred = tf.convert_to_tensor(pdfPred, np.float32)
            A1 = keras.metrics.sparse_categorical_accuracy(pdfAli, pdfPred)
            metris["dev_pdfID_accuracy"](A1)

            #phonePred = tf.convert_to_tensor(phonePred, np.float32)
            A2 = keras.metrics.sparse_categorical_accuracy(phoneAli, phonePred)
            metris["dev_phoneID_accuracy"](A2)

            return float(np.mean(L1.numpy())), float(np.mean(
                L2.numpy())), float(np.mean(A1.numpy())), float(
                    np.mean(A2.numpy()))

        def main_loop():

            model = make_LSTM_model(args, featDim, pdfDim, phoneDim)
            model.summary()

            optimizer = keras.optimizers.RMSprop(learning_rate=0.0004,
                                                 rho=0.95,
                                                 momentum=0.0,
                                                 epsilon=1e-07)

            scorer = EvaluateWER(model, testFeat, testBias, testTrans, outDir)
            modelSaver = ModelSaver(model, outDir)

            for e in range(args.epoch):
                # Training
                startTime = time.time()
                for i in range(trainIterator.epochSize):
                    batch = trainIterator.next()
                    pdfLoss, phoneLoss, pdfAcc, phoneAcc = train_step(
                        model, optimizer, batch)
                    tf.print(
                        f"\rtraining: {i}/{trainIterator.epochSize} pdfID loss {pdfLoss:.3f} phoneID loss {phoneLoss:.3f} pdfID accuracy {pdfAcc:.3f} phoneID accuracy {phoneAcc:.3f}",
                        end="\t")
                tf.print()
                # Evaluate
                for i in range(devIterator.epochSize):
                    batch = devIterator.next()
                    pdfLoss, phoneLoss, pdfAcc, phoneAcc = dev_step(
                        model, batch)
                    tf.print(
                        f"\revaluate: {i}/{devIterator.epochSize} pdfID loss {pdfLoss:.3f} phoneID loss {phoneLoss:.3f} pdfID accuracy {pdfAcc:.3f} phoneID accuracy {phoneAcc:.3f}",
                        end="\t")
                tf.print()
                # Test
                tf.print("testing:", end=" ")
                testWER = scorer.test(e)
                tf.print()

                endTime = time.time()
                message = f"epoch {e} "
                for Name in metrics.keys():
                    message += f"{Name} {float(metrics[Name].result().numpy()):.3f} "
                    metrics[Name].reset_states()
                message += f"test PER {testWER:.2f} time cost {int(endTime-startTime)}s"
                tf.print(message)

                modelSaver.save(e)

        main_loop()

    else:
        declare.is_file(args.predictModel)

        model = make_LSTM_model(args, featDim, pdfDim, phoneDim)
        model.summary()

        model.load_weights(args.predictModel)

        print('Prepare test data')
        testFeat, testBias, testTrans = prepare_test_data(postProbDim=pdfDim)
        scorer = EvaluateWER(model, testFeat, testBias, testTrans, outDir)

        scorer.test(0)
Example 7
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to prepare TIMIT data.")
    # 2. Add some options
    args.add("--timitRoot",
             dtype=str,
             abbr="-t",
             default="/Corpus/TIMIT",
             discription="The root path of timit dataset.")
    args.add("--expDir",
             dtype=str,
             abbr="-e",
             default="exp",
             discription="The output path to save generated data.")
    # 3. Then start to parse arguments.
    args.parse()
    # 4. Take a backup of arguments
    args.save(os.path.join(args.expDir, "conf", "prepare_data.args"))

    # ------------- Do some preparative work ----------------------
    # 1. Ensure Kaldi exists
    declare.kaldi_existed()
    # 2. The sph2pipe tool will be used if the TIMIT data is in sph format.
    sph2pipeTool = os.path.join(info.KALDI_ROOT, "tools", "sph2pipe_v2.5",
                                "sph2pipe")
    declare.is_file("sph2pipe tool", sph2pipeTool)

    # ------------- Check TIMIT data format -------------
    # 1. Get the directory name
    declare.is_dir("TIMIT root directory", args.timitRoot)
    dirNames = os.listdir(args.timitRoot)
    if "TRAIN" in dirNames and "TEST" in dirNames:
        uppercaseFlag = True
        trainResourceDir = "TRAIN"
        testResourceDir = "TEST"
        testWavFile = os.path.join(args.timitRoot, "TRAIN", "DR1", "FCJF0",
                                   "SA1.WAV")  # used to test the file format
        wavFileSuffix = "WAV"
        txtFileSuffix = "PHN"
    elif "train" in dirNames and "test" in dirNames:
        uppercaseFlag = False
        trainResourceDir = "train"
        testResourceDir = "test"
        testWavFile = os.path.join(args.timitRoot, "train", "dr1", "fcjf0",
                                   "sa1.wav")  # used to test the file format
        wavFileSuffix = "wav"
        txtFileSuffix = "phn"
    else:
        raise Exception(f"Wrong format of train or test data directories.")
    # 2. check whether wave file is sph format.
    formatCheckCmd = f"{sph2pipeTool} -f wav {testWavFile}"
    out, err, cod = exkaldi.utils.run_shell_command(formatCheckCmd,
                                                    stderr="PIPE")
    sphFlag = (cod == 0)

    # --------- Generate phone-map dictionary --------
    # 1. Generate the 60-to-48 and 48-to-39 phone mapping dictionaries
    phoneMap_60_to_48 = exkaldi.ListTable(name="69-48")
    phoneMap_48_to_39 = exkaldi.ListTable(name="48-39")
    mapFile = os.path.join(info.KALDI_ROOT, "egs", "timit", "s5", "conf",
                           "phones.60-48-39.map")
    declare.is_file("60-48-39 phone map",
                    mapFile)  # Check whether or not it existed
    with open(mapFile, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
        for line in lines:
            line = line.strip().split()
            if len(line) < 3:  #phone "q" will be omitted temporarily.
                continue
            phoneMap_60_to_48[line[0]] = line[1]
            phoneMap_48_to_39[line[1]] = line[2]
    # 2. Save the 48-39 phone map for further use.
    phoneMap_48_to_39.save(
        os.path.join(args.expDir, "dict", "phones.48_to_39.map"))

    # --------- Generate train dataset --------
    wavs = glob.glob(
        os.path.join(args.timitRoot, trainResourceDir, "*", "*",
                     f"*.{wavFileSuffix}"))
    out = os.path.join(args.expDir, "data", "train")
    generate_data(wavs, out, sphFlag, sph2pipeTool, txtFileSuffix,
                  phoneMap_60_to_48)

    # --------- Generate dev and test data --------
    for Name in ["dev", "test"]:
        spkListFile = os.path.join(info.KALDI_ROOT, "egs", "timit", "s5",
                                   "conf", f"{Name}_spk.list")
        declare.is_file(f"speakers list for {Name}",
                        spkListFile)  # Check whether or not it existed
        with open(spkListFile, "r", encoding="utf-8") as fr:
            spkList = fr.readlines()
        wavs = []
        for spk in spkList:
            spk = spk.strip()
            if len(spk) == 0:
                continue
            if uppercaseFlag:
                spk = spk.upper()
            wavs.extend(
                glob.glob(
                    os.path.join(args.timitRoot, testResourceDir, "*", spk,
                                 f"*.{wavFileSuffix}")))

        out = os.path.join(args.expDir, "data", Name)
        generate_data(wavs, out, sphFlag, sph2pipeTool, txtFileSuffix,
                      phoneMap_60_to_48)
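
The 60-to-48 map loaded in this script is handed to generate_data (not shown), where it folds the raw TIMIT phone labels down to 48 classes. Applying such a map to one PHN-style transcript line is a simple per-token lookup; a sketch with hypothetical phones:

# Hypothetical entries; the real pairs come from phones.60-48-39.map.
phoneMap_60_to_48 = {"ax-h": "ax", "ux": "uw", "ih": "ih"}
line = "ax-h ux ih"
mapped = " ".join(phoneMap_60_to_48.get(p, p) for p in line.split())
# mapped == "ax uw ih"
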
Example 8
def main():

    # ------------- Parse arguments from command line ----------------------
    # 1. Add a description of this program
    args.describe("This program is used to train triphone GMM-HMM model")
    # 2. Add options
    args.add("--expDir", abbr="-e", dtype=str, default="exp", discription="The data and output path of current experiment.")
    args.add("--delta", abbr="-d", dtype=int, default=2, discription="Add n-order to feature.")
    args.add("--numIters", abbr="-n", dtype=int, default=35, discription="How many iterations to train.")
    args.add("--maxIterInc", abbr="-m", dtype=int, default=25, discription="The final iteration of increasing gaussians.")
    args.add("--realignIter", abbr="-r", dtype=int, default=[10,20,30], discription="the iteration to realign feature.")
    args.add("--order", abbr="-o", dtype=int, default=6, discription="Which N-grams model to use.")
    args.add("--beam", abbr="-b", dtype=int, default=13, discription="Decode beam size.")
    args.add("--latBeam", abbr="-l", dtype=int, default=6, discription="Lattice beam size.")
    args.add("--acwt", abbr="-a", dtype=float, default=0.083333, discription="Acoustic model weight.")
    args.add("--parallel", abbr="-p", dtype=int, default=4, minV=1, maxV=10, discription="The number of parallel process to compute feature of train dataset.")
    args.add("--skipTrain", abbr="-s", dtype=bool, default=False, discription="If True, skip training. Do decoding only.")
    # 3. Then start to parse arguments. 
    args.parse()
    # 4. Take a backup of arguments
    argsLogFile = os.path.join(args.expDir, "conf", "train_delta.args")
    args.save(argsLogFile)

    if not args.skipTrain:
        # ------------- Prepare feature and previous alignment for training ----------------------
        # 1. Load the feature for training
        feat = exkaldi.load_index_table(os.path.join(args.expDir,"mfcc","train","mfcc_cmvn.ark"))
        print(f"Load MFCC+CMVN feature.")
        feat = exkaldi.add_delta(feat, order=args.delta, outFile=os.path.join(args.expDir,"train_delta","mfcc_cmvn_delta.ark"))
        print(f"Add {args.delta}-order deltas.")
        # 2. Load lexicon bank
        lexicons = exkaldi.load_lex(os.path.join(args.expDir,"dict","lexicons.lex"))
        print(f"Restorage lexicon bank.")
        # 3. Load previous alignment
        ali = exkaldi.load_index_table(os.path.join(args.expDir,"train_mono","*final.ali"),useSuffix="ark")
        
        # -------------- Build the decision tree ------------------------
        print("Start build a tree")
        tree = exkaldi.hmm.DecisionTree(lexicons=lexicons, contextWidth=3, centralPosition=1)
        tree.train(
                    feat=feat, 
                    hmm=os.path.join(args.expDir,"train_mono","final.mdl"), 
                    ali=ali, 
                    topoFile=os.path.join(args.expDir,"dict","topo"), 
                    numLeaves=2500,
                    tempDir=os.path.join(args.expDir,"train_delta"), 
                )
        print(f"Build tree done.")

        # ------------- Start training ----------------------
        # 1. Initialize a triphone HMM object
        model = exkaldi.hmm.TriphoneHMM(lexicons=lexicons, name="tri")
        model.initialize(
                    tree=tree, 
                    topoFile=os.path.join(args.expDir,"dict","topo"),
                    treeStatsFile=os.path.join(args.expDir,"train_delta","treeStats.acc"),
                )
        print(f"Initialized a monophone HMM-GMM model: {model.info}.")

        # 2. Convert the previous alignment
        print("Transform the alignment")
        newAli = exkaldi.hmm.convert_alignment(
                                        ali=ali,
                                        originHmm=os.path.join("exp","train_mono","final.mdl"), 
                                        targetHmm=model, 
                                        tree=tree,
                                        outFile=os.path.join(args.expDir,"train_delta","initial.ali"),
                                    )

        # 3. Split data for parallel training
        transcription = exkaldi.load_transcription(os.path.join(args.expDir,"data","train","text"))
        transcription = transcription.sort()
        if args.parallel > 1:
            # split feature
            feat = feat.sort(by="utt").subset(chunks=args.parallel)
            # split transcription depending on utterance IDs of each feat
            tempTrans = []
            tempAli = []
            for f in feat:
                tempTrans.append( transcription.subset(keys=f.utts) )
                tempAli.append( newAli.subset(keys=f.utts) )
            transcription = tempTrans
            newAli = tempAli

        # 4. Train
        print("Train the triphone model")
        model.train(feat,
                    transcription, 
                    os.path.join("exp","dict","L.fst"), 
                    tree,
                    tempDir=os.path.join(args.expDir,"train_delta"),
                    initialAli=newAli,
                    numIters=args.numIters, 
                    maxIterInc=args.maxIterInc,
                    totgauss=15000,
                    realignIter=args.realignIter,
                    boostSilence=1.0,
                )
        print(model.info)
        # Save the tree
        model.tree.save(os.path.join(args.expDir,"train_delta","tree"))
        print(f"Tree has been saved.")
        del feat

    else:
        declare.is_file( os.path.join(args.expDir,"train_delta","final.mdl") )
        declare.is_file( os.path.join(args.expDir,"train_delta","tree") )
        model = exkaldi.load_hmm( os.path.join(args.expDir,"train_delta","final.mdl") )
        tree = exkaldi.load_tree( os.path.join(args.expDir,"train_delta","tree") )

    # ------------- Compile WFST training ----------------------
    # Make a WFST decoding graph
    make_WFST_graph(
                outDir=os.path.join(args.expDir,"train_delta","graph"),
                hmm=model,
                tree=tree,
            )
    # Decode test data
    GMM_decode_mfcc_and_score(
                outDir=os.path.join(args.expDir,"train_delta",f"decode_{args.order}grams"), 
                hmm=model,
                HCLGfile=os.path.join(args.expDir,"train_delta","graph",f"HCLG.{args.order}.fst"),
            )