Example #1
0
    def test(self,
             gaussians=1,
             iter=8,
             mmi=False,
             diag=False,
             xword_id='',
             output_dir=None):
        """Run the test pipeline stages enabled in ``self.test_pipeline``.

        The experiment config is first written to ``<self.exp>/config``.
        Then, depending on the ``'coding'`` and ``'test'`` switches in
        ``self.test_pipeline``, audio is converted to MFC files and/or a
        word MLF is built from the transcripts and passed to
        ``self.decode``. Progress is reported through ``log`` on
        ``self.logfh``.

        Args:
            gaussians, iter, mmi, diag, xword_id, output_dir: forwarded
                unchanged to ``self.decode``. Presumably they select which
                trained model is decoded and where results are written --
                confirm against ``decode``.
        """

        ## Copy config file to the experiment dir
        config_output = '%s/config' % self.exp
        # Context manager guarantees the config is flushed and the handle
        # closed before later stages (or external tools) read the file.
        with open(config_output, 'w') as config_fh:
            self.config.write(config_fh)
        log(self.logfh, 'TESTING with config [%s]' % config_output)

        if self.test_pipeline['coding']:
            import coding
            coding_dir = '%s/Coding' % self.exp
            util.create_new_dir(coding_dir)
            count = coding.wav_to_mfc(self, coding_dir, self.mfc_list)
            log(self.logfh, 'CODING finished [%d files]' % count)

        if self.test_pipeline['test']:
            import dict_and_lm
            start_time = time.time()
            # The helpers take the model object as their first argument; the
            # bare name `model` is never bound in this method, so pass `self`.
            # NOTE(review): confirm `decode` expects the model object here.
            # The second return value (the word collection) is not needed
            # when testing.
            num_utts, _ = dict_and_lm.make_mlf_from_transcripts(
                self,
                self.dict,
                self.setup,
                self.data,
                self.word_mlf,
                self.mfc_list,
                skip_oov=True)
            log(self.logfh,
                'wrote word mlf [%d utts] [%s]' % (num_utts, self.word_mlf))

            self.decode(self, self.mfc_list, self.word_mlf, self.lm,
                        gaussians, iter, mmi, diag, xword_id, output_dir)
            total_time = time.time() - start_time
            log(self.logfh,
                'TESTING finished; secs elapsed [%1.2f]' % total_time)
Example #2
0
    def test(self, gaussians=1, iter=8, mmi=False, diag=False, xword_id='', output_dir=None):
        """Write the config, then run the enabled test stages (coding, decoding).

        Which stages run is controlled by the ``'coding'`` and ``'test'``
        entries of ``self.test_pipeline``. All keyword arguments are passed
        through, untouched, to ``self.decode``; their exact meaning is
        defined there (presumably model selection and output location --
        verify against ``decode``).
        """

        ## Copy config file to the experiment dir
        config_output = '%s/config' % self.exp
        # Close the file deterministically instead of leaking the handle.
        with open(config_output, 'w') as config_fh:
            self.config.write(config_fh)
        log(self.logfh, 'TESTING with config [%s]' % config_output)

        if self.test_pipeline['coding']:
            import coding
            coding_dir = '%s/Coding' % self.exp
            util.create_new_dir(coding_dir)
            count = coding.wav_to_mfc(self, coding_dir, self.mfc_list)
            log(self.logfh, 'CODING finished [%d files]' % count)

        if self.test_pipeline['test']:
            import dict_and_lm
            start_time = time.time()
            # `model` is not bound anywhere in this method; the model object
            # these helpers expect as first argument is this instance.
            # NOTE(review): confirm `decode` takes the model object first.
            num_utts, _ = dict_and_lm.make_mlf_from_transcripts(
                self, self.dict, self.setup, self.data, self.word_mlf, self.mfc_list, skip_oov=True)
            log(self.logfh, 'wrote word mlf [%d utts] [%s]' % (num_utts, self.word_mlf))

            self.decode(self, self.mfc_list, self.word_mlf, self.lm, gaussians, iter, mmi, diag, xword_id, output_dir)
            total_time = time.time() - start_time
            log(self.logfh, 'TESTING finished; secs elapsed [%1.2f]' % total_time)
Example #3
0
    def train(self):
        """Run every training stage enabled in ``self.train_pipeline``.

        The experiment config is written to ``<self.exp>/config`` first.
        Stages are then checked in this fixed order, each guarded by its own
        key in ``self.train_pipeline``: ``coding``, ``lm``, ``flat_start``,
        ``mixup_mono``, ``mixdown_mono``, ``mono_to_tri``, ``mixup_tri``,
        ``align_with_xword``, ``mono_to_tri_from_xword``, ``mixup_tri_2``,
        ``diag`` and ``mmi``.

        Each stage rebuilds the ``HMM-<gaussians>-<iteration>`` directory of
        its seed model from configuration values instead of relying on
        locals left by the previous stage (presumably so stages can be run
        in separate invocations -- confirm with callers). Progress is
        reported through ``log`` on ``self.logfh``; intermediate label and
        list files are copied into ``self.misc`` with shell ``cp``/``bzip2``.
        """

        ## Copy config file to the experiment dir
        config_output = '%s/config' % self.exp
        # Context manager so the config is flushed and closed before any
        # stage or external tool reads it.
        with open(config_output, 'w') as config_fh:
            self.config.write(config_fh)
        log(self.logfh, 'TRAINING with config [%s]' % config_output)

        if self.train_pipeline['coding']:
            log(self.logfh, 'CODING started')
            import coding
            util.create_new_dir(self.coding_root)
            coding.create_config(self)
            count = coding.wav_to_mfc(self, self.coding_root, self.mfc_list)
            os.system('cp %s %s/mfc.list.original' % (self.mfc_list, self.misc))
            log(self.logfh, 'wrote mfc files [%d]' % count)
            log(self.logfh, 'CODING finished')

        if self.train_pipeline['lm']:
            log(self.logfh, 'MLF/LM/DICT started')
            import dict_and_lm
            # Return value is not used here; the call is kept because it
            # produces self.htk_dict, which the following calls read.
            dict_and_lm.fix_cmu_dict(self.orig_dict, self.htk_dict)
            num_utts, words = dict_and_lm.make_mlf_from_transcripts(
                self, self.htk_dict, self.setup, self.data, self.word_mlf, self.mfc_list)
            log(self.logfh, 'wrote word mlf [%d utts] [%s]' % (num_utts, self.word_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.dict' % (self.mfc_list, self.misc))
            num_entries = dict_and_lm.make_train_dict(self.htk_dict, self.train_dict, words)
            dict_and_lm.make_decode_dict(self.htk_dict, self.decode_dict, words)
            log(self.logfh, 'wrote training dictionary [%d entries] [%s]' % (num_entries, self.train_dict))

            util.create_new_dir(self.lm_dir)
            train_vocab = '%s/vocab' % self.lm_dir
            ppl = dict_and_lm.build_lm_from_mlf(
                self, self.word_mlf, self.train_dict, train_vocab, self.lm_dir, self.lm, self.lm_order)
            log(self.logfh, 'wrote lm [%s] training ppl [%1.2f]' % (self.lm, ppl))
            log(self.logfh, 'MLF/LM/DICT finished')

        if self.train_pipeline['flat_start']:
            log(self.logfh, 'FLAT START started')
            import init_hmm
            init_hmm.word_to_phone_mlf(self, self.train_dict, self.word_mlf, self.phone_mlf, self.phone_list)
            log(self.logfh, 'wrote phone mlf [%s]' % self.phone_mlf)

            os.system('cp %s %s/phone.mlf.from.dict' % (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.dict' % self.misc)
            init_hmm.make_proto_hmm(self, self.mfc_list, self.proto_hmm)
            hmm_dir, num_mfcs = init_hmm.initialize_hmms(
                self, self.mono_root, self.mfc_list, self.phone_list, self.proto_hmm)
            log(self.logfh, 'initialized an HMM for each phone in [%s]' % hmm_dir)
            log(self.logfh, 'used [%d] mfc files to compute variance floor' % num_mfcs)

            import train_hmm
            hmm_dir = self._run_bw_iters(
                self.mono_root, hmm_dir, self.phone_mlf, self.phone_list, 1,
                range(1, self.initial_mono_iters + 1))

            align_config = '%s/config.align' % self.mono_root
            with open(align_config, 'w') as fh:
                fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')

            # The returned alignment directory is not needed in this stage.
            train_hmm.align(self, self.mono_root, self.mfc_list, hmm_dir, self.word_mlf,
                            self.phone_mlf, self.phone_list, self.train_dict, align_config)
            log(self.logfh, 'aligned with model in [%s], wrote phone mlf [%s]' % (hmm_dir, self.phone_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.mono.align' % (self.mfc_list, self.misc))

            os.system('cp %s %s/phone.mlf.from.mono.align' % (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.mono.align' % self.misc)

            # Continue the iteration numbering where the pre-alignment
            # passes stopped.
            hmm_dir = self._run_bw_iters(
                self.mono_root, hmm_dir, self.phone_mlf, self.phone_list, 1,
                range(self.initial_mono_iters + 1, self.initial_mono_iters + 1 + self.mono_iters))

            log(self.logfh, 'FLAT START finished')

        if self.train_pipeline['mixup_mono']:
            log(self.logfh, 'MIXUP MONO started')
            import train_hmm

            hmm_dir = '%s/HMM-%d-%d' % (self.mono_root, 1, self.initial_mono_iters + self.mono_iters)

            ## mixup everything
            for mix_size in self.mono_mixup_schedule:
                hmm_dir = train_hmm.mixup(self, self.mixup_mono_root, hmm_dir, self.phone_list, mix_size)
                log(self.logfh, 'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                hmm_dir = self._run_bw_iters(
                    self.mixup_mono_root, hmm_dir, self.phone_mlf, self.phone_list, mix_size,
                    range(1, self.mono_iters + 1))

            log(self.logfh, 'MIXUP MONO finished')

        if self.train_pipeline['mixdown_mono']:
            log(self.logfh, 'MIXDOWN MONO started')
            import train_hmm

            num_gaussians = self.mono_mixup_schedule[-1]
            hmm_dir = '%s/HMM-%d-%d' % (self.mixup_mono_root, num_gaussians, self.mono_iters)
            train_hmm.mixdown_mono(self, self.mixdown_mono_root, hmm_dir, self.phone_list)

            log(self.logfh, 'MIXDOWN MONO finished')

        if self.train_pipeline['mono_to_tri']:
            log(self.logfh, 'MONO TO TRI started')
            import train_hmm

            # Seed from the mixed-down monophones when that stage is
            # enabled, otherwise from the last flat-start model.
            if self.train_pipeline['mixdown_mono']:
                mono_final_dir = '%s/HMM-1-0' % self.mixdown_mono_root
            else:
                mono_final_dir = '%s/HMM-%d-%d' % (
                    self.mono_root, 1, self.initial_mono_iters + self.mono_iters)

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_root, mono_final_dir, self.phone_mlf,
                                            self.tri_mlf, self.phone_list, self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' % hmm_dir)
            log(self.logfh, 'created triphone mlf [%s]' % self.tri_mlf)

            os.system('cp %s %s/tri.mlf.from.mono.align' % (self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.mono.align' % self.misc)
            os.system('cp %s %s/tri.list.from.mono.align' % (self.tri_list, self.misc))

            hmm_dir = self._run_bw_iters(
                self.xword_root, hmm_dir, self.tri_mlf, self.tri_list, 1,
                range(1, self.initial_tri_iters + 1))

            xword_tie_dir = '%s/HMM-%d-%d' % (self.xword_root, 1, self.initial_tri_iters + 1)
            hmm_dir = train_hmm.tie_states_search(
                self, xword_tie_dir, hmm_dir, self.phone_list, self.tri_list, self.tied_list)
            log(self.logfh, 'tied states in [%s]' % hmm_dir)

            os.system('cp %s %s/tied.list.initial' % (self.tied_list, self.misc))

            # Re-estimation restarts from the tying directory itself, not
            # from whatever tie_states_search returned.
            hmm_dir = xword_tie_dir
            hmm_dir = self._run_bw_iters(
                self.xword_root, hmm_dir, self.tri_mlf, self.tied_list, 1,
                range(self.initial_tri_iters + 2, self.initial_tri_iters + 1 + self.tri_iters + 1))

            log(self.logfh, 'MONO TO TRI finished')

        if self.train_pipeline['mixup_tri']:
            log(self.logfh, 'MIXUP TRI started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.initial_tri_iters + self.tri_iters + 1
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, start_gaussians, start_iter)
            for mix_size in self.tri_mixup_schedule:
                # Only the split to 2 components asks mixup to estimate the
                # variance floor; other sizes use mixup's own default.
                if mix_size == 2:
                    hmm_dir = train_hmm.mixup(self, self.xword_root, hmm_dir, self.tied_list,
                                              mix_size, estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_root, hmm_dir, self.tied_list, mix_size)
                log(self.logfh, 'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                hmm_dir = self._run_bw_iters(
                    self.xword_root, hmm_dir, self.tri_mlf, self.tied_list, mix_size,
                    range(1, self.tri_iters_per_split + 1))
            log(self.logfh, 'MIXUP TRI finished')

        if self.train_pipeline['align_with_xword']:
            log(self.logfh, 'XWORD ALIGN started')
            import train_hmm

            align_config = '%s/config.align' % self.xword_root
            train_hmm.make_hvite_xword_config(self, align_config, 'MFCC_0_D_A_Z')
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians, iter_num)
            realigned_mlf = '%s/raw_tri_xword_realigned.mlf' % self.misc

            # Use the original, mfc list that has prons for every word
            os.system('cp %s/mfc.list.filtered.by.dict %s' % (self.misc, self.mfc_list))

            align_dir = train_hmm.align(self, self.xword_root, self.mfc_list, hmm_dir, self.word_mlf,
                                        realigned_mlf, self.tied_list, self.train_dict, align_config)
            log(self.logfh, 'aligned with model in [%s], tri mlf [%s]' % (hmm_dir, realigned_mlf))

            # Because of state tying, the triphones in the mlf will only be
            # valid for this state tying. Strip down to monophones, the
            # correct triphones will be created later in mono_to_tri
            train_hmm.map_tri_to_mono(self, align_dir, realigned_mlf, self.phone_mlf)
            os.system('cp %s %s/phone.mlf.from.xword.align' % (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.xword.align' % self.misc)
            os.system('bzip2 -f %s' % realigned_mlf)

            log(self.logfh, 'XWORD ALIGN finished')

        if self.train_pipeline['mono_to_tri_from_xword']:
            log(self.logfh, 'MONO TO TRI FROM XWORD started')
            import train_hmm

            # Assume that mixdown mono happened?
            mono_final_dir = '%s/HMM-1-0' % self.mixdown_mono_root

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_1_root, mono_final_dir, self.phone_mlf,
                                            self.tri_mlf, self.phone_list, self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' % hmm_dir)

            os.system('cp %s %s/tri.mlf.from.xword.align' % (self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.xword.align' % self.misc)
            os.system('cp %s %s/tri.list.from.xword.align' % (self.tri_list, self.misc))

            two_model_config = '%s/config.two_model' % self.xword_1_root
            with open(two_model_config, 'w') as fh:
                fh.write('ALIGNMODELMMF = %s/HMM-%d-%d/MMF\n'
                         % (self.xword_root, self.tri_mixup_schedule[-1], self.tri_iters_per_split))
                fh.write('ALIGNHMMLIST = %s\n' % self.tied_list)

            # Do one pass of two-model re-estimation
            extra = ' -C %s' % two_model_config
            hmm_dir = self._run_bw_iters(
                self.xword_1_root, hmm_dir, self.tri_mlf, self.tri_list, 1, [1], extra)

            xword_tie_dir = '%s/HMM-1-2' % self.xword_1_root
            hmm_dir = train_hmm.tie_states_search(
                self, xword_tie_dir, hmm_dir, self.phone_list, self.tri_list, self.tied_list)
            log(self.logfh, 'tied states in [%s]' % hmm_dir)

            os.system('cp %s %s/tied.list.second' % (self.tied_list, self.misc))

            # As in the first triphone pass, continue from the tying
            # directory rather than from tie_states_search's return value.
            hmm_dir = xword_tie_dir
            hmm_dir = self._run_bw_iters(
                self.xword_1_root, hmm_dir, self.tri_mlf, self.tied_list, 1,
                range(3, self.tri_iters + 3))

            log(self.logfh, 'MONO TO TRI FROM XWORD finished')

        if self.train_pipeline['mixup_tri_2']:
            log(self.logfh, 'MIXUP TRI 2 started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.tri_iters + 2
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_1_root, start_gaussians, start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size == 2:
                    hmm_dir = train_hmm.mixup(self, self.xword_1_root, hmm_dir, self.tied_list,
                                              mix_size, estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_1_root, hmm_dir, self.tied_list, mix_size)
                log(self.logfh, 'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                hmm_dir = self._run_bw_iters(
                    self.xword_1_root, hmm_dir, self.tri_mlf, self.tied_list, mix_size,
                    range(1, self.tri_iters_per_split + 1))
            log(self.logfh, 'MIXUP TRI 2 finished')

        if self.train_pipeline['diag']:
            log(self.logfh, 'DIAG started')
            import train_hmm

            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            # Seed from the second triphone pass when it is enabled.
            if self.train_pipeline['mixup_tri_2']:
                seed_dir = '%s/HMM-%d-%d' % (self.xword_1_root, num_gaussians, iter_num)
            else:
                seed_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians, iter_num)
            hmm_dir, lik = train_hmm.diagonalize(
                self, self.diag_root, seed_dir, self.tied_list, self.tri_mlf, num_gaussians)
            log(self.logfh, 'ran diag in [%s] lik/fr [%1.4f]' % (hmm_dir, lik))

            self._run_bw_iters(
                self.diag_root, hmm_dir, self.tri_mlf, self.tied_list, num_gaussians,
                range(1, self.tri_iters_per_split + 1))

            log(self.logfh, 'DIAG finished')

        if self.train_pipeline['mmi']:
            log(self.logfh, 'DISCRIM started')

            # The mmi helpers below receive this object as their first
            # argument, like the helpers of every other stage; the bare name
            # `model` used previously is not bound anywhere in this method.

            ## Common items
            import mmi
            mmi_dir = '%s/MMI' % self.exp
            util.create_new_dir(mmi_dir)
            mfc_list_mmi = '%s/mfc.list' % mmi_dir
            os.system('cp %s %s' % (self.mfc_list, mfc_list_mmi))

            ## Create weak LM
            import dict_and_lm
            train_vocab = '%s/vocab' % self.lm_dir
            lm_order = 2
            target_ppl_ratio = 8
            ppl = dict_and_lm.build_lm_from_mlf(
                self, self.word_mlf, self.train_dict, train_vocab, self.lm_dir, self.mmi_lm,
                lm_order, target_ppl_ratio)
            log(self.logfh, 'wrote lm for mmi [%s] training ppl [%1.2f]' % (self.mmi_lm, ppl))

            ## Create decoding lattices for every utterance
            lattice_dir = '%s/Denom/Lat_word' % mmi_dir
            util.create_new_dir(lattice_dir)
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            # Pick the most refined maximum-likelihood model that was enabled.
            if self.train_pipeline['diag']:
                model_dir = '%s/HMM-%d-%d' % (self.diag_root, num_gaussians, iter_num)
            elif self.train_pipeline['mixup_tri_2']:
                model_dir = '%s/HMM-%d-%d' % (self.xword_1_root, num_gaussians, iter_num)
            else:
                model_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians, iter_num)
            mmi.decode_to_lattices(self, lattice_dir, model_dir, mfc_list_mmi, self.mmi_lm,
                                   self.decode_dict, self.tied_list, self.word_mlf)
            log(self.logfh, 'generated training lattices in [%s]' % lattice_dir)

            ## Prune and determinize lattices
            pruned_lattice_dir = '%s/Denom/Lat_prune' % mmi_dir
            util.create_new_dir(pruned_lattice_dir)
            mmi.prune_lattices(self, lattice_dir, pruned_lattice_dir, self.decode_dict)
            log(self.logfh, 'pruned lattices in [%s]' % pruned_lattice_dir)

            ## Phone-mark lattices
            phone_lattice_dir = '%s/Denom/Lat_phone' % mmi_dir
            util.create_new_dir(phone_lattice_dir)
            mmi.phonemark_lattices(self, pruned_lattice_dir, phone_lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict, self.tied_list)
            log(self.logfh, 'phone-marked lattices in [%s]' % phone_lattice_dir)

            ## Create numerator word lattices
            num_lattice_dir = '%s/Num/Lat_word' % mmi_dir
            util.create_new_dir(num_lattice_dir)
            mmi.create_num_lattices(self, num_lattice_dir, self.mmi_lm, self.decode_dict, self.word_mlf)
            log(self.logfh, 'generated numerator lattices in [%s]' % num_lattice_dir)

            ## Phone-mark numerator lattices
            num_phone_lattice_dir = '%s/Num/Lat_phone' % mmi_dir
            util.create_new_dir(num_phone_lattice_dir)
            mmi.phonemark_lattices(self, num_lattice_dir, num_phone_lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict, self.tied_list)
            log(self.logfh, 'phone-marked numerator lattices in [%s]' % num_phone_lattice_dir)

            ## Add LM scores to numerator phone lattices
            num_phone_lm_lattice_dir = '%s/Num/Lat_phone_lm' % mmi_dir
            util.create_new_dir(num_phone_lm_lattice_dir)
            mmi.add_lm_lattices(self, num_phone_lattice_dir, num_phone_lm_lattice_dir,
                                self.decode_dict, self.mmi_lm)
            log(self.logfh, 'added LM scores to numerator lattices in [%s]' % num_phone_lm_lattice_dir)

            ## Modified Baum-Welch estimation
            root_dir = '%s/Models' % mmi_dir
            util.create_new_dir(root_dir)
            mmi_iters = 12
            mix_size = num_gaussians
            for iteration in range(1, mmi_iters + 1):
                model_dir = mmi.run_iter(self, model_dir, num_phone_lm_lattice_dir, phone_lattice_dir,
                                         root_dir, self.tied_list, mfc_list_mmi, mix_size, iteration)
                log(self.logfh, 'ran an iteration of Modified BW in [%s]' % model_dir)

            log(self.logfh, 'DISCRIM finished')

    def _run_bw_iters(self, root, hmm_dir, mlf, model_list, mix_size, iterations, extra=''):
        """Run one ``train_hmm.run_iter`` pass per entry of *iterations*.

        Each pass is seeded with the directory returned by the previous one
        and its likelihood figure is logged. Returns the directory returned
        by the last pass, or *hmm_dir* unchanged when *iterations* is empty.
        """
        import train_hmm
        for iteration in iterations:
            # The middle return value is not used by any training stage.
            hmm_dir, _, lik = train_hmm.run_iter(
                self, root, hmm_dir, mlf, model_list, mix_size, iteration, extra)
            log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' % (hmm_dir, lik))
        return hmm_dir
Example #4
0
    def train(self):

        ## Copy config file to the experiment dir
        config_output = '%s/config' % self.exp
        self.config.write(open(config_output, 'w'))
        log(self.logfh, 'TRAINING with config [%s]' % config_output)

        if self.train_pipeline['coding']:
            log(self.logfh, 'CODING started')
            import coding
            util.create_new_dir(self.coding_root)
            coding.create_config(self)
            count = coding.wav_to_mfc(self, self.coding_root, self.mfc_list)
            os.system('cp %s %s/mfc.list.original' %
                      (self.mfc_list, self.misc))
            log(self.logfh, 'wrote mfc files [%d]' % count)
            log(self.logfh, 'CODING finished')

        if self.train_pipeline['lm']:
            log(self.logfh, 'MLF/LM/DICT started')
            import dict_and_lm
            phone_set = dict_and_lm.fix_cmu_dict(self.orig_dict, self.htk_dict)
            num_utts, words = dict_and_lm.make_mlf_from_transcripts(
                self, self.htk_dict, self.setup, self.data, self.word_mlf,
                self.mfc_list)
            log(self.logfh,
                'wrote word mlf [%d utts] [%s]' % (num_utts, self.word_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.dict' %
                      (self.mfc_list, self.misc))
            num_entries = dict_and_lm.make_train_dict(self.htk_dict,
                                                      self.train_dict, words)
            dict_and_lm.make_decode_dict(self.htk_dict, self.decode_dict,
                                         words)
            log(
                self.logfh, 'wrote training dictionary [%d entries] [%s]' %
                (num_entries, self.train_dict))

            util.create_new_dir(self.lm_dir)
            train_vocab = '%s/vocab' % self.lm_dir
            ppl = dict_and_lm.build_lm_from_mlf(self, self.word_mlf,
                                                self.train_dict, train_vocab,
                                                self.lm_dir, self.lm,
                                                self.lm_order)
            log(self.logfh,
                'wrote lm [%s] training ppl [%1.2f]' % (self.lm, ppl))
            log(self.logfh, 'MLF/LM/DICT finished')

        if self.train_pipeline['flat_start']:
            log(self.logfh, 'FLAT START started')
            import init_hmm
            init_hmm.word_to_phone_mlf(self, self.train_dict, self.word_mlf,
                                       self.phone_mlf, self.phone_list)
            log(self.logfh, 'wrote phone mlf [%s]' % self.phone_mlf)

            os.system('cp %s %s/phone.mlf.from.dict' %
                      (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.dict' % self.misc)
            init_hmm.make_proto_hmm(self, self.mfc_list, self.proto_hmm)
            hmm_dir, num_mfcs = init_hmm.initialize_hmms(
                self, self.mono_root, self.mfc_list, self.phone_list,
                self.proto_hmm)
            log(self.logfh,
                'initialized an HMM for each phone in [%s]' % hmm_dir)
            log(self.logfh,
                'used [%d] mfc files to compute variance floor' % num_mfcs)

            import train_hmm
            for iter in range(1, self.initial_mono_iters + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.mono_root,
                                                   hmm_dir, self.phone_mlf,
                                                   self.phone_list, 1, iter,
                                                   '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            align_config = '%s/config.align' % self.mono_root
            fh = open(align_config, 'w')
            fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
            fh.close()

            align_dir = train_hmm.align(self, self.mono_root, self.mfc_list,
                                        hmm_dir, self.word_mlf, self.phone_mlf,
                                        self.phone_list, self.train_dict,
                                        align_config)
            log(
                self.logfh,
                'aligned with model in [%s], wrote phone mlf [%s]' %
                (hmm_dir, self.phone_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.mono.align' %
                      (self.mfc_list, self.misc))

            os.system('cp %s %s/phone.mlf.from.mono.align' %
                      (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.mono.align' % self.misc)

            for iter in range(self.initial_mono_iters + 1,
                              self.initial_mono_iters + 1 + self.mono_iters):
                hmm_dir, k, L = train_hmm.run_iter(self, self.mono_root,
                                                   hmm_dir, self.phone_mlf,
                                                   self.phone_list, 1, iter,
                                                   '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'FLAT START finished')

        if self.train_pipeline['mixup_mono']:
            log(self.logfh, 'MIXUP MONO started')
            import train_hmm

            hmm_dir = '%s/HMM-%d-%d' % (
                self.mono_root, 1, self.initial_mono_iters + self.mono_iters)

            ## mixup everything
            for mix_size in self.mono_mixup_schedule:
                hmm_dir = train_hmm.mixup(self, self.mixup_mono_root, hmm_dir,
                                          self.phone_list, mix_size)
                log(self.logfh,
                    'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                for iter in range(1, self.mono_iters + 1):
                    hmm_dir, k, L = train_hmm.run_iter(self,
                                                       self.mixup_mono_root,
                                                       hmm_dir, self.phone_mlf,
                                                       self.phone_list,
                                                       mix_size, iter, '')
                    log(
                        self.logfh,
                        'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                        (hmm_dir, L))

            log(self.logfh, 'MIXUP MONO finished')

        if self.train_pipeline['mixdown_mono']:
            log(self.logfh, 'MIXDOWN MONO started')
            import train_hmm

            num_gaussians = self.mono_mixup_schedule[-1]
            hmm_dir = '%s/HMM-%d-%d' % (self.mixup_mono_root, num_gaussians,
                                        self.mono_iters)
            train_hmm.mixdown_mono(self, self.mixdown_mono_root, hmm_dir,
                                   self.phone_list)

            log(self.logfh, 'MIXDOWN MONO finished')

        if self.train_pipeline['mono_to_tri']:
            log(self.logfh, 'MONO TO TRI started')
            import train_hmm

            if self.train_pipeline['mixdown_mono']:
                mono_final_dir = '%s/HMM-1-0' % self.mixdown_mono_root
            else:
                mono_final_dir = '%s/HMM-%d-%d' % (self.mono_root, 1,
                                                   self.initial_mono_iters +
                                                   self.mono_iters)

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_root,
                                            mono_final_dir, self.phone_mlf,
                                            self.tri_mlf, self.phone_list,
                                            self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' % hmm_dir)
            log(self.logfh, 'created triphone mlf [%s]' % self.tri_mlf)

            os.system('cp %s %s/tri.mlf.from.mono.align' %
                      (self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.mono.align' % self.misc)
            os.system('cp %s %s/tri.list.from.mono.align' %
                      (self.tri_list, self.misc))

            for iter in range(1, self.initial_tri_iters + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tri_list, 1, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            xword_tie_dir = '%s/HMM-%d-%d' % (self.xword_root, 1,
                                              self.initial_tri_iters + 1)
            hmm_dir = train_hmm.tie_states_search(self, xword_tie_dir, hmm_dir,
                                                  self.phone_list,
                                                  self.tri_list,
                                                  self.tied_list)
            log(self.logfh, 'tied states in [%s]' % hmm_dir)

            os.system('cp %s %s/tied.list.initial' %
                      (self.tied_list, self.misc))

            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, 1,
                                        self.initial_tri_iters + 1)
            for iter in range(self.initial_tri_iters + 2,
                              self.initial_tri_iters + 1 + self.tri_iters + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tied_list, 1, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'MONO TO TRI finished')

        if self.train_pipeline['mixup_tri']:
            log(self.logfh, 'MIXUP TRI started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.initial_tri_iters + self.tri_iters + 1
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, start_gaussians,
                                        start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size == 2:
                    hmm_dir = train_hmm.mixup(self,
                                              self.xword_root,
                                              hmm_dir,
                                              self.tied_list,
                                              mix_size,
                                              estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_root, hmm_dir,
                                              self.tied_list, mix_size)
                log(self.logfh,
                    'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                for iter in range(1, self.tri_iters_per_split + 1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root,
                                                       hmm_dir, self.tri_mlf,
                                                       self.tied_list,
                                                       mix_size, iter, '')
                    log(
                        self.logfh,
                        'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                        (hmm_dir, L))
            log(self.logfh, 'MIXUP TRI finished')

        if self.train_pipeline['align_with_xword']:
            log(self.logfh, 'XWORD ALIGN started')
            import train_hmm

            align_config = '%s/config.align' % self.xword_root
            train_hmm.make_hvite_xword_config(self, align_config,
                                              'MFCC_0_D_A_Z')
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians,
                                        iter_num)
            realigned_mlf = '%s/raw_tri_xword_realigned.mlf' % self.misc

            # Use the original mfc list, which has prons for every word
            os.system('cp %s/mfc.list.filtered.by.dict %s' %
                      (self.misc, self.mfc_list))

            align_dir = train_hmm.align(self, self.xword_root, self.mfc_list,
                                        hmm_dir, self.word_mlf, realigned_mlf,
                                        self.tied_list, self.train_dict,
                                        align_config)
            log(
                self.logfh, 'aligned with model in [%s], tri mlf [%s]' %
                (hmm_dir, realigned_mlf))

            # Because of state tying, the triphones in the mlf will only be
            # valid for this state tying. Strip down to monophones, the
            # correct triphones will be created later in mono_to_tri
            train_hmm.map_tri_to_mono(self, align_dir, realigned_mlf,
                                      self.phone_mlf)
            os.system('cp %s %s/phone.mlf.from.xword.align' %
                      (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.xword.align' % self.misc)
            os.system('bzip2 -f %s' % realigned_mlf)

            log(self.logfh, 'XWORD ALIGN finished')

        if self.train_pipeline['mono_to_tri_from_xword']:
            log(self.logfh, 'MONO TO TRI FROM XWORD started')
            import train_hmm

            # Assume that mixdown mono happened?
            mono_final_dir = '%s/HMM-1-0' % self.mixdown_mono_root

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_1_root,
                                            mono_final_dir, self.phone_mlf,
                                            self.tri_mlf, self.phone_list,
                                            self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' % hmm_dir)

            os.system('cp %s %s/tri.mlf.from.xword.align' %
                      (self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.xword.align' % self.misc)
            os.system('cp %s %s/tri.list.from.xword.align' %
                      (self.tri_list, self.misc))

            two_model_config = '%s/config.two_model' % self.xword_1_root
            fh = open(two_model_config, 'w')
            fh.write('ALIGNMODELMMF = %s/HMM-%d-%d/MMF\n' %
                     (self.xword_root, self.tri_mixup_schedule[-1],
                      self.tri_iters_per_split))
            fh.write('ALIGNHMMLIST = %s\n' % self.tied_list)
            fh.close()

            # Do one pass of two-model re-estimation
            extra = ' -C %s' % two_model_config
            hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root,
                                               hmm_dir, self.tri_mlf,
                                               self.tri_list, 1, 1, extra)
            log(self.logfh,
                'ran an iteration of BW in [%s] lik/fr [%1.4f]' % (hmm_dir, L))

            xword_tie_dir = '%s/HMM-1-2' % self.xword_1_root
            hmm_dir = train_hmm.tie_states_search(self, xword_tie_dir, hmm_dir,
                                                  self.phone_list,
                                                  self.tri_list,
                                                  self.tied_list)
            log(self.logfh, 'tied states in [%s]' % hmm_dir)

            os.system('cp %s %s/tied.list.second' %
                      (self.tied_list, self.misc))

            hmm_dir = '%s/HMM-1-2' % self.xword_1_root
            for iter in range(3, self.tri_iters + 3):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tied_list, 1, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'MONO TO TRI FROM XWORD finished')

        if self.train_pipeline['mixup_tri_2']:
            log(self.logfh, 'MIXUP TRI 2 started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.tri_iters + 2
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_1_root, start_gaussians,
                                        start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size == 2:
                    hmm_dir = train_hmm.mixup(self,
                                              self.xword_1_root,
                                              hmm_dir,
                                              self.tied_list,
                                              mix_size,
                                              estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_1_root, hmm_dir,
                                              self.tied_list, mix_size)
                log(self.logfh,
                    'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                for iter in range(1, self.tri_iters_per_split + 1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root,
                                                       hmm_dir, self.tri_mlf,
                                                       self.tied_list,
                                                       mix_size, iter, '')
                    log(
                        self.logfh,
                        'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                        (hmm_dir, L))
            log(self.logfh, 'MIXUP TRI 2 finished')

        if self.train_pipeline['diag']:
            log(self.logfh, 'DIAG started')
            import train_hmm

            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            if self.train_pipeline['mixup_tri_2']:
                seed_dir = '%s/HMM-%d-%d' % (self.xword_1_root, num_gaussians,
                                             iter_num)
            else:
                seed_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians,
                                             iter_num)
            hmm_dir, L = train_hmm.diagonalize(self, self.diag_root, seed_dir,
                                               self.tied_list, self.tri_mlf,
                                               num_gaussians)
            log(self.logfh, 'ran diag in [%s] lik/fr [%1.4f]' % (hmm_dir, L))

            for iter in range(1, self.tri_iters_per_split + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.diag_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tied_list,
                                                   num_gaussians, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'DIAG finished')

        if self.train_pipeline['mmi']:
            log(self.logfh, 'DISCRIM started')

            ## Common items
            import mmi
            mmi_dir = '%s/MMI' % self.exp
            util.create_new_dir(mmi_dir)
            mfc_list_mmi = '%s/mfc.list' % mmi_dir
            os.system('cp %s %s' % (self.mfc_list, mfc_list_mmi))

            ## Create weak LM
            import dict_and_lm
            train_vocab = '%s/vocab' % self.lm_dir
            lm_order = 2
            target_ppl_ratio = 8
            ppl = dict_and_lm.build_lm_from_mlf(self, self.word_mlf,
                                                self.train_dict, train_vocab,
                                                self.lm_dir, self.mmi_lm,
                                                lm_order, target_ppl_ratio)
            log(
                self.logfh, 'wrote lm for mmi [%s] training ppl [%1.2f]' %
                (self.mmi_lm, ppl))

            ## Create decoding lattices for every utterance
            lattice_dir = '%s/Denom/Lat_word' % mmi_dir
            util.create_new_dir(lattice_dir)
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            if self.train_pipeline['diag']:
                model_dir = '%s/HMM-%d-%d' % (self.diag_root, num_gaussians,
                                              iter_num)
            elif self.train_pipeline['mixup_tri_2']:
                model_dir = '%s/HMM-%d-%d' % (self.xword_1_root, num_gaussians,
                                              iter_num)
            else:
                model_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians,
                                              iter_num)
            mmi.decode_to_lattices(model, lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict,
                                   self.tied_list, self.word_mlf)
            log(self.logfh,
                'generated training lattices in [%s]' % lattice_dir)

            ## Prune and determinize lattices
            pruned_lattice_dir = '%s/Denom/Lat_prune' % mmi_dir
            util.create_new_dir(pruned_lattice_dir)
            mmi.prune_lattices(model, lattice_dir, pruned_lattice_dir,
                               self.decode_dict)
            log(self.logfh, 'pruned lattices in [%s]' % pruned_lattice_dir)

            ## Phone-mark lattices
            phone_lattice_dir = '%s/Denom/Lat_phone' % mmi_dir
            util.create_new_dir(phone_lattice_dir)
            mmi.phonemark_lattices(model, pruned_lattice_dir,
                                   phone_lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict,
                                   self.tied_list)
            log(self.logfh,
                'phone-marked lattices in [%s]' % phone_lattice_dir)

            ## Create numerator word lattices
            num_lattice_dir = '%s/Num/Lat_word' % mmi_dir
            util.create_new_dir(num_lattice_dir)
            mmi.create_num_lattices(model, num_lattice_dir, self.mmi_lm,
                                    self.decode_dict, self.word_mlf)
            log(self.logfh,
                'generated numerator lattices in [%s]' % num_lattice_dir)

            ## Phone-mark numerator lattices
            num_phone_lattice_dir = '%s/Num/Lat_phone' % mmi_dir
            util.create_new_dir(num_phone_lattice_dir)
            mmi.phonemark_lattices(model, num_lattice_dir,
                                   num_phone_lattice_dir, model_dir,
                                   mfc_list_mmi, self.mmi_lm, self.decode_dict,
                                   self.tied_list)
            log(
                self.logfh, 'phone-marked numerator lattices in [%s]' %
                num_phone_lattice_dir)

            ## Add LM scores to numerator phone lattices
            num_phone_lm_lattice_dir = '%s/Num/Lat_phone_lm' % mmi_dir
            util.create_new_dir(num_phone_lm_lattice_dir)
            mmi.add_lm_lattices(model, num_phone_lattice_dir,
                                num_phone_lm_lattice_dir, self.decode_dict,
                                self.mmi_lm)
            log(
                self.logfh, 'added LM scores to numerator lattices in [%s]' %
                num_phone_lm_lattice_dir)

            ## Modified Baum-Welch estimation
            root_dir = '%s/Models' % mmi_dir
            util.create_new_dir(root_dir)
            mmi_iters = 12
            mix_size = num_gaussians
            for iter in range(1, mmi_iters + 1):
                model_dir = mmi.run_iter(model, model_dir,
                                         num_phone_lm_lattice_dir,
                                         phone_lattice_dir, root_dir,
                                         self.tied_list, mfc_list_mmi,
                                         mix_size, iter)
                log(self.logfh,
                    'ran an iteration of Modified BW in [%s]' % model_dir)

            log(self.logfh, 'DISCRIM finished')