def combine_txt(indirpath, outfilepath, overwrite=False):
    """Combine the text entries loaded from a directory into one file.

    Every entry returned by io.load_txt_dir(indirpath) is written to
    outfilepath as one line: the entry's elements from the second one
    onwards, joined by single spaces and terminated by a newline.

    indirpath   - directory handed to io.load_txt_dir.
    outfilepath - path handed to io.open_writefile_safe.
    overwrite   - forwarded unchanged to io.open_writefile_safe
                  (presumably allows replacing an existing file; confirm
                  against that helper).
    """
    lines = io.load_txt_dir(indirpath)
    wf = io.open_writefile_safe(outfilepath, overwrite)
    try:
        for line in lines:
            # The first element is left out of the output (presumably the
            # utterance/file id; verify against io.load_txt_dir).
            wf.write(" ".join(line[1:]) + "\n")
    finally:
        # Always release the handle, even if an entry fails to be written.
        wf.close()
def get_text_utts(indir, compilexpath):
    """Build utterances from the text files in indir.

    The text is loaded with load_txt_dir and checked against the dictionary
    constructed from compilexpath. If get_oov_words reports any
    out-of-vocabulary words they are printed one per line and a SiReError is
    raised. Otherwise the dictionary and the "txt" input type are stored on
    args and the result of get_utts is returned.

    NOTE(review): args is not a parameter or local of this function; it is
    presumably a module-level object - confirm it exists where this is called.
    """
    sents = load_txt_dir(indir)
    lexicon = dictionary.Dictionary(compilexpath)
    missing = get_oov_words(sents, lexicon)

    if len(missing) == 0:
        # Everything is covered by the dictionary, so utts can be created.
        args.dictionary = lexicon
        args.intype = "txt"
        return get_utts(sents, args)

    # Single-argument print(...) outputs the same text as the print statement.
    print("Please remove all OOV word containing sents or add the words to dictonary before proceeding.")
    for word in missing:
        print(word)
    raise SiReError("OOV words present, cannot continue.")
raise SiReError("REDUCTION_LEVEL must be a float value! Was {0}!".format(args.pron_reduced[0])) else: args.pron_reduced = False if args.stanford_pcfg_parse: args.pcfgdict = read_stanford_pcfg_parses(args.parsedir) if args.stanford_dependency_parse: args.dependencydict = read_stanford_dependency_parses(args.parsedir) if args.intype == "txt": if not os.path.isdir(args.inpath): raise SiReError("Input path is not a directory! It must be when creating labs from text.") args.txtdir = args.inpath labs = io.load_txt_dir(args.txtdir, args.comma_is_pause) if args.dict == None: raise SiReError("No path to dictionary. Please use -dict option.") args.dictionary = dictionary.Dictionary(args.dict[1], args.dict[0]) #The phoneme set used must match the dictionary. args.phoneme_features = args.dictionary.phoneme_feats elif args.intype == "hts_lab": labs = io.open_labdir_line_by_line(args.inpath) args.intype = "hts_mlf" elif args.intype == "sire_lab": labs = io.open_labdir_line_by_line(args.inpath) else: if not os.path.exists(args.inpath): raise SiReError("Input path to mlf does no exist!") mlf = open(args.inpath, "r").readlines() labs = io.parse_mlf(mlf, args.intype)
options += " " + " ".join(args.lm_binary_options) else: if args.lm_type == "WORD_NGRAM": options = " -debug 2 -tolower -unk" elif args.lm_type == "PHONEME_NGRAM": # options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10 -out-nbest-dir "+args.outdirpath options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10" if args.f: options += " -overwrite" if args.lm_type == "WORD_NGRAM": if not args.pre_scored: scores = score_word_ngram(args) else: scores = open(os.path.join(args.outdirpath, "scored.txt"), "r").read() #Do the work write_word_score_files(scores, args) elif args.lm_type == "PHONEME_NGRAM": #First create lattices if not args.lattices_exist: txt = io.load_txt_dir(args.txtdir) dictionary = dictionary.Dictionary(args.combilexpath) create_lattices_and_list(txt, args.outdirpath, dictionary, args.f) #Then score them lattice_list_path = os.path.join(args.outdirpath, "lattices.list") subprocess.call(args.lm_binary + " -lm " + args.lm_path + " -in-lattice-list " + lattice_list_path + options + " > " + os.path.join(args.outdirpath, "scored.txt"), shell=True)
raise SiReError("You must be doing at least one mlf or slf type.") #Used for utt creation args.intype = "txt" args.stanford_pcfg_parse = False args.stanford_dependency_parse = False args.festival_features = False args.dictionary = dictionary.Dictionary(args.combilexpath) if args.pronoun_variant: if args.slf_phoneme or args.slf_align: pass else: raise SiReError("Cannot create pronounciation variant mlfs. Please output slfs.") txtfiles = io.load_txt_dir(args.txtdir) #Opening the mlf files here means we don't have to loop twice if outputting slfs as well. if args.mlf: #Out mlf with short pause wfsp = open(os.path.join(args.outdir, args.mlfname+"_sp.mlf"), "w") wfsp.write("#!MLF!#\n") #out mlf without short pause wfnosp = open(os.path.join(args.outdir, args.mlfname+"_no_sp.mlf"), "w") wfnosp.write("#!MLF!#\n") for txt in txtfiles: print "Processing {0}".format(txt) if args.mlf: #Make an utt utt = utterance.Utterance(txt, args)
#Used for utt creation args.intype = "txt" args.stanford_pcfg_parse = False args.stanford_dependency_parse = False args.festival_features = False args.dictionary = dictionary.Dictionary(args.combilexpath) if args.pronoun_variant: if args.slf_phoneme or args.slf_align: pass else: raise SiReError( "Cannot create pronounciation variant mlfs. Please output slfs." ) txtfiles = io.load_txt_dir(args.txtdir) #Opening the mlf files here means we don't have to loop twice if outputting slfs as well. if args.mlf: #Out mlf with short pause wfsp = open(os.path.join(args.outdir, args.mlfname + "_sp.mlf"), "w") wfsp.write("#!MLF!#\n") #out mlf without short pause wfnosp = open(os.path.join(args.outdir, args.mlfname + "_no_sp.mlf"), "w") wfnosp.write("#!MLF!#\n") for txt in txtfiles: print "Processing {0}".format(txt) if args.mlf: #Make an utt
# options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10 -out-nbest-dir "+args.outdirpath options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10" if args.f: options += " -overwrite" if args.lm_type == "WORD_NGRAM": if not args.pre_scored: scores = score_word_ngram(args) else: scores = open(os.path.join(args.outdirpath, "scored.txt"), "r").read() # Do the work write_word_score_files(scores, args) elif args.lm_type == "PHONEME_NGRAM": # First create lattices if not args.lattices_exist: txt = io.load_txt_dir(args.txtdir) dictionary = dictionary.Dictionary(args.combilexpath) create_lattices_and_list(txt, args.outdirpath, dictionary, args.f) # Then score them lattice_list_path = os.path.join(args.outdirpath, "lattices.list") subprocess.call( args.lm_binary + " -lm " + args.lm_path + " -in-lattice-list " + lattice_list_path + options + " > " + os.path.join(args.outdirpath, "scored.txt"), shell=True, )
else: args.pron_reduced = False if args.stanford_pcfg_parse: args.pcfgdict = read_stanford_pcfg_parses(args.parsedir) if args.stanford_dependency_parse: args.dependencydict = read_stanford_dependency_parses(args.parsedir) if args.intype == "txt": if not os.path.isdir(args.inpath): raise SiReError( "Input path is not a directory! It must be when creating labs from text." ) args.txtdir = args.inpath labs = io.load_txt_dir(args.txtdir, args.comma_is_pause) if args.dict == None: raise SiReError("No path to dictionary. Please use -dict option.") args.dictionary = dictionary.Dictionary(args.dict[1], args.dict[0]) #The phoneme set used must match the dictionary. args.phoneme_features = args.dictionary.phoneme_feats elif args.intype == "hts_lab": labs = io.open_labdir_line_by_line(args.inpath) # print "This is a lab", len(labs[0]) # labs is a list of lists. Each list within the list of one of the labs args.intype = "hts_mlf" elif args.intype == "sire_lab": labs = io.open_labdir_line_by_line(args.inpath) else: if not os.path.exists(args.inpath): raise SiReError("Input path to mlf does no exist!")