Beispiel #1
0
def combine_txt(indirpath, outfilepath, overwrite=False):
    """Combine the text entries of a directory into a single output file.

    Every entry returned by ``io.load_txt_dir`` becomes one output line:
    the entry's first element is dropped and the remaining elements are
    joined with single spaces.

    Note that ``io`` is the project's own I/O helper module here, not the
    standard-library ``io``.

    Args:
        indirpath: Directory handed to ``io.load_txt_dir``.
        outfilepath: Path handed to ``io.open_writefile_safe``.
        overwrite: Forwarded unchanged to ``io.open_writefile_safe``;
            presumably permits replacing an existing file (confirm
            against that helper).
    """
    lines = io.load_txt_dir(indirpath)

    wf = io.open_writefile_safe(outfilepath, overwrite)
    # Guarantee the handle is released even if a write fails part-way.
    try:
        for line in lines:
            wf.write(" ".join(line[1:]) + "\n")
    finally:
        wf.close()
Beispiel #2
0
def combine_txt(indirpath, outfilepath, overwrite=False):
    """Write all text entries found in ``indirpath`` to ``outfilepath``.

    The entries come from ``io.load_txt_dir`` (the project's I/O helper
    module, not the standard-library ``io``). For each entry everything
    after its first element is joined with spaces and written as one line.

    Args:
        indirpath: Directory to read; passed to ``io.load_txt_dir``.
        outfilepath: Destination path; passed to
            ``io.open_writefile_safe``.
        overwrite: Passed straight through to ``io.open_writefile_safe``.
    """
    lines = io.load_txt_dir(indirpath)

    wf = io.open_writefile_safe(outfilepath, overwrite)
    try:
        for line in lines:
            wf.write(" ".join(line[1:]) + "\n")
    finally:
        # Always close the output, including when a write raises.
        wf.close()
Beispiel #3
0
def get_text_utts(indir, compilexpath):
    """Create utterances from the text files in ``indir``.

    The text is loaded with ``load_txt_dir`` and a dictionary is built from
    ``compilexpath``. If ``get_oov_words`` reports anything, the reported
    words are printed and processing stops.

    Args:
        indir: Directory passed to ``load_txt_dir``.
        compilexpath: Path passed to ``dictionary.Dictionary``.

    Returns:
        Whatever ``get_utts`` returns for the loaded text.

    Raises:
        SiReError: If at least one out-of-vocabulary word was reported.

    NOTE(review): ``args`` is neither a parameter nor assigned in this
    function, so it has to exist at module level already -- confirm.
    """
    texts = load_txt_dir(indir)
    lexicon = dictionary.Dictionary(compilexpath)
    missing = get_oov_words(texts, lexicon)

    # Refuse to continue while any word is missing from the dictionary.
    if len(missing) > 0:
        print("Please remove all OOV word containing sents or add the words to dictonary before proceeding.")
        for word in missing:
            print(word)
        raise SiReError("OOV words present, cannot continue.")

    # get_utts reads its configuration from the shared args object.
    args.dictionary = lexicon
    args.intype = "txt"
    return get_utts(texts, args)
Beispiel #4
0
def get_text_utts(indir, compilexpath):
  """Turn a directory of text files into utterances.

  Loads the text via ``load_txt_dir``, builds a dictionary from
  ``compilexpath`` and checks the text against it with ``get_oov_words``.
  Any reported words are printed one per line before a ``SiReError`` is
  raised; otherwise the result of ``get_utts`` is returned.

  NOTE(review): ``args`` is not defined inside this function; it is
  presumably a module-level object -- verify.
  """
  loaded = load_txt_dir(indir)
  lexicon = dictionary.Dictionary(compilexpath)
  unknown_words = get_oov_words(loaded, lexicon)

  if len(unknown_words) > 0:
    print("Please remove all OOV word containing sents or add the words to dictonary before proceeding.")
    for unknown in unknown_words:
      print(unknown)
    raise SiReError("OOV words present, cannot continue.")

  # Configure the shared args object before building the utterances.
  args.dictionary = lexicon
  args.intype = "txt"
  return get_utts(loaded, args)
Beispiel #5
0
     raise SiReError("REDUCTION_LEVEL must be a float value! Was {0}!".format(args.pron_reduced[0]))
 else:
   args.pron_reduced = False
 
 # Optional Stanford parses are read first and stored on args.
 if args.stanford_pcfg_parse:
   args.pcfgdict = read_stanford_pcfg_parses(args.parsedir)

 if args.stanford_dependency_parse:
   args.dependencydict = read_stanford_dependency_parses(args.parsedir)

 # Load the labs according to the declared input type.
 if args.intype == "txt":
   if not os.path.isdir(args.inpath):
     raise SiReError("Input path is not a directory! It must be when creating labs from text.")
   args.txtdir = args.inpath
   labs = io.load_txt_dir(args.txtdir, args.comma_is_pause)
   # Identity comparison is the correct test for a missing option value.
   if args.dict is None:
     raise SiReError("No path to dictionary. Please use -dict option.")
   args.dictionary = dictionary.Dictionary(args.dict[1], args.dict[0])
   #The phoneme set used must match the dictionary.
   args.phoneme_features = args.dictionary.phoneme_feats
 elif args.intype == "hts_lab":
   labs = io.open_labdir_line_by_line(args.inpath)
   # The input type is re-tagged as "hts_mlf" for the code that follows.
   args.intype = "hts_mlf"
 elif args.intype == "sire_lab":
   labs = io.open_labdir_line_by_line(args.inpath)
 else:
   if not os.path.exists(args.inpath):
     raise SiReError("Input path to mlf does no exist!")
   # Read through a context manager so the file handle is closed promptly.
   with open(args.inpath, "r") as mlf_file:
     mlf = mlf_file.readlines()
   labs = io.parse_mlf(mlf, args.intype)
Beispiel #6
0
        options += " " + " ".join(args.lm_binary_options)
    else:
        if args.lm_type == "WORD_NGRAM":
            options = " -debug 2 -tolower -unk"
        elif args.lm_type == "PHONEME_NGRAM":
            #      options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10 -out-nbest-dir "+args.outdirpath
            options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10"
            if args.f:
                options += " -overwrite"

    # Score according to the language-model type.
    if args.lm_type == "WORD_NGRAM":
        if not args.pre_scored:
            scores = score_word_ngram(args)
        else:
            # Reuse the existing score file; the context manager closes it.
            scored_path = os.path.join(args.outdirpath, "scored.txt")
            with open(scored_path, "r") as scored_file:
                scores = scored_file.read()
        #Do the work
        write_word_score_files(scores, args)
    elif args.lm_type == "PHONEME_NGRAM":
        #First create lattices
        if not args.lattices_exist:
            txt = io.load_txt_dir(args.txtdir)
            # Keep the instance under its own name. Assigning to
            # `dictionary` would make that name local if this code lives
            # in a function (so `dictionary.Dictionary` raises
            # UnboundLocalError), and would overwrite the module otherwise.
            lexicon = dictionary.Dictionary(args.combilexpath)
            create_lattices_and_list(txt, args.outdirpath, lexicon, args.f)
        #Then score them
        lattice_list_path = os.path.join(args.outdirpath, "lattices.list")
        # NOTE: run through the shell because the output is redirected
        # with ">"; the paths are inserted unquoted.
        subprocess.call(args.lm_binary + " -lm " + args.lm_path +
                        " -in-lattice-list " + lattice_list_path + options +
                        " > " + os.path.join(args.outdirpath, "scored.txt"),
                        shell=True)
Beispiel #7
0
   raise SiReError("You must be doing at least one mlf or slf type.")
 
 # Settings that are read when utterances are created.
 args.intype = "txt"
 args.stanford_pcfg_parse = False
 args.stanford_dependency_parse = False
 args.festival_features = False
 args.dictionary = dictionary.Dictionary(args.combilexpath)

 # Pronunciation-variant mlfs are only possible when slfs are output too.
 if args.pronoun_variant and not (args.slf_phoneme or args.slf_align):
   raise SiReError("Cannot create pronounciation variant mlfs. Please output slfs.")

 txtfiles = io.load_txt_dir(args.txtdir)

 # The mlf files are opened before the loop so that a single pass over the
 # text files is enough even when slfs are written as well.
 if args.mlf:
   # mlf that includes short pauses
   sp_path = os.path.join(args.outdir, args.mlfname+"_sp.mlf")
   wfsp = open(sp_path, "w")
   wfsp.write("#!MLF!#\n")
   # mlf without short pauses
   nosp_path = os.path.join(args.outdir, args.mlfname+"_no_sp.mlf")
   wfnosp = open(nosp_path, "w")
   wfnosp.write("#!MLF!#\n")

 for txt in txtfiles:
   print("Processing {0}".format(txt))
   if args.mlf:
     # Build the utterance for this text entry.
     utt = utterance.Utterance(txt, args)
Beispiel #8
0
    # Settings that are read when utterances are created.
    args.intype = "txt"
    args.stanford_pcfg_parse = False
    args.stanford_dependency_parse = False
    args.festival_features = False
    args.dictionary = dictionary.Dictionary(args.combilexpath)

    # Pronunciation-variant mlfs are only possible when slfs are output too.
    if args.pronoun_variant and not (args.slf_phoneme or args.slf_align):
        raise SiReError(
            "Cannot create pronounciation variant mlfs. Please output slfs."
        )

    txtfiles = io.load_txt_dir(args.txtdir)

    # The mlf files are opened before the loop so that a single pass over
    # the text files is enough even when slfs are written as well.
    if args.mlf:
        # mlf that includes short pauses
        sp_path = os.path.join(args.outdir, args.mlfname + "_sp.mlf")
        wfsp = open(sp_path, "w")
        wfsp.write("#!MLF!#\n")
        # mlf without short pauses
        nosp_path = os.path.join(args.outdir, args.mlfname + "_no_sp.mlf")
        wfnosp = open(nosp_path, "w")
        wfnosp.write("#!MLF!#\n")

    for txt in txtfiles:
        print "Processing {0}".format(txt)
        if args.mlf:
            #Make an utt
Beispiel #9
0
            #      options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10 -out-nbest-dir "+args.outdirpath
            options = " -viterbi-decode -read-htk -order 4 -debug 1 -no-expansion -nbest-decode 10"
            if args.f:
                options += " -overwrite"

    # Score according to the language-model type.
    if args.lm_type == "WORD_NGRAM":
        if not args.pre_scored:
            scores = score_word_ngram(args)
        else:
            # Read the existing score file and close the handle right away.
            with open(os.path.join(args.outdirpath, "scored.txt"), "r") as scored_file:
                scores = scored_file.read()
        # Do the work
        write_word_score_files(scores, args)
    elif args.lm_type == "PHONEME_NGRAM":
        # First create lattices
        if not args.lattices_exist:
            txt = io.load_txt_dir(args.txtdir)
            # Do not rebind `dictionary`: inside a function that makes the
            # name local and `dictionary.Dictionary` raises
            # UnboundLocalError; at module level it replaces the module.
            lexicon = dictionary.Dictionary(args.combilexpath)
            create_lattices_and_list(txt, args.outdirpath, lexicon, args.f)
        # Then score them
        lattice_list_path = os.path.join(args.outdirpath, "lattices.list")
        # NOTE: the shell is needed for the ">" redirection; the paths are
        # inserted into the command line unquoted.
        subprocess.call(
            args.lm_binary
            + " -lm "
            + args.lm_path
            + " -in-lattice-list "
            + lattice_list_path
            + options
            + " > "
            + os.path.join(args.outdirpath, "scored.txt"),
            shell=True,
        )
Beispiel #10
0
    else:
        args.pron_reduced = False

    # Optional Stanford parses are read first and stored on args.
    if args.stanford_pcfg_parse:
        args.pcfgdict = read_stanford_pcfg_parses(args.parsedir)

    if args.stanford_dependency_parse:
        args.dependencydict = read_stanford_dependency_parses(args.parsedir)

    # Load the labs according to the declared input type.
    if args.intype == "txt":
        if not os.path.isdir(args.inpath):
            raise SiReError(
                "Input path is not a directory! It must be when creating labs from text."
            )
        args.txtdir = args.inpath
        labs = io.load_txt_dir(args.txtdir, args.comma_is_pause)
        # Identity comparison is the correct test for a missing option value.
        if args.dict is None:
            raise SiReError("No path to dictionary. Please use -dict option.")
        args.dictionary = dictionary.Dictionary(args.dict[1], args.dict[0])
        #The phoneme set used must match the dictionary.
        args.phoneme_features = args.dictionary.phoneme_feats
    elif args.intype == "hts_lab":
        labs = io.open_labdir_line_by_line(args.inpath)
        # labs is a list of lists. Each list within the list of one of the labs
        # The input type is re-tagged as "hts_mlf" for the code that follows.
        args.intype = "hts_mlf"
    elif args.intype == "sire_lab":
        labs = io.open_labdir_line_by_line(args.inpath)
    else:
        if not os.path.exists(args.inpath):
            raise SiReError("Input path to mlf does no exist!")