def train_language_model(target, source, env):
    """Train an n-gram language model from a plain-text transcript.

    Uses IBM's compiled LM tools that ship with Attila.  This can also be
    run on a segmented transcript, in which case the n-grams are over
    morphs rather than words.

    Sources: transcript file, n
    Targets: language model file
    """
    text_file = source[0].rstr()
    n = source[1].read()
    with temp_dir() as prefix_dir, temp_file() as vocab_file, temp_file(suffix=".txt") as sentence_file, meta_open(text_file) as text_fd:
        # wrap each transcript line in sentence-boundary markers
        sentences = ["<s> %s </s>" % (l.strip()) for l in text_fd]
        words = set(sum([s.split() for s in sentences], []) + ["<s>", "</s>", "<UNK>"])
        with meta_open(vocab_file, "w") as ofd:
            ofd.write("\n".join(words))
        with meta_open(sentence_file, "w") as ofd:
            ofd.write("\n".join(sentences))
        prefix = os.path.join(prefix_dir, "counts")
        # collect the n-gram counts
        cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, sentence_file, vocab_file, prefix)
        out, err, success = run_command(env.subst(cmd))
        if not success:
            return err
        # build an ARPA-format model from the counts
        lm = ".".join(target[0].rstr().split(".")[0:-2])
        cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
        out, err, success = run_command(env.subst(cmd),
                                        env={"SFCLMTOOLS": env.subst("${ATTILA_PATH}/tools/lm_64")})
        if not success:
            return err
    return None

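# Usage sketch (not part of the original pipeline): action functions like the one
# above are normally wrapped as SCons Builders and invoked from the SConstruct.
# The builder name "TrainLanguageModel" and the file names below are illustrative
# assumptions; the n-gram order is passed through env.Value() so the action can
# recover it with source[1].read().
#
#   from SCons.Script import Builder
#   env.Append(BUILDERS={"TrainLanguageModel": Builder(action=train_language_model)})
#   lm = env.TrainLanguageModel("work/lm/lm.3gm.arpabo.gz",
#                               ["work/transcripts/training.txt", env.Value(3)])
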
def _run_emma(target, source, env):
    """Score a morphological segmentation with the EMMA metric.

    Sources: guess segmentation file, gold segmentation file
    Targets: score file (precision, recall, f-score)
    """
    with temp_file() as gold, temp_file() as guess, meta_open(source[0].rstr()) as _guess, meta_open(source[1].rstr()) as _gold:
        guesses = [x for x in _guess]
        words = [x.split()[0] for x in guesses]
        # only score purely alphabetic words (no punctuation, no digits)
        keep = set([w for w in words if re.match(r"^\w+$", w) and not re.match(r".*\d.*", w)])
        with meta_open(gold, "w") as gold_fd:
            gold_fd.write("\n".join([x for x in _gold if x.split()[0] in keep]))
        with meta_open(guess, "w") as guess_fd:
            guess_fd.write("\n".join([x for x in guesses if x.split()[0] in keep]))
        cmd = env.subst("python ${EMMA} -g %s -p %s -L ${LPSOLVE_PATH}" % (guess, gold), source=source, target=target)
        pid = Popen(cmd.split(), stdout=PIPE)
        out, err = pid.communicate()
        # EMMA prints precision, recall and f-score as the last token of its final three lines
        prec, rec, fscore = [float(x.strip().split()[-1]) for x in out.strip().split("\n")[-3:]]
        with meta_open(target[0].rstr(), "w") as ofd:
            ofd.write("\t".join(["MorphP", "MorphR", "MorphF"]) + "\n")
            ofd.write("\t".join(["%.3f" % x for x in [prec, rec, fscore]]) + "\n")
    return None

def run_g2p(target, source, env):
    """Apply a trained g2p model to a word list to produce pronunciations.

    Sources: word/pronunciation list file, g2p model file
    Targets: g2p output file
    """
    with temp_file() as tfname, meta_open(source[0].rstr()) as pl_fd:
        # strip pronunciation-variant suffixes like "word(2)" down to the bare word
        words = set([x.split()[0].split("(")[0] for x in pl_fd])
        with meta_open(tfname, "w") as t_fd:
            t_fd.write("\n".join(words))
        out, err, success = run_command(env.subst("%s %s/bin/g2p.py --model %s --encoding=%s --apply %s --variants-mass=%f --variants-number=%d" %
                                                  (env["PYTHON"], env["OVERLAY"], source[1].rstr(), "utf-8", tfname, .9, 4)),
                                        env={"PYTHONPATH": env.subst("${OVERLAY}/lib/python2.7/site-packages")},
                                        )
        if not success:
            return err
        with meta_open(target[0].rstr(), "w") as out_fd:
            out_fd.write(out)
    return None

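# Post-processing sketch (assumption, not from the original file): the raw g2p
# output written above is treated here as tab-separated lines of roughly
# "word<TAB>variant<TAB>posterior<TAB>phoneme sequence"; verify the column order
# against the g2p version installed in ${OVERLAY} before relying on this helper.
#
#   def load_g2p_output(fname):
#       prons = {}
#       with meta_open(fname) as fd:
#           for line in fd:
#               fields = line.rstrip("\n").split("\t")
#               if len(fields) < 4:
#                   continue
#               word, phones = fields[0], fields[-1]
#               prons.setdefault(word, []).append(phones.split())
#       return prons
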
def decode(target, source, env):
    """Decode audio using a decoding network and acoustic/language models
    (based on the example pipelines from IBM).

    This is the heart, and by far the most complicated and error-prone part,
    of the pipeline.  The models IBM sent us are similar, but have small
    variations, so some need to be run differently.  This builder tries to
    figure out what to do based on which model files exist, and then runs the
    appropriate code; if it can't figure out what to run, it returns an error.
    It is also aware of how many jobs the experiment has been split into, and
    only runs the job it was assigned.  Most of the code is slightly adapted
    from the cfg.py, construct.py, and test.py files in the acoustic models
    IBM sent us.

    Sources: decoding network file, vocabulary file, pronunciation file,
             language model file
    Targets: ctm transcript file, consensus network archive
    """
    dnet, vocabulary, pronunciations, language_model = source
    out_path, tail = os.path.split(os.path.dirname(target[0].rstr()))
    env.Replace(VOCABULARY_FILE=vocabulary.rstr(),
                PRONUNCIATIONS_FILE=pronunciations.rstr(),
                LANGUAGE_MODEL_FILE=language_model.rstr(),
                NETWORK_FILE=dnet.rstr(),
                )
    cfg = CFG(env)
    postThresh = 1e-04
    mlpFile = env.maybe(env.subst("${MLP_FILE}"))
    melFile = env.maybe(env.subst("${MEL_FILE}"))
    warpFile = env.maybe(env.subst("${WARP_FILE}"))
    ldaFile = env.maybe(env.subst("${LDA_FILE}"))
    priorsFile = env.maybe(env.subst("${PRIORS_FILE}"))

    # decide which acoustic-model flavor was provided, based on which files exist
    mlp = os.path.exists(cfg.mlpFile) and "weights.mlp" in mlpFile
    nmlp = os.path.exists(cfg.mlpFile) and "weights.mlp" not in mlpFile
    layer = os.path.exists(env.subst("${MODEL_PATH}/layer0"))

    db = dbase.DB(dirFn=dbase.getFlatDir)
    fe = FeCombo(db, int(env["SAMPLING_RATE"]), env["FEATURE_TYPE"])
    fe.end = fe.fmllr
    fe.pcm.pcmDir = cfg.pcmDir
    fe.pcm.readMode = 'speaker'
    fe.norm.normMode = 1
    fe.norm.normDir = env.subst("${CMS_PATH}")
    fe.fmllr.fmllrDir = env.subst("${FMLLR_PATH}")

    #
    # from test.py
    #
    jid = int(env["JOB_ID"])
    jnr = int(env["JOB_COUNT"])
    genLat = True
    genCons = True
    writeLat = False
    writeCons = True
    cfg.useDispatcher = False
    if nmlp:
        chunkSize = 10
    else:
        chunkSize = 5
    acweight = float(env.subst("${ACOUSTIC_WEIGHT}"))
    db.init(cfg.dbFile, 'utterance', False, jid, jnr, chunkSize=chunkSize)
    fe.mel.readFilter(melFile)
    fe.mel.readWarp(warpFile)
    fe.lda.readLDA(ldaFile)
    se = dsearch.Decoder(speed=12, scale=acweight, lmType=32, genLat=genLat)
    se.initGraph(cfg)
    se.latBeam = 7
    se.linkMax = 700
    if mlp:
        fe.ctx2 = frontend.FeCTX([fe.fmllr])
        fe.ctx2.spliceN = 4
        fe.ctx2.db = db
        fe.mlp.depL = [fe.ctx2]
        fe.mlp.db = db
        fe.end = fe.mlp
        fe.mlp.mlp.read(mlpFile)
        fe.mlp.mlp.layerL[0].afct = Act_Rectified()
        fe.mlp.mlp.layerL[1].afct = Act_Rectified()
        fe.mlp.mlp.layerL[2].afct = Act_Rectified()
        fe.mlp.mlp.layerL[3].afct = Act_Sigmoid()
        fe.mlp.mlp.layerL[4].afct = Act_ID()
        se.sc = NNScorer()
        se.dnet.scorer = se.sc
        se.sc.scale = acweight
        se.sc.feat = fe.end.feat
        se.sc.logInput = True
        se.sc.readPriors(priorsFile)
    elif layer:
        fe.ctx2 = frontend.FeCTX([fe.fmllr])
        fe.ctx2.spliceN = 4
        fe.ctx2.db = db
        fe.end = fe.ctx2
        layerL = []
        for i in range(6):
            l = nnet.LayerWeights()
            l.name = 'layer%d' % i
            l.isTrainable = False
            l.initWeightFile = env.subst('${MODEL_PATH}/layer%d') % i
            layerL.append(l)
            if i < 5:
                l = nnet.LayerSigmoid()
                l.name = 'layer%d-nonl' % i
                layerL.append(l)
        layerL[-1].matrixOut = True
        nn = nnet.NeuralNet(layerL=layerL, depL=[fe.end])
        nn.db = db
        nn.configure()
        se.sc = NNScorer()
        se.dnet.scorer = se.sc
        se.sc.scale = acweight
        se.sc.feat = nn.feat
        se.sc.logInput = True
        se.sc.readPriors(priorsFile)
    elif nmlp:
        se.initAM(cfg)
        mlp = fe.mlp.mlp
        mlp.feat = MatrixCU()
        sigmoid = Act_Sigmoid()
        tanh = Act_Tanh()
        actid = Act_ID()
        softmax = Act_Softmax()
        softmax.logOutput = True
        mlp.read(mlpFile)
        for layerX in range(mlp.layerL.size()):
            mlp.layerL[layerX].afct = sigmoid
        mlp.layerL[-1].afct = actid
        se.sc = NNScorer()
        se.dnet.scorer = se.sc
        se.sc.scale = acweight
        se.sc.logInput = True
        se.sc.feat = mlp.layerL[-1].Y.mat
        se.sc.readPriors(priorsFile)
        se.latBeam = 6.5
        se.linkMax = 700
        binThresh = 1.0e-10
        writeSIL = 0
        totUtt = 0
        totArc = 0
        totNonSil = 0
        totDur = 0.0
        totDens = 0.0
    else:
        return "Don't know how to run ASR with these models!"

    with meta_open(target[0].rstr(), "w") as ctm_ofd, tarfile.open(target[1].rstr(), "w|gz") as tf_ofd, temp_file() as temp_fname:
        for utt in db:
            key = utt + ' ' + os.path.splitext(db.getFile(utt))[0]
            if mlp or nmlp:
                fe.end.eval(utt)
            else:
                nn.eval(utt)
            se.search()
            txt = se.getHyp().strip()
            hyp = se.getCTM(key, db.getFrom(utt))
            tscore = se.getScore()
            for c in hyp:
                ctm_ofd.write("%s\n" % (c))
            se.rescore(env["RESCORE_BEAM"])
            # truncate the scratch file before writing this utterance's lattice/consensus
            with meta_open(temp_fname, "w") as ofd:
                pass
            if writeLat:
                fname = "%s.fsm" % (utt)
                se.lat.write(temp_fname, db.getFrom(utt))
            elif writeCons:
                fname = "%s.cons" % (utt)
                arcN = len(se.lat.arcs)
                durS = db.getTo(utt) - db.getFrom(utt)
                dens = arcN / durS
                se.consensus(postThresh)
                binThresh = 1.0e-10
                writeSIL = 0
                se.cons.write(temp_fname, db.getFrom(utt), binThresh, writeSIL)
            tf_ofd.add(temp_fname, arcname=fname)
        tf_ofd.close()
    return None

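# Invocation sketch (not from the original file): the docstring above notes that
# decoding is split across jobs via JOB_ID/JOB_COUNT, which dbase.DB uses to
# select this job's slice of utterances.  The builder name "Decode" and the
# output naming scheme below are assumptions for illustration only.
#
#   from SCons.Script import Builder
#   env.Append(BUILDERS={"Decode": Builder(action=decode)})
#   for job in range(int(env["JOB_COUNT"])):
#       job_env = env.Clone(JOB_ID=job)
#       job_env.Decode(["work/asr/job-%d/transcript.ctm" % job,
#                       "work/asr/job-%d/consensus.tgz" % job],
#                      [dnet, vocabulary, pronunciations, language_model])
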