def run_gp(mapping, crf, lm):
    """Run the g2p server.

    Loads all components (aligner, CRF transcriber, n-gram rater,
    output formatter) and then starts the web application.

    Args:
        mapping: path to the grapheme/phoneme alignment mapping file.
        crf: path to the trained transcription CRF model.
        lm: path to the trained n-gram language model.
    """
    #
    # loading
    #
    click.echo("Loading...", err=True)
    click.echo("...data alignment", err=True)
    aligner = gp.Aligner(mapping=mapping)
    click.echo("...transcription CRF model", err=True)
    transcriber = gp.Transcriber()
    transcriber.load(crf)
    click.echo("...n-gram language model", err=True)
    rater = gp.Rater.load(lm)
    click.echo("...output formatter", err=True)
    formatter = gp.Formatter()
    #
    # load app and run
    #
    app = apps.create_gp_app(aligner, transcriber, rater, formatter)
    app.run()
def apply_gp(mapping, crf, lm, strings):
    """Convert strings to their best-rated transcriptions.

    For each input string, all segmentations are scanned, each
    segmentation is transcribed, and every candidate transcription is
    rated; the highest-rated transcription is printed to stdout.

    Args:
        mapping: path to the grapheme/phoneme alignment mapping file.
        crf: path to the trained transcription CRF model.
        lm: path to the trained n-gram language model.
        strings: strings to convert; a single "-" reads them from stdin.
    """
    #
    # loading
    #
    click.echo("Loading...", err=True)
    click.echo("...data alignment", err=True)
    aligner = gp.Aligner(mapping=mapping)
    click.echo("...transcription CRF model", err=True)
    transcriber = gp.Transcriber()
    transcriber.load(crf)
    click.echo("...n-gram language model", err=True)
    rater = gp.Rater.load(lm)
    #
    # conversion
    #
    # read input: "-" means stdin, otherwise use the arguments themselves
    if strings and strings[0] == "-":
        in_strings = [line.strip() for line in sys.stdin]
    else:
        in_strings = list(strings) if strings else []
    # convert
    for string in in_strings:
        segmentations = aligner.scan(string.lower())
        best_transcription = []
        # start at -inf so the first candidate always wins, even when the
        # rater returns negative scores (e.g. log-probabilities); the old
        # 0.0 start silently discarded all-negative candidates and
        # printed an empty line
        best_prob = float("-inf")
        for segmentation in segmentations:
            for transcription in transcriber.transcribe(segmentation):
                prob = rater.rate([segmentation, transcription])
                if prob >= best_prob:
                    best_prob = prob
                    best_transcription = transcription
        click.echo(",".join(best_transcription))
def train_gp(mapping, model, data):
    """Train a model.

    Runs three stages: (1) align the training data, (2) train the
    transcription CRF model, (3) train the rating n-gram language
    model.  The trained models are saved to ``model + ".gp.crf"`` and
    ``model + ".gp.ngram"``.

    Args:
        mapping: path to the grapheme/phoneme alignment mapping file.
        model: basename (path prefix) for the saved model files.
        data: path to the training data; tab-separated lines of
            grapheme string and phoneme string, "#" starts a comment.
    """
    #
    # stage 1: alignment
    #
    click.echo("Stage 1a: creating data alignment", err=True)
    # the aligner
    aligner = gp.Aligner(mapping=mapping)
    click.echo("Stage 1b: aligning training data", err=True)
    # iterate over input and align training data
    aligned_training_data = []
    with open(str(data), "r") as f:
        training_data = f.read()
    for line in tqdm(training_data.split("\n")):
        # skip comments
        if line.startswith("#"):
            continue
        # assume tab-separated values; skip malformed/empty lines
        fields = line.split("\t")
        if len(fields) < 2:
            continue
        # align
        alignment = aligner.align(fields[0], fields[1])
        if alignment:
            aligned_training_data.append(alignment)
        else:
            # diagnostic message: send to stderr like every other
            # status message (was inconsistently printed to stdout)
            click.echo("%s and %s could not be aligned." % (fields[0], fields[1]), err=True)
    #
    # stage 2: crf training
    #
    click.echo("Stage 2: training transcription CRF model", err=True)
    # the transcriber
    transcriber = gp.Transcriber()
    # train with previously aligned training data
    transcriber.train(aligned_training_data)
    # save
    transcriber.save(model + ".gp.crf")
    #
    # stage 3: language model training
    #
    click.echo("Stage 3: training rating n-gram language model", err=True)
    # the rater
    rater = gp.Rater()
    # train with previously aligned training data
    rater.train(aligned_training_data)
    # save
    rater.save(model + ".gp.ngram")
def test_scan(datadir):
    """Scanning "aabb" yields both possible segmentations."""
    aligner = gp.Aligner(mapping=datadir.join('test_alignment.txt'))
    expected = [['a', 'a', 'b', 'b'], ['aa', 'b', 'b']]
    assert aligner.scan(u"aabb") == expected
def test_align(datadir):
    """Aligning "aabb" with "abbbb" pairs the expected segment lists."""
    aligner = gp.Aligner(mapping=datadir.join('test_alignment.txt'))
    graphemes, phonemes = aligner.align(u"aabb", u"abbbb")[0:2]
    assert graphemes == ['aa', 'b', 'b']
    assert phonemes == ['a', 'bb', 'bb']
def test_expand(datadir):
    """Expanding "aabb" produces a well-formed FST."""
    aligner = gp.Aligner(mapping=datadir.join('test_alignment.txt'))
    fst = aligner.expand(u"aabb")
    # dump for manual inspection, then check structural validity
    fst.draw('/tmp/exp.dot')
    assert fst.verify()
def test_segment(datadir):
    """Segmenting "aabb" produces a well-formed FST.

    Renamed from ``test_scan``: the duplicate name shadowed the scan
    test defined earlier in this module, so pytest only collected one
    of the two.
    """
    aligner = gp.Aligner(mapping=datadir.join('test_alignment.txt'))
    seg_fst = aligner.segment(u"aabb")
    seg_fst.draw('/tmp/seg.dot')
    assert (seg_fst.verify())
def test_chain(datadir):
    """Chaining "aabb" produces a well-formed FST."""
    aligner = gp.Aligner(mapping=datadir.join('test_alignment.txt'))
    fst = aligner.chain(u"aabb")
    # dump for manual inspection, then check structural validity
    fst.draw('/tmp/chain.dot')
    assert fst.verify()
def test_loading(datadir):
    """Loading a mapping file puts the aligner into status 1."""
    loaded = gp.Aligner(mapping=datadir.join('test_alignment.txt'))
    assert loaded.status == 1
def test_constructor():
    """A mapping-less Aligner can still be constructed."""
    aligner = gp.Aligner()
    # identity check with `is not None` (PEP 8), not `!= None`
    assert aligner is not None