def test_mark_johnson(tmpdir, datadir):
    # this is a transcription of the original "toy run" delivered with
    # the original AG code (as a target in the Makefile)
    assert os.path.isdir(datadir)

    grammar_file = os.path.join(datadir, 'ag_testengger.lt')
    text = list(codecs.open(
        os.path.join(datadir, 'ag_testeng.yld'), 'r', encoding='utf8'))

    arguments = (
        '-r 1234 -P -D -R -1 -d 100 -a 1e-2 -b 1 -e 1 -f 1 '
        '-g 1e2 -h 1e-2 -n 10 -C -E -A {prs} -N 10 -F {trace} -G {wlt} '
        # '-X "cat > {X1}" -X "cat > {X2}" '
        # '-U "cat > {prs1}" -v {testeng2} -V "cat > {prs2}" '
        '-u {testeng1} '.format(
            testeng1=os.path.join(datadir, 'ag_testeng1.yld'),
            # testeng2=os.path.join(datadir, 'ag_testeng2.yld'),
            trace=tmpdir.join('trace'),
            wlt=tmpdir.join('wlt'),
            prs=tmpdir.join('prs')
            # X1=tmpdir.join('X1'), X2=tmpdir.join('X2'),
            # prs1=tmpdir.join('prs1'), prs2=tmpdir.join('prs2')))
        ))

    # pc = ag.ParseCounter(len(text))
    output = ag.segment(
        text, grammar_file=grammar_file, category='VP',
        args=arguments, ignore_first_parses=0, nruns=1)

    assert len(text) == len(output)
    for i in range(len(text)):
        assert text[i].strip().replace(' ', '') == output[i].replace(' ', '')
def test_default_grammar(prep):
    segmented = ag.segment(prep, args=TEST_ARGUMENTS, nruns=1)
    assert len(segmented) == len(prep)

    segmented = ''.join(utt.replace(' ', '').strip() for utt in segmented)
    prep = ''.join(utt.replace(' ', '').strip() for utt in prep)
    assert segmented == prep
def test_ignore_first_parses(prep, ignore):
    # we use the default test values -n 10 -x 2 (10 iterations yield 6
    # parses: the initial one plus one every 2 iterations)
    if ignore < 6:
        segmented = ag.segment(
            prep, args=TEST_ARGUMENTS, nruns=1, ignore_first_parses=ignore)
        assert len(segmented) == len(prep)
    else:
        # ignoring more parses than were extracted raises an error
        with pytest.raises(RuntimeError):
            ag.segment(
                prep, args=TEST_ARGUMENTS, nruns=1,
                ignore_first_parses=ignore)
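
# A sketch of the parse-count arithmetic assumed in the comment above
# (the helper name `expected_nparses` is hypothetical, not part of
# wordseg): with `-n` sampling iterations and a parse traced every `-x`
# iterations, the sampler emits the initial parse plus one parse per
# trace interval.
def expected_nparses(niterations=10, trace_every=2):
    return 1 + niterations // trace_every

assert expected_nparses(10, 2) == 6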
def test_grammars(prep, grammar, level):
    grammar = os.path.join(GRAMMAR_DIR, grammar)
    segmented = ag.segment(prep, grammar, level, TEST_ARGUMENTS, nruns=1)
    assert len(segmented) == len(prep)

    segmented = ''.join(utt.replace(' ', '').strip() for utt in segmented)
    prep = ''.join(utt.replace(' ', '').strip() for utt in prep)
    assert segmented == prep
def test_traintext_equal_testtext(prep, grammar, level):
    grammar = os.path.join(GRAMMAR_DIR, grammar)
    segmented = ag.segment(
        prep, train_text=prep, grammar_file=grammar, category=level,
        args=TEST_ARGUMENTS, nruns=1)
    assert len(segmented) == len(prep)

    segmented = ''.join(utt.replace(' ', '').strip() for utt in segmented)
    prep = ''.join(utt.replace(' ', '').strip() for utt in prep)
    assert segmented == prep
def test_traintext_notequal_testtext(grammar, level):
    # hello world
    train_text = ['hh ax l ow w er l d'] * 10
    # good morn, that dog is big (almost no phones shared with train)
    test_text = ['g uh d m ao r n', 'dh ae t d ao g ih z b ih g']

    grammar = os.path.join(GRAMMAR_DIR, grammar)
    segmented = ag.segment(
        test_text, train_text=train_text, grammar_file=grammar,
        category=level, args=TEST_ARGUMENTS, nruns=1)
    assert len(segmented) == len(test_text)

    segmented = ''.join(utt.replace(' ', '').strip() for utt in segmented)
    prep = ''.join(utt.replace(' ', '').strip() for utt in test_text)
    assert segmented == prep
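
# The tests above all compare texts through the same normalization
# (strip each utterance, drop whitespace, concatenate). A hypothetical
# helper, not part of the test suite, could factor out that pattern:
def _collapse(utterances):
    # concatenate utterances, ignoring word boundaries and edge whitespace
    return ''.join(utt.replace(' ', '').strip() for utt in utterances)

# e.g. the final assertions could read: assert _collapse(segmented) == _collapse(prep)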
sys.stdout.write(
    '* Statistics\n\n' + json.dumps(stats, indent=4) + '\n')

# prepare the input for segmentation
prepared = list(prepare(text))

# generate the gold text
gold = list(gold(text))

# segment the prepared text with different algorithms
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones)
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate them against the gold file
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)

# a little function to display score with 4-digit precision
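# one possible sketch of that display helper (assuming each score is a
# float or None, as in the evaluation results above):
def display(score):
    return 'None' if score is None else '%.4f' % score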