コード例 #1
0
def test_mark_jonhson(tmpdir, datadir):
    """Replay the "toy run" delivered with the original AG code.

    This is a transcription of a target of the Makefile shipped with the
    original AG code. The utterances read from ``ag_testeng.yld`` are
    segmented with the ``ag_testengger.lt`` grammar at the ``VP``
    category; the segmented output must have one utterance per input
    utterance and be identical to the input once spaces are removed.

    :param tmpdir: temporary directory object (must provide ``join``)
        in which the ``trace``, ``wlt`` and ``prs`` files are located
    :param datadir: path to the directory containing the test data

    """
    assert os.path.isdir(datadir)

    grammar_file = os.path.join(datadir, 'ag_testengger.lt')

    # read the input within a context manager so the file is always closed
    with codecs.open(
            os.path.join(datadir, 'ag_testeng.yld'),
            'r', encoding='utf8') as stream:
        text = list(stream)

    # Options of the original Makefile target that are NOT replayed here:
    #   -X "cat > {X1}" -X "cat > {X2}"
    #   -U "cat > {prs1}" -v {testeng2} -V "cat > {prs2}"
    arguments = (
        '-r 1234 -P -D -R -1 -d 100 -a 1e-2 -b 1 -e 1 -f 1 '
        '-g 1e2 -h 1e-2 -n 10 -C -E -A {prs} -N 10 -F {trace} -G {wlt} '
        '-u {testeng1} '
    ).format(
        testeng1=os.path.join(datadir, 'ag_testeng1.yld'),
        trace=tmpdir.join('trace'),
        wlt=tmpdir.join('wlt'),
        prs=tmpdir.join('prs'))

    output = ag.segment(
        text,
        grammar_file=grammar_file,
        category='VP',
        args=arguments,
        ignore_first_parses=0,
        nruns=1)

    # same number of utterances, and same symbols once spaces are dropped
    assert len(text) == len(output)
    for expected, observed in zip(text, output):
        assert expected.strip().replace(' ', '') == observed.replace(' ', '')
コード例 #2
0
def test_default_grammar(prep):
    """Segment `prep` without an explicit grammar and check the output.

    The output must contain as many utterances as the input and, once
    all spaces are removed, be the very same sequence of characters.

    """
    def squash(utterances):
        # concatenate all the utterances, spaces removed
        return ''.join(
            [line.replace(' ', '').strip() for line in utterances])

    result = ag.segment(prep, args=TEST_ARGUMENTS, nruns=1)
    assert len(result) == len(prep)

    observed = squash(result)
    expected = squash(prep)
    assert observed == expected
コード例 #3
0
def test_ignore_first_parses(prep, ignore):
    """Check the `ignore_first_parses` option of ag.segment.

    The test arguments are the default ones, -n 10 -x 2: 10 iterations
    yield 6 parses (the initial one, then 5 more, one each 2
    iterations).

    """
    options = dict(
        args=TEST_ARGUMENTS, nruns=1, ignore_first_parses=ignore)

    if ignore < 6:
        # some parses remain: the segmentation must succeed
        result = ag.segment(prep, **options)
        assert len(result) == len(prep)
        return

    # ignoring more than the extracted parses raises an error
    with pytest.raises(RuntimeError):
        ag.segment(prep, **options)
コード例 #4
0
def test_grammars(prep, grammar, level):
    """Segment `prep` with the grammar file `grammar` at category `level`.

    The grammar file is looked up in GRAMMAR_DIR. The output must have
    one utterance per input utterance and, spaces removed, be made of
    the same characters as the input.

    """
    def squash(utterances):
        # concatenate all the utterances, spaces removed
        return ''.join(
            [line.replace(' ', '').strip() for line in utterances])

    grammar_path = os.path.join(GRAMMAR_DIR, grammar)
    result = ag.segment(
        prep, grammar_path, level, TEST_ARGUMENTS, nruns=1)
    assert len(result) == len(prep)

    observed = squash(result)
    expected = squash(prep)
    assert observed == expected
コード例 #5
0
def test_traintext_equal_testtext(prep, grammar, level):
    """Segment `prep` while giving the very same text as `train_text`.

    The grammar file is looked up in GRAMMAR_DIR. The output must have
    one utterance per input utterance and, spaces removed, be made of
    the same characters as the input.

    """
    def squash(utterances):
        # concatenate all the utterances, spaces removed
        return ''.join(
            [line.replace(' ', '').strip() for line in utterances])

    options = dict(
        train_text=prep,
        grammar_file=os.path.join(GRAMMAR_DIR, grammar),
        category=level,
        args=TEST_ARGUMENTS,
        nruns=1)

    result = ag.segment(prep, **options)
    assert len(result) == len(prep)

    observed = squash(result)
    expected = squash(prep)
    assert observed == expected
コード例 #6
0
def test_traintext_notequal_testtext(grammar, level):
    """Segment a test text that differs from the text used for training.

    The grammar file is looked up in GRAMMAR_DIR. The output must have
    one utterance per test utterance and, spaces removed, be made of
    the same characters as the test text.

    """
    def squash(utterances):
        # concatenate all the utterances, spaces removed
        return ''.join(
            [line.replace(' ', '').strip() for line in utterances])

    # test on "good morn" and "that dog is big", utterances that are
    # absent from the training text
    test_text = ['g uh d m ao r n', 'dh ae t d ao g ih z b ih g']

    # train on "hello world", repeated 10 times
    train_text = ['hh ax l ow w er l d'] * 10

    options = dict(
        train_text=train_text,
        grammar_file=os.path.join(GRAMMAR_DIR, grammar),
        category=level,
        args=TEST_ARGUMENTS,
        nruns=1)

    result = ag.segment(test_text, **options)
    assert len(result) == len(test_text)

    observed = squash(result)
    expected = squash(test_text)
    assert observed == expected
コード例 #7
0
ファイル: tutorial.py プロジェクト: rsantana-isg/wordseg
# display the statistics (`stats`, computed earlier in the script) as
# JSON indented with 4 spaces, under a '* Statistics' header
sys.stdout.write(
    '* Statistics\n\n' +
    json.dumps(stats, indent=4) + '\n')

# prepare the input for segmentation (materialized as a list because it
# is reused by every algorithm and by every evaluation below)
prepared = list(prepare(text))

# generate the gold text
# NOTE(review): this rebinds the name `gold`, from the callable used on
# the right-hand side to the list it returns: `gold(...)` can no longer
# be called after this line.
gold = list(gold(text))

# segment the prepared text with different algorithms, each one with
# its own set of parameters
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones), it
# is built here from `text` and not from the prepared text
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate each segmentation against the gold text, the prepared text
# being given as `units`
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)


# a little function to display score with 4-digits precision