Example #1
0
def test_replicate_cdswordseg(datadir):
    """Check that dibs reproduces the scores obtained with CDSWordSeg.

    The tagged utterances are read from ``tagged.txt`` in `datadir`. A
    model is built from the first 200 of them, the whole prepared text
    is segmented with that model and the evaluation scores are compared
    to reference values with a relative tolerance of 1e-3.

    """
    sep = Separator()

    # read the file within a context manager so the handle is closed
    # as soon as the utterances are loaded (it was previously leaked)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') as f:
        _tags = [utt for utt in f if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordseg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
Example #2
0
def test_phone_sep(level):
    """Check the corpus summary counts for a given segmentation level.

    The '_' token is declared as the phone separator when `level` is
    'phone' and as the syllable separator when `level` is 'syllable';
    the other separator is left to None. The summary of the two-line
    corpus must report 2 lines, 4 words and 10 phones.

    """
    utterances = [
        'hh_ih_r_;eword ',
        'dh_eh_r_;eword w_iy_;eword g_ow_;eword ']

    # only the separator matching the requested level is set to '_'
    phone_sep = None
    syllable_sep = None
    if level == 'phone':
        phone_sep = '_'
    if level == 'syllable':
        syllable_sep = '_'

    separator = Separator(
        phone=phone_sep, syllable=syllable_sep, word=';eword ')

    summary = dibs.CorpusSummary(
        utterances, separator=separator, level=level).summary
    assert summary == {'nlines': 2, 'nwords': 4, 'nphones': 10}
Example #3
0
def test_basic(prep, tags, type, threshold, pwb):
    """Check that segmentation preserves the content of each line.

    A model is built from `tags` and used to segment `prep` with the
    given `type`, `threshold` and `pwb` options. The output must have
    as many lines as the input and, once passed through
    ``Separator().remove``, each output line must equal its input line.

    """
    model = dibs.CorpusSummary(tags, separator=Separator())

    segmented = list(
        dibs.segment(prep, model, type=type, threshold=threshold, pwb=pwb))

    strip = Separator().remove
    assert len(segmented) == len(prep)

    # line numbers in the failure message are 1-based
    for lineno, (got, ref) in enumerate(zip(segmented, prep), start=1):
        got_raw, ref_raw = strip(got), strip(ref)
        assert got_raw == ref_raw, 'line {}: "{}" != "{}"'.format(
            lineno, got_raw, ref_raw)
Example #4
0
def test_emptyline_in_train(tags):
    """Building a model from a train text with an empty line must not raise."""
    # empty lines are ignored in train file
    train_with_blank = [''] + tags
    dibs.CorpusSummary(train_with_blank)
Example #5
0
def test_bad_train(prep):
    """Building a model from the prepared text must raise ValueError."""
    # cannot have a train text without word separators
    train_without_words = prep
    with pytest.raises(ValueError):
        dibs.CorpusSummary(train_without_words)
Example #6
0
# prepare the input for segmentation (materialized as a list because
# it is reused by every algorithm and by every evaluation below)
prepared = list(prepare(text))

# generate the gold text
# NOTE(review): this rebinds the name `gold` from the function to its
# result, so the gold() function can no longer be called after this line
gold = list(gold(text))

# segment the prepared text with different algorithms, each one being
# given its own options
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones).
# The model is built from the original `text`, not from `prepared`.
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate them against the gold file, one score per algorithm (the
# prepared text is passed to each evaluation as `units`)
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)


# a little function to display score with 4-digits precision
def display(score):
    if score is None:
        return 'None'