def test_replicate_cdswordseg(datadir):
    """Check that dibs reproduces the scores obtained with CDSWordSeg.

    A model is trained on the first 200 utterances of ``tagged.txt``
    (found in `datadir`), the whole prepared text is segmented with it
    and the evaluation is compared to reference scores with a relative
    tolerance of 1e-3.
    """
    sep = Separator()

    # load the tagged utterances; the context manager guarantees the file
    # is closed once they are read (it was previously left open)
    tags_path = os.path.join(datadir, 'tagged.txt')
    with codecs.open(tags_path, 'r', encoding='utf8') as tags_file:
        _tags = [utt for utt in tags_file if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordseg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
def test_phone_sep(level):
    """Summarize a two-line corpus whose `level` units are joined by '_'.

    Only the separator matching `level` ('phone' or 'syllable') is set to
    '_', the other one is None. The resulting summary must report 2
    lines, 4 words and 10 phones.
    """
    text = [
        'hh_ih_r_;eword ',
        'dh_eh_r_;eword w_iy_;eword g_ow_;eword ']

    # both sub-word separators are disabled, then the requested one is
    # switched on
    unit_separators = {'phone': None, 'syllable': None}
    if level in unit_separators:
        unit_separators[level] = '_'
    sep = Separator(word=';eword ', **unit_separators)

    model = dibs.CorpusSummary(text, separator=sep, level=level)

    expected_summary = {'nlines': 2, 'nwords': 4, 'nphones': 10}
    assert model.summary == expected_summary
def test_basic(prep, tags, type, threshold, pwb):
    """Segment `prep` with a model built on `tags` and compare to the input.

    The output must have as many lines as `prep`, and each output line
    must equal the matching input line once both have been passed
    through ``Separator().remove``.
    """
    model = dibs.CorpusSummary(tags, separator=Separator())
    segmented = list(dibs.segment(
        prep, model, type=type, threshold=threshold, pwb=pwb))

    strip = Separator().remove

    assert len(segmented) == len(prep)
    for lineno, (seg_line, prep_line) in enumerate(
            zip(segmented, prep), start=1):
        got = strip(seg_line)
        want = strip(prep_line)
        assert got == want, 'line {}: "{}" != "{}"'.format(lineno, got, want)
def test_emptyline_in_train(tags):
    """Build a corpus summary from a train text starting with an empty line.

    The test passes as long as the construction does not raise: empty
    lines are ignored in the train file.
    """
    train_with_empty_line = [''] + tags
    dibs.CorpusSummary(train_with_empty_line)
def test_bad_train(prep):
    """Building a corpus summary from `prep` must raise ValueError.

    A train text cannot be given without word separators.
    """
    expected_error = ValueError
    with pytest.raises(expected_error):
        dibs.CorpusSummary(prep)
# prepare the input for segmentation prepared = list(prepare(text)) # generate the gold text gold = list(gold(text)) # segment the prepared text with different algorithms segmented_baseline = baseline.segment(prepared, probability=0.2) segmented_tp = tp.segment(prepared, threshold='relative') segmented_puddle = puddle.segment(prepared, njobs=4, window=2) segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1') segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100') # we must provide a trained model to dibs (with stats on diphones) model_dibs = dibs.CorpusSummary(text) segmented_dibs = dibs.segment(prepared, model_dibs) # evaluate them against the gold file eval_baseline = evaluate(segmented_baseline, gold, units=prepared) eval_tp = evaluate(segmented_tp, gold, units=prepared) eval_puddle = evaluate(segmented_puddle, gold, units=prepared) eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared) eval_ag = evaluate(segmented_ag, gold, units=prepared) eval_dibs = evaluate(segmented_dibs, gold, units=prepared) # a little function to display score with 4-digits precision def display(score): if score is None: return 'None'