def test_empty_lines():
    """Empty utterances must be silently dropped by prepare() and gold().

    A text made only of empty lines yields nothing; empty lines mixed
    with valid utterances are skipped while the valid ones survive.
    """
    # a text of empty utterances produces no output at all
    utterances = ['', '']
    assert not list(prepare(utterances))
    assert not list(gold(utterances))

    # an empty line between two valid utterances is ignored: exactly
    # two utterances come out of both prepare() and gold()
    utterances = [
        'hh ax l ;esyll ow ;esyll ;eword',
        '',
        'hh ax l ;esyll ow ;esyll ;eword']
    prepared = list(prepare(utterances, separator=Separator(), unit='phone'))
    assert len(prepared) == 2
    golds = list(gold(utterances, separator=Separator()))
    assert len(golds) == 2
def test_replicate_cdswordseg(datadir):
    """The dibs segmenter must replicate the scores of CDSWordSeg.

    Loads the tagged corpus from ``datadir``, trains a dibs model on
    its first 200 utterances, segments the whole prepared corpus and
    compares the evaluation scores against reference values obtained
    with the dibs implementation shipped in CDSWordSeg (replicate with
    ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs",
    using wordseg.prepare and wordseg.evaluate in both cases).
    """
    separator = Separator()
    tags = [
        utterance for utterance in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utterance]

    prepared = prepare(tags, separator=separator)
    gold_text = gold(tags, separator=separator)

    # the diphone statistics are estimated on the 200 first utterances
    training_set = tags[:200]
    model = dibs.CorpusSummary(training_set)
    segmented = dibs.segment(prepared, model)

    score = evaluate(segmented, gold_text)

    # reference scores from the CDSWordSeg implementation
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate(datadir):
    """The puddle segmenter must replicate the CDSWordSeg scores.

    Segments the 100 first utterances of the tagged corpus with puddle
    (a single fold) and compares the evaluation scores with reference
    values obtained from CDSWordSeg (using wordseg.prepare and
    wordseg.evaluate in both cases).
    """
    separator = Separator()

    # keep only the 100 first non-empty utterances for a fast test
    tags = [
        utterance for utterance in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utterance][:100]

    prepared = prepare(tags, separator=separator)
    gold_text = gold(tags, separator=separator)

    segmented = puddle.segment(prepared, nfolds=1)
    score = evaluate(segmented, gold_text)

    # reference scores from the CDSWordSeg implementation
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423}

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate(datadir):
    """The tp segmenter must replicate the CDSWordSeg scores.

    Segments the 100 first utterances of the tagged corpus with the
    transitional-probabilities algorithm and compares the evaluation
    scores with reference values obtained from CDSWordSeg (using
    wordseg.prepare and wordseg.evaluate in both cases).
    """
    separator = Separator()

    # keep only the 100 first non-empty utterances for a fast test
    tags = [
        utterance for utterance in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utterance][:100]

    prepared = prepare(tags, separator=separator)
    gold_text = gold(tags, separator=separator)

    segmented = tp.segment(prepared)
    score = evaluate(segmented, gold_text)

    # reference scores from the CDSWordSeg implementation
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685}

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate_cdswordseg(datadir):
    """The dpseg segmenter must replicate the CDSWordSeg scores.

    Runs a unigram dpseg model on the 10 last utterances of the tagged
    corpus (syllable units, 5 folds) and compares the evaluation scores
    with reference values obtained from the dpseg implementation in
    CDSWordSeg.
    """
    separator = Separator()

    # Use only the last 10 lines, for a fast test. We cannot take the
    # 10 first lines because they trigger the dpseg_bugfix to correct a
    # fold (the implementation of that fix differs in CDS and wordseg,
    # so the results are not replicated exactly).
    tags = [
        utterance for utterance in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utterance][-10:]

    prepared = prepare(tags, separator=separator, unit='syllable')
    gold_text = gold(tags, separator=separator)

    # pick the unigram dmcmc configuration file shipped with wordseg
    config = [
        name for name in wordseg.utils.get_config_files('dpseg')
        if 'uni_dmcmc' in name][0]
    args = '--ngram 1 --a1 0 --b1 1 -C {}'.format(config)

    segmented = segment(prepared, nfolds=5, njobs=4, args=args)
    score = evaluate(segmented, gold_text)

    # reference scores from the dpseg version in CDSWordSeg
    expected = {
        'type_fscore': 0.3768,
        'type_precision': 0.3939,
        'type_recall': 0.3611,
        'token_fscore': 0.3836,
        'token_precision': 0.4118,
        'token_recall': 0.359,
        'boundary_all_fscore': 0.7957,
        'boundary_all_precision': 0.8409,
        'boundary_all_recall': 0.7551,
        'boundary_noedge_fscore': 0.6415,
        'boundary_noedge_precision': 0.7083,
        'boundary_noedge_recall': 0.5862}

    assert score == pytest.approx(expected, rel=1e-3)
# compute some statistics on the input text (text tokenized at phone # and word levels) separator = Separator(phone=' ', syllable=None, word=';eword') stats = CorpusStatistics(text, separator).describe_all() # display the computed statistics sys.stdout.write( '* Statistics\n\n' + json.dumps(stats, indent=4) + '\n') # prepare the input for segmentation prepared = list(prepare(text)) # generate the gold text gold = list(gold(text)) # segment the prepared text with different algorithms segmented_baseline = baseline.segment(prepared, probability=0.2) segmented_tp = tp.segment(prepared, threshold='relative') segmented_puddle = puddle.segment(prepared, njobs=4, window=2) segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1') segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100') # we must provide a trained model to dibs (with stats on diphones) model_dibs = dibs.CorpusSummary(text) segmented_dibs = dibs.segment(prepared, model_dibs) # evaluate them against the gold file eval_baseline = evaluate(segmented_baseline, gold, units=prepared) eval_tp = evaluate(segmented_tp, gold, units=prepared)