Python CorpusSummary Examples

Programming Language: Python

Namespace/Package Name: wordseg.algos.dibs

Method/Function: CorpusSummary

Examples at hotexamples.com: 6

Python CorpusSummary - 6 examples found. These are the top-rated real-world Python examples of wordseg.algos.dibs.CorpusSummary, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def test_replicate_cdswordseg(datadir):
    """Check that dibs reproduces the scores obtained with CDSWordSeg.

    The tagged corpus is read from ``tagged.txt`` in `datadir`. The
    model is trained on the first 200 utterances and the whole
    prepared text is then segmented and evaluated against its gold
    version.

    """
    sep = Separator()

    # read the corpus within a context manager so the file handle is
    # closed as soon as the utterances are loaded (it was previously
    # left open until garbage collection)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') as f:
        _tags = [utt for utt in f if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordSeg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    # scores are floats rounded to 4 digits: compare with a relative
    # tolerance instead of strict equality
    assert score == pytest.approx(expected, rel=1e-3)

Example #2

Show file

def test_phone_sep(level):
    """Check the corpus summary when '_' separates units at `level`.

    The underscore is declared as the phone separator when `level` is
    'phone' and as the syllable separator when `level` is 'syllable';
    the other one of the two is left undefined (None).

    """
    text = ['hh_ih_r_;eword ',
            'dh_eh_r_;eword w_iy_;eword g_ow_;eword ']

    # only the separator matching the requested level is defined
    phone_sep = '_' if level == 'phone' else None
    syllable_sep = '_' if level == 'syllable' else None
    sep = Separator(phone=phone_sep, syllable=syllable_sep, word=';eword ')

    model = dibs.CorpusSummary(text, separator=sep, level=level)

    expected = {'nlines': 2, 'nwords': 4, 'nphones': 10}
    assert model.summary == expected

Example #3

Show file

def test_basic(prep, tags, type, threshold, pwb):
    """Check that segmentation preserves the content of each line.

    A model is built from `tags`, then `prep` is segmented with the
    given `type`, `threshold` and `pwb`. The output must have as many
    lines as `prep`, and each output line must be equal to the matching
    input line once the separators are removed from both.

    """
    sep = Separator()
    model = dibs.CorpusSummary(tags, separator=sep)

    segmented = dibs.segment(
        prep, model, type=type, threshold=threshold, pwb=pwb)
    out = list(segmented)

    strip = Separator().remove
    assert len(out) == len(prep)

    # lines are numbered from 1 in the error message
    for lineno, (result, source) in enumerate(zip(out, prep), start=1):
        got = strip(result)
        want = strip(source)
        assert got == want, 'line {}: "{}" != "{}"'.format(lineno, got, want)

Example #4

Show file

def test_emptyline_in_train(tags):
    """Check that an empty line in the train text does not raise.

    Empty lines are expected to be ignored in the train file; the test
    passes as long as building the model raises no exception.

    """
    train_with_empty_line = [''] + tags
    dibs.CorpusSummary(train_with_empty_line)

Example #5

Show file

def test_bad_train(prep):
    """Check that a train text without word separators is rejected.

    Building a model directly from the prepared text `prep` must raise
    a ValueError.

    """
    train_without_word_separators = prep
    with pytest.raises(ValueError):
        dibs.CorpusSummary(train_without_word_separators)

Example #6

Show file

File: tutorial.py Project: rsantana-isg/wordseg

# prepare the input for segmentation (`text` is defined earlier in the
# tutorial, outside of this excerpt)
prepared = list(prepare(text))

# generate the gold text
# NOTE(review): this rebinds the name `gold` from the callable to the
# list it returned, so `gold(...)` cannot be called again below this
# line -- confirm this is intended
gold = list(gold(text))

# segment the prepared text with different algorithms, each one being
# called with its own algorithm-specific options
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones),
# here the model is built from the same `text` that is segmented
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate them against the gold file, the prepared text being given
# as `units` for each evaluation
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)


# a little function to display score with 4-digits precision
def display(score):
    if score is None:
        return 'None'