def test_levels():
    """levels() lists only the separator levels that are defined."""
    all_three = Separator(phone='a', syllable='b', word='c')
    assert all_three.levels() == ['phone', 'syllable', 'word']

    no_word = Separator(phone='a', syllable='b', word=None)
    assert no_word.levels() == ['phone', 'syllable']

    phone_only = Separator(phone='a', syllable=None, word=None)
    assert phone_only.levels() == ['phone']
def test_split_vs_tokenize(text, expected, keep_boundaries):
    """split() keeps empty tokens whereas tokenize() discards them."""
    sep = Separator(phone='p', syllable='s', word='w')

    split_result = list(sep.split(text, 'word', keep_boundaries=keep_boundaries))
    assert split_result == expected

    token_result = list(
        sep.tokenize(text, 'word', keep_boundaries=keep_boundaries))
    assert token_result == [token for token in expected if len(token)]
def __init__(self):
    # token separation on words only (a single space, no phone or
    # syllable separators)
    self.separator = Separator(phone=None, syllable=None, word=' ')

    # per word-type counters for each segmentation outcome
    self.under_segmentation = collections.defaultdict(int)
    self.over_segmentation = collections.defaultdict(int)
    self.mis_segmentation = collections.defaultdict(int)
    self.correct_segmentation = collections.defaultdict(int)
def test_summary_perfect(gold):
    """A text compared against itself is 100% correctly segmented."""
    report = summary(gold, gold)

    space_sep = Separator(phone=None, syllable=None, word=' ')
    total_words = sum(
        len(space_sep.tokenize(utt, level='word')) for utt in gold)

    # no error category is populated
    assert all(not report[kind] for kind in ('under', 'over', 'mis'))

    # the 'correct' counts sum to the number of word tokens
    assert sum(report['correct'].values()) == total_words
def test_describe3(tags):
    """Explicit tag separators and the default ones agree on word stats."""
    explicit = Separator(phone=' ', syllable=';esyll', word=';eword')
    stats_tags = CorpusStatistics(
        tags, separator=explicit).describe_tokens('word')
    stats_gold = CorpusStatistics(
        tags, separator=Separator()).describe_tokens('word')
    assert pytest.approx(stats_tags) == stats_gold
def test_basic(prep, tags, type, threshold, pwb):
    """Dibs output equals its input once the separators are removed."""
    model = dibs.CorpusSummary(tags, separator=Separator())
    segmented = list(dibs.segment(
        prep, model, type=type, threshold=threshold, pwb=pwb))

    strip = Separator().remove
    assert len(segmented) == len(prep)
    for num, (out_utt, in_utt) in enumerate(zip(segmented, prep)):
        assert strip(out_utt) == strip(in_utt), \
            'line {}: "{}" != "{}"'.format(
                num + 1, strip(out_utt), strip(in_utt))
def test_empty_lines():
    """Empty utterances are ignored by prepare() and gold()."""
    # a text made only of empty lines yields nothing
    text = ['', '']
    assert len(list(prepare(text))) == 0
    assert len(list(gold(text))) == 0

    # an empty line in the middle is simply skipped
    text = [
        'hh ax l ;esyll ow ;esyll ;eword',
        '',
        'hh ax l ;esyll ow ;esyll ;eword']
    assert len(list(prepare(text, separator=Separator(), unit='phone'))) == 2
    assert len(list(gold(text, separator=Separator()))) == 2
def test_entropy(tags):
    """Entropy needs phone-level separation and matches a reference value."""
    # a word-only separator cannot support entropy computation
    word_only = CorpusStatistics(
        UTTS, separator=Separator(phone=None, syllable=None, word=' '))
    with pytest.raises(KeyError):
        word_only.normalized_segmentation_entropy()

    # with the default (full) separator the value is well defined
    full = CorpusStatistics(tags, Separator())
    assert full.normalized_segmentation_entropy() \
        == pytest.approx(0.06298494117721846)
def test_remove_level():
    """remove() can strip all separators or only a requested level."""
    sep = Separator(phone='p', syllable='s', word='w')
    assert sep.remove('..p.s.p.w') == '.....'
    assert sep.remove('..p.s.p.w', level='phone') == '...s..w'
    assert sep.remove('..p.s.p.w', level='syllable') == '..p..p.w'
    assert sep.remove('..p.s.p.w', level='word') == '..p.s.p.'

    # removing a separator absent from the string is a no-op
    sep = Separator(phone=';', syllable='_', word=' ')
    assert sep.remove('ab c', level='phone') == 'ab c'
def test_no_vowel(onsets, vowels):
    """A vowel-less word raises, is skipped, or gets a filling vowel."""
    text = 's;i; a;j; l;j; a;l; a;j; '

    # default behaviour: a word without vowel is an error
    syllabifier = Syllabifier(
        onsets, vowels, separator=Separator(';', '_', ' '))
    with pytest.raises(ValueError) as err:
        syllabifier.syllabify([text])
    assert 'no vowel in word' in str(err.value)

    # tolerant mode: the faulty utterance is dropped
    syllabifier = Syllabifier(
        onsets, vowels, separator=Separator(';', '_', ' '))
    assert syllabifier.syllabify([text], tolerant=True) == []

    # filling_vowel mode: a silent vowel lets syllabification proceed
    syllabifier = Syllabifier(
        onsets, vowels, separator=Separator(';', '_', ' '),
        filling_vowel=True)
    assert syllabifier.syllabify([text]) == ['s;i;_ a;j;_ l;j;_ a;l;_ a;j;_ ']
def test_replicate(datadir):
    """Replicate the puddle score obtained with the CDSWordSeg recipe."""
    sep = Separator()

    # read the 100 first non-empty utterances, closing the file
    # afterwards (the original left the handle open)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'),
            'r', encoding='utf8') as fin:
        _tags = [utt for utt in fin if utt][:100]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423}

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate(datadir):
    """Replicate the TP score obtained with the CDSWordSeg recipe."""
    sep = Separator()

    # read the 100 first non-empty utterances, closing the file
    # afterwards (the original left the handle open)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'),
            'r', encoding='utf8') as fin:
        _tags = [utt for utt in fin if utt][:100]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = tp.segment(_prepared)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685}

    assert score == pytest.approx(expected, rel=1e-3)
def test_prepare_bad_types():
    """prepare() fails on inputs that are not sequences of strings."""
    # a dict of ints has no string interface, must fail
    with pytest.raises(AttributeError):
        list(prepare({1: 1, 2: 2}))

    # a list of ints neither
    with pytest.raises(AttributeError):
        list(prepare([1, 2], separator=Separator()))
def gold(text, separator=Separator()):
    """Returns a gold text from a phonologized one

    The returned gold text is the ground-truth segmentation. It has
    phone and syllable separators removed and word separators replaced
    by a single space ' '. It is used to evaluate the output of
    segmentation algorithms.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`. The phone and syllable
        separators may be undefined (None), the word separator must be
        defined.

    Returns
    -------
    gold_text : generator
        Gold utterances with separators removed and words separated by
        spaces. The returned text is the gold version, against which
        the algorithms are evaluated.

    """
    # delete phone and syllable separators (guard against them being
    # None, as str.replace(None, ...) raises TypeError). Replace word
    # boundaries by a single space.
    gold = (
        line
        .replace(separator.syllable or '', '')
        .replace(separator.phone or '', '')
        .replace(separator.word, ' ')
        for line in text)

    # delete any duplicate, begin or end spaces. As for prepare, we
    # ignore empty lines.
    return (line for line in (utils.strip(line) for line in gold) if line)
def test_descibe2(tags):
    """describe_all() matches reference values on the tagged sample."""
    separator = Separator(phone=' ', syllable=';esyll', word=';eword')
    stats = CorpusStatistics(tags, separator=separator).describe_all()

    expected = {
        'corpus': {
            'entropy': 0.06298494117721846,
            'mattr': 0.7166666666666667,
            'nutts': 13,
            'nutts_single_word': 4},
        'phones': {'tokens': 121, 'types': 28, 'hapaxes': 5},
        'syllables': {'tokens': 49, 'types': 31, 'hapaxes': 24},
        'words': {'tokens': 34, 'types': 24, 'hapaxes': 19}}

    for section, values in expected.items():
        assert stats[section] == pytest.approx(values)
def test_puddle(prep, window, nfolds, njobs):
    """Puddle output equals its input once the separators are removed."""
    segmented = list(
        puddle.segment(prep, window=window, nfolds=nfolds, njobs=njobs))
    strip = Separator().remove

    assert len(segmented) == len(prep)
    for num, (out_utt, in_utt) in enumerate(zip(segmented, prep)):
        assert strip(out_utt) == strip(in_utt), \
            'line {}: "{}" != "{}"'.format(
                num + 1, strip(out_utt), strip(in_utt))
def test_cspanish_phones(onsets, vowels, strip):
    """Syllabify a Spanish sample, with and without separator stripping."""
    separator = Separator(phone=';', syllable='_', word=' ')
    text = [
        'n;o; s;e; k;a;e; ',
        's;i; a;j; a;j; a;l; a;j; ',
        'es;t;a; a;j; l;a; t;a;t;a; e;9u; ',
        'm;i;r;a; es;t;a; x;u;g;a;n;9o; ']

    if strip:
        expected = [
            'n;o s;e k;a_e',
            's;i a;j a;j a;l a;j',
            'es_t;a a;j l;a t;a_t;a e_9u',
            'm;i_r;a es_t;a x;u_g;a;n_9o']
    else:
        expected = [
            'n;o;_ s;e;_ k;a;_e;_ ',
            's;i;_ a;j;_ a;j;_ a;l;_ a;j;_ ',
            'es;_t;a;_ a;j;_ l;a;_ t;a;_t;a;_ e;_9u;_ ',
            'm;i;_r;a;_ es;_t;a;_ x;u;_g;a;n;_9o;_ ']

    syllabifier = Syllabifier(onsets, vowels, separator=separator)
    assert syllabifier.syllabify(text, strip=strip) == expected
def __init__(self, onsets, vowels, separator=Separator(),
             filling_vowel=False, log=utils.null_logger()):
    """Initialize the syllabifier from onsets and vowels inventories.

    Parameters
    ----------
    onsets : list
        The possible syllable onsets, as strings.
    vowels : list
        The vowels of the language, as strings.
    separator : Separator, optional
        Token separation in the texts to syllabify.
    filling_vowel : bool, optional
        When True, register an extra "silent" vowel so that words
        without any vowel can still be syllabified.
    log : logging.Logger, optional
        Where to send log messages.

    Raises
    ------
    ValueError
        If `onsets` or `vowels` is not a non-empty list.

    """
    # ensure onsets and vowels are not empty (validate before any
    # attribute assignment, so we fail fast)
    if not isinstance(vowels, list) or not len(vowels):
        raise ValueError('unvalid or empty vowels list')
    if not isinstance(onsets, list) or not len(onsets):
        raise ValueError('unvalid or empty onsets list')

    # copy the inventories so we never mutate the caller's lists (the
    # silent vowel below would otherwise be appended in place)
    self.onsets = list(onsets)
    self.vowels = list(vowels)
    self.separator = separator
    self.log = log

    # concatenation of all chars in onsets and vowels (useful to
    # detect any char during syllabification)
    self.symbols = (set(''.join(v for v in vowels)).union(
        set(''.join(o for o in onsets))))

    # if defined, ensure the silent vowel is not already used
    if filling_vowel:
        # find a silent vowel (some char not already present in
        # the symbols)
        code = 1
        while six.unichr(code) in self.symbols:
            code += 1
        self.silent = six.unichr(code)
        self.symbols.add(self.silent)
        self.vowels.append(self.silent)
    else:
        self.silent = None
def main():
    """Entry point of the 'wordseg-baseline' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-baseline',
        description=__doc__,
        add_arguments=_add_arguments)

    # setup the seed for random number generation
    if args.random:
        log.info('setup random seed to %s', args.random)
        random.seed(args.random)

    if args.oracle:
        # load the oracle text, closing the file once read (the
        # original left the handle open)
        if not os.path.isfile(args.oracle):
            raise ValueError('oracle file not found: {}'.format(args.oracle))
        with codecs.open(args.oracle, 'r') as oracle_file:
            oracle_text = list(oracle_file)
        log.info('loaded %s utterances from oracle text', len(oracle_text))

        # init the oracle tokens separator
        oracle_separator = Separator(
            phone=args.phone_separator,
            syllable=args.syllable_separator,
            word=args.word_separator)

        segmented = segment_oracle(
            streamin, oracle_text, oracle_separator, args.level, log=log)
    else:
        segmented = segment(
            streamin, probability=args.probability, log=log)

    streamout.write('\n'.join(segmented) + '\n')
def test_replicate_cdswordseg(datadir):
    """Replicate the dibs score obtained with the CDSWordSeg recipe."""
    sep = Separator()

    # read the non-empty utterances, closing the file afterwards (the
    # original left the handle open)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'),
            'r', encoding='utf8') as fin:
        _tags = [utt for utt in fin if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordseg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
def main():
    """Entry point of the 'wordseg-stats' command"""
    # options description
    def add_arguments(parser):
        parser.add_argument(
            '--json', action='store_true',
            help='print the results in JSON format, else print in raw text')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-stats',
        description=__doc__,
        add_arguments=add_arguments,
        separator=Separator())

    # compute the statistics
    stats = CorpusStatistics(streamin, separator, log=log)
    results = stats.describe_all()

    # display the results either as a JSON string or in raw text
    if args.json:
        streamout.write(json.dumps(results, indent=4) + '\n')
    else:
        # renamed loop variables so they do not shadow `stats` above
        lines = (
            ' '.join((section, key, str(value)))
            for section, values in results.items()
            for key, value in values.items())
        streamout.write('\n'.join(lines) + '\n')
def test_remove_re():
    """Separators are interpreted as regular expressions by remove()."""
    # 'ab' only matches the literal substring
    sep = Separator('ab', None, None)
    assert sep.remove('ab') == ''
    assert sep.remove('aa') == 'aa'
    assert sep.remove('[ab]') == '[]'

    # '[ab]' is a character class matching 'a' or 'b'
    sep = Separator('[ab]', None, None)
    assert sep.remove('ab') == ''
    assert sep.remove('aa') == ''
    assert sep.remove('[ab]') == '[]'

    # escaped patterns are rejected at construction time
    with pytest.raises(ValueError):
        Separator(r'\[ab\]', None, None)
    with pytest.raises(ValueError):
        Separator(re.escape('[ab]'), None, None)
def test_most_common():
    """most_common_tokens() returns the n most frequent word types."""
    stats = CorpusStatistics(
        UTTS, separator=Separator(phone=None, syllable=None, word=' '))
    top_freq = stats.most_common_tokens('word', n=4)
    assert dict(top_freq) == {'i': 2, 'people': 2, "she's": 2, 'like': 2}
def test_cspanish_default_separator(onsets, vowels, strip):
    """Syllabify with the default separator, stripped or not."""
    text = ['m i r a ;eword']
    if strip:
        expected = ['m i;esyllr a']
    else:
        expected = ['m i ;esyllr a ;esyll;eword']

    syllabifier = Syllabifier(onsets, vowels, separator=Separator())
    assert syllabifier.syllabify(text, strip=strip) == expected
def test_tp(prep, threshold, dependency):
    """Check input and output are the same, once the separators are removed"""
    segmented = list(
        tp.segment(prep, threshold=threshold, dependency=dependency))
    strip = Separator().remove

    assert len(segmented) == len(prep)
    for num, (out_utt, in_utt) in enumerate(zip(segmented, prep)):
        assert strip(out_utt) == strip(in_utt), \
            'line {}: "{}" != "{}"'.format(
                num + 1, strip(out_utt), strip(in_utt))
def test_remove_restore_phones(text):
    """Removing then restoring phone separators is the identity."""
    separator = Separator(phone=';', syllable='_', word=' ')
    syllabifier = Syllabifier(['foo'], ['bar'], separator=separator)

    clean, index = syllabifier._remove_phone_separators(text)
    assert not re.search(separator.phone, clean)

    restored = syllabifier._restore_phone_separators(
        clean, index, strip=False)
    assert restored == text
def test_remove():
    """remove() strips every separator occurrence from a string."""
    sep = Separator(phone='p', syllable='s', word='w')
    assert sep.remove('abc') == 'abc'
    assert sep.remove('wss p') == ' '

    # a realistic utterance with phone and word separators
    sep = Separator(phone='_', word=';eword ')
    utt = 'j_uː_;eword n_oʊ_;eword dʒ_ʌ_s_t_;eword s_t_uː_p_ɪ_d_ɪ_ɾ_i_;eword '
    assert sep.remove(utt) == 'juːnoʊdʒʌststuːpɪdɪɾi'
def test_tokenize_full_nosyll():
    """tokenize() nesting depth follows the levels that are defined."""
    utt = 'j_uː_ n_oʊ_ dʒ_ʌ_s_t_ '

    # words then phones (no syllable level): depth 2
    sep = Separator(phone='_', syllable=None, word=' ')
    assert list(sep.tokenize(utt)) \
        == [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', 't']]

    # words, syllables and phones: depth 3 (one syllable per word here)
    sep = Separator(phone='_', syllable=';', word=' ')
    assert list(sep.tokenize(utt)) \
        == [[['j', 'uː']], [['n', 'oʊ']], [['dʒ', 'ʌ', 's', 't']]]

    # tokenize phones only: flat list
    utt = utt.replace(' ', '')
    sep = Separator(phone='_', syllable=None, word=None)
    assert list(sep.tokenize(utt)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
def test_phone_sep(level):
    """The dibs summary is the same at phone and syllable level here."""
    text = ['hh_ih_r_;eword ', 'dh_eh_r_;eword w_iy_;eword g_ow_;eword ']

    # '_' plays the role of the requested level, the other is undefined
    sep = Separator(
        phone='_' if level == 'phone' else None,
        syllable='_' if level == 'syllable' else None,
        word=';eword ')

    model = dibs.CorpusSummary(text, separator=sep, level=level)
    assert model.summary == {'nlines': 2, 'nwords': 4, 'nphones': 10}
def test_remove_phones():
    """_remove_phone_separators() cleans text and records phone lengths."""
    # space as phone separator, with syllable/word tags
    separator = Separator(phone=' ', syllable=';esyll', word=';eword')
    syllabifier = Syllabifier(['foo'], ['bar'], separator=separator)
    clean, index = syllabifier._remove_phone_separators('a b ;ewordc ;eword')
    assert clean == 'ab;ewordc;eword'
    assert index == [[1, 1], [1]]

    # ';' as phone separator
    separator = Separator(phone=';', syllable='_', word=' ')
    syllabifier = Syllabifier(['foo'], ['bar'], separator=separator)
    clean, index = syllabifier._remove_phone_separators('a;b; c;')
    assert clean == 'ab c'
    assert index == [[1, 1], [1]]

    # no phone separator present: text unchanged, empty index
    separator = Separator(phone=';', syllable='_', word=' ')
    syllabifier = Syllabifier(['foo'], ['bar'], separator=separator)
    clean, index = syllabifier._remove_phone_separators('ab c')
    assert clean == 'ab c'
    assert index == []