import codecs
import collections
import os
import re

import pytest

import wordseg.algos.ag
import wordseg.algos.dibs
import wordseg.evaluate
import wordseg.prepare
from wordseg.separator import Separator

# Note: the ALGOS mapping (algorithm name -> wordseg.algos module) and the
# add_unicode helper used by test_pipeline are assumed to be defined
# elsewhere in the test module, along with the algo/encoding/tags fixtures.


def test_remove():
    s = Separator(phone='p', syllable='s', word='w')
    assert s.remove('abc') == 'abc'
    assert s.remove('wss p') == ' '

    s = Separator(phone='_', word=';eword ')
    t = ('j_uː_;eword n_oʊ_;eword dʒ_ʌ_s_t_;eword '
         's_t_uː_p_ɪ_d_ɪ_ɾ_i_;eword ')
    assert s.remove(t) == 'juːnoʊdʒʌststuːpɪdɪɾi'


def test_remove_level():
    s = Separator(phone='p', syllable='s', word='w')
    assert s.remove('..p.s.p.w') == '.....'
    assert s.remove('..p.s.p.w', level='phone') == '...s..w'
    assert s.remove('..p.s.p.w', level='syllable') == '..p..p.w'
    assert s.remove('..p.s.p.w', level='word') == '..p.s.p.'

    s = Separator(phone=';', syllable='_', word=' ')
    assert s.remove('ab c', level='phone') == 'ab c'
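

# Illustration (not part of the original test suite): level-wise removal
# with the phone/syllable/word separator convention used by test_pipeline
# below. Outputs are printed rather than asserted; this is a sketch only
# and the utterance is hypothetical.
def example_remove_levels():
    s = Separator(phone=' ', syllable=';esyll', word=';eword')
    utt = 'h ə ;esyll l əʊ ;esyll ;eword w ɜː l d ;esyll ;eword'
    print(s.remove(utt, level='word'))      # only the ;eword marks removed
    print(s.remove(utt, level='syllable'))  # only the ;esyll marks removed
    print(s.remove(utt))                    # all separators removed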


def test_remove_re():
    # separators are interpreted as regular expressions: 'ab' matches the
    # literal substring while '[ab]' is a character class matching 'a' or 'b'
    s = Separator('ab', None, None)
    assert s.remove('ab') == ''
    assert s.remove('aa') == 'aa'
    assert s.remove('[ab]') == '[]'

    s = Separator('[ab]', None, None)
    assert s.remove('ab') == ''
    assert s.remove('aa') == ''
    assert s.remove('[ab]') == '[]'

    # escaped expressions are rejected at construction time
    with pytest.raises(ValueError):
        Separator(r'\[ab\]', None, None)
    with pytest.raises(ValueError):
        Separator(re.escape('[ab]'), None, None)


def test_pipeline(algo, encoding, tags, tmpdir):
    # the token separator used across the whole pipeline
    separator = Separator(phone=' ', syllable=';esyll', word=';eword')

    # add some unicode chars to the input text
    if encoding == 'unicode':
        tags = add_unicode(tags)

    # build the gold version from the tags
    gold = list(wordseg.prepare.gold(tags, separator=separator))
    assert len(gold) == len(tags)
    for a, b in zip(gold, tags):
        assert separator.remove(a) == separator.remove(b)

    # prepare the text for segmentation
    prepared_text = list(wordseg.prepare.prepare(tags, separator=separator))
    assert len(prepared_text) == len(tags)
    for a, b in zip(prepared_text, tags):
        assert separator.remove(a) == separator.remove(b)

    # segment it with the given algo (using default options)
    if algo in ('dpseg', 'puddle'):
        # only 1 fold for iterative algos: faster
        segmented = list(ALGOS[algo].segment(prepared_text, nfolds=1))
    elif algo == 'ag':
        # add grammar related arguments and, for the unicode test, adapt
        # the grammar as well
        grammar_file = os.path.join(
            os.path.dirname(wordseg.algos.ag.get_grammar_files()[0]),
            'Colloc0_enFestival.lt')
        if encoding == 'unicode':
            with codecs.open(grammar_file, 'r', encoding='utf8') as fin:
                grammar_unicode = list(add_unicode(fin))
            grammar_file = os.path.join(str(tmpdir), 'grammar.lt')
            with codecs.open(grammar_file, 'w', encoding='utf8') as fout:
                fout.write('\n'.join(grammar_unicode))

        segmented = list(ALGOS[algo].segment(
            # only 10 iterations here, to be fast
            prepared_text, grammar_file=grammar_file,
            category='Colloc0', nruns=1, args='-n 10'))
    elif algo == 'dibs':
        # dibs needs training; for the test we train on the test set itself
        dibs_model = wordseg.algos.dibs.CorpusSummary(
            tags, separator=separator)
        segmented = list(ALGOS[algo].segment(prepared_text, dibs_model))
    else:
        segmented = list(ALGOS[algo].segment(prepared_text))

    # once the separators are removed, the segmented text must match the
    # original tags line by line
    s = separator.remove
    assert len(segmented) == len(tags)
    for n, (a, b) in enumerate(zip(segmented, tags)):
        assert s(a) == s(b), 'line {}: "{}" != "{}"'.format(
            n + 1, s(a), s(b))

    # the scores come in (precision, recall, fscore) triples and, when
    # defined, must all lie in [0, 1]
    results = wordseg.evaluate.evaluate(segmented, gold)
    assert len(results.keys()) % 3 == 0
    for v in results.values():
        if v is not None:
            assert 0 <= v <= 1
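

# A minimal end-to-end sketch of the pipeline exercised above, outside of
# pytest. It assumes the transitional probabilities algorithm
# (wordseg.algos.tp) is installed, as in the ALGOS mapping used by the
# tests; the input utterance is hypothetical.
def example_pipeline():
    import wordseg.algos.tp

    separator = Separator(phone=' ', syllable=';esyll', word=';eword')
    tags = ['hh ax l ow ;esyll ;eword w er l d ;esyll ;eword']

    # gold text with word boundaries, prepared text without them
    gold = list(wordseg.prepare.gold(tags, separator=separator))
    prepared = list(wordseg.prepare.prepare(tags, separator=separator))

    # segment with default options and display the evaluation scores
    segmented = list(wordseg.algos.tp.segment(prepared))
    for name, score in wordseg.evaluate.evaluate(segmented, gold).items():
        print(name, score)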


def test_remove_bad():
    s = Separator(phone='p', syllable='s', word='w')
    with pytest.raises(ValueError) as err:
        s.remove('', level='bad')
    # the error message must mention the invalid level
    assert 'bad' in str(err.value)


class SegmentationSummary(object):
    """Computes a summary of the segmentation errors

    The errors can be over-segmentations, under-segmentations or
    mis-segmentations. Correct segmentations are also reported.

    """
    def __init__(self):
        # tokenize on words only
        self.separator = Separator(phone=None, syllable=None, word=' ')

        # count over/under/mis/correct segmentations for each word type
        self.over_segmentation = collections.defaultdict(int)
        self.under_segmentation = collections.defaultdict(int)
        self.mis_segmentation = collections.defaultdict(int)
        self.correct_segmentation = collections.defaultdict(int)

    def to_dict(self):
        """Exports the summary as a dictionary

        Returns
        -------
        summary : dict
            A dictionary with the complete summary in the following
            entries: 'over', 'under', 'mis' and 'correct'. In each entry
            the words are sorted by decreasing frequency, then
            alphabetically (for equal frequencies).

        """
        # collapse all the dicts into a single one
        summary = {
            'over': self.over_segmentation,
            'under': self.under_segmentation,
            'mis': self.mis_segmentation,
            'correct': self.correct_segmentation}

        # sort the words by decreasing frequency, then by increasing
        # alphabetical order
        summary = {
            k: {w[0]: w[1] for w in sorted(
                v.items(), key=lambda x: (-x[1], x[0]))}
            for k, v in summary.items()}

        return summary

    def summarize(self, text, gold):
        """Computes segmentation errors on a whole text

        Calls :meth:`summarize_utterance` on each pair of utterances
        from `text` and `gold`.

        Parameters
        ----------
        text : list of str
            The utterances of the segmented text (to be evaluated)
        gold : list of str
            The utterances of the gold text

        Raises
        ------
        ValueError
            If `text` and `gold` do not have the same number of
            utterances, or if :meth:`summarize_utterance` raises a
            ValueError.

        """
        if len(gold) != len(text):
            raise ValueError(
                'text and gold do not have the same number of utterances')

        for t, g in zip(text, gold):
            self.summarize_utterance(t, g)

    def summarize_utterance(self, text, gold):
        """Computes segmentation errors on a single utterance

        This method returns no result but updates the internal summary,
        accessible with :meth:`to_dict`.

        Parameters
        ----------
        text : str
            A segmented utterance
        gold : str
            A gold utterance

        Raises
        ------
        ValueError
            If `text` and `gold` are mismatched, i.e. if they do not
            contain the same sequence of letters once all spaces are
            removed.

        """
        # check that gold and text match (with all spaces removed)
        if self.separator.remove(gold) != self.separator.remove(text):
            raise ValueError(
                'mismatch in gold and text: {} != {}'.format(gold, text))

        # get text and gold as lists of words
        gold_words = self.separator.tokenize(gold, level='word')
        text_words = self.separator.tokenize(text, level='word')

        # trivial case where gold and text are identical
        if gold_words == text_words:
            for word in gold_words:
                self.correct_segmentation[word] += 1
            return

        # divide gold and text into chunks, packing together the words
        # between two boundaries shared by gold and text
        chunks = self._boundary_chunks(text_words, gold_words)

        # classify each chunk as an under/over/mis/correct segmentation
        for text_chunk, gold_chunk in chunks:
            category = self._classify_chunk(text_chunk, gold_chunk)
            if category == 'correct':
                d = self.correct_segmentation
            elif category == 'under':
                d = self.under_segmentation
            elif category == 'over':
                d = self.over_segmentation
            else:
                d = self.mis_segmentation

            # register the chunk's words in the summary, under the
            # relevant category
            for word in gold_chunk:
                d[word] += 1

    @classmethod
    def _boundary_chunks(cls, text, gold):
        """Returns the list of chunks in a pair of text/gold utterances"""
        return cls._boundary_chunks_aux(text, gold, [])

    @classmethod
    def _boundary_chunks_aux(cls, text, gold, chunks):
        lg = len(gold)
        lt = len(text)

        # end of the recursion
        if not lg and not lt:
            return chunks

        # it is impossible to have one list empty but not the other; this
        # holds by construction, so the assert is not strictly required
        assert lg and lt

        # compute the next chunk
        chunk = cls._compute_chunk(text, gold)

        # recurse on the remaining words
        return cls._boundary_chunks_aux(
            text[len(chunk[0]):], gold[len(chunk[1]):], chunks + [chunk])

    @staticmethod
    def _compute_chunk(text, gold):
        """Finds the first chunk in a pair of text/gold utterances

        A chunk is a pair of word sequences sharing a common boundary
        (the begin and the end of the sequences).

        Examples
        --------
        >>> gold = 'baby going home'.split()
        >>> text = 'ba by going home'.split()
        >>> SegmentationSummary._compute_chunk(text, gold)
        (['ba', 'by'], ['baby'])
        >>> text = 'babygoinghome'.split()
        >>> SegmentationSummary._compute_chunk(text, gold)
        (['babygoinghome'], ['baby', 'going', 'home'])

        """
        # non empty texts made of the same letters; this holds by
        # construction, so those asserts are not strictly required
        assert len(gold) and len(text)
        assert ''.join(gold) == ''.join(text)

        # easy case, the first words are identical
        if gold[0] == text[0]:
            return ([text[0]], [gold[0]])

        # otherwise concatenate words from the shorter side until both
        # concatenations reach the same length, i.e. the next boundary
        # common to text and gold
        text_concat, text_index = text[0], 0
        gold_concat, gold_index = gold[0], 0
        while len(gold_concat) != len(text_concat):
            if len(gold_concat) < len(text_concat):
                gold_index += 1
                gold_concat = gold_concat + gold[gold_index]
            else:
                text_index += 1
                text_concat = text_concat + text[text_index]

        return (text[:text_index + 1], gold[:gold_index + 1])

    def _classify_chunk(self, text, gold):
        """A chunk is either an over/under/mis/correct segmentation"""
        if len(gold) == len(text):
            # a single identical word is correct, several words with
            # shifted boundaries is a missegmentation
            if len(gold) == 1:
                return 'correct'
            return 'mis'
        elif len(gold) < len(text):
            # a single gold word split into several text words
            if len(gold) == 1:
                return 'over'
            return 'mis'
        else:  # len(gold) > len(text)
            # several gold words merged into a single text word
            if len(text) == 1:
                return 'under'
            return 'mis'
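

# A minimal usage sketch of SegmentationSummary on hypothetical data: the
# first utterance over-segments 'baby', the second under-segments
# 'the dog'; all other words are segmented correctly.
def example_summary():
    summary = SegmentationSummary()
    summary.summarize(
        ['ba by going home', 'thedog barks'],  # segmented text
        ['baby going home', 'the dog barks'])  # gold text
    print(summary.to_dict())
    # expected (on Python 3.7+, where dicts keep insertion order):
    # {'over': {'baby': 1}, 'under': {'dog': 1, 'the': 1},
    #  'mis': {}, 'correct': {'barks': 1, 'going': 1, 'home': 1}}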