Example #1
0
def test_remove():
    """Separator.remove strips every separator token from the input"""
    # single-char separators on all three levels
    sep = Separator(phone='p', syllable='s', word='w')
    assert sep.remove('abc') == 'abc'
    assert sep.remove('wss p') == ' '

    # a multi-char word separator is removed as well
    sep = Separator(phone='_', word=';eword ')
    utterance = (
        'j_uː_;eword n_oʊ_;eword dʒ_ʌ_s_t_;eword '
        's_t_uː_p_ɪ_d_ɪ_ɾ_i_;eword ')
    assert sep.remove(utterance) == 'juːnoʊdʒʌststuːpɪdɪɾi'
Example #2
0
def test_remove_level():
    """The level argument restricts removal to a single separator"""
    sep = Separator(phone='p', syllable='s', word='w')
    token = '..p.s.p.w'

    # default: all levels removed; otherwise only the requested one
    assert sep.remove(token) == '.....'
    assert sep.remove(token, level='phone') == '...s..w'
    assert sep.remove(token, level='syllable') == '..p..p.w'
    assert sep.remove(token, level='word') == '..p.s.p.'

    # removing a level absent from the text is a no-op
    sep = Separator(phone=';', syllable='_', word=' ')
    assert sep.remove('ab c', level='phone') == 'ab c'
Example #3
0
def test_remove_re():
    """Separators are interpreted as regular expressions, not literals"""
    # a plain-text separator matches itself only
    sep = Separator('ab', None, None)
    assert sep.remove('ab') == ''
    assert sep.remove('aa') == 'aa'
    assert sep.remove('[ab]') == '[]'

    # a character class matches each of its characters
    sep = Separator('[ab]', None, None)
    assert sep.remove('ab') == ''
    assert sep.remove('aa') == ''
    assert sep.remove('[ab]') == '[]'

    # escaped regexp patterns are rejected at construction time
    for pattern in (r'\[ab\]', re.escape('[ab]')):
        with pytest.raises(ValueError):
            Separator(pattern, None, None)
Example #4
0
def test_pipeline(algo, encoding, tags, tmpdir):
    """End-to-end test: prepare, segment and evaluate a text with `algo`

    The `tags` utterances are prepared for segmentation, segmented
    with the requested algorithm (default options) and evaluated
    against the gold text. When `encoding` is 'unicode', non-ASCII
    chars are injected in the input (and in the grammar for 'ag').

    """
    # the token separator we use in the whole pipeline
    separator = Separator(phone=' ', syllable=';esyll', word=';eword')

    # add some unicode chars in the input text
    if encoding == 'unicode':
        tags = add_unicode(tags)

    # build the gold version from the tags: same number of utterances,
    # same letters once all the separators are removed
    gold = list(wordseg.prepare.gold(tags, separator=separator))
    assert len(gold) == len(tags)
    for a, b in zip(gold, tags):
        assert separator.remove(a) == separator.remove(b)

    # prepare the text for segmentation
    prepared_text = list(wordseg.prepare.prepare(tags, separator=separator))
    assert len(prepared_text) == len(tags)
    for a, b in zip(prepared_text, tags):
        assert separator.remove(a) == separator.remove(b)

    # segment it with the given algo (use default options)
    if algo in ('dpseg', 'puddle'):
        # only 1 fold for iterative algos: faster
        segmented = list(ALGOS[algo].segment(prepared_text, nfolds=1))
    elif algo == 'ag':
        # add grammar related arguments, if in unicode test adapt the
        # grammar too
        grammar_file = os.path.join(
            os.path.dirname(wordseg.algos.ag.get_grammar_files()[0]),
            'Colloc0_enFestival.lt')
        if encoding == 'unicode':
            # fix: the original leaked both file handles (codecs.open
            # without close), so the written grammar was not guaranteed
            # to be flushed before 'ag' read it back. Use context
            # managers and force-consume add_unicode while the input
            # file is still open (in case it is lazy).
            with codecs.open(grammar_file, 'r', encoding='utf8') as fin:
                grammar_unicode = list(add_unicode(fin))
            grammar_file = os.path.join(str(tmpdir), 'grammar.lt')
            with codecs.open(grammar_file, 'w', encoding='utf8') as fout:
                fout.write('\n'.join(grammar_unicode))
        segmented = list(ALGOS[algo].segment(
            # we just use 10 iterations here to be fast
            prepared_text,
            grammar_file=grammar_file,
            category='Colloc0',
            nruns=1,
            args='-n 10'))
    elif algo == 'dibs':
        # dibs need training, train of test set for the test
        dibs_model = wordseg.algos.dibs.CorpusSummary(tags,
                                                      separator=separator)
        segmented = list(ALGOS[algo].segment(prepared_text, dibs_model))
    else:
        segmented = list(ALGOS[algo].segment(prepared_text))

    # the segmented text must still spell the input once separators
    # are removed
    s = separator.remove
    assert len(segmented) == len(tags)
    for n, (a, b) in enumerate(zip(segmented, tags)):
        assert s(a) == s(b), 'line {}: "{}" != "{}"'.format(n + 1, s(a), s(b))

    # scores come in triples (precision/recall/fscore), all in [0, 1]
    # or None
    results = wordseg.evaluate.evaluate(segmented, gold)
    assert len(results.keys()) % 3 == 0
    for v in results.values():
        if v is not None:
            assert v >= 0
            assert v <= 1
Example #5
0
def test_remove_bad():
    """remove() must raise ValueError on an unknown level name"""
    s = Separator(phone='p', syllable='s', word='w')
    with pytest.raises(ValueError) as err:
        s.remove('', level='bad')

    # fix: the original assert sat inside the `with` block after the
    # raising statement, so it never executed; and `'x' in err` tested
    # the ExceptionInfo object, not the message. Check the message here
    # instead (assumes the error message names the invalid level).
    assert 'bad' in str(err.value)
Example #6
0
class SegmentationSummary(object):
    """Computes a summary of the segmentation errors

    The errors can be oversegmentations, undersegmentations or
    missegmentations. Correct segmentations are also reported.

    """
    def __init__(self):
        # token separation on words only
        self.separator = Separator(phone=None, syllable=None, word=' ')

        # count over/under/mis/good segmentation for each word type
        self.over_segmentation = collections.defaultdict(int)
        self.under_segmentation = collections.defaultdict(int)
        self.mis_segmentation = collections.defaultdict(int)
        self.correct_segmentation = collections.defaultdict(int)

    def to_dict(self):
        """Exports the summary as a dictionary

        Returns
        -------
        summary : dict
            A dictionary with the complete summary in the following
            entries: 'over', 'under', 'mis', 'correct'. In each entry,
            the words are sorted by decreasing frequency, and
            alphabetically (for equivalent frequency).

        """
        # collapse all the dicts in a single one
        summary = {
            'over': self.over_segmentation,
            'under': self.under_segmentation,
            'mis': self.mis_segmentation,
            'correct': self.correct_segmentation
        }

        # sort by most frequent word decreasing order (and then
        # alphabetically increasing order); negating the count gives
        # both orders in a single ascending sort
        return {
            category: {
                word: count
                for word, count in sorted(
                    counts.items(), key=lambda item: (-item[1], item[0]))
            }
            for category, counts in summary.items()
        }

    def summarize(self, text, gold):
        """Computes segmentation errors on a whole text

        Call :meth:`summarize_utterance` on each utterance of gold
        and text.

        Parameters
        ----------
        text : list of str
            The list of utterances for the segmented text (to be
            evaluated)
        gold : list of str
            The list of utterances for the gold text

        Raises
        ------
        ValueError
            If `text` and `gold` do not have the same number of
            utterances. If :meth:`summarize_utterance` raise a
            ValueError.

        """
        if not len(gold) == len(text):
            raise ValueError(
                'text and gold do not have the same number of utterances')

        for t, g in zip(text, gold):
            self.summarize_utterance(t, g)

    def summarize_utterance(self, text, gold):
        """Computes segmentation errors on a single utterance

        This method returns no result but update the intern summary,
        accessible using :meth:`to_dict`.

        Parameters
        ----------
        text : str
            A segmented utterance
        gold : str
            A gold utterance

        Raises
        ------
        ValueError
            If `text` and `gold` are mismatched, i.e. they do not
            contain the same suite of letters (once all the spaces
            removed).

        """
        # check gold and text match (with all spaces removed)
        if self.separator.remove(gold) != self.separator.remove(text):
            raise ValueError('mismatch in gold and text: {} != {}'.format(
                gold, text))

        # get text and gold as lists of words
        gold_words = self.separator.tokenize(gold, level='word')
        text_words = self.separator.tokenize(text, level='word')

        # silly case where gold and text are identical
        if gold_words == text_words:
            for word in gold_words:
                self.correct_segmentation[word] += 1
            return

        # divide gold and text in chunks, packing chunks where gold
        # and text share a common boundary.
        chunks = self._boundary_chunks(text_words, gold_words)

        # classify each chunk as under/over/mis/good segmentation
        for text_chunk, gold_chunk in chunks:
            category = self._classify_chunk(text_chunk, gold_chunk)

            if category == 'correct':
                d = self.correct_segmentation
            elif category == 'under':
                d = self.under_segmentation
            elif category == 'over':
                d = self.over_segmentation
            else:
                d = self.mis_segmentation

            # register the chunk's words into the summary for the
            # relevant category
            for word in gold_chunk:
                d[word] += 1

    @classmethod
    def _boundary_chunks(cls, text, gold):
        """Returns the list of chunks in a pair of text/gold utterance"""
        return cls._boundary_chunks_aux(text, gold, [])

    @classmethod
    def _boundary_chunks_aux(cls, text, gold, chunks):
        """Accumulates the chunks of `text`/`gold` into `chunks`

        Returns a new list (`chunks` is not mutated). Implemented as a
        loop rather than the previous one-recursion-per-chunk version,
        which could exhaust the recursion limit on long utterances.

        """
        chunks = list(chunks)
        while len(text) or len(gold):
            # impossible to have one empty but not the other. Should be
            # the case by construction, this assert is not required.
            assert len(gold) and len(text)

            # compute the next chunk and consume its words
            chunk = cls._compute_chunk(text, gold)
            chunks.append(chunk)
            text = text[len(chunk[0]):]
            gold = gold[len(chunk[1]):]

        return chunks

    @staticmethod
    def _compute_chunk(text, gold):
        """Find the first chunk in a pair of text/gold utterances

        A chunk is a pair of lists of words sharing a common boundary
        (begin and end of a sequence of words).

        Example
        -------
        >>> gold = 'baby going home'.split()

        >>> text = 'ba by going home'.split()
        >> _compute_chunk(text, gold)
        (['ba', 'by'], ['baby'])

        >>> text = 'babygoinghome'.split()
        >> _compute_chunk(text, gold)
        (['babygoinghome'], ['baby', 'going', 'home'])

        """
        # non empty texts and same letters. This should be the case by
        # construction, those asserts are not required.
        assert len(gold) and len(text)
        assert ''.join(gold) == ''.join(text)

        # easy case, first word is the same
        if gold[0] == text[0]:
            return ([text[0]], [gold[0]])

        # grow the shorter concatenation word by word until both sides
        # cover the same number of letters, i.e. share a boundary
        text_concat, text_index = text[0], 0
        gold_concat, gold_index = gold[0], 0
        while len(gold_concat) != len(text_concat):
            if len(gold_concat) < len(text_concat):
                gold_index += 1
                gold_concat = gold_concat + gold[gold_index]
            else:
                text_index += 1
                text_concat = text_concat + text[text_index]
        return (text[:text_index + 1], gold[:gold_index + 1])

    def _classify_chunk(self, text, gold):
        """A chunk is either over/under/mis/correct"""
        if len(gold) == len(text):
            if len(gold) == 1:
                return 'correct'
            return 'mis'
        elif len(gold) < len(text):
            if len(gold) == 1:
                return 'over'
            return 'mis'
        else:  # len(gold) > len(text)
            if len(text) == 1:
                return 'under'
            return 'mis'