Example #1
from wordseg.separator import Separator


def test_tokenize_full_syll():
    t = 'j_uː_ n_oʊ_ dʒ_ʌ_s_;t_ '

    s = Separator(phone='_', syllable=None, word=' ')
    assert list(s.tokenize(t)) \
        == [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', ';t']]

    s = Separator(phone='_', syllable=';', word=' ')
    assert list(s.tokenize(t)) \
        == [[['j', 'uː']], [['n', 'oʊ']], [['dʒ', 'ʌ', 's'], ['t']]]
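Note the nesting: with no syllable separator, tokenize() returns phones grouped by word; once a syllable separator is defined, a third nesting level appears (utterance, then words, then syllables, then phones), as the two asserts above show.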
Example #2
from wordseg.separator import Separator


def test_tokenize_noboundaries():
    s = Separator(phone=None, syllable=' ', word=';eword')
    t = 'j uː ;eword n oʊ ;eword dʒ ʌ s t ;eword'
    assert list(s.tokenize(t, 'word', keep_boundaries=False)) \
        == ['juː', 'noʊ', 'dʒʌst']
    assert list(s.tokenize(t, 'syllable', keep_boundaries=False)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']

    s = Separator(phone=' ', word='_')
    t = 'j uː _ n oʊ _ dʒ ʌ s t _'
    assert list(s.tokenize(t, 'word', keep_boundaries=False)) \
        == ['juː', 'noʊ', 'dʒʌst']
    assert list(s.tokenize(t, 'phone', keep_boundaries=False)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']

    s = Separator(phone='_', word=' ')
    t = 'j_uː_ n_oʊ_ dʒ_ʌ_s_t_ '
    assert list(s.tokenize(t, 'word', keep_boundaries=False)) \
        == ['juː', 'noʊ', 'dʒʌst']
    assert list(s.tokenize(t, 'phone', keep_boundaries=False)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']

    s = Separator(phone='_', syllable=';', word=' ')
    t = 'j_uː_ n_oʊ_ dʒ_ʌ_s_;t_ '
    assert list(s.tokenize(t, 'word', keep_boundaries=False)) \
        == ['juː', 'noʊ', 'dʒʌst']
    assert list(s.tokenize(t, 'syllable', keep_boundaries=False)) \
        == ['juː', 'noʊ', 'dʒʌs', 't']
    assert list(s.tokenize(t, 'phone', keep_boundaries=False)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
Example #3
from wordseg.separator import Separator


def test_tokenize_full_nosyll():
    t = 'j_uː_ n_oʊ_ dʒ_ʌ_s_t_ '

    s = Separator(phone='_', syllable=None, word=' ')
    assert list(s.tokenize(t)) \
        == [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', 't']]

    s = Separator(phone='_', syllable=';', word=' ')
    assert list(s.tokenize(t)) \
        == [[['j', 'uː']], [['n', 'oʊ']], [['dʒ', 'ʌ', 's', 't']]]

    # tokenize phones only
    t = t.replace(' ', '')
    s = Separator(phone='_', syllable=None, word=None)
    assert list(s.tokenize(t)) == \
        ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
Example #4
from wordseg.separator import Separator


def test_tokenize_none():
    s = Separator(phone=None, syllable=None, word=' ')
    text = 'te9abesitosgr uNone'
    assert list(s.tokenize(text, level='word')) == ['te9abesitosgr', 'uNone']

    assert s.strip('uNone') == 'uNone'
    assert s.strip('None') == 'None'
    assert s.strip('Noneu') == 'Noneu'
Example #5
from wordseg.separator import Separator


# `text`, `expected` and `keep_boundaries` are pytest parameters
def test_split_vs_tokenize(text, expected, keep_boundaries):
    s = Separator(phone='p', syllable='s', word='w')

    assert list(s.split(text, 'word', keep_boundaries=keep_boundaries)) \
        == expected

    assert list(s.tokenize(text, 'word', keep_boundaries=keep_boundaries)) \
        == [e for e in expected if len(e)]
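The parametrized test above pins down the only difference between the two methods: split() keeps the empty tokens produced by trailing or doubled separators, while tokenize() drops them. A minimal sketch of that contract (the input string and the expected values in the comments are hypothetical, assuming str.split-like behavior):

from wordseg.separator import Separator

s = Separator(phone='p', syllable='s', word='w')

# a trailing word separator: split() yields a final empty token
t = 'naturewtalkw'
parts = list(s.split(t, 'word', keep_boundaries=False))
# expected: ['nature', 'talk', '']
tokens = list(s.tokenize(t, 'word', keep_boundaries=False))
# expected: ['nature', 'talk']

# the invariant checked by the test above
assert tokens == [e for e in parts if len(e)]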
Example #6
from wordseg.separator import Separator

# `summary` is assumed to come from the package's evaluation module: it
# reports the categories computed by the SegmentationSummary class below.
# `gold` is a pytest fixture providing a list of gold utterances.


def test_summary_perfect(gold):
    d = summary(gold, gold)
    sep = Separator(phone=None, syllable=None, word=' ')
    nwords = sum(len(sep.tokenize(utt, level='word')) for utt in gold)

    # everything is classified as correct
    for category in ('under', 'over', 'mis'):
        assert not d[category]

    # the 'correct' category holds the expected number of words
    assert sum(d['correct'].values()) == nwords
Example #7
import collections

from wordseg.separator import Separator


class SegmentationSummary(object):
    """Computes a summary of the segmentation errors

    The errors can be oversegmentations, undersegmentations or
    missegmentations. Correct segmentations are also reported.
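
    For example, against the gold utterance 'the dog', the segmentation
    'thedog' is an under-segmentation, 'th e dog' an over-segmentation
    of 'the', and 'th edog' a mis-segmentation.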

    """
    def __init__(self):
        # token separation on words only
        self.separator = Separator(phone=None, syllable=None, word=' ')

        # count over/under/mis/correct segmentations for each word type
        self.over_segmentation = collections.defaultdict(int)
        self.under_segmentation = collections.defaultdict(int)
        self.mis_segmentation = collections.defaultdict(int)
        self.correct_segmentation = collections.defaultdict(int)

    def to_dict(self):
        """Exports the summary as a dictionary

        Returns
        -------
        summary : dict
            A dictionary with the complete summary in the following
            entries: 'over', 'under', 'mis', 'correct'. In each entry,
            the words are sorted by decreasing frequency, then
            alphabetically (for equal frequencies).

        """
        # collapse all the dicts into a single one
        summary = {
            'over': self.over_segmentation,
            'under': self.under_segmentation,
            'mis': self.mis_segmentation,
            'correct': self.correct_segmentation
        }

        # sort each category by decreasing word frequency, then by
        # increasing alphabetical order
        summary = {
            k: dict(sorted(v.items(), key=lambda x: (-x[1], x[0])))
            for k, v in summary.items()
        }

        return summary

    def summarize(self, text, gold):
        """Computes segmentation errors on a whole text

        Calls :meth:`summarize_utterance` on each pair of utterances
        from `text` and `gold`.

        Parameters
        ----------
        text : list of str
            The list of utterances for the segmented text (to be
            evaluated)
        gold : list of str
            The list of utterances for the gold text

        Raises
        ------
        ValueError
            If `text` and `gold` do not have the same number of
            utterances, or if :meth:`summarize_utterance` raises a
            ValueError.

        """
        if len(gold) != len(text):
            raise ValueError(
                'text and gold do not have the same number of utterances')

        for t, g in zip(text, gold):
            self.summarize_utterance(t, g)

    def summarize_utterance(self, text, gold):
        """Computes segmentation errors on a single utterance

        This method returns no result but updates the internal summary,
        accessible using :meth:`to_dict`.

        Parameters
        ----------
        text : str
            A segmented utterance
        gold : str
            A gold utterance

        Raises
        ------
        ValueError
            If `text` and `gold` are mismatched, i.e. they do not
            contain the same sequence of letters (once all spaces are
            removed).

        """
        # check gold and text match (with all spaces removed)
        if self.separator.remove(gold) != self.separator.remove(text):
            raise ValueError('mismatch in gold and text: {} != {}'.format(
                gold, text))

        # get text and gold as lists of words
        gold_words = self.separator.tokenize(gold, level='word')
        text_words = self.separator.tokenize(text, level='word')

        # trivial case where gold and text are identical
        if gold_words == text_words:
            for word in gold_words:
                self.correct_segmentation[word] += 1
            return

        # divide gold and text into chunks, each chunk starting and
        # ending at a boundary shared by gold and text
        chunks = self._boundary_chunks(text_words, gold_words)

        # classify each chunk as under/over/mis/good segmentation
        for text_chunk, gold_chunk in chunks:
            category = self._classify_chunk(text_chunk, gold_chunk)

            if category == 'correct':
                d = self.correct_segmentation
            elif category == 'under':
                d = self.under_segmentation
            elif category == 'over':
                d = self.over_segmentation
            else:
                d = self.mis_segmentation

            # register the chunk's words into the summary for the
            # relevant category
            for word in gold_chunk:
                d[word] += 1

    @classmethod
    def _boundary_chunks(cls, text, gold):
        """Returns the list of chunks in a pair of text/gold utterance"""
        return cls._boundary_chunks_aux(text, gold, [])

    @classmethod
    def _boundary_chunks_aux(cls, text, gold, chunks):
        lg = len(gold)
        lt = len(text)

        # end of recursion
        if not lg and not lt:
            return chunks

        # one side cannot be empty if the other is not. This holds by
        # construction, so the assert is only a safety check.
        assert lg and lt

        # compute the next chunk
        chunk = cls._compute_chunk(text, gold)

        # recursion
        return cls._boundary_chunks_aux(text[len(chunk[0]):],
                                        gold[len(chunk[1]):], chunks + [chunk])

    @staticmethod
    def _compute_chunk(text, gold):
        """Find the first chunk in a pair of text/gold utterances

        A chunk is a pair of lists of words sharing a common boundary
        (begin and end of a sequence of words).

        Example
        -------
        >>> gold = 'baby going home'.split()

        >>> text = 'ba by going home'.split()
        >>> SegmentationSummary._compute_chunk(text, gold)
        (['ba', 'by'], ['baby'])

        >>> text = 'babygoinghome'.split()
        >>> SegmentationSummary._compute_chunk(text, gold)
        (['babygoinghome'], ['baby', 'going', 'home'])

        """
        # non-empty inputs with the same letters. This holds by
        # construction, so the asserts are only safety checks.
        assert len(gold) and len(text)
        assert ''.join(gold) == ''.join(text)

        # easy case, first word is the same
        if gold[0] == text[0]:
            return ([text[0]], [gold[0]])

        # grow the shorter of the two concatenations word by word until
        # they reach the same length, i.e. until they meet at a shared
        # boundary
        text_concat, text_index = text[0], 0
        gold_concat, gold_index = gold[0], 0
        while len(gold_concat) != len(text_concat):
            if len(gold_concat) < len(text_concat):
                gold_index += 1
                gold_concat += gold[gold_index]
            else:
                text_index += 1
                text_concat += text[text_index]
        return (text[:text_index + 1], gold[:gold_index + 1])

    def _classify_chunk(self, text, gold):
        """A chunk is either over/under/mis/correct"""
        if len(gold) == len(text):
            # same number of words: a single shared word is correct,
            # several realigned words are a mis-segmentation
            if len(gold) == 1:
                return 'correct'
            return 'mis'
        elif len(gold) < len(text):
            # a single gold word split into several text words is an
            # over-segmentation
            if len(gold) == 1:
                return 'over'
            return 'mis'
        else:  # len(gold) > len(text)
            # several gold words merged into a single text word is an
            # under-segmentation
            if len(text) == 1:
                return 'under'
            return 'mis'
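
A minimal usage sketch of the class above, reusing the hypothetical utterances from the _compute_chunk docstring:

summary = SegmentationSummary()
summary.summarize_utterance('ba by going home', 'baby going home')

d = summary.to_dict()
# 'baby' was split in two, an over-segmentation; the other words are correct
assert d['over'] == {'baby': 1}
assert d['correct'] == {'going': 1, 'home': 1}
assert not d['under'] and not d['mis']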