Example #1
    def replace(self, utt):
        """Examine tokens and performs some replacements.

        A dictionary with symbols contains the replacements to operate.

        :param utt: (list) the utterance
        :returns: A list of strings

        """
        # Specific case of float numbers
        sent = ' '.join(utt)
        sent = re.sub(u(r'([0-9])\.([0-9])'), u(r'\1 NUMBER_SEP_POINT \2'),
                      sent)
        sent = re.sub(u(r'([0-9]),([0-9])'), u(r'\1 NUMBER_SEP \2'), sent)
        sent = sppasUnicode(sent).to_strip()
        _utt = sent.split()

        # Other generic replacements
        _result = list()
        for s in _utt:
            if self.repl.is_key(s):
                s = self.repl.replace(s)
            _result.append(sppasUnicode(s).to_strip())

        return _result
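A quick standalone check of the float-number handling above (plain re only, no SPPAS imports needed): the two substitutions keep the digits but turn the decimal separator into a token of its own.

    import re

    sent = "pi is 3.14 or 3,14"
    sent = re.sub(r'([0-9])\.([0-9])', r'\1 NUMBER_SEP_POINT \2', sent)
    sent = re.sub(r'([0-9]),([0-9])', r'\1 NUMBER_SEP \2', sent)
    print(sent.split())
    # ['pi', 'is', '3', 'NUMBER_SEP_POINT', '14', 'or', '3', 'NUMBER_SEP', '14']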
Example #2
    def format_token(entry):
        """ Remove the CR/LF, tabs, multiple spaces and others... and lowerise.

        :param entry: (str) a token
        :returns: formatted token

        """
        t = sppasUnicode(entry).to_strip()
        return sppasUnicode(t).to_lower()
Example #3
    def get_phon(self, entry):
        """Return the phonetization of an unknown entry.

        :param entry: (str) the string to phonetize
        :returns: a string with the proposed phonetization
        :raises: Exception if the word can NOT be phonetized

        """
        _str = sppasUnicode(entry).to_strip()
        _str = sppasUnicode(_str).to_lower()
        if len(_str) > 0 and _str[-1].isalnum() is False:
            _str = _str[:-1]
        if len(_str) > 0 and _str[0].isalnum() is False:
            _str = _str[1:]
        if len(_str) == 0:
            return ""

        if len(entry) > LIMIT_SIZE:
            raise Exception

        # Find all pronunciations of segments with a longest matching algo.
        _tabstr = re.split("[-'_\s]", _str)
        pronlr = ""
        pronrl = ""

        for s in _tabstr:
            plr = self.__recurslr(s)
            plr = plr.strip()
            if len(plr) > 0:
                pronlr = pronlr + " " + plr

            prl = self.__recursrl(s)
            prl = prl.strip()
            if len(prl) > 0:
                pronrl = pronrl + " " + prl

        pronlr = pronlr.strip()
        pronrl = pronrl.strip()

        # Create the output
        pron = ""
        if len(pronlr) > 0:
            if len(pronrl) > 0:
                pron = self.dagphon.decompose(pronlr, pronrl)
            else:
                pron = self.dagphon.decompose(pronlr)
        else:
            if len(pronrl) > 0:
                pron = self.dagphon.decompose(pronrl)

        if len(pron) > 0:
            return pron

        raise Exception
Example #4
    def append_activity(self, token, activity):
        """ Append a new activity.

        :param token: (str) The token of the tier TokensAlign
        :param activity: (str) Name of the activity

        """
        sp = sppasUnicode(token)
        token = sp.to_strip()

        sp = sppasUnicode(activity)
        activity = sp.to_strip()
        
        if token not in self._activities:
            self._activities[token] = activity
Example #5
    def __stick_longest_lr(self, phrase, separator):
        """ Return the longest first word of a phrase.
        A longest matching algorithm is applied from left to right.

        :param phrase: (str)
        :returns: tuple of (index of the first longest token, the longest token)

        """
        tab_toks = phrase.split(" ")
        token = tab_toks[0]
        i = len(tab_toks)

        if self.__vocab is None:
            return 1, token

        while i > 0:
            # try to aggregate all tokens
            token = separator.join(tab_toks)

            # next round will try without the last token
            tab_toks.pop()
            i -= 1

            # find if this is a word in the vocabulary
            if self.__vocab.is_unk(token) is False:
                break

        # the first real token is the first given token
        return i, sppasUnicode(token).to_strip()
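A self-contained sketch of this longest-matching loop, with a plain set() standing in for the vocabulary (the real method queries self.__vocab.is_unk() and SPPAS's unicode helpers instead):

    def stick_longest(tokens, vocab, separator=""):
        """Return (nb of tokens merged - 1, longest word found)."""
        for i in range(len(tokens), 0, -1):
            word = separator.join(tokens[:i])
            if word in vocab:
                return i - 1, word
        # nothing matched: fall back to the first given token
        return 0, tokens[0]

    vocab = {"ice", "cream", "icecream"}
    print(stick_longest(["ice", "cream", "cone"], vocab))  # (1, 'icecream')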
Example #6
    def add_pron(self, token, pron):
        """Add a token/pron to the dict.

        :param token: (str) Unicode string of the token to add
        :param pron: (str) A pronunciation in which the phonemes are separated
        by whitespace

        """
        entry = sppasDictPron.format_token(token)

        new_pron = sppasUnicode(pron).to_strip()
        new_pron = new_pron.replace(" ", separators.phonemes)

        # Already a pronunciation for this token?
        cur_pron = ""
        if entry in self._dict:
            # ... don't append an already known pronunciation
            if self.is_pron_of(entry, new_pron) is False:
                cur_pron = self.get_pron(entry) + separators.variants
            else:
                cur_pron = self.get_pron(entry)
                new_pron = ""

        # Get the current pronunciation and append the new one
        new_pron = cur_pron + new_pron

        # Add (or change) the entry in the dict
        self._dict[entry] = new_pron
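The variant bookkeeping above, reduced to a minimal standalone sketch; "-" and "|" are assumed stand-ins for separators.phonemes and separators.variants:

    PHONEMES_SEP = "-"
    VARIANTS_SEP = "|"
    pron_dict = {}

    def add_pron(token, pron):
        new_pron = PHONEMES_SEP.join(pron.split())
        cur = pron_dict.get(token, "")
        if new_pron in cur.split(VARIANTS_SEP):
            return  # this pronunciation is already known
        pron_dict[token] = cur + VARIANTS_SEP + new_pron if cur else new_pron

    add_pron("data", "d a t a")
    add_pron("data", "d ei t a")
    add_pron("data", "d a t a")  # ignored: already a variant
    print(pron_dict["data"])     # d-a-t-a|d-ei-t-a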
Example #7
    def add_pron(self, token, pron):
        """ Add a token/pron to the dict.

        :param token: (str) Unicode string of the token to add
        :param pron: (str) A pronunciation in which the phonemes are separated by whitespace

        """
        entry = sppasDictPron.format_token(token)

        new_pron = sppasUnicode(pron).to_strip()
        new_pron = new_pron.replace(" ", sppasDictPron.PHONEMES_SEPARATOR)

        # Already a pronunciation for this token?
        cur_pron = ""
        if entry in self._dict:
            # ... don't append an already known pronunciation
            if self.is_pron_of(entry, new_pron) is False:
                cur_pron = self.get_pron(entry) + sppasDictPron.VARIANTS_SEPARATOR
            else:
                cur_pron = self.get_pron(entry)
                new_pron = ""

        # Get the current pronunciation and append the new one
        new_pron = cur_pron + new_pron

        # Add (or change) the entry in the dict
        self._dict[entry] = new_pron
Example #8
    def _map_variant(self, phonvariant):
        """ Map phonemes of only one variant of a phonetized entry.

        :param phonvariant: (str) One phonetization variant of an entry.

        """
        phones = self._map_split_variant(phonvariant)
        subs = []
        # Single phonemes
        for p in phones:
            mapped = self._map_table.map_entry(p)
            if len(mapped) > 0:
                subs.append(p + VARIANTS_SEPARATOR + mapped)
            else:
                subs.append(p)

        self._dag_phon.variants = 0
        phon = sppasUnicode(self._dag_phon.decompose(
            " ".join(subs))).to_strip()

        # Remove un-pronounced phonemes!!!
        # By convention, they are represented by an underscore in the
        # mapping table.
        tmp = []
        for p in phon.split(VARIANTS_SEPARATOR):
            r = [x for x in p.split(PHONEMES_SEPARATOR) if x != "_"]
            tmp.append(PHONEMES_SEPARATOR.join(r))

        return VARIANTS_SEPARATOR.join(set(tmp))
Example #9
    def __init__(self, entry):
        """ Creates a Token instance. """

        if entry is None:
            self.__entry = ""
        else:
            self.__entry = sppasUnicode(entry)
Example #10
    def get_boundary(self, phonemes):
        """Get the index of the syllable boundary (EXCRULES or GENRULES).

        Phonemes are separated with the symbol defined by the
        separators.phonemes variable.

        :param phonemes: (str) Sequence of phonemes to syllabify
        :returns: (int) boundary index or -1 if phonemes don't match any rule.

        """
        sp = sppasUnicode(phonemes)
        phonemes = sp.to_strip()
        phon_list = phonemes.split(separators.phonemes)
        classes = ""
        for phon in phon_list:
            classes += self.get_class(phon)

        # search into exception
        if classes in self.exception:
            return self.exception[classes]

        # search into general
        for key, val in self.general.items():
            if len(key) == len(phon_list):
                return val

        return -1
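A minimal illustration of the two-level rule lookup in get_boundary(), with invented class strings and boundary values; note that general rules are matched by sequence length, exactly as in the loop above:

    exception = {"VCCV": 2}  # exact class sequences
    general = {"VCV": 1}     # fallback rules

    def boundary(classes):
        if classes in exception:
            return exception[classes]
        for key, val in general.items():
            if len(key) == len(classes):
                return val
        return -1

    print(boundary("VCCV"))  # 2  (exception rule)
    print(boundary("VCV"))   # 1  (general rule)
    print(boundary("CC"))    # -1 (no rule matches)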
Example #11
    def load(self, filename):
        """Load the rules from a file.

        :param filename: (str) Name of the file with the rules.

        """
        self.reset()

        with open(filename, "r") as f:
            lines = f.readlines()
            f.close()

        for line_nb, line in enumerate(lines, 1):
            sp = sppasUnicode(line)
            line = sp.to_strip()

            wds = line.split()
            if len(wds) == 3:
                if wds[0] == "PHONCLASS":
                    self.phonclass[wds[1]] = wds[2]

                elif wds[0] == "GENRULE":
                    self.general[wds[1]] = int(wds[2])

                elif wds[0] == "EXCRULE":
                    self.exception[wds[1]] = int(wds[2])

            if len(wds) == 7:
                if wds[0] == "OTHRULE":
                    s = " ".join(wds[1:6])
                    self.gap[s] = int(wds[6])
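From this parser, a rules file is line-oriented: 3-column PHONCLASS/GENRULE/EXCRULE entries and 7-column OTHRULE entries. A hypothetical fragment (invented symbols and values) and the dictionaries it produces:

    sample = [
        "PHONCLASS a V",        # phoneme "a" belongs to class V
        "GENRULE VCV 1",        # general rule: boundary index 1
        "EXCRULE VCCV 2",       # exception rule: boundary index 2
        "OTHRULE V C C C V 3",  # gap rule over a 5-symbol context
    ]
    phonclass, general, exception, gap = {}, {}, {}, {}
    for line in sample:
        wds = line.split()
        if len(wds) == 3 and wds[0] == "PHONCLASS":
            phonclass[wds[1]] = wds[2]
        elif len(wds) == 3 and wds[0] == "GENRULE":
            general[wds[1]] = int(wds[2])
        elif len(wds) == 3 and wds[0] == "EXCRULE":
            exception[wds[1]] = int(wds[2])
        elif len(wds) == 7 and wds[0] == "OTHRULE":
            gap[" ".join(wds[1:6])] = int(wds[6])
    print(general, exception, gap)
    # {'VCV': 1} {'VCCV': 2} {'V C C C V': 3}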
Example #12
    def _parse_lines(self, lines):
        """Fill the transcription from the lines of the STM file."""
        # the current tier to fill
        tier = None

        # Extract rows, create tiers and metadata.
        for i, line in enumerate(lines):
            line = sppasUnicode(line).to_strip()

            # a comment can contain metadata
            if sppasBaseSclite.is_comment(line):
                if tier is None:
                    sppasBaseSclite._parse_comment(line, self)
                else:
                    sppasBaseSclite._parse_comment(line, tier)
            # ignore comments and blank lines
            if sppasSTM.check_line(line, i + 1) is False:
                continue

            # check for the tier (find it or create it)
            tier = self.get_tier(line)

            # extract information of this annotation
            tab_line = line.split()
            sppasSTM._create_annotation(tab_line[3], tab_line[4],
                                        " ".join(tab_line[5:]), tier)
Example #13
    def _map_variant(self, phonvariant):
        """ Map phonemes of only one variant of a phonetized entry.

        :param phonvariant: (str) One phonetization variant of an entry.

        """
        phones = self._map_split_variant(phonvariant)
        subs = []
        # Single phonemes
        for p in phones:
            mapped = self._map_table.map_entry(p)
            if len(mapped) > 0:
                subs.append(p + sppasDictPron.VARIANTS_SEPARATOR + mapped)
            else:
                subs.append(p)

        self._dag_phon.variants = 0
        phon = sppasUnicode(self._dag_phon.decompose(" ".join(subs))).to_strip()

        # Remove un-pronounced phonemes!!!
        # By convention, they are represented by an underscore in the
        # mapping table.
        tmp = []
        for p in phon.split(sppasDictPron.VARIANTS_SEPARATOR):
            r = [x for x in p.split(sppasDictPron.PHONEMES_SEPARATOR) if x != "_"]
            tmp.append(sppasDictPron.PHONEMES_SEPARATOR.join(r))

        return sppasDictPron.VARIANTS_SEPARATOR.join(set(tmp))
Example #14
    def get_phon_tokens(self, tokens, phonunk=True):
        """ Return the phonetization of a list of tokens, with the status.
        Unknown entries are automatically phonetized if `phonunk` is set to True.

        :param tokens: (list) The list of tokens to be phonetized.
        :param phonunk: (bool) Phonetize unknown words (or not).

        TODO: EOT is not fully supported.

        :returns: A list with the tuple (token, phon, status).

        """
        tab = list()

        for entry in tokens:
            entry = entry.strip()
            phon = self._pdict.get_unkstamp()
            status = OK_ID

            # Enriched Orthographic Transcription Convention:
            # entry can be already in SAMPA.
            if entry.startswith("/") is True and entry.endswith("/") is True:
                phon = entry.strip("/")
                # It must use X-SAMPA, including minus character to separate phonemes.

            else:

                phon = self.get_phon_entry(entry)

                if phon == self._pdict.get_unkstamp():
                    status = ERROR_ID

                    # A missing compound word?
                    if "-" in entry or "'" in entry or "_" in entry:
                        _tabpron = [
                            self.get_phon_entry(w)
                            for w in re.split("[-'_]", entry)
                        ]

                        # OK, finally the entry is in the dictionary?
                        if self._pdict.get_unkstamp() not in _tabpron:
                            # ATTENTION: each part can have variants! must be decomposed.
                            self._dag_phon.variants = 4
                            phon = sppasUnicode(
                                self._dag_phon.decompose(
                                    " ".join(_tabpron))).to_strip()
                            status = WARNING_ID

                    if phon == self._pdict.get_unkstamp() and phonunk is True:
                        try:
                            phon = self._phonunk.get_phon(entry)
                            status = WARNING_ID
                        except Exception:
                            phon = self._pdict.get_unkstamp()
                            status = ERROR_ID

            if len(phon) > 0:
                tab.append((entry, phon, status))

        return tab
Example #15
    def set_meta(self, key, value):
        """ Set or update a metadata.

        :param key: (str) The key of the metadata.
        :param value: (str) The value assigned to the key.

        key, and value are formatted and stored in unicode.

        """
        su = sppasUnicode(key)
        key = su.to_strip()

        su = sppasUnicode(value)
        value = su.to_strip()

        self.__metadata[key] = value
Example #16
    def get_units_julius(lines):
        """Return the units of a palign/walign file (in frames).

        :param lines: (List of str)
        :returns: List of tuples (start, end)

        """
        units = list()
        i = 0
        while "=== begin forced alignment ===" not in lines[i]:
            i += 1
            if i >= len(lines):
                raise IOError('Time units not found')

        while "=== end forced alignment ===" not in lines[i]:
            i += 1
            if i >= len(lines):
                raise IOError('Time units not found in alignment result')
            if lines[i].startswith('['):
                # New phonemes
                line = lines[i].replace("[", "")
                line = line.replace("]", "")
                line = sppasUnicode(line).to_strip()
                tab = line.split()
                # tab 0: first frame
                # tab 1: last frame
                # tab 2: score of the segmentation (log proba)
                # tab 3: triphone used
                units.append((int(tab[0]), int(tab[1])))
        return units
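The same frame extraction, restated without the sppasUnicode dependency and run on a hypothetical two-phoneme excerpt of a Julius alignment result:

    def units_from_julius(lines):
        units = []
        inside = False
        for line in lines:
            if "=== begin forced alignment ===" in line:
                inside = True
            elif "=== end forced alignment ===" in line:
                inside = False
            elif inside and line.lstrip().startswith("["):
                tab = line.replace("[", " ").replace("]", " ").split()
                units.append((int(tab[0]), int(tab[1])))  # first/last frame
        return units

    lines = [
        "=== begin forced alignment ===",
        "[ 0 14 -1.234567 sil ]",
        "[ 15 29 -0.987654 a-b+c ]",
        "=== end forced alignment ===",
    ]
    print(units_from_julius(lines))  # [(0, 14), (15, 29)]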
Example #17
    def set_tiername(self, tier_name):
        """ Fix the tiername option.

        :param tier_name: (str)

        """
        self._options['tiername'] = sppasUnicode(tier_name).to_strip()
Example #18
    def read(dir_name):
        """Return a list of (start-time end-time).

        :param dir_name: Name of the directory with the file to read.
        :returns: list of units

        """
        filename = os.path.join(dir_name, ListOfTracks.DEFAULT_FILENAME)
        if os.path.exists(filename) is False:
            raise IOError('The list of tracks is missing in the directory '
                          '{:s}'.format(dir_name))

        with open(filename, 'r') as fp:
            lines = fp.readlines()

        # Each line corresponds to a track,
        # with a couple 'start end' of float values.
        _units = list()
        for line in lines:
            s = sppasUnicode(line)
            line = s.to_strip()
            _tab = line.split()
            if len(_tab) >= 2:
                _units.append((float(_tab[0]), float(_tab[1])))

        return _units
Example #19
    def check_data(self):
        """ Check the given data to be aligned (phones and tokens).

        :returns: A warning message, or an empty string if check is OK.

        """
        if len(self._phones) == 0:
            raise IOError("No data to time-align.")

        phones = sppasUnicode(self._phones).to_strip().split()
        tokens = sppasUnicode(self._tokens).to_strip().split()
        if len(tokens) != len(phones):
            message = "Tokens alignment disabled: not the same number of tokens in tokenization (%d) and phonetization (%d)."%(len(self._tokens), len(self._phones))
            self._tokens = " ".join(["w_"+str(i) for i in range(len(self._phones))])
            return message

        return ""
Example #20
    def set_phones(self, phones):
        """Fix the pronunciations of each token.

        :param phones: (str) Phonetization

        """
        phones = sppasUnicode(phones).unicode()
        self._phones = phones
Example #21
    def set_tokens(self, tokens):
        """Fix the tokens.

        :param tokens: (str) Tokenization

        """
        tokens = sppasUnicode(tokens).unicode()
        self._tokens = tokens
Example #22
    def get_count(self, token):
        """Return the count of a token.

        :param token: (str) The string of the token
        :returns: (int) occurrence count of the token (0 if unknown)

        """
        s = sppasUnicode(token).to_strip()
        return self.__entries.get(s, 0)
Example #23
    def set_description(self, description=""):
        """ Set the description of the controlled vocabulary.

        :param description: (str)

        """
        su = sppasUnicode(description)
        self.__desc = su.to_strip()
Example #24
    def _create_annotation(begin, end, utterance, tier):
        """Add into the tier the annotation corresponding to data of a line."""
        utterance = sppasUnicode(utterance).to_strip()
        labels = format_labels(utterance)
        location = sppasLocation(
            sppasInterval(sppasBaseSclite.make_point(begin),
                          sppasBaseSclite.make_point(end)))
        tier.create_annotation(location, labels)
Example #25
    def add(self, entry):
        """ Add an entry into the list except if the entry is already inside.

        :param entry: (str) The entry to add in the word list
        :returns: (bool)

        """
        s = sppasUnicode(entry)
        entry = s.to_strip()
        if self.__case_sensitive is False:
            s = sppasUnicode(entry)
            entry = s.to_lower()

        if entry not in self.__entries:
            self.__entries[entry] = None
            return True

        return False
Example #26
    def set_tg_prefix_label(self, prefix):
        """Fix the prefix to add to each TG.

        :param prefix: (str) Default is 'tg_'

        """
        sp = sppasUnicode(prefix)
        tg = sp.to_strip()
        if len(tg) > 0:
            self._options['tg_prefix_label'] = tg
Example #27
    def _create_annotation(begin, duration, word, score):
        """Return the annotation corresponding to data of a line."""
        word = sppasUnicode(word).clear_whitespace()
        label = sppasLabel(sppasTag(word), score)
        begin = float(begin)
        end = begin + float(duration)
        location = sppasLocation(
            sppasInterval(sppasBaseSclite.make_point(begin),
                          sppasBaseSclite.make_point(end)))
        return sppasAnnotation(location, label)
Example #28
    def add_message(self, message):
        """ Add a new message tips in the list of tips.

        :param message: (str) A help message.

        """
        su = sppasUnicode(message)
        u_message = su.to_strip()
        if len(u_message) > 0:
            self._tips.append(u_message)
Example #29
    def _parse_lines(self, lines):
        """ Fill the transcription from the lines of the CTM file. """

        # the number of the current alternation
        in_alt = 0
        # the annotations of the alternations
        alternates = dict()
        # the current tier to fill
        tier = None

        # Extract rows, create tiers and metadata.
        for i, line in enumerate(lines):
            line = sppasUnicode(line).to_strip()

            # a comment can contain metadata
            if sppasBaseSclite.is_comment(line):
                if tier is None:
                    sppasBaseSclite._parse_comment(line, self)
                else:
                    sppasBaseSclite._parse_comment(line, tier)
            # ignore comments and blank lines
            if sppasCTM.check_line(line, i + 1) is False:
                continue

            # check for the tier (find it or create it)
            tier = self.get_tier(line)

            # extract information of this annotation
            tab_line = line.strip().split()
            wavname, channel, begin, duration, word = tab_line[:5]
            score = sppasCTM.get_score(line)

            # check for an alternative annotation
            if begin == "*":
                if word == "<ALT_BEGIN>":
                    alternates = dict()
                    in_alt = 1
                    alternates[in_alt] = list()
                elif word == "<ALT>":
                    in_alt += 1
                    alternates[in_alt] = list()
                else:
                    # todo: we SHOULD add ALL the alternations into the tier
                    # but we add only the first one...
                    sppasCTM._add_alt_annotations(tier, alternates[1])
                    # re-init
                    alternates = dict()
                    in_alt = 0
            else:
                ann = sppasCTM._create_annotation(begin, duration, word, score)
                if in_alt == 0:
                    tier.add(ann)
                else:
                    alternates[in_alt].append(ann)
Example #30
    def is_comment(line):
        """Check if the line is a comment, ie starts with ';;'.

        :param line: (str/unicode)
        :returns: boolean

        """
        sp = sppasUnicode(line)
        line = sp.to_strip()

        return line.startswith(";;")
Example #31
    def unbind(self, utt):
        """ Unbind tokens containing - or ' or . depending on rules.

        :param utt: (list) List of tokens of an utterance (a transcription, a sentence, ...)
        :returns: A list of strings

        """
        new_utt = list()
        for tok in utt:
            is_unknown = self.__vocab.is_unk(tok.lower().strip())
            is_sampa = tok.startswith('/') and tok.endswith('/')
            is_trunc = tok.endswith('-')
            # a missing compound word?
            #   --> an unknown token
            #   --> containing a special character
            #   --> that is not a truncated word
            #   --> not in a sampa sequence!
            if is_unknown is True \
                    and ("-" in tok or "'" in tok or "." in tok) \
                    and is_sampa is False\
                    and is_trunc is False:

                # KEEP special chars in the array!
                tab_split = re.split("([-'.])", tok)
                tab_tok = list(entry for entry in tab_split if len(entry) > 0)
                idx_start = 0
                while idx_start < len(tab_tok):

                    # use a longest matching to aggregate the current token with the next ones
                    idx_end = min(len(tab_tok), idx_start + 5)
                    phrase = " ".join(tab_tok[idx_start:idx_end])
                    idx_end, word = self.__stick_longest_lr(
                        sppasUnicode(phrase).to_strip(), "")

                    new_utt.append(word)
                    idx_start += idx_end + 1

            else:
                new_utt.append(sppasUnicode(tok).to_strip())

        return new_utt
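The key trick in unbind() is re.split() with a capturing group, which keeps the separators in the result so they can be re-attached by the longest matching:

    import re

    parts = [p for p in re.split("([-'.])", "jean-pierre") if len(p) > 0]
    print(parts)  # ['jean', '-', 'pierre']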
Example #32
    def get_phon_tokens(self, tokens, phonunk=True):
        """ Return the phonetization of a list of tokens, with the status.
        Unknown entries are automatically phonetized if `phonunk` is set to True.

        :param tokens: (list) The list of tokens to be phonetized.
        :param phonunk: (bool) Phonetize unknown words (or not).

        TODO: EOT is not fully supported.

        :returns: A list with the tuple (token, phon, status).

        """
        tab = list()

        for entry in tokens:
            phon = self._pdict.get_unkstamp()
            status = OK_ID

            # Enriched Orthographic Transcription Convention:
            # entry can be already in SAMPA.
            if entry.startswith("/") is True and entry.endswith("/") is True:
                phon = entry.strip("/")
                # Must use SAMPA (including minus to separate phones)

            else:

                phon = self.get_phon_entry(entry)

                if phon == self._pdict.get_unkstamp():
                    status = ERROR_ID

                    # A missing compound word?
                    if "-" in entry or "'" in entry or "_" in entry:
                        _tabpron = [self.get_phon_entry(w) for w in re.split("[-'_]", entry)]

                        # OK, finally the entry is in the dictionary?
                        if self._pdict.get_unkstamp() not in _tabpron:
                            # ATTENTION: each part can have variants! must be decomposed.
                            self._dag_phon.variants = 4
                            phon = sppasUnicode(self._dag_phon.decompose(" ".join(_tabpron))).to_strip()
                            status = WARNING_ID

                    if phon == self._pdict.get_unkstamp() and phonunk is True:
                        try:
                            phon = self._phonunk.get_phon(entry)
                            status = WARNING_ID
                        except Exception:
                            phon = self._pdict.get_unkstamp()
                            status = ERROR_ID

            tab.append((entry, phon, status))

        return tab
Example #33
    def _readline(self, filename):
        """ Read the first line of filename, and return it as a unicode formatted string. """

        line = ""
        try:
            with codecs.open(filename, 'r', encoding) as fp:
                sp = sppasUnicode(fp.readline())
                line = sp.to_strip()
        except Exception:
            return ""

        return line
Example #34
    def replace(self, utt):
        """ Examine tokens and performs some replacements.
        A dictionary with symbols contains the replacements to operate.

        :param utt: (list) the utterance
        :returns: A list of strings

        """
        # Specific case of float numbers
        sent = ' '.join(utt)
        sent = re.sub(r'([0-9])\.([0-9])', r'\1 NUMBER_SEP_POINT \2', sent)
        sent = re.sub(r'([0-9]),([0-9])', r'\1 NUMBER_SEP \2', sent)
        sent = sppasUnicode(sent).to_strip()
        _utt = sent.split()

        # Other generic replacements
        _result = []
        for s in _utt:
            if self.repl.is_key(s):
                s = self.repl.replace(s)
            _result.append(sppasUnicode(s).to_strip())

        return _result
Example #35
    def remove(self, utt, wlist):
        """ Remove data of an utterance if included in a dictionary.
        Only used to remove punctuation.

        :param entry:
        :param wlist: (WordList)

        """
        _utt = []
        for tok in utt:
            tok = sppasUnicode(tok).to_strip()
            if wlist.is_unk(tok) is True and "gpd_" not in tok and "ipu_" not in tok:
                _utt.append(tok)

        return _utt
Example #36
    def lower(self, utt):
        """ Lower a list of strings.

        :param utt: (list)

        """
        _utt = []
        for tok in utt:
            # if it's not an already phonetized string:
            if "/" not in tok:
                _utt.append(sppasUnicode(tok).to_lower())
            else:
                _utt.append(tok)

        return _utt
Example #37
    def is_pron_of(self, entry, pron):
        """ Return True if pron is a pronunciation of entry.
        Phonemes of pron are separated by "-".

        :param entry: (str) A unicode token to find in the dictionary
        :param pron: (str) A unicode pronunciation
        :returns: bool

        """
        s = sppasDictPron.format_token(entry)

        if s in self._dict:
            p = sppasUnicode(pron).to_strip()
            return p in self._dict[s].split(sppasDictPron.VARIANTS_SEPARATOR)

        return False
Example #38
    def split(self, utt):
        """ Split an utterance using whitespace.
        If the language is character-based, split each character.

        :param utt: (str) an utterance of a transcription, a sentence, ...
        :param std: (bool)

        :returns: A list (array of string)

        """
        s = utt
        if without_whitespace(self.__lang) is True:
            s = self.split_characters(s)

        toks = []
        for t in s.split():
            # if not a phonetized entry
            if t.startswith("/") is False and t.endswith("/") is False:
                if without_whitespace(self.__lang) is False:
                    # Split numbers if stick to characters
                    # attention: do not replace [a-zA-Z] by [\w] (because \w includes numbers)
                    # and not on Asian languages: it can be a tone!
                    t = re.sub(u(r'([0-9])([a-zA-Z])'), u(r'\1 \2'), t)
                    t = re.sub(u(r'([a-zA-Z])([0-9])'), u(r'\1 \2'), t)

                # Split some punctuation
                t = re.sub(u(r'\[\]'), u(r'\\] \\['), t)

                # Split dots if stick to the beginning of a word
                # info: a dot at the end of a word is analyzed by the tokenizer
                t = re.sub(u(r' \.([\w-])'), u(r' . \1'), t)
                t = re.sub(u(r'^\.([\w-])'), u(r' . \1'), t)

                # Split replacement characters
                for r in self.__repl:
                    if t.endswith(r):
                        t = t[:-len(r)]
                        t = t + ' ' + r
            toks.append(t.strip())

        s = " ".join(toks)

        # Then split each time there is a space and return result
        s = sppasUnicode(s).to_strip()

        return s.split()
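The number/letter splitting rule above, demonstrated on a plain string (same two substitutions, without the u() compatibility wrapper):

    import re

    t = "room12b"
    t = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', t)
    t = re.sub(r'([a-zA-Z])([0-9])', r'\1 \2', t)
    print(t)  # room 12 b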
Example #39
    def read(dirname):
        """ Read a list file (start-time end-time).

        :param dirname: Name of the directory with the file to read.

        """
        filename = os.path.join(dirname, ListIO.DEFAULT_FILENAME)
        with codecs.open(filename, 'r', encoding) as fp:
            lines = fp.readlines()

        _units = []
        # Each line corresponds to a track,
        # with a couple 'start end' of float values.
        for line in lines:
            s = sppasUnicode(line)
            line = s.to_strip()
            _tab = line.split()
            if len(_tab) >= 2:
                _units.append((float(_tab[0]), float(_tab[1])))

        return _units
Example #40
    def bind(self, utt):
        """ Bind tokens of an utterance using a specific character.

        :param utt: (list) List of tokens of an utterance (a transcription, a sentence, ...)
        :returns: A list of strings

        """
        new_utt = list()

        idx_start = 0
        while idx_start < len(utt):

            # use a longest matching to aggregate the current token
            # with the next ones
            idx_end = min(len(utt), idx_start + self.aggregate_max + 1)
            phrase = " ".join(utt[idx_start:idx_end])
            idx_end, word = self.__stick_longest_lr(
                sppasUnicode(phrase).to_strip(), self.separator)

            new_utt.append(word)
            idx_start += idx_end + 1

        return new_utt
Example #41
    def load_from_ascii(self, filename):
        """ Load a pronunciation dictionary from an HTK-ASCII file.

        :param filename: (str) Pronunciation dictionary file name

        """
        try:
            with codecs.open(filename, 'r', encoding) as fd:
                lines = fd.readlines()
        except Exception:
            raise FileIOError(filename)

        for l, line in enumerate(lines):

            uline = sppasUnicode(line).to_strip()

            # Ignore empty lines and check the number of columns
            if len(uline) == 0:
                continue
            if len(uline) == 1:
                raise FileFormatError(l, uline)

            # The entry is before the "[" and the pronunciation is after the "]"
            i = uline.find("[")
            if i == -1:
                i = uline.find(" ")
            entry = uline[:i]
            endline = uline[i:]
            j = endline.find("]")
            if j == -1:
                j = endline.find(" ")
            new_pron = endline[j+1:]

            # Phonetic variant of an entry (i.e. entry ends with (XX))
            i = entry.find("(")
            if i > -1:
                if ")" in entry[i:]:
                    entry = entry[:i]

            self.add_pron(entry, new_pron)
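The slicing logic above, extracted into a standalone helper and applied to two hypothetical HTK-ASCII lines (one with an output symbol in brackets, one with a "(2)" variant suffix):

    def parse_htk_line(uline):
        i = uline.find("[")
        if i == -1:
            i = uline.find(" ")
        entry, endline = uline[:i].strip(), uline[i:]
        j = endline.find("]")
        if j == -1:
            j = endline.find(" ")
        pron = endline[j + 1:].strip()
        k = entry.find("(")
        if k > -1 and ")" in entry[k:]:
            entry = entry[:k]  # drop the variant number
        return entry, pron

    print(parse_htk_line("acid [acid] @ s I d"))  # ('acid', '@ s I d')
    print(parse_htk_line("acid(2) { s I d"))      # ('acid', '{ s I d')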
Example #42
    def get_phon_entry(self, entry):
        """ Return the phonetization of an entry.
        Unknown entries are not automatically phonetized.
        This is a pure dictionary-based method.

        :param entry: (str) The entry to be phonetized.
        :returns: A string with the phonetization of the given entry or
        the unknown symbol.

        """
        entry = sppasUnicode(entry).to_strip()

        # Specific strings... for the italian transcription...
        # For the participation at the CLIPS-Evalita 2011 campaign.
        if entry.startswith(u("<")) is True and entry.endswith(u(">")) is True:
            entry = entry[1:-1]

        # No entry! Nothing to do.
        if len(entry) == 0:
            return ""

        # Specific strings used in the CID transcription...
        # CID is Corpus of Interactional Data, http://sldr.org/sldr000720
        if entry.startswith(u("gpd_")) is True or entry.startswith(u("gpf_")) is True:
            return ""

        # Specific strings used in SPPAS IPU segmentation...
        if entry.startswith(u("ipu_")):
            return ""

        # Find entry in the dict as it is given
        _strphon = self._pdict.get_pron(entry)

        # OK, the entry is properly phonetized.
        if _strphon != self._pdict.get_unkstamp():
            return self._map_phonentry(_strphon)

        return self._pdict.get_unkstamp()
Example #43
    def _tier2raw(self, tier, mapp=False):
        """ Return all interval contents into a single string.

        """
        # Map phonemes from SAMPA to the expected ones.
        self._mapping.set_keep_miss(True)
        self._mapping.set_reverse(True)

        raw = ""
        for i, ann in enumerate(tier):
            if ann.GetLabel().IsEmpty() is True:
                logging.info("WARNING: Found an empty annotation label at index %d" % i)
                raw = raw + " sil"
            else:  # if ann.GetLabel().IsSilence() is False:
                besttext = ann.GetLabel().GetValue()
                if mapp is True:
                    besttext = self._mapping.map(besttext)

                if unk_stamp in besttext:
                    besttext = besttext.replace(unk_stamp, "sil")
                raw = raw + " " + besttext

        return sppasUnicode(raw).to_strip()
Example #44
    def get_boundary(self, phonemes):
        """ Get the index of the syllable boundary (EXCRULES or GENRULES).

        :param phonemes: (str) Phonemes to syllabify
        :returns: (int) boundary index or -1 if phonemes does not match any rule.

        """
        sp = sppasUnicode(phonemes)
        phonemes = sp.to_strip()
        phon_list = phonemes.split(" ")
        classes = ""
        for phon in phon_list:
            classes += self.get_class(phon)

        # search into exception
        if classes in self.exception:
            return self.exception[classes]

        # search into general
        for key, val in self.general.items():
            if len(key) == len(phon_list):
                return val

        return -1
Example #45
    def load(self, filename):
        """ Load the rules using the file "filename".

        :param filename: (str) Name of the file with the syllabification rules.

        """
        self.general = dict()    # list of general rules
        self.exception = dict()  # list of exception rules
        self.gap = dict()        # list of gap rules
        self.phonclass = dict()  # list of phoneme/class pairs
        with open(filename, "r") as file_in:

            for line_nb, line in enumerate(file_in, 1):
                sp = sppasUnicode(line)
                line = sp.to_strip()

                wds = line.split()
                if len(wds) == 3:
                    if wds[0] == "PHONCLASS":
                        self.phonclass[wds[1]] = wds[2]
                    elif wds[0] == "GENRULE":
                        self.general[wds[1]] = int(wds[2])
                    elif wds[0] == "EXCRULE":
                        self.exception[wds[1]] = int(wds[2])
                if len(wds) == 7:
                    if wds[0] == "OTHRULE":
                        s = " ".join(wds[1:6])
                        self.gap[s] = int(wds[6])

        if len(self.general) < 4:
            raise IOError('Syllabification rules file corrupted. '
                          'Got {:d} general rules, {:d} exceptions '
                          'and {:d} other rules.'.format(len(self.general), len(self.exception), len(self.gap)))

        if "UNK" not in self.phonclass:
            self.phonclass["UNK"] = "#"
Example #46
    def read_palign(self, filename):
        """ Read an alignment file in the standard format of Julius CSR engine.

        :param filename: (str) The input file name.
        :returns: Two lists of tuples:
            - (start-time end-time phoneme score)
            - (start-time end-time word score)

        """
        _phonalign = []
        _wordalign = []

        phonidx = -1     # phoneme index
        loc_s = 0.       # phoneme start time
        loc_e = 0.       # phoneme end time
        phonlist = []
        wordseq = []
        scores = [0]
        tokens = [""]
        wordlist = []

        with codecs.open(filename, 'r', encoding) as fp:
            lines = fp.readlines()

        for line in lines:
            # Each line is either a new annotation or nothing interesting!
            line = sppasUnicode(line).to_strip()

            if line.startswith("=== begin forced alignment ==="):
                phonidx = 0

            elif line.startswith("=== end forced alignment ==="):
                phonidx = -1

            elif line.startswith("phseq1:"):
                s = sppasUnicode(line[7:])
                line = s.to_strip()

                wordseq = line.split('|')
                # get indexes of each word
                wordlist = []
                _idx = -1
                for w in wordseq:
                    _wrdphseq = w.strip().split()
                    _idx += len(_wrdphseq)
                    wordlist.append(_idx)
                # get the list of phonemes (without word segmentation)
                line = line.replace('|', '')
                line = sppasUnicode(line).to_strip()
                phonlist = line.split()

            elif line.startswith('cmscore1:'):
                line = line[9:]
                # confidence score of the pronunciation of each token
                scores = [float(s) for s in line.split()]
                if len(scores) == 0:
                    scores = [0]

            elif line.startswith('sentence1:'):
                line = line[10:]
                # each token
                tokens = line.split()
                if len(tokens) == 0:
                    tokens = [""]

            elif line.startswith('[') and phonidx > -1:
                # New phonemes
                line = line.replace("[", "")
                line = line.replace("]", "")
                line = sppasUnicode(line).to_strip()
                tab = line.split(" ")
                # tab 0: first frame
                # tab 1: last frame
                # tab 2: score of the segmentation (log proba)
                # tab 3: triphone used
                loc_s = (float(tab[0]) / 100.)
                loc_e = (float(tab[1]) / 100.)
                if len(tab) > 3:
                    # Put real phoneme instead of triphones
                    _phonalign.append([loc_s, loc_e, phonlist[phonidx], tab[2]])
                else:
                    _phonalign.append([loc_s, loc_e, "", tab[2]])
                phonidx += 1

        # Adjust time values and create wordalign
        wordidx = 0     # word index
        wordloc_s = 0.  # word start time
        loc_e = 0.
        nextloc_s = 0.
        for phonidx in range(len(_phonalign)):

            # Fix the end of this annotation to the begin of the next one.
            loc_e = _phonalign[phonidx][1]
            if phonidx < (len(_phonalign)-1):
                # some hack because julius has a tendency to always be... ahead! 
                nextloc_s = _phonalign[phonidx+1][0] + 0.01
                _phonalign[phonidx+1][0] = nextloc_s
            else:
                nextloc_s = 0.
            if loc_e < nextloc_s:
                loc_e = nextloc_s
            _phonalign[phonidx][1] = loc_e

            # Override the segmentation score of the phone by
            # the score of the pronunciation of the word
            _phonalign[phonidx][3] = scores[wordidx]

            # add also the word?
            if phonidx == wordlist[wordidx]:
                _wordalign.append([wordloc_s, loc_e, tokens[wordidx], scores[wordidx]])
                wordidx = wordidx + 1
                wordloc_s = loc_e

        # last word, or the only entry in case of empty interval...
        if len(wordseq)-1 == wordidx:
            _wordalign.append([wordloc_s, loc_e, tokens[wordidx-1], scores[wordidx-1]])

        return _phonalign, _wordalign
Example #47
    def run_alignment(self, inputwav, outputalign, N=3):
        """ Execute the external program `julius` to align.

        The data related to the unit to time-align need to be previously
        fixed with:

            - set_phones(str)
            - set_tokens(str)

        :param inputwav: (str - IN) the audio input file name, of type PCM-WAV 16000 Hz, 16 bits
        :param outputalign: (str - OUT) the output file name
        :param N: (int) N value of N-grams, used only if SLM (i.e. outext=walign)

        :returns: (str) A message of `julius`.

        """
        outputalign = outputalign + "." + self._outext

        basename = os.path.splitext(inputwav)[0]
        if self._outext == "palign":
            self.gen_grammar_dependencies(basename)
        else:
            self.gen_slm_dependencies(basename)

        self.run_julius(inputwav, basename, outputalign)
        with codecs.open(outputalign, 'r', encoding) as f:
            lines = f.readlines()

        errorlines = ""
        message = ""

        entries = []
        for line in lines:
            if line.find("Error: voca_load_htkdict") > -1 and line.find("not found") > -1:
                line = sppasUnicode(line).to_strip()
                line = line[line.find('"')+1:]
                line = line[:line.find('"')]
                if len(line) > 0:
                    entries = line.split()

        if len(entries) > 0:
            added = self.add_tiedlist(entries)
            if len(added) > 0:
                message = "The acoustic model was modified. " \
                          "The following entries were successfully added into the tiedlist: "
                message = message + " ".join(added) + "\n"
                self.run_julius(inputwav, basename, outputalign)
                with codecs.open(outputalign, 'r', encoding) as f:
                    lines = f.readlines()

        for line in lines:
            if (line.startswith("Error:") or line.startswith("ERROR:")) and " line " not in line:
                errorlines = errorlines + line
            if "search failed" in line:
                message = "Julius search has failed to find the transcription in the audio file. "
                errorlines = "Search error. " + errorlines

        if len(errorlines) > 0:
            raise Exception(message + errorlines)

        return message
Example #48
    def toe_spelling(self, entry, std=False):
        """ Create a specific spelling from an Enriched Orthographic Transcription.

        :param entry: (str) the EOT string
        :param std: (bool) Standard spelling expected instead of the Faked one.
        :returns: (str)

        DevNote: Python’s regular expression engine supports Unicode.
        It can apply the same pattern to either 8-bit (encoded) or
        Unicode strings. To create a regular expression pattern that
        uses Unicode character classes for \w (and \s, and \b), use
        the “(?u)” flag prefix, or the re.UNICODE flag.

        """
        # Ensure all regexp will work!
        _fentry = " " + u(entry) + " "

        if std is False:
            # Stick irregular liaisons to the previous token
            _fentry = re.sub(u(r' =([\w]+)='), u(r'-\1'),
                             _fentry, flags=re.UNICODE)
        else:
            # Remove liaisons
            _fentry = re.sub(u(r' =([\w]+)='), u(r' '),
                             _fentry, flags=re.UNICODE)

        # Laughing sequences
        _fentry = re.sub(u(r"\s?@\s?@\s?"), u(" "), _fentry,
                         flags=re.UNICODE)

        # Laughing
        _fentry = re.sub(u(r"([\w\xaa-\xff]+)@"), u(r"\1 @"),
                         _fentry, flags=re.UNICODE)
        _fentry = re.sub(u(r"@([\w\xaa-\xff]+)"), u(r"@ \1"),
                         _fentry, flags=re.UNICODE)

        # Noises
        _fentry = re.sub(u(r"([\w\xaa-\xff]+)\*"), u(r"\1 *"),
                         _fentry, flags=re.UNICODE)
        _fentry = re.sub(u(r"\*([\w\xaa-\xff]+)"), u(r"* \1"),
                         _fentry, flags=re.UNICODE)

        # Transcriber comments: {comment}
        _fentry = re.sub(u(r'\{[\s\w\xaa-\xff\-:]+\}'), u(''),
                         _fentry, flags=re.UNICODE)
        # Transcriber comments: [comment]
        _fentry = re.sub(u(r'\[[\s\w\xaa-\xff\-:]+\]'), u(''),
                         _fentry, flags=re.UNICODE)

        if std is False:
            # Special elisions (remove parenthesis content)
            _fentry = re.sub(u(r'\([\s\w\xaa-\xff\-\']+\)'), u(''),
                             _fentry, flags=re.UNICODE)
        else:
            # Special elisions (keep parenthesis content)
            _fentry = re.sub(u(r'\(([\s\w\xaa-\xff\-]+)\)'), u(r'\1'),
                             _fentry, flags=re.UNICODE)

        # Morphological variants are ignored for phonetization
        # (same pronunciation!)
        _fentry = re.sub(u(r'\s+\<([\-\'\s\w\xaa-\xff]+),'
                           r'[\-\'\s\w\xaa-\xff]+\>'), u(r' \1'),
                         _fentry, flags=re.UNICODE)
        _fentry = re.sub(u(r'\s+\{([\-\'\s\w\xaa-\xff]+),'
                           r'[\-\'\s\w\xaa-\xff]+\}'), u(r' \1'),
                         _fentry, flags=re.UNICODE)

        if std is False:
            # Special pronunciations (keep right part)
            _fentry = re.sub(u(r'\s+\[([\s\w\xaa-\xff/-]+),'
                               r'([\s\w\xaa-\xff/]+)\]'), u(r' \2'),
                             _fentry, flags=re.UNICODE)
        else:
            # Special pronunciations (keep left part)
            _fentry = re.sub(u(r'\s+\[([\s\w\xaa-\xff\/-]+),'
                               r'[\s\w\xaa-\xff\/]+\]'), u(r' \1'),
                             _fentry, flags=re.UNICODE)

        # Proper names: $ name ,P\$
        _fentry = re.sub(u(r',\s?[PTS]+\s?[\/\\]+\s?\$'), u(''),
                         _fentry, flags=re.UNICODE)
        _fentry = re.sub(u(r'\$'), u(''), _fentry, flags=re.UNICODE)

        # specific case with numbers
        _fentry = re.sub(u(r'\s(?=,[0-9]+)'), u(''), _fentry,
                         flags=re.UNICODE)

        # ok, now stop regexp and work with unicode:
        _fentry = sppasUnicode(_fentry).to_strip()

        # Punctuations at the end of a token

        s = []
        entries = _fentry.split()
        for i, c in enumerate(entries):
            # Check for the SAMPA sequence to assign properly "in_sampa"
            if c.startswith("/") and c.endswith('/'):
                in_sampa = True
            else:
                in_sampa = False

            # if not in_sampa, add whitespace where punctuation is stuck to a word
            if in_sampa is False:

                # if there is a series of punctuation marks at the beginning
                while len(c) > 0 and category(c[0])[0] in ('P', 'S'):
                    s.append(c[0])
                    c = c[1:]

                # if there is a series of punctuation marks at the end
                end_punct = []
                while len(c) > 0 and category(c[-1])[0] in ('P', 'S'):
                    end_punct.append(c[-1])
                    c = c[:-1]
                if len(end_punct) == 1 and end_punct[0] == u("."):
                    s.append(c+u("."))
                else:
                    s.append(c)
                    if len(end_punct) > 0:
                        s.extend(reversed(end_punct))

            else:
                if len(s) == 0:
                    s.append(c)
                else:
                    s[-1] += c

        return " ".join(s)
Example #49
    def read_walign(self, filename):
        """ Read an alignment file in the standard format of Julius CSR engine.

        :param filename: (str) The input file name.
        :returns: Two values:
            - None (a walign file contains no phoneme alignment)
            - a list of (start-time end-time word score)

        """
        tokens = [""]
        scores = [0]
        _wordalign = []
        wordidx = -1
        with codecs.open(filename, 'r', encoding) as fp:
            lines = fp.readlines()

        for line in lines:
            # Each line is either a new annotation or nothing interesting!
            line = sppasUnicode(line).to_strip()

            if line.startswith("=== begin forced alignment ==="):
                wordidx = 0

            elif line.startswith("=== end forced alignment ==="):
                wordidx = -1

            elif line.startswith('wseq1:'):
                line = line[6:]
                # each token
                tokens = line.split()
                if len(tokens) == 0:
                    tokens = [""]

            elif line.startswith('cmscore1:'):
                line = line[9:]
                # confidence score of the pronunciation of each token
                scores = [float(s) for s in line.split()]
                if len(scores) == 0:
                    scores = [0]

            elif line.startswith('[') and wordidx > -1:
                # New phonemes
                line = line.replace("[", "")
                line = line.replace("]", "")
                line = sppasUnicode(line).to_strip()
                tab = line.split(" ")
                # tab 0: first frame
                # tab 1: last frame
                # tab 2: score of the segmentation (log proba)
                # tab 3: word
                loc_s = (float(tab[0]) / 100.)
                loc_e = (float(tab[1]) / 100.)
                _wordalign.append([loc_s, loc_e, tokens[wordidx], scores[wordidx]])
                wordidx += 1

        # Adjust time values
        for wordidx in range(len(_wordalign)):

            # Fix the end of this annotation to the begin of the next one.
            loc_e = _wordalign[wordidx][1]
            if wordidx < (len(_wordalign)-1):
                nextloc_s = _wordalign[wordidx+1][0]
            else:
                nextloc_s = 0.
            if loc_e < nextloc_s:
                loc_e = nextloc_s
            _wordalign[wordidx][1] = loc_e

        return None, _wordalign
Example #50
    def normalize(self, entry, actions=None):
        """ Normalize an utterance.

        :param entry: (str) the string to normalize
        :param actions: (list) the modules/options to enable.

            - "std": generated the standard orthography instead of the faked one
            - "replace": use a replacement dictionary
            - "tokenize": tokenize the entry
            - "numbers": convert numbers to their written form
            - "lower": change case of characters to lower
            - "punct": remove punctuation

        :returns: (str) the normalized entry

        Important:
        An empty actions list or a list containing only "std" means to
        enable all actions.

        """
        if actions is None:
            # avoid a shared mutable default: the list is extended below
            actions = []

        _str = sppasUnicode(entry).to_strip()

        # Remove UTF-8 specific characters that are not in our dictionaries!
        for key in self.dicoutf:
            _str = _str.replace(key, self.dicoutf.replace(key))

        # Clean the Enriched Orthographic Transcription
        ortho = sppasTranscription()
        _str = ortho.clean_toe(_str)
        if "std" in actions:
            _str = ortho.toe_spelling(_str, True)
        else:
            _str = ortho.toe_spelling(_str, False)

        # Split using whitespace or characters.
        splitter = sppasTokSplitter(self.lang, self.repl)
        utt = splitter.split(_str)

        # The entry is now a list of strings on which we'll perform actions
        # -----------------------------------------------------------------
        if len(actions) == 0 or (len(actions) == 1 and "std" in actions):
            actions.append("replace")
            actions.append("tokenize")
            actions.append("numbers")
            actions.append("lower")
            actions.append("punct")

        if "replace" in actions:
            utt = self.replace(utt)

        if "tokenize" in actions:
            utt = self.tokenize(utt)

        if "numbers" in actions:
            utt = self.numbers(utt)

        if "lower" in actions:
            utt = self.lower(utt)

        if "punct" in actions:
            utt = self.remove(utt, self.punct)

        # Finally, prepare the result
        result = ""
        for s in utt:
            s = sppasUnicode(s).to_strip()
            result = result + " " + s.replace(" ", "_")

        result = sppasUnicode(result).to_strip()
        if len(result) == 0:
            return ""  # Nothing valid!

        return result.replace(" ", self.delimiter)