Ejemplo n.º 1
0
    def parsedefinition(self, raw_definition, tonedchars_callback=None):
        """Parse a raw slash-delimited dictionary definition string.

        Returns a (meanings, measurewords) pair: meanings is a list of
        token-word lists, one per "/"-separated sense; measurewords is a
        list of (characterwords, pinyinwords) pairs parsed from "CL:"
        entries.

        tonedchars_callback converts a plain character string into a list
        of Word tokens; it defaults to wrapping the text in a single Word.
        """
        log.info("Parsing the raw definition %s", raw_definition)

        # Default the toned characters callback to something sensible
        if tonedchars_callback is None:
            tonedchars_callback = lambda characters: [Word(Text(characters))]

        meanings, measurewords = [], []
        for definition in raw_definition.strip().lstrip("/").rstrip("/").split(
                "/"):
            # Remove stray spaces
            definition = definition.strip()

            # Entries beginning with "CL:" list measure words, not a meaning
            if definition.startswith("CL:"):
                # Measure words are comma-separated
                for mw in definition[3:].strip().split(","):
                    # Attempt to parse the measure word as structured data
                    match = self.embeddedchineseregex.match(mw)
                    if match is None:
                        log.info(
                            "Could not parse the apparent measure word %s", mw)
                        continue

                    # They SHOULD have pinyin information
                    characterswords, pinyinwords = self.formatmatch(
                        match, tonedchars_callback)
                    if characterswords is None or pinyinwords is None:
                        log.info(
                            "The measure word %s was missing some information in the dictionary",
                            mw)
                        continue

                    measurewords.append((characterswords, pinyinwords))
            else:
                words = []
                for ismatch, thing in utils.regexparse(
                        self.embeddedchineseregex, definition):
                    if ismatch:
                        # A match - we can append a representation of the words it contains
                        (characterwords, pinyinwords) = self.formatmatch(
                            thing, tonedchars_callback)

                        # Put the resulting words right into the output in a human-readable format
                        words.extend(characterwords)
                        if pinyinwords is not None:
                            words.append(Word(Text(" - ")))
                            words.extend(pinyinwords)
                    else:
                        # Just a string: append it as a list of tokens, trying to extract any
                        # otherwise-unmarked pinyin in the sentence for colorisation etc
                        words.append(Word(*tokenize(thing, forcenumeric=True)))

                meanings.append(words)

        return meanings, measurewords
Ejemplo n.º 2
0
 def reformatmeaning(self, meaning):
     """Replace numbered sense markers like "(1)" in *meaning* with the
     configured meaning-number representation and return the rebuilt string."""
     output = u""
     # u"\\(..." yields the same string value the old ur"\(..." literal did,
     # but remains valid syntax on Python 3 as well
     for recognised, match in utils.regexparse(re.compile(u"\\(([0-9]+)\\)"), meaning):
         if recognised:
             # Reformat the captured number via the user's configuration
             output += self.config.meaningnumber(int(match.group(1)))
         else:
             # Plain text between markers: append it unchanged
             output += match
     # BUG FIX: the accumulated string was built but never returned
     return output
Ejemplo n.º 3
0
 def reformataudio(self, audio):
     """Convert any bare readings in *audio* into generated [sound:...] tags,
     leaving existing sound tags untouched, and return the rebuilt string."""
     output = u""
     # u"\\[..." yields the same string value the old ur"\[..." literal did,
     # but remains valid syntax on Python 3 as well
     for recognised, match in utils.regexparse(re.compile(u"\\[sound:([^\\]]*)\\]"), audio):
         if recognised:
             # Must be a sound tag - leave it well alone
             output += match.group(0)
         else:
             # Process the non-sound text as if it were a reading, turning it into some tags
             output += generateaudio(self.notifier, self.mediamanager, self.config, [model.Word(*model.tokenize(match))])
     # BUG FIX: the rebuilt string was discarded without this return
     return output
Ejemplo n.º 4
0
    def parsedefinition(self, raw_definition, tonedchars_callback=None):
        """Parse a raw slash-delimited dictionary definition string.

        Returns a (meanings, measurewords) pair: meanings is a list of
        token-word lists, one per "/"-separated sense; measurewords is a
        list of (characterwords, pinyinwords) pairs parsed from "CL:"
        entries.

        tonedchars_callback converts a plain character string into a list
        of Word tokens; it defaults to wrapping the text in a single Word.
        """
        log.info("Parsing the raw definition %s", raw_definition)

        # Default the toned characters callback to something sensible
        if tonedchars_callback is None:
            tonedchars_callback = lambda characters: [Word(Text(characters))]

        meanings, measurewords = [], []
        for definition in raw_definition.strip().lstrip("/").rstrip("/").split("/"):
            # Remove stray spaces
            definition = definition.strip()

            # Entries beginning with "CL:" list measure words, not a meaning
            if definition.startswith("CL:"):
                # Measure words are comma-separated
                for mw in definition[3:].strip().split(","):
                    # Attempt to parse the measure word as structured data
                    match = self.embeddedchineseregex.match(mw)
                    if match is None:
                        log.info("Could not parse the apparent measure word %s", mw)
                        continue

                    # They SHOULD have pinyin information
                    characterswords, pinyinwords = self.formatmatch(match, tonedchars_callback)
                    if characterswords is None or pinyinwords is None:
                        log.info("The measure word %s was missing some information in the dictionary", mw)
                        continue

                    measurewords.append((characterswords, pinyinwords))
            else:
                words = []
                for ismatch, thing in utils.regexparse(self.embeddedchineseregex, definition):
                    if ismatch:
                        # A match - we can append a representation of the words it contains
                        (characterwords, pinyinwords) = self.formatmatch(thing, tonedchars_callback)

                        # Put the resulting words right into the output in a human-readable format
                        words.extend(characterwords)
                        if pinyinwords is not None:
                            words.append(Word(Text(" - ")))
                            words.extend(pinyinwords)
                    else:
                        # Just a string: append it as a list of tokens, trying to extract any
                        # otherwise-unmarked pinyin in the sentence for colorisation etc
                        words.append(Word(*tokenize(thing, forcenumeric=True)))

                meanings.append(words)

        return meanings, measurewords
Ejemplo n.º 5
0
def tokenizetext(text, forcenumeric):
    """Tokenize *text* into pinyin tokens and literal Text tokens.

    Maximal runs of word characters and ":" (covering pinyin letters, tone
    marks, tone numbers and the "u:" umlaut spelling) are handed to
    tokenizeonewitherhua for parsing; everything in between becomes a plain
    Text token.
    """
    tokens = []
    # Raw string avoids the invalid "\w" escape the old non-raw literal relied
    # on; the character class [\w:] matches exactly the same runs as (\w|:)+
    # without re-binding a capture group on every repetition.
    for recognised, match in utils.regexparse(re.compile(r"[\w:]+", re.UNICODE), text):
        if recognised:
            tokens.extend(tokenizeonewitherhua(match.group(0), forcenumeric=forcenumeric))
        else:
            tokens.append(Text(match))

    # TODO: could be much smarter about segmentation here. For example, we could use the
    # pinyin regex to split up run on groups of pinyin-like characters.
    return tokens