Ejemplo n.º 1
0
 def inner(word):
     """
     Yield a ``(reading, parsed meaning)`` pair for every dictionary row whose
     simplified or traditional headword equals ``word``.

     ``dicttable``, ``simptradindex``, ``parseMeaning`` and ``database`` come
     from the surrounding scope, which is not visible in this snippet.
     """
     # Columns we want back from each matching row
     columns = [dicttable.c.Reading, dicttable.c.Translation]

     # A row matches if either written form of the headword is the word
     matchesheadword = sqlalchemy.or_(
         dicttable.c.HeadwordSimplified == word,
         dicttable.c.HeadwordTraditional == word)

     query = sqlalchemy.select(columns, matchesheadword)
     for row in database.selectRows(query):
         reading, meaning = row
         yield (reading, parseMeaning(meaning, simptradindex))
Ejemplo n.º 2
0
 def inner(word):
     """
     Generator over the dictionary entries for ``word``.

     Looks ``word`` up against both the simplified and the traditional
     headword columns of ``dicttable`` and yields one
     ``(reading, parseMeaning(translation, simptradindex))`` tuple per row.
     NOTE(review): ``dicttable`` and ``simptradindex`` are presumably bound by
     the enclosing function - confirm against the full file.
     """
     wanted = [dicttable.c.Reading,
               dicttable.c.Translation]
     condition = sqlalchemy.or_(dicttable.c.HeadwordSimplified == word,
                                dicttable.c.HeadwordTraditional == word)
     rows = database.selectRows(sqlalchemy.select(wanted, condition))

     for pinyin, translation in rows:
         yield (pinyin, parseMeaning(translation, simptradindex))
Ejemplo n.º 3
0
def databaseReadingSource():
    """
    Build a reading source backed by the "CharacterPinyin" database table.

    Returns a 2-tuple ``(1, lookup)``. ``lookup(word)`` queries the table for
    rows whose ChineseCharacter column equals ``word`` and returns a list of
    ``(reading, None)`` tuples, one per row.
    NOTE(review): the leading 1 is presumably a priority/ordering value used
    by the caller - confirm.
    """
    log.info("Loading character reading database")

    table = sqlalchemy.Table("CharacterPinyin", database.metadata, autoload=True)

    def lookup(word):
        # Only the Reading column is selected, so it is element 0 of each row
        query = sqlalchemy.select([table.c.Reading], table.c.ChineseCharacter == word)
        return [(row[0], None) for row in database.selectRows(query)]

    return (1, lookup)
Ejemplo n.º 4
0
class Pinyin(object):
    """
    A piece of pinyin text together with its tone information.

    Instances carry three attributes, all set by the constructor:
      word      -- the pinyin text (parse() stores it with the tone removed)
      toneinfo  -- a ToneInfo object describing the tone
      htmlattrs -- a dictionary of extra attributes (empty when none are given)
    """

    # Build the set of all the possible pinyin syllables, lower-cased, plus a bare "r".
    # NB: we have to delay-load this in order to give the UI a chance to create the database if it is missing
    # NB: we only need to consider the ü versions because the set is used to check *after* we have normalised to ü
    validpinyin = utils.Thunk(lambda: set(
        ["r"] + [
            substituteForUUmlaut(pinyin[0]).lower()
            for pinyin in database.selectRows(sqlalchemy.select(
                [sqlalchemy.Table("PinyinSyllables", database.metadata, autoload=True).c.Pinyin]))
        ]))

    def __init__(self, word, toneinfo, htmlattrs=None):
        """
        Store the word and tone.

        word      -- the pinyin text
        toneinfo  -- either a ToneInfo, or a plain int which is wrapped as
                     ToneInfo(written=toneinfo)
        htmlattrs -- optional dictionary; a falsy value becomes a fresh {}
        """
        self.word = word

        if isinstance(toneinfo, int):
            # Convenience constructor: build a ToneInfo from a simple number
            self.toneinfo = ToneInfo(written=toneinfo)
        else:
            self.toneinfo = toneinfo

        self.htmlattrs = htmlattrs or {}

    @property
    def iser(self):
        """True when the word is "r" (ignoring case) and the written tone is 5."""
        return self.word.lower() == u"r" and self.toneinfo.written == 5

    def __str__(self):
        return self.__unicode__()

    def __unicode__(self):
        # The default textual form is numeric pinyin with the tone 5 digit left off
        return self.numericformat(hideneutraltone=True)

    def __repr__(self):
        return u"Pinyin(%s, %s%s)" % (repr(self.word), repr(self.toneinfo), opt_dict_arg_repr(self.htmlattrs))

    def __eq__(self, other):
        """Equal only to objects of exactly the same class with equal tone, word and htmlattrs."""
        # Identity test for None: "== None" would dispatch to a user-defined __eq__
        if other is None or other.__class__ != self.__class__:
            return False

        return self.toneinfo == other.toneinfo and self.word == other.word and self.htmlattrs == other.htmlattrs

    def __ne__(self, other):
        return not self.__eq__(other)

    def accept(self, visitor):
        """Dispatch to visitor.visitPinyin(self) and return whatever it returns."""
        return visitor.visitPinyin(self)

    def numericformat(self, hideneutraltone=False, tone="written"):
        """
        Return the word followed by its tone number.

        hideneutraltone -- if true, a tone of 5 is omitted and just the word is returned
        tone            -- name of the ToneInfo attribute to read the tone number from
        """
        tonenumber = getattr(self.toneinfo, tone)
        if hideneutraltone and tonenumber == 5:
            return self.word

        return self.word + str(tonenumber)

    def tonifiedformat(self):
        """Return the result of running the full numeric format (tone always shown) through PinyinTonifier."""
        return PinyinTonifier().tonify(self.numericformat(hideneutraltone=False))

    @classmethod
    def parse(cls, text, forcenumeric=False):
        """
        Constructs a Pinyin object from text representing a single character and numeric tone mark
        or an embedded tone mark on one of the letters.

        text         -- the pinyin text to parse
        forcenumeric -- if true, text that does not end in a digit is rejected
                        rather than being searched for combining tone marks

        Returns a new Pinyin. Text with neither a trailing digit nor a
        combining tone mark gets tone 5.

        Raises ValueError if the text is shorter than 2 or longer than 8
        characters, if forcenumeric is set and there is no trailing digit,
        if more than one kind of combining tone mark is present, or if the
        toneless word is not in the valid pinyin set.

        The example needs the pinyin database to be available, so it is not
        run automatically:

        >>> print(Pinyin.parse("hen3"))  # doctest: +SKIP
        hen3
        """
        # Normalise u: and v: into umlauted version:
        # NB: might think about doing lower() here, as some dictionary words have upper case (e.g. proper names)
        text = substituteForUUmlaut(text)

        # Length check (yes, you can get 7 character pinyin, such as zhuang1.
        # If the u had an umlaut then it would be 8 'characters' to Python)
        if len(text) < 2 or len(text) > 8:
            raise ValueError(u"The text '%s' was not the right length to be Pinyin - should be in the range 2 to 7 characters" % text)

        # Does it look like we have a non-tonified string?
        if text[-1].isdigit():
            # Extract the tone number directly
            toneinfo = ToneInfo(written=int(text[-1]))
            word = text[:-1]
        elif forcenumeric:
            # Whoops. Should have been numeric but wasn't!
            raise ValueError(u"No tone mark present on purportely-numeric pinyin '%s'" % text)
        else:
            # Separate combining marks (NFD = Normal Form Decomposed) so it
            # is easy to spot the combining marks
            text = unicodedata.normalize('NFD', text)

            # Remove the combining mark to get the tone
            toneinfo, word = None, text
            for n, tonecombiningmark in enumerate(tonecombiningmarks):
                if tonecombiningmark != "" and tonecombiningmark in text:
                    # Two marks on the same string is an error
                    if toneinfo is not None:
                        raise ValueError(u"Too many combining tone marks on the input pinyin '%s'" % text)

                    # Record the corresponding tone and remove the combining mark
                    toneinfo = ToneInfo(written=n+1)
                    word = word.replace(tonecombiningmark, "")

            # No combining mark? Fall back on the unmarked 5th tone
            if toneinfo is None:
                toneinfo = ToneInfo(written=5)

            # Recombine for consistency of comparisons in the application (everything else assumes NFC)
            word = unicodedata.normalize('NFC', word)

        # Sanity check to catch English/French/whatever that doesn't look like pinyin
        if word.lower() not in cls.validpinyin():
            log.info("Couldn't find %s in the valid pinyin list", word)
            raise ValueError(u"The proposed pinyin '%s' doesn't look like pinyin after all" % text)

        # We now have a word and tone info, whichever route we took
        return Pinyin(word, toneinfo)
Ejemplo n.º 5
0
def databaseReadingSource():
    """
    Create a reading lookup on top of the "CharacterPinyin" table.

    The table is reflected once, when this function is called. The return
    value is the pair ``(1, readingsfor)``, where ``readingsfor(word)`` runs a
    query for rows whose ChineseCharacter equals ``word`` and returns a list
    of ``(reading, None)`` tuples.
    NOTE(review): the meaning of the constant 1 is not visible here - verify
    against the caller.
    """
    log.info("Loading character reading database")

    pinyintable = sqlalchemy.Table("CharacterPinyin", database.metadata, autoload=True)

    def readingsfor(word):
        found = []
        ischaracter = pinyintable.c.ChineseCharacter == word
        for row in database.selectRows(sqlalchemy.select([pinyintable.c.Reading], ischaracter)):
            # Single-column select: the reading is the first element of the row
            found.append((row[0], None))
        return found

    return 1, readingsfor