def inner(word):
    """
    Lazily yield one (reading, meaning) pair per dictionary row whose
    simplified or traditional headword equals `word`.

    The first element is the row's Reading column as returned by the
    database; the second is the row's Translation column after being passed
    through parseMeaning together with simptradindex.
    """
    # A row matches if the word is either form of the headword
    headwordmatches = sqlalchemy.or_(
        dicttable.c.HeadwordSimplified == word,
        dicttable.c.HeadwordTraditional == word)
    query = sqlalchemy.select(
        [dicttable.c.Reading, dicttable.c.Translation],
        headwordmatches)

    for row in database.selectRows(query):
        reading, meaning = row
        yield (reading, parseMeaning(meaning, simptradindex))
def databaseReadingSource():
    """
    Build a reading source backed by the "CharacterPinyin" database table.

    Logs that the database is being loaded, reflects the table, and returns a
    2-tuple (1, lookup). lookup(word) queries the table for rows whose
    ChineseCharacter column equals `word` and returns a list of
    (reading, None) pairs, one per row found.
    """
    log.info("Loading character reading database")
    pinyintable = sqlalchemy.Table("CharacterPinyin", database.metadata, autoload=True)

    def lookup(word):
        query = sqlalchemy.select([pinyintable.c.Reading],
                                  pinyintable.c.ChineseCharacter == word)

        # Each row holds a single column: the reading itself
        found = []
        for row in database.selectRows(query):
            found.append((row[0], None))
        return found

    # NOTE(review): the leading 1 is presumably a rank/priority for this source - confirm with the caller
    return 1, lookup
class Pinyin(object):
    """
    A single pinyin syllable plus its tone information.

    Instances carry three attributes, all set by __init__:
      word      -- the syllable text, without any tone number
      toneinfo  -- the tone information (a ToneInfo when built from an int)
      htmlattrs -- a dictionary of HTML attributes (empty if none were given)
    """

    # Build the set of all the possible pinyin syllables (lower-cased), plus "r".
    # NB: we have to delay-load this in order to give the UI a chance to create the database if it is missing
    # NB: we only need to consider the ü versions because the set is used to check *after* we have normalised to ü
    validpinyin = utils.Thunk(lambda: set(["r"] + [substituteForUUmlaut(pinyin[0]).lower() for pinyin in database.selectRows(sqlalchemy.select([sqlalchemy.Table("PinyinSyllables", database.metadata, autoload=True).c.Pinyin]))]))

    def __init__(self, word, toneinfo, htmlattrs=None):
        """
        Create a Pinyin from a syllable and its tone.

        word      -- the syllable text
        toneinfo  -- either tone information to store as-is, or a plain int,
                     in which case ToneInfo(written=toneinfo) is stored
        htmlattrs -- optional dictionary of attributes; any falsy value
                     (including None) is replaced by a fresh empty dict
        """
        self.word = word

        if isinstance(toneinfo, int):
            # Convenience constructor: build a ToneInfo from a simple number
            self.toneinfo = ToneInfo(written=toneinfo)
        else:
            self.toneinfo = toneinfo

        self.htmlattrs = htmlattrs or {}

    @property
    def iser(self):
        """True when this is the syllable "r" (any case) written with tone 5."""
        return self.word.lower() == u"r" and self.toneinfo.written == 5

    def __str__(self):
        return self.__unicode__()

    def __unicode__(self):
        # The textual form is the numeric format with the neutral tone number omitted
        return self.numericformat(hideneutraltone=True)

    def __repr__(self):
        return u"Pinyin(%s, %s%s)" % (repr(self.word), repr(self.toneinfo), opt_dict_arg_repr(self.htmlattrs))

    def __eq__(self, other):
        """Equal only to objects of exactly the same class with equal tone info, word and attributes."""
        # Identity test for None: avoids dispatching through a user-defined __eq__
        if other is None or other.__class__ != self.__class__:
            return False

        return self.toneinfo == other.toneinfo and self.word == other.word and self.htmlattrs == other.htmlattrs

    def __ne__(self, other):
        return not (self.__eq__(other))

    def accept(self, visitor):
        """Visitor dispatch: returns whatever visitor.visitPinyin(self) returns."""
        return visitor.visitPinyin(self)

    def numericformat(self, hideneutraltone=False, tone="written"):
        """
        Return the word followed by its tone number.

        hideneutraltone -- if true, a tone of 5 is omitted and just the word is returned
        tone            -- name of the attribute of toneinfo to read the tone number from
        """
        if hideneutraltone and getattr(self.toneinfo, tone) == 5:
            return self.word
        else:
            return self.word + str(getattr(self.toneinfo, tone))

    def tonifiedformat(self):
        """Return the result of running PinyinTonifier over the full numeric format (tone 5 included)."""
        return PinyinTonifier().tonify(self.numericformat(hideneutraltone=False))

    @classmethod
    def parse(cls, text, forcenumeric=False):
        """
        Constructs a Pinyin object from text representing a single character and numeric
        tone mark or an embedded tone mark on one of the letters.

        For example, Pinyin.parse("hen3") gives a Pinyin whose word is "hen" and
        whose written tone is 3. Parsing consults the valid pinyin list, so it
        needs the database to be available.

        text         -- the text to parse
        forcenumeric -- if true, text that does not end in a digit is rejected
                        rather than being searched for combining tone marks

        Raises ValueError if the text is the wrong length, if forcenumeric is set
        and there is no trailing digit, if more than one kind of combining tone
        mark is present, or if the remaining word is not in the valid pinyin list.
        """
        # Normalise u: and v: into umlauted version:
        # NB: might think about doing lower() here, as some dictionary words have upper case (e.g. proper names)
        text = substituteForUUmlaut(text)

        # Length check (yes, you can get 7 character pinyin, such as zhuang1.
        # If the u had an umlaut then it would be 8 'characters' to Python)
        if len(text) < 2 or len(text) > 8:
            raise ValueError(u"The text '%s' was not the right length to be Pinyin - should be in the range 2 to 7 characters" % text)

        # Does it look like we have a non-tonified string?
        if text[-1].isdigit():
            # Extract the tone number directly
            toneinfo = ToneInfo(written=int(text[-1]))
            word = text[:-1]
        elif forcenumeric:
            # Whoops. Should have been numeric but wasn't!
            raise ValueError(u"No tone mark present on purportely-numeric pinyin '%s'" % text)
        else:
            # Separate combining marks (NFD = Normal Form Decomposed) so it
            # is easy to spot the combining marks
            text = unicodedata.normalize('NFD', text)

            # Remove the combining mark to get the tone
            toneinfo, word = None, text
            for n, tonecombiningmark in enumerate(tonecombiningmarks):
                if tonecombiningmark != "" and tonecombiningmark in text:
                    # Two marks on the same string is an error
                    if toneinfo is not None:
                        raise ValueError(u"Too many combining tone marks on the input pinyin '%s'" % text)

                    # Record the corresponding tone and remove the combining mark
                    toneinfo = ToneInfo(written=n+1)
                    word = word.replace(tonecombiningmark, "")

            # No combining mark? Fall back on the unmarked 5th tone
            if toneinfo is None:
                toneinfo = ToneInfo(written=5)

            # Recombine for consistency of comparisons in the application (everything else assumes NFC)
            word = unicodedata.normalize('NFC', word)

        # Sanity check to catch English/French/whatever that doesn't look like pinyin
        if word.lower() not in cls.validpinyin():
            log.info("Couldn't find %s in the valid pinyin list", word)
            raise ValueError(u"The proposed pinyin '%s' doesn't look like pinyin after all" % text)

        # We now have a word and tone info, whichever route we took
        return Pinyin(word, toneinfo)
def databaseReadingSource():
    """
    Create the character reading source that reads from the "CharacterPinyin" table.

    Returns the pair (1, finder), where finder(word) selects the Reading column
    of every row whose ChineseCharacter column equals `word` and returns those
    readings as a list of (reading, None) tuples.
    """
    # NOTE(review): an identically-named function with the same body also appears
    # elsewhere in this file - confirm whether both definitions are needed.
    log.info("Loading character reading database")
    charactertable = sqlalchemy.Table("CharacterPinyin", database.metadata, autoload=True)

    def finder(word):
        matchesword = charactertable.c.ChineseCharacter == word
        query = sqlalchemy.select([charactertable.c.Reading], matchesword)
        rows = database.selectRows(query)
        # Only one column was selected, so the reading is the first item of each row
        return [(row[0], None) for row in rows]

    return 1, finder