Example #1
0
    def generatemeanings(self, expression, dictmeanings):
        """Turn raw dictionary meanings into the formatted text for the meaning field.

        Every meaning is run through transformations.tonesandhi, has the expression
        masked out of it when self.config.hanzimasking is on, and is then flattened
        with preparetokens. Blank results are dropped and any meaning of the form
        'surname Foo' is replaced by "(a surname)".

        expression   -- the expression being looked up; only used for Hanzi masking
        dictmeanings -- the meanings found for the expression, or None if there were none

        Returns the result of self.config.formatmeanings for the meanings that
        survive, or None if there is nothing to put in the field.
        """
        if dictmeanings is None:
            # We didn't get any meanings, don't update the field
            return None

        # Consider sandhi in meanings - you never know, there might be some!
        dictmeanings = [transformations.tonesandhi(dictmeaning) for dictmeaning in dictmeanings]

        if self.config.hanzimasking:
            # Hanzi masking is on: scan through the meanings and remove the expression itself
            dictmeanings = [transformations.maskhanzi(expression, self.config.formathanzimaskingcharacter(), dictmeaning)
                            for dictmeaning in dictmeanings]

        # Prepare all the meanings by flattening them and removing empty entries.
        # NB: strip has to be *called* here - comparing the bound method itself with ''
        # is always unequal, which would let blank meanings through.
        flattened = [preparetokens(self.config, dictmeaning) for dictmeaning in dictmeanings]
        meanings = [meaning for meaning in flattened if meaning.strip() != '']

        surnameprefix = "surname "

        def masksurname(what):
            # Replace 'surname Foo' with a masked version. Only a single word may follow
            # the prefix, so we have to look at the whole tail (a slice), not just at
            # the first character after the prefix.
            if what.lower().startswith(surnameprefix):
                name = what[len(surnameprefix):]
                if name != "" and " " not in name:
                    return "(a surname)"
            return what

        meanings = [masksurname(meaning) for meaning in meanings]

        if not meanings:
            # After flattening and stripping, we didn't get any meanings: don't update the field
            return None

        # Use the configuration to insert numbering etc
        return self.config.formatmeanings(meanings)
Example #2
0
   def expression2dictreading(self, expression):
       """Return the dictionary reading of expression with tone sandhi applied.

       The sources below are consulted in order, and the first one that returns
       something other than None wins. Its result is passed through
       transformations.tonesandhi before being returned.

       Raises AssertionError if every source returns None, which is not expected
       because the CEDICT lookup is supposed to always succeed.
       """
       dictreadingsources = [
               # Get the reading by considering the text as a (Western) number
               lambda: numbers.readingfromnumberlike(expression, self.dictionary),
               # Use CEDICT to get reading (always succeeds)
               lambda: self.dictionary.reading(expression)
           ]

       # Find the first source that returns a sensible reading. None is the only
       # "no answer" marker, so test for identity rather than equality.
       for lookup in dictreadingsources:
           dictreading = lookup()
           if dictreading is not None:
               return transformations.tonesandhi(dictreading)

       raise AssertionError("The CEDICT reading lookup should always succeed, but it failed on %s" % expression)
Example #3
0
   def expression2dictreading(self, expression):
       """Return the dictionary reading of expression with tone sandhi applied.

       The sources below are consulted in order, and the first one that returns
       something other than None wins. Its result is passed through
       transformations.tonesandhi before being returned.

       Raises AssertionError if every source returns None, which is not expected
       because the CEDICT lookup is supposed to always succeed.
       """
       dictreadingsources = [
               # Get the reading by considering the text as a (Western) number
               lambda: numbers.readingfromnumberlike(expression, self.dictionary),
               # Use CEDICT to get reading (always succeeds)
               lambda: self.dictionary.reading(expression)
           ]

       # Find the first source that returns a sensible reading. None is the only
       # "no answer" marker, so test for identity rather than equality.
       for lookup in dictreadingsources:
           dictreading = lookup()
           if dictreading is not None:
               return transformations.tonesandhi(dictreading)

       raise AssertionError("The CEDICT reading lookup should always succeed, but it failed on %s" % expression)
Example #4
0
 def mergeddictmwdictreading2mwaudio(self, mergeddictmws, noundictreading):
     """Generate audio that reads each measure word together with the noun.

     mergeddictmws   -- iterable of pairs; only the second item of each pair (the
                        pinyin words of the measure word) is used here
     noundictreading -- the reading of the noun, which must NOT have had tone
                        sandhi applied yet

     For every measure word the spoken sequence is <random number> <mw> <noun>
     followed by a comma token. Returns whatever generateaudio returns for the
     sandhi-adjusted sequence.
     """
     spoken = []
     for _, mwwords in mergeddictmws:
         # NB: the quantity token is encoded explicitly instead of being looked up,
         # because e.g. 几 has several readings and here we know exactly which one
         # we want, so the ambiguity can be avoided
         quantity = model.Word(random.choice(numbers.hanziquantitypinyin))
         spoken.append(quantity)
         spoken.extend(mwwords)
         spoken.extend(noundictreading)

         # The comma has no effect at the moment, but could become useful if the
         # audio code ever learns to generate delays
         separator = model.Word(model.Text(", "))
         spoken.append(separator)

     # Sandhi is applied only now, over the complete sequence: the inputs (the
     # noundictreading in particular) have deliberately been kept sandhi-free up to
     # this point. Probably makes no difference with the current implementation,
     # but better safe than sorry.
     withsandhi = transformations.tonesandhi(spoken)
     return generateaudio(self.notifier, self.mediamanager, self.config, withsandhi)
Example #5
0
 def mergeddictmwdictreading2mwaudio(self, mergeddictmws, noundictreading):
     """Generate audio that reads each measure word together with the noun.

     mergeddictmws   -- iterable of pairs; only the second item of each pair (the
                        pinyin words of the measure word) is used here
     noundictreading -- the reading of the noun, which must NOT have had tone
                        sandhi applied yet

     For every measure word the spoken sequence is <random number> <mw> <noun>
     followed by a comma token. Returns whatever generateaudio returns for the
     sandhi-adjusted sequence.
     """
     spoken = []
     for _, mwwords in mergeddictmws:
         # NB: the quantity token is encoded explicitly instead of being looked up,
         # because e.g. 几 has several readings and here we know exactly which one
         # we want, so the ambiguity can be avoided
         quantity = model.Word(random.choice(numbers.hanziquantitypinyin))
         spoken.append(quantity)
         spoken.extend(mwwords)
         spoken.extend(noundictreading)

         # The comma has no effect at the moment, but could become useful if the
         # audio code ever learns to generate delays
         separator = model.Word(model.Text(", "))
         spoken.append(separator)

     # Sandhi is applied only now, over the complete sequence: the inputs (the
     # noundictreading in particular) have deliberately been kept sandhi-free up to
     # this point. Probably makes no difference with the current implementation,
     # but better safe than sorry.
     withsandhi = transformations.tonesandhi(spoken)
     return generateaudio(self.notifier, self.mediamanager, self.config, withsandhi)
Example #6
0
    def generatemeanings(self, expression, dictmeanings):
        """Turn raw dictionary meanings into the formatted text for the meaning field.

        Every meaning is run through transformations.tonesandhi, has the expression
        masked out of it when self.config.hanzimasking is on, and is then flattened
        with preparetokens. Blank results are dropped and any meaning of the form
        'surname Foo' is replaced by "(a surname)".

        expression   -- the expression being looked up; only used for Hanzi masking
        dictmeanings -- the meanings found for the expression, or None if there were none

        Returns the result of self.config.formatmeanings for the meanings that
        survive, or None if there is nothing to put in the field.
        """
        if dictmeanings is None:
            # We didn't get any meanings, don't update the field
            return None

        # Consider sandhi in meanings - you never know, there might be some!
        dictmeanings = [
            transformations.tonesandhi(dictmeaning)
            for dictmeaning in dictmeanings
        ]

        if self.config.hanzimasking:
            # Hanzi masking is on: scan through the meanings and remove the expression itself
            dictmeanings = [
                transformations.maskhanzi(
                    expression, self.config.formathanzimaskingcharacter(),
                    dictmeaning) for dictmeaning in dictmeanings
            ]

        # Prepare all the meanings by flattening them and removing empty entries.
        # NB: strip has to be *called* here - comparing the bound method itself with ''
        # is always unequal, which would let blank meanings through.
        flattened = [
            preparetokens(self.config, dictmeaning)
            for dictmeaning in dictmeanings
        ]
        meanings = [meaning for meaning in flattened if meaning.strip() != '']

        surnameprefix = "surname "

        def masksurname(what):
            # Replace 'surname Foo' with a masked version. Only a single word may follow
            # the prefix, so we have to look at the whole tail (a slice), not just at
            # the first character after the prefix.
            if what.lower().startswith(surnameprefix):
                name = what[len(surnameprefix):]
                if name != "" and " " not in name:
                    return "(a surname)"
            return what

        meanings = [masksurname(meaning) for meaning in meanings]

        if not meanings:
            # After flattening and stripping, we didn't get any meanings: don't update the field
            return None

        # Use the configuration to insert numbering etc
        return self.config.formatmeanings(meanings)
Example #7
0
    def updatefact(self, fact, expression):
        """Fill in, or auto-blank, the generated fields of fact for expression.

        fact       -- mapping from field name to field text; it is modified in place
        expression -- the text to look up; None or a blank string triggers auto-blanking

        With no expression, the generated fields are emptied and nothing else
        happens. Otherwise each field in the updaters table below is filled in,
        provided that the field exists on the fact, its updater is enabled in the
        configuration and (with the exceptions listed further down) the field is
        currently empty. Always returns None.
        """
        # AutoBlanking Feature - If there is no expression, zeros relevant fields
        # DEBUG - add feature to store the text when a lookup is performed. When new text is entered then allow auto-blank any field that has not been edited
        if expression is None or expression.strip() == u"":
            for key in ["reading", "meaning", "color", "trad", "simp", "weblinks"]:
                if key in fact:
                    fact[key] = u""

            # DEBUG Me - Auto generated pinyin should be at least "[sound:" + ".xxx]" (12 characters) plus pinyin (max 6). i.e. 18
            # DEBUG - Split string around "][" to get the audio of each sound in an array. Blank the field unless any one string is longer than 20 characters
            # Exploit the fact that pinyin text-to-speech pinyin should be no longer than 18 characters to guess that anything longer is user generated
            # MaxB comment: I don't think that will work, because we import the Chinese-Lessons.com Mandarin Sounds into anki and it gives them /long/ names.
            # Instead, how about we check if all of the audio files referenced are files in the format pinyin<tone>.mp3?
            if 'audio' in fact and len(fact['audio']) < 40:
                fact['audio'] = u""

            # For now this is a compromise in safety and function.
            # longest MW should be: "? - zhangì (9 char)
            # shortest possible is "? - ge" 6 char so we will autoblank if less than 12 letters
            # this means blanking will occur if one measure word is there but not if two (so if user added any they are safe)
            if 'mw' in fact and len(fact['mw']) < 12:
                fact['mw'] = u""

            # TODO: Nick added this to give up after auto-blanking. He claims it removes a minor
            # delay, but I'm not sure where the delay originates from, which worries me:
            return

        # Apply tone sandhi: this information is needed both by the sound generation
        # and the colorisation, so we can't do it in generatereading
        dictreading = self.getdictreading(expression)
        dictreadingsandhi = transformations.tonesandhi(dictreading)

        # The updaters below refer to these names even when the meaning lookup is
        # skipped, so bind them up front instead of risking a NameError later on
        meaning = None
        dictmeasurewords = None

        # Preload the meaning, but only if we absolutely must
        if self.config.needmeanings:
            dictmeaningssources = [
                    # Use CEDICT to get meanings
                    (None,
                     lambda: self.dictionary.meanings(expression, self.config.prefersimptrad)),
                    # Interpret Hanzi as numbers. NB: only consult after CEDICT so that we
                    # handle curious numbers such as 'liang' using the dictionary
                    (None,
                     lambda: (numberutils.meaningfromnumberlike(expression, self.dictionary), None))
                ] + (self.config.shouldusegoogletranslate and [
                    # If the dictionary can't answer our question, ask Google Translate.
                    # If there is a long word followed by another word then this will be treated as a phrase.
                    # Phrases are also queried using googletranslate rather than the local dictionary.
                    # This helps deal with small dictionaries (for example French)
                    ('<br /><span style="color:gray"><small>[Google Translate]</small></span><span> </span>',
                     lambda: (dictionaryonline.gTrans(expression, self.config.dictlanguage), None))
                ] or [])

            # Find the first source that returns a sensible meaning. Each source is a pair
            # of (attribution text or None, lookup returning (meanings, measure words)).
            for dictmeaningssource, lookup in dictmeaningssources:
                dictmeanings, dictmeasurewords = lookup()
                if dictmeanings is not None or dictmeasurewords is not None:
                    break

            # If the user wants the measure words to be folded into the definition or there
            # is no MW field for us to split them out into, fold them in there
            if not(self.config.detectmeasurewords) or "mw" not in fact:
                # NB: do NOT overwrite the old dictmeasurewords, because we still want to use the
                # measure words for e.g. measure word audio generation
                dictmeanings = dictionary.combinemeaningsmws(dictmeanings, dictmeasurewords)

            # NB: expression only used for Hanzi masking here
            meaning = self.generatemeanings(expression, dictmeanings)
            if meaning and dictmeaningssource:
                # Append attribution to the meaning if we have any
                meaning = meaning + dictmeaningssource

        # Generate translations of the expression into simplified/traditional on-demand
        expressionviews = utils.FactoryDict(lambda simptrad: self.generateincharactersystem(expression, simptrad))

        # Update the expression if the option is turned on and the preferred simp/trad form
        # is different to the expression (i.e. it needs correcting)
        expressionupdated = False
        if self.config.forceexpressiontobesimptrad and (expression != expressionviews[self.config.prefersimptrad]):
            expression = expressionviews[self.config.prefersimptrad]
            expressionupdated = True

        # Do the updates on the fields the user has requested:
        # NB: when adding an updater to this list, make sure that you have
        # added it to the updatecontrolflags dictionary in Config as well!
        updaters = {
                'expression' : lambda: expression,
                'reading'    : lambda: self.generatereading(dictreadingsandhi),
                'meaning'    : lambda: meaning,
                'mw'         : lambda: self.generatemeasureword(self.config.detectmeasurewords and dictmeasurewords or None),
                'audio'      : lambda: self.generateaudio(dictreadingsandhi),
                'mwaudio'    : lambda: self.generatemwaudio(dictreading, dictmeasurewords),
                'color'      : lambda: self.generatecoloredcharacters(expression),
                'trad'       : lambda: (expressionviews["trad"] != expressionviews["simp"]) and expressionviews["trad"] or None,
                'simp'       : lambda: (expressionviews["trad"] != expressionviews["simp"]) and expressionviews["simp"] or None,
                'weblinks'   : lambda: self.weblinkgeneration(expression)
            }

        # Loop through each field, deciding whether to update it or not
        for key, updater in updaters.items():
            # A hint for reading this method: read the stuff inside the if not(...):
            # as an assertion that has to be valid before we can proceed with the update.

            # If this option has been disabled or the field isn't present then jump to the next update.
            # Expression is always updated because some parts of the code call updatefact with an expression
            # that is not yet set on the fact, and we need to make sure that it arrives. This is OK, because
            # we only actually modify a directly user-entered expression when forceexpressiontobesimptrad is on.
            #
            # NB: please do NOT do this if key isn't in updatecontrolflags, because that
            # indicates an error with the Toolkit that I'd like to get an exception for!
            if not(key in fact and (key == "expression" or updatecontrolflags[key] is None or self.config.settings[updatecontrolflags[key]])):
                continue

            # If the field is not empty already then skip (so we don't overwrite it), unless:
            # a) this is the expression field, which should always be over-written with simp/trad
            # b) this is the weblinks field, which must always be up to date
            # c) this is the color field and we have just forced the expression to change,
            #    in which case we'd like to overwrite the colored characters regardless
            if not(fact[key].strip() == u"" or key in ["expression", "weblinks"] or (key == "color" and expressionupdated)):
                continue

            # Fill the field with the new value, but only if we have one and it is necessary to do so
            value = updater()
            if value is not None and value != fact[key]:
                fact[key] = value
Example #8
0
 def generatecoloredcharacters(self, expression):
     """Return the tone-colored rendering of expression.

     The toned characters of the expression are taken from self.dictionary,
     passed through tone sandhi, colorized with self.config.tonecolors and
     finally handed to model.flatten, whose result is returned.
     """
     tonecolors = self.config.tonecolors
     tonedcharacters = self.dictionary.tonedchars(expression)
     withsandhi = transformations.tonesandhi(tonedcharacters)
     colorized = transformations.colorize(tonecolors, withsandhi)
     return model.flatten(colorized)
Example #9
0
    def updatefact(self, fact, expression):
        """Fill in, or auto-blank, the generated fields of fact for expression.

        fact       -- mapping from field name to field text; it is modified in place
        expression -- the text to look up; None or a blank string triggers auto-blanking

        With no expression, the generated fields are emptied and nothing else
        happens. Otherwise each field in the updaters table below is filled in,
        provided that the field exists on the fact, its updater is enabled in the
        configuration and (with the exceptions listed further down) the field is
        currently empty. Always returns None.
        """
        # AutoBlanking Feature - If there is no expression, zeros relevant fields
        # DEBUG - add feature to store the text when a lookup is performed. When new text is entered then allow auto-blank any field that has not been edited
        if expression is None or expression.strip() == u"":
            for key in [
                    "reading", "meaning", "color", "trad", "simp", "weblinks"
            ]:
                if key in fact:
                    fact[key] = u""

            # DEBUG Me - Auto generated pinyin should be at least "[sound:" + ".xxx]" (12 characters) plus pinyin (max 6). i.e. 18
            # DEBUG - Split string around "][" to get the audio of each sound in an array. Blank the field unless any one string is longer than 20 characters
            # Exploit the fact that pinyin text-to-speech pinyin should be no longer than 18 characters to guess that anything longer is user generated
            # MaxB comment: I don't think that will work, because we import the Chinese-Lessons.com Mandarin Sounds into anki and it gives them /long/ names.
            # Instead, how about we check if all of the audio files referenced are files in the format pinyin<tone>.mp3?
            if 'audio' in fact and len(fact['audio']) < 40:
                fact['audio'] = u""

            # For now this is a compromise in safety and function.
            # longest MW should be: "? - zhangì (9 char)
            # shortest possible is "? - ge" 6 char so we will autoblank if less than 12 letters
            # this means blanking will occur if one measure word is there but not if two (so if user added any they are safe)
            if 'mw' in fact and len(fact['mw']) < 12:
                fact['mw'] = u""

            # TODO: Nick added this to give up after auto-blanking. He claims it removes a minor
            # delay, but I'm not sure where the delay originates from, which worries me:
            return

        # Apply tone sandhi: this information is needed both by the sound generation
        # and the colorisation, so we can't do it in generatereading
        dictreading = self.getdictreading(expression)
        dictreadingsandhi = transformations.tonesandhi(dictreading)

        # The updaters below refer to these names even when the meaning lookup is
        # skipped, so bind them up front instead of risking a NameError later on
        meaning = None
        dictmeasurewords = None

        # Preload the meaning, but only if we absolutely must
        if self.config.needmeanings:
            dictmeaningssources = [
                # Use CEDICT to get meanings
                (None, lambda: self.dictionary.meanings(
                    expression, self.config.prefersimptrad)),
                # Interpret Hanzi as numbers. NB: only consult after CEDICT so that we
                # handle curious numbers such as 'liang' using the dictionary
                (None, lambda: (numberutils.meaningfromnumberlike(
                    expression, self.dictionary), None))
            ] + (
                self.config.shouldusegoogletranslate and [
                    # If the dictionary can't answer our question, ask Google Translate.
                    # If there is a long word followed by another word then this will be treated as a phrase.
                    # Phrases are also queried using googletranslate rather than the local dictionary.
                    # This helps deal with small dictionaries (for example French)
                    ('<br /><span style="color:gray"><small>[Google Translate]</small></span><span> </span>',
                     lambda: (dictionaryonline.gTrans(expression, self.config.
                                                      dictlanguage), None))
                ] or [])

            # Find the first source that returns a sensible meaning. Each source is a pair
            # of (attribution text or None, lookup returning (meanings, measure words)).
            for dictmeaningssource, lookup in dictmeaningssources:
                dictmeanings, dictmeasurewords = lookup()
                if dictmeanings is not None or dictmeasurewords is not None:
                    break

            # If the user wants the measure words to be folded into the definition or there
            # is no MW field for us to split them out into, fold them in there
            if not (self.config.detectmeasurewords) or "mw" not in fact:
                # NB: do NOT overwrite the old dictmeasurewords, because we still want to use the
                # measure words for e.g. measure word audio generation
                dictmeanings = dictionary.combinemeaningsmws(
                    dictmeanings, dictmeasurewords)

            # NB: expression only used for Hanzi masking here
            meaning = self.generatemeanings(expression, dictmeanings)
            if meaning and dictmeaningssource:
                # Append attribution to the meaning if we have any
                meaning = meaning + dictmeaningssource

        # Generate translations of the expression into simplified/traditional on-demand
        expressionviews = utils.FactoryDict(
            lambda simptrad: self.generateincharactersystem(
                expression, simptrad))

        # Update the expression if the option is turned on and the preferred simp/trad form
        # is different to the expression (i.e. it needs correcting)
        expressionupdated = False
        if self.config.forceexpressiontobesimptrad and (
                expression != expressionviews[self.config.prefersimptrad]):
            expression = expressionviews[self.config.prefersimptrad]
            expressionupdated = True

        # Do the updates on the fields the user has requested:
        # NB: when adding an updater to this list, make sure that you have
        # added it to the updatecontrolflags dictionary in Config as well!
        updaters = {
            'expression':
            lambda: expression,
            'reading':
            lambda: self.generatereading(dictreadingsandhi),
            'meaning':
            lambda: meaning,
            'mw':
            lambda: self.generatemeasureword(self.config.detectmeasurewords and
                                             dictmeasurewords or None),
            'audio':
            lambda: self.generateaudio(dictreadingsandhi),
            'mwaudio':
            lambda: self.generatemwaudio(dictreading, dictmeasurewords),
            'color':
            lambda: self.generatecoloredcharacters(expression),
            'trad':
            lambda: (expressionviews["trad"] != expressionviews["simp"]
                     ) and expressionviews["trad"] or None,
            'simp':
            lambda: (expressionviews["trad"] != expressionviews["simp"]) and
            expressionviews["simp"] or None,
            'weblinks':
            lambda: self.weblinkgeneration(expression)
        }

        # Loop through each field, deciding whether to update it or not
        for key, updater in updaters.items():
            # A hint for reading this method: read the stuff inside the if not(...):
            # as an assertion that has to be valid before we can proceed with the update.

            # If this option has been disabled or the field isn't present then jump to the next update.
            # Expression is always updated because some parts of the code call updatefact with an expression
            # that is not yet set on the fact, and we need to make sure that it arrives. This is OK, because
            # we only actually modify a directly user-entered expression when forceexpressiontobesimptrad is on.
            #
            # NB: please do NOT do this if key isn't in updatecontrolflags, because that
            # indicates an error with the Toolkit that I'd like to get an exception for!
            if not (key in fact and
                    (key == "expression" or updatecontrolflags[key] is None
                     or self.config.settings[updatecontrolflags[key]])):
                continue

            # If the field is not empty already then skip (so we don't overwrite it), unless:
            # a) this is the expression field, which should always be over-written with simp/trad
            # b) this is the weblinks field, which must always be up to date
            # c) this is the color field and we have just forced the expression to change,
            #    in which case we'd like to overwrite the colored characters regardless
            if not (fact[key].strip() == u""
                    or key in ["expression", "weblinks"] or
                    (key == "color" and expressionupdated)):
                continue

            # Fill the field with the new value, but only if we have one and it is necessary to do so
            value = updater()
            if value is not None and value != fact[key]:
                fact[key] = value
Example #10
0
 def generatecoloredcharacters(self, expression):
     """Return the tone-colored rendering of expression.

     The toned characters of the expression are taken from self.dictionary,
     passed through tone sandhi, colorized with self.config.tonecolors and
     finally handed to model.flatten, whose result is returned.
     """
     tonecolors = self.config.tonecolors
     tonedcharacters = self.dictionary.tonedchars(expression)
     withsandhi = transformations.tonesandhi(tonedcharacters)
     colorized = transformations.colorize(tonecolors, withsandhi)
     return model.flatten(colorized)