def getChars(freqFile, startNo, endNo):
    """
    Read a slice of a tab-separated frequency list and convert its readings.

    :param freqFile: path of the UTF-8 encoded, tab-separated frequency list
    :param startNo: slice start applied to the list of all rows
    :param endNo: slice end (exclusive) applied to the list of all rows
    :return: one three-element list per selected row, holding columns 1, 4
        and 5 of the row; the middle element (column 4) is split on ``'/'``
        and every part is passed through ``ReadingFactory.convert`` as
        numbered-tone Pinyin, each result being followed by one space
    """
    # Close the file as soon as all rows are read instead of leaking the
    # handle.
    with codecs.open(freqFile, 'rb', "utf-8") as freqStream:
        reader = unicode_csv_reader(freqStream, dialect='excel-tab')
        # Materialise first, then slice, so negative or None bounds behave
        # like ordinary list slicing.
        frequencyList = list(reader)[startNo:endNo]

    chars = []
    if not frequencyList:
        return chars

    # One factory serves every row; it does not depend on the row.
    pinyin = ReadingFactory()

    for row in frequencyList:
        templist = [row[i] for i in (1, 4, 5)]
        templist[1] = "".join(
            pinyin.convert(reading,
                           'Pinyin',
                           'Pinyin',
                           sourceOptions={
                               'toneMarkType': 'numbers',
                               'missingToneMark': 'fifth'
                           }) + " "
            for reading in templist[1].split('/'))
        chars.append(templist)

    return chars
Ejemplo n.º 2
0
def getReadingOperator(readingName, readingOptions=None):
    """
    Return the module-wide reading operator, creating it on first use.

    The operator is stored in the module global ``_readingOperator``. While
    a truthy operator is stored there, it is returned as is and the
    arguments of the call are not looked at.

    :param readingName: reading name passed to
        ``ReadingFactory.createReadingOperator``
    :param readingOptions: dict of keyword options for the operator;
        ``None`` (the default) means no options
    :return: the stored reading operator
    """
    global _readingOperator
    if not _readingOperator:
        # ``None`` replaces the former mutable ``{}`` default argument.
        _readingOperator = ReadingFactory().createReadingOperator(
            readingName, **(readingOptions or {}))
    return _readingOperator
Ejemplo n.º 3
0
def getReadingOperator(readingName, readingOptions=None):
    """
    Return the shared reading operator, building it lazily.

    The instance lives in the module global ``_readingOperator``. As long as
    that global holds a truthy value it is returned unchanged, regardless of
    the arguments given.

    :param readingName: name of the reading, forwarded to
        ``ReadingFactory.createReadingOperator``
    :param readingOptions: optional dict of keyword options for the
        operator; ``None`` is treated as "no options"
    :return: the shared reading operator
    """
    global _readingOperator
    if not _readingOperator:
        # Default is None rather than a mutable dict shared between calls.
        options = readingOptions or {}
        readingFactory = ReadingFactory()
        _readingOperator = readingFactory.createReadingOperator(readingName,
                                                                **options)
    return _readingOperator
Ejemplo n.º 4
0
def runTests(tests, databases, registerUnicode, iteration=10):
    """
    Time the dictionary lookup methods for each selected test database.

    For every test number a database connection is opened, the dictionaries
    present in that database are determined, and each of ``getFor``,
    ``getForHeadword``, ``getForReading`` and ``getForTranslation`` is timed
    with ``Timer`` over all ``SEARCH_REQUESTS``.

    :param tests: iterable of test numbers; each one is used to index
        ``databases`` and ``registerUnicode``
    :param databases: maps a test number to the SQLite database file
    :param registerUnicode: maps a test number to the value of the
        ``registerUnicode`` connection setting
    :param iteration: number of executions passed to ``Timer.timeit``
    :return: nested dict ``{test number: {dictionary name: {method name:
        measured time}}}``
    :raise ValueError: if no dictionary is found in a database
    """
    factory = ReadingFactory()
    timedMethods = ('getFor', 'getForHeadword', 'getForReading',
                    'getForTranslation')

    timing = {}
    for no in tests:
        databasePath = databases[no]
        print("Running test %d (reading from %s)..." % (no, databasePath))

        db = dbconnector.getDBConnector({
            'sqlalchemy.url': 'sqlite:///%s' % databasePath,
            'attach': ['cjklib'],
            'registerUnicode': registerUnicode[no],
        })

        # Only dictionaries that are both known and present as tables count.
        available = set(
            dictClass.DICTIONARY_TABLE for dictClass
            in dictionary.BaseDictionary.getAvailableDictionaries(db))
        dictionaries = list(
            available & set(db.engine.table_names(schema=db._mainSchema)))
        if not dictionaries:
            raise ValueError("No dictionaries found")

        print("Found dictionaries '%s'" % "', '".join(dictionaries))

        runTime = {}
        for dictName in dictionaries:
            dictClass = dictionary.BaseDictionary.getDictionaryClass(dictName)
            dictInstance = dictClass(dbConnectInst=db)

            # Operators that can guess a dialect supply per-request options.
            opClass = (dictClass.READING
                       and factory.getReadingOperatorClass(dictClass.READING))
            if hasattr(opClass, 'guessReadingDialect'):
                requestList = [(request, opClass.guessReadingDialect(request))
                               for request in SEARCH_REQUESTS]
            else:
                requestList = [(request, {}) for request in SEARCH_REQUESTS]

            # The timed statement reaches its data through a throw-away
            # module registered under a fixed name.
            runModule = imp.new_module('timeit_runmod')
            runModule.runRequest = runRequest
            runModule.dictInstance = dictInstance
            runModule.requestList = requestList
            sys.modules['timeit_runmod'] = runModule

            methodTime = {}
            for method in timedMethods:
                t = Timer("""timeit_runmod.runRequest(
                                timeit_runmod.dictInstance,
                                timeit_runmod.requestList,
                                method='%s')
                          """ % method,
                          "import timeit_runmod")
                methodTime[method] = t.timeit(iteration)
            runTime[dictName] = methodTime

        timing[no] = runTime

    return timing
Ejemplo n.º 5
0
    def __init__(self, fromReading, toReading, variant=None, **options):
        """
        Initialise the transliterator and create its reading converter.

        :param fromReading: name of the source reading
        :param toReading: name of the target reading
        :param variant: optional variant name, appended to the ID after '/'
        :param options: keyword options forwarded to
            ``ReadingFactory.createReadingConverter``
        """
        # ID has the form "<from>-<to>" or "<from>-<to>/<variant>"
        self.id = '%s-%s' % (fromReading, toReading)
        if variant:
            self.id = self.id + '/' + variant

        icu.Transliterator.__init__(self, self.id)

        factory = ReadingFactory()
        self._conv = factory.createReadingConverter(fromReading, toReading,
                                                    **options)
Ejemplo n.º 6
0
class ReadingConverterTest(NeedsDatabaseTest):
    """
    Base class for testing of
    :class:`~cjklib.reading.converter.ReadingConverter` classes."""
    CONVERSION_DIRECTION = None
    """Tuple of reading names for conversion from reading A to reading B."""

    def setUp(self):
        """
        Run the base class set-up, split the conversion direction into
        source and target reading, look up the converter class declaring
        this direction (``None`` if there is none) and create a reading
        factory on the test database.
        """
        NeedsDatabaseTest.setUp(self)
        self.fromReading, self.toReading = self.CONVERSION_DIRECTION

        for clss in self.getReadingConverterClasses().values():
            if self.CONVERSION_DIRECTION in clss.CONVERSION_DIRECTIONS:
                self.readingConverterClass = clss
                break
        else:
            self.readingConverterClass = None

        self.f = ReadingFactory(dbConnectInst=self.db)

    def shortDescription(self):
        """
        Return the test method's doc string on a single line, stripped of
        epytext markup and followed by the conversion direction.
        """
        methodName = getattr(self, self.id().split('.')[-1])
        # get whole doc string and remove superfluous white spaces
        # (raw strings keep the regex escapes from being read as string
        # escapes)
        noWhitespaceDoc = re.sub(r'\s+', ' ', methodName.__doc__.strip())
        # remove markup for epytext format
        clearName = re.sub(r'[CLI]\{([^\}]*)}', r'\1', noWhitespaceDoc)
        # add information about conversion direction
        return clearName + ' (for %s to %s)' % self.CONVERSION_DIRECTION

    @staticmethod
    def getReadingConverterClasses():
        """
        Gets all classes from the reading module that implement
        :class:`~cjklib.reading.converter.ReadingConverter`.

        :rtype: dictionary of string class pairs
        :return: dictionary of all classes inheriting from
            :class:`~cjklib.reading.converter.ReadingConverter`
        """
        # get all non-abstract classes that inherit from ReadingConverter;
        # a class counts as non-abstract if it declares conversion directions
        readingConverterClasses = dict([(clss.__name__, clss) \
            for clss in converter.__dict__.values() \
            if type(clss) in [types.TypeType, types.ClassType] \
            and issubclass(clss, converter.ReadingConverter) \
            and clss.CONVERSION_DIRECTIONS])

        return readingConverterClasses

    def tearDown(self):
        """Clear the reading factory's cache after each test."""
        # get rid of the possibly > 1000 instances
        self.f.clearCache()
Ejemplo n.º 7
0
    def setUp(self):
        """
        Prepare a converter test: run the base set-up, unpack the conversion
        direction, find the converter class that declares it and create a
        reading factory on the test database.
        """
        NeedsDatabaseTest.setUp(self)
        direction = self.CONVERSION_DIRECTION
        self.fromReading, self.toReading = direction

        # first class declaring this direction wins; None if no class does
        matching = None
        for candidate in self.getReadingConverterClasses().values():
            if direction in candidate.CONVERSION_DIRECTIONS:
                matching = candidate
                break
        self.readingConverterClass = matching

        self.f = ReadingFactory(dbConnectInst=self.db)
Ejemplo n.º 8
0
    def setUp(self):
        """
        Run the base set-up, create a reading factory on the test database
        and, if PyICU can be imported, the ICU transliterators for both
        directions of the "Latin-NumericPinyin" rule.

        Without PyICU the attributes ``toNumeric`` and ``fromNumeric`` are
        simply not set.
        """
        NeedsDatabaseTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

        try:
            import PyICU

            forward = PyICU.Transliterator.createInstance(
                "Latin-NumericPinyin", PyICU.UTransDirection.UTRANS_FORWARD)
            self.toNumeric = forward
            self.fromNumeric = forward.createInverse()
        except ImportError:
            # PyICU is optional
            return
Ejemplo n.º 9
0
class ReadingConversion(Base):
    """Strategy converting an entry's reading string to a target reading."""
    def __init__(self, toReading=None, targetOptions=None):
        """
        Create the conversion strategy.

        :type toReading: str
        :param toReading: target reading; when omitted the dictionary's own
            reading is used as target
        :type targetOptions: dict
        :param targetOptions: options for the target reading
        """
        Base.__init__(self)
        self.toReading = toReading
        # a missing or empty value becomes a fresh empty dict
        self.targetOptions = targetOptions or {}

    def setDictionaryInstance(self, dictInstance):
        """
        Attach a dictionary and set up the reading factory for it.

        :param dictInstance: dictionary instance, handed to the parent
            implementation
        :raise ValueError: if the dictionary has no ``READING`` or
            ``READING_OPTIONS`` attribute, or if the required conversion is
            not supported
        """
        super(ReadingConversion, self).setDictionaryInstance(dictInstance)

        attached = self._dictInstance
        for required in ('READING', 'READING_OPTIONS'):
            if not hasattr(attached, required):
                raise ValueError('Incompatible dictionary')

        self.fromReading = attached.READING
        self.sourceOptions = attached.READING_OPTIONS
        self._readingFactory = ReadingFactory(dbConnectInst=attached.db)

        target = self.toReading or self.fromReading
        supported = self._readingFactory.isReadingConversionSupported(
            self.fromReading, target)
        if not supported:
            raise ValueError("Conversion from '%s' to '%s' not supported" %
                             (self.fromReading, target))

    def format(self, string):
        """
        Convert ``string`` from the dictionary's reading to the target
        reading.

        :param string: reading string to convert
        :return: converted string, or ``None`` if decomposition, composition
            or conversion fails
        """
        target = self.toReading or self.fromReading
        try:
            converted = self._readingFactory.convert(
                string, self.fromReading, target,
                sourceOptions=self.sourceOptions,
                targetOptions=self.targetOptions)
        except (exception.DecompositionError, exception.CompositionError,
                exception.ConversionError):
            return None
        return converted
Ejemplo n.º 10
0
    def testEveryConverterHasConsistencyTest(self):
        """
        Check that every conversion direction of every reading converter
        class is covered by a consistency test class.
        """
        testClasses = self.getReadingConverterConsistencyTestClasses()
        testClassReadingNames = [clss.CONVERSION_DIRECTION for clss \
            in testClasses]
        self.f = ReadingFactory(dbConnectInst=self.db)

        for clss in self.f.getReadingConverterClasses():
            for direction in clss.CONVERSION_DIRECTIONS:
                # assertTrue replaces the deprecated alias assert_; the
                # message keeps a space before "has" and names the class
                # that is actually looked for.
                self.assertTrue(direction in testClassReadingNames,
                    "Conversion from %s to %s " % direction \
                    + "has no ReadingConverterConsistencyTest")
Ejemplo n.º 11
0
def _decomposeAndRemovePinyinTones(string, type='diacritics'):
    """
    Split a Pinyin string into its entities and strip the tone marks.

    :param string: Pinyin text; anything that is not a ``unicode`` object is
        decoded as UTF-8 first. ``None`` is passed through.
    :param type: value of the ``toneMarkType`` source option describing how
        tones are written in ``string``
    :return: ``None`` for ``None`` input, otherwise the list of converted
        entities in lower case with ``ü`` written as ``v``; entities that
        come out as a single space or an apostrophe are left out
    """
    if string is None:
        return None
    if not isinstance(string, unicode):
        string = unicode(string, 'utf-8')

    from cjklib.reading import ReadingFactory
    factory = ReadingFactory()

    plainEntities = []
    for entity in factory.decompose(string, 'Pinyin'):
        toneless = factory.convert(entity, 'Pinyin', 'Pinyin',
                                   sourceOptions={'toneMarkType': type},
                                   targetOptions={'toneMarkType': 'none'})
        toneless = toneless.lower().replace(u'ü', u'v')
        # drop separators between syllables
        if toneless != ' ' and toneless != "'":
            plainEntities.append(toneless)
    return plainEntities
Ejemplo n.º 12
0
class ReadingConversion(Base):
    """Converts the reading string of entries to a given target reading."""
    def __init__(self, toReading=None, targetOptions=None):
        """
        Set up the conversion strategy.

        :type toReading: str
        :param toReading: target reading; the dictionary's reading is used
            if this is omitted
        :type targetOptions: dict
        :param targetOptions: conversion options for the target reading
        """
        Base.__init__(self)
        self.toReading = toReading
        self.targetOptions = targetOptions if targetOptions else {}

    def setDictionaryInstance(self, dictInstance):
        """
        Bind the strategy to a dictionary and create the reading factory.

        :param dictInstance: dictionary instance, passed on to the parent
            implementation
        :raise ValueError: if ``READING`` or ``READING_OPTIONS`` is missing
            on the dictionary, or the conversion is not supported
        """
        super(ReadingConversion, self).setDictionaryInstance(dictInstance)

        hasReading = hasattr(self._dictInstance, 'READING')
        if not (hasReading and hasattr(self._dictInstance, 'READING_OPTIONS')):
            raise ValueError('Incompatible dictionary')

        self.fromReading = self._dictInstance.READING
        self.sourceOptions = self._dictInstance.READING_OPTIONS

        factory = ReadingFactory(dbConnectInst=self._dictInstance.db)
        self._readingFactory = factory

        targetReading = self.toReading or self.fromReading
        if factory.isReadingConversionSupported(self.fromReading,
                                                targetReading):
            return
        raise ValueError("Conversion from '%s' to '%s' not supported"
            % (self.fromReading, targetReading))

    def format(self, string):
        """
        Convert ``string`` to the target reading.

        :param string: reading string in the dictionary's reading
        :return: the converted string; if decomposition, composition or
            conversion fails, ``string`` is returned unchanged
        """
        targetReading = self.toReading or self.fromReading
        try:
            result = self._readingFactory.convert(
                string, self.fromReading, targetReading,
                sourceOptions=self.sourceOptions,
                targetOptions=self.targetOptions)
        except (exception.DecompositionError, exception.CompositionError,
            exception.ConversionError):
            # wighack: fall back to the unconverted input
            result = string
        return result
Ejemplo n.º 13
0
class ChineseLessonsComCantonesePronunciation(GlobbingPronunciationBuilder):
    """
    Builds an index on pronunciation files for Cantonese provided by
    chinese-lessons.com.
    """
    PROVIDES = "Pronunciation_CantoneseYale"
    DEPENDS = ['CantoneseYaleSyllables']

    BASE_DIRECTORY_NAME = "chineselessionscom_yue"

    # maps the two-letter tone suffix of a file name to the tone name handed
    # to getTonalEntity()
    TONE_ABBREV = {'HT': '1stToneLevel', 'HF': '1stToneFalling',
        'MR': '2ndTone', 'MT': '3rdTone', 'LF': '4thTone', 'LR': '5thTone',
        'LT': '6thTone'}

    def __init__(self, **options):
        """Pass ``options`` to the parent class and create a reading factory."""
        super(ChineseLessonsComCantonesePronunciation, self).__init__(**options)

        self.readingFactory = ReadingFactory()

    def getReadingFromFileName(self, fileName):
        """
        Derive the reading from a file name of the form
        ``<lower-case syllable><tone suffix>.<extension>``.

        :param fileName: name of the pronunciation file
        :return: the tonal entity built for reading ``'CantoneseYale'``, or
            ``None`` if the name does not match the pattern or the entity
            cannot be built
        """
        fileRoot, _ = os.path.splitext(fileName)
        matchObj = re.match('([a-z]+)(HT|HF|MR|MT|LF|LR|LT)$', fileRoot)
        if not matchObj:
            return None

        # groups() takes a default value for unmatched groups, not a list of
        # group indices; both groups always take part in a match here.
        plainSyllable, toneMarker = matchObj.groups()
        tone = self.TONE_ABBREV[toneMarker]
        try:
            return self.readingFactory.getTonalEntity(plainSyllable, tone,
                'CantoneseYale')
        except (exception.UnsupportedError, exception.ConversionError):
            # syllable/tone combination not available in this reading
            return None
Ejemplo n.º 14
0
 def fix_pinyin(self, pinyin):
     """
     Rewrite Pinyin with tone numbers, using cjklib when it is available.

     :param pinyin: Pinyin string to convert
     :return: ``pinyin`` converted with target option
         ``toneMarkType='numbers'`` and every ``'5'`` replaced by ``'0'``;
         the unchanged input if cjklib cannot be imported
     """
     # Hacks. It is overkill to ship cjklib with this add-on. But
     # to get the tone numbers as numbers, we should use it. My
     # hope (guess) is that the typical user that will want Chinese
     # pronunciations will also have TTEMPÉ's (version of mine)
     # chinese-support-plugin installed. So try to use that and
     # don't complain if it doesn't work.
     if not self.have_tried_cjklib_hack:
         try:
             # If this works, the whole shebang is run as an Anki2
             # add-on. If not, we will still look for a system-wide
             # cjklib, but obviously not for another add-on.
             from aqt.utils import isWin
         except Exception:
             # Best effort only; unlike a bare except this lets
             # KeyboardInterrupt and SystemExit through.
             pass
         else:
             from aqt import mw
             addon_dir = mw.pm.addonFolder()
             if isWin:
                 # The isWin bit is copied from TTEMPÉ's code.
                 addon_dir = addon_dir.encode(sys.getfilesystemencoding())
             sys.path.append(os.path.join(addon_dir, "chinese"))
         # Set the very flag tested above, so that the path is appended
         # only once and not on every call.
         self.have_tried_cjklib_hack = True
     if not self.reading_factory:
         try:
             from cjklib.reading import ReadingFactory
         except ImportError:
             return pinyin
         else:
             self.reading_factory = ReadingFactory()
     return self.reading_factory.convert(
         pinyin, 'Pinyin',  'Pinyin', targetOptions={
             'toneMarkType': 'numbers'}).replace('5', '0')
Ejemplo n.º 15
0
    def __init__(self, configfile):
        """
        Set up the converter state: configuration, reading and character
        lookup helpers, statistics containers, the log file and, if
        configured, the Pleco writer.

        :param configfile: passed unchanged to ``self.get_config``
        """
        super(BKRS2DB, self).__init__()

        #statprof.start()

        # NOTE(review): get_config presumably fills self.params, which is
        # read further down - confirm.
        self.get_config(configfile)
        self.comma_symbols = [u',', u'﹐', ',']
        self.BUFFER_SIZE = 10000
        self.buffer_index = 0

        # helper objects; construction order is kept as is
        self.read_fab = ReadingFactory()
        self.cedict = CEDICT()
        self.cjk = characterlookup.CharacterLookup('T')
        self.pinyinOp = self.read_fab.createReadingOperator('Pinyin')
        self.charInfo =  cjknife.CharacterInfo()
        # error bookkeeping and statistics containers, all starting empty
        self.last_error = {'description':'', 'match':'', 'not_match': ''}
        self.bad_word_index = 0
        self.additional_reading = {}
        self.hanzi_stat = {}
        self.hanzi_freq = {}
        self.hanzi_pron_var = {}
        # error key -> Russian description. In order, the texts mean:
        # "does not match", "no reading", "tags in the pinyin",
        # "bad symbols in the pinyin", "Russian letters in the pinyin",
        # "digits in the pinyin", "alpha symbols in the word".
        self.errors_description = {
            'pinyin_not_match':'Не совпадает', 
            'no_pinyin':'Нет чтения',
            'pinyin_have_tag_symbol':'В пиньине теги', 
            'pinyin_have_bad_symbol':'В пиньине плохие символы', 
            'pinyin_have_rus_letter':'В пиньине русские буквы',
            'pinyin_have_number_symbol':'В пиньине цифры',
            'word_have_alpha_symbol':'В слове alfa символы'
        }

        # third argument of open() is the buffer size; the file is not
        # closed in this method
        self.log_file = open(self.params['log_file'], 'w', 1000)
        # self.pleco only exists when writing to a Pleco database is enabled
        if self.params['write_to_pleco_db']:
            self.pleco = Pleco(self.params['output_pleco_database_file'], self)
        self.bad_hanzi_list = False
Ejemplo n.º 16
0
    def setDictionaryInstance(self, dictInstance):
        """
        Bind this strategy to a dictionary and prepare the reading factory.

        :param dictInstance: dictionary instance, handed to the parent
            implementation; the method then works on ``self._dictInstance``
        :raise ValueError: if the dictionary lacks ``READING`` or
            ``READING_OPTIONS``, or if the conversion from the dictionary's
            reading to the target reading is not supported
        """
        super(ReadingConversion, self).setDictionaryInstance(dictInstance)

        boundDict = self._dictInstance
        for required in ('READING', 'READING_OPTIONS'):
            if not hasattr(boundDict, required):
                raise ValueError('Incompatible dictionary')

        self.fromReading = boundDict.READING
        self.sourceOptions = boundDict.READING_OPTIONS
        self._readingFactory = ReadingFactory(dbConnectInst=boundDict.db)

        # without an explicit target the dictionary's own reading is used
        target = self.toReading or self.fromReading
        supported = self._readingFactory.isReadingConversionSupported(
            self.fromReading, target)
        if not supported:
            raise ValueError("Conversion from '%s' to '%s' not supported" %
                             (self.fromReading, target))
Ejemplo n.º 17
0
    def __init__(self, fromReading, toReading, variant=None, **options):
        """
        Create the transliterator together with its reading converter.

        :param fromReading: name of the source reading
        :param toReading: name of the target reading
        :param variant: optional variant name; if given it is appended to
            the transliterator ID, separated by '/'
        :param options: keyword options handed to
            ``ReadingFactory.createReadingConverter``
        """
        self.id = '%s-%s' % (fromReading, toReading)
        if variant:
            self.id += '/' + variant

        icu.Transliterator.__init__(self, self.id)

        converterArgs = (fromReading, toReading)
        self._conv = ReadingFactory().createReadingConverter(*converterArgs,
                                                             **options)
Ejemplo n.º 18
0
    def open(self, dbname):
        """
        Open the database.

        :param dbname: database name; it is also used as the dictionary name
            unless ``self._dictionaryName`` has already been set
        :return: ``True`` on success, ``False`` if ``getDictionary`` raises
            ``ValueError``
        """
        # Indentation uses spaces only; the former mix of tabs and spaces
        # is rejected by Python 3 with a TabError.
        self.dbname = dbname
        if not hasattr(self, '_dictionaryName'):
            self._dictionaryName = dbname
        try:
            self._dictInst = getDictionary(self._dictionaryName,
                entryFactory=entry.UnifiedHeadword())
        except ValueError as e:
            if debug:
                print(e, file=sys.stderr)
            return False

        if self._dictInst.READING:
            f = ReadingFactory()
            opClass = f.getReadingOperatorClass(self._dictInst.READING)
            # self._opClass is only set for operators that can guess the
            # reading dialect
            if hasattr(opClass, 'guessReadingDialect'):
                self._opClass = opClass

        return True
Ejemplo n.º 19
0
class PinyinICUTest(NeedsDatabaseTest, unittest.TestCase):
    """Test Pinyin tonemark conversion on ICU transformation rule."""
    CONVERSION_DIRECTION = ('Pinyin', 'Pinyin')

    def setUp(self):
        """
        Create the reading factory and, if PyICU can be imported, the ICU
        transliterators for both directions of "Latin-NumericPinyin".
        """
        NeedsDatabaseTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

        try:
            import PyICU

            self.toNumeric = PyICU.Transliterator.createInstance(
                "Latin-NumericPinyin", PyICU.UTransDirection.UTRANS_FORWARD)
            self.fromNumeric = self.toNumeric.createInverse()
        except ImportError:
            # PyICU is optional; the test below returns early without it
            pass

    def testToneMarkPlacement(self):
        """Test Pinyin tonemark conversion on ICU transformation rule."""
        if not hasattr(self, 'toNumeric'):
            return

        # diacritics -> numbers; the listed entities are left out of the
        # comparison
        for readingEntity in self.f.getReadingEntities('Pinyin'):
            if readingEntity in (u'hn\u0304g', u'h\u0144g', u'h\u0148g',
                u'h\u01f9g', u'n\u0304g', u'\u0144g', u'\u0148g',
                u'\u01f9g'):
                continue
            targetEntity = self.f.convert(readingEntity, 'Pinyin', 'Pinyin',
                targetOptions={'toneMarkType': 'numbers',
                    'missingToneMark': 'fifth'})
            # assertEqual replaces the deprecated alias assertEquals
            self.assertEqual(targetEntity,
                self.toNumeric.transliterate(readingEntity))

        # numbers -> diacritics; again with a list of left-out entities
        for readingEntity in self.f.getReadingEntities('Pinyin',
            toneMarkType='numbers', missingToneMark='fifth'):
            if readingEntity in ('hng1', 'hng2', 'hng3', 'hng4', 'ng1', 'ng2',
                'ng3', 'ng4', u'ê1', u'ê2', u'ê3', u'ê4'):
                continue
            targetEntity = self.f.convert(readingEntity, 'Pinyin', 'Pinyin',
                sourceOptions={'toneMarkType': 'numbers',
                    'missingToneMark': 'fifth'})
            self.assertEqual(targetEntity,
                self.fromNumeric.transliterate(readingEntity))
def getChars(freqFile,startNo,endNo):
    """
    Return selected columns for a range of rows of a frequency list.

    :param freqFile: path of the UTF-8 encoded, tab-separated frequency list
    :param startNo: slice start applied to the list of all rows
    :param endNo: slice end (exclusive) applied to the list of all rows
    :return: list with one ``[column 1, readings, column 5]`` list per
        selected row; ``readings`` is column 4 split on ``'/'``, each part
        passed through ``ReadingFactory.convert`` as numbered-tone Pinyin
        and followed by one space
    """
    # The with block closes the file once all rows are read; before, the
    # handle was never closed.
    with codecs.open(freqFile, 'rb', "utf-8") as freqStream:
        reader = unicode_csv_reader(freqStream, dialect='excel-tab')
        # read the whole list, then slice like a normal list
        frequencyList = list(reader)[startNo:endNo]

    chars = []
    if not frequencyList:
        return chars

    # A single factory is enough for all rows.
    pinyin = ReadingFactory()

    for row in frequencyList:
        templist = [row[i] for i in (1, 4, 5)]
        converted = [
            pinyin.convert(reading, 'Pinyin', 'Pinyin',
                           sourceOptions={'toneMarkType': 'numbers',
                                          'missingToneMark': 'fifth'})
            for reading in templist[1].split('/')]
        # every reading, including the last one, is followed by a space
        templist[1] = "".join(entry + " " for entry in converted)
        chars.append(templist)

    return chars
Ejemplo n.º 21
0
 def fix_pinyin(self, pinyin):
     """
     Return ``pinyin`` with numbered tones, if cjklib can be used.

     :param pinyin: Pinyin string to convert
     :return: result of converting ``pinyin`` with target option
         ``toneMarkType='numbers'``, with every ``'5'`` replaced by
         ``'0'``; ``pinyin`` itself if cjklib cannot be imported
     """
     # Hacks. It is overkill to ship cjklib with this add-on. But
     # to get the tone numbers as numbers, we should use it. My
     # hope (guess) is that the typical user that will want Chinese
     # pronunciations will also have TTEMPÉ's (version of mine)
     # chinese-support-plugin installed. So try to use that and
     # don't complain if it doesn't work.
     if not self.have_tried_cjklib_hack:
         try:
             # If this works, the whole shebang is run as an Anki2
             # add-on. If not, we will still look for a system-wide
             # cjklib, but obviously not for another add-on.
             from aqt.utils import isWin
         except Exception:
             # Still best effort, but no longer a bare except that
             # would also swallow KeyboardInterrupt and SystemExit.
             pass
         else:
             from aqt import mw
             addon_dir = mw.pm.addonFolder()
             if isWin:
                 # The isWin bit is copied from TTEMPÉ's code.
                 addon_dir = addon_dir.encode(sys.getfilesystemencoding())
             sys.path.append(os.path.join(addon_dir, "chinese"))
         # The flag set here must be the one checked above; with a
         # different name the path was appended again on every call.
         self.have_tried_cjklib_hack = True
     if not self.reading_factory:
         try:
             from cjklib.reading import ReadingFactory
         except ImportError:
             return pinyin
         else:
             self.reading_factory = ReadingFactory()
     return self.reading_factory.convert(pinyin,
                                         'Pinyin',
                                         'Pinyin',
                                         targetOptions={
                                             'toneMarkType': 'numbers'
                                         }).replace('5', '0')
Ejemplo n.º 22
0
class ReadingTransliterator(icu.Transliterator):
    """
    ICU transliterator whose conversion is carried out by a reading
    converter obtained from ``ReadingFactory``.
    """
    def __init__(self, fromReading, toReading, variant=None, **options):
        """
        :param fromReading: name of the source reading
        :param toReading: name of the target reading
        :param variant: optional variant name, appended to the ID after '/'
        :param options: keyword options for
            ``ReadingFactory.createReadingConverter``
        """
        self.id = '%s-%s' % (fromReading, toReading)
        if variant:
            self.id = self.id + '/' + variant

        icu.Transliterator.__init__(self, self.id)

        factory = ReadingFactory()
        self._conv = factory.createReadingConverter(fromReading, toReading,
                                                    **options)

    def handleTransliterate(self, text, position, complete):
        """
        Convert the text between ``position.start`` and ``position.limit``
        in place and move the position markers behind the result.

        :param text: text object supporting slice access and assignment
        :param position: object with ``start``, ``limit`` and
            ``contextLimit`` attributes
        :param complete: not used by this implementation
        """
        begin, end = position.start, position.limit
        original = str(text[begin:end])

        replacement = self._conv.convert(original)
        text[begin:end] = replacement

        # shift the limits by the change in length
        shrinkage = len(original) - len(replacement)
        position.limit -= shrinkage
        position.contextLimit -= shrinkage

        position.start = position.limit

    @staticmethod
    def register(fromReading,
                 toReading,
                 variant=None,
                 registerInverse=False,
                 **options):
        """
        Create a transliterator and register it with ICU.

        :param fromReading: name of the source reading
        :param toReading: name of the target reading
        :param variant: optional variant name
        :param registerInverse: if true, also register the opposite
            direction with source and target options swapped
        :param options: keyword options for the reading converter
        :return: ID of the forward transliterator
        """
        forward = ReadingTransliterator(fromReading, toReading,
                                        variant=variant, **options)
        icu.Transliterator.registerInstance(forward)
        if not registerInverse:
            return forward.id

        swapped = options.copy()
        swapped['targetOptions'] = options.get('sourceOptions', {})
        swapped['sourceOptions'] = options.get('targetOptions', {})

        backward = ReadingTransliterator(toReading, fromReading,
                                         variant=variant, **swapped)
        icu.Transliterator.registerInstance(backward)

        return forward.id
Ejemplo n.º 23
0
    def setDictionaryInstance(self, dictInstance):
        """
        Attach the dictionary and set up the reading factory for it.

        :param dictInstance: dictionary instance, passed on to the parent
            implementation
        :raise ValueError: if ``READING`` or ``READING_OPTIONS`` is missing
            on the dictionary, or if the conversion to the target reading is
            not supported
        """
        super(ReadingConversion, self).setDictionaryInstance(dictInstance)

        hasReading = hasattr(self._dictInstance, 'READING')
        if not (hasReading and hasattr(self._dictInstance, 'READING_OPTIONS')):
            raise ValueError('Incompatible dictionary')

        self.fromReading = self._dictInstance.READING
        self.sourceOptions = self._dictInstance.READING_OPTIONS

        factory = ReadingFactory(dbConnectInst=self._dictInstance.db)
        self._readingFactory = factory

        # fall back to the dictionary's reading if no target was given
        targetReading = self.toReading or self.fromReading
        if factory.isReadingConversionSupported(self.fromReading,
                                                targetReading):
            return
        raise ValueError("Conversion from '%s' to '%s' not supported"
            % (self.fromReading, targetReading))
Ejemplo n.º 24
0
class ReadingConverterTestCaseCheck(NeedsDatabaseTest, unittest.TestCase):
    """
    Checks if every :class:`~cjklib.reading.converter.ReadingConverter` has
    its own
    :class:`~cjklib.test.readingconverter.ReadingConverterConsistencyTest`.
    """
    def testEveryConverterHasConsistencyTest(self):
        """
        Check that every conversion direction of every reading converter
        class is covered by a consistency test class.
        """
        testClasses = self.getReadingConverterConsistencyTestClasses()
        testClassReadingNames = [clss.CONVERSION_DIRECTION for clss \
            in testClasses]
        self.f = ReadingFactory(dbConnectInst=self.db)

        for clss in self.f.getReadingConverterClasses():
            for direction in clss.CONVERSION_DIRECTIONS:
                # assertTrue replaces the deprecated alias assert_; the
                # message keeps a space before "has" and names the class
                # that is actually looked for.
                self.assertTrue(direction in testClassReadingNames,
                    "Conversion from %s to %s " % direction \
                    + "has no ReadingConverterConsistencyTest")

    @staticmethod
    def getReadingConverterConsistencyTestClasses():
        """
        Gets all classes implementing
        :class:`cjklib.test.readingconverter.ReadingConverterConsistencyTest`.

        :rtype: list
        :return: list of all classes inheriting from
            :class:`cjklib.test.readingconverter.ReadingConverterConsistencyTest`
        """
        # get all non-abstract classes that inherit from
        #   ReadingConverterConsistencyTest; a class counts as non-abstract
        #   if it sets CONVERSION_DIRECTION
        testModule = __import__("cjklib.test.readingconverter")
        testClasses = [clss for clss \
            in testModule.test.readingconverter.__dict__.values() \
            if type(clss) in [types.TypeType, types.ClassType] \
            and issubclass(clss, ReadingConverterConsistencyTest) \
            and clss.CONVERSION_DIRECTION]

        return testClasses
Ejemplo n.º 25
0
class ReadingTransliterator(icu.Transliterator):
    """
    Transliterator for ICU that delegates the conversion to a reading
    converter created by ``ReadingFactory``.
    """
    def __init__(self, fromReading, toReading, variant=None, **options):
        """
        :param fromReading: name of the source reading
        :param toReading: name of the target reading
        :param variant: optional variant name; appended to the ID after '/'
        :param options: keyword options passed to
            ``ReadingFactory.createReadingConverter``
        """
        self.id = '%s-%s' % (fromReading, toReading)
        if variant:
            self.id += '/' + variant

        icu.Transliterator.__init__(self, self.id)

        converterArgs = (fromReading, toReading)
        self._conv = ReadingFactory().createReadingConverter(*converterArgs,
                                                             **options)

    def handleTransliterate(self, text, position, complete):
        """
        Replace the text between ``position.start`` and ``position.limit``
        by its conversion and advance the position behind it.

        :param text: text object supporting slice access and assignment
        :param position: object with ``start``, ``limit`` and
            ``contextLimit`` attributes
        :param complete: not used by this implementation
        """
        first, last = position.start, position.limit
        source = unicode(text[first:last])

        target = self._conv.convert(source)
        text[first:last] = target

        # both limits move by the difference in length
        delta = len(source) - len(target)
        position.limit -= delta
        position.contextLimit -= delta

        position.start = position.limit

    @staticmethod
    def register(fromReading, toReading, variant=None, registerInverse=False,
        **options):
        """
        Build a transliterator for the given direction and register it.

        :param fromReading: name of the source reading
        :param toReading: name of the target reading
        :param variant: optional variant name
        :param registerInverse: if true, the reverse direction is
            registered as well, with source and target options exchanged
        :param options: keyword options for the reading converter
        :return: ID of the transliterator for the forward direction
        """
        trans = ReadingTransliterator(fromReading, toReading, variant=variant,
                                      **options)
        icu.Transliterator.registerInstance(trans)
        if not registerInverse:
            return trans.id

        inverseOptions = options.copy()
        inverseOptions.update({
            'targetOptions': options.get('sourceOptions', {}),
            'sourceOptions': options.get('targetOptions', {}),
        })
        invTrans = ReadingTransliterator(toReading, fromReading,
                                         variant=variant, **inverseOptions)
        icu.Transliterator.registerInstance(invTrans)

        return trans.id
Ejemplo n.º 26
0
class ChineseLessonsComMandarinPronunciation(GlobbingPronunciationBuilder):
    """
    Builds an index on pronunciation files for Mandarin provided by
    chinese-lessons.com.
    """
    PROVIDES = "Pronunciation_Pinyin"
    DEPENDS = ['PinyinSyllables']

    BASE_DIRECTORY_NAME = "chineselessionscom_cmn"

    def __init__(self, **options):
        """Pass ``options`` to the parent class and create a reading factory."""
        super(ChineseLessonsComMandarinPronunciation, self).__init__(**options)
        self.readingFactory = ReadingFactory()

    def getReadingFromFileName(self, fileName):
        """
        Derive the reading from a file name by converting the name without
        its extension as numbered-tone Pinyin.

        :param fileName: name of the pronunciation file
        :return: the converted reading, or ``None`` if the conversion raises
            ``UnsupportedError`` or ``ConversionError``
        """
        stem = os.path.splitext(fileName)[0]
        try:
            reading = self.readingFactory.convert(
                stem, 'Pinyin', 'Pinyin',
                sourceOptions={'toneMarkType': 'numbers'})
        except (exception.UnsupportedError, exception.ConversionError):
            reading = None
        return reading
Ejemplo n.º 27
0
class LeoDownloader(AudioDownloader):
    """Download audio from LEO"""
    def __init__(self):
        """Set up URLs, language codes and icon bookkeeping for LEO."""
        AudioDownloader.__init__(self)
        self.file_extension = u'.mp3'
        self.url = 'http://www.leo.org/dict/audio_{language}/{word}.mp3'
        # Request language -> code used in LEO's URLs. And, yes, they use
        # 'ch' for Chinese. (It is not clear whether they really have
        # anything for 'ru' or 'it'.)
        self.language_dict = dict(
            de='de', en='en', es='es', fr='fr', it='it', ru='ru', zh='ch')
        # The code looks like it could mean Swiss pronunciations, but it
        # doesn't.
        self.chinese_code = 'ch'
        # Cache of site icons, one per language, so that the right flag can
        # be shown for each request.
        self.site_icon_dict = {}
        self.site_file_name_encoding = 'ISO-8859-1'
        # Keyed by LEO's code: by the time this dict is used, 'zh' has
        # already been turned into 'ch'.
        self.icon_url_dict = {
            'de': 'http://dict.leo.org/favicon.ico',
            'en': 'http://dict.leo.org/favicon.ico',
            'es': 'http://dict.leo.org/favicon_es.ico',
            'fr': 'http://dict.leo.org/favicon_fr.ico',
            'it': 'http://dict.leo.org/favicon_it.ico',
            'ru': 'http://dict.leo.org/favicon_ru.ico',
            'ch': 'http://dict.leo.org/favicon_ch.ico',
        }
        # State for the cjklib hack: try to use the cjklib that TTEMPÉ's
        # add-on brings along. A system-wide installed one should work as
        # well.
        self.have_tried_cjklib_hack = False
        self.reading_factory = None

    def download_files(self, word, base, ruby, split):
        """
        Download the pronunciation of one word from LEO.

        The text is used for German, English, Spanish, French, Italian and
        Russian requests, the ruby for Chinese ones. There may not be any
        pronunciations available for Italian or Russian.

        The downloaded data is written to the path returned by
        ``get_file_name()`` and recorded in ``self.downloads_list``.
        ``split`` is not used here.
        """
        self.downloads_list = []
        # Map the request language to LEO's code. EAFP: an unsupported
        # language raises KeyError here.
        requested = self.language[:2].lower()
        self.language = self.language_dict[requested]
        # set_names also rejects empty input for the chosen language.
        self.set_names(word, base, ruby)
        # The generic self.maybe_get_icon() is deliberately not used; the
        # language-specific flag icon is fetched instead.
        self.get_flag_icon()
        # EAFP again: should query_url return None, the fetch below fails.
        audio_data = self.get_data_from_url(self.query_url(word, ruby))
        target_path, target_name = self.get_file_name()
        with open(target_path, 'wb') as audio_file:
            audio_file.write(audio_data)
        # There is a file now, but not much else to say about it.
        entry = (target_path, target_name, {'Source': 'Leo'})
        self.downloads_list.append(entry)

    def query_url(self, word, ruby):
        """
        Build the download URL for a word.

        For Chinese requests the word is replaced by the ruby, passed through
        ``fix_pinyin``. The word is then encoded with
        ``self.site_file_name_encoding`` and percent-quoted before being put
        into the URL template.
        """
        if self.language == self.chinese_code:
            word = self.fix_pinyin(ruby)
        encoded_word = word.encode(self.site_file_name_encoding)
        return self.url.format(
            language=self.language, word=urllib.quote(encoded_word))

    def fix_pinyin(self, pinyin):
        """
        Return ``pinyin`` rewritten with tone numbers, if cjklib is available.

        On the first call an attempt is made to put the "chinese" add-on
        directory of an Anki2 installation on ``sys.path`` so that the cjklib
        bundled there can be imported. If cjklib cannot be imported at all,
        ``pinyin`` is returned unchanged. Otherwise the text is converted to
        numbered Pinyin and every ``5`` in the result is replaced by ``0``.

        :param pinyin: Pinyin text (the ruby of the requested word)
        :return: the converted text, or ``pinyin`` itself without cjklib
        """
        # Hacks. It is overkill to ship cjklib with this add-on. But
        # to get the tone numbers as numbers, we should use it. The
        # hope (guess) is that the typical user who wants Chinese
        # pronunciations will also have TTEMPÉ's chinese-support-plugin
        # installed. So try to use that and don't complain if it doesn't
        # work.
        if not self.have_tried_cjklib_hack:
            try:
                # If this works, the whole shebang is run as an Anki2
                # add-on. If not, we will still look for a system-wide
                # cjklib, but obviously not for another add-on.
                from aqt.utils import isWin
            except Exception:
                # Best effort only: any failure to load Anki just means the
                # add-on directory is not added to the path.
                pass
            else:
                from aqt import mw
                addon_dir = mw.pm.addonFolder()
                if isWin:
                    # The isWin bit is copied from TTEMPÉ's code.
                    addon_dir = addon_dir.encode(sys.getfilesystemencoding())
                sys.path.append(os.path.join(addon_dir, "chinese"))
            # Must be the same attribute that is tested above; otherwise the
            # path is appended again on every call.
            self.have_tried_cjklib_hack = True
        if not self.reading_factory:
            try:
                from cjklib.reading import ReadingFactory
            except ImportError:
                return pinyin
            else:
                self.reading_factory = ReadingFactory()
        return self.reading_factory.convert(
            pinyin, 'Pinyin',  'Pinyin', targetOptions={
                'toneMarkType': 'numbers'}).replace('5', '0')

    def get_flag_icon(self):
        """
        Set ``self.site_icon`` to the icon for the current language.

        Different icons are used depending on the request language. They are
        cached in ``self.site_icon_dict``; an icon that is not cached yet is
        downloaded from the address in ``self.icon_url_dict``. Nothing
        happens when PyQt is not available.
        """
        if not with_pyqt:
            return
        language = self.language
        if language not in self.site_icon_dict:
            # Not cached yet, so fetch it ourselves. (It is known to be just
            # 16x16, so no resize, and the address is known.)
            icon_data = self.get_data_from_url(self.icon_url_dict[language])
            self.site_icon_dict[language] = QImage.fromData(icon_data)
        self.site_icon = self.site_icon_dict[language]

    def set_names(self, text, base, ruby):
        """
        Set ``self.base_name`` and ``self.display_text`` for the request.

        Chinese requests are named after base and ruby, all others after the
        text itself.

        :raises ValueError: if the ruby (Chinese) or the text (other
            languages) is empty
        """
        is_chinese = self.language == self.chinese_code
        needed = ruby if is_chinese else text
        if not needed:
            raise ValueError('Nothing to download')
        if is_chinese:
            self.base_name = u"{0}_{1}".format(base, ruby)
            self.display_text = u"{1} ({0})".format(base, ruby)
            return
        self.base_name = text
        self.display_text = text
Ejemplo n.º 28
0
    def handle_noargs(self, **options):
        """
        Rebuild the ``EN:*`` English lookup keys in Redis from the dictionary.

        All existing ``EN:*`` keys are deleted first. Then every line of
        ``settings.DICT_FILE_LOCATION`` that is neither blank nor a ``#``
        comment is parsed, and each of its meanings is reduced to a short
        index term. Terms of at most three space-separated words are stored
        under ``EN:<word count>W:<lower-cased term>`` as a JSON object with
        the untouched meaning (``english``) and the list of ``characters``
        that carry it.

        :param options: command options; not used here
        """
        # EXAMPLE: 一中一台 [yi1 Zhong1 yi1 Tai2] /first meaning/second meaning/

        def strip_bracketed(text, opening, closing):
            """Drop the first opening..closing span and all such brackets."""
            try:
                inner = text[(text.index(opening) + 1):text.index(closing)]
            except ValueError:
                # one of the brackets is missing: leave the text alone
                return text
            text = text.replace(inner, '')
            return text.replace(opening, '').replace(closing, '')

        # the with-block closes the file even if parsing fails half way
        with open(settings.DICT_FILE_LOCATION) as dict_file:
            r_server = _get_redis()

            # EMPTY ALL EN KEYS FROM THE DATABASE
            item_count = 0
            for old_key in r_server.keys('EN:*'):
                r_server.delete(old_key)
                item_count += 1
            print("Deleted %s items" % item_count)

            # one factory serves every line; building it per line is wasted
            # work
            reading_factory = ReadingFactory()
            pinyin_options = {'toneMarkType': 'numbers', 'yVowel': 'v',
                'missingToneMark': 'fifth'}

            # NOW LETS START
            item_count = 0
            for line in dict_file:
                # skip comments, and blank lines that have no fields to parse
                if line.startswith("#") or not line.strip():
                    continue

                # GATHER ALL THE MAIN VARIABLES
                characters = line.split()[1]
                numbered_pinyin = line[(line.index('[') + 1):line.index(']')]
                # NOTE(review): the converted reading is not used further
                # down; the call is kept so that a reading the factory
                # rejects still surfaces as an error, as before.
                tonal_pinyin = reading_factory.convert(numbered_pinyin,
                    'Pinyin', 'Pinyin', sourceOptions=pinyin_options)
                meanings = line[(line.index('/') + 1):line.rindex('/')]

                # CREATE AN INDEX: strip as much noise as possible from each
                # definition to get close to a single word to index on.
                for meaning in meanings.split('/'):
                    ns = strip_bracketed(meaning, '(', ')')
                    ns = strip_bracketed(ns, '[', ']')

                    # IGNORE THE MEANING IF IT CONTAINS AN EXCLUDED PHRASE
                    if any(phrase in ns for phrase in EXCLUSIONS):
                        continue

                    # IF THE MEANING IS NOW EMPTY, IGNORE IT
                    ns = ns.strip()
                    if ns == '':
                        continue

                    # DEAL WITH INFINITIVE VERBS LIKE "TO DO"
                    if len(ns.split(' ')) <= 3 and ns.startswith('to '):
                        ns = ns.split(' ', 1)[1]

                    # REMOVE ITEMS LIKE "SEE XYZ"
                    if (ns.split(' ')[0] == 'see'
                            and ns[-1] not in string.ascii_letters):
                        continue

                    # THERE'S ALSO SOME ANNOYING "..." MARKS TOO
                    ns = ns.replace('...', '')

                    # FOR NOW, JUST ADD ITEMS WITH AT MOST 3 WORDS
                    word_count = len(ns.split(' '))
                    if word_count > 3:
                        continue

                    key = "EN:%sW:%s" % (word_count, ns.lower())
                    print(key)
                    if r_server.exists(key):
                        values = json.loads(_search_redis(key))
                        values['characters'].append(characters)
                    else:
                        values = {
                            'english': meaning,
                            'characters': [characters],
                        }
                    r_server.set(key, json.dumps(values))

                    item_count += 1
                    print(item_count)

        print("%s English dictionary items added" % item_count)
Ejemplo n.º 29
0
class NTrain(Tk.Tk):
    def __init__(self, *args, **kwargs):
        """Create the main window and build the settings screen."""
        Tk.Tk.__init__(self, *args, **kwargs)
        self.title("Ntrain")
        # centre the window on the screen
        window_path = self.winfo_pathname(self.winfo_id())
        self.eval('tk::PlaceWindow %s center' % window_path)
        self._default_font = tkFont.nametofont("TkDefaultFont")
        self._default_font.configure(size=30)
        # the default dataset sits next to this script
        script_dir = os.path.dirname(os.path.abspath(__file__))
        self._defaultfile = os.path.join(script_dir, 'chinese100.xlsx')

        # entry showing the data file name, pre-filled with the default
        default_name = os.path.basename(self._defaultfile)
        self._filename_value = Tk.StringVar()
        self._sett_fn_label = Tk.Entry(
            textvariable=self._filename_value,
            font=self._default_font,
            width=12)
        self._filename_value.set(default_name)
        self._sett_fn_label.grid(row=1, column=0, sticky=Tk.W)

        # button to browse for a data file
        self.browse = Tk.Button(self, text="Browse", command=self._get_file)
        self.browse.grid(row=1, column=1, sticky=Tk.W)

        # button to reset the "Learned" marks of the data file
        self._reset_button = Tk.Button(text="Reset", command=self._reset_list)
        self._reset_button.grid(row=1, column=2)

        # caption for the card count
        self._sett_label = Tk.Label(text="Number of Cards:")
        self._sett_label.grid(row=2, column=0, sticky=Tk.E)

        # entry field for the number of cards, defaulting to 30
        card_count_text = Tk.StringVar()
        self._sett_entry = Tk.Entry(
            textvariable=card_count_text,
            font=self._default_font,
            width=3)
        card_count_text.set("30")
        self._sett_entry.grid(row=2, column=1, sticky=Tk.W)
        self._sett_entry.focus_set()

        # question direction: 1 = Chinese to English, 2 = English to Chinese
        self._radio_val = Tk.IntVar()
        self._radio1 = Tk.Radiobutton(
            text="Ch to E", variable=self._radio_val, value=1)
        self._radio1.grid(row=4, column=0)
        self._radio2 = Tk.Radiobutton(
            text="E to Ch", variable=self._radio_val, value=2)
        self._radio2.grid(row=4, column=1)
        self._radio_val.set(1)

        # OK button to start the game
        self._sett_button = Tk.Button(text="OK", command=self._start_game)
        self._sett_button.grid(columnspan=3)

        # the return key starts the game as well
        self.bind('<Return>', self._start_game)

        self._p = Pinyin()
        self._f = ReadingFactory()

    def _get_file(self):
        """
        Let the user pick a data file and show its name in the GUI.

        Only the base name of the chosen path is put into the file-name
        field. A cancelled dialog leaves the field untouched.
        """
        # open dialogue to choose the data file
        my_file = askopenfilename()
        if not my_file:
            # the dialog was cancelled: keep the current file name instead
            # of overwriting it with an empty string
            return
        # update the entry to show the file name in the GUI
        self._filename_value.set(os.path.basename(my_file))

    def _start_game(self, *args):
        """
        Leave the settings screen and start a training session.

        Reads the file name and card count from the settings widgets,
        removes those widgets, loads the data file and shows the first
        question. If the file cannot be loaded, an error dialog is shown and
        the application is restarted.

        :param args: optional event object when triggered by a key binding
        """
        # resolve the file name relative to the directory of this script
        self._datafile = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            self._filename_value.get())

        # get number of cards
        self._n_cards = int(self._sett_entry.get())
        # remove the settings widgets
        for widget in (self._sett_fn_label, self.browse, self._reset_button,
                       self._sett_label, self._sett_entry, self._sett_button,
                       self._radio1, self._radio2):
            widget.destroy()

        self._save_reminder = 0
        # load the data file
        try:
            self._vocTot = pd.read_excel(self._datafile)
        except Exception:
            tkMessageBox.showinfo("Error", "File not found!", icon='warning')
            self._restart()
            # _restart can come back (a failed save aborts it); without data
            # there is nothing to play with, so stop here
            return

        # indices of all cards not yet marked as learned
        unlearned = self._vocTot[self._vocTot['Learned'] == 0].index.tolist()
        # NOTE(review): the first unlearned card is always skipped; the
        # reason is not evident from this code. Slicing, unlike ``del``,
        # also copes with an empty list.
        unlearned = unlearned[1:]
        # shuffle indices and keep the requested number of cards
        self._renew_index(unlearned)
        # setup new gui
        self._setup_game_gui()
        # start with first question
        self._show_next_question()

    def _setup_game_gui(self):
        """Create the widgets of the game screen and reset the round state."""

        def make_button(caption, handler, row):
            """Create a button in column 1 of the given row."""
            button = Tk.Button(text=caption, command=handler)
            button.grid(row=row, column=1, sticky=Tk.W)
            return button

        # labels for chinese symbols and for questions, filled per question
        self.C_labels = []
        self.Q_labels = []

        # label for the correct solution
        self._sol_label_value = Tk.StringVar()
        self._sol_label = Tk.Label(textvariable=self._sol_label_value)
        self._sol_label.grid(row=3, column=2)

        # entry field for the answer; return checks the answer from now on
        self._entry_value = Tk.StringVar()
        self._entry = Tk.Entry(
            textvariable=self._entry_value, font=self._default_font)
        self._entry.grid(row=4, column=2)
        self._entry.focus_set()
        self.bind('<Return>', self._check_answer)

        # action buttons, stacked in column 1
        self._check_button = make_button("Check", self._check_answer, 1)
        self._save_button = make_button("Save", self._save, 2)
        self._next_button = make_button("Next", self._show_next_question, 3)
        self._new_button = make_button("New", self._restart, 4)

        # translate field and its button
        self._tr_value = Tk.StringVar()
        self._tr = Tk.Entry(
            textvariable=self._tr_value, font=self._default_font)
        self._tr.grid(row=5, column=2)
        self._tr_button = make_button("E-C", self._translate, 5)

        # no wrong cards yet, and no current card
        self._wrong_indices = []
        self._no = 0

    def _renew_index(self, indices):
        # TODO: catch too many cards chosen as input
        shuffle(indices)
        # take the first n cards
        self._indices = indices[0:self._n_cards]

    def _show_next_question(self):
        """
        Display the next card of the current round.

        The next index is taken from ``self._indices``. Each Chinese
        character gets its own label, coloured by the tone digit found in its
        numbered pinyin. In mode 1 the pinyin is shown and the English word
        is the expected answer; in mode 2 the English text is shown and the
        pinyin is expected. Any ``IndexError`` raised while doing so, in
        particular the one from an exhausted index list, starts a new round.
        """
        try:
            # get the next index in the list
            self._no = self._indices.pop(0)

            # empty the entry field and drop the labels of the last card
            self._entry_value.set("")
            for stale in self.C_labels:
                stale.destroy()
            for stale in self.Q_labels:
                stale.destroy()
            self.C_labels = []
            self.Q_labels = []

            # first matching tone digit decides the colour
            tone_colours = (("1", 'red'), ("2", 'green'),
                            ("3", 'blue'), ("4", 'purple'))

            # characters occupy the grid columns from 2 onwards
            next_column = 2
            for char in self._vocTot.C[self._no]:
                my_pinyin = self._p.get_pinyin(char, ' ')
                char_label = Tk.Label(text=char)
                self.C_labels.append(char_label)
                char_label.grid(row=2, column=next_column)
                to_tone = to_tone_number(my_pinyin)
                colour = 'grey'
                for digit, candidate in tone_colours:
                    if digit in to_tone:
                        colour = candidate
                        break
                char_label.config(fg=colour)
                if self._radio_val.get() == 1:
                    pinyin_label = Tk.Label(text=my_pinyin)
                    self.Q_labels.append(pinyin_label)
                    pinyin_label.grid(row=1, column=next_column)
                next_column += 1
            # number of character columns used above
            span = next_column - 2

            mode = self._radio_val.get()
            if mode == 1:
                self._curr_ans = self._vocTot.E[self._no].encode('utf-8')
            elif mode == 2:
                # prefer the long English text, fall back to the short one
                try:
                    my_english = self._vocTot.E_long[self._no].encode('utf-8')
                except:
                    my_english = self._vocTot.E[self._no].encode('utf-8')
                english_label = Tk.Label(text=my_english)
                self.Q_labels.append(english_label)
                english_label.grid(row=1, column=2, columnspan=span)
                self._curr_ans = self._p.get_pinyin(self._vocTot.C[self._no],
                                                    ' ')

            self._entry.grid(row=4, column=2, columnspan=span)

            # the card counts as correct until a wrong answer is given
            self._real_correct = 1
        except IndexError:
            # start new round, when no card in list left
            self._new_round()

    def _check_answer(self, *args):
        """
        Compare the typed answer with the expected one.

        A correct answer moves on to the next card; if it was correct on the
        first attempt the card is also marked as learned. A wrong answer is
        remembered for the next round and the solution is displayed.

        :param args: optional event object when triggered by a key binding
        """
        # derive input
        answer = self._entry_value.get().strip().lower()

        # convert tone numbers, if provided, to pinyin tone marks
        if any(char.isdigit() for char in answer):
            answer = self._f.convert(answer,
                                     'Pinyin',
                                     'Pinyin',
                                     sourceOptions={
                                         'toneMarkType': 'numbers'
                                     }).encode('utf-8')

        # derive expected answer
        mode = self._radio_val.get()
        if mode == 1:
            # asked for the English word
            answer_to_check = self._curr_ans.encode('utf-8').lower()
        elif mode == 2:
            # asked for the Chinese word
            # NOTE(review): the original had separate branches for answers
            # with and without tone numbers, but both built this same
            # string; a tone-less comparison was apparently never
            # implemented.
            answer_to_check = self._p.get_pinyin(
                self._vocTot.C[self._no], ' ').encode('utf-8').lower()
        else:
            # unknown mode: there is nothing to compare against
            return

        # check if answer is correct
        if answer == answer_to_check:
            # mark as learned only if the card was correct on first attempt
            if self._real_correct:
                # a single .loc call writes into the frame itself; chained
                # ``frame.Learned[idx] = 1`` may only change a temporary copy
                self._vocTot.loc[self._no, 'Learned'] = 1
                self._save_reminder = 1

            self._sol_label_value.set("")
            # if correct, go on to next card
            self._show_next_question()
        else:
            # if wrong:
            self._real_correct = 0
            # store index in list of wrong cards
            self._wrong_indices.append(self._no)
            # display correct answer
            self._sol_label_value.set(self._curr_ans)
            self._sol_label.grid(row=3,
                                 column=2,
                                 columnspan=len(self.C_labels))
            # clear entry field
            self._entry_value.set("")

    def _new_round(self):
        """
        Start another round with the cards answered wrongly, if any.

        Without wrong cards the session is finished instead.
        """
        if not self._wrong_indices:
            # no wrong cards left: finish the session
            self._exit()
            return

        self._sol_label_value.set("New round!")
        # the wrong cards become the new, reshuffled card list
        self._renew_index(self._wrong_indices)
        # clear list of wrong indices
        self._wrong_indices = []
        # start new round with the next question
        self._show_next_question()

    def _reset_list(self):
        """
        Mark every card of the selected data file as not learned.

        The file named in the file-name field is read, its ``Learned`` column
        is set to 0 and the result is written back to the same file as sheet
        ``Sheet1``.
        """
        self._datafile = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            self._filename_value.get())
        my_file = pd.read_excel(self._datafile)
        my_file['Learned'] = 0
        # the context manager saves and closes the workbook, also on errors;
        # an explicit ``writer.save()`` no longer exists in current pandas
        with ExcelWriter(self._datafile) as writer:
            my_file.to_excel(writer, sheet_name='Sheet1', index=False)

    def _exit(self):
        """
        Finish the session: save the data and disable the game controls.

        If saving fails, a notice is shown and the controls stay enabled.
        """
        try:
            self._save()
        except Exception:
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed
            self._sol_label_value.set("Didn't work?!")
            return
        self._entry_value.set("")
        self._sol_label_value.set("Done!")
        self._check_button['state'] = 'disabled'
        self._save_button['state'] = 'disabled'
        self.unbind('<Return>')
        self._next_button['state'] = 'disabled'

    def _save(self, *args):
        """
        Write the vocabulary table back to the data file.

        The table is stored as sheet ``Sheet1`` of ``self._datafile``;
        afterwards "Saved!" is displayed and the save reminder is cleared.

        :param args: ignored
        """
        # the context manager saves and closes the workbook, also on errors;
        # an explicit ``writer.save()`` no longer exists in current pandas
        with ExcelWriter(self._datafile) as writer:
            self._vocTot.to_excel(writer, sheet_name='Sheet1', index=False)
        self._sol_label_value.set("Saved!")
        self._sol_label.grid(row=3, column=2, columnspan=len(self.C_labels))
        self._save_reminder = 0

    def _restart(self):
        """
        Restart the application by re-executing the current script.

        If there are unsaved changes the user is asked whether to save
        first; when that save fails a notice is shown and the restart is
        abandoned.
        """
        if self._save_reminder:
            result = tkMessageBox.askquestion("Warning",
                                              "Save before exiting?")
            if result == 'yes':
                try:
                    self._save()
                except Exception:
                    # Exception rather than a bare except, so that
                    # SystemExit and KeyboardInterrupt are not swallowed
                    self._sol_label_value.set("Didn't work?!")
                    return
        # replace the running process with a fresh interpreter on the same
        # command line
        python = sys.executable
        os.execl(python, python, *sys.argv)

    def _translate(self):
        """
        Open Google Translate in the web browser for the translate field.

        Text that arrives as ``unicode`` is sent as Chinese to English,
        anything else as English to Chinese with its spaces escaped.
        """
        phrase = self._tr_value.get()
        if not isinstance(phrase, unicode):
            phrase = phrase.replace(' ', '%20')
            url = 'https://translate.google.com/#en/zh-CN/' + phrase
        else:
            # Todo: doesn't work
            url = 'https://translate.google.com/#zh-CN/en/' + phrase
        webbrowser.open(url)
Ejemplo n.º 30
0
from cjklib.reading import ReadingFactory

# Module-level factory; the conversion lambdas below call f.convert on it.
f = ReadingFactory()


# NOTE(review): this list is a bare expression statement -- it is evaluated
# and thrown away, since nothing binds it to a name. Presumably it records
# the readings of interest; confirm before relying on it.
[
    'GR', 'Pinyin', 'WadeGiles', 'MandarinBraille', 'MandarinIPA',
    'ShanghaineseIPA',
    #'Hangul',
    #'Kana', 'Hiragana', 'Katakana',
    'CantoneseYale', 'CantoneseIPA', 'Jyutping'
]


DConv = {
    # Mandarin conversions
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|x-Pinyin'): lambda s: f.convert(s, 'GR', 'Pinyin'),
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|Wade-Giles'): lambda s: f.convert(s, 'GR', 'WadeGiles'),
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|Braille'): lambda s: f.convert(s, 'GR', 'MandarinBraille'),
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|Alternative IPA'): lambda s: f.convert(s, 'GR', 'MandarinIPA'),


    ('cmn_Latn|Numeric Pinyin', 'cmn_Latn|x-Pinyin'): lambda s: f.convert(s, 'Pinyin', 'Pinyin', sourceOptions={
        'toneMarkType': 'numbers'
    }),
    ('cmn_Latn|Numeric Pinyin', 'cmn_Latn|Gwoyeu Romatzyh'): lambda s: f.convert(s, 'Pinyin', 'GR', sourceOptions={
        'toneMarkType': 'numbers'
    }),
    ('cmn_Latn|Numeric Pinyin', 'cmn_Latn|Wade-Giles'): lambda s: f.convert(s, 'Pinyin', 'WadeGiles', sourceOptions={
        'toneMarkType': 'numbers'
    }),
Ejemplo n.º 31
0
    def __init__(self, *args, **kwargs):
        """Create the main window and build the settings screen."""
        Tk.Tk.__init__(self, *args, **kwargs)
        self.title("Ntrain")
        # centre the window on the screen
        window_path = self.winfo_pathname(self.winfo_id())
        self.eval('tk::PlaceWindow %s center' % window_path)
        self._default_font = tkFont.nametofont("TkDefaultFont")
        self._default_font.configure(size=30)
        # the default dataset sits next to this script
        script_dir = os.path.dirname(os.path.abspath(__file__))
        self._defaultfile = os.path.join(script_dir, 'chinese100.xlsx')

        # entry showing the data file name, pre-filled with the default
        default_name = os.path.basename(self._defaultfile)
        self._filename_value = Tk.StringVar()
        self._sett_fn_label = Tk.Entry(
            textvariable=self._filename_value,
            font=self._default_font,
            width=12)
        self._filename_value.set(default_name)
        self._sett_fn_label.grid(row=1, column=0, sticky=Tk.W)

        # button to browse for a data file
        self.browse = Tk.Button(self, text="Browse", command=self._get_file)
        self.browse.grid(row=1, column=1, sticky=Tk.W)

        # button to reset the "Learned" marks of the data file
        self._reset_button = Tk.Button(text="Reset", command=self._reset_list)
        self._reset_button.grid(row=1, column=2)

        # caption for the card count
        self._sett_label = Tk.Label(text="Number of Cards:")
        self._sett_label.grid(row=2, column=0, sticky=Tk.E)

        # entry field for the number of cards, defaulting to 30
        card_count_text = Tk.StringVar()
        self._sett_entry = Tk.Entry(
            textvariable=card_count_text,
            font=self._default_font,
            width=3)
        card_count_text.set("30")
        self._sett_entry.grid(row=2, column=1, sticky=Tk.W)
        self._sett_entry.focus_set()

        # question direction: 1 = Chinese to English, 2 = English to Chinese
        self._radio_val = Tk.IntVar()
        self._radio1 = Tk.Radiobutton(
            text="Ch to E", variable=self._radio_val, value=1)
        self._radio1.grid(row=4, column=0)
        self._radio2 = Tk.Radiobutton(
            text="E to Ch", variable=self._radio_val, value=2)
        self._radio2.grid(row=4, column=1)
        self._radio_val.set(1)

        # OK button to start the game
        self._sett_button = Tk.Button(text="OK", command=self._start_game)
        self._sett_button.grid(columnspan=3)

        # the return key starts the game as well
        self.bind('<Return>', self._start_game)

        self._p = Pinyin()
        self._f = ReadingFactory()
Ejemplo n.º 32
0
Radical 9	9	4EBA	man	rén
Radical 30	30	53E3	mouth	kǒu	
Radical 61	61	5FC3	heart	xīn	
Radical 3	3	4E36	dot	zhù	
Radical 4	4	4E3F	slash	piě
Radical 5	5	4E59	second, fishing hook	yǐ	
Radical 6	6	4E85	hook	jué	
Radical 7	7	4E8C	two	èr	
Radical 8	8	4EA0	lid, head	tóu
Radical 10	10	513F	legs	ér	
Radical 11	11	5165	enter	rù	
Radical 12	12	516B	eight	bā
Radical 140	140	8278	grass	cǎo
Radical 24	24	5341	ten	shí
Radical 13	13	5182	wide	jiōng
Radical 14	14	5196	cover	mī
Radical 15	15	51AB	ice	bīng
"""

from cjklib.reading import ReadingFactory
f = ReadingFactory()

# Print one CSV row (radical index, pinyin with tone numbers, meaning) for
# every non-blank line of the tab-separated entries.
for line in entries.split('\n'):
    if line.strip():
        fields = line.strip('\t').split('\t')
        _, radicalIdx, _, meaning, pinyin = fields
        pinyinNumbers = f.convert(pinyin, 'Pinyin', 'Pinyin',
            targetOptions={'toneMarkType': 'numbers'})
        row = {'meaning': meaning, 'idx': int(radicalIdx),
            'pinyin': pinyinNumbers}
        print('%(idx)d,"%(pinyin)s","%(meaning)s"' % row)
Ejemplo n.º 33
0
class CharacterLookupReadingMethodsTest(CharacterLookupTest, unittest.TestCase):
    """
    Runs consistency checks on the reading methods of the
    :class:`~cjklib.characterlookup.CharacterLookup` class.

    .. todo::
        * Impl: include script table from Unicode 5.2.0 to get character ranges
          for Hangul and Kana
    """
    # reading name -> list of additional dialect option dicts to test
    DIALECTS = {}

    # reading name -> entities to test for readings whose operator has no
    # ``getReadingEntities``
    SPECIAL_ENTITY_LIST = {}

    def setUp(self):
        """Run the base setup and create a reading factory on ``self.db``."""
        CharacterLookupTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

    def testReadingMappingAvailability(self):
        """
        Test if the readings under
        ``CharacterLookup.CHARARACTER_READING_MAPPING`` are available for
        conversion.
        """
        mapping = self.characterLookup.CHARARACTER_READING_MAPPING
        # mock to simulate availability of all tables in
        #   characterLookup.CHARARACTER_READING_MAPPING
        tables = [table for table, _ in mapping.values()]
        self.characterLookup.db.engine = EngineMock(
                self.characterLookup.db.engine, mockTables=tables)

        for reading in mapping:
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading))
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading))

        # test proper checking for all known readings
        for reading in self.f.getSupportedReadings():
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading) \
                in [True, False])
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading) \
                in [True, False])

    @attr('slow')
    def testGetCharactersForReadingAcceptsAllEntities(self):
        """Test if ``getCharactersForReading`` accepts all reading entities."""
        for reading in self.f.getSupportedReadings():
            if not self.characterLookup.hasMappingForReadingToCharacter(
                reading):
                continue

            # always test the default dialect, plus any configured ones
            dialects = [{}]
            if reading in self.DIALECTS:
                dialects.extend(self.DIALECTS[reading])

            for dialect in dialects:
                if hasattr(self.f.getReadingOperatorClass(reading),
                    'getReadingEntities'):
                    entities = self.f.getReadingEntities(reading, **dialect)
                elif reading in self.SPECIAL_ENTITY_LIST:
                    entities = self.SPECIAL_ENTITY_LIST[reading]
                else:
                    continue

                for entity in entities:
                    # only the lookup itself may raise the tolerated errors
                    try:
                        results = self.characterLookup.getCharactersForReading(
                            entity, reading, **dialect)
                    except (exception.UnsupportedError,
                        exception.ConversionError):
                        continue

                    self.assertEqual(type(results), list,
                        "Method getCharactersForReading() doesn't return" \
                            + " a list for entity %s " % repr(entity) \
                        + ' (reading %s, dialect %s)' % (reading, dialect))

                    for entry in results:
                        self.assertEqual(len(entry), 1,
                            "Entry %s in result for %s has length != 1" \
                                % (repr(entry), repr(entity)) \
                            + ' (reading %s, dialect %s)' \
                            % (reading, dialect))
Ejemplo n.º 34
0
 def setUp(self):
     """Run ``CharacterLookupTest.setUp`` and create a reading factory.

     The factory is built on ``self.db`` and stored as ``self.f``.
     """
     CharacterLookupTest.setUp(self)
     self.f = ReadingFactory(dbConnectInst=self.db)
Ejemplo n.º 35
0
def runTests(tests, databases, registerUnicode, iteration=10):
    """
    Time the dictionary lookup methods for each requested test database.

    :param tests: iterable of test numbers; each number is used as key into
        ``databases`` and ``registerUnicode``
    :param databases: maps a test number to the SQLite database file
    :param registerUnicode: maps a test number to the ``registerUnicode``
        connection option
    :param iteration: number of repetitions handed to ``Timer.timeit``
    :return: ``{test number: {dictionary name: {method name: seconds}}}``
    :raise ValueError: if no dictionary table is found in a database
    """
    factory = ReadingFactory()
    lookupMethods = ('getFor', 'getForHeadword', 'getForReading',
                     'getForTranslation')

    timing = {}
    for no in tests:
        print("Running test %d (reading from %s)..." % (no, databases[no]))

        db = dbconnector.getDBConnector({
            'sqlalchemy.url': 'sqlite:///%s' % databases[no],
            'attach': ['cjklib'],
            'registerUnicode': registerUnicode[no]
        })
        knownTables = [
            cls.DICTIONARY_TABLE
            for cls in dictionary.BaseDictionary.getAvailableDictionaries(db)
        ]
        existingTables = db.engine.table_names(schema=db._mainSchema)
        dictionaries = list(set(knownTables) & set(existingTables))
        if not dictionaries:
            raise ValueError("No dictionaries found")

        print("Found dictionaries '%s'" % "', '".join(dictionaries))

        runTime = {}
        for dictName in dictionaries:
            dictClass = dictionary.BaseDictionary.getDictionaryClass(dictName)
            dictInstance = dictClass(dbConnectInst=db)

            # Stays falsy (no operator class) for dictionaries without reading
            readingName = dictClass.READING
            opClass = readingName and factory.getReadingOperatorClass(
                readingName)
            if hasattr(opClass, 'guessReadingDialect'):
                requestList = [
                    (request, opClass.guessReadingDialect(request))
                    for request in SEARCH_REQUESTS
                ]
            else:
                requestList = [(request, {}) for request in SEARCH_REQUESTS]

            # timeit compiles its statement from source, so the objects under
            # test are made importable through a throw-away module.
            helper = imp.new_module('timeit_runmod')
            helper.runRequest = runRequest
            helper.dictInstance = dictInstance
            helper.requestList = requestList
            sys.modules['timeit_runmod'] = helper

            runTime[dictName] = {
                method: Timer(
                    """timeit_runmod.runRequest(
                                timeit_runmod.dictInstance,
                                timeit_runmod.requestList,
                                method='%s')
                          """ % method, "import timeit_runmod").timeit(iteration)
                for method in lookupMethods
            }

        timing[no] = runTime

    return timing
Ejemplo n.º 36
0
class CharacterLookupReadingMethodsTest(CharacterLookupTest,
                                        unittest.TestCase):
    """
    Runs consistency checks on the reading methods of the
    :class:`~cjklib.characterlookup.CharacterLookup` class.

    .. todo::
        * Impl: include script table from Unicode 5.2.0 to get character ranges
          for Hangul and Kana
    """
    # Maps a reading name to a list of dialect option dicts that are tested
    # in addition to the default dialect ``{}``.
    DIALECTS = {}

    # Maps a reading name to the entities to test when its reading operator
    # class has no ``getReadingEntities``.
    SPECIAL_ENTITY_LIST = {}

    def setUp(self):
        """
        Run the ``CharacterLookupTest`` set-up, then create a
        ``ReadingFactory`` on ``self.db`` and store it as ``self.f``.
        """
        CharacterLookupTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

    def testReadingMappingAvailability(self):
        """
        Test if the readings under
        ``CharacterLookup.CHARARACTER_READING_MAPPING`` are available for
        conversion.
        """
        mapping = self.characterLookup.CHARARACTER_READING_MAPPING

        # mock to simulate availability of all tables in
        #   characterLookup.CHARARACTER_READING_MAPPING
        tables = [table for table, _ in mapping.values()]
        self.characterLookup.db.engine = EngineMock(
            self.characterLookup.db.engine, mockTables=tables)

        for reading in mapping:
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading))
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading))

        # test proper checking for all known readings
        for reading in self.f.getSupportedReadings():
            self.assertIn(
                self.characterLookup.hasMappingForReadingToCharacter(reading),
                (True, False))
            self.assertIn(
                self.characterLookup.hasMappingForCharacterToReading(reading),
                (True, False))

    @attr('slow')
    def testGetCharactersForReadingAcceptsAllEntities(self):
        """
        Test if ``getCharactersForReading`` accepts all reading entities.

        Every lookup has to return a list whose entries all have length 1;
        lookups raising ``UnsupportedError`` or ``ConversionError`` are
        tolerated.
        """
        for reading in self.f.getSupportedReadings():
            if not self.characterLookup.hasMappingForReadingToCharacter(
                    reading):
                continue

            dialects = [{}]
            if reading in self.DIALECTS:
                dialects.extend(self.DIALECTS[reading])

            for dialect in dialects:
                if hasattr(self.f.getReadingOperatorClass(reading),
                           'getReadingEntities'):
                    entities = self.f.getReadingEntities(reading, **dialect)
                elif reading in self.SPECIAL_ENTITY_LIST:
                    entities = self.SPECIAL_ENTITY_LIST[reading]
                else:
                    continue

                # Shared tail of both failure messages.
                context = ' (reading %s, dialect %s)' % (reading, dialect)

                for entity in entities:
                    # Only the lookup itself may raise the tolerated errors,
                    # so keep the assertions outside of the try block.
                    try:
                        results = self.characterLookup.getCharactersForReading(
                            entity, reading, **dialect)
                    except (exception.UnsupportedError,
                            exception.ConversionError):
                        continue

                    self.assertEqual(type(results), type([]),
                        "Method getCharactersForReading() doesn't return"
                        + " a list for entity %s " % repr(entity)
                        + context)

                    for entry in results:
                        self.assertEqual(len(entry), 1,
                            "Entry %s in result for %s has length != 1"
                            % (repr(entry), repr(entity))
                            + context)
Ejemplo n.º 37
0
class BKRS2DB(object):

    """Class to convert BKRS.info dictionary into Pleco database format"""
    def __init__(self, configfile):
        """
        Load the configuration, create the cjklib helper objects, reset all
        statistics and open the log file.

        :param configfile: configuration file name, passed to
            :meth:`get_config`
        """
        super(BKRS2DB, self).__init__()

        #statprof.start()

        # Fills self.params; everything below that reads self.params depends
        # on this call having succeeded.
        self.get_config(configfile)
        self.comma_symbols = [u',', u'﹐', ',']
        self.BUFFER_SIZE = 10000
        self.buffer_index = 0

        self.read_fab = ReadingFactory()
        self.cedict = CEDICT()
        self.cjk = characterlookup.CharacterLookup('T')
        self.pinyinOp = self.read_fab.createReadingOperator('Pinyin')
        self.charInfo =  cjknife.CharacterInfo()
        # Details of the most recent conversion failure.
        self.last_error = {'description':'', 'match':'', 'not_match': ''}
        self.bad_word_index = 0
        # hanzi -> list of extra readings (see load_additional_pronounces)
        self.additional_reading = {}
        # hanzi -> {'hanzi', 'count', 'error'} (see stat_add_hanzi)
        self.hanzi_stat = {}
        # hanzi -> frequency (see load_character_frequency)
        self.hanzi_freq = {}
        # hanzi -> cached list of readings (see get_pron_variants)
        self.hanzi_pron_var = {}
        # Error key -> description text (Russian runtime strings).
        self.errors_description = {
            'pinyin_not_match':'Не совпадает', 
            'no_pinyin':'Нет чтения',
            'pinyin_have_tag_symbol':'В пиньине теги', 
            'pinyin_have_bad_symbol':'В пиньине плохие символы', 
            'pinyin_have_rus_letter':'В пиньине русские буквы',
            'pinyin_have_number_symbol':'В пиньине цифры',
            'word_have_alpha_symbol':'В слове alfa символы'
        }

        self.log_file = open(self.params['log_file'], 'w', 1000)
        # NOTE(review): self.pleco only exists when write_to_pleco_db is set,
        # but export() calls self.pleco.remove_html_tags() unconditionally.
        if self.params['write_to_pleco_db']:
            self.pleco = Pleco(self.params['output_pleco_database_file'], self)
        # Falsy placeholder until export() opens the real file; convert_pinyin
        # checks it before writing.
        self.bad_hanzi_list = False
        
    def export(self):
        """
        Convert the BKRS input file and write the configured outputs.

        Opens the bad-word report files (and the SQLite database if
        ``write_to_db`` is set), loads the character frequencies and optional
        additional readings, then reads the input file line by line. A blank
        line resets the state to 'word'; the following lines not starting
        with '#' are taken as word, pronunciation and translation in turn.
        Each complete entry is filtered and converted, and accepted entries
        are written to the Pleco and/or SQLite database depending on
        ``self.params``. Finally summary statistics are logged, indexes are
        created and all files are closed.
        """
        if self.params['write_to_db']:
            self.conn = sqlite3.connect(self.params['output_database_file'])
            self.cursor = self.conn.cursor()

        self.bad_words_file = open(self.params['bad_words_file'], 'w', 1000)
        self.bad_words_list = open(self.params['bad_words_list'], 'w', 100)
        self.bad_hanzi_list = open(self.params['bad_hanzi_list'], 'w', 100)
        self.start_bad_words_file()
        
        self.log('Start of export. Input: '+self.params['input_bkrs_file']+', output: '+self.params['output_pleco_database_file'])
        self.start_time = time.time()
 
        if self.params['write_to_db']:
            self.create_db()
        self.dic = open(self.params['input_bkrs_file'], mode='r')
        # Parser state: which kind of line is expected next.
        line_type = ''
        word = ''
        pronounce = ''
        translate = ''
        # Counters for the summary logged after the loop.
        word_index = 0
        good_words = 0
        have_no_rus_translate = 0
        bad_word_not_found_pron_variant = 0
        pinyin_have_number_symbol = 0
        self.ambiguous_decomposition = 0
        pinyin_have_tag_symbol = 0
        # NOTE(review): never incremented in this method, so the summary
        # always reports 0 for it.
        num_pinyin_have_tone_mark = 0
        no_pron_symbols_in_pinyin = 0

        self.load_character_frequency()

        if self.params['additional_pronounces_file']:
            self.load_additional_pronounces()

        self.flog('Start work with BKRS data file...')
        for line in self.dic:
            if line == '\n':
                # A blank line separates entries: the next line is a word.
                line_type = 'word'
            else:
                if not line.startswith('#'):
                    if line_type == 'word':
                        word_index = word_index+1
                        if self.params['show_progress']:
                            self.show_progress(word_index, self.params['to_word_number'])
                        word = (line[:-1]).strip().decode('utf-8')
                        self.stat_words_hanzi(word)
                        #word = self.join_nonprintable_hanzi(word) # 鱼岁 = 鱥
                        line_type = 'pronounce'
                    elif line_type == 'pronounce':
                        # line[1:-1] drops the first and the last character
                        # of the line before stripping.
                        pronounce = (line[1:-1]).strip().decode('utf-8')
                        line_type = 'translate'
                    elif line_type == 'translate':
                        # Only entries inside the configured word number
                        # window are processed; to_word_number <= 0 means no
                        # upper limit.
                        if word_index <= self.params['from_word_number']:
                            continue
                        if self.params['to_word_number'] > 0:
                            if word_index >= self.params['to_word_number']:
                                break

                        translate = (line[1:-1]).strip().decode('utf-8')
                        translate_with_tags = translate
                        # NOTE(review): self.pleco is only created in
                        # __init__ when write_to_pleco_db is set - confirm
                        # this cannot be reached without it.
                        translate_pleco = self.pleco.remove_html_tags(translate)
                        word_info = ' line #'+str(word_index*4+1)+' word #'+str(word_index)+' word: '+word+' pinyin: '+pronounce
                        pronounce = self.filter_pinyin(pronounce)

                        # Reject entries whose pinyin field is malformed; each
                        # rejection is logged and added to the bad words list.
                        if self.have_rus_letters(pronounce):
                            self.log('Warning: pinyin have russian letters'+word_info)
                            self.log_bad_word(word, pronounce, 'pinyin_have_rus_letter', translate_with_tags, word_index)
                            self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n')
                            continue

                        if self.have_number_symbol(pronounce) and not self.have_number_symbol(word):
                            pinyin_have_number_symbol += 1
                            self.log('Pinyin have tone number '+word_info)
                            self.log_bad_word(word, pronounce, 'pinyin_have_number_symbol', translate_with_tags, word_index)
                            self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n')
                            continue

                        if self.have_tag_symbol(pronounce):
                            pinyin_have_tag_symbol +=1
                            self.log('Warning: pinyin have tag symbols '+word_info)
                            self.log_bad_word(word, pronounce, 'pinyin_have_tag_symbol', translate_with_tags, word_index)
                            self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n')
                            continue
                            
                        # Entries without a Russian translation are counted
                        # and skipped silently.
                        if not self.have_rus_letters(translate_pleco):
                            have_no_rus_translate += 1
                            continue

                        if self.have_pron_symbol(pronounce):
                            # Match the pinyin against the word's characters;
                            # a falsy result means no reading variant fitted.
                            ob_pronounce = self.convert_full_pinyin(word, pronounce)
                            pronounce_numeric_tone = self.get_string_pron(ob_pronounce)
                            if not pronounce_numeric_tone:
                                self.log('Error not found pronounce variant'+word_info)
                                bad_word_not_found_pron_variant += 1
                                # Characters with no known reading at all are
                                # reported elsewhere (bad_hanzi_list), not as
                                # bad words.
                                if self.last_error['description'] != 'HANZI_WITH_NO_PRON':
                                    if self.translate_have_rus(translate_with_tags):
                                        if self.have_lat_letters_or_numbers(word):
                                            self.log_bad_word(word, pronounce, 'word_have_alpha_symbol', translate_with_tags, word_index)
                                        else:
                                            self.log_bad_word(word, pronounce, 'pinyin_not_match', translate_with_tags, word_index)
                                        self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n')              
                                continue
                        else:
                            #if self.translate_have_rus(translate_with_tags):
                            #    self.log_bad_word(word, pronounce, 'no_pinyin', translate_with_tags, word_index)
                            no_pron_symbols_in_pinyin += 1
                            continue

                        # Logged as a bad word, but the entry is still written.
                        if self.pinyin_have_bad_symbol(pronounce):
                            self.log_bad_word(word, pronounce, 'pinyin_have_bad_symbol', translate_with_tags, word_index)

                        trad_word = self.get_trad(word)

                        if self.params['write_to_pleco_db']:
                            self.pleco.write_db(word, trad_word, pronounce_numeric_tone, translate_pleco)
                            self.pleco.create_db_word_index(ob_pronounce, len(word))
                        if self.params['write_to_db']:
                            freq = self.get_word_freq(word)
                            self.write_db(trad_word, word, pronounce_numeric_tone, translate, freq)
                        self.clear_last_error()
                        good_words += 1

        # NOTE(review): the percentages divide by word_index, which is 0 when
        # the input contained no word lines (ZeroDivisionError) - confirm
        # whether empty input has to be supported.
        self.flog('OK.. ###################################################################################')
        self.flog('Count of words:\t\t\t\t\t'+str(word_index))
        self.flog('Good words:\t\t\t\t\t\t'+str(good_words)+'\t\t('+str(round(float(good_words)*100/word_index,2))+'%)')
        self.flog('Have no rus translate:\t\t\t'+str(have_no_rus_translate)+'\t\t('+str(round(float(have_no_rus_translate)*100/word_index,2))+'%)')
        self.flog('Not found pronounce variant:  \t'+str(bad_word_not_found_pron_variant)+'\t\t('+str(round(float(bad_word_not_found_pron_variant)*100/word_index,2))+'%)')
        self.flog('Numeric pinyin have tone mark:\t'+str(num_pinyin_have_tone_mark)+'\t\t('+str(round(float(num_pinyin_have_tone_mark)*100/word_index,2))+'%)')
        self.flog('Pinyin field have tone number:\t'+str(pinyin_have_number_symbol)+'\t\t('+str(round(float(pinyin_have_number_symbol)*100/word_index,2))+'%)')
        self.flog('Pinyin pinyin have tag symbol:\t'+str(pinyin_have_tag_symbol)+'\t\t('+str(round(float(pinyin_have_tag_symbol)*100/word_index,2))+'%)')
        self.flog('Pinyin have no pron symbols:  \t'+str(no_pron_symbols_in_pinyin)+'\t\t('+str(round(float(no_pron_symbols_in_pinyin)*100/word_index,2))+'%)')
        self.log_hanzi_stat()

        if self.params['write_to_pleco_db']:
            self.pleco.create_db_index()
            self.pleco.conn.commit()
            self.pleco.conn.close()

        if self.params['write_to_db']:
            self.create_db_index()
            self.conn.commit()
            self.conn.close()

        # NOTE(review): the files opened in this method are only closed on
        # this success path; an exception above leaves them open.
        self.dic.close()
        self.end_time = time.time()
        self.flog('End of export. Total time: '+str(round(self.end_time - self.start_time ,2))+' sec')
        
        self.end_bad_words_file()
        self.bad_words_list.close()
        self.bad_hanzi_list.close()
        self.bad_words_file.close()

        #statprof.stop()
        #statprof.display()
        
    def __del__(self):
        self.log_file.close()

    def get_config(self, configfile):
        """
        Load the converter settings from an INI file into ``self.params``.

        The parser object is kept in ``self.config``. Values are read from
        the sections ``Main``, ``Input files`` and ``Output files``;
        ``log_console`` is not read from the file and always set to False.

        :param configfile: name of the configuration file, resolved relative
            to the directory containing this module
        """
        self.config = ConfigParser.ConfigParser()
        # os.path.join instead of string concatenation: dirname(__file__) is
        # '' when the module is started from its own directory, and
        # ''+'/'+configfile would point to the file system root.
        configPath = os.path.join(os.path.dirname(__file__), configfile)
        self.config.read(configPath)

        config = self.config
        params = {}
        self.params = params

        params['write_to_db'] = config.getboolean('Main', 'write_to_db')
        params['write_to_pleco_db'] = config.getboolean(
            'Main', 'write_to_pleco_db')
        params['show_progress'] = config.getboolean('Main', 'show_progress')
        params['approx_count_of_words'] = config.getint(
            'Main', 'approx_count_of_words')
        params['from_word_number'] = config.getint('Main', 'from_word_number')
        params['to_word_number'] = config.getint('Main', 'to_word_number')
        params['log_console'] = False

        params['input_bkrs_file'] = config.get('Input files', 'bkrs_db')
        params['additional_pronounces_file'] = config.get(
            'Input files', 'additional_pronounces')
        params['char_freq_file'] = config.get('Input files', 'char_freq')
        params['log_template'] = config.get('Input files', 'log_template')

        params['output_pleco_database_file'] = config.get(
            'Output files', 'pleco_db')
        params['output_database_file'] = config.get(
            'Output files', 'sqlite_db')
        params['log_file'] = config.get('Output files', 'log_file')
        params['bad_words_file'] = config.get('Output files', 'bad_words_html')
        params['bad_words_list'] = config.get('Output files', 'bad_words_list')
        params['bad_hanzi_list'] = config.get('Output files', 'bad_hanzi_list')
        params['frequency_file'] = config.get('Output files', 'frequency')

    def log_hanzi_stat(self):
        """
        Log the character statistics and write the frequency file.

        Logs the total and unique character counts taken from
        ``self.hanzi_stat``, followed by the 100 characters with the most
        errors. Afterwards every character is written, ordered by descending
        count, as ``<hanzi>\\t<count>`` to ``self.params['frequency_file']``.
        """
        hanzilist = list(self.hanzi_stat.values())

        uniquehanzi = len(hanzilist)
        allhanzi = sum(entry['count'] for entry in hanzilist)

        self.log('Hanzi statistic ################################################################################')
        self.log('Total hanzi:  '+str(allhanzi))
        self.log('Unique hanzi: '+str(uniquehanzi))
        self.log('Top 100 error hanzi ############################################################################')
        hanzilist.sort(key=lambda x: x['error'], reverse = True)
        # Slice to exactly 100 entries; the former counter loop logged 101.
        for hanzi in hanzilist[:100]:
            self.log('Hanzi: '+hanzi['hanzi']+' \t Count: '+str(hanzi['count'])+'\t\tError: '+str(hanzi['error']), with_time = False)
        self.log('Hanzi frequency ##################################################################################')
        hanzilist.sort(key=lambda x: x['count'], reverse = True)

        # The context manager closes the file even if a write fails.
        with open(self.params['frequency_file'], 'w', 100) as frequency_file:
            for hanzi in hanzilist:
                frequency_file.write(hanzi['hanzi'].encode('utf-8')+'\t'+str(hanzi['count'])+'\n')

    def get_string_pron(self, ob_pron):
        """
        Render converted pronunciations as a single string.

        :param ob_pron: list of pronunciations, each of them a list of
            ``(hanzi, num_pinyin, sep)`` tuples; may be empty or falsy
        :return: for each pronunciation the concatenation of all
            ``num_pinyin + sep``, the pronunciations joined with ``', '`` and
            stripped of surrounding whitespace; ``''`` for falsy input
        """
        if not ob_pron:
            return ''
        rendered = [
            ''.join(num_pinyin + sep for _, num_pinyin, sep in pron)
            for pron in ob_pron
        ]
        return ', '.join(rendered).strip()

    def convert_full_pinyin(self, hanziword, pinyin):
        """
        Match every comma separated pinyin variant against the word.

        :param hanziword: the word as found in the dictionary
        :param pinyin: pinyin field, possibly holding several variants
            separated by ','
        :return: list with one :meth:`convert_pinyin` result per variant that
            could be matched (possibly empty), or ``False`` if the word has
            no characters left after filtering
        """
        clean_hanzi = self.filter_hanzi(hanziword)
        pinyin = self.filter_pinyin(pinyin)

        # A comma inside the word is not a variant separator in the pinyin.
        if self.hanziword_have_comma(hanziword):
            pinyin = self.replace_comma(pinyin, rep = ' ')

        variants = pinyin.split(',')

        if not len(clean_hanzi):
            self.log('Error Hanzi word length is zero! Hanzi: '+hanziword+' clean Hanzi: '+clean_hanzi)
            self.last_error['description'] = 'NOT_CHINESE_CHARS'
            return False

        matched_variants = []
        for variant in variants:
            # Try both candidate orderings before giving up on a variant.
            result = (
                self.convert_pinyin(clean_hanzi, variant, reverse_sort = True)
                or self.convert_pinyin(clean_hanzi, variant, reverse_sort = False)
            )
            if result:
                matched_variants.append(result)

        return matched_variants

    def convert_pinyin(self, clean_hanzi, pinyin, reverse_sort = True):
        """
        Consume ``pinyin`` from the left, one syllable per character.

        :param clean_hanzi: the characters of the word
        :param pinyin: pinyin string for the whole word
        :param reverse_sort: passed on to :meth:`get_pron_variants_mixed`
        :return: list of ``(hanzi, num_pron, sep)`` tuples, or ``False`` if a
            character has no known reading or none of its candidate readings
            starts the remaining pinyin; details are left in
            ``self.last_error``
        """
        original_pinyin = pinyin
        matched = []
        for hanzi in clean_hanzi:
            known_readings = self.get_pron_variants(hanzi)
            candidates = self.get_pron_variants_mixed(hanzi, reverse_sort)

            if not known_readings:
                self.log('No fonded any pronounciation for hanzi: '+hanzi)
                self.last_error['description'] = 'HANZI_WITH_NO_PRON'
                self.last_error['hanzi'] = hanzi
                if self.bad_hanzi_list:
                    self.bad_hanzi_list.write(hanzi.encode('utf-8')+'\n')
                return False

            for candidate in candidates:
                if pinyin.startswith(candidate):
                    parts = self.split_pinyin(hanzi, candidate, pinyin)
                    pinyin = parts['pinyin']
                    matched.append((hanzi, parts['num_pron'], parts['sep']))
                    break
            else:
                # No candidate matched the start of the remaining pinyin.
                self.stat_add_hanzi_error(hanzi)
                matched_str = ' '.join('['+h+':'+p+']' for h, p, _ in matched)
                readings_str = ' '.join(s for s in known_readings)
                self.log('Not found pron for hanzi: '+hanzi+' ['+ readings_str+'] P1:'+original_pinyin+' P2:'+pinyin+' '+matched_str)
                self.last_error['match'] = matched_str
                self.last_error['not_match'] = hanzi+' ['+ readings_str+']'
                return False

        return matched


    def split_pinyin(self, hanzi, pron_var, pinyin):
        """
        Cut one matched syllable off the pinyin string.

        :param hanzi: the character the syllable belongs to
        :param pron_var: the syllable; its first occurrence is removed
        :param pinyin: pinyin string to cut the syllable from
        :return: dict with ``num_pron`` (the syllable, converted with
            :meth:`get_numeric_tone` unless it has no tone mark and the
            character is a single Latin or Greek letter), ``sep`` (the
            separator that followed, or ``''``) and ``pinyin`` (the rest)
        """
        remainder = pinyin.replace(pron_var, '', 1)

        separator = next(
            (sep for sep in (u' ', u'’') if remainder.startswith(sep)), '')
        if separator:
            remainder = remainder.lstrip(separator)

        keep_as_is = (not self.have_tone_mark(pron_var)
                      and re.match(u'^[a-zA-Zα-ωΑ-Ω]$',hanzi))
        if keep_as_is:
            num_pron = pron_var
        else:
            num_pron = self.get_numeric_tone(pron_var)

        return {'num_pron':num_pron,'sep':separator, 'pinyin':remainder}

    def get_pron_variants_mixed(self, hanzi, reverse_sort):
        all_pron_variants = self.get_pron_variants(hanzi)
        if re.match('^[0-9]$',hanzi):
            pron_variants = all_pron_variants
        else:
            pron_variants = self.get_with_mixed_tones(all_pron_variants, reverse_sort)
        return pron_variants

    def get_pron_variants(self, hanzi):
        if hanzi in self.hanzi_pron_var:
            return self.hanzi_pron_var[hanzi]

        pron_variants = []
        try:
            pron_variants = pron_variants + self.cjk.getReadingForCharacter(hanzi, 'Pinyin')
        except:
            self.log('Error: getReadingForCharacter. Hanzi: '+hanzi)
        if hanzi in self.additional_reading:
            pron_variants = pron_variants + self.additional_reading[hanzi]

        unique_pron_vars = unique_list(pron_variants, lambda x: x.lower().strip())
        self.hanzi_pron_var[hanzi] = unique_pron_vars
        return unique_pron_vars

    def stat_add_hanzi(self, hanzi):
        if hanzi in self.hanzi_stat:
            self.hanzi_stat[hanzi]['count'] += 1
        else:
            self.hanzi_stat[hanzi] = {'hanzi': hanzi, 'count':1, 'error':0}

    def stat_add_hanzi_error(self, hanzi):
        if hanzi in self.hanzi_stat:
            self.hanzi_stat[hanzi]['error'] += 1
        else:
            self.hanzi_stat[hanzi] = {'hanzi': hanzi, 'count':1, 'error':1}

    def stat_words_hanzi(self, word):
        for hanzi in word:
            self.stat_add_hanzi(hanzi)

    def get_with_mixed_tones(self, pron_var_list, reverse_sort = True):
        """
        Extend the readings with all tone variants of each of them.

        For every reading the toneless form (``get_without_tone_mark``), and
        everything ``get_all_tones`` returns for it, is added. The combined
        list is de-duplicated with ``unique_list`` and sorted by length.

        :param pron_var_list: list of readings
        :param reverse_sort: sort longest first if true, shortest first
            otherwise
        :return: the sorted list of candidate readings
        """
        tone_variants = []
        for reading in pron_var_list:
            toneless = self.get_without_tone_mark(reading)
            variants = self.get_all_tones(toneless)
            variants.append(toneless)
            tone_variants = tone_variants + variants

        if tone_variants:
            candidates = pron_var_list + tone_variants
        else:
            candidates = pron_var_list

        candidates = unique_list(candidates)
        candidates.sort(key=len, reverse = reverse_sort)
        return candidates


    def load_additional_pronounces(self):
        """
        Load extra readings into ``self.additional_reading``.

        ``self.params['additional_pronounces_file']`` holds a comma separated
        list of file names. Every line of such a file has the form
        ``<hanzi>[,<hanzi>...]\\t<reading>[,<reading>...]``. Readings of a
        character that is already known are merged with ``unique_list``
        (compared lower-cased). Lines without a tab are skipped, in the same
        way :meth:`load_character_frequency` does.
        """
        self.flog('Start loading additional reading database...')
        files = self.params['additional_pronounces_file'].split(',')
        for file_name in files:
            words = 0
            # The context manager closes the file even if a line fails.
            with open(file_name, mode = 'r') as addreadfile:
                for line in addreadfile:
                    # NOTE(review): the search string looks empty here, which
                    # makes this call a no-op; presumably it held a
                    # non-printable character (e.g. a BOM) - confirm against
                    # the original file.
                    line = line.replace('','') # replace one not printable symbol
                    uline = line.strip().decode('utf-8')
                    fields = uline.split('\t')
                    if len(fields) < 2:
                        # Blank or malformed line: nothing to load.
                        continue
                    charhanzilist = fields[0].strip().split(',')
                    readings = fields[1].split(',')
                    for charhanzi in charhanzilist:
                        if charhanzi in self.additional_reading:
                            self.additional_reading[charhanzi] = unique_list(readings + self.additional_reading[charhanzi], lambda x: x.lower())
                        else:
                            self.additional_reading[charhanzi] = readings
                        words += 1
            self.flog('Additional reading database loaded from '+file_name+'. Count of hieroglyph: '+str(words))

    def load_character_frequency(self):
        """
        Load the character frequencies into ``self.hanzi_freq``.

        Reads ``self.params['char_freq_file']``. Every line containing a tab
        is taken as ``<hanzi>\\t<frequency>``; other lines are ignored.
        Frequencies of zero or below are stored as 1.
        """
        self.flog('Start loading character frequency...')
        words = 0
        # The context manager makes sure the file gets closed.
        with open(self.params['char_freq_file'], mode = 'r') as freqfile:
            for line in freqfile:
                # strip() already removes the line end. Cutting off the last
                # character first would drop a digit of the frequency when
                # the final line has no trailing newline.
                uline = line.strip().decode('utf-8')
                if '\t' in uline:
                    fields = uline.split('\t')
                    charhanzi = fields[0].strip()
                    freq = int(fields[1])
                    if freq <= 0:
                        freq = 1
                    self.hanzi_freq[charhanzi] = freq
                    words += 1
        self.flog('Characters frequency loaded. Hanzi count: '+str(words))

    def get_hanzi_freq(self, hanzi):
        try:
            freq = self.hanzi_freq[hanzi]
        except KeyError:
            freq = 1 
        return freq  

    def get_word_freq(self, word): 
        freq = 0
        length = len(word)
        if not length:
            return 0
        for hanzi in word:
            freq += self.get_hanzi_freq(hanzi)
        freq = int(freq/length)
        return freq   

    def filter_hanzi(self, hanziword):
        # Unicode blocks for Chinese, Japanese and Korean:
        #{InCJK_Compatibility}: U+3300–U+33FF
        #{InCJK_Unified_Ideographs_Extension_A}: U+3400–U+4DBF 
        #{InCJK_Unified_Ideographs}: U+4E00–U+9FFF
        #{InCJK_Compatibility_Ideographs}: U+F900–U+FAFF
        #{InCJK_Compatibility_Forms}: U+FE30–U+FE4F 

        clean_word = ''
        for char in re.findall(ur'[0-9a-zA-Zα-ωΑ-Ω\u3300-\u33FF\u3400-\u4DBF\u4e00-\u9fff\uF900-\uFAFF\uFE30-\uFE4F]+', hanziword):
            clean_word = clean_word+char
        clean_word = self.replace_comma(clean_word, rep = '')
        return clean_word
Ejemplo n.º 38
0
 def setUp(self):
     """
     Run the ``CharacterLookupTest`` set-up, then create a
     ``ReadingFactory`` on ``self.db`` and store it as ``self.f``.
     """
     CharacterLookupTest.setUp(self)
     self.f = ReadingFactory(dbConnectInst=self.db)
Ejemplo n.º 39
0
class LeoDownloader(AudioDownloader):
    """Download audio from LEO"""
    def __init__(self):
        """Set up the URL template, language tables and icon cache."""
        AudioDownloader.__init__(self)
        self.file_extension = u'.mp3'
        self.url = 'http://www.leo.org/dict/audio_{language}/{word}.mp3'
        # And, yes, they use ch for Chinese.
        # (I'm not sure if they really have anything for ru or it.)
        self.language_dict = {
            'de': 'de',
            'en': 'en',
            'es': 'es',
            'fr': 'fr',
            'it': 'it',
            'ru': 'ru',
            'zh': 'ch'
        }
        # It kind of looks like they have Swiss pronunciations, but they
        # don't.
        self.chinese_code = 'ch'
        # We should keep a number of site icons handy, with the right
        # flag for the request.
        self.site_icon_dict = {}
        self.site_file_name_encoding = 'ISO-8859-1'
        self.icon_url_dict = {
            'de': 'http://dict.leo.org/favicon.ico',
            'en': 'http://dict.leo.org/favicon.ico',
            'es': 'http://dict.leo.org/favicon_es.ico',
            'fr': 'http://dict.leo.org/favicon_fr.ico',
            'it': 'http://dict.leo.org/favicon_it.ico',
            'ru': 'http://dict.leo.org/favicon_ru.ico',
            # When we use this dict, we have already munged the 'zh' to 'ch'
            'ch': 'http://dict.leo.org/favicon_ch.ico'
        }
        # As the name implies, a hack. Try to use the cjklib TTEMPÉ
        # brings along. A system-wide installed one should work as
        # well.
        self.have_tried_cjklib_hack = False
        self.reading_factory = None

    def download_files(self, word, base, ruby, split):
        """
        Download a word from LEO

        We try to get pronunciations for the text for German, English,
        Spanish, French, Italian and Russian, and from the ruby for
        Chinese. There may not be any pronunciations available for
        Italian or Russian.

        The downloaded file is recorded in self.downloads_list as a
        (path, file name, extra info dict) tuple.  Nothing is downloaded
        for Chinese when split is false.

        Raises KeyError when the first two letters of self.language are
        not a key of self.language_dict, and ValueError (from set_names)
        when there is nothing to download.
        """
        self.downloads_list = []
        # Fix the language. EAFP.
        self.language = self.language_dict[self.language[:2].lower()]
        # set_names also checks the language.
        self.set_names(word, base, ruby)
        if self.chinese_code == self.language and not split:
            return
        # Only get the icon when we have a word
        # self.maybe_get_icon()
        self.get_flag_icon()
        # EAFP. self.query_url may return None...
        word_url = self.query_url(word, ruby)
        # ... then the get_data will blow up
        word_data = self.get_data_from_url(word_url)
        word_file_path, word_file_name = self.get_file_name()
        with open(word_file_path, 'wb') as word_file:
            word_file.write(word_data)
        # We have a file, but not much to say about it.
        self.downloads_list.append(
            (word_file_path, word_file_name, dict(Source='Leo')))

    def query_url(self, word, ruby):
        """
        Build query URL

        For Chinese the ruby (pinyin), run through fix_pinyin, is used
        in place of the word.  The text is encoded with
        self.site_file_name_encoding and URL-quoted before it is put
        into the URL template.
        """
        if self.chinese_code == self.language:
            word = self.fix_pinyin(ruby)
        return self.url.format(language=self.language,
                               word=urllib.quote(
                                   word.encode(self.site_file_name_encoding)))

    def fix_pinyin(self, pinyin):
        """
        Convert pinyin to the numbered-tone form used in LEO file names.

        cjklib does the conversion to tone numbers; afterwards every
        '5' is replaced by '0'.  When cjklib cannot be imported the
        pinyin is returned unchanged.
        """
        # Hacks. It is overkill to ship cjklib with this add-on. But
        # to get the tone numbers as numbers, we should use it. My
        # hope (guess) is that the typical user that will want Chinese
        # pronunciations will also have TTEMPÉ's (version of mine)
        # chinese-support-plugin installed. So try to use that and
        # don't complain if it doesn't work.
        if not self.have_tried_cjklib_hack:
            try:
                # If this works, the whole shebang is run as an Anki2
                # add-on. If not, we will still look for a system-wide
                # cjklib, but obviously not for another add-on.
                from aqt.utils import isWin
            except Exception:
                # Best effort: not running inside Anki is fine.  (Unlike
                # a bare except this lets KeyboardInterrupt/SystemExit
                # through.)
                pass
            else:
                from aqt import mw
                addon_dir = mw.pm.addonFolder()
                if isWin:
                    # The isWin bit is copied from TTEMPÉ's code.
                    addon_dir = addon_dir.encode(sys.getfilesystemencoding())
                sys.path.append(os.path.join(addon_dir, "chinese"))
            # Remember that the path hack has run, so later calls do not
            # append the same directory to sys.path again.
            self.have_tried_cjklib_hack = True
        if not self.reading_factory:
            try:
                from cjklib.reading import ReadingFactory
            except ImportError:
                return pinyin
            else:
                self.reading_factory = ReadingFactory()
        return self.reading_factory.convert(pinyin,
                                            'Pinyin',
                                            'Pinyin',
                                            targetOptions={
                                                'toneMarkType': 'numbers'
                                            }).replace('5', '0')

    def get_flag_icon(self):
        """
        Set self.site_icon to the right icon.

        We should use different icons, depending on the request
        language.  We store these icons in self.site_icon_dict and
        download an icon from its entry in self.icon_url_dict the first
        time a language is requested.  Does nothing without PyQt.
        """
        if not with_pyqt:
            return
        try:
            # If this works we already have it.
            self.site_icon = self.site_icon_dict[self.language]
        except KeyError:
            # We have to get it ourself. (We know it's just 16x16, so
            # no resize. And we know the address).
            self.site_icon_dict[self.language] = \
                QImage.fromData(self.get_data_from_url(
                    self.icon_url_dict[self.language]))
            self.site_icon = self.site_icon_dict[self.language]

    def set_names(self, text, base, ruby):
        """
        Set the display text and file base name variables.

        For Chinese both are built from base and ruby; for every other
        language the text itself is used.  Raises ValueError when the
        value needed for the current language (ruby or text) is empty.
        """
        if self.language == self.chinese_code:
            if not ruby:
                raise ValueError('Nothing to download')
            self.base_name = u"{0}_{1}".format(base, ruby)
            self.display_text = u"{1} ({0})".format(base, ruby)
        else:
            if not text:
                raise ValueError('Nothing to download')
            self.base_name = text
            self.display_text = text
Ejemplo n.º 40
0
    def __init__(self, **options):
        """Pass ``options`` on to the parent initialiser and create the
        ``ReadingFactory`` stored on ``self.readingFactory``.
        """
        parent = super(ChineseLessonsComMandarinPronunciation, self)
        parent.__init__(**options)

        self.readingFactory = ReadingFactory()
Ejemplo n.º 41
0
    def handle_noargs(self, **options):
        # 一事無成 一事无成 [yi1 shi4 wu2 cheng2] /to have achieved nothing/to be a total failure/to get nowhere/

        # EMPTY ALL ZH + PY KEYS
        self._del_keys('ZH:*')
        self._del_keys('PY:*')
        
        # NOW LETS START
        file = open(settings.DICT_FILE_LOCATION)
        item_count = 0
        for line in file:
            if line.startswith("#"):
                pass
            else:
                
                # OPEN REDIS CONNECTION NOW
                r_server = _get_redis()
                
                # GATHER ALL THE MAIN VARIABLES
                new = line.split()
                numbered_pinyin = line[(line.index('[')+1):(line.index(']'))]
                f = ReadingFactory()
                tonal_pinyin =  f.convert(numbered_pinyin, 'Pinyin', 'Pinyin',
                    sourceOptions={'toneMarkType': 'numbers', 'yVowel': 'v',
                    'missingToneMark': 'fifth'})
                meanings = line[(line.index('/')+1):(line.rindex('/'))]               
                characters = new[1]
                
                # REMOVE ALL THE UGLY CHARACTERS
                if ',' in characters:
                    characters = characters.replace(',', '')
                
                
                # GET AND CLEAN THE MEASURE WORD
                mws = None
                if "CL:" in meanings:
                    new_meanings = meanings.split('/')
                    for idx, val in enumerate(new_meanings):
                        if "CL:" in val:
                            mws = []
                            for x in val.replace('CL:', '').split(','):
                                
                                x = x[:(x.index('['))]
                                if '|' in x:
                                    x = x[(x.index('|')+1):]
                                    
                                    
                                # ADD THE MEAASURE WORDS ENTRY
                                # ----------------------------
                                mws_key = settings.MEASURE_WORD_KEY % x   
                                if r_server.exists(mws_key):
                                    values = json.loads(_search_redis(mws_key))
                                    values['chars'].append(characters)
                                else:
                                    values = {'chars': [characters,]}
                                r_server.set(mws_key, json.dumps(values))                                
                                    
                                mws.append(x)
                            
                            
                            
                            new_meanings.pop(idx)
                    meanings = "/".join(new_meanings)
                

                    
                    
                    
                
                
                
                char_key = settings.CHINESE_WORD_KEY % ((len((characters))/3), characters)                 
                
                # CREATE THE PRONUNCIATION/MEANING PAIR
                pair = {}
                pair['pinyin'] = tonal_pinyin
                pair['pinyin_numbered'] = _normalize_pinyin(numbered_pinyin)
                pair['meaning'] = meanings
                pair['measure_words'] = mws
                
                
                
                # ADD THE PINYIN ENTRY
                # --------------------
                
                py_key = settings.PINYIN_WORD_KEY % _pinyin_to_ascii(numbered_pinyin)
                if r_server.exists(py_key):
                    values = json.loads(_search_redis(py_key))
                    if smart_unicode(characters) not in values:
                        values.append(characters)
                else:
                    values = [characters,]
                
                r_server.set(py_key, json.dumps(values))                    
    
    
    
    
                # ADD THE CHINESE CHARACTER ENTRY
                # -------------------------------
                if r_server.exists(char_key):
                    values = json.loads(_search_redis(char_key))
                    values['meanings'].append(pair)
                else:
                    values = {
                        'chars': characters,
                        'meanings': [pair,],
                    }
                    
                r_server.set(char_key, json.dumps(values))
                
                item_count += 1
                print item_count

                
                               
        
        print "%s Chinese items added" % item_count          
        file.close()        
Ejemplo n.º 42
0
def main():
    language, output_encoding = locale.getdefaultlocale()

    if len(sys.argv) == 2:
        modus = sys.argv[1]
        if modus not in modi:
            print "invalid modus, choose one out of: " + ", ".join(modi.keys())
            sys.exit(1)
    else:
        print "give a modus, choose one out of: " + ", ".join(modi.keys())
        sys.exit(1)

    fromReading, toReading, entryFunc, readingOpt = modi[modus]

    initialRules = INITIAL_RULES[(fromReading, toReading)]
    finialRules = FINAL_RULES[(fromReading, toReading)]
    extraSyllables = EXTRA_SYLLABLES[(fromReading, toReading)]

    # entry set
    global entrySet
    entrySet = set()
    # build table and use scheme with almost perfect grouping according to
    #   pronunciation, then use headers to get the initial's and final's
    #   pronunciation.
    op = ReadingFactory().createReadingOperator(fromReading, **readingOpt)

    # get splitted syllables, finals in first row, initials in first column
    for syllable in op.getReadingEntities():
        initial, final = op.getOnsetRhyme(syllable)
        # only apply rules if syllable isn't given an extra mapping in
        #   EXTRA_SYLLABLES
        if not syllable in extraSyllables:
            # check if we have rules
            if initialRules[initial] != None and finialRules[final] != None:
                # check for ambiguous mappings
                if type(initialRules[initial]) == type({}):
                    initialFeatures = initialRules[initial].keys()
                else:
                    initialFeatures = [None]
                if type(finialRules[final]) == type({}):
                    finalFeatures = finialRules[final].keys()
                else:
                    finalFeatures = [None]

                # go through all mappings
                for initialFeature in initialFeatures:
                    for finalFeature in finalFeatures:
                        if initialFeature:
                            targetInitial \
                                = initialRules[initial][initialFeature]
                        else:
                            targetInitial = initialRules[initial]

                        if finalFeature:
                            targetFinal = finialRules[final][finalFeature]
                        else:
                            targetFinal = finialRules[final]

                        entry = entryFunc(syllable, targetInitial, targetFinal,
                            initialFeature, finalFeature)
                        if entry != None:
                            entrySet.add(entry)
            else:
                print >> sys.stderr, ("missing rule(s) for syllable '" \
                    + syllable + "' with initial/final '" + initial + "'/'" \
                    + final + "'").encode(output_encoding)

    # print extra syllables
    for syllable in extraSyllables:
        if extraSyllables[syllable]:
            initialRule, finalRule = extraSyllables[syllable]
            # check for ambiguous mappings
            if type(initialRule) == type({}):
                initialFeatures = initialRule.keys()
            else:
                initialFeatures = [None]
            if type(finalRule) == type({}):
                finalFeatures = finalRule.keys()
            else:
                finalFeatures = [None]

            # go through all mappings
            for initialFeature in initialFeatures:
                for finalFeature in finalFeatures:
                    if initialFeature:
                        targetInitial = initialRule[initialFeature]
                    else:
                        targetInitial = initialRule

                    if finalFeature:
                        targetFinal = finalRule[finalFeature]
                    else:
                        targetFinal = finalRule

                    entry = entryFunc(syllable, targetInitial, targetFinal,
                        initialFeature, finalFeature)
                    if entry != None:
                        entrySet.add(entry)

    notIncludedSyllables = [syllable for syllable in extraSyllables \
        if not extraSyllables[syllable]]
    if notIncludedSyllables:
        print >> sys.stderr, ("Syllables not included in table: '" \
            + "', '".join(sorted(notIncludedSyllables)) + "'")\
            .encode(output_encoding)

    entryList = list(entrySet)
    entryList.sort()
    print "\n".join(entryList).encode(output_encoding)