Beispiel #1
0
    def __init__(self, language, characterDomain=None, databaseUrl=None,
        dbConnectInst=None, ignoreIllegalSettings=False, **options):
        """
        Sets up the character lookup for the given language.

        :param language: key into ``LANGUAGE_CHAR_LOCALE_MAPPING`` and
            ``LANGUAGE_CHAR_DOMAIN_MAPPING``
        :param characterDomain: character domain to use; if empty, the domain
            returned by ``_getCharacterDomain()`` is set instead
        :param databaseUrl: handed to ``getDatabaseConfiguration()`` when no
            connector instance is given
        :param dbConnectInst: database connector instance, preferred over
            databaseUrl
        :param ignoreIllegalSettings: if true, a character domain that is not
            listed for the language is discarded instead of raising
        :param options: accepted, but not used by this method
        :raise ValueError: if characterDomain is not listed for the language
            and ignoreIllegalSettings is false
        """
        if not dbConnectInst:
            dbConnectInst = getDBConnector(
                getDatabaseConfiguration(databaseUrl))

        locale = self.LANGUAGE_CHAR_LOCALE_MAPPING[language]
        # the base class is initialised before the domain is validated, so it
        # gets 'Unicode' as a stand-in when no domain was requested
        initialDomain = characterDomain if characterDomain else 'Unicode'
        CharacterLookup.__init__(self, locale, initialDomain,
            dbConnectInst=dbConnectInst)

        self.language = language

        # validate a requested character domain against the language
        if characterDomain:
            legalDomains = self.LANGUAGE_CHAR_DOMAIN_MAPPING[self.language]
            if characterDomain not in legalDomains:
                if not ignoreIllegalSettings:
                    raise ValueError(
                        "Illegal character domain '%s' for language '%s'"
                        % (characterDomain, self.language))
                characterDomain = None

        # choose a better character domain if none is specified (or left)
        if not characterDomain:
            self.setCharacterDomain(self._getCharacterDomain())

        # non-traditional locales additionally keep a lookup for locale 'T'
        if locale != 'T':
            traditionalLookup = CharacterLookup('T', dbConnectInst=self.db)
            self._characterLookupTraditional = traditionalLookup
Beispiel #2
0
    def __init__(self, databaseUrl=None, dbConnectInst=None):
        """
        Initialises the ReadingFactory.

        Without parameters, default values are assumed for the database
        connection. Connection parameters can be supplied as databaseUrl;
        alternatively an instance of
        :class:`~cjklib.dbconnector.DatabaseConnector` can be handed over as
        dbConnectInst, which wins if both are given.

        :type databaseUrl: str
        :param databaseUrl: database connection setting in the format
            ``driver://user:pass@host/database``.
        :type dbConnectInst: instance
        :param dbConnectInst: instance of a
            :class:`~cjklib.dbconnector.DatabaseConnector`
        """
        # get connector to database, a given instance takes precedence
        self.db = dbConnectInst or dbconnector.getDBConnector(databaseUrl)

        # the object instance cache is shared with all factories using the
        #   same database connection; nothing to do if it already exists
        if self.db in self._sharedState:
            return

        # clear also generates the structure
        self.clearCache()
        # publish default reading operators and converters (only once per
        #   database connection, i.e. when the cache was just created)
        for operatorClass in self.getReadingOperatorClasses():
            self.publishReadingOperator(operatorClass)
        for converterClass in self.getReadingConverterClasses():
            self.publishReadingConverter(converterClass)
Beispiel #3
0
    def __init__(self, databaseUrl=None, dbConnectInst=None):
        """
        Initialises the ReadingFactory.

        The database connection is either taken from dbConnectInst, an
        instance of :class:`~cjklib.dbconnector.DatabaseConnector`, or is
        created from databaseUrl. If both are specified dbConnectInst is
        used; if none is given default values are assumed.

        :type databaseUrl: str
        :param databaseUrl: database connection setting in the format
            ``driver://user:pass@host/database``.
        :type dbConnectInst: instance
        :param dbConnectInst: instance of a
            :class:`~cjklib.dbconnector.DatabaseConnector`
        """
        # get connector to database
        if not dbConnectInst:
            dbConnectInst = dbconnector.getDBConnector(databaseUrl)
        self.db = dbConnectInst

        # create object instance cache if needed, shared with all factories
        #   using the same database connection
        cacheMissing = self.db not in self._sharedState
        if cacheMissing:
            # clear also generates the structure
            self.clearCache()
            # publish default reading operators and converters; this happens
            #   only together with the creation of the cache
            for operatorClass in self.getReadingOperatorClasses():
                self.publishReadingOperator(operatorClass)
            for converterClass in self.getReadingConverterClasses():
                self.publishReadingConverter(converterClass)
Beispiel #4
0
    def loadDatabaseBuilder(self):
        """
        Registers a ``build.DatabaseBuilder`` object with the render thread
        unless the thread already has one.

        The builder is set up with the default options of
        ``EclectusCommandLineBuilder`` and a database connector created for
        ``self.databaseUrl``.
        """
        if self.renderThread.hasObject(build.DatabaseBuilder):
            return

        builderOptions = EclectusCommandLineBuilder.getDefaultOptions()
        configuration = getDatabaseConfiguration(self.databaseUrl)
        connector = getDBConnector(configuration)

        self.renderThread.setObject(build.DatabaseBuilder,
            dbConnectInst=connector, **builderOptions)
Beispiel #5
0
def runTests(tests, databases, registerUnicode, iteration=10):
    """
    Times the dictionary lookup methods for every dictionary found in the
    given test databases.

    :param tests: iterable of test numbers, used as keys into databases and
        registerUnicode
    :param databases: mapping of test number to SQLite database file
    :param registerUnicode: mapping of test number to the ``registerUnicode``
        connection setting
    :param iteration: number of executions handed to ``Timer.timeit()``
    :return: nested dict ``{test number: {dictionary: {method: time}}}``
    :raise ValueError: if a database provides none of the known dictionaries
    """
    factory = ReadingFactory()
    methodNames = ('getFor', 'getForHeadword', 'getForReading',
                   'getForTranslation')
    # statement executed by timeit, the method name is filled in per run
    statementTemplate = """timeit_runmod.runRequest(
                                timeit_runmod.dictInstance,
                                timeit_runmod.requestList,
                                method='%s')
                          """

    timing = {}
    for no in tests:
        print("Running test %d (reading from %s)..." % (no, databases[no]))

        db = dbconnector.getDBConnector({
            'sqlalchemy.url': 'sqlite:///%s' % databases[no],
            'attach': ['cjklib'],
            'registerUnicode': registerUnicode[no]})

        # only dictionaries whose table lives in the main schema are tested
        availableDicts = [dictClass.DICTIONARY_TABLE for dictClass
                          in dictionary.BaseDictionary\
                             .getAvailableDictionaries(db)]
        mainSchemaTables = db.engine.table_names(schema=db._mainSchema)
        dictionaries = list(set(availableDicts) & set(mainSchemaTables))
        if not dictionaries:
            raise ValueError("No dictionaries found")

        print("Found dictionaries '%s'" % "', '".join(dictionaries))

        runTime = {}
        for dictName in dictionaries:
            dictClass = dictionary.BaseDictionary.getDictionaryClass(dictName)
            dictInstance = dictClass(dbConnectInst=db)

            # pair each request with the reading dialect options guessed by
            # the dictionary's reading operator, or with no options at all
            opClass = (dictClass.READING
                       and factory.getReadingOperatorClass(dictClass.READING))
            if hasattr(opClass, 'guessReadingDialect'):
                requestList = [(request, opClass.guessReadingDialect(request))
                               for request in SEARCH_REQUESTS]
            else:
                requestList = [(request, {}) for request in SEARCH_REQUESTS]

            # timeit accesses the objects through an importable module
            mod = imp.new_module('timeit_runmod')
            mod.runRequest = runRequest
            mod.dictInstance = dictInstance
            mod.requestList = requestList
            sys.modules['timeit_runmod'] = mod

            methodTime = {}
            for method in methodNames:
                timer = Timer(statementTemplate % method,
                              "import timeit_runmod")
                methodTime[method] = timer.timeit(iteration)
            runTime[dictName] = methodTime

        timing[no] = runTime

    return timing
Beispiel #6
0
    def __init__(self, dictionary=None, dbConnectInst=None, databaseUrl=None,
        strokeOrderType=None, showAlternativeHeadwords=True, **options):
        """
        Sets up the database connection, the dictionary and the stroke order
        type.

        :param dictionary: name of the dictionary to use; if it is not among
            the available ones the default dictionary for the translation
            language is taken
        :param dbConnectInst: database connector instance, preferred over
            databaseUrl
        :param databaseUrl: handed to ``util.getDatabaseConfiguration()`` when
            no connector instance is given
        :param strokeOrderType: stroke order type to use, if available
        :param showAlternativeHeadwords: stored as attribute of the same name
        :param options: passed on to the dictionary; the key
            ``useExtraReadingInformation`` is additionally stored as attribute
        """
        if not dbConnectInst:
            dbConnectInst = getDBConnector(
                util.getDatabaseConfiguration(databaseUrl))
        self.db = dbConnectInst

        self.showAlternativeHeadwords = showAlternativeHeadwords
        self.useExtraReadingInformation = options.get(
            'useExtraReadingInformation', False)

        self.availableDictionaryNames = getAvailableDictionaryNames(self.db)

        # get dictionary
        if dictionary in self.availableDictionaryNames:
            self._dictionary = getDictionary(dictionary, dbConnectInst=self.db,
                ignoreIllegalSettings=True, **options)
        else:
            translationLanguage = getTranslationLanguage()
            self._dictionary = getDefaultDictionary(translationLanguage,
                dbConnectInst=self.db, ignoreIllegalSettings=True, **options)

        # mirror the dictionary's settings
        charDB = self._dictionary.charDB
        self.dictionary = self._dictionary.PROVIDES
        self.reading = self._dictionary.reading
        self.language = self._dictionary.language
        self.characterDomain = charDB.characterDomain
        self.compatibleCharacterDomains \
            = charDB.getCompatibleCharacterDomains()

        # stroke order
        strokeOrderTypes = self.getAvailableStrokeOrderTypes()
        if strokeOrderType and strokeOrderType in strokeOrderTypes:
            self.strokeOrderType = strokeOrderType
            return

        # don't show BIG_STROKE_ORDER_TYPE twice: neither the type itself nor
        # its name without '.segment' is eligible as default
        bigType = self.BIG_STROKE_ORDER_TYPE
        for excludedType in (bigType, bigType.replace('.segment', '')):
            if excludedType in strokeOrderTypes:
                strokeOrderTypes.remove(excludedType)

        self.strokeOrderType = (strokeOrderTypes[0] if strokeOrderTypes
            else None)
def getAvailableDictionaries(dbConnectInst=None):
    """
    Returns the dictionary classes that report themselves as available for
    the given database connection.

    :type dbConnectInst: instance
    :param dbConnectInst: optional instance of a
        :class:`~cjklib.dbconnector.DatabaseConnector`; if missing a
        connector is requested from ``dbconnector.getDBConnector()``
    :rtype: list of class
    :return: list of dictionary class objects
    """
    if not dbConnectInst:
        dbConnectInst = dbconnector.getDBConnector()

    return [dictionaryClass for dictionaryClass in getDictionaryClasses()
        if dictionaryClass.available(dbConnectInst)]
Beispiel #8
0
def getAvailableDictionaries(dbConnectInst=None):
    """
    Collects all dictionary classes whose ``available()`` check succeeds for
    the given database connection.

    :type dbConnectInst: instance
    :param dbConnectInst: optional instance of a
        :class:`~cjklib.dbconnector.DatabaseConnector`, by default the
        connector returned by ``dbconnector.getDBConnector()``
    :rtype: list of class
    :return: list of dictionary class objects
    """
    connector = dbConnectInst if dbConnectInst \
        else dbconnector.getDBConnector()

    return [candidate for candidate in getDictionaryClasses()
        if candidate.available(connector)]
Beispiel #9
0
def recreateIndex(database, registerUnicode=False):
    """
    Recreates the ``<dictionary>__Reading`` index with ``COLLATE NOCASE`` for
    the CEDICT-style dictionaries found in the given SQLite database.

    :param database: path of the SQLite database file
    :param registerUnicode: value for the ``registerUnicode`` connection
        setting
    """
    db = dbconnector.getDBConnector({
        'sqlalchemy.url': 'sqlite:///%s' % database,
        'attach': ['cjklib'],
        'registerUnicode': registerUnicode})

    # dictionaries that are available and stored in the main schema
    availableTables = set(dictClass.DICTIONARY_TABLE for dictClass
                          in dictionary.BaseDictionary\
                              .getAvailableDictionaries(db))
    mainSchemaTables = set(db.engine.table_names(schema=db._mainSchema))
    dictionaries = availableTables & mainSchemaTables

    for dictName in ['CEDICT', 'CEDICTGR', 'HanDeDict', 'CFDICT']:
        if dictName not in dictionaries:
            continue

        print("Recreating index for '%s'" % dictName)
        try:
            db.execute(text("DROP INDEX %s__Reading" % dictName))
        except OperationalError:
            # best effort, presumably the index does not exist yet
            pass
        db.execute(text(("CREATE INDEX %(dict)s__Reading ON %(dict)s"
                         " ('READING' COLLATE NOCASE)")
                        % {'dict': dictName}))
Beispiel #10
0
def recreateIndex(database, registerUnicode=False):
    """
    Drops and recreates the case insensitive ``<dictionary>__Reading`` index
    for each CEDICT-style dictionary contained in the SQLite database.

    :param database: path of the SQLite database file
    :param registerUnicode: value for the ``registerUnicode`` connection
        setting
    """
    db = dbconnector.getDBConnector({
        'sqlalchemy.url': 'sqlite:///%s' % database,
        'attach': ['cjklib'],
        'registerUnicode': registerUnicode,
    })

    # restrict to available dictionaries with a table in the main schema
    availableTables = set()
    for dictClass in dictionary.BaseDictionary.getAvailableDictionaries(db):
        availableTables.add(dictClass.DICTIONARY_TABLE)
    mainSchemaTables = db.engine.table_names(schema=db._mainSchema)
    dictionaries = availableTables.intersection(mainSchemaTables)

    dropStatement = "DROP INDEX %s__Reading"
    createStatement = ("CREATE INDEX %(dict)s__Reading ON %(dict)s"
                       " ('READING' COLLATE NOCASE)")

    for dictName in ['CEDICT', 'CEDICTGR', 'HanDeDict', 'CFDICT']:
        if dictName in dictionaries:
            print("Recreating index for '%s'" % dictName)
            try:
                db.execute(text(dropStatement % dictName))
            except OperationalError:
                # best effort, presumably the index does not exist yet
                pass
            db.execute(text(createStatement % {'dict': dictName}))
    def _getDecompositionEntriesDict(cls):
        """
        Gets the decomposition table from the database.

        Despite the method's name the entries are returned as a list, in the
        order of the table's C{SubIndex} column.

        @rtype: list of tuple
        @return: list of entries C{(character, glyph, decomposition, flags)}
            with I{decomposition} being the first layer decomposition parsed
            by L{CharacterLookup.decompositionFromString()} and I{flags} the
            set built from the entry's flags
        """
        # get entries from database
        db = dbconnector.getDBConnector()
        table = db.tables['CharacterDecomposition']

        result = db.selectRows(select([table.c.ChineseCharacter,
            table.c.Glyph, table.c.Decomposition, table.c.Flags])\
                .order_by(table.c.SubIndex))

        return [
            (char, glyph,
                CharacterLookup.decompositionFromString(decompString),
                set(flags))
            for char, glyph, decompString, flags in result]
Beispiel #12
0
    def _getDecompositionEntriesDict(cls):
        """
        Gets the decomposition table from the database.

        Despite the method's name the entries are returned as a list, in the
        order of the table's C{SubIndex} column.

        @rtype: list of tuple
        @return: list of entries C{(character, glyph, decomposition, flags)}
            with I{decomposition} being the first layer decomposition parsed
            by L{CharacterLookup.decompositionFromString()} and I{flags} the
            set built from the entry's flags
        """
        # get entries from database
        db = dbconnector.getDBConnector()
        table = db.tables["CharacterDecomposition"]

        columns = [table.c.ChineseCharacter, table.c.Glyph, table.c.Decomposition, table.c.Flags]
        result = db.selectRows(select(columns).order_by(table.c.SubIndex))

        return [
            (char, glyph, CharacterLookup.decompositionFromString(decompString), set(flags))
            for char, glyph, decompString, flags in result
        ]
Beispiel #13
0
    def __init__(self, **options):
        """
        Initialises the BaseDictionary instance.

        Strategies and the entry factory that offer a
        ``setDictionaryInstance()`` method get this instance registered.

        :keyword entryFactory: entry factory instance
        :keyword columnFormatStrategies: column formatting strategy instances
        :keyword headwordSearchStrategy: headword search strategy instance
        :keyword readingSearchStrategy: reading search strategy instance
        :keyword translationSearchStrategy: translation search strategy instance
        :keyword mixedReadingSearchStrategy: mixed reading search strategy
            instance
        :keyword databaseUrl: database connection setting in the format
            ``driver://user:pass@host/database``.
        :keyword dbConnectInst: instance of a :class:`~cjklib.dbconnector.DatabaseConnector`;
            ``None`` is handled like a missing keyword
        """
        # get connector to database; an explicit ``dbConnectInst=None`` falls
        # back to a new connector instead of leaving ``self.db`` set to None
        dbConnectInst = options.get('dbConnectInst')
        if dbConnectInst is not None:
            self.db = dbConnectInst
        else:
            databaseUrl = options.pop('databaseUrl', None)
            self.db = dbconnector.getDBConnector(databaseUrl)
            """:class:`~cjklib.dbconnector.DatabaseConnector` instance"""

        if 'entryFactory' in options:
            self.entryFactory = options['entryFactory']
        else:
            self.entryFactory = entryfactory.Tuple()
            """Factory for formatting row entries."""
        if hasattr(self.entryFactory, 'setDictionaryInstance'):
            self.entryFactory.setDictionaryInstance(self)

        columnFormatStrategies = options.get('columnFormatStrategies', {})
        self.setColumnFormatStrategies(columnFormatStrategies)

        if 'headwordSearchStrategy' in options:
            self.headwordSearchStrategy = options['headwordSearchStrategy']
        else:
            self.headwordSearchStrategy = searchstrategy.Wildcard()
            """Strategy for searching headwords."""
        if hasattr(self.headwordSearchStrategy, 'setDictionaryInstance'):
            self.headwordSearchStrategy.setDictionaryInstance(self)

        if 'readingSearchStrategy' in options:
            self.readingSearchStrategy = options['readingSearchStrategy']
        else:
            self.readingSearchStrategy = searchstrategy.Wildcard()
            """Strategy for searching readings."""
        if hasattr(self.readingSearchStrategy, 'setDictionaryInstance'):
            self.readingSearchStrategy.setDictionaryInstance(self)

        # the mixed strategy is optional and has no default
        self.mixedReadingSearchStrategy = options.get(
            'mixedReadingSearchStrategy', None)
        """Strategy for mixed searching of headword/reading."""
        if (self.mixedReadingSearchStrategy
            and hasattr(self.mixedReadingSearchStrategy,
                'setDictionaryInstance')):
            self.mixedReadingSearchStrategy.setDictionaryInstance(self)

        if 'translationSearchStrategy' in options:
            self.translationSearchStrategy \
                = options['translationSearchStrategy']
        else:
            self.translationSearchStrategy \
                = searchstrategy.WildcardTranslation()
            """Strategy for searching translations."""
        if hasattr(self.translationSearchStrategy, 'setDictionaryInstance'):
            self.translationSearchStrategy.setDictionaryInstance(self)
 def setUp(self):
     """
     Stores the connector returned by ``dbconnector.getDBConnector()``
     (called without arguments) as ``self.db``.
     """
     self.db = dbconnector.getDBConnector()
Beispiel #15
0
    def __init__(self, **options):
        """
        Sets up the database connection and the lookup of table builders.

        The behaviour of :class:`~cjklib.build.builder.TableBuilder`
        instances can be changed by global or local options, see
        :meth:`~cjklib.build.builder.TableBuilder.getBuilderOptions`. All
        options not consumed here are kept for the table builders.

        :keyword databaseUrl: database connection setting in the format
            ``driver://user:pass@host/database``.
        :keyword dbConnectInst: instance of a
            :class:`~cjklib.dbconnector.DatabaseConnector`
        :keyword dataPath: optional list of paths to the data file(s), a
            single string is wrapped into a list
        :keyword quiet: if ``True`` no status information will be printed to
            stderr
        :keyword rebuildDepending: if ``True`` existing tables that depend on
            updated tables will be dropped and built from scratch
        :keyword rebuildExisting: if ``True`` existing tables will be
            dropped and built from scratch
        :keyword noFail: if ``True`` build process won't terminate even if one
            table fails to build
        :keyword prefer: list of :class:`~cjklib.build.builder.TableBuilder`
            names to prefer in conflicting cases
        :keyword additionalBuilders: list of externally provided TableBuilders
        :raise ValueError: if two different options from two different builder
            collide.
        :raise Exception: if a table is provided by several builders
        """
        if "dataPath" in options:
            if isinstance(options["dataPath"], basestring):
                # wrap as list
                options["dataPath"] = [options["dataPath"]]
        else:
            # look for data underneath the build module
            defaultDataPath = locateProjectFile("cjklib/data", "cjklib")
            if not defaultDataPath:
                moduleDirectory = os.path.dirname(os.path.abspath(__file__))
                defaultDataPath = os.path.join(moduleDirectory, "../data")
            options["dataPath"] = [defaultDataPath]

        # "quiet" is only read, not removed, so it stays in self.options
        self.quiet = options.get("quiet", False)
        """Controls status information printed to stderr"""
        self.rebuildDepending = options.pop("rebuildDepending", True)
        """Controls if tables that depend on updated tables will be rebuilt."""
        self.rebuildExisting = options.pop("rebuildExisting", True)
        """Controls if existing tables will be rebuilt."""
        self.noFail = options.pop("noFail", False)
        """Controls if build process terminate on failed tables."""

        # get connector to database; the URL is removed from the options even
        # when a connector instance is given
        databaseUrl = options.pop("databaseUrl", None)
        if "dbConnectInst" not in options:
            self.db = dbconnector.getDBConnector({"sqlalchemy.url": databaseUrl})
            """:class:`~cjklib.dbconnector.DatabaseConnector` instance"""
        else:
            self.db = options.pop("dbConnectInst")

        # get TableBuilder classes
        preferredBuilders = set(options.pop("prefer", []))
        additionalBuilders = options.pop("additionalBuilders", [])
        tableBuilderClasses = DatabaseBuilder.getTableBuilderClasses(
            preferredBuilders, quiet=self.quiet, additionalBuilders=additionalBuilders
        )

        # build lookup, every table may only have one builder
        self._tableBuilderLookup = {}
        for builderClass in tableBuilderClasses:
            providedTable = builderClass.PROVIDES
            if providedTable in self._tableBuilderLookup:
                raise Exception("Table '%s' provided by several builders" % builderClass.PROVIDES)
            self._tableBuilderLookup[providedTable] = builderClass

        # options for TableBuilders
        self.options = options
        """Table builder options dictionary"""
Beispiel #16
0
 def __init__(self, dbConnectInst=None, databaseUrl=None):
     """
     Stores the database connector as ``self.db``.

     A given dbConnectInst is used as is, otherwise a connector is created
     from the configuration obtained for databaseUrl.
     """
     if not dbConnectInst:
         configuration = getDatabaseConfiguration(databaseUrl)
         dbConnectInst = getDBConnector(configuration)
     self.db = dbConnectInst
def main():
    """
    Compares the radical equivalence entries read from stdin with the
    database table ``RadicalEquivalentCharacter``.

    Prints a message for every file entry missing in the database or stored
    with a locale other than ``TCJKV``, for every database entry missing in
    the file, and finally a summary of the counts.
    """
    # get cjklib database table
    db = dbconnector.getDBConnector()
    table = db.tables["RadicalEquivalentCharacter"]
    rows = db.selectRows(select([table.c.Form, table.c.EquivalentForm, table.c.Locale]))
    databaseTable = dict(((form, equivalent), locale) for form, equivalent, locale in rows)

    fileEntryCount = 0
    one2oneEntryCount = 0
    noEntryCount = 0
    narrowLocaleCount = 0
    for line in sys.stdin:
        line = line.decode(default_encoding)

        # skip comments and whitespace-only lines
        if re.match(r"\s*#", line) or re.match(r"\s+$", line):
            continue
        fileEntryCount += 1

        matchObj = re.match(r"([1234567890ABCDEF]{4});\s+([1234567890ABCDEF]{4,5})\s+#", line)
        if not matchObj:
            print(("error reading line: '" + line + "'").encode(default_encoding))
            continue

        radicalHex = matchObj.group(1)
        equivalentHex = matchObj.group(2)
        radicalForm = unichr(int(radicalHex, 16))
        equivalentForm = unichr(int(equivalentHex, 16))
        key = (radicalForm, equivalentForm)

        if key not in databaseTable:
            message = (
                "No entry for '"
                + radicalForm
                + "' ("
                + radicalHex.lower()
                + "), '"
                + equivalentForm
                + "' ("
                + equivalentHex.lower()
                + ")"
            )
            print(message.encode(default_encoding))
            noEntryCount += 1
            continue

        # entry included in database
        if databaseTable[key] != "TCJKV":
            # locale of entry is narrower, i.e. subset of TCJKV
            message = (
                "Narrowed locale for '"
                + radicalForm
                + "' ("
                + radicalHex.lower()
                + "), '"
                + equivalentForm
                + "' ("
                + equivalentHex.lower()
                + "), locale "
                + databaseTable[key]
            )
            print(message.encode(default_encoding))
            narrowLocaleCount += 1
        else:
            one2oneEntryCount += 1
        # whatever remains in the table afterwards was not part of the file
        del databaseTable[key]

    # database entries not included in table
    for (radicalForm, equivalentForm), locale in databaseTable.items():
        message = (
            "Database entry not included in table: '"
            + radicalForm
            + "' ("
            + hex(ord(radicalForm)).replace("0x", "")
            + "), '"
            + equivalentForm
            + "' ("
            + hex(ord(equivalentForm)).replace("0x", "")
            + "), locale "
            + locale
        )
        print(message.encode(default_encoding))

    print(
        "Total "
        + str(fileEntryCount)
        + " entries, "
        + str(one2oneEntryCount)
        + " fully included in database, "
        + str(noEntryCount)
        + " without entry, "
        + str(narrowLocaleCount)
        + " with narrowed locale"
    )
Beispiel #18
0
def runTests(tests, databases, registerUnicode, iteration=10):
    """
    Times the dictionary lookup methods for every dictionary found in the
    given test databases.

    :param tests: iterable of test numbers, used as keys into databases and
        registerUnicode
    :param databases: mapping of test number to SQLite database file
    :param registerUnicode: mapping of test number to the ``registerUnicode``
        connection setting
    :param iteration: number of executions handed to ``Timer.timeit()``
    :return: nested dict ``{test number: {dictionary: {method: time}}}``
    :raise ValueError: if a database provides none of the known dictionaries
    """
    # local import: replaces imp.new_module(), the imp module is deprecated
    # since Python 3.4 and no longer exists in Python 3.12
    import types

    f = ReadingFactory()

    timing = {}
    for no in tests:
        print("Running test %d (reading from %s)..." % (no, databases[no]))

        connection = {
            'sqlalchemy.url': 'sqlite:///%s' % databases[no],
            'attach': ['cjklib'],
            'registerUnicode': registerUnicode[no]
        }
        db = dbconnector.getDBConnector(connection)
        # only dictionaries whose table lives in the main schema are tested
        availableDicts = [dictClass.DICTIONARY_TABLE for dictClass
                          in dictionary.BaseDictionary\
                             .getAvailableDictionaries(db)]
        dictionaries = list(
            set(availableDicts)
            & set(db.engine.table_names(schema=db._mainSchema)))
        if not dictionaries:
            raise ValueError("No dictionaries found")

        print("Found dictionaries '%s'" % "', '".join(dictionaries))

        runTime = {}
        for dictName in dictionaries:
            dictClass = dictionary.BaseDictionary.getDictionaryClass(dictName)
            dictInstance = dictClass(dbConnectInst=db)

            # pair each request with the reading dialect options guessed by
            # the dictionary's reading operator, or with no options at all
            opClass = (dictClass.READING
                       and f.getReadingOperatorClass(dictClass.READING))
            if hasattr(opClass, 'guessReadingDialect'):
                requestList = [(request, opClass.guessReadingDialect(request))
                               for request in SEARCH_REQUESTS]
            else:
                requestList = [(request, {}) for request in SEARCH_REQUESTS]

            # timeit accesses the objects through an importable module
            mod = types.ModuleType('timeit_runmod')
            mod.runRequest = runRequest
            mod.dictInstance = dictInstance
            mod.requestList = requestList

            sys.modules['timeit_runmod'] = mod

            methodTime = {}
            for method in ('getFor', 'getForHeadword', 'getForReading',
                           'getForTranslation'):
                t = Timer(
                    """timeit_runmod.runRequest(
                                timeit_runmod.dictInstance,
                                timeit_runmod.requestList,
                                method='%s')
                          """ % method, "import timeit_runmod")
                methodTime[method] = t.timeit(iteration)
            runTime[dictName] = methodTime

        timing[no] = runTime

    return timing
"""
from sqlalchemy import select, union

from cjklib import dbconnector
from cjklib import characterlookup

characterSet = 'GB2312Set'

minimalBasicComponents = set()
"""Set of minimal basic components."""
fullyDecomposedCharacters = set()
"""
Set of characters with decomposed components completely contained in
minimalBasicComponents.
"""
db = dbconnector.getDBConnector()
decompositionTable = db.tables['CharacterDecomposition']
strokeOrderTable = db.tables['StrokeOrder']
charsetTable = db.tables[characterSet]

characterQueue = set(
    db.selectRows(
        union(
            select([
                decompositionTable.c.ChineseCharacter,
                decompositionTable.c.Glyph
            ],
                   decompositionTable.c.ChineseCharacter.in_(
                       select([charsetTable.c.ChineseCharacter])),
                   distinct=True),
            select([
Beispiel #20
0
"""
from sqlalchemy import select, union

from cjklib import dbconnector
from cjklib import characterlookup

# name of the database table that lists the characters of the character set
characterSet = 'GB2312Set'

minimalBasicComponents = set()
"""Set of minimal basic components."""
fullyDecomposedCharacters = set()
"""
Set of characters with decomposed components completely contained in
minimalBasicComponents.
"""
# connect with default settings and look up the tables used below
db = dbconnector.getDBConnector()
decompositionTable = db.tables['CharacterDecomposition']
strokeOrderTable = db.tables['StrokeOrder']
charsetTable = db.tables[characterSet]

# Start set: the distinct (ChineseCharacter, Glyph) rows of all characters of
# the character set that occur in the decomposition table or in the stroke
# order table. Building a set presumes that selectRows() yields hashable rows.
characterQueue = set(db.selectRows(union(
    # characters of the set with a decomposition entry
    select([decompositionTable.c.ChineseCharacter,
            decompositionTable.c.Glyph],
        decompositionTable.c.ChineseCharacter.in_(
            select([charsetTable.c.ChineseCharacter])),
        distinct=True),
    # characters of the set with a stroke order entry
    select([strokeOrderTable.c.ChineseCharacter,
            strokeOrderTable.c.Glyph],
        strokeOrderTable.c.ChineseCharacter.in_(
            select([charsetTable.c.ChineseCharacter])),
        distinct=True))))
    def __init__(self, **options):
        """
        Initialises the BaseDictionary instance.

        Strategies and the entry factory that offer a
        ``setDictionaryInstance()`` method get this instance registered.

        :keyword entryFactory: entry factory instance
        :keyword columnFormatStrategies: column formatting strategy instances
        :keyword headwordSearchStrategy: headword search strategy instance
        :keyword readingSearchStrategy: reading search strategy instance
        :keyword translationSearchStrategy: translation search strategy instance
        :keyword mixedReadingSearchStrategy: mixed reading search strategy
            instance
        :keyword databaseUrl: database connection setting in the format
            ``driver://user:pass@host/database``.
        :keyword dbConnectInst: instance of a :class:`~cjklib.dbconnector.DatabaseConnector`;
            ``None`` is handled like a missing keyword
        """
        # get connector to database; an explicit ``dbConnectInst=None`` falls
        # back to a new connector instead of leaving ``self.db`` set to None
        dbConnectInst = options.get('dbConnectInst')
        if dbConnectInst is not None:
            self.db = dbConnectInst
        else:
            databaseUrl = options.pop('databaseUrl', None)
            self.db = dbconnector.getDBConnector(databaseUrl)
            """:class:`~cjklib.dbconnector.DatabaseConnector` instance"""

        if 'entryFactory' in options:
            self.entryFactory = options['entryFactory']
        else:
            self.entryFactory = entryfactory.Tuple()
            """Factory for formatting row entries."""
        if hasattr(self.entryFactory, 'setDictionaryInstance'):
            self.entryFactory.setDictionaryInstance(self)

        columnFormatStrategies = options.get('columnFormatStrategies', {})
        self.setColumnFormatStrategies(columnFormatStrategies)

        if 'headwordSearchStrategy' in options:
            self.headwordSearchStrategy = options['headwordSearchStrategy']
        else:
            self.headwordSearchStrategy = searchstrategy.Wildcard()
            """Strategy for searching headwords."""
        if hasattr(self.headwordSearchStrategy, 'setDictionaryInstance'):
            self.headwordSearchStrategy.setDictionaryInstance(self)

        if 'readingSearchStrategy' in options:
            self.readingSearchStrategy = options['readingSearchStrategy']
        else:
            self.readingSearchStrategy = searchstrategy.Wildcard()
            """Strategy for searching readings."""
        if hasattr(self.readingSearchStrategy, 'setDictionaryInstance'):
            self.readingSearchStrategy.setDictionaryInstance(self)

        # the mixed strategy is optional and has no default
        self.mixedReadingSearchStrategy = options.get(
            'mixedReadingSearchStrategy', None)
        """Strategy for mixed searching of headword/reading."""
        if (self.mixedReadingSearchStrategy
            and hasattr(self.mixedReadingSearchStrategy,
                'setDictionaryInstance')):
            self.mixedReadingSearchStrategy.setDictionaryInstance(self)

        if 'translationSearchStrategy' in options:
            self.translationSearchStrategy \
                = options['translationSearchStrategy']
        else:
            self.translationSearchStrategy \
                = searchstrategy.WildcardTranslation()
            """Strategy for searching translations."""
        if hasattr(self.translationSearchStrategy, 'setDictionaryInstance'):
            self.translationSearchStrategy.setDictionaryInstance(self)
def main():
    """
    Compares a radical equivalence listing read from stdin with the database.

    All rows of the table ``RadicalEquivalentCharacter`` (columns ``Form``,
    ``EquivalentForm`` and ``Locale``) are loaded first. Then every line from
    stdin is checked against them. Lines that start with ``#`` (after optional
    whitespace) and whitespace-only lines are skipped. Every other line counts
    as an entry and has to start with a four digit upper case hexadecimal code
    point, a semicolon, whitespace, a four or five digit upper case hexadecimal
    code point, whitespace and a ``#``.

    The following is reported on stdout:

    - entries found in the database with a locale other than ``TCJKV``,
    - entries missing from the database,
    - lines that could not be parsed,
    - database rows that were not matched by any line of the input,
    - a final summary with the counters.

    Lines are only decoded with ``default_encoding`` if the input stream
    yields ``bytes``; a text stream already delivers ``str``. Messages are
    printed as text so that the output stream takes care of the encoding
    instead of showing the ``repr`` of a ``bytes`` object.
    """
    # get cjklib database table
    db = dbconnector.getDBConnector()
    table = db.tables['RadicalEquivalentCharacter']
    entries = db.selectRows(
        select([table.c.Form, table.c.EquivalentForm, table.c.Locale]))
    databaseTable = {}
    for radicalForm, equivalentForm, locale in entries:
        databaseTable[(radicalForm, equivalentForm)] = locale

    # compiled once, the pattern is the same for every line
    entryRegex = re.compile(
        r'([1234567890ABCDEF]{4});\s+([1234567890ABCDEF]{4,5})\s+#')

    fileEntryCount = 0
    one2oneEntryCount = 0
    noEntryCount = 0
    narrowLocaleCount = 0
    for line in sys.stdin:
        # a text stream yields str which has no decode(), only raw bytes
        # need to be converted
        if isinstance(line, bytes):
            line = line.decode(default_encoding)

        # skip comments and blank lines
        if re.match(r'\s*#', line) or re.match(r'\s+$', line):
            continue
        fileEntryCount += 1

        matchObj = entryRegex.match(line)
        if not matchObj:
            print("error reading line: '" + line + "'")
            continue

        radicalCode, equivalentCode = matchObj.group(1), matchObj.group(2)
        radicalForm = chr(int(radicalCode, 16))
        equivalentForm = chr(int(equivalentCode, 16))
        key = (radicalForm, equivalentForm)

        if key not in databaseTable:
            print("No entry for '%s' (%s), '%s' (%s)"
                % (radicalForm, radicalCode.lower(), equivalentForm,
                    equivalentCode.lower()))
            noEntryCount += 1
            continue

        # entry included in database, remove it so that only unmatched
        # database rows remain once the input is exhausted
        locale = databaseTable.pop(key)
        if locale != 'TCJKV':
            # locale of entry is narrower, i.e. subset of TCJKV
            print("Narrowed locale for '%s' (%s), '%s' (%s), locale %s"
                % (radicalForm, radicalCode.lower(), equivalentForm,
                    equivalentCode.lower(), locale))
            narrowLocaleCount += 1
        else:
            one2oneEntryCount += 1

    # database entries not included in table
    for (radicalForm, equivalentForm), locale in databaseTable.items():
        print("Database entry not included in table: "
            "'%s' (%s), '%s' (%s), locale %s"
            % (radicalForm, format(ord(radicalForm), 'x'), equivalentForm,
                format(ord(equivalentForm), 'x'), locale))

    print("Total " + str(fileEntryCount) + " entries, "
        + str(one2oneEntryCount) + " fully included in database, "
        + str(noEntryCount) + " without entry, "
        + str(narrowLocaleCount) + " with narrowed locale")
Beispiel #23
0
    def __init__(self, **options):
        """
        Sets up the builder from the given keyword options.

        The data path is resolved, the global build switches are read, a
        database connection is obtained and the available
        :class:`~cjklib.build.builder.TableBuilder` classes are registered by
        the table they provide. Options not consumed here are kept and can be
        used to modify the behaviour of the table builders, see
        :meth:`~cjklib.build.builder.TableBuilder.getBuilderOptions`.

        :keyword databaseUrl: database connection setting in the format
            ``driver://user:pass@host/database``.
        :keyword dbConnectInst: instance of a
            :class:`~cjklib.dbconnector.DatabaseConnector`, takes precedence
            over ``databaseUrl``
        :keyword dataPath: optional list of paths to the data file(s), a
            single string is wrapped into a list
        :keyword quiet: if ``True`` no status information will be printed to
            stderr
        :keyword rebuildDepending: if ``True`` existing tables that depend on
            updated tables will be dropped and built from scratch
        :keyword rebuildExisting: if ``True`` existing tables will be
            dropped and built from scratch
        :keyword noFail: if ``True`` build process won't terminate even if one
            table fails to build
        :keyword prefer: list of :class:`~cjklib.build.builder.TableBuilder`
            names to prefer in conflicting cases
        :keyword additionalBuilders: list of externally provided TableBuilders
        :raise ValueError: if two different options from two different builder
            collide.
        :raise Exception: if one table is provided by several builders
        """
        if 'dataPath' in options:
            # a single path may be given as plain string, normalise to a list
            if isinstance(options['dataPath'], str):
                options['dataPath'] = [options['dataPath']]
        else:
            # nothing given, fall back to the data shipped with the project
            # or, failing that, the directory next to the build module
            defaultDataPath = (locateProjectFile('cjklib/data', 'cjklib')
                or os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), '../data'))
            options['dataPath'] = [defaultDataPath]

        # 'quiet' is only read, not removed, it stays part of the options
        self.quiet = options.get('quiet', False)
        """If set no status information is printed to stderr"""
        self.rebuildDepending = options.pop('rebuildDepending', True)
        """If set tables depending on updated tables are rebuilt"""
        self.rebuildExisting = options.pop('rebuildExisting', True)
        """If set tables that already exist are rebuilt"""
        self.noFail = options.pop('noFail', False)
        """If set a failing table does not stop the build process"""

        # connect to the database, a given connector instance wins over the url
        databaseUrl = options.pop('databaseUrl', None)
        if 'dbConnectInst' not in options:
            self.db = dbconnector.getDBConnector(
                {'sqlalchemy.url': databaseUrl})
        else:
            self.db = options.pop('dbConnectInst')

        # collect the TableBuilder classes to work with
        preferredBuilders = set(options.pop('prefer', []))
        builderClasses = DatabaseBuilder.getTableBuilderClasses(
            preferredBuilders, quiet=self.quiet,
            additionalBuilders=options.pop('additionalBuilders', []))

        # map each provided table to its builder, duplicates are an error
        lookup = self._tableBuilderLookup = {}
        for builderClass in builderClasses:
            if builderClass.PROVIDES in lookup:
                raise Exception("Table '%s' provided by several builders" \
                    % builderClass.PROVIDES)
            lookup[builderClass.PROVIDES] = builderClass

        # whatever is left is meant for the TableBuilders
        self.options = options
        """Dictionary of options passed on to the table builders"""