Ejemplo n.º 1
0
    def removeVersion(self):
        if not self.db.mainHasTable('UpdateVersion'):
            return

        table = self.db.tables['UpdateVersion']
        try:
            self.db.execute(table.delete().where(
                table.c.TableName == self.PROVIDES))
        except sqlalchemy.exceptions.IntegrityError, e:
            warn(unicode(e))
            #warn(unicode(insertStatement))
            raise
Ejemplo n.º 2
0
    def insertVersion(self, date):
        if not self.db.mainHasTable('UpdateVersion'):
            return

        self.removeVersion()

        table = self.db.tables['UpdateVersion']
        try:
            self.db.execute(table.insert().values(TableName=self.PROVIDES,
                ReleaseDate=date))
        except sqlalchemy.exceptions.IntegrityError, e:
            warn(unicode(e))
            #warn(unicode(insertStatement))
            raise
Ejemplo n.º 3
0
    def build(self):
        import codecs
        import os.path as path
        filePath = self.findFile(self.FILE_NAMES, "HSK table")
        fileHandle = codecs.open(filePath, 'r', 'utf8')

        # get create statement
        table = self.buildTableObject(self.PROVIDES,
            ['HeadwordTraditional', 'HeadwordSimplified', 'Level'],
            {'HeadwordTraditional': String(255),
                'HeadwordSimplified': String(255), 'Level': Integer()},
            ['HeadwordTraditional'])
        table.create()

        # read entries, check for double entries
        traditionalHeadwordLevelDict = {} # headword to level mapping
        tradSimpHeadwordDict = {} # traditional to simplified
        doubleEntryCount = 0 # double/tripple entries
        for headwordTrad, headwordSimp, level \
            in UnicodeCSVFileIterator(fileHandle):
            # skip headwords already seen
            if headwordTrad in traditionalHeadwordLevelDict:
                doubleEntryCount = doubleEntryCount + 1
                if level \
                    < self.LEVELS[traditionalHeadwordLevelDict[headwordTrad]]:
                    traditionalHeadwordLevelDict[headwordTrad] = level
                    tradSimpHeadwordDict[headwordTrad] = headwordSimp
            else:
                traditionalHeadwordLevelDict[headwordTrad] = level
                tradSimpHeadwordDict[headwordTrad] = headwordSimp

        # write table content
        for headwordTrad in traditionalHeadwordLevelDict:
            headwordSimp = tradSimpHeadwordDict[headwordTrad]
            level = self.LEVELS[traditionalHeadwordLevelDict[headwordTrad]]
            try:
                self.db.execute(table.insert().values(
                    HeadwordTraditional=headwordTrad,
                    HeadwordSimplified=headwordSimp, Level=level))
            except sqlalchemy.exceptions.IntegrityError, e:
                warn(unicode(e))
                #warn(unicode(insertStatement))
                raise
Ejemplo n.º 4
0
        def generator(self):
            """
            Provides a pronunciation and a path to the audio file.

            The package directory is the first existing
            '<data path>/<self.baseFolderName>' among self.dataPath. Its
            'index.xml.bz2' is parsed with a SAX parser and the
            (pronunciation, file path) pairs collected in the handler's file
            list are yielded as (pronunciation, path relative to the data
            path). Only the first occurrence of a pronunciation is yielded;
            unless self.quiet is set, the number of omitted ones is reported
            through warn().

            Raises IOError if no package directory exists or the index file
            cannot be opened.
            """
            for path in self.dataPath:
                packagePath = os.path.join(os.path.expanduser(path),
                    self.baseFolderName)
                if os.path.exists(packagePath):
                    break
            else:
                raise IOError("No package found for '" + self.baseFolderName \
                    + "' under path(s)'" + "', '".join(self.dataPath) + "'")

            try:
                xmlFile = bz2.BZ2File(
                    os.path.join(packagePath, 'index.xml.bz2'))
            except IOError:
                raise IOError("Index file 'index.xml.bz2' not found under '" \
                    + packagePath + "'")

            fileList = []
            indexHandler = SwacAudioCollectionBuilder.SwacXMLIndexHandler(
                fileList)

            saxparser = xml.sax.make_parser()
            saxparser.setContentHandler(indexHandler)
            # don't check DTD as this raises an exception
            saxparser.setFeature(xml.sax.handler.feature_external_ges, False)
            try:
                saxparser.parse(xmlFile)
            finally:
                # the index is read completely before anything is yielded, so
                # the compressed file can be released right away
                xmlFile.close()

            seenPronunciations = set()
            doubletteCount = 0
            for pronunciation, audioPath in fileList:
                if pronunciation in seenPronunciations:
                    doubletteCount += 1
                    continue

                seenPronunciations.add(pronunciation)
                yield(pronunciation,
                    os.path.join(self.baseFolderName, audioPath))

            if not self.quiet and doubletteCount:
                warn("Found " + str(doubletteCount) \
                    + " similar pronunciations, omitted")
Ejemplo n.º 5
0
        def generator(self):
            """
            Provides a pronunciation and a path to the audio file.

            The package directory is the first existing
            '<data path>/<self.baseFolderName>' among self.dataPath. For every
            extension in self.fileExtensions the files matching
            '*<extension>' in that directory are visited, and
            self.mappingFunc(<file base name>) supplies the pronunciation.
            Yields (pronunciation, path relative to the data path) for the
            first file of each pronunciation. Files without a pronunciation
            and the number of omitted duplicates are reported through warn()
            unless self.quiet is set.

            Raises IOError if no package directory exists or reading it fails.
            """
            for path in self.dataPath:
                packagePath = os.path.join(os.path.expanduser(path),
                    self.baseFolderName)
                if os.path.exists(packagePath):
                    break
            else:
                raise IOError("No package found for '" + self.baseFolderName \
                    + "' under path(s)'" + "', '".join(self.dataPath) + "'")

            seenPronunciations = set()
            doubletteCount = 0

            import glob
            try:
                for extension in self.fileExtensions:
                    # The directory name must stay untouched by the inner
                    # loop, otherwise later extensions are searched below a
                    # file path and never match.
                    pattern = os.path.join(packagePath, '*' + extension)
                    for audioFile in glob.iglob(pattern):
                        baseName = os.path.basename(audioFile)
                        pronunciation = self.mappingFunc(baseName)

                        relativePath = os.path.join(self.baseFolderName,
                            baseName)
                        if not pronunciation:
                            if not self.quiet:
                                warn("No reading gathered from '" \
                                    + str(relativePath) + "',  ommitting")
                            continue

                        if pronunciation in seenPronunciations:
                            doubletteCount += 1
                        else:
                            seenPronunciations.add(pronunciation)
                            yield(pronunciation, relativePath)

            except IOError:
                raise IOError("Error reading directory '" + packagePath + "'")

            if not self.quiet and doubletteCount:
                warn("Found " + str(doubletteCount) \
                    + " similar pronunciations, omitted")
Ejemplo n.º 6
0
    def getGenerator(self):
        """
        Yields one entry dict per radical index from 1 to 214 that has at
        least one meaning.

        Readings and comma separated meanings are read from the CSV file
        located with self.findFile([self.TABLE_CSV_FILE_MAPPING], ...). For
        every index the Unicode character name of U+2F00 + (index - 1), with
        the 'KANGXI RADICAL ' prefix removed and lower-cased, is appended to
        the meanings unless it is already listed. Each yielded dict has the
        keys 'RadicalIndex', 'Reading' and 'Meaning'; 'Reading' is None for
        indices missing from the CSV file.
        """
        contentFile = self.findFile([self.TABLE_CSV_FILE_MAPPING], "table")

        # write table content
        if not self.quiet:
            warn("Reading table '" + self.PROVIDES + "' from file '" \
                + contentFile + "'")
        import codecs
        import unicodedata

        radicalEntries = {}
        fileHandle = codecs.open(contentFile, 'r', 'utf-8')
        try:
            for line in UnicodeCSVFileIterator(fileHandle):
                # skip lines consisting of a single blank field
                if len(line) == 1 and not line[0].strip():
                    continue
                radicalIdx, reading, meaning = line
                radicalEntries[int(radicalIdx)] = (reading, meaning.split(','))
        finally:
            # all rows are read before the first entry is yielded, so the
            # file can be released here
            fileHandle.close()

        # add unicode ones
        for radicalIdx in range(1, 215):
            reading, csvMeanings = radicalEntries.get(radicalIdx, (None, []))
            # work on a copy so the lookup table is left unchanged
            meanings = list(csvMeanings)
            try:
                name = unicodedata.name(unichr(0x2f00 + radicalIdx - 1))
                name = name.replace('KANGXI RADICAL ', '').lower()
                if name not in meanings:
                    meanings.append(name)
            except ValueError:
                # character has no name in this Unicode database
                pass

            if meanings:
                yield({'RadicalIndex': radicalIdx, 'Reading': reading,
                    'Meaning': ', '.join(meanings)})
Ejemplo n.º 7
0
    def build(self):
        import codecs
        import os.path as path
        filePath = self.findFile(self.FILE_NAMES, "HSK table")
        fileHandle = codecs.open(filePath, 'r', 'utf8')

        # get create statement
        table = self.buildTableObject(self.PROVIDES, ['Headword', 'Level'],
            {'Headword': String(255), 'Level': Integer()}, ['Headword'])
        table.create()

        # write table content
        currentLevel = None     # current level 1-4
        seenHeadwords = set()   # already saved headwords
        doubleEntryCount = 0    # double/tripple entries
        multiEntryCount = 0     # lines with mutiple headwords
        for line in UnicodeCSVFileIterator(fileHandle):
            # check for level boundary
            if line[0] == '':
                if line[1][0] in self.LEVELS.keys():
                    currentLevel = self.LEVELS[line[1][0]]
                elif not self.quiet and not re.match(r'^[a-zA-Z]$', line[1]):
                    # skip categories A, B, ... but warn on other content
                    warn("Errorneous line: '" + "', '".join(line) + "'")
                continue

            if currentLevel == None and not self.quiet:
                warn("No level information found, skipping line: '" \
                    + "', '".join(line) + "'")
                continue

            # create entry, take care of mutiple entries in one line:
            headwords = line[1].split('/')
            # if includes terms in brackets split entry into two
            if line[1].find(u'(') >= 0:
                newHeadwords = []
                for headword in headwords:
                    if headword.find(u'(') >= 0:
                        # one with all words
                        newHeadwords.append(re.sub(u'[\(\)]', headword, ''))
                        # one without chars in brackets
                        newHeadwords.append(re.sub(u'\([\)]*\)', headword, ''))
                    else:
                        newHeadwords.append(headword)
                headwords = newHeadwords

            if len(headwords) > 1:
                multiEntryCount = multiEntryCount + 1
            for headword in headwords:
                # skip headwords already seen
                if headword in seenHeadwords:
                    doubleEntryCount = doubleEntryCount + 1
                    continue
                else:
                    seenHeadwords.add(headword)

                entry = {'Headword': headword, 'Level': currentLevel}

                try:
                    self.db.execute(table.insert().values(**entry))
                except sqlalchemy.exceptions.IntegrityError, e:
                    warn(unicode(e))
                    #warn(unicode(insertStatement))
                    raise