def removeVersion(self):
    """
    Deletes the rows of the C{UpdateVersion} table whose C{TableName}
    equals C{self.PROVIDES}.

    Does nothing if the main database has no C{UpdateVersion} table. An
    integrity error reported while executing the statement is passed to
    C{warn()} and then re-raised.
    """
    if self.db.mainHasTable('UpdateVersion'):
        versionTable = self.db.tables['UpdateVersion']
        deleteStatement = versionTable.delete().where(
            versionTable.c.TableName == self.PROVIDES)
        try:
            self.db.execute(deleteStatement)
        except sqlalchemy.exceptions.IntegrityError as error:
            # report the problem, but let the caller decide how to proceed
            warn(unicode(error))
            raise
def insertVersion(self, date):
    """
    Stores C{date} as C{ReleaseDate} for C{self.PROVIDES} in the
    C{UpdateVersion} table.

    Does nothing if the main database has no C{UpdateVersion} table. Any
    entry already present is removed first through L{removeVersion()}. An
    integrity error reported while executing the insert is passed to
    C{warn()} and then re-raised.

    @param date: value written to the C{ReleaseDate} column
    """
    if not self.db.mainHasTable('UpdateVersion'):
        return

    # make sure there is no stale entry left for this table
    self.removeVersion()

    versionTable = self.db.tables['UpdateVersion']
    insertStatement = versionTable.insert().values(
        TableName=self.PROVIDES, ReleaseDate=date)
    try:
        self.db.execute(insertStatement)
    except sqlalchemy.exceptions.IntegrityError as error:
        warn(unicode(error))
        raise
def build(self):
    """
    Creates the table C{self.PROVIDES} with the columns
    C{HeadwordTraditional}, C{HeadwordSimplified} and C{Level} and fills
    it from the CSV file found through C{self.FILE_NAMES}.

    Each CSV row is expected to hold three fields: traditional headword,
    simplified headword and a level label. Labels are translated to the
    stored value through C{self.LEVELS}. If a traditional headword occurs
    more than once, only the occurrence with the lowest translated level
    is kept.

    @raise KeyError: if a level label is not a key of C{self.LEVELS}
    @raise sqlalchemy.exceptions.IntegrityError: re-raised after being
        reported with C{warn()} if an insert fails
    """
    import codecs

    filePath = self.findFile(self.FILE_NAMES, "HSK table")
    fileHandle = codecs.open(filePath, 'r', 'utf8')
    try:
        # get create statement
        table = self.buildTableObject(self.PROVIDES,
            ['HeadwordTraditional', 'HeadwordSimplified', 'Level'],
            {'HeadwordTraditional': String(255),
                'HeadwordSimplified': String(255),
                'Level': Integer()},
            ['HeadwordTraditional'])
        table.create()

        # read entries, check for double entries
        traditionalHeadwordLevelDict = {} # headword to level label mapping
        tradSimpHeadwordDict = {} # traditional to simplified
        for headwordTrad, headwordSimp, level \
            in UnicodeCSVFileIterator(fileHandle):
            if headwordTrad in traditionalHeadwordLevelDict:
                # The dictionary stores level labels, so both sides have
                # to be translated through LEVELS before comparing them.
                knownLevel \
                    = self.LEVELS[traditionalHeadwordLevelDict[headwordTrad]]
                if self.LEVELS[level] >= knownLevel:
                    # headword already seen with an equal or lower level
                    continue

            traditionalHeadwordLevelDict[headwordTrad] = level
            tradSimpHeadwordDict[headwordTrad] = headwordSimp
    finally:
        # everything needed is in memory now, release the file
        fileHandle.close()

    # write table content
    for headwordTrad in traditionalHeadwordLevelDict:
        headwordSimp = tradSimpHeadwordDict[headwordTrad]
        level = self.LEVELS[traditionalHeadwordLevelDict[headwordTrad]]
        try:
            self.db.execute(table.insert().values(
                HeadwordTraditional=headwordTrad,
                HeadwordSimplified=headwordSimp, Level=level))
        except sqlalchemy.exceptions.IntegrityError as e:
            warn(unicode(e))
            raise
def generator(self):
    """
    Provides a pronunciation and a path to the audio file.

    The package folder C{self.baseFolderName} is searched under each
    entry of C{self.dataPath} (with C{~} expanded); the first existing
    one is used. Its C{index.xml.bz2} is parsed with
    C{SwacAudioCollectionBuilder.SwacXMLIndexHandler}, which fills a list
    of C{(pronunciation, file path)} pairs. For every pronunciation only
    the first pair is yielded; the number of omitted ones is reported
    with C{warn()} unless C{self.quiet} is set.

    As this is a generator, errors surface on first iteration.

    @return: generator of tuples C{(pronunciation, relativePath)}, where
        C{relativePath} is the file path joined onto
        C{self.baseFolderName}
    @raise IOError: if no package folder or no index file is found
    """
    for path in self.dataPath:
        packagePath = os.path.join(os.path.expanduser(path),
            self.baseFolderName)
        if os.path.exists(packagePath):
            break
    else:
        raise IOError("No package found for '" + self.baseFolderName \
            + "' under path(s)'" + "', '".join(self.dataPath) + "'")

    try:
        xmlFile = bz2.BZ2File(os.path.join(packagePath, 'index.xml.bz2'))
    except IOError:
        raise IOError("Index file 'index.xml.bz2' not found under '" \
            + packagePath + "'")

    fileList = []
    try:
        indexHandler = SwacAudioCollectionBuilder.SwacXMLIndexHandler(
            fileList)

        saxparser = xml.sax.make_parser()
        saxparser.setContentHandler(indexHandler)
        # don't check DTD as this raises an exception
        saxparser.setFeature(xml.sax.handler.feature_external_ges, False)
        saxparser.parse(xmlFile)
    finally:
        # the index is completely consumed by parse(), so the handle can
        # be released before the first entry is yielded
        xmlFile.close()

    seenPronunciations = set()
    doubletteCount = 0
    for pronunciation, audioFile in fileList:
        relativePath = os.path.join(self.baseFolderName, audioFile)
        if pronunciation not in seenPronunciations:
            yield(pronunciation, relativePath)
        else:
            doubletteCount += 1

        seenPronunciations.add(pronunciation)

    if not self.quiet and doubletteCount:
        warn("Found " + str(doubletteCount) \
            + " similar pronunciations, omitted")
def generator(self):
    """
    Provides a pronunciation and a path to the audio file.

    The package folder C{self.baseFolderName} is searched under each
    entry of C{self.dataPath} (with C{~} expanded); the first existing
    one is used. For every extension in C{self.fileExtensions} the files
    C{*<extension>} of that folder are collected and their base name is
    passed to C{self.mappingFunc} to obtain the pronunciation. Files for
    which the mapping returns a false value are skipped with a warning;
    for every pronunciation only the first file is yielded. Warnings are
    suppressed if C{self.quiet} is set.

    As this is a generator, errors surface on first iteration.

    @return: generator of tuples C{(pronunciation, relativePath)}, where
        C{relativePath} is the base name joined onto
        C{self.baseFolderName}
    @raise IOError: if no package folder is found or reading fails
    """
    import glob

    for path in self.dataPath:
        packagePath = os.path.join(os.path.expanduser(path),
            self.baseFolderName)
        if os.path.exists(packagePath):
            break
    else:
        raise IOError("No package found for '" + self.baseFolderName \
            + "' under path(s)'" + "', '".join(self.dataPath) + "'")

    seenPronunciations = set()
    doubletteCount = 0

    try:
        for extension in self.fileExtensions:
            # The directory name must not be reused as loop variable
            # below, otherwise every extension after the first one would
            # be searched for underneath a file instead of the package.
            pattern = os.path.join(packagePath, '*' + extension)
            for audioFile in glob.iglob(pattern):
                baseName = os.path.basename(audioFile)
                pronunciation = self.mappingFunc(baseName)
                relativePath = os.path.join(self.baseFolderName,
                    baseName)
                if pronunciation:
                    if pronunciation not in seenPronunciations:
                        yield(pronunciation, relativePath)
                    else:
                        doubletteCount += 1

                    seenPronunciations.add(pronunciation)
                elif not self.quiet:
                    warn("No reading gathered from '" \
                        + str(relativePath) + "', ommitting")
    except IOError:
        raise IOError("Error reading directory '" + packagePath + "'")

    if not self.quiet and doubletteCount:
        warn("Found " + str(doubletteCount) \
            + " similar pronunciations, omitted")
def getGenerator(self):
    """
    Generates the table entries for the radical indices 1 to 214.

    Reading and meanings are taken from the CSV file found through
    C{self.TABLE_CSV_FILE_MAPPING}, whose rows hold radical index,
    reading and a comma separated list of meanings; rows consisting of a
    single blank field are ignored. In addition the lower-cased Unicode
    character name (without the prefix C{KANGXI RADICAL}) of the code
    point C{U+2F00 + index - 1} is appended to the meanings if not
    already included. Indices without any meaning yield no entry.

    @return: generator of dictionaries with the keys C{RadicalIndex},
        C{Reading} (C{None} if the file has no row for the index) and
        C{Meaning} (meanings joined with C{', '})
    """
    contentFile = self.findFile([self.TABLE_CSV_FILE_MAPPING], "table")

    # write table content
    if not self.quiet:
        warn("Reading table '" + self.PROVIDES + "' from file '" \
            + contentFile + "'")
    import codecs
    import unicodedata

    radicalEntries = {}
    fileHandle = codecs.open(contentFile, 'r', 'utf-8')
    try:
        for line in UnicodeCSVFileIterator(fileHandle):
            if len(line) == 1 and not line[0].strip():
                continue

            radicalIdx, reading, meaning = line
            radicalEntries[int(radicalIdx)] = (reading, meaning.split(','))
    finally:
        # the file is read completely before the first entry is yielded,
        # so it can be closed here
        fileHandle.close()

    # add unicode ones
    for radicalIdx in range(1, 215):
        reading, meanings = radicalEntries.get(radicalIdx, (None, []))

        try:
            name = unicodedata.name(unichr(0x2f00 + radicalIdx - 1))
            name = name.replace('KANGXI RADICAL ', '').lower()
            if name not in meanings:
                meanings += [name]
        except ValueError:
            # no character name available for this code point
            pass

        if meanings:
            yield({'RadicalIndex': radicalIdx, 'Reading': reading,
                'Meaning': ', '.join(meanings)})
def build(self):
    """
    Creates the table C{self.PROVIDES} with the columns C{Headword} and
    C{Level} and fills it from the CSV file found through
    C{self.FILE_NAMES}.

    Lines with an empty first field are not entries. If the first
    character of their second field is a key of C{self.LEVELS}, the
    mapped value becomes the level of all following entries. Otherwise
    they are ignored; a warning is given unless the second field is a
    single latin letter.

    For all other lines the second field is split at C{'/'} into several
    headwords. A headword containing brackets results in two headwords:
    one with only the brackets removed and one with the bracketed part
    removed. Headwords already stored are skipped. Lines found before
    the first level boundary are skipped. Warnings are suppressed if
    C{self.quiet} is set.

    @raise sqlalchemy.exceptions.IntegrityError: re-raised after being
        reported with C{warn()} if an insert fails
    """
    import codecs

    filePath = self.findFile(self.FILE_NAMES, "HSK table")
    fileHandle = codecs.open(filePath, 'r', 'utf8')
    try:
        # get create statement
        table = self.buildTableObject(self.PROVIDES, ['Headword', 'Level'],
            {'Headword': String(255), 'Level': Integer()}, ['Headword'])
        table.create()

        # write table content
        currentLevel = None # current level 1-4
        seenHeadwords = set() # already saved headwords
        for line in UnicodeCSVFileIterator(fileHandle):
            # check for level boundary
            if line[0] == '':
                # slicing copes with an empty second field
                levelKey = line[1][:1]
                if levelKey in self.LEVELS:
                    currentLevel = self.LEVELS[levelKey]
                elif not self.quiet \
                    and not re.match(r'^[a-zA-Z]$', line[1]):
                    # skip categories A, B, ... but warn on other content
                    warn("Errorneous line: '" + "', '".join(line) + "'")
                continue

            if currentLevel is None:
                # skip the line no matter if a warning is wanted or not,
                # an entry without level must never be stored
                if not self.quiet:
                    warn("No level information found, skipping line: '" \
                        + "', '".join(line) + "'")
                continue

            # create entry, take care of multiple entries in one line
            headwords = []
            for headword in line[1].split('/'):
                if u'(' in headword:
                    # one with all words
                    headwords.append(re.sub(u'[()]', u'', headword))
                    # one without chars in brackets, if anything remains
                    shortForm = re.sub(u'\\([^)]*\\)', u'', headword)
                    if shortForm:
                        headwords.append(shortForm)
                else:
                    headwords.append(headword)

            for headword in headwords:
                # skip headwords already seen
                if headword in seenHeadwords:
                    continue
                seenHeadwords.add(headword)

                try:
                    self.db.execute(table.insert().values(
                        Headword=headword, Level=currentLevel))
                except sqlalchemy.exceptions.IntegrityError as e:
                    warn(unicode(e))
                    raise
    finally:
        fileHandle.close()