def testDid(self):
    """Check the entries parsed from ``did.txt``, in order.

    Each expected row is handed to ``assertEntry`` together with the next
    parsed entry; afterwards the iterator must be exhausted.  Any exception
    raised while parsing the fixture propagates to the test runner.
    """
    page = self.parse("did.txt")
    entryIter = List(page.getEntries()).iterator()
    # (language, part of speech, third assertEntry argument) per entry.
    expectedEntries = (
        (Language.findByName("Translingual"), PartOfSpeech.NUMBER, 1),
        (Language.ENGLISH, PartOfSpeech.VERB, 1),
        (Language.findByName("Old Welsh"), PartOfSpeech.NOUN, 1),
    )
    for language, partOfSpeech, count in expectedEntries:
        self.assertEntry(language, partOfSpeech, count, entryIter.next())
    # No further entries may follow the expected ones.
    self.assertFalse(entryIter.hasNext())
def testAbele(self):
    """Check the entries parsed from ``abele.txt``, in order.

    Each expected row is handed to ``assertEntry`` together with the next
    parsed entry; afterwards the iterator must be exhausted.  Any exception
    raised while parsing the fixture propagates to the test runner.
    """
    page = self.parse("abele.txt")
    entryIter = List(page.getEntries()).iterator()
    # (language, part of speech, third assertEntry argument) per entry.
    expectedEntries = (
        (Language.ENGLISH, PartOfSpeech.NOUN, 1),
        (Language.findByName("Novial"), PartOfSpeech.NOUN, 1),
    )
    for language, partOfSpeech, count in expectedEntries:
        self.assertEntry(language, partOfSpeech, count, entryIter.next())
    # No further entries may follow the expected ones.
    self.assertFalse(entryIter.hasNext())
def connect(self, isReadOnly, allowCreateNew, overwriteExisting, cacheSize):
    """Open the database environment and entity store, then load
    the edition properties and the page/entry/sense indexes.

    Args:
        isReadOnly: forwarded to ``setReadOnly`` on both the environment
            and the store configuration.
        allowCreateNew: forwarded to ``setAllowCreate`` on both
            configurations.
        overwriteExisting: accepted for interface compatibility; this
            method does not read it.
        cacheSize: forwarded to ``setCacheSize`` on the environment
            configuration; skipped when ``None``.

    Raises:
        DatabaseException: if the property file exists but cannot be read.
    """
    # Configure DB environment.
    envConfig = EnvironmentConfig()
    envConfig.setAllowCreate(allowCreateNew)
    envConfig.setReadOnly(isReadOnly)
    envConfig.setTransactional(False)
    if cacheSize is not None:
        envConfig.setCacheSize(cacheSize)
    self.env = Environment(self.dbPath, envConfig)

    # Configure store.
    storeConfig = StoreConfig()
    storeConfig.setAllowCreate(allowCreateNew)
    storeConfig.setTransactional(False)
    storeConfig.setReadOnly(isReadOnly)
    self.store = EntityStore(self.env, DBWiktionaryEdition.DATABASE_NAME,
                             storeConfig)

    # Load properties.
    self.properties = Properties()
    propFile = File(self.dbPath.filepath,
                    DBWiktionaryEdition.PROPERTY_FILE_NAME)
    if propFile.exists():
        try:
            reader = FileReader(propFile)
            try:
                self.properties.load(reader)
            finally:
                # Release the file handle even when loading fails.
                # NOTE(review): assumes FileReader exposes close() like
                # its java.io counterpart -- confirm.
                reader.close()
        except IOException as e:
            raise DatabaseException("Unable to load property file", e) from e

        # NOTE(review): the language lookup is taken to be nested under
        # the "property file exists" branch -- confirm against the
        # original layout.
        lang = self.properties.getProperty("wiktionary.language")
        if lang is None:
            # Fall back to the alternative property key.
            lang = self.properties.getProperty("entry_language")
        self.language = Language.get(lang)

    # Load index.
    self.pageById = self.store.getPrimaryIndex(
        Long.__class__, WiktionaryPage.__class__)
    self.pageByTitle = self.store.getSecondaryIndex(
        self.pageById, String.__class__, "title")
    self.pageByNormalizedTitle = self.store.getSecondaryIndex(
        self.pageById, String.__class__, "normalizedTitle")

    self.entryByKey = self.store.getPrimaryIndex(
        String.__class__, DBWiktionaryEdition.WiktionaryEntryProxy.__class__)
    self.entryById = self.store.getSecondaryIndex(
        self.entryByKey, Long.__class__, "entryId")

    self.senseByKey = self.store.getPrimaryIndex(
        String.__class__, DBWiktionaryEdition.WiktionarySenseProxy.__class__)

    # Starts empty; nothing in this method adds to it.
    self.openCursors = set()
def testMay(self):
    """Check the entries parsed from ``may.txt``, in order.

    Each expected row is handed to ``assertEntry`` together with the next
    parsed entry; afterwards the iterator must be exhausted.  Any exception
    raised while parsing the fixture propagates to the test runner.
    """
    page = self.parse("may.txt")
    entryIter = List(page.getEntries()).iterator()
    # (language, part of speech, third assertEntry argument) per entry.
    expectedEntries = (
        (Language.ENGLISH, PartOfSpeech.VERB, 4),
        (Language.ENGLISH, PartOfSpeech.NOUN, 1),
        (Language.ENGLISH, PartOfSpeech.VERB, 1),
        (Language.findByName("Crimean Tatar"), PartOfSpeech.NOUN, 1),
        (Language.findByName("Kurdish"), PartOfSpeech.NOUN, 1),
        (Language.findByName("Mapudungun"), PartOfSpeech.ADVERB, 1),
        (Language.findByName("Tagalog"), PartOfSpeech.VERB, 1),
        (Language.findByName("Tatar"), PartOfSpeech.NOUN, 1),
    )
    for language, partOfSpeech, count in expectedEntries:
        self.assertEntry(language, partOfSpeech, count, entryIter.next())
    # No further entries may follow the expected ones.
    self.assertFalse(entryIter.hasNext())
def testBass(self): # throws Exception page = self.parse("bass.txt") entryIter = List(page.getEntries()).iterator() self.assertEntry(Language.ENGLISH, PartOfSpeech.ADJECTIVE, 1, entryIter.next()) self.assertEntry(Language.ENGLISH, PartOfSpeech.NOUN, 5, entryIter.next()) self.assertEntry(Language.ENGLISH, PartOfSpeech.NOUN, 1, entryIter.next()) self.assertEntry(Language.findByName("Romansch"), PartOfSpeech.ADJECTIVE, 1, entryIter.next()) self.assertFalse(entryIter.hasNext())
def processBody(self, text, context):
    """Process one line of a translation block.

    Separator and header templates update ``self.currentSense`` where
    applicable; any other line is matched against
    ``ENTranslationHandler.LANGUAGE`` and its translations are appended to
    ``self.sensNum2trans`` under the current sense.

    Args:
        text: the line to process; surrounding whitespace is ignored.
        context: not read by this method.

    Returns:
        ``True`` when the line was consumed as part of the block,
        ``False`` when it ends the block or cannot be handled.
    """
    text = text.strip()

    # Column separators inside the translation table carry no data.
    if text.startswith(("{{trans-mid}}", "{{mid}}")):
        return True

    # Table header carrying a template parameter: use it as the sense.
    if text.startswith("{{trans-top|") and "}}" in text:
        template = TemplateParser.parseTemplate(text[2:text.find("}}")])
        if template is not None and template.getNumberedParamsCount() >= 1:
            self.currentSense = template.getNumberedParam(0)
        return True

    # Table header without parameters: reset the sense.
    if text.startswith("{{top}}"):
        self.currentSense = ""
        return True

    # The closing templates mark the end of the translation block; any
    # other template or a heading means a new block has just started.
    if text.startswith(("{{trans-bottom}}", "{{bottom}}", "{{", "==")):
        return False

    match = re.search(ENTranslationHandler.LANGUAGE, text)
    if match is None:
        return False

    languageName = WikiString.removeWikiLinks(match.group(1).strip())
    language = Language.findByName(languageName)

    # Nothing follows the language prefix, so there is nothing to parse.
    restStart = match.end()
    if restStart >= len(text):
        return False

    for part in self.splitTranslationParts(text[restStart:]):
        translation = self.parseTranslation(language, part)
        if translation is None:
            continue
        # Save the translation under the current sense.
        self.sensNum2trans.setdefault(self.currentSense,
                                      list()).append(translation)
    return True
def parseTemplate(self, templateString):
    """Build a ``WiktionaryTranslation`` from a translation template.

    Args:
        templateString: the template text; its first two and last two
            characters are dropped before parsing.

    Returns:
        The translation, or ``None`` when the template cannot be parsed,
        has fewer than two numbered parameters, or yields an empty
        translation text.
    """
    template = TemplateParser.parseTemplate(templateString[2:-2])
    if template is None:
        return None
    if template.getNumberedParamsCount() <= 1:
        return None

    # Second numbered parameter: the translation text.
    rawText = template.getNumberedParam(1)
    translationText = StringUtils.cleanText(
        WikiString.removeWikiLinks(rawText))
    if not translationText:
        return None

    # First numbered parameter: the language code.
    languageCode = template.getNumberedParam(0)
    transliteration = template.getNamedParam("tr")
    translation = WiktionaryTranslation(
        Language.findByCode(languageCode), translationText)

    # A third numbered parameter is used as the gender unless it
    # contains "=".
    if template.getNumberedParamsCount() > 2:
        gender = template.getNumberedParam(2)
        if "=" not in gender:
            translation.setGender(gender)

    translation.setCheckNeeded("check" in template.getName())
    if transliteration is not None:
        translation.setTransliteration(
            StringUtils.cleanText(transliteration))
    return translation
def getWordLanguage(self):
    """Return the word language, resolving it on first use.

    When ``_wordLanguage`` is still ``None`` and ``wordLanguageStr`` is
    set, the string is looked up via ``Language.get`` and the result is
    stored in ``_wordLanguage``.
    """
    cached = self._wordLanguage
    if cached is not None:
        return cached
    code = self.wordLanguageStr
    if code is None:
        # Nothing to resolve from.
        return None
    self._wordLanguage = Language.get(code)
    return self._wordLanguage
def getEntryLanguage(self):
    """Return the entry language, resolving it on first use.

    When ``_entryLanguage`` is still ``None`` and ``entryLanguageStr`` is
    set, the string is looked up via ``Language.get`` and the result is
    stored in ``_entryLanguage``.
    """
    cached = self._entryLanguage
    if cached is not None:
        return cached
    code = self.entryLanguageStr
    if code is None:
        # Nothing to resolve from.
        return None
    self._entryLanguage = Language.get(code)
    return self._entryLanguage
def parseWikisaurusEntries(self, title, text):
    """Parse the text of a Wikisaurus page into a set of sense entries.

    The text is scanned line by line.  The number of leading ``=``
    characters selects the heading level: 2 sets the language, 3 the part
    of speech, 4 with ``{{ws sense`` opens a new sense, and 4 or 5 inside
    a sense selects the relation type.  ``{{ws|`` lines inside a
    ``{{ws beginlist`` / ``{{ws endlist`` pair add relations to the open
    sense.

    Args:
        title: page title; passed to each ``WikisaurusEntry`` and used in
            diagnostic messages.
        text: the page's wiki text.

    Returns:
        A set of the ``WikisaurusEntry`` objects created while parsing.

    Raises:
        RuntimeException: if reading the text raises ``IOException``.
    """
    result = set()
    reader = StringReader(text)
    currentLang = None
    currentPos = None
    currentRelType = None
    inList = False
    inRelation = False
    inSense = False
    wikisaurusSense = None
    try:
        for line in reader.readLines():
            line = line.strip()
            if not line:
                continue

            # Heading level = number of leading '=' characters.  Every
            # '=' is then removed from the line before it is inspected.
            countSectionIdentifier = len(line) - len(line.lstrip("="))
            line = line.replace("=", "")
            startsSense = (countSectionIdentifier == 4
                           and line.startswith("{{ws sense"))

            # A language, POS or new-sense heading closes the open sense.
            # The "is not None" guard must cover BOTH alternatives;
            # otherwise the first sense heading would add None to the
            # result set.
            if wikisaurusSense is not None and (
                    2 <= countSectionIdentifier < 4 or startsSense):
                result.add(wikisaurusSense)
                wikisaurusSense = None

            if countSectionIdentifier == 2:  # Language
                currentLang = Language.findByName(line)
                inRelation = False
                inSense = False
            elif countSectionIdentifier == 3:  # POS
                # TODO: language-specific POS tags?
                currentPos = PartOfSpeech.findByName(line)
                inRelation = False
                inSense = False
            elif startsSense:  # Sense
                senseDef = self.extractSenseDefinition(line)
                wikisaurusSense = WikisaurusEntry(title, currentPos,
                                                  currentLang, senseDef)
                inRelation = False
                inSense = True
            elif countSectionIdentifier in (4, 5) and inSense:  # Relation type
                currentRelType = self.relTypeMap.get(line.strip().lower())
                inRelation = True
                if currentRelType is None:
                    # Unknown relation heading: report it and count it.
                    print(title + " RELATION NOT FOUND: " + line)
                    if line in self.notFoundRelation:
                        self.notFoundRelation[line] = \
                            self.notFoundRelation[line] + 1
                    else:
                        self.notFoundRelation[line] = 1
            elif line.startswith("{{ws beginlist"):
                inList = True
            elif line.startswith("{{ws endlist"):
                inList = False
            elif line.startswith("{{ws|") and inRelation and inList:
                target = self.extractRelTarget(line)
                # Targets under an unknown relation heading are dropped.
                if currentRelType is not None:
                    wikisaurusSense.addRelation(target[0], target[1],
                                                currentRelType)

        # Flush the sense that was still open at end of text.
        if wikisaurusSense is not None:
            result.add(wikisaurusSense)
    except IOException as e:
        raise RuntimeException(
            "Error while parsing text of Wikisaurus page "
            + title, e) from e
    return result
def getLanguage(self):
    """Return the language, resolving it on first use.

    When ``_language`` is still ``None`` and ``languageStr`` is set, the
    string is looked up via ``Language.get`` and the result is stored in
    ``_language``.
    """
    cached = self._language
    if cached is not None:
        return cached
    code = self.languageStr
    if code is None:
        # Nothing to resolve from.
        return None
    self._language = Language.get(code)
    return self._language
def resolveLanguage(self, baseURL):
    """Resolve a language from the two characters that open a URL's host.

    The two characters following ``"://"`` are passed to
    ``Language.findByCode``.  When the URL has no ``"://"`` separator, the
    first two characters of the string are used instead; previously
    ``find`` returned -1 and the slice silently became ``baseURL[2:4]``.

    Args:
        baseURL: URL string, with or without a scheme.

    Returns:
        Whatever ``Language.findByCode`` returns for the extracted code.
    """
    _, separator, afterScheme = baseURL.partition("://")
    # Without a scheme separator the whole string is treated as the host.
    host = afterScheme if separator else baseURL
    return Language.findByCode(host[:2])
def testISOCodes(self):
    """Check the four ISO 639 getters of languages found by name.

    Each row lists a language name followed by the expected results of
    ``getISO639_1``, ``getISO639_2B``, ``getISO639_2T`` and
    ``getISO639_3``; an empty string is expected where the row says so.
    """
    expectations = (
        # name, 639-1, 639-2/B, 639-2/T, 639-3
        ("English", "en", "eng", "eng", "eng"),
        ("German", "de", "ger", "deu", "deu"),
        ("Dimili", "", "zza", "zza", "zza"),
        ("Aasáx", "", "", "", "aas"),
        ("Tokipona", "", "", "", ""),
    )
    for name, iso1, iso2b, iso2t, iso3 in expectations:
        # The language is looked up anew for every assertion.
        self.assertEqual(iso1, Language.findByName(name).getISO639_1())
        self.assertEqual(iso2b, Language.findByName(name).getISO639_2B())
        self.assertEqual(iso2t, Language.findByName(name).getISO639_2T())
        self.assertEqual(iso3, Language.findByName(name).getISO639_3())
def testFindByName(self):
    """Check that ``Language.findByName`` maps each name to its code.

    Covers several names expected to share the code ``zza``, lower- and
    upper-case spellings, and several spellings expected to map to
    ``nmn``.
    """
    expectedCodes = (
        ("eng", "English"),
        ("deu", "German"),
        ("zza", "Dimili"),
        ("zza", "Kirmanjki"),
        ("zza", "Kirdki"),
        ("zza", "Dimli"),
        ("zza", "Zazaki"),
        ("zza", "Zaza"),
        ("eng", "english"),
        ("eng", "ENGLISH"),
        ("deu", "german"),
        ("deu", "GERMAN"),
        ("Xart-tok", "tokipona"),
        # NOTE(review): some of the following literals look alike; they
        # may differ in their underlying Unicode code points -- keep each
        # literal exactly as it is.
        ("nmn", "!xóõ"),
        ("nmn", "!Xóõ"),
        ("nmn", "!Xóõ"),
        ("nmn", "!Xóõ"),
        ("nmn", "!xóõ"),
        ("nmn", "ǃXóõ"),
    )
    for code, name in expectedCodes:
        self.assertEqual(code, Language.findByName(name).getCode())
def testFindByCode(self):
    """Check that ``Language.findByCode`` resolves each input code to a
    language whose ``getCode`` returns the expected value."""
    expectedCodes = (
        # expected getCode(), code passed to findByCode
        ("eng", "eng"),
        ("deu", "deu"),
        ("eng", "en"),
        ("deu", "de"),
        ("deu", "ger"),
    )
    for expected, code in expectedCodes:
        self.assertEqual(expected, Language.findByCode(code).getCode())
def testGet(self):
    """Check which codes ``Language.get`` resolves.

    ``"eng"`` and ``"deu"`` must resolve to languages with the same code;
    ``"en"``, ``"de"`` and ``"ger"`` must yield ``None``.
    """
    for code in ("eng", "deu"):
        self.assertEqual(code, Language.get(code).getCode())
    for unresolved in ("en", "de", "ger"):
        self.assertIsNone(Language.get(unresolved))