Esempio n. 1
0
 def testDid(self):  # throws Exception
     """Check the entries parsed from the fixture "did.txt".

     The page must yield exactly three entries, in this order:
     Translingual/NUMBER, English/VERB and Old Welsh/NOUN. The integer in
     each expectation is forwarded as the third argument of assertEntry
     (presumably an expected count -- confirm against assertEntry).
     """
     page = self.parse("did.txt")
     entryIter = List(page.getEntries()).iterator()
     expectedEntries = (
         (Language.findByName("Translingual"), PartOfSpeech.NUMBER, 1),
         (Language.ENGLISH, PartOfSpeech.VERB, 1),
         (Language.findByName("Old Welsh"), PartOfSpeech.NOUN, 1),
     )
     for language, partOfSpeech, count in expectedEntries:
         self.assertEntry(language, partOfSpeech, count, entryIter.next())
     # No further entries may follow the expected ones.
     self.assertFalse(entryIter.hasNext())
Esempio n. 2
0
 def testAbele(self):  # throws Exception
     """Check the entries parsed from the fixture "abele.txt".

     The page must yield exactly two entries, in this order: English/NOUN
     and Novial/NOUN. The integer in each expectation is forwarded as the
     third argument of assertEntry (presumably an expected count --
     confirm against assertEntry).
     """
     page = self.parse("abele.txt")
     entryIter = List(page.getEntries()).iterator()
     expectedEntries = (
         (Language.ENGLISH, PartOfSpeech.NOUN, 1),
         (Language.findByName("Novial"), PartOfSpeech.NOUN, 1),
     )
     for language, partOfSpeech, count in expectedEntries:
         self.assertEntry(language, partOfSpeech, count, entryIter.next())
     # No further entries may follow the expected ones.
     self.assertFalse(entryIter.hasNext())
Esempio n. 3
0
    def connect(self, isReadOnly, allowCreateNew, overwriteExisting,
                cacheSize):  # throws DatabaseException
        """Open the database environment, entity store, properties and indices.

        Args:
            isReadOnly: forwarded to setReadOnly on both the environment and
                the store configuration.
            allowCreateNew: forwarded to setAllowCreate on both
                configurations.
            overwriteExisting: accepted for interface compatibility; it is
                not read anywhere in this method.
            cacheSize: forwarded to setCacheSize unless it is None.

        Raises:
            DatabaseException: if reading the property file raises
                IOException.
        """
        # --- Database environment (non-transactional) ---
        environmentSettings = EnvironmentConfig()
        environmentSettings.setAllowCreate(allowCreateNew)
        environmentSettings.setReadOnly(isReadOnly)
        environmentSettings.setTransactional(False)
        if cacheSize is not None:
            environmentSettings.setCacheSize(cacheSize)
        self.env = Environment(self.dbPath, environmentSettings)

        # --- Entity store (non-transactional) ---
        storeSettings = StoreConfig()
        storeSettings.setAllowCreate(allowCreateNew)
        storeSettings.setTransactional(False)
        storeSettings.setReadOnly(isReadOnly)
        self.store = EntityStore(
            self.env, DBWiktionaryEdition.DATABASE_NAME, storeSettings)

        # --- Properties ---
        # self.language is only assigned when the property file exists.
        self.properties = Properties()
        propertyFile = File(self.dbPath.filepath,
                            DBWiktionaryEdition.PROPERTY_FILE_NAME)
        if propertyFile.exists():
            try:
                self.properties.load(FileReader(propertyFile))
            except IOException as e:
                raise DatabaseException("Unable to load property file", e)

            # Prefer "wiktionary.language"; fall back to "entry_language".
            languageCode = self.properties.getProperty("wiktionary.language")
            if languageCode is None:
                languageCode = self.properties.getProperty("entry_language")
            self.language = Language.get(languageCode)

        # --- Indices ---
        store = self.store
        self.pageById = store.getPrimaryIndex(
            Long.__class__, WiktionaryPage.__class__)
        self.pageByTitle = store.getSecondaryIndex(
            self.pageById, String.__class__, "title")
        self.pageByNormalizedTitle = store.getSecondaryIndex(
            self.pageById, String.__class__, "normalizedTitle")

        self.entryByKey = store.getPrimaryIndex(
            String.__class__,
            DBWiktionaryEdition.WiktionaryEntryProxy.__class__)
        self.entryById = store.getSecondaryIndex(
            self.entryByKey, Long.__class__, "entryId")
        self.senseByKey = store.getPrimaryIndex(
            String.__class__,
            DBWiktionaryEdition.WiktionarySenseProxy.__class__)

        # Starts out empty; nothing is added to it in this method.
        self.openCursors = set()
Esempio n. 4
0
 def testMay(self):  # throws Exception
     """Check the entries parsed from the fixture "may.txt".

     The page must yield exactly the eight entries listed below, in
     order. The integer in each expectation is forwarded as the third
     argument of assertEntry (presumably an expected count -- confirm
     against assertEntry).
     """
     page = self.parse("may.txt")
     entryIter = List(page.getEntries()).iterator()
     expectedEntries = (
         (Language.ENGLISH, PartOfSpeech.VERB, 4),
         (Language.ENGLISH, PartOfSpeech.NOUN, 1),
         (Language.ENGLISH, PartOfSpeech.VERB, 1),
         (Language.findByName("Crimean Tatar"), PartOfSpeech.NOUN, 1),
         (Language.findByName("Kurdish"), PartOfSpeech.NOUN, 1),
         (Language.findByName("Mapudungun"), PartOfSpeech.ADVERB, 1),
         (Language.findByName("Tagalog"), PartOfSpeech.VERB, 1),
         (Language.findByName("Tatar"), PartOfSpeech.NOUN, 1),
     )
     for language, partOfSpeech, count in expectedEntries:
         self.assertEntry(language, partOfSpeech, count, entryIter.next())
     # No further entries may follow the expected ones.
     self.assertFalse(entryIter.hasNext())
Esempio n. 5
0
 def testBass(self):  # throws Exception
     """Check the entries parsed from the fixture "bass.txt".

     The page must yield exactly four entries, in this order:
     English/ADJECTIVE, English/NOUN (5), English/NOUN (1) and
     Romansch/ADJECTIVE. The integer in each expectation is forwarded as
     the third argument of assertEntry (presumably an expected count --
     confirm against assertEntry).
     """
     page = self.parse("bass.txt")
     entryIter = List(page.getEntries()).iterator()
     expectedEntries = (
         (Language.ENGLISH, PartOfSpeech.ADJECTIVE, 1),
         (Language.ENGLISH, PartOfSpeech.NOUN, 5),
         (Language.ENGLISH, PartOfSpeech.NOUN, 1),
         (Language.findByName("Romansch"), PartOfSpeech.ADJECTIVE, 1),
     )
     for language, partOfSpeech, count in expectedEntries:
         self.assertEntry(language, partOfSpeech, count, entryIter.next())
     # No further entries may follow the expected ones.
     self.assertFalse(entryIter.hasNext())
Esempio n. 6
0
    def processBody(self, text, context):
        """Process one line of a translation section.

        Args:
            text: the line to process; surrounding whitespace is ignored.
            context: not used by this method.

        Returns:
            True if the line was handled as part of the translation block
            (a mid/top marker or a language line with content after the
            language prefix). False if the line ends the block (bottom
            marker, any other template, a heading), does not match the
            language pattern, or has nothing after the language prefix.
        """
        stripped = text.strip()

        # Column separators carry no data but belong to the block.
        if stripped.startswith(("{{trans-mid}}", "{{mid}}")):
            return True

        # "{{trans-top|...}}": the first numbered parameter, if present,
        # becomes the current sense.
        if stripped.startswith("{{trans-top|") and "}}" in stripped:
            closing = stripped.find("}}")
            header = TemplateParser.parseTemplate(stripped[2:closing])
            if header is not None and header.getNumberedParamsCount() >= 1:
                self.currentSense = header.getNumberedParam(0)
            return True

        # "{{top}}" resets the current sense to the empty string.
        if stripped.startswith("{{top}}"):
            self.currentSense = ""
            return True

        # This template indicates the end of the translation block.
        if stripped.startswith(("{{trans-bottom}}", "{{bottom}}")):
            return False
        # Any other template or a heading: a new block has just started.
        if stripped.startswith(("{{", "==")):
            return False

        match = re.search(ENTranslationHandler.LANGUAGE, stripped)
        if match is None:
            return False

        language = Language.findByName(
            WikiString.removeWikiLinks(match.group(1).strip()))

        # Nothing follows the language prefix.
        restStart = match.end()
        if restStart >= len(stripped):
            return False

        for part in self.splitTranslationParts(stripped[restStart:]):
            translation = self.parseTranslation(language, part)
            if translation is None:
                continue
            # Save the translation under the current sense.
            if self.currentSense not in self.sensNum2trans:
                self.sensNum2trans[self.currentSense] = list()
            self.sensNum2trans[self.currentSense].append(translation)

        return True
Esempio n. 7
0
    def parseTemplate(self, templateString):
        """Build a WiktionaryTranslation from a translation template string.

        The first and last two characters of templateString are dropped
        before parsing (presumably the surrounding "{{" and "}}" -- confirm
        against callers).

        Returns:
            The translation, or None if the template cannot be parsed, has
            fewer than two numbered parameters, or its translation text is
            empty after cleaning.
        """
        parsed = TemplateParser.parseTemplate(templateString[2:-2])
        if parsed is None:
            return None
        if parsed.getNumberedParamsCount() <= 1:
            return None

        # Numbered parameter 1 holds the translated text.
        cleanedText = StringUtils.cleanText(
            WikiString.removeWikiLinks(parsed.getNumberedParam(1)))
        if not cleanedText:
            return None

        # Numbered parameter 0 holds the language code.
        code = parsed.getNumberedParam(0)
        translit = parsed.getNamedParam("tr")
        result = WiktionaryTranslation(Language.findByCode(code), cleanedText)

        # A third numbered parameter is used as gender unless it contains "=".
        if parsed.getNumberedParamsCount() > 2:
            gender = parsed.getNumberedParam(2)
            if "=" not in gender:
                result.setGender(gender)

        # Templates whose name contains "check" are flagged as check-needed.
        result.setCheckNeeded("check" in parsed.getName())
        if translit is not None:
            result.setTransliteration(StringUtils.cleanText(translit))

        return result
Esempio n. 8
0
 def getWordLanguage(self):
     """Return the word language, resolving it lazily on first use.

     When no language object is cached yet and wordLanguageStr is set, the
     string is resolved through Language.get and the result is stored in
     _wordLanguage. Returns None when neither is available.
     """
     cached = self._wordLanguage
     if cached is not None:
         return cached
     if self.wordLanguageStr is None:
         return cached
     self._wordLanguage = Language.get(self.wordLanguageStr)
     return self._wordLanguage
 def getEntryLanguage(self):
     """Return the entry language, resolving it lazily on first use.

     When no language object is cached yet and entryLanguageStr is set, the
     string is resolved through Language.get and the result is stored in
     _entryLanguage. Returns None when neither is available.
     """
     cached = self._entryLanguage
     if cached is not None:
         return cached
     if self.entryLanguageStr is None:
         return cached
     self._entryLanguage = Language.get(self.entryLanguageStr)
     return self._entryLanguage
Esempio n. 10
0
    def parseWikisaurusEntries(self, title, text):
        """Parse the wiki text of a Wikisaurus page into sense entries.

        The text is scanned line by line. The number of leading '='
        characters decides how a line is treated:

        * 2 -- language heading (resolved via Language.findByName)
        * 3 -- part-of-speech heading (resolved via PartOfSpeech.findByName)
        * 4 with a "{{ws sense" template -- starts a new WikisaurusEntry
        * 4 or 5 while inside a sense -- relation type heading, looked up in
          self.relTypeMap by its lower-cased text
        * "{{ws beginlist" / "{{ws endlist" -- open / close a relation list
        * "{{ws|" inside a relation and a list -- a relation target, added
          to the current sense when the relation type is known

        Unknown relation headings are printed and counted in
        self.notFoundRelation.

        Args:
            title: page title; passed to each WikisaurusEntry and used in
                diagnostic messages.
            text: the page's wiki text.

        Returns:
            A set of the WikisaurusEntry objects found on the page.

        Raises:
            RuntimeException: if reading the text raises IOException.
        """
        result = set()
        reader = StringReader(text)
        currentLang = None
        currentPos = None
        currentRelType = None
        inList = False
        inRelation = False
        inSense = False
        wikisaurusSense = None
        try:
            for line in reader.readLines():
                line = line.strip()
                if not line:
                    continue

                # Heading level = number of leading '=' characters.
                countSectionIdentifier = len(line) - len(line.lstrip("="))
                # All '=' characters are removed, not only the heading
                # markers.
                line = line.replace("=", "")

                startsNewSense = (countSectionIdentifier == 4
                                  and line.startswith("{{ws sense"))

                # A language/POS heading or a new sense closes the sense
                # collected so far. The None check must guard BOTH
                # alternatives: without the parentheses, `and` binds
                # tighter than `or`, and the first "{{ws sense" heading
                # (no sense open yet) would put None into the result set.
                if wikisaurusSense is not None and (
                        2 <= countSectionIdentifier < 4 or startsNewSense):
                    result.add(wikisaurusSense)
                    wikisaurusSense = None

                if countSectionIdentifier == 2:  # Language
                    currentLang = Language.findByName(line)
                    inRelation = False
                    inSense = False
                elif countSectionIdentifier == 3:  # POS
                    currentPos = PartOfSpeech.findByName(
                        line)  # TODO: language-specific POS tags?
                    inRelation = False
                    inSense = False
                elif startsNewSense:  # Sense
                    senseDef = self.extractSenseDefinition(line)
                    wikisaurusSense = WikisaurusEntry(title, currentPos,
                                                      currentLang, senseDef)
                    inRelation = False
                    inSense = True
                elif countSectionIdentifier in (4, 5) and inSense:
                    # Relation type
                    currentRelType = self.relTypeMap.get(line.strip().lower())
                    inRelation = True
                    if currentRelType is None:
                        print(title + " RELATION NOT FOUND: " + line)
                        # Count how often each unknown heading occurs.
                        if line in self.notFoundRelation:
                            self.notFoundRelation[
                                line] = self.notFoundRelation[line] + 1
                        else:
                            self.notFoundRelation[line] = 1
                elif line.startswith("{{ws beginlist"):
                    inList = True
                elif line.startswith("{{ws endlist"):
                    inList = False
                elif line.startswith("{{ws|") and inRelation and inList:
                    target = self.extractRelTarget(line)
                    # Targets under an unknown relation heading are dropped.
                    if currentRelType is not None:
                        wikisaurusSense.addRelation(target[0], target[1],
                                                    currentRelType)

            # Flush the sense that was still open at the end of the page.
            if wikisaurusSense is not None:
                result.add(wikisaurusSense)
        except IOException as e:
            raise RuntimeException(
                "Error while parsing text of Wikisaurus page " + title, e)

        return result
 def getLanguage(self):
     """Return the language, resolving it lazily on first use.

     When no language object is cached yet and languageStr is set, the
     string is resolved through Language.get and the result is stored in
     _language. Returns None when neither is available.
     """
     cached = self._language
     if cached is not None:
         return cached
     if self.languageStr is None:
         return cached
     self._language = Language.get(self.languageStr)
     return self._language
 def resolveLanguage(self, baseURL):
     """Look up the language for the two characters following "://".

     The two characters right after the first "://" in baseURL are passed
     to Language.findByCode. NOTE(review): when "://" is absent, find()
     returns -1 and the characters at positions 2-3 are used instead --
     confirm callers always pass a URL with a scheme.
     """
     codeStart = baseURL.find("://") + 3
     return Language.findByCode(baseURL[codeStart:codeStart + 2])
    def testISOCodes(self):
        """Check the four ISO 639 code getters for a handful of languages.

        Each row lists a language name followed by the expected results of
        getISO639_1, getISO639_2B, getISO639_2T and getISO639_3. An empty
        string is the expected value where the test expects no code.
        """
        expectedCodes = (
            ("English", ("en", "eng", "eng", "eng")),
            ("German", ("de", "ger", "deu", "deu")),
            ("Dimili", ("", "zza", "zza", "zza")),
            ("Aasáx", ("", "", "", "aas")),
            ("Tokipona", ("", "", "", "")),
        )
        for name, (iso1, iso2b, iso2t, iso3) in expectedCodes:
            # The language is looked up anew for every assertion.
            self.assertEqual(iso1, Language.findByName(name).getISO639_1())
            self.assertEqual(iso2b, Language.findByName(name).getISO639_2B())
            self.assertEqual(iso2t, Language.findByName(name).getISO639_2T())
            self.assertEqual(iso3, Language.findByName(name).getISO639_3())
    def testFindByName(self):
        """Check that Language.findByName resolves names to expected codes.

        Covers several names that all map to "zza", differently-cased
        spellings of English and German, and a group of look-alike
        spellings that all map to "nmn".
        """
        self.assertEqual("eng", Language.findByName("English").getCode())
        self.assertEqual("deu", Language.findByName("German").getCode())
        # Several different names are expected to map to the same code.
        self.assertEqual("zza", Language.findByName("Dimili").getCode())
        self.assertEqual("zza", Language.findByName("Kirmanjki").getCode())
        self.assertEqual("zza", Language.findByName("Kirdki").getCode())
        self.assertEqual("zza", Language.findByName("Dimli").getCode())
        self.assertEqual("zza", Language.findByName("Zazaki").getCode())
        self.assertEqual("zza", Language.findByName("Zaza").getCode())

        # Lower- and upper-case spellings must resolve like the original.
        self.assertEqual("eng", Language.findByName("english").getCode())
        self.assertEqual("eng", Language.findByName("ENGLISH").getCode())
        self.assertEqual("deu", Language.findByName("german").getCode())
        self.assertEqual("deu", Language.findByName("GERMAN").getCode())

        self.assertEqual("Xart-tok", Language.findByName("tokipona").getCode())
        # NOTE(review): several literals below look identical; presumably
        # they differ in code points / Unicode normalization -- confirm,
        # and keep the bytes of these literals untouched when editing.
        self.assertEqual("nmn", Language.findByName("!xóõ").getCode())
        self.assertEqual("nmn", Language.findByName("!Xóõ").getCode())
        self.assertEqual("nmn", Language.findByName("!Xóõ").getCode())
        self.assertEqual("nmn", Language.findByName("!Xóõ").getCode())
        self.assertEqual("nmn", Language.findByName("!xóõ").getCode())
        self.assertEqual("nmn", Language.findByName("ǃXóõ").getCode())
 def testFindByCode(self):
     """Check that Language.findByCode accepts several code variants.

     Each pair is (expected getCode() result, code passed to findByCode);
     two- and three-letter variants must resolve to the same language.
     """
     expectedByInput = (
         ("eng", "eng"),
         ("deu", "deu"),
         ("eng", "en"),
         ("deu", "de"),
         ("deu", "ger"),
     )
     for expectedCode, inputCode in expectedByInput:
         self.assertEqual(expectedCode,
                          Language.findByCode(inputCode).getCode())
 def testGet(self):
     """Check that Language.get only resolves "eng" and "deu" here.

     Unlike the codes accepted in testFindByCode, the inputs "en", "de"
     and "ger" must yield None.
     """
     for code in ("eng", "deu"):
         self.assertEqual(code, Language.get(code).getCode())
     for unsupported in ("en", "de", "ger"):
         self.assertIsNone(Language.get(unsupported))