コード例 #1
0
    def fillContent(self, context):
        """Copy the collected part-of-speech data into a word entry.

        Pending quotations are flushed first. The target entry is either the
        page's single automatically created entry (when it has no part of
        speech yet) or a new entry built by ``self.entryFactory`` and added to
        the page. Pronunciations, senses (gloss, examples, quotations,
        relations), word forms, the raw headword line and genders are then
        stored on that entry.

        :param context: parsing context providing the page, language, part of
            speech, header, etymology and pronunciations.
        """
        self.saveQuotations()

        page = context.getPage()
        # Constituents found before the first entry live in an automatically
        # created entry; complete that one instead of adding a second entry.
        reuseAutoEntry = (page.getEntryCount() == 1
                          and page.getEntry(0).getPartOfSpeech() is None)
        if reuseAutoEntry:
            entry = page.getEntry(0)
            entry.setWordLanguage(context.getLanguage())
            entry.addPartOfSpeech(context.getPartOfSpeech())
            header = context.getHeader()
            if header is not None:
                entry.setHeader(header)
            entry.setWordEtymology(context.getEtymology())
        else:
            entry = self.entryFactory.createEntry(context)
            page.addEntry(entry)

        pronunciations = context.getPronunciations()
        if pronunciations is not None:
            for item in pronunciations:
                entry.addPronunciation(item)

        for glossEntry in self.glossEntryList:
            sense = entry.createSense()
            sense.setGloss(WikiString(glossEntry.getDefinition()))

            for exampleText in glossEntry.getExampleList():
                translated = glossEntry.getExampleTranslation(exampleText)
                exampleString = WikiString(exampleText)
                if translated is None:
                    translatedString = None
                else:
                    translatedString = WikiString(translated)
                sense.addExample(
                    WiktionaryExample(exampleString, translatedString))

            for quotation in glossEntry.getQuotations():
                sense.addQuotation(quotation)

            entry.addSense(sense)

            # Build every relation first, then attach them to the sense.
            # NOTE(review): this unpacking needs getRelations() to yield
            # (key, targets) pairs; a plain dict would yield keys only --
            # confirm against the gloss entry implementation.
            relations = []
            for relationKey, targets in glossEntry.getRelations():
                for target in targets:
                    relations.append(WiktionaryRelation(target, relationKey))
            for relation in relations:
                sense.addRelation(relation)

        formHandler = self.wordFormHandler
        for wordForm in formHandler.getWordForms():
            entry.addWordForm(wordForm)

        entry.setRawHeadwordLine(formHandler.getRawHeadwordLine())

        genders = formHandler.getGenders()
        if genders is not None:
            for gender in genders:
                entry.addGender(gender)
コード例 #2
0
    def parseTranslation(self, languageHeader, text):
        """Parse one translation fragment into a translation object.

        The fragment is matched against ``ENTranslationHandler.TRANSLATION``.
        Content starting with ``{{`` is handed to ``self.parseTemplate``; any
        other content becomes a ``WiktionaryTranslation`` for
        ``languageHeader`` with wiki links removed. Prefix, gender and postfix
        are merged into the additional information, and the current sense
        (when non-blank) is stored as the raw sense.

        :param languageHeader: language used for non-template translations.
        :param text: the translation fragment to parse.
        :return: the translation, or ``None`` when the fragment does not match
            or the template could not be parsed.
        """
        match = re.search(ENTranslationHandler.TRANSLATION, text)
        if match is None:
            return None

        prefix, content, postfix = match.group("prefix", "content", "postfix")

        if not content.startswith("{{"):
            plainText = StringUtils.cleanText(
                WikiString.removeWikiLinks(content))
            translation = WiktionaryTranslation(languageHeader, plainText)
        else:
            translation = self.parseTemplate(content)

        if translation is None:
            return None

        infoParts = []
        if prefix is not None:
            infoParts.append(prefix.strip())
        gender = translation.getGender()
        if gender is not None:
            infoParts.append(" {{" + gender + "}} ")
        # NOTE(review): unlike prefix, postfix is not None-checked; an
        # unmatched optional group would raise TypeError here -- confirm the
        # pattern always captures it.
        infoParts.append(postfix)
        additionalInformation = "".join(infoParts)
        translation.setAdditionalInformation(
            StringUtils.cleanText(additionalInformation.strip()))

        if self.currentSense is not None:
            rawSense = self.currentSense.strip()
            if rawSense:
                translation.setRawSense(rawSense)

        return translation
コード例 #3
0
    def processBody(self, textLine, context):
        """Consume one line of a references block.

        Lines opening a ``{{quote-`` template, and the continuation lines of
        such a template (or any line starting with ``|``), are swallowed
        without being stored. Other template lines and ``*`` bullet lines are
        appended to ``self.references`` as ``WikiString`` objects.

        :param textLine: the raw line to process.
        :param context: parsing context (not used here).
        :return: ``True`` when the line was consumed, ``False`` when it does
            not belong to the references block.
        """
        textLine = textLine.strip()
        if textLine.startswith("{{quote-"):
            self.inTemplate = True
        elif self.inTemplate or textLine.startswith("|"):
            # The multi-line template ends on the line carrying the closing
            # braces. The previous check was inverted: it left template mode
            # on the first continuation line *without* "}}" and never left it
            # on the line that actually closed the template.
            if "}}" in textLine:
                self.inTemplate = False
        elif textLine.startswith("{{"):
            self.references.append(WikiString(textLine))
        elif textLine.startswith("*"):
            # Drop the bullet marker before storing the reference text.
            self.references.append(WikiString(textLine[1:].strip()))
        else:
            return False

        return True
コード例 #4
0
    def extractQuotation(self, textLine, additionalLine, context):
        """Extract a quotation from a bullet line into ``self.quotations``.

        Only lines starting with ``*`` are handled. After the bullet:

        * a ``:`` prefix adds the text as a new line of the last quotation;
        * otherwise, with ``additionalLine`` set, the text is appended to the
          last line of the last quotation (or becomes its first line);
        * otherwise a new ``Quotation`` is started -- template text becomes
          its first line, anything else its source.

        :param textLine: the raw line to inspect.
        :param additionalLine: ``False`` to start a new quotation, ``True`` to
            extend the most recent one.
        :param context: parsing context (not used here).
        :return: always ``False``.
        """
        stripped = textLine.strip()
        if not stripped.startswith("*"):
            return False

        body = stripped[1:].strip()

        if body.startswith(":"):
            if self.quotations:
                # Remove every leading colon before storing the line.
                continuation = body.lstrip(":").strip()
                self.quotations[-1].addLine(WikiString(continuation))

        elif additionalLine:
            if self.quotations:
                lines = self.quotations[-1].getLines()
                if lines:
                    merged = lines[-1].getText() + " " + body
                    lines[-1] = WikiString(merged.strip())
                else:
                    lines.append(WikiString(body.strip()))

        else:
            quotation = Quotation()
            content = WikiString(body.strip())
            if body.startswith("{{"):
                quotation.addLine(content)
            else:
                quotation.setSource(content)
            self.quotations.append(quotation)

        return False
コード例 #5
0
    def processBody(self, text, context):
        """Process one line of a translation block.

        Column separators (``{{trans-mid}}``/``{{mid}}``) are skipped.
        ``{{trans-top|...}}`` stores its first numbered parameter as the
        current sense and ``{{top}}`` resets the current sense to the empty
        string. Any other template or a heading ends the block. Remaining
        lines must match ``ENTranslationHandler.LANGUAGE``; the text after the
        match is split into parts, and every successfully parsed translation
        is stored in ``self.sensNum2trans`` under the current sense.

        :param text: the raw line to process.
        :param context: parsing context (not used here).
        :return: ``True`` when the line was consumed, ``False`` when the
            translation block ends or the line cannot be handled.
        """
        text = text.strip()

        if text.startswith(("{{trans-mid}}", "{{mid}}")):
            return True

        if text.startswith("{{trans-top|") and "}}" in text:
            inner = text[2:text.find("}}")]
            template = TemplateParser.parseTemplate(inner)
            if template is not None and template.getNumberedParamsCount() >= 1:
                self.currentSense = template.getNumberedParam(0)
            return True

        if text.startswith("{{top}}"):
            self.currentSense = ""
            return True

        # End-of-block templates, any other template, or a new heading.
        if text.startswith(("{{trans-bottom}}", "{{bottom}}", "{{", "==")):
            return False

        match = re.search(ENTranslationHandler.LANGUAGE, text)
        if match is None:
            return False

        languageName = WikiString.removeWikiLinks(match.group(1).strip())
        language = Language.findByName(languageName)

        restStart = match.end()
        if restStart >= len(text):
            # Nothing follows the language label.
            return False

        for fragment in self.splitTranslationParts(text[restStart:]):
            translation = self.parseTranslation(language, fragment)
            if translation is None:
                continue
            # Group the translation under the sense currently being read.
            self.sensNum2trans.setdefault(self.currentSense, []).append(
                translation)

        return True
コード例 #6
0
    def findMatchingSense(cls, entry, marker):
        """Find the sense of ``entry`` whose gloss best matches ``marker``.

        An entry with exactly one sense yields ``entry.getSense(1)`` directly.
        Otherwise every sense with a positive index is scored against the
        marker with ``SimilarityUtils.wordSim`` and ``SimilarityUtils.textSim``
        on its lower-cased, link-free gloss, and the sense behind the higher
        of the two best scores is returned.

        :param entry: the word entry whose senses are searched.
        :param marker: the sense marker (comment) to match; may be ``None``.
        :return: the matching sense, or ``None`` when the marker is empty, no
            sense qualifies, or neither best score is positive.
        """
        if entry.getSenseCount() == 1:
            return entry.getSense(1)

        if marker is None or len(marker) == 0:
            return None

        wordBest, wordBestScore = None, -1
        textBest, textBestScore = None, -1

        for candidate in entry.senses:
            if candidate.getIndex() <= 0:
                # Skip the unassigned sense.
                continue

            glossText = candidate.getGloss().getText()
            gloss = WikiString.removeWikiLinks(glossText).lower()

            wordScore = SimilarityUtils.wordSim(marker, gloss)
            if wordScore > wordBestScore:
                wordBest, wordBestScore = candidate, wordScore

            textScore = SimilarityUtils.textSim(marker, gloss)
            if textScore > textBestScore:
                textBest, textBestScore = candidate, textScore

        nothingFound = wordBest is None and textBest is None
        nothingSimilar = wordBestScore <= 0 and textBestScore <= 0
        if nothingFound or nothingSimilar:
            return None

        # On a tie the text-similarity candidate is chosen.
        return wordBest if wordBestScore > textBestScore else textBest
コード例 #7
0
    def parseTemplate(self, templateString):
        """Build a translation from a translation template string.

        The surrounding two characters on each side are cut off before the
        string is passed to ``TemplateParser.parseTemplate``. Numbered
        parameter 0 is used as the language code, parameter 1 as the
        translation text, and parameter 2 (when present and free of ``=``) as
        the gender. The named parameter ``tr`` supplies the transliteration,
        and a template name containing ``check`` flags the translation as
        needing a check.

        :param templateString: the template text including its delimiters.
        :return: the translation, or ``None`` when the template cannot be
            parsed, has fewer than two numbered parameters, or yields an
            empty translation text.
        """
        template = TemplateParser.parseTemplate(templateString[2:-2])
        if template is None:
            return None

        numberedCount = template.getNumberedParamsCount()
        if numberedCount <= 1:
            return None

        rawText = template.getNumberedParam(1)
        translationText = StringUtils.cleanText(
            WikiString.removeWikiLinks(rawText))
        if not translationText:
            return None

        languageCode = template.getNumberedParam(0)
        transliteration = template.getNamedParam("tr")
        language = Language.findByCode(languageCode)
        translation = WiktionaryTranslation(language, translationText)

        if numberedCount > 2:
            thirdParam = template.getNumberedParam(2)
            # A value containing "=" is not treated as a gender.
            if "=" not in thirdParam:
                translation.setGender(thirdParam)

        translation.setCheckNeeded("check" in template.getName())

        if transliteration is not None:
            translation.setTransliteration(
                StringUtils.cleanText(transliteration))

        return translation
コード例 #8
0
    def testplainText(self):
        """Check ``WikiString.getPlainText`` on English and German samples."""
        # (wiki markup, expected plain text), checked in this order.
        cases = (
            (
                "* {{sense|of or pertaining to the abdomen}} [[ventral]]",
                "ventral",
            ),
            (
                "# {{zoology|obsolete}} Belonging to the [[abdominales|Abdominales]]; as, ''abdominal'' fishes.",
                "# Belonging to the Abdominales; as, abdominal fishes.",
            ),
            (
                ":[1] [[eukaryotisch]]es [[Lebewesen|Lebw.]], das keine [[Photosynthese]] betreiben kann, [[Sauerstoff]] zur [[Atmung]] benötigt und tierischen und/oder pflanzlichen Organismen als [[Nahrung]] zu sich nimmt",
                "[1] eukaryotisches Lebw., das keine Photosynthese betreiben kann, Sauerstoff zur Atmung benötigt und tierischen und/oder pflanzlichen Organismen als Nahrung zu sich nimmt",
            ),
            (
                ":[1] \"Die ''Welt'' ist schon oft mit einem Narrenhause verglichen worden.\"<ref>[http://www.humanist.de/religion/pfaffe.html Otto von Corvin, Der Pfaffenspiegel] </ref>",
                "[1] \"Die Welt ist schon oft mit einem Narrenhause verglichen worden.\"",
            ),
            (
                ":[1–10] {{Wikipedia|Welt (Begriffsklärung)}}",
                "[1–10]",
            ),
        )
        for markup, expected in cases:
            wikiString = WikiString(markup)
            self.assertEqual(expected, wikiString.getPlainText())
コード例 #9
0
 def testremoveWikiLinks(self):
     """Piped and plain wiki links both reduce to their display text."""
     for markup in ("[[leader|Leader]]", "[[Leader]]"):
         self.assertEqual("Leader", WikiString.removeWikiLinks(markup))
コード例 #10
0
 def fillContent(self, context):
     """Store the buffered etymology text on the parsing context.

     The stripped content buffer is wrapped in a ``WikiString``; a blank
     buffer stores ``None`` instead.

     :param context: parsing context receiving the etymology.
     """
     etymologyText = self.contentBuffer.strip()
     context.setEtymology(WikiString(etymologyText) if etymologyText else None)