def fillContent(self, context):
    """
    Store the collected part of speech, pronunciations, senses (with their
    examples, quotations and relations), word forms and genders in the
    entry of the page that is being parsed.

    @param context the parsing context; it supplies the page, language,
        part of speech, header, etymology and pronunciations.
    """
    self.saveQuotations()

    page = context.getPage()
    # In the special case when article constituents have been found before
    # the first entry, do not create a new entry, but use the automatically
    # created one.
    if page.getEntryCount() == 1 and page.getEntry(0).getPartOfSpeech() is None:
        entry = page.getEntry(0)
        entry.setWordLanguage(context.getLanguage())
        entry.addPartOfSpeech(context.getPartOfSpeech())
        header = context.getHeader()
        if header is not None:
            entry.setHeader(header)
        entry.setWordEtymology(context.getEtymology())
    else:
        entry = self.entryFactory.createEntry(context)
        page.addEntry(entry)

    pronunciations = context.getPronunciations()
    if pronunciations is not None:
        for pronunciation in pronunciations:
            entry.addPronunciation(pronunciation)

    for senseEntry in self.glossEntryList:
        sense = entry.createSense()
        sense.setGloss(WikiString(senseEntry.getDefinition()))
        for exp in senseEntry.getExampleList():
            translation = senseEntry.getExampleTranslation(exp)
            sense.addExample(
                WiktionaryExample(
                    WikiString(exp),
                    None if translation is None else WikiString(translation)))
        for quotation in senseEntry.getQuotations():
            sense.addQuotation(quotation)
        entry.addSense(sense)

        relations = senseEntry.getRelations()
        # Iterating a mapping yields only its keys, so go through items()
        # when a mapping is returned; an iterable of (type, targets) pairs
        # is used as is.
        if hasattr(relations, "items"):
            relations = relations.items()
        for relationType, targets in relations:
            for target in targets:
                sense.addRelation(WiktionaryRelation(target, relationType))

    for form in self.wordFormHandler.getWordForms():
        entry.addWordForm(form)
    entry.setRawHeadwordLine(self.wordFormHandler.getRawHeadwordLine())
    genders = self.wordFormHandler.getGenders()
    if genders is not None:
        for gender in genders:
            entry.addGender(gender)
def parseTranslation(self, languageHeader, text):
    """
    Parse a single translation item.

    @param languageHeader language handed to the WiktionaryTranslation that
        is created when the content is not a template.
    @param text the text of one translation item.
    @return the parsed translation, or None if the text does not match
        ENTranslationHandler.TRANSLATION or parseTemplate() yields nothing.
    """
    found = re.search(ENTranslationHandler.TRANSLATION, text)
    if found is None:
        return None

    prefix = found.group("prefix")
    content = found.group("content")
    postfix = found.group("postfix")

    if content.startswith("{{"):
        # Template content is delegated to parseTemplate().
        result = self.parseTemplate(content)
        if result is None:
            return None
    else:
        plainContent = StringUtils.cleanText(
            WikiString.removeWikiLinks(content))
        result = WiktionaryTranslation(languageHeader, plainContent)

    # Additional information is: prefix, optional gender template, postfix.
    info = "" if prefix is None else prefix.strip()
    gender = result.getGender()
    if gender is not None:
        info += " {{" + gender + "}} "
    info += postfix
    result.setAdditionalInformation(StringUtils.cleanText(info.strip()))

    rawSense = self.currentSense
    if rawSense is not None and rawSense.strip():
        result.setRawSense(rawSense.strip())
    return result
def processBody(self, textLine, context):
    """
    Process one line of the block handled by this handler.

    A line starting with "{{quote-" switches the handler into template
    mode. While in template mode (or for lines starting with "|") lines are
    consumed without being stored, and the line containing "}}" leaves
    template mode. Other lines starting with "{{" and list items starting
    with "*" are appended to self.references.

    @param textLine the line to process.
    @param context the parsing context (not used here).
    @return True if the line was consumed, False if it is none of the
        recognized kinds of line.
    """
    textLine = textLine.strip()
    if textLine.startswith("{{quote-"):
        self.inTemplate = True
    elif self.inTemplate or textLine.startswith("|"):
        # Continuation of a multi-line template: it only ends on the line
        # that actually contains the closing braces.
        if "}}" in textLine:
            self.inTemplate = False
    elif textLine.startswith("{{"):
        self.references.append(WikiString(textLine))
    elif textLine.startswith("*"):
        # Drop the list bullet before storing the reference.
        self.references.append(WikiString(textLine[1:].strip()))
    else:
        return False
    return True
def extractQuotation(self, textLine, additionalLine, context):
    """
    Extract a quotation from the given line and add it to the internal list.

    @param textLine the line to inspect; lines that do not start with "*"
        are ignored.
    @param additionalLine if <code>False</code> a new quotation is added to
        the list, otherwise the text is appended to the last line of the
        most recent quotation.
    @param context the parsing context (not used here).
    @return always <code>False</code>.
    """
    stripped = textLine.strip()
    if not stripped.startswith("*"):
        return False

    body = stripped[1:].strip()
    latest = self.quotations[-1] if self.quotations else None

    if body.startswith(":"):
        # A ":"-prefixed item becomes a new line of the latest quotation.
        if latest is not None:
            latest.addLine(WikiString(body.lstrip(":").strip()))
    elif additionalLine:
        if latest is not None:
            lines = latest.getLines()
            if lines:
                # Glue the text onto the last line of the latest quotation.
                merged = lines[-1].getText() + " " + body
                lines[-1] = WikiString(merged.strip())
            else:
                lines.append(WikiString(body.strip()))
    else:
        fresh = Quotation()
        wikiText = WikiString(body.strip())
        # A template is stored as a quotation line, anything else as source.
        if body.startswith("{{"):
            fresh.addLine(wikiText)
        else:
            fresh.setSource(wikiText)
        self.quotations.append(fresh)
    return False
def processBody(self, text, context):
    """
    Process one line of a translation block.

    Separator and header templates update self.currentSense or are skipped;
    other lines are matched against ENTranslationHandler.LANGUAGE and the
    translations found after the language name are collected in
    self.sensNum2trans under the current sense.

    @param text the line to process.
    @param context the parsing context (not used here).
    @return True if the line was consumed, False if the block has ended or
        the line could not be handled.
    """
    text = text.strip()

    if text.startswith(("{{trans-mid}}", "{{mid}}")):
        return True

    if text.startswith("{{trans-top|") and "}}" in text:
        template = TemplateParser.parseTemplate(text[2:text.find("}}")])
        if template is not None and template.getNumberedParamsCount() >= 1:
            self.currentSense = template.getNumberedParam(0)
        return True

    if text.startswith("{{top}}"):
        self.currentSense = ""
        return True

    # "{{trans-bottom}}" / "{{bottom}}" indicate the end of the translation
    # block; any other template or a heading indicates that a new block has
    # just started.
    if text.startswith(("{{trans-bottom}}", "{{bottom}}", "{{", "==")):
        return False

    languageMatch = re.search(ENTranslationHandler.LANGUAGE, text)
    if languageMatch is None:
        return False
    languageName = WikiString.removeWikiLinks(languageMatch.group(1).strip())
    language = Language.findByName(languageName)

    restStart = languageMatch.end()
    if restStart >= len(text):
        # Nothing follows the language name.
        return False

    for part in self.splitTranslationParts(text[restStart:]):
        translation = self.parseTranslation(language, part)
        if translation is None:
            continue
        # Save the translation under the current sense.
        self.sensNum2trans.setdefault(self.currentSense, []).append(translation)
    return True
def findMatchingSense(cls, entry, marker):
    """
    Find the word sense of an entry that corresponds to a sense marker.

    Each sense with a positive index is scored against the marker with
    SimilarityUtils.wordSim and SimilarityUtils.textSim (on the lower-cased
    gloss with wiki links removed); the sense behind the higher of the two
    best scores is returned.

    @param entry the entry whose senses are searched.
    @param marker the comment (sense marker) to be matched.
    @return the matching word sense, or <code>None</code> if no matching
        word sense could be found.
    """
    # Entries with exactly one sense: that sense is the match.
    if entry.getSenseCount() == 1:
        return entry.getSense(1)

    # Empty sense marker.
    if not marker:
        return None

    wordBest, wordBestScore = None, -1
    textBest, textBestScore = None, -1
    for candidate in entry.senses:
        if candidate.getIndex() <= 0:
            continue  # Skip unassigned sense.
        gloss = WikiString.removeWikiLinks(
            candidate.getGloss().getText()).lower()

        wordScore = SimilarityUtils.wordSim(marker, gloss)
        if wordScore > wordBestScore:
            wordBest, wordBestScore = candidate, wordScore

        textScore = SimilarityUtils.textSim(marker, gloss)
        if textScore > textBestScore:
            textBest, textBestScore = candidate, textScore

    if wordBest is None and textBest is None:
        return None
    if wordBestScore <= 0 and textBestScore <= 0:
        return None
    return wordBest if wordBestScore > textBestScore else textBest
def parseTemplate(self, templateString):
    """
    Build a translation from a translation template.

    @param templateString the template text; its first and last two
        characters (presumably the surrounding braces) are dropped before
        the text is handed to TemplateParser.
    @return the translation, or None if the template cannot be parsed, has
        fewer than two numbered parameters, or its translation text is
        empty after cleaning.
    """
    template = TemplateParser.parseTemplate(templateString[2:-2])
    if template is None:
        return None
    if template.getNumberedParamsCount() <= 1:
        return None

    # Numbered parameter 1 is the translation text, parameter 0 the
    # language code.
    word = StringUtils.cleanText(
        WikiString.removeWikiLinks(template.getNumberedParam(1)))
    if not word:
        return None

    languageCode = template.getNumberedParam(0)
    transliteration = template.getNamedParam("tr")
    result = WiktionaryTranslation(Language.findByCode(languageCode), word)

    # A third numbered parameter is taken as the gender unless it
    # contains "=".
    hasGender = (template.getNumberedParamsCount() > 2
                 and "=" not in template.getNumberedParam(2))
    if hasGender:
        result.setGender(template.getNumberedParam(2))

    result.setCheckNeeded("check" in template.getName())
    if transliteration is not None:
        result.setTransliteration(StringUtils.cleanText(transliteration))
    return result
def testplainText(self):
    """Check WikiString.getPlainText() against sample wiki markup."""
    # (wiki markup, expected plain text) pairs, checked in order.
    samples = [
        (
            "* {{sense|of or pertaining to the abdomen}} [[ventral]]",
            "ventral",
        ),
        (
            "# {{zoology|obsolete}} Belonging to the [[abdominales|Abdominales]]; as, ''abdominal'' fishes.",
            "# Belonging to the Abdominales; as, abdominal fishes.",
        ),
        (
            ":[1] [[eukaryotisch]]es [[Lebewesen|Lebw.]], das keine [[Photosynthese]] betreiben kann, [[Sauerstoff]] zur [[Atmung]] benötigt und tierischen und/oder pflanzlichen Organismen als [[Nahrung]] zu sich nimmt",
            "[1] eukaryotisches Lebw., das keine Photosynthese betreiben kann, Sauerstoff zur Atmung benötigt und tierischen und/oder pflanzlichen Organismen als Nahrung zu sich nimmt",
        ),
        (
            ":[1] \"Die ''Welt'' ist schon oft mit einem Narrenhause verglichen worden.\"<ref>[http://www.humanist.de/religion/pfaffe.html Otto von Corvin, Der Pfaffenspiegel] </ref>",
            "[1] \"Die Welt ist schon oft mit einem Narrenhause verglichen worden.\"",
        ),
        (
            ":[1–10] {{Wikipedia|Welt (Begriffsklärung)}}",
            "[1–10]",
        ),
    ]
    for markup, expected in samples:
        wikiString = WikiString(markup)
        self.assertEqual(expected, wikiString.getPlainText())
def testremoveWikiLinks(self):
    """Check that both wiki link forms are reduced to the text "Leader"."""
    for markup in ("[[leader|Leader]]", "[[Leader]]"):
        self.assertEqual("Leader", WikiString.removeWikiLinks(markup))
def fillContent(self, context):
    """
    Store the buffered content as the etymology of the parsing context.

    @param context the parsing context; its etymology is set to a
        WikiString of the stripped buffer, or to None when the buffer
        holds only whitespace.
    """
    etymologyText = self.contentBuffer.strip()
    context.setEtymology(WikiString(etymologyText) if etymologyText else None)