def parse(self):
    """Split the Wortart title into child sections and flag name-like words.

    The text is chopped with ``WortartTitlePieceBlock`` and
    ``WortartTitleTagBlock``.  Filler blocks and tag blocks become
    ``FillerSection`` children (created with ``correct=True``); every other
    piece becomes a ``deWortartTitlePieceSection``.  Each child is parsed
    before it is appended to ``self.children``.

    Returns:
        This section (``self``).
    """
    super(deWortartTitleSection, self).parse()
    passthrough_types = (FillerBlock, self.WortartTitleTagBlock)
    chopped = Chopper(
        self.text,
        [self.WortartTitlePieceBlock, self.WortartTitleTagBlock],
        filler_blocks=True,
        include_tags=True)
    for piece in chopped:
        if isinstance(piece, passthrough_types):
            child = FillerSection(text=piece.text, parent=self, correct=True)
        else:
            child = deWortartTitlePieceSection(text=piece.text, parent=self)
        self.children.append(child.parse())

    # Low-tech check for whether the word is a kind of name: look for one
    # of the hint words anywhere in the title text.  Improve later.
    word = self.get_property('word')
    if word:
        name_hints = ('Vorname', 'Nachname', 'Eigenname')
        if any(hint in self.text for hint in name_hints):
            word.is_name = True
    return self
def parse(self):
    """Parse the Wortart content into child sections.

    When the ``language`` property is ``Deutsch`` the text is chopped with
    the module-level ``block_classes`` (unmatched text becomes
    ``NewFillerBlock``), and every block builds its own section through
    ``make_section``.  For any other language the whole text is kept as a
    single ``FillerSection``.  Children are parsed before being appended.

    Returns:
        This section (``self``).
    """
    super(deWortartContentSection, self).parse()
    is_german = self.get_property("language") == "Deutsch"
    if not is_german:
        filler = FillerSection(text=self.text, parent=self)
        self.children.append(filler.parse())
        return self

    # Chop everything first, then build the sections.
    chopped = list(Chopper(self.text, block_classes,
                           filler_blocks=True,
                           filler_block_class=NewFillerBlock))
    for chunk in chopped:
        child = chunk.make_section(parent=self)
        self.children.append(child.parse())
    return self
def parse(self):
    """Parse the Wortart content into child sections.

    When the ``language`` property is ``Deutsch`` the text is chopped with
    the module-level ``block_classes`` (unmatched text becomes
    ``NewFillerBlock``), and every block builds its own section through
    ``make_section``.  For any other language the whole text is kept as a
    single ``FillerSection``.  Children are parsed before being appended.

    Returns:
        This section (``self``).
    """
    super(deWortartContentSection, self).parse()
    is_german = self.get_property('language') == 'Deutsch'
    if not is_german:
        filler = FillerSection(text=self.text, parent=self)
        self.children.append(filler.parse())
        return self

    # Chop everything first, then build the sections.
    chopped = list(Chopper(self.text, block_classes,
                           filler_blocks=True,
                           filler_block_class=NewFillerBlock))
    for chunk in chopped:
        child = chunk.make_section(parent=self)
        self.children.append(child.parse())
    return self
def parse(self):
    """Parse one Einsatz section: read its heading, then its alert sections.

    The heading must match ``== <title> ==``; the captured title is stored
    as the ``einsatz_slug`` property.  The body is chopped into level-3
    blocks.  Filler text is parsed as a ``FillerSection`` and the most
    recent one is kept in ``self.top_section``; every level-3 block is
    parsed as an ``AlertSection`` and registered in ``self.alert_sections``
    under its ``alert_slug`` property.  All parsed sections are appended to
    ``self.children``.

    Returns:
        This section (``self``).

    Raises:
        ParsingError: if the heading does not match, or if two alert
            sections share the same ``alert_slug``.
    """
    heading_match = re.match('== (?P<title>.+) ==', self.heading, re.UNICODE)
    if not heading_match:
        raise ParsingError('Cannot parse log page.')
    self.set_property('einsatz_slug', heading_match.group('title'))

    for block in Chopper(self.text, [Level3Block],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            # It's allowed to have some unknown text before the first
            # level3 block.
            child = FillerSection(text=block.text, parent=self).parse()
            self.top_section = child
        else:
            child = AlertSection(text=block.text, parent=self,
                                 heading=block.start_tag).parse()
            slug = child.get_property('alert_slug')
            if slug in self.alert_sections:
                # The colliding sections are AlertSections, so say so.
                raise ParsingError('Two AlertSections with same title.')
            self.alert_sections[slug] = child
        self.children.append(child)
    return self
def parse(self, shallow=False):
    """Split the page into filler and language sections.

    If ``title_OK()`` is false the page is flagged with ``self.ignore`` and
    nothing is chopped.  Otherwise the text is chopped into level-2 blocks:
    filler text becomes a ``FillerSection`` and each level-2 block becomes a
    ``deLanguageSection``.

    Args:
        shallow: when true, the child sections are created and appended
            without being parsed.

    Returns:
        This page (``self``).
    """
    super(dePage, self).parse()
    if not self.title_OK():
        # No words on this page
        self.ignore = True
        return self

    chopped = Chopper(self.text, [Level2Block],
                      filler_blocks=True, include_tags=True)
    for block in chopped:
        # It's allowed to have some unknown text before the first
        # level2 block; it is kept as filler.
        if isinstance(block, FillerBlock):
            section_class = FillerSection
        else:
            section_class = deLanguageSection
        child = section_class(text=block.text, parent=self)
        if not shallow:
            child = child.parse()
        self.children.append(child)
    return self
def parse(self):
    """Split the Wortart title into child sections and flag name-like words.

    The text is chopped with ``WortartTitlePieceBlock`` and
    ``WortartTitleTagBlock``.  Filler blocks and tag blocks become
    ``FillerSection`` children (created with ``correct=True``); every other
    piece becomes a ``deWortartTitlePieceSection``.  Each child is parsed
    before it is appended to ``self.children``.

    Returns:
        This section (``self``).
    """
    super(deWortartTitleSection, self).parse()
    passthrough_types = (FillerBlock, self.WortartTitleTagBlock)
    chopped = Chopper(
        self.text,
        [self.WortartTitlePieceBlock, self.WortartTitleTagBlock],
        filler_blocks=True,
        include_tags=True)
    for piece in chopped:
        if isinstance(piece, passthrough_types):
            child = FillerSection(text=piece.text, parent=self, correct=True)
        else:
            child = deWortartTitlePieceSection(text=piece.text, parent=self)
        self.children.append(child.parse())

    # Low-tech check for whether the word is a kind of name: look for one
    # of the hint words anywhere in the title text.  Improve later.
    word = self.get_property('word')
    if word:
        name_hints = ('Vorname', 'Nachname', 'Eigenname')
        if any(hint in self.text for hint in name_hints):
            word.is_name = True
    return self
def parse(self, shallow=False):
    """Parse a language section into its title and Wortart sections.

    The section text must chop into exactly one level-2 block.  Its start
    tag is parsed as a ``deLangTitleSection``; if that title is not
    readable, a parsed ``FillerSection`` (``correct=False``, attached to
    this section's parent) is returned in place of this section.  Otherwise
    the block's content is chopped into level-3 blocks: filler text becomes
    a ``FillerSection`` and each level-3 block a ``deWortartSection``.

    Args:
        shallow: when true, the level-3 children are appended unparsed.

    Returns:
        This section, or the replacement ``FillerSection`` when the title
        is unreadable.

    Raises:
        ParsingError: if the text does not contain exactly one level-2
            block.
    """
    super(deLanguageSection, self).parse()
    # The only thing that a deLanguageSection should contain
    # is the title and deWortartSections.
    level2_blocks = list(Chopper(self.text, [Level2Block]))
    if len(level2_blocks) != 1:
        raise ParsingError()
    only_block = level2_blocks[0]
    title_text = only_block.start_tag
    body_text = only_block.text

    title_section = deLangTitleSection(text=title_text, parent=self).parse()
    if not title_section.readable():
        fallback = FillerSection(text=self.text, parent=self.parent,
                                 correct=False)
        return fallback.parse()
    self.children.append(title_section)

    for block in Chopper(body_text, [Level3Block],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            child = FillerSection(text=block.text, parent=self, correct=True)
        else:
            child = deWortartSection(text=block.text, parent=self)
        if not shallow:
            child = child.parse()
        self.children.append(child)
    return self
def parse(self, shallow=False):
    """Parse a language section into its title and Wortart sections.

    The section text must chop into exactly one level-2 block.  Its start
    tag is parsed as a ``deLangTitleSection``; if that title is not
    readable, a parsed ``FillerSection`` (``correct=False``, attached to
    this section's parent) is returned in place of this section.  Otherwise
    the block's content is chopped into level-3 blocks: filler text becomes
    a ``FillerSection`` and each level-3 block a ``deWortartSection``.

    Args:
        shallow: when true, the level-3 children are appended unparsed.

    Returns:
        This section, or the replacement ``FillerSection`` when the title
        is unreadable.

    Raises:
        ParsingError: if the text does not contain exactly one level-2
            block.
    """
    super(deLanguageSection, self).parse()
    # The only thing that a deLanguageSection should contain
    # is the title and deWortartSections.
    level2_blocks = list(Chopper(self.text, [Level2Block]))
    if len(level2_blocks) != 1:
        raise ParsingError()
    only_block = level2_blocks[0]
    title_text = only_block.start_tag
    body_text = only_block.text

    title_section = deLangTitleSection(text=title_text, parent=self).parse()
    if not title_section.readable():
        fallback = FillerSection(text=self.text, parent=self.parent,
                                 correct=False)
        return fallback.parse()
    self.children.append(title_section)

    for block in Chopper(body_text, [Level3Block],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            child = FillerSection(text=block.text, parent=self, correct=True)
        else:
            child = deWortartSection(text=block.text, parent=self)
        if not shallow:
            child = child.parse()
        self.children.append(child)
    return self
def parse(self, shallow=False):
    """Split the top section into its level-3 parts.

    Filler text becomes a ``simpleTopTopSection``.  A level-3 block whose
    start tag is exactly ``===Pronunciation===`` becomes a
    ``PronunciationSection``; any other level-3 block is kept as a
    ``FillerSection`` created with ``correct=True``.

    Args:
        shallow: when true, the children are appended without being parsed.

    Returns:
        This section (``self``).
    """
    super(simpleTopSection, self).parse()
    chopped = Chopper(self.text, [Level3Block],
                      filler_blocks=True, include_tags=True)
    for block in chopped:
        if isinstance(block, FillerBlock):
            child = simpleTopTopSection(text=block.text, parent=self)
        elif block.start_tag == '===Pronunciation===':
            child = PronunciationSection(text=block.text, parent=self)
        else:
            child = FillerSection(text=block.text, parent=self, correct=True)
        if not shallow:
            child = child.parse()
        self.children.append(child)
    return self
def parse(self):
    """Parse a Substantiv-Tabelle and collect its per-case forms.

    Nothing further is parsed when the section has no ``word`` property or
    the word has no genders.  If the text does not match ``self.pattern``,
    an ``UnreadableAlert`` carrying the page title is appended to
    ``self.alerts`` and an unparsed ``FillerSection`` is returned instead
    of this section.

    NOTE(review): only the first part of this method is visible here; the
    code ends inside the handling of the ``plural=False`` forms, so the
    remaining steps and the final return value are not documented.
    """
    super(SubstantivTabelleSection, self).parse()
    word = self.get_property('word')
    if not word or not word.genders:
        return self

    # Use the real page title in alerts (a leftover debug override that
    # replaced it with a placeholder has been removed).
    page_title = self.get_property('page').title
    self.fixed_text = None
    match = self.pattern.match(self.text)
    if not match:
        message = u'%s: Substantiv-Tabelle in unreadable format.' % page_title
        alert = UnreadableAlert(message=message, title=page_title)
        # NOTE(review): the alert is attached to this section, not to the
        # FillerSection that is returned -- confirm it is still collected.
        self.alerts.append(alert)
        return FillerSection(text=self.text, parent=self.parent)

    data = match.groupdict()
    # One entry per case; the keys into the regex groups come from cps(),
    # called with False for s_data and True for p_data (presumably
    # singular and plural -- confirm against cps()).
    self.s_data = {}
    self.p_data = {}
    for case in CASES:
        self.s_data[case] = data[cps(case, False)]
        self.p_data[case] = data[cps(case, True)]

    all_assigned = True
    poss_fks = fks
    poss_genders = GENDERS
    smfc = SubstantivMultipleFormColl(self.s_data, plural=False)
    try:
        smfc.process()
        # This is to make plural match singular
        # But we don't check that every gender and flexionklasse present in
        # singular is present in plural.  They just can't be something
        # completely different.
        # I should really lump singular and plural together into one
        # MultipleFormColl
        if smfc.form_colls:
            poss_fks = set([])
            poss_genders = set([])
            for form_coll in smfc.form_colls:
                poss_fks = poss_fks | form_coll.poss_fks
                poss_genders = poss_genders | form_coll.poss_genders
    except SubstantivTabelleException as e:
        smfc = None
def parse(self):
    """Split the text into Einsatz sections indexed by their slug.

    ``self.einsatz_sections`` is reset to an empty dict, then the text is
    chopped into level-2 blocks.  Filler text is parsed as a
    ``FillerSection``; every level-2 block is parsed as an
    ``EinsatzSection`` and stored in ``self.einsatz_sections`` under its
    ``einsatz_slug`` property.  All parsed sections are appended to
    ``self.children``.

    Returns:
        This object (``self``).

    Raises:
        ParsingError: if two Einsatz sections have the same
            ``einsatz_slug``.
    """
    self.einsatz_sections = {}
    chopped = Chopper(self.text, [Level2Block],
                      filler_blocks=True, include_tags=True)
    for block in chopped:
        if isinstance(block, FillerBlock):
            # It's allowed to have some unknown text before the first
            # level2 block.
            child = FillerSection(text=block.text, parent=self).parse()
        else:
            child = EinsatzSection(text=block.text, parent=self,
                                   heading=block.start_tag).parse()
            slug = child.get_property('einsatz_slug')
            if slug in self.einsatz_sections:
                raise ParsingError('EinsatzSection title is not unique.')
            self.einsatz_sections[slug] = child
        self.children.append(child)
    return self
def parse(self):
    """Split the text into Einsatz sections indexed by their slug.

    ``self.einsatz_sections`` is reset to an empty dict, then the text is
    chopped into level-2 blocks.  Filler text is parsed as a
    ``FillerSection``; every level-2 block is parsed as an
    ``EinsatzSection`` and stored in ``self.einsatz_sections`` under its
    ``einsatz_slug`` property.  All parsed sections are appended to
    ``self.children``.

    Returns:
        This object (``self``).

    Raises:
        ParsingError: if two Einsatz sections have the same
            ``einsatz_slug``.
    """
    self.einsatz_sections = {}
    chopped = Chopper(self.text, [Level2Block],
                      filler_blocks=True, include_tags=True)
    for block in chopped:
        if isinstance(block, FillerBlock):
            # It's allowed to have some unknown text before the first
            # level2 block.
            child = FillerSection(text=block.text, parent=self).parse()
        else:
            child = EinsatzSection(text=block.text, parent=self,
                                   heading=block.start_tag).parse()
            slug = child.get_property('einsatz_slug')
            if slug in self.einsatz_sections:
                raise ParsingError('EinsatzSection title is not unique.')
            self.einsatz_sections[slug] = child
        self.children.append(child)
    return self
def parse(self):
    """Parse one Einsatz section: read its heading, then its alert sections.

    The heading must match ``== <title> ==``; the captured title is stored
    as the ``einsatz_slug`` property.  The body is chopped into level-3
    blocks.  Filler text is parsed as a ``FillerSection`` and the most
    recent one is kept in ``self.top_section``; every level-3 block is
    parsed as an ``AlertSection`` and registered in ``self.alert_sections``
    under its ``alert_slug`` property.  All parsed sections are appended to
    ``self.children``.

    Returns:
        This section (``self``).

    Raises:
        ParsingError: if the heading does not match, or if two alert
            sections share the same ``alert_slug``.
    """
    heading_match = re.match('== (?P<title>.+) ==', self.heading, re.UNICODE)
    if not heading_match:
        raise ParsingError('Cannot parse log page.')
    self.set_property('einsatz_slug', heading_match.group('title'))

    for block in Chopper(self.text, [Level3Block],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            # It's allowed to have some unknown text before the first
            # level3 block.
            child = FillerSection(text=block.text, parent=self).parse()
            self.top_section = child
        else:
            child = AlertSection(text=block.text, parent=self,
                                 heading=block.start_tag).parse()
            slug = child.get_property('alert_slug')
            if slug in self.alert_sections:
                # The colliding sections are AlertSections, so say so.
                raise ParsingError('Two AlertSections with same title.')
            self.alert_sections[slug] = child
        self.children.append(child)
    return self
def make_section(self, parent):
    """Build a ``FillerSection`` for this block's text.

    Args:
        parent: passed through as the new section's ``parent``.

    Returns:
        The new, unparsed ``FillerSection``.
    """
    filler = FillerSection(text=self.text, parent=parent)
    return filler
def parse(self, shallow=False):
    """Parse a word-type section and register the word it describes.

    The text must chop into exactly one level-2 block.  Its start tag is
    parsed as a ``simpleWordTypeTitleSection`` and the ``wordtype``
    property is then looked up in ``level2_mapping``:

    * unknown word type: an unparsed ``FillerSection`` carrying a
      ``Level2_not_Level3`` alert (when the type is in ``level3_mapping``)
      or an ``UnknownType`` alert is returned;
    * mapped to ``None``: an unparsed ``FillerSection`` is returned;
    * otherwise a word is obtained through ``get_and_update`` on the mapped
      class, stored as the ``word`` property and appended to the parent's
      ``words``.

    If the title section is not readable, a parsed ``FillerSection``
    (``correct=False``) is returned.  Otherwise the content is chopped into
    level-3 blocks: filler text becomes a ``simpleWordTypeHeaderSection``
    and level-3 blocks become ``FillerSection`` children.

    Args:
        shallow: when true, the level-3 children are appended unparsed.

    Returns:
        This section, or a replacement ``FillerSection`` as described above.

    Raises:
        ParsingError: if the text does not contain exactly one level-2
            block.
    """
    super(simpleWordTypeSection, self).parse()
    level2_blocks = list(Chopper(self.text, [Level2Block]))
    if len(level2_blocks) != 1:
        raise ParsingError()
    heading_text = level2_blocks[0].start_tag
    body_text = level2_blocks[0].text
    title_section = simpleWordTypeTitleSection(text=heading_text,
                                               parent=self).parse()
    wordtype = self.get_property('wordtype')

    # If we don't get a recognisable word type then we can't parse this
    # section.
    if wordtype not in level2_mapping:
        page_title = self.get_property('page').title
        filler = FillerSection(text=self.text, parent=self.parent)
        if wordtype in level3_mapping:
            # This should be a level 3 heading.
            message = "%s: The heading %s should be level 3 not level 2." % (
                page_title, wordtype)
            fixed_text = u"===%s===%s" % (wordtype, body_text)
            alert = Level2_not_Level3(filler, fixed_text, message, page_title)
        else:
            message = '%s: The word type "%s" is not known.' % (
                page_title, wordtype)
            alert = UnknownType(message=message, title=page_title)
        filler.alerts.append(alert)
        return filler

    # Get the Word Class associated with this type.
    word_class = level2_mapping[wordtype]
    # If there is no Word Class then this section can be ignored.
    if word_class is None:
        return FillerSection(text=self.text, parent=self.parent)

    # Otherwise create a new Word object.  ``order`` is the number of
    # words of this class already seen on the parent.
    seen_counts = self.parent.wordtypes
    if word_class in seen_counts:
        order = seen_counts[word_class]
        seen_counts[word_class] += 1
    else:
        seen_counts[word_class] = 1
        order = 0
    word_title = self.parent.title
    session = Session.object_session(self.parent)
    tags = self.get_property('tags')
    new_word = word_class.get_and_update(title=word_title, order=order,
                                         session=session, tags=tags)
    self.set_property('word', new_word)
    self.parent.words.append(new_word)

    if not title_section.readable():
        fallback = FillerSection(text=self.text, parent=self.parent,
                                 correct=False)
        return fallback.parse()
    self.children.append(title_section)

    for block in Chopper(body_text, [Level3Block],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            child = simpleWordTypeHeaderSection(text=block.text, parent=self)
        else:
            child = FillerSection(text=block.text, parent=self, correct=True)
        if not shallow:
            child = child.parse()
        self.children.append(child)
    return self