def parse(self):
    """Read the ``== title ==`` heading and split the text into alert sections.

    The heading title is stored as the ``einsatz_slug`` property.  The text
    is then chopped into level-3 blocks; each becomes a parsed
    ``AlertSection`` registered in ``self.alert_sections`` under its
    ``alert_slug``.  Text outside any level-3 block becomes a parsed
    ``FillerSection`` kept in ``self.top_section``.  Every section is
    appended to ``self.children``.

    Returns:
        ``self``.

    Raises:
        ParsingError: if the heading does not match, or if two alert
            sections share the same ``alert_slug``.
    """
    heading_match = re.compile('== (?P<title>.+) ==',
                               re.UNICODE).match(self.heading)
    if not heading_match:
        raise ParsingError('Cannot parse log page.')
    self.set_property('einsatz_slug', heading_match.groupdict()['title'])

    # NOTE(review): self.alert_sections is not initialised in this method;
    # presumably the constructor creates it -- confirm.
    blocks = Chopper(self.text, [Level3Block, ],
                     filler_blocks=True, include_tags=True)
    for block in blocks:
        if isinstance(block, FillerBlock):
            # Unknown text is tolerated before the first level-3 block.
            section = FillerSection(text=block.text, parent=self).parse()
            self.top_section = section
        else:
            section = AlertSection(text=block.text, parent=self,
                                   heading=block.start_tag).parse()
            slug = section.get_property('alert_slug')
            if slug in self.alert_sections:
                # NOTE(review): the message names EinsatzSections although
                # the colliding objects are AlertSections.
                raise ParsingError('Two EinsatzSections with same title.')
            self.alert_sections[slug] = section
        self.children.append(section)
    return self
def parse(self, shallow=False):
    """Split a language section into its title and its level-3 children.

    The section text must chop into exactly one level-2 block.  The block's
    start tag is parsed as a ``deLangTitleSection``; if that title section
    is not readable, the whole text is handed back as a parsed
    ``FillerSection`` (``correct=False``) attached to this section's parent.
    Otherwise the block content is chopped into level-3 blocks: filler
    becomes a ``FillerSection`` (``correct=True``) and every other block a
    ``deWortartSection``.

    Args:
        shallow: when true, the child sections are created but not parsed.

    Returns:
        ``self``, or the replacement ``FillerSection`` when the title is
        not readable.

    Raises:
        ParsingError: if the text does not contain exactly one level-2 block.
    """
    super(deLanguageSection, self).parse()

    level2_blocks = list(Chopper(self.text, [Level2Block, ]))
    if len(level2_blocks) != 1:
        raise ParsingError()
    language_block = level2_blocks[0]
    heading = language_block.start_tag
    body = language_block.text

    title_section = deLangTitleSection(text=heading, parent=self).parse()
    if not title_section.readable():
        # An unreadable title demotes the entire section to filler.
        return FillerSection(text=self.text, parent=self.parent,
                             correct=False).parse()
    self.children.append(title_section)

    for block in Chopper(body, [Level3Block, ],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            child = FillerSection(text=block.text, parent=self, correct=True)
        else:
            child = deWortartSection(text=block.text, parent=self)
        self.children.append(child if shallow else child.parse())
    return self
def parse(self):
    """Extract the alert title from a ``=== title ===`` heading.

    The title (word characters and whitespace only) is stored as the
    ``alert_slug`` property.

    Returns:
        ``self``.

    Raises:
        ParsingError: if ``self.heading`` does not start with a level-3
            heading of the expected form.  The offending heading is
            included in the message.
    """
    # Raw string so that \w and \s reach the regex engine untouched.
    pattern = re.compile(r'=== (?P<title>[\w\s]+) ===', re.UNICODE)
    match = pattern.match(self.heading)
    if not match:
        # Report the heading through the exception rather than printing it
        # to stdout as a side effect of parsing.
        raise ParsingError(
            'Cannot parse log page. Unrecognised heading: %r'
            % (self.heading,))
    self.set_property('alert_slug', match.groupdict()['title'])
    return self
def parse(self):
    """Split a Wortart section into a title section and a content section.

    The section text must chop into exactly one level-3 block.  The block's
    start tag is parsed as a ``deWortartTitleSection`` and its text as a
    ``deWortartContentSection``; both are appended to ``self.children`` in
    that order.

    Returns:
        ``self``.

    Raises:
        ParsingError: if the text does not contain exactly one level-3 block.
    """
    super(deWortartSection, self).parse()

    level3_blocks = list(Chopper(self.text, [Level3Block, ]))
    if len(level3_blocks) != 1:
        # Fail with a diagnostic message instead of dropping into an
        # interactive debugger, which would hang unattended runs.
        raise ParsingError(
            'Expected exactly one level 3 block, found %d.'
            % len(level3_blocks))
    heading = level3_blocks[0].start_tag
    body = level3_blocks[0].text

    # The title section's readability is not checked here, so an
    # unreadable title does not turn this section into filler.
    title_section = deWortartTitleSection(text=heading, parent=self).parse()
    content_section = deWortartContentSection(text=body, parent=self).parse()
    self.children.append(title_section)
    self.children.append(content_section)
    return self
def parse(self):
    """Split the text into level-2 ``EinsatzSection`` children.

    ``self.einsatz_sections`` is reset to an empty dict and then filled
    with one parsed ``EinsatzSection`` per level-2 block, keyed by the
    section's ``einsatz_slug``.  Text outside any level-2 block becomes a
    parsed ``FillerSection``.  Every section is appended to
    ``self.children``.

    Returns:
        ``self``.

    Raises:
        ParsingError: if two sections share the same ``einsatz_slug``.
    """
    self.einsatz_sections = {}
    blocks = Chopper(self.text, [Level2Block, ],
                     filler_blocks=True, include_tags=True)
    for block in blocks:
        if isinstance(block, FillerBlock):
            # Unknown text is tolerated before the first level-2 block.
            section = FillerSection(text=block.text, parent=self).parse()
        else:
            section = EinsatzSection(text=block.text, parent=self,
                                     heading=block.start_tag).parse()
            slug = section.get_property('einsatz_slug')
            if slug in self.einsatz_sections:
                raise ParsingError('EinsatzSection title is not unique.')
            self.einsatz_sections[slug] = section
        self.children.append(section)
    return self
def parse(self, shallow=False):
    """Parse a level-2 word-type section and register the word it defines.

    The section text must chop into exactly one level-2 block, whose start
    tag is parsed as a ``simpleWordTypeTitleSection``.  The ``wordtype``
    property is then looked up in ``level2_mapping``:

    * not present: a ``FillerSection`` carrying an alert is returned
      (``Level2_not_Level3`` when the type appears in ``level3_mapping``,
      ``UnknownType`` otherwise);
    * mapped to ``None``: a plain ``FillerSection`` is returned;
    * otherwise: a word is obtained via ``word_class.get_and_update``,
      stored as the ``word`` property and appended to ``self.parent.words``.

    Args:
        shallow: when true, the child sections are created but not parsed.

    Returns:
        ``self``, or a ``FillerSection`` attached to this section's parent
        when the section cannot be handled.

    Raises:
        ParsingError: if the text does not contain exactly one level-2 block.
    """
    super(simpleWordTypeSection, self).parse()

    level2_blocks = list(Chopper(self.text, [Level2Block, ]))
    if len(level2_blocks) != 1:
        raise ParsingError()
    heading = level2_blocks[0].start_tag
    body = level2_blocks[0].text

    title_section = simpleWordTypeTitleSection(text=heading,
                                               parent=self).parse()
    # Presumably set as a side effect of parsing the title section above
    # -- confirm.
    wordtype = self.get_property('wordtype')

    if wordtype not in level2_mapping:
        # Without a recognisable word type the section cannot be parsed.
        page_title = self.get_property('page').title
        filler = FillerSection(text=self.text, parent=self.parent)
        if wordtype in level3_mapping:
            # The heading exists, but belongs one level deeper.
            message = "%s: The heading %s should be level 3 not level 2." % (
                page_title, wordtype)
            fixed_text = u"===%s===%s" % (wordtype, body)
            alert = Level2_not_Level3(filler, fixed_text, message, page_title)
        else:
            message = '%s: The word type "%s" is not known.' % (
                page_title, wordtype)
            alert = UnknownType(message=message, title=page_title)
        filler.alerts.append(alert)
        return filler

    word_class = level2_mapping[wordtype]
    if word_class is None:
        # A known heading with no Word class attached: nothing to parse.
        return FillerSection(text=self.text, parent=self.parent)

    # The order is the number of words of this class already seen on the
    # parent; the counter is then advanced for the next one.
    seen = self.parent.wordtypes
    order = seen[word_class] if word_class in seen else 0
    seen[word_class] = order + 1

    new_word = word_class.get_and_update(
        title=self.parent.title,
        order=order,
        session=Session.object_session(self.parent),
        tags=self.get_property('tags'))
    self.set_property('word', new_word)
    self.parent.words.append(new_word)

    if not title_section.readable():
        return FillerSection(text=self.text, parent=self.parent,
                             correct=False).parse()
    self.children.append(title_section)

    for block in Chopper(body, [Level3Block, ],
                         filler_blocks=True, include_tags=True):
        if isinstance(block, FillerBlock):
            # Text outside any level-3 block is the word-type header.
            child = simpleWordTypeHeaderSection(text=block.text, parent=self)
        else:
            # Level-3 blocks themselves are kept as filler.
            child = FillerSection(text=block.text, parent=self, correct=True)
        self.children.append(child if shallow else child.parse())
    return self