def markup_chamber(self):
    ''' annotate raw line 2 with the <chamber> tag (using self.re_chamber),
    append the result to self.xml, then continue with markup_pages(). '''
    self.currentline = 2

    chamber_annotator = XMLAnnotator(self.rawlines[self.currentline])
    chamber_annotator.register_tag(self.re_chamber, '<chamber>')
    self.xml.append(chamber_annotator.apply())

    # next header line
    self.markup_pages()
def markup_pages(self):
    ''' annotate raw line 3 with the <pages> tag (using self.re_pages and its
    'pages' group), then emit the <congress> and <session> elements from
    self.congress / self.session and continue with markup_title(). '''
    self.currentline = 3

    pages_annotator = XMLAnnotator(self.rawlines[self.currentline])
    pages_annotator.register_tag(self.re_pages, '<pages>', group='pages')
    self.xml.append(pages_annotator.apply())

    # these two values come from the parser's own attributes, not from the
    # line that was just annotated.
    congress_xml = '<congress>%s</congress>\n' % self.congress
    self.xml.append(congress_xml)
    session_xml = '<session>%s</session>\n' % self.session
    self.xml.append(session_xml)

    self.markup_title()
def markup_paragraph(self):
    ''' this is the standard paragraph parser. handles new speakers,
    standard recorder comments, long and short quotes, etc.

    starting at self.currentline, leading blank lines are skipped and every
    following non-blank line is annotated and appended to self.xml until
    get_line() stops returning text; the document is then closed with
    '</CRDoc>'. when self.has_speakers is false, the remaining lines are
    instead copied unannotated inside a single <recorder> element.

    if only blank lines remain, nothing is marked up and just the closing
    '</CRDoc>' is appended. '''

    # get to the first line. get_line() is expected to return None once
    # self.currentline runs past the input (the blank-line skip at the
    # bottom of the main loop relies on the same thing), so guard for that
    # here as well rather than calling .strip() on None.
    theline = self.get_line()
    while theline is not None and not theline.strip():
        self.currentline += 1
        theline = self.get_line()

    if theline is None:
        # ran out of input without finding any body text: there is nothing
        # to mark up, so just close the document.
        self.xml.append('</CRDoc>')
        return

    # remove <bullet> tags if they exist
    theline = self.check_bullet(theline)
    self.document_first_line = True

    if not self.has_speakers:
        # no speakers at all: copy the lines through untouched as a single
        # recorder block.
        self.xml.append('<recorder>')
        while theline:
            self.xml.append(theline)
            self.currentline += 1
            theline = self.get_line()
        self.xml.append('</recorder>\n')
        self.xml.append('</CRDoc>')
        return

    while theline:
        self.preprocess_state(theline)
        annotator = XMLAnnotator(theline)

        # --- opening tags -------------------------------------------------
        if self.intitle:
            annotator.register_tag(self.re_title, '<title>', group='title')
        # some things only appear on the first line of a paragraph
        elif self.inrollcall:
            # will only match on first line of the roll call
            annotator.register_tag_open(self.re_rollcall, '<recorder>')
        elif self.new_paragraph:
            annotator.register_tag_open(
                self.re_longquotestart,
                '<speaking quote="true" speaker="%s">' % self.current_speaker,
                group='start')
            if self.recorder:
                annotator.register_tag_open(self.re_startofline, '<recorder>')
            annotator.register_tag(
                self.re_newspeaker,
                '<speaker name="%s">' % self.current_speaker,
                group='name')
            if self.return_from_quote_interjection(theline):
                annotator.register_tag_open(
                    self.re_longquotebody,
                    '<speaking quote="true" speaker="%s">' % self.current_speaker,
                    group='start')
            if not self.recorder and not self.inlongquote:
                # check the current speaker-- if it's the recorder, then
                # even though this isn't a "known" recorder sentence,
                # there's no other speaker so we treat it like a recorder
                # comment.
                if self.current_speaker == 'recorder':
                    annotator.register_tag_open(self.re_speaking, '<recorder>',
                                                group='start')
                    self.recorder = True
                else:
                    annotator.register_tag_open(
                        self.re_speaking,
                        '<speaking name="%s">' % self.current_speaker,
                        group='start')

        # note: short-quote tags are currently not emitted. if they are ever
        # added, the endquote tag needs to be registered BEFORE the end
        # speaking tag, because the quote tag should appear before (be
        # nested within) the speaking tag. a nesting functionality should
        # really be implemented within the XMLAnnotator class.

        # --- closing tags -------------------------------------------------
        if self.paragraph_ends():
            if self.inrollcall:
                annotator.register_tag_close(self.re_endofline, '</recorder>')
                self.inrollcall = False
            elif self.recorder:
                annotator.register_tag_close(self.re_endofline, '</recorder>')
            elif self.inlongquote:
                if self.longquote_ends():
                    annotator.register_tag_close(self.re_endofline, '</speaking>')
            elif self.intitle:
                pass
            # this specific set of states usually means we're somewhere
            # unrecognized, and without these caveats we can end up with
            # stray </speaking> tags.
            elif (self.current_speaker == 'recorder'
                  and not (self.inlongquote or self.inrollcall
                           or self.recorder or self.inquote or self.intitle)):
                # single-argument call form: prints the same text whether
                # `print` is the python 2 statement or the function.
                print("UNRECOGNIZED STATE (but that's ok): %s" % theline)
            else:
                annotator.register_tag_close(self.re_endofline, '</speaking>')

        self.xml.append(annotator.apply())

        # do some post processing
        self.postprocess_state(theline)

        # get the next non-blank line and do it all again
        self.currentline += 1
        theline = self.get_line()
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

    # the loop above only exits once get_line() has nothing more to offer:
    # end of file, close the document.
    self.xml.append('</CRDoc>')
def markup_title(self):
    ''' identify and markup the document title. the title is some lines of
    text, usually but not always capitalized, usually but not always
    centered, and followed by at least one empty line. they sometimes have a
    line of dashes separating them from the body of the document. and
    sometimes they don't exist at all.

    the first title line gets the opening <document_title> tag and the last
    title line gets the closing one (on a one-line title both go on the same
    line). parsing then continues with markup_paragraph(). if the input
    runs out before any body text is reached, the document is closed with
    '</CRDoc>' right here, which is the same closing element
    markup_paragraph() appends when it finishes. '''

    MIN_TITLE_INDENT = 0

    # skip line 4; it contains a static reference to the GPO website.
    self.currentline = 5
    theline = self.get_line()
    # get_line() is expected to return None past the end of the input, so
    # every blank-line test below checks for None before calling .strip().
    while theline is not None and not theline.strip():
        self.currentline += 1
        theline = self.get_line()

    if theline is None:
        # only blank lines (or nothing) after the header: no title, no body.
        self.xml.append('</CRDoc>')
        return

    # if it's not a specially formatted title and it's not indented enough,
    # then it's probably missing a title altogether.
    # NOTE(review): with MIN_TITLE_INDENT = 0 this can only trigger if
    # spaces_indented() is able to return a negative number -- confirm.
    if (self.spaces_indented(theline) < MIN_TITLE_INDENT
            and not self.is_special_title(theline)):
        self.markup_paragraph()
        return

    # a regular old title
    opener = XMLAnnotator(theline)
    opener.register_tag_open(self.re_title_start, '<document_title>')
    self.currentline += 1
    theline = self.get_line()

    if theline is None or not theline.strip():
        # the title finished on the same line it started on
        opener.register_tag_close(self.re_title_end, '</document_title>')
        self.xml.append(opener.apply())
    else:
        # either way we need to apply the tags to the title start.
        self.xml.append(opener.apply())

        # now find the title end
        # NOTE(review): lines between the first and the last title line are
        # stepped over here without being appended to self.xml -- confirm
        # that dropping them is intended.
        while theline is not None and theline.strip():
            self.currentline += 1
            theline = self.get_line()

        # once we hit an empty line (or the end of the input), we know the
        # end of the *previous* line is the end of the title.
        closer = XMLAnnotator(self.get_line(-1))
        closer.register_tag_close(self.re_title_end, '</document_title>')
        self.xml.append(closer.apply())

    if theline is None:
        # the title ran right up to the end of the input: no body to parse.
        self.xml.append('</CRDoc>')
        return

    # at this point the current line is one PAST the end of the title,
    # which should generally be a blank line.
    self.markup_paragraph()
def markup_preamble(self):
    ''' annotate raw line 1 with the volume, number and date tags, append the
    result to self.xml, emit the bullet marker when self.is_bullet is set,
    then continue with markup_chamber(). '''
    self.currentline = 1
    preamble = XMLAnnotator(self.rawlines[self.currentline])

    # these two are registered without a named group
    preamble.register_tag(self.re_volume, '<volume>')
    preamble.register_tag(self.re_number, '<number>')

    # the date pieces are each registered with a named group
    dated_tags = (
        (self.re_weekday, '<weekday>', 'weekday'),
        (self.re_month, '<month>', 'month'),
        (self.re_day, '<day>', 'day'),
        (self.re_year, '<year>', 'year'),
    )
    for pattern, tag, group_name in dated_tags:
        preamble.register_tag(pattern, tag, group=group_name)

    self.xml.append(preamble.apply())

    if self.is_bullet:
        self.xml.append('<bullet>1</bullet>\n')

    self.markup_chamber()
def markup_paragraph(self):
    ''' this is the standard paragraph parser. handles new speakers,
    standard recorder comments, long and short quotes, etc.

    starting at self.currentline, leading blank lines are skipped and every
    following non-blank line is annotated and appended to self.xml until
    get_line() stops returning text; the document is then closed with
    '</CRDoc>'. when self.has_speakers is false, the remaining lines are
    instead copied unannotated inside a single <recorder> element.

    if only blank lines remain, nothing is marked up and just the closing
    '</CRDoc>' is appended. '''

    # get to the first line. get_line() is expected to return None once
    # self.currentline runs past the input (the blank-line skip at the
    # bottom of the main loop relies on the same thing), so guard for that
    # here as well rather than calling .strip() on None.
    theline = self.get_line()
    while theline is not None and not theline.strip():
        self.currentline += 1
        theline = self.get_line()

    if theline is None:
        # ran out of input without finding any body text: there is nothing
        # to mark up, so just close the document.
        self.xml.append('</CRDoc>')
        return

    # remove <bullet> tags if they exist
    theline = self.check_bullet(theline)
    self.document_first_line = True

    if not self.has_speakers:
        # no speakers at all: copy the lines through untouched as a single
        # recorder block.
        self.xml.append('<recorder>')
        while theline:
            self.xml.append(theline)
            self.currentline += 1
            theline = self.get_line()
        self.xml.append('</recorder>\n')
        self.xml.append('</CRDoc>')
        return

    while theline:
        self.preprocess_state(theline)
        annotator = XMLAnnotator(theline)

        # --- opening tags -------------------------------------------------
        if self.intitle:
            annotator.register_tag(self.re_title, '<title>', group='title')
        # some things only appear on the first line of a paragraph
        elif self.inrollcall:
            # will only match on first line of the roll call
            annotator.register_tag_open(self.re_rollcall, '<recorder>')
        elif self.new_paragraph:
            annotator.register_tag_open(
                self.re_longquotestart,
                '<speaking quote="true" speaker="%s">' % self.current_speaker,
                group='start')
            if self.recorder:
                annotator.register_tag_open(self.re_startofline, '<recorder>')
            annotator.register_tag(
                self.re_newspeaker,
                '<speaker name="%s">' % self.current_speaker,
                group='name')
            if self.return_from_quote_interjection(theline):
                annotator.register_tag_open(
                    self.re_longquotebody,
                    '<speaking quote="true" speaker="%s">' % self.current_speaker,
                    group='start')
            if not self.recorder and not self.inlongquote:
                # check the current speaker-- if it's the recorder, then
                # even though this isn't a "known" recorder sentence,
                # there's no other speaker so we treat it like a recorder
                # comment.
                if self.current_speaker == 'recorder':
                    annotator.register_tag_open(self.re_speaking, '<recorder>',
                                                group='start')
                    self.recorder = True
                else:
                    annotator.register_tag_open(
                        self.re_speaking,
                        '<speaking name="%s">' % self.current_speaker,
                        group='start')

        # note: short-quote tags are currently not emitted. if they are ever
        # added, the endquote tag needs to be registered BEFORE the end
        # speaking tag, because the quote tag should appear before (be
        # nested within) the speaking tag. a nesting functionality should
        # really be implemented within the XMLAnnotator class.

        # --- closing tags -------------------------------------------------
        if self.paragraph_ends():
            if self.inrollcall:
                annotator.register_tag_close(self.re_endofline, '</recorder>')
                self.inrollcall = False
            elif self.recorder:
                annotator.register_tag_close(self.re_endofline, '</recorder>')
            elif self.inlongquote:
                if self.longquote_ends():
                    annotator.register_tag_close(self.re_endofline, '</speaking>')
            elif self.intitle:
                pass
            # this specific set of states usually means we're somewhere
            # unrecognized, and without these caveats we can end up with
            # stray </speaking> tags.
            elif (self.current_speaker == 'recorder'
                  and not (self.inlongquote or self.inrollcall
                           or self.recorder or self.inquote or self.intitle)):
                # single-argument call form: prints the same text whether
                # `print` is the python 2 statement or the function.
                print("UNRECOGNIZED STATE (but that's ok): %s" % theline)
            else:
                annotator.register_tag_close(self.re_endofline, '</speaking>')

        self.xml.append(annotator.apply())

        # do some post processing
        self.postprocess_state(theline)

        # get the next non-blank line and do it all again
        self.currentline += 1
        theline = self.get_line()
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

    # the loop above only exits once get_line() has nothing more to offer:
    # end of file, close the document.
    self.xml.append('</CRDoc>')
def markup_title(self):
    ''' identify and markup the document title. the title is some lines of
    text, usually but not always capitalized, usually but not always
    centered, and followed by at least one empty line. they sometimes have a
    line of dashes separating them from the body of the document. and
    sometimes they don't exist at all.

    the first title line gets the opening <document_title> tag and the last
    title line gets the closing one (on a one-line title both go on the same
    line). parsing then continues with markup_paragraph(). if the input
    runs out before any body text is reached, the document is closed with
    '</CRDoc>' right here, which is the same closing element
    markup_paragraph() appends when it finishes. '''

    MIN_TITLE_INDENT = 0

    # skip line 4; it contains a static reference to the GPO website.
    self.currentline = 5
    theline = self.get_line()
    # get_line() is expected to return None past the end of the input, so
    # every blank-line test below checks for None before calling .strip().
    while theline is not None and not theline.strip():
        self.currentline += 1
        theline = self.get_line()

    if theline is None:
        # only blank lines (or nothing) after the header: no title, no body.
        self.xml.append('</CRDoc>')
        return

    # if it's not a specially formatted title and it's not indented enough,
    # then it's probably missing a title altogether.
    # NOTE(review): with MIN_TITLE_INDENT = 0 this can only trigger if
    # spaces_indented() is able to return a negative number -- confirm.
    if (self.spaces_indented(theline) < MIN_TITLE_INDENT
            and not self.is_special_title(theline)):
        self.markup_paragraph()
        return

    # a regular old title
    opener = XMLAnnotator(theline)
    opener.register_tag_open(self.re_title_start, '<document_title>')
    self.currentline += 1
    theline = self.get_line()

    if theline is None or not theline.strip():
        # the title finished on the same line it started on
        opener.register_tag_close(self.re_title_end, '</document_title>')
        self.xml.append(opener.apply())
    else:
        # either way we need to apply the tags to the title start.
        self.xml.append(opener.apply())

        # now find the title end
        # NOTE(review): lines between the first and the last title line are
        # stepped over here without being appended to self.xml -- confirm
        # that dropping them is intended.
        while theline is not None and theline.strip():
            self.currentline += 1
            theline = self.get_line()

        # once we hit an empty line (or the end of the input), we know the
        # end of the *previous* line is the end of the title.
        closer = XMLAnnotator(self.get_line(-1))
        closer.register_tag_close(self.re_title_end, '</document_title>')
        self.xml.append(closer.apply())

    if theline is None:
        # the title ran right up to the end of the input: no body to parse.
        self.xml.append('</CRDoc>')
        return

    # at this point the current line is one PAST the end of the title,
    # which should generally be a blank line.
    self.markup_paragraph()