Python XMLAnnotatorの例、congressionalrecord.lib.xml_annotator.XMLAnnotator Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

 def markup_chamber(self):
     ''' mark up the chamber line (index 2 of self.rawlines), append the
     annotated text to self.xml, then hand off to markup_pages(). '''
     self.currentline = 2
     tagger = XMLAnnotator(self.rawlines[self.currentline])
     tagger.register_tag(self.re_chamber, '<chamber>')
     self.xml.append(tagger.apply())
     # the header is parsed as a chain: the page-range line comes next.
     self.markup_pages()

コード例 #2

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

 def markup_chamber(self):
     ''' wrap the match of self.re_chamber on raw line 2 in a <chamber>
     tag, store the result in self.xml and continue with the pages line. '''
     self.currentline = 2
     chamber_line = self.rawlines[self.currentline]

     chamber_tagger = XMLAnnotator(chamber_line)
     chamber_tagger.register_tag(self.re_chamber, '<chamber>')
     marked_up = chamber_tagger.apply()

     self.xml.append(marked_up)
     self.markup_pages()

コード例 #3

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

 def markup_pages(self):
     ''' mark up the pages line (index 3 of self.rawlines), then emit the
     <congress> and <session> elements from the values already stored on
     the parser, and continue with markup_title(). '''
     self.currentline = 3
     tagger = XMLAnnotator(self.rawlines[self.currentline])
     tagger.register_tag(self.re_pages, '<pages>', group='pages')
     self.xml.append(tagger.apply())

     # congress and session are not read from the text here; they come
     # straight from attributes on self.
     congress_element = '<congress>%s</congress>\n' % self.congress
     self.xml.append(congress_element)
     session_element = '<session>%s</session>\n' % self.session
     self.xml.append(session_element)

     self.markup_title()

コード例 #4

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

 def markup_pages(self):
     ''' annotate raw line 3 with a <pages> tag (using the 'pages' group of
     self.re_pages), append <congress> and <session> elements built from
     attributes on self, then move on to the title. '''
     self.currentline = 3
     pages_line = self.rawlines[self.currentline]

     pages_tagger = XMLAnnotator(pages_line)
     pages_tagger.register_tag(self.re_pages, '<pages>', group='pages')
     marked_up = pages_tagger.apply()
     self.xml.append(marked_up)

     # these two elements are generated, not matched in the source text.
     self.xml.append('<congress>%s</congress>\n' % self.congress)
     self.xml.append('<session>%s</session>\n' % self.session)

     self.markup_title()

コード例 #5

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

    def markup_paragraph(self):
        ''' this is the standard paragraph parser. handles new speakers,
        standard recorder comments, long and short quotes, etc.

        starting at self.currentline, skips leading blank lines and then
        annotates every following non-blank line, appending the results to
        self.xml. the closing '</CRDoc>' is appended once self.get_line()
        stops returning a line. returns None. '''

        # get to the first line. get_line() can return None once we run past
        # the end of the input (the skip loop at the bottom of this method
        # already guards for that), so stop skipping rather than calling
        # .strip() on None.
        theline = self.get_line()
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

        if theline is None:
            # only blank lines were left: there is no body to mark up, so
            # just close the document.
            self.xml.append('</CRDoc>')
            return

        # remove <bullet> tags if they exist
        theline = self.check_bullet(theline)
        self.document_first_line = True

        if not self.has_speakers:
            # no speakers: every remaining line is emitted untouched inside
            # a single <recorder> element.
            self.xml.append('<recorder>')
            while theline:
                self.xml.append(theline)
                self.currentline += 1
                theline = self.get_line()
            self.xml.append('</recorder>\n')
            self.xml.append('</CRDoc>')
            return

        while theline:

            self.preprocess_state(theline)
            annotator = XMLAnnotator(theline)

            if self.intitle:
                annotator.register_tag(self.re_title, '<title>', group='title')
            # some things only appear on the first line of a paragraph
            elif self.inrollcall:
                # will only match on first line of the roll call
                annotator.register_tag_open(self.re_rollcall, '<recorder>')
            elif self.new_paragraph:
                annotator.register_tag_open(self.re_longquotestart, '<speaking quote="true" speaker="%s">' % self.current_speaker, group='start')
                if self.recorder:
                    annotator.register_tag_open(self.re_startofline, '<recorder>')
                annotator.register_tag(self.re_newspeaker, '<speaker name="%s">' % self.current_speaker, group='name')
                if self.return_from_quote_interjection(theline):
                    annotator.register_tag_open(self.re_longquotebody, '<speaking quote="true" speaker="%s">' % self.current_speaker, group='start')
                if not self.recorder and not self.inlongquote:
                    # check the current speaker-- if it's the recorder, then
                    # even though this isn't a "known" recorder sentence,
                    # there's no other speaker so we treat it like a recorder
                    # comment.
                    if self.current_speaker == 'recorder':
                        annotator.register_tag_open(self.re_speaking, '<recorder>', group='start')
                        self.recorder = True
                    else:
                        annotator.register_tag_open(self.re_speaking, '<speaking name="%s">' % self.current_speaker, group='start')

            # note: short-quote tagging is currently disabled. if it is ever
            # re-enabled, the endquote tag needs to be registered BEFORE the
            # end speaking tag below, because the quote tag should be nested
            # within the speaking tag and XMLAnnotator has no nesting support.

            if self.paragraph_ends():
                if self.inrollcall:
                    annotator.register_tag_close(self.re_endofline, '</recorder>')
                    self.inrollcall = False
                elif self.recorder:
                    annotator.register_tag_close(self.re_endofline, '</recorder>')
                elif self.inlongquote:
                    if self.longquote_ends():
                        annotator.register_tag_close(self.re_endofline, '</speaking>')
                elif self.intitle:
                    pass
                #  this specific set of states usually means we're somewhere
                #  unrecognized, and without these caveats we can end up with
                #  stray </speaking> tags.
                elif (self.current_speaker == 'recorder' and not (self.inlongquote or
                                                                  self.inrollcall or
                                                                  self.recorder or
                                                                  self.inquote or
                                                                  self.intitle)):
                    # parenthesised single-argument form prints the same text
                    # on python 2 and is valid syntax on python 3.
                    print("UNRECOGNIZED STATE (but that's ok): %s" % theline)
                else:
                    annotator.register_tag_close(self.re_endofline, '</speaking>')

            xml_line = annotator.apply()
            self.xml.append(xml_line)

            # do some post processing
            self.postprocess_state(theline)

            # get the next line and do it all again
            self.currentline += 1
            theline = self.get_line()
            while theline is not None and not theline.strip():
                self.currentline += 1
                theline = self.get_line()
            if not theline:
                # end of file
                self.xml.append('</CRDoc>')

コード例 #6

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

    def markup_title(self):
        ''' identify and markup the document title. the title is some lines of
        text, usually but not always capitalized, usually but not always
        centered, and followed by at least one empty line. they sometimes have a
        line of dashes separating them from the body of the document. and
        sometimes they don't exist at all.

        starts looking at raw line 5, wraps the title in <document_title>
        tags appended to self.xml, and always finishes by calling
        markup_paragraph(). returns None. '''

        # NOTE(review): with a threshold of 0 the "missing title" branch
        # below can only fire if spaces_indented() returns a negative number
        # -- confirm whether this value is intentional.
        MIN_TITLE_INDENT = 0

        # skip line 4; it contains a static reference to the GPO website.
        self.currentline = 5
        theline = self.get_line()
        # get_line() can return None past the end of the input (see the
        # guard in markup_paragraph), so never call .strip() on it blindly.
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

        if theline is None:
            # ran out of input before finding a title or any body text; the
            # paragraph parser owns end-of-document handling.
            self.markup_paragraph()
            return

        # if it's not a specially formatted title and it's not indented enough,
        # then it's probably missing a title altogether
        if self.spaces_indented(theline) < MIN_TITLE_INDENT and not self.is_special_title(theline):
            self.markup_paragraph()
            return

        # a regular old title
        annotator = XMLAnnotator(theline)
        annotator.register_tag_open(self.re_title_start, '<document_title>')
        self.currentline += 1
        theline = self.get_line()

        if theline is None or not theline.strip():
            # the title finished on the same line it started on (it is
            # followed by a blank line or by the end of the input).
            annotator.register_tag_close(self.re_title_end, '</document_title>')
            self.xml.append(annotator.apply())

        else:
            # either way we need to apply the tags to the title start.
            self.xml.append(annotator.apply())
            # now find the title end
            # NOTE(review): title lines strictly between the first and the
            # last one are skipped here without being appended to self.xml
            # -- confirm that dropping them is intended.
            while theline is not None and theline.strip():
                self.currentline += 1
                theline = self.get_line()
            # once we hit an empty line (or the end of the input), we know
            # the end of the *previous* line is the end of the title.
            theline = self.get_line(-1)
            annotator = XMLAnnotator(theline)
            annotator.register_tag_close(self.re_title_end, '</document_title>')
            self.xml.append(annotator.apply())

        # note that as we exit this function, the current line is one PAST
        # the end of the title, which should generally be a blank line.
        self.markup_paragraph()

コード例 #7

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

 def markup_preamble(self):
     ''' mark up the preamble line (index 1 of self.rawlines) with volume,
     number and date tags, append it to self.xml, optionally add a
     <bullet> element, then continue with markup_chamber(). '''
     self.currentline = 1
     tagger = XMLAnnotator(self.rawlines[self.currentline])

     # these two are registered without a named group.
     tagger.register_tag(self.re_volume, '<volume>')
     tagger.register_tag(self.re_number, '<number>')

     # date parts: each pattern is registered with the group named after
     # its tag.
     date_parts = (
         (self.re_weekday, '<weekday>', 'weekday'),
         (self.re_month, '<month>', 'month'),
         (self.re_day, '<day>', 'day'),
         (self.re_year, '<year>', 'year'),
     )
     for pattern, tag, group_name in date_parts:
         tagger.register_tag(pattern, tag, group=group_name)

     self.xml.append(tagger.apply())

     if self.is_bullet:
         self.xml.append('<bullet>1</bullet>\n')

     self.markup_chamber()

コード例 #8

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

    def markup_paragraph(self):
        ''' this is the standard paragraph parser. handles new speakers,
        standard recorder comments, long and short quotes, etc.

        starting at self.currentline, skips leading blank lines and then
        annotates every following non-blank line, appending the results to
        self.xml. the closing '</CRDoc>' is appended once self.get_line()
        stops returning a line. returns None. '''

        # get to the first line. get_line() can return None once we run past
        # the end of the input (the skip loop at the bottom of this method
        # already guards for that), so stop skipping rather than calling
        # .strip() on None.
        theline = self.get_line()
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

        if theline is None:
            # only blank lines were left: there is no body to mark up, so
            # just close the document.
            self.xml.append('</CRDoc>')
            return

        # remove <bullet> tags if they exist
        theline = self.check_bullet(theline)
        self.document_first_line = True

        if not self.has_speakers:
            # no speakers: every remaining line is emitted untouched inside
            # a single <recorder> element.
            self.xml.append('<recorder>')
            while theline:
                self.xml.append(theline)
                self.currentline += 1
                theline = self.get_line()
            self.xml.append('</recorder>\n')
            self.xml.append('</CRDoc>')
            return

        while theline:

            self.preprocess_state(theline)
            annotator = XMLAnnotator(theline)

            if self.intitle:
                annotator.register_tag(self.re_title, '<title>', group='title')
            # some things only appear on the first line of a paragraph
            elif self.inrollcall:
                # will only match on first line of the roll call
                annotator.register_tag_open(self.re_rollcall, '<recorder>')
            elif self.new_paragraph:
                annotator.register_tag_open(
                    self.re_longquotestart,
                    '<speaking quote="true" speaker="%s">' %
                    self.current_speaker,
                    group='start')
                if self.recorder:
                    annotator.register_tag_open(self.re_startofline,
                                                '<recorder>')
                annotator.register_tag(self.re_newspeaker,
                                       '<speaker name="%s">' %
                                       self.current_speaker,
                                       group='name')
                if self.return_from_quote_interjection(theline):
                    annotator.register_tag_open(
                        self.re_longquotebody,
                        '<speaking quote="true" speaker="%s">' %
                        self.current_speaker,
                        group='start')
                if not self.recorder and not self.inlongquote:
                    # check the current speaker-- if it's the recorder, then
                    # even though this isn't a "known" recorder sentence,
                    # there's no other speaker so we treat it like a recorder
                    # comment.
                    if self.current_speaker == 'recorder':
                        annotator.register_tag_open(self.re_speaking,
                                                    '<recorder>',
                                                    group='start')
                        self.recorder = True
                    else:
                        annotator.register_tag_open(self.re_speaking,
                                                    '<speaking name="%s">' %
                                                    self.current_speaker,
                                                    group='start')

            # note: short-quote tagging is currently disabled. if it is ever
            # re-enabled, the endquote tag needs to be registered BEFORE the
            # end speaking tag below, because the quote tag should be nested
            # within the speaking tag and XMLAnnotator has no nesting support.

            if self.paragraph_ends():
                if self.inrollcall:
                    annotator.register_tag_close(self.re_endofline,
                                                 '</recorder>')
                    self.inrollcall = False
                elif self.recorder:
                    annotator.register_tag_close(self.re_endofline,
                                                 '</recorder>')
                elif self.inlongquote:
                    if self.longquote_ends():
                        annotator.register_tag_close(self.re_endofline,
                                                     '</speaking>')
                elif self.intitle:
                    pass
                #  this specific set of states usually means we're somewhere
                #  unrecognized, and without these caveats we can end up with
                #  stray </speaking> tags.
                elif (self.current_speaker == 'recorder'
                      and not (self.inlongquote or self.inrollcall or
                               self.recorder or self.inquote or self.intitle)):
                    # parenthesised single-argument form prints the same text
                    # on python 2 and is valid syntax on python 3.
                    print("UNRECOGNIZED STATE (but that's ok): %s" % theline)
                else:
                    annotator.register_tag_close(self.re_endofline,
                                                 '</speaking>')

            xml_line = annotator.apply()
            self.xml.append(xml_line)

            # do some post processing
            self.postprocess_state(theline)

            # get the next line and do it all again
            self.currentline += 1
            theline = self.get_line()
            while theline is not None and not theline.strip():
                self.currentline += 1
                theline = self.get_line()
            if not theline:
                # end of file
                self.xml.append('</CRDoc>')

コード例 #9

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

    def markup_title(self):
        ''' identify and markup the document title. the title is some lines of
        text, usually but not always capitalized, usually but not always
        centered, and followed by at least one empty line. they sometimes have a
        line of dashes separating them from the body of the document. and
        sometimes they don't exist at all.

        starts looking at raw line 5, wraps the title in <document_title>
        tags appended to self.xml, and always finishes by calling
        markup_paragraph(). returns None. '''

        # NOTE(review): with a threshold of 0 the "missing title" branch
        # below can only fire if spaces_indented() returns a negative number
        # -- confirm whether this value is intentional.
        MIN_TITLE_INDENT = 0

        # skip line 4; it contains a static reference to the GPO website.
        self.currentline = 5
        theline = self.get_line()
        # get_line() can return None past the end of the input (see the
        # guard in markup_paragraph), so never call .strip() on it blindly.
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

        if theline is None:
            # ran out of input before finding a title or any body text; the
            # paragraph parser owns end-of-document handling.
            self.markup_paragraph()
            return

        # if it's not a specially formatted title and it's not indented enough,
        # then it's probably missing a title altogether
        if self.spaces_indented(
                theline) < MIN_TITLE_INDENT and not self.is_special_title(
                    theline):
            self.markup_paragraph()
            return

        # a regular old title
        annotator = XMLAnnotator(theline)
        annotator.register_tag_open(self.re_title_start,
                                    '<document_title>')
        self.currentline += 1
        theline = self.get_line()

        if theline is None or not theline.strip():
            # the title finished on the same line it started on (it is
            # followed by a blank line or by the end of the input).
            annotator.register_tag_close(self.re_title_end,
                                         '</document_title>')
            self.xml.append(annotator.apply())

        else:
            # either way we need to apply the tags to the title start.
            self.xml.append(annotator.apply())
            # now find the title end
            # NOTE(review): title lines strictly between the first and the
            # last one are skipped here without being appended to self.xml
            # -- confirm that dropping them is intended.
            while theline is not None and theline.strip():
                self.currentline += 1
                theline = self.get_line()
            # once we hit an empty line (or the end of the input), we know
            # the end of the *previous* line is the end of the title.
            theline = self.get_line(-1)
            annotator = XMLAnnotator(theline)
            annotator.register_tag_close(self.re_title_end,
                                         '</document_title>')
            self.xml.append(annotator.apply())

        # note that as we exit this function, the current line is one PAST
        # the end of the title, which should generally be a blank line.
        self.markup_paragraph()

コード例 #10

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

 def markup_preamble(self):
     ''' annotate raw line 1 with <volume>, <number> and the four date
     tags, store the result in self.xml, add a <bullet> element when
     self.is_bullet is set, and move on to the chamber line. '''
     self.currentline = 1
     preamble_line = self.rawlines[self.currentline]
     preamble_tagger = XMLAnnotator(preamble_line)

     # volume and number use the default (unnamed) group.
     preamble_tagger.register_tag(self.re_volume, '<volume>')
     preamble_tagger.register_tag(self.re_number, '<number>')

     # each date component is captured by the group matching its tag name.
     grouped_tags = (
         (self.re_weekday, '<weekday>', 'weekday'),
         (self.re_month, '<month>', 'month'),
         (self.re_day, '<day>', 'day'),
         (self.re_year, '<year>', 'year'),
     )
     for regex, tag, group_name in grouped_tags:
         preamble_tagger.register_tag(regex, tag, group=group_name)

     marked_up = preamble_tagger.apply()
     self.xml.append(marked_up)

     if self.is_bullet:
         self.xml.append('<bullet>1</bullet>\n')

     self.markup_chamber()