Python XMLAnnotator.register_tagの例、congressionalrecord.lib.xml_annotator.XMLAnnotator.register_tag Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

 def markup_chamber(self):
     ''' annotate the raw line at index 2 with a <chamber> tag.

     moves self.currentline to 2, registers self.re_chamber on an
     XMLAnnotator built from that raw line, appends the annotated result
     to self.xml and then hands off to self.markup_pages(). '''
     self.currentline = 2
     tagger = XMLAnnotator(self.rawlines[self.currentline])
     tagger.register_tag(self.re_chamber, '<chamber>')
     self.xml.append(tagger.apply())
     # the next header line is handled by the pages step
     self.markup_pages()

コード例 #2

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

 def markup_chamber(self):
     ''' mark up the raw line at index 2 using the chamber pattern.

     sets self.currentline to 2, wraps whatever self.re_chamber matches
     in a <chamber> tag via XMLAnnotator, stores the annotated line in
     self.xml, and continues with self.markup_pages(). '''
     self.currentline = 2
     chamber_line = self.rawlines[self.currentline]
     line_annotator = XMLAnnotator(chamber_line)
     line_annotator.register_tag(self.re_chamber, '<chamber>')
     self.xml.append(line_annotator.apply())
     self.markup_pages()

コード例 #3

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

 def markup_pages(self):
     ''' annotate the raw line at index 3 with a <pages> tag and emit the
     congress and session elements.

     moves self.currentline to 3, registers self.re_pages (group 'pages')
     on an XMLAnnotator built from that raw line and appends the result
     to self.xml, followed by <congress> and <session> elements filled
     from self.congress and self.session. processing then continues with
     self.markup_title(). '''
     self.currentline = 3
     tagger = XMLAnnotator(self.rawlines[self.currentline])
     tagger.register_tag(self.re_pages, '<pages>', group='pages')
     self.xml.append(tagger.apply())
     # these two elements come from parser attributes, not from the line
     self.xml.append('<congress>%s</congress>\n' % self.congress)
     self.xml.append('<session>%s</session>\n' % self.session)
     self.markup_title()

コード例 #4

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

 def markup_pages(self):
     ''' mark up the raw line at index 3 using the pages pattern, then
     add the congress and session elements.

     sets self.currentline to 3, tags the 'pages' group of self.re_pages
     as <pages> via XMLAnnotator, and appends to self.xml the annotated
     line plus <congress>/<session> elements built from self.congress and
     self.session. finishes by calling self.markup_title(). '''
     self.currentline = 3
     pages_line = self.rawlines[self.currentline]
     line_annotator = XMLAnnotator(pages_line)
     line_annotator.register_tag(self.re_pages, '<pages>', group='pages')
     self.xml.append(line_annotator.apply())
     self.xml.append('<congress>%s</congress>\n' % self.congress)
     self.xml.append('<session>%s</session>\n' % self.session)
     self.markup_title()

コード例 #5

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

 def markup_preamble(self):
     ''' annotate the raw line at index 1 with volume, number and date
     tags.

     moves self.currentline to 1 and registers, in order, the volume,
     number, weekday, month, day and year patterns on an XMLAnnotator
     built from that raw line (the four date parts each use a named
     group). the annotated line is appended to self.xml, followed by a
     <bullet> element when self.is_bullet is true, and processing
     continues with self.markup_chamber(). '''
     self.currentline = 1
     tagger = XMLAnnotator(self.rawlines[self.currentline])

     # registered without a group
     tagger.register_tag(self.re_volume, '<volume>')
     tagger.register_tag(self.re_number, '<number>')

     # registered with a named group, in the same order as before
     grouped_tags = (
         (self.re_weekday, '<weekday>', 'weekday'),
         (self.re_month, '<month>', 'month'),
         (self.re_day, '<day>', 'day'),
         (self.re_year, '<year>', 'year'),
     )
     for pattern, tag, group_name in grouped_tags:
         tagger.register_tag(pattern, tag, group=group_name)

     self.xml.append(tagger.apply())
     if self.is_bullet:
         self.xml.append('<bullet>1</bullet>\n')
     self.markup_chamber()

コード例 #6

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

 def markup_preamble(self):
     ''' mark up the raw line at index 1 with volume, number and date
     tags.

     sets self.currentline to 1, registers the volume and number patterns
     and then the weekday, month, day and year patterns (each with its
     named group) on an XMLAnnotator for that raw line. appends the
     annotated line to self.xml, adds a <bullet> element if
     self.is_bullet is true, and calls self.markup_chamber(). '''
     self.currentline = 1
     preamble_line = self.rawlines[self.currentline]
     line_annotator = XMLAnnotator(preamble_line)

     line_annotator.register_tag(self.re_volume, '<volume>')
     line_annotator.register_tag(self.re_number, '<number>')

     # date components: (pattern, tag, named group), order preserved
     date_parts = [
         (self.re_weekday, '<weekday>', 'weekday'),
         (self.re_month, '<month>', 'month'),
         (self.re_day, '<day>', 'day'),
         (self.re_year, '<year>', 'year'),
     ]
     for regex, open_tag, group_name in date_parts:
         line_annotator.register_tag(regex, open_tag, group=group_name)

     self.xml.append(line_annotator.apply())
     if self.is_bullet:
         self.xml.append('<bullet>1</bullet>\n')
     self.markup_chamber()

コード例 #7

0

ファイルを表示

ファイル: cr_parser.py プロジェクト: PoliticalFraming/congressional-record

    def markup_paragraph(self):
        ''' this is the standard paragraph parser. handles new speakers,
        standard recorder comments, long and short quotes, etc.

        starting at self.currentline, skips leading blank lines and then
        processes lines until get_line() returns a falsy value, appending
        marked-up text to self.xml and finishing with the closing </CRDoc>
        tag. if nothing but blank lines remain, only the closing tag is
        appended. when self.has_speakers is false the remaining lines are
        copied verbatim inside a single <recorder> element. '''

        # get to the first line
        theline = self.get_line()

        # get_line() can return None (the loop at the bottom of this method
        # guards for it), so guard here too instead of calling None.strip().
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

        if theline is None:
            # only blank lines were left: close the document like every
            # other exit path of this method does.
            self.xml.append('</CRDoc>')
            return

        # remove <bullet> tags if they exist
        theline = self.check_bullet(theline)
        self.document_first_line = True

        if not self.has_speakers:
            # no speakers: emit everything as one recorder block
            self.xml.append('<recorder>')
            while theline:
                self.xml.append(theline)
                self.currentline += 1
                theline = self.get_line()
            self.xml.append('</recorder>\n')
            self.xml.append('</CRDoc>')
            return

        while theline:

            self.preprocess_state(theline)
            annotator = XMLAnnotator(theline)

            if self.intitle:
                annotator.register_tag(self.re_title, '<title>', group='title')
            # some things only appear on the first line of a paragraph
            elif self.inrollcall:
                # will only match on first line of the roll call
                annotator.register_tag_open(self.re_rollcall, '<recorder>')
            elif self.new_paragraph:
                annotator.register_tag_open(self.re_longquotestart, '<speaking quote="true" speaker="%s">' % self.current_speaker, group='start')
                if self.recorder:
                    annotator.register_tag_open(self.re_startofline, '<recorder>')
                    #annotator.register_tag_open(self.re_recorderstart, '<recorder>', 'start')
                    #annotator.register_tag_open(self.re_recorder_fuzzy, '<recorder>', 'start')
                annotator.register_tag(self.re_newspeaker, '<speaker name="%s">' % self.current_speaker, group='name')
                if self.return_from_quote_interjection(theline):
                    annotator.register_tag_open(self.re_longquotebody, '<speaking quote="true" speaker="%s">' % self.current_speaker, group='start')
                if not self.recorder and not self.inlongquote:
                    # check the current speaker-- if it's the recorder, then
                    # even though this isn't a "known" recorder sentence,
                    # there's no other speaker so we treat it like a recorder
                    # comment.
                    if self.current_speaker == 'recorder':
                        annotator.register_tag_open(self.re_speaking, '<recorder>', group='start')
                        self.recorder = True
                    else:
                        annotator.register_tag_open(self.re_speaking, '<speaking name="%s">' % self.current_speaker, group='start')

            if not self.intitle and not self.inlongquote and not self.inrollcall:
                # short-quote opening tag is currently disabled
                #annotator.register_tag_open(self.re_startshortquote, '<quote speaker="%s">' % self.current_speaker)
                pass

            # note: the endquote tag needs to be registered BEFORE the end
            # speaking tag, because the quote tag should appear before (be
            # nested within) the speaking tag. a nesting functionality should
            # really be implemented within the XMLAnnotator class, but this
            # will do for now.
            if not self.inlongquote and not self.intitle and not self.inrollcall:
                if self.inquote:
                    # short-quote closing tag is currently disabled
                    #annotator.register_tag_close(self.re_endshortquote, '</speaking>')
                    pass

            if self.paragraph_ends():
                if self.inrollcall:
                    annotator.register_tag_close(self.re_endofline, '</recorder>')
                    self.inrollcall = False
                elif self.recorder:
                    annotator.register_tag_close(self.re_endofline, '</recorder>')
                elif self.inlongquote:
                    if self.longquote_ends():
                        annotator.register_tag_close(self.re_endofline, '</speaking>')
                elif self.intitle:
                    pass
                #  this specific set of states usually means we're somewhere
                #  unrecognized, and without these caveats we can end up with
                #  stray </speaking> tags.
                elif (self.current_speaker == 'recorder' and not (self.inlongquote or
                                                                  self.inrollcall or
                                                                  self.recorder or
                                                                  self.inquote or
                                                                  self.intitle)):
                    # parenthesised single argument: prints the same text
                    # under Python 2 and is also valid Python 3 syntax.
                    print("UNRECOGNIZED STATE (but that's ok): %s" % theline)
                else:
                    annotator.register_tag_close(self.re_endofline, '</speaking>')

            #if (self.current_speaker == 'recorder' and self.inlongquote == False and self.inrollcall == False
            #    and self.recorder == False and self.inquote == False and self.intitle == False):
            #    print "UNRECOGNIZED STATE (but that's ok): %s" % theline
            #    annotator.register_tag(self.re_alltext, '<unknown>', group='text')

            xml_line = annotator.apply()
            self.xml.append(xml_line)

            # do some post processing
            self.postprocess_state(theline)

            # get the next line and do it all again
            self.currentline += 1
            theline = self.get_line()
            while theline is not None and not theline.strip():
                self.currentline += 1
                theline = self.get_line()
            if not theline:
                # end of file
                self.xml.append('</CRDoc>')

コード例 #8

0

ファイルを表示

ファイル: cr_parser_modified.py プロジェクト: ldparadise/datasousveillance

    def markup_paragraph(self):
        ''' this is the standard paragraph parser. handles new speakers,
        standard recorder comments, long and short quotes, etc.

        starting at self.currentline, skips leading blank lines and then
        processes lines until get_line() returns a falsy value, appending
        marked-up text to self.xml and finishing with the closing </CRDoc>
        tag. if nothing but blank lines remain, only the closing tag is
        appended. when self.has_speakers is false the remaining lines are
        copied verbatim inside a single <recorder> element. '''

        # get to the first line
        theline = self.get_line()

        # get_line() can return None (the loop at the bottom of this method
        # guards for it), so guard here too instead of calling None.strip().
        while theline is not None and not theline.strip():
            self.currentline += 1
            theline = self.get_line()

        if theline is None:
            # only blank lines were left: close the document like every
            # other exit path of this method does.
            self.xml.append('</CRDoc>')
            return

        # remove <bullet> tags if they exist
        theline = self.check_bullet(theline)
        self.document_first_line = True

        if not self.has_speakers:
            # no speakers: emit everything as one recorder block
            self.xml.append('<recorder>')
            while theline:
                self.xml.append(theline)
                self.currentline += 1
                theline = self.get_line()
            self.xml.append('</recorder>\n')
            self.xml.append('</CRDoc>')
            return

        while theline:

            self.preprocess_state(theline)
            annotator = XMLAnnotator(theline)

            if self.intitle:
                annotator.register_tag(self.re_title, '<title>', group='title')
            # some things only appear on the first line of a paragraph
            elif self.inrollcall:
                # will only match on first line of the roll call
                annotator.register_tag_open(self.re_rollcall, '<recorder>')
            elif self.new_paragraph:
                annotator.register_tag_open(
                    self.re_longquotestart,
                    '<speaking quote="true" speaker="%s">' %
                    self.current_speaker,
                    group='start')
                if self.recorder:
                    annotator.register_tag_open(self.re_startofline,
                                                '<recorder>')
                    #annotator.register_tag_open(self.re_recorderstart, '<recorder>', 'start')
                    #annotator.register_tag_open(self.re_recorder_fuzzy, '<recorder>', 'start')
                annotator.register_tag(self.re_newspeaker,
                                       '<speaker name="%s">' %
                                       self.current_speaker,
                                       group='name')
                if self.return_from_quote_interjection(theline):
                    annotator.register_tag_open(
                        self.re_longquotebody,
                        '<speaking quote="true" speaker="%s">' %
                        self.current_speaker,
                        group='start')
                if not self.recorder and not self.inlongquote:
                    # check the current speaker-- if it's the recorder, then
                    # even though this isn't a "known" recorder sentence,
                    # there's no other speaker so we treat it like a recorder
                    # comment.
                    if self.current_speaker == 'recorder':
                        annotator.register_tag_open(self.re_speaking,
                                                    '<recorder>',
                                                    group='start')
                        self.recorder = True
                    else:
                        annotator.register_tag_open(self.re_speaking,
                                                    '<speaking name="%s">' %
                                                    self.current_speaker,
                                                    group='start')

            if not self.intitle and not self.inlongquote and not self.inrollcall:
                # short-quote opening tag is currently disabled
                #annotator.register_tag_open(self.re_startshortquote, '<quote speaker="%s">' % self.current_speaker)
                pass

            # note: the endquote tag needs to be registered BEFORE the end
            # speaking tag, because the quote tag should appear before (be
            # nested within) the speaking tag. a nesting functionality should
            # really be implemented within the XMLAnnotator class, but this
            # will do for now.
            if not self.inlongquote and not self.intitle and not self.inrollcall:
                if self.inquote:
                    # short-quote closing tag is currently disabled
                    #annotator.register_tag_close(self.re_endshortquote, '</speaking>')
                    pass

            if self.paragraph_ends():
                if self.inrollcall:
                    annotator.register_tag_close(self.re_endofline,
                                                 '</recorder>')
                    self.inrollcall = False
                elif self.recorder:
                    annotator.register_tag_close(self.re_endofline,
                                                 '</recorder>')
                elif self.inlongquote:
                    if self.longquote_ends():
                        annotator.register_tag_close(self.re_endofline,
                                                     '</speaking>')
                elif self.intitle:
                    pass
                #  this specific set of states usually means we're somewhere
                #  unrecognized, and without these caveats we can end up with
                #  stray </speaking> tags.
                elif (self.current_speaker == 'recorder'
                      and not (self.inlongquote or self.inrollcall or
                               self.recorder or self.inquote or self.intitle)):
                    # parenthesised single argument: prints the same text
                    # under Python 2 and is also valid Python 3 syntax.
                    print("UNRECOGNIZED STATE (but that's ok): %s" % theline)
                else:
                    annotator.register_tag_close(self.re_endofline,
                                                 '</speaking>')

            #if (self.current_speaker == 'recorder' and self.inlongquote == False and self.inrollcall == False
            #    and self.recorder == False and self.inquote == False and self.intitle == False):
            #    print "UNRECOGNIZED STATE (but that's ok): %s" % theline
            #    annotator.register_tag(self.re_alltext, '<unknown>', group='text')

            xml_line = annotator.apply()
            self.xml.append(xml_line)

            # do some post processing
            self.postprocess_state(theline)

            # get the next line and do it all again
            self.currentline += 1
            theline = self.get_line()
            while theline is not None and not theline.strip():
                self.currentline += 1
                theline = self.get_line()
            if not theline:
                # end of file
                self.xml.append('</CRDoc>')