Ejemplo n.º 1
0
    def get_parser(self, basefile, sanitized, parseconfig="default"):
        """Set up a FSMParser and return its ``parse`` method.

        Recognizers peek at the next chunk from ``parser.reader``
        without consuming it and report whether it matches; constructors
        consume the chunk and build the resulting element.

        :param basefile: not used by this implementation
        :param sanitized: not used by this implementation
        :param parseconfig: not used by this implementation
        :returns: the bound ``parse`` method of the configured parser
        """
        # RECOGNIZERS -- all of them return a real bool
        def is_heading(parser):
            return parser.reader.peek().font.size == 17

        def is_dnr(parser):
            # size-12 text starting with digits, a dash and 2-4 digits
            chunk = parser.reader.peek()
            return bool(chunk.font.size == 12 and
                        re.match(r'\d+-\d{2,4}', str(chunk)))

        def is_datum(parser):
            # size-12 text starting with something shaped like YYYY-MM-DD
            chunk = parser.reader.peek()
            return bool(chunk.font.size == 12 and
                        re.match(r'\d{4}-\d{2}-\d{2}', str(chunk)))

        def is_nonessential(parser):
            # chunks positioned at/below 1159 or at/above 146 are
            # skipped (see skip_nonessential)
            chunk = parser.reader.peek()
            return bool(chunk.top >= 1159 or chunk.top <= 146)

        def is_abstract(parser):
            return str(parser.reader.peek()).startswith("Beslutet i korthet:")

        def is_section(parser):
            # size-14 chunk whose first element is bold and whose text
            # does not end with a full stop
            chunk = parser.reader.peek()
            return bool(chunk.font.size == 14 and
                        chunk[0].tag == "b" and
                        not str(chunk).endswith("."))

        def is_blockquote(parser):
            return bool(parser.reader.peek().left >= 255)

        def is_normal(parser):
            return bool(parser.reader.peek().left < 255)

        def is_paragraph(parser):
            # catch-all, must be the last recognizer
            return True

        # CONSTRUCTORS
        @decorators.newstate("body")
        def make_body(parser):
            return parser.make_children(Body())

        def make_heading(parser):
            # the heading is emitted as title metadata, not as a Heading
            title = str(parser.reader.next()).strip()
            return Meta([title], predicate=DCTERMS.title, lang="sv")

        @decorators.newstate("abstract")
        def make_abstract(parser):
            abstract = Abstract([Paragraph(parser.reader.next())])
            return parser.make_children(abstract)

        @decorators.newstate("section")
        def make_section(parser):
            section = UnorderedSection(title=str(parser.reader.next()).strip())
            return parser.make_children(section)

        @decorators.newstate("blockquote")
        def make_blockquote(parser):
            # does not consume a chunk itself; the children do
            return parser.make_children(Blockquote())

        def make_paragraph(parser):
            # A Paragraph containing PDFReader.Textelement objects
            # renders these as <span> elements (the default
            # rendering). A PDFReader.Textbox containing the same
            # renders unstyled Textelements as plain strings, cutting
            # down on unnecessary <span> elements, but renders itself
            # with unnecessary @style and @class attributes, which we
            # don't want. For now, stick with Paragraph as the
            # container and maybe later figure out how to get
            # PDFReader.Textelements to render themselves sanely.
            return Paragraph(parser.reader.next())

        def make_datum(parser):
            datestr = str(parser.reader.next()).strip()
            year = int(datestr.split("-")[0])
            if not 1970 < year < 2100:
                self.log.warning("Year in %s doesn't look valid" % datestr)
                return None
            # only the first plausible date is used; stop looking for more
            parser.remove_recognizer(is_datum)
            return Meta([datestr], predicate=RPUBL.avgorandedatum,
                        datatype=XSD.date)

        def make_dnr(parser):
            # only the first match is used; stop looking for more
            parser.remove_recognizer(is_dnr)
            ds = str(parser.reader.next()).strip().split(" ")
            return Meta(ds, predicate=RPUBL.diarienummer)

        def skip_nonessential(parser):
            parser.reader.next()  # consume the chunk, return nothing

        p = FSMParser()
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.set_recognizers(is_datum,
                          is_dnr,
                          is_nonessential,
                          is_heading,
                          is_abstract,
                          is_section,
                          is_normal,
                          is_blockquote,
                          is_paragraph)
        p.set_transitions({
            ("body", is_heading): (make_heading, None),
            ("body", is_nonessential): (skip_nonessential, None),
            ("body", is_datum): (make_datum, None),
            ("body", is_dnr): (make_dnr, None),
            ("body", is_abstract): (make_abstract, "abstract"),
            ("body", is_section): (make_section, "section"),
            ("body", is_blockquote): (make_blockquote, "blockquote"),
            ("body", is_paragraph): (make_paragraph, None),
            ("abstract", is_paragraph): (make_paragraph, None),
            ("abstract", is_section): (False, None),
            ("abstract", is_dnr): (False, None),
            ("abstract", is_datum): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_nonessential): (skip_nonessential, None),
            ("section", is_section): (False, None),
            ("section", is_blockquote): (make_blockquote, "blockquote"),
            ("section", is_datum): (make_datum, None),
            ("section", is_dnr): (make_dnr, None),
            ("blockquote", is_blockquote): (make_paragraph, None),
            ("blockquote", is_nonessential): (skip_nonessential, None),
            ("blockquote", is_section): (False, None),
            ("blockquote", is_normal): (False, None),
            ("blockquote", is_datum): (make_datum, None),
            ("blockquote", is_dnr): (make_dnr, None),
        })
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Ejemplo n.º 2
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal','decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha','upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman','upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc','circle','square','dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")
        
        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol,state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack: # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            else:
                pass
            return (constructor,newstate)
        
        # CONSTRUCTORS
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)
        setattr(make_body,'newstate','body')
        
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_section,'newstate','section')

        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsection,'newstate','subsection')

        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsubsection,'newstate','subsubsection')

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])

#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        def make_ol_decimal(parser):
            return make_orderedlist(parser,"decimal","ol-decimal")
        setattr(make_ol_decimal,'newstate','ol-decimal')

        def make_ol_alpha(parser):
            return make_orderedlist(parser,"lower-alpha", "ol-alpha")
        setattr(make_ol_alpha,'newstate','ol-alpha')

        def make_ol_roman(parser):
            return make_orderedlist(parser,"lower-roman", "ol-roman")
        setattr(make_ol_roman,'newstate','ol-romal')

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype,ordinal,separator,rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)
        setattr(make_listitem,'newstate','listitem')

        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-a")
        # setattr(make_state_a, 'newstate', 'state-a')

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-b")
        # setattr(make_state_b, 'newstate', 'state-b')

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-c")
        # setattr(make_state_c, 'newstate', 'state-c')
        
        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and 
                    len(list(filter(None,s.split(".")))))

        def make_orderedlist(parser,listtype,childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem,"listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match
        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None,chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match('^(\d+)([\.\)]) +',chunk)
            if m:
                if chunk.startswith("0"):
                    listtype="decimal-leading-zero"
                else:
                    listtype="decimal"
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            if chunk.startswith("* "):
                return ("disc",None,None,chunk)
            if chunk.startswith("- "):
                return ("dash",None,None,chunk)
                
            return (listtype,ordinal,separator,chunk) # None * 3

        
        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal,
                          is_li_roman, 
                          is_li_alpha,
                          is_header,
                          is_section,
                          is_subsection,
                          is_subsubsection,
                          is_preformatted,
                          is_definition,
                          is_description,
                          is_state_a,
                          is_state_b,
                          is_state_c,
                          is_paragraph)
        p.set_transitions({("body", is_paragraph): (make_paragraph, None),
                           ("body", is_section): (make_section,"section"),
                           ("body", is_state_a): (make_state_a, "state-a"),
                           ("state-a", is_state_b): (make_state_b, "state-b"),
                           ("state-b", is_state_c): (make_state_c, "state-c"),
                           ("state-c", is_section): (False, None),
                           ("section", is_paragraph): (make_paragraph, None),
                           ("section", is_subsection): (make_subsection, "subsection"),
                           ("subsection", is_paragraph): (make_paragraph,None),
                           ("subsection", is_subsection): (False,None),
                           ("subsection", is_state_a): (False,"body"), 
                           ("subsection", is_subsubsection): (make_subsubsection,"subsubsection"),
                           ("subsubsection", is_paragraph): (make_paragraph,None),
                           ("subsubsection", is_section): (False, None),
                           ("subsection", is_section): (False, None),
                           ("section", is_section): (False, None),
                           ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
                           ("ol-decimal",is_li_decimal):(make_listitem,"listitem"),
                           ("ol-decimal",is_li_alpha):(make_ol_alpha,"ol-alpha"),
                           ("ol-alpha",is_li_alpha):(make_listitem,"listitem"),
                           ("ol-alpha",is_li_roman):(make_ol_roman,"ol-roman"),
                           ("ol-roman",is_li_roman):(make_listitem,"listitem"),
                           ("ol-roman",is_li_alpha):(False,None),
                           ("ol-alpha",is_li_decimal):(False,None),
                           ("listitem",is_li_alpha):sublist_or_parent, 
                           ("listitem",is_li_roman):sublist_or_parent, 
                           ("listitem",is_li_decimal):sublist_or_parent, 
                           })

        p.debug = debug

        tr=TextReader(filename,encoding="utf-8",linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b
Ejemplo n.º 3
0
    def get_parser(self, basefile, sanitized, parseconfig="default"):
        """Set up a FSMParser for text chunks and return its ``parse``
        method.

        :param basefile: not used by this implementation
        :param sanitized: not used by this implementation
        :param parseconfig: ``"simple"`` leaves out the numbered
                            section/subsection recognizers; any other
                            value uses the full recognizer list
        :returns: the bound ``parse`` method of the configured parser
        """

        def is_header(parser):
            p = parser.reader.peek()
            # older direktiv sources start with dir number
            if re.match(r'Dir\.? \d{4}:\d+$', p):
                return False
            # a header-like chunk directly followed by a strecksats is
            # not treated as a header
            return (headerlike(p) and
                    not is_strecksats(parser, parser.reader.peek(2)))

        def is_strecksats(parser, chunk=None):
            # chunk defaults to the next chunk in the reader
            if chunk is None:
                chunk = parser.reader.peek()
            return chunk.startswith(("--", "- "))

        def is_section(parser):
            (ordinal, headingtype, title) = analyze_sectionstart(parser)
            return bool(ordinal) and headingtype == "h1"

        def is_subsection(parser):
            (ordinal, headingtype, title) = analyze_sectionstart(parser)
            return bool(ordinal) and headingtype == "h2"

        def is_paragraph(parser):
            # catch-all, must be the last recognizer
            return True

        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            chunk = parser.reader.next()
            ordinal, headingtype, title = analyze_sectionstart(parser, chunk)
            s = Avsnitt(ordinal=ordinal, title=title)
            return parser.make_children(s)

        @newstate('strecksats')
        def make_strecksatslista(parser):
            # the first item is created explicitly, the rest through
            # the transition table
            ul = Strecksatslista()
            ul.append(make_listitem(parser))
            return parser.make_children(ul)

        def make_listitem(parser):
            s = str(parser.reader.next())
            if " " in s:
                # assume text before first space is the bullet
                s = s.split(" ", 1)[1]
            else:
                # assume the bullet is a single char
                s = s[1:]
            return Strecksatselement([s])

        def make_header(parser):
            # NOTE: not referenced by the transition table below
            return Heading([parser.reader.next()])

        def make_paragraph(parser):
            return Paragraph([parser.reader.next()])

        @newstate('unorderedsection')
        def make_unorderedsection(parser):
            s = UnorderedSection(title=parser.reader.next().strip())
            return parser.make_children(s)

        def headerlike(p):
            # True for a non-empty text shorter than 150 characters
            # whose first character changes when lowercased, and which
            # doesn't end with a full stop -- unless that full stop
            # belongs to one of the abbreviations below
            abbreviations = ("m.m.", "m. m.", "m.fl.", "m. fl.")
            return (bool(p)  # an empty chunk used to raise IndexError
                    and p[0].lower() != p[0]
                    and len(p) < 150
                    and (not p.endswith(".") or p.endswith(abbreviations)))

        re_sectionstart = re.compile(r"^(\d[\.\d]*) +([A-ZÅÄÖ].*)$").match

        def analyze_sectionstart(parser, chunk=None):
            """returns (ordinal, headingtype, text) if it looks like a section
            heading, (None, None, chunk) otherwise."""
            if chunk is None:
                chunk = parser.reader.peek()
            m = re_sectionstart(chunk)
            if m and headerlike(m.group(2)):
                # heading level is the number of dots in the ordinal + 1
                return (m.group(1),
                        "h" + str(m.group(1).count(".") + 1),
                        m.group(2).strip())
            else:
                return None, None, chunk

        p = FSMParser()
        if parseconfig == "simple":
            recognizers = [is_header, is_strecksats, is_paragraph]
        else:
            recognizers = [is_section,
                           is_subsection,
                           is_header,
                           is_strecksats,
                           is_paragraph]
        p.set_recognizers(*recognizers)
        commonstates = ("body", "section", "subsection", "unorderedsection")
        p.set_transitions({
            (commonstates, is_paragraph): (make_paragraph, None),
            (commonstates, is_strecksats): (make_strecksatslista,
                                            "strecksats"),
            (commonstates, is_header): (make_unorderedsection,
                                        "unorderedsection"),
            (commonstates, is_section): (make_section, "section"),

            ("unorderedsection", is_header): (False, None),
            ("unorderedsection", is_section): (False, None),
            ("strecksats", is_paragraph): (False, None),
            ("strecksats", is_strecksats): (make_listitem, None),
            ("section", is_header): (False, None),
            ("section", is_section): (False, None),
            ("section", is_subsection): (make_section, "subsection"),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_section): (False, None)})
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Ejemplo n.º 4
0
    def get_parser(self, basefile, sanitized_body, parseconfig="default"):
        """Set up a FSMParser for markup element chunks and return its
        ``parse`` method.

        :param basefile: not used by this implementation
        :param sanitized_body: not used by this implementation
        :param parseconfig: not used by this implementation
        :returns: the bound ``parse`` method of the configured parser
        """
        # a typical decision structure:

        # [h1] Justitiekanslerns beslut
        #    ... text ...
        #    [h2] Ärendet (h3)
        #        [h3] Bakgrund (p/em)
        #        ... text ...
        #        [h3] Anspråket
        #        ... text ...
        #        [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
        #    [h2] Justitiekanslerns bedömning
        #        [h3] Skadestånd
        #        [h3] Tillsyn
        def first_child_name(chunk):
            # name of the chunk's first child node, or None if the
            # chunk has no children at all (indexing the child list
            # would raise IndexError in that case)
            first = next(iter(chunk.children), None)
            return None if first is None else first.name

        def is_section(parser):
            return parser.reader.peek().name == "h3"

        def is_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "em"

        def is_special_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "strong"

        def is_subsubsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "u"

        def is_paragraph(parser):
            # catch-all, must be the last recognizer
            return True

        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('special_subsection')
        def make_special_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        def make_paragraph(parser):
            # FIXME: this strips out formatting tags NB: Now this is a
            # SFS stycke that has fragment_label, id/uri and other
            # crap. Let's see if it still works!
            return AnonStycke([parser.reader.next().get_text()])

        p = FSMParser()
        # NOTE(review): is_special_subsection has transitions below but
        # is not registered here -- confirm whether that is intentional
        p.set_recognizers(is_section, is_subsection, is_subsubsection,
                          is_paragraph)
        p.set_transitions({
            ("body", is_section): (make_section, "section"),
            ("section", is_section): (False, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("section", is_special_subsection):
            (make_special_subsection, "special_subsection"),
            ("subsection", is_section): (False, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_special_subsection): (False, None),
            # NOTE(review): uses make_subsection, not make_subsubsection,
            # for the "subsubsection" state -- confirm this is intended
            ("subsection", is_subsubsection):
            (make_subsection, "subsubsection"),
            ("special_subsection", is_section): (False, None),
            ("special_subsection", is_subsection): (False, None),
            ("special_subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_section): (False, None),
            ("subsubsection", is_special_subsection): (False, None),
            ("subsubsection", is_subsection): (False, None),
            ("subsubsection", is_subsubsection): (False, None),
            (("body", "section", "subsection", "subsubsection"), is_paragraph):
            (make_paragraph, None)
        })
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Ejemplo n.º 5
0
    def get_parser(self, basefile, sanitized, parseconfig="default"):
        """Configure a FSMParser and hand back its ``parse`` method.

        Each recognizer looks at the upcoming chunk of
        ``parser.reader`` (without consuming it) and answers whether
        the chunk is of its kind; each constructor consumes the chunk
        and produces the matching element.

        :param basefile: not used by this implementation
        :param sanitized: not used by this implementation
        :param parseconfig: not used by this implementation
        :returns: the bound ``parse`` method of the configured parser
        """
        # Recognizers. Every one of them returns a proper bool.
        def is_heading(parser):
            return parser.reader.peek().font.size == 17

        def is_dnr(parser):
            # size-12 text that begins with digits, a dash and 2-4 digits
            chunk = parser.reader.peek()
            return bool(chunk.font.size == 12
                        and re.match(r'\d+-\d{2,4}', str(chunk)))

        def is_datum(parser):
            # size-12 text that begins with something like YYYY-MM-DD
            chunk = parser.reader.peek()
            return bool(chunk.font.size == 12
                        and re.match(r'\d{4}-\d{2}-\d{2}', str(chunk)))

        def is_nonessential(parser):
            # chunks whose top is >= 1159 or <= 146 get discarded by
            # skip_nonessential
            chunk = parser.reader.peek()
            return bool(chunk.top >= 1159 or chunk.top <= 146)

        def is_abstract(parser):
            return str(parser.reader.peek()).startswith("Beslutet i korthet:")

        def is_section(parser):
            # size-14 chunk with a bold first element and text that
            # does not end in a full stop
            chunk = parser.reader.peek()
            return bool(chunk.font.size == 14
                        and chunk[0].tag == "b"
                        and not str(chunk).endswith("."))

        def is_blockquote(parser):
            return bool(parser.reader.peek().left >= 255)

        def is_normal(parser):
            return bool(parser.reader.peek().left < 255)

        def is_paragraph(parser):
            # fallback recognizer, registered last
            return True

        # Constructors.
        @decorators.newstate("body")
        def make_body(parser):
            return parser.make_children(Body())

        def make_heading(parser):
            # the heading text becomes title metadata rather than a
            # Heading element
            title = str(parser.reader.next()).strip()
            return Meta([title], predicate=DCTERMS.title, lang="sv")

        @decorators.newstate("abstract")
        def make_abstract(parser):
            abstract = Abstract([Paragraph(parser.reader.next())])
            return parser.make_children(abstract)

        @decorators.newstate("section")
        def make_section(parser):
            section = UnorderedSection(title=str(parser.reader.next()).strip())
            return parser.make_children(section)

        @decorators.newstate("blockquote")
        def make_blockquote(parser):
            # consumes nothing itself; its children consume the chunks
            return parser.make_children(Blockquote())

        def make_paragraph(parser):
            # A Paragraph containing PDFReader.Textelement objects
            # renders these as <span> elements (the default
            # rendering). A PDFReader.Textbox containing the same
            # renders unstyled Textelements as plain strings, cutting
            # down on unnecessary <span> elements, but renders itself
            # with unnecessary @style and @class attributes, which we
            # don't want. For now, stick with Paragraph as the
            # container and maybe later figure out how to get
            # PDFReader.Textelements to render themselves sanely.
            return Paragraph(parser.reader.next())

        def make_datum(parser):
            datestr = str(parser.reader.next()).strip()
            year = int(datestr.split("-")[0])
            if not 1970 < year < 2100:
                self.log.warning("Year in %s doesn't look valid" % datestr)
                return None
            # the first plausible date wins; later ones are not looked for
            parser.remove_recognizer(is_datum)
            return Meta([datestr],
                        predicate=RPUBL.avgorandedatum,
                        datatype=XSD.date)

        def make_dnr(parser):
            # the first match wins; later ones are not looked for
            parser.remove_recognizer(is_dnr)
            ds = str(parser.reader.next()).strip().split(" ")
            return Meta(ds, predicate=RPUBL.diarienummer)

        def skip_nonessential(parser):
            parser.reader.next()  # consume the chunk, return nothing

        p = FSMParser()
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.set_recognizers(is_datum, is_dnr, is_nonessential, is_heading,
                          is_abstract, is_section, is_normal, is_blockquote,
                          is_paragraph)
        p.set_transitions({
            ("body", is_heading): (make_heading, None),
            ("body", is_nonessential): (skip_nonessential, None),
            ("body", is_datum): (make_datum, None),
            ("body", is_dnr): (make_dnr, None),
            ("body", is_abstract): (make_abstract, "abstract"),
            ("body", is_section): (make_section, "section"),
            ("body", is_blockquote): (make_blockquote, "blockquote"),
            ("body", is_paragraph): (make_paragraph, None),
            ("abstract", is_paragraph): (make_paragraph, None),
            ("abstract", is_section): (False, None),
            ("abstract", is_dnr): (False, None),
            ("abstract", is_datum): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_nonessential): (skip_nonessential, None),
            ("section", is_section): (False, None),
            ("section", is_blockquote): (make_blockquote, "blockquote"),
            ("section", is_datum): (make_datum, None),
            ("section", is_dnr): (make_dnr, None),
            ("blockquote", is_blockquote): (make_paragraph, None),
            ("blockquote", is_nonessential): (skip_nonessential, None),
            ("blockquote", is_section): (False, None),
            ("blockquote", is_normal): (False, None),
            ("blockquote", is_datum): (make_datum, None),
            ("blockquote", is_dnr): (make_dnr, None),
        })
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Ejemplo n.º 6
0
    def get_parser(self, basefile, sanitized_body, parseconfig="default"):
        """Return a parse function for the body of a decision.

        Configures an ``FSMParser`` with recognizers, constructors and a
        transition table for the heading hierarchy sketched below, and
        returns the parser's bound ``parse`` method.

        ``basefile``, ``sanitized_body`` and ``parseconfig`` are accepted but
        not used by this implementation.

        :returns: the ``parse`` method of the configured ``FSMParser``
        """
        # a typical decision structure:
        #
        # [h1] Justitiekanslerns beslut
        #    ... text ...
        #    [h2] Ärendet (h3)
        #        [h3] Bakgrund (p/em)
        #        ... text ...
        #        [h3] Anspråket
        #        ... text ...
        #        [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
        #    [h2] Justitiekanslerns bedömning
        #        [h3] Skadestånd
        #        [h3] Tillsyn

        # HELPERS
        def first_child_name(chunk):
            # Name of the first child node of chunk, or None when chunk has
            # no children (an empty <p> used to raise IndexError here) or
            # when the first child has no ``name`` attribute.
            first = next(iter(chunk.children), None)
            return getattr(first, "name", None)

        def titled_section(parser):
            # Consume the heading chunk, use its text as the title of a new
            # AnonSektion and let the parser fill in the children.
            heading = parser.reader.next()
            return parser.make_children(AnonSektion(title=heading.get_text()))

        # RECOGNIZERS -- each one only peeks at the next chunk
        def is_section(parser):
            return parser.reader.peek().name == "h3"

        def is_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "em"

        def is_special_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "strong"

        def is_subsubsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "u"

        def is_paragraph(parser):
            # catch-all
            return True

        # CONSTRUCTORS
        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            return titled_section(parser)

        @newstate('subsection')
        def make_subsection(parser):
            return titled_section(parser)

        @newstate('special_subsection')
        def make_special_subsection(parser):
            return titled_section(parser)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            return titled_section(parser)

        def make_paragraph(parser):
            # FIXME: this strips out formatting tags NB: Now this is a
            # SFS stycke that has fragment_label, id/uri and other
            # crap. Let's see if it still works!
            return AnonStycke([parser.reader.next().get_text()])

        p = FSMParser()
        # NOTE(review): is_special_subsection is referenced by transitions
        # below but is not registered here, and the "special_subsection"
        # state has no is_paragraph transition. Confirm whether that
        # recognizer is meant to be active before registering it, since
        # doing so would change the parsed structure.
        p.set_recognizers(is_section,
                          is_subsection,
                          is_subsubsection,
                          is_paragraph)
        p.set_transitions({
            ("body", is_section): (make_section, "section"),
            ("section", is_section): (False, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("section", is_special_subsection): (make_special_subsection, "special_subsection"),
            ("subsection", is_section): (False, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_special_subsection): (False, None),
            # This entry previously used make_subsection (declared for the
            # 'subsection' state) although the target state is
            # "subsubsection".
            ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
            ("special_subsection", is_section): (False, None),
            ("special_subsection", is_subsection): (False, None),
            ("special_subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
            ("subsubsection", is_section): (False, None),
            ("subsubsection", is_special_subsection): (False, None),
            ("subsubsection", is_subsection): (False, None),
            ("subsubsection", is_subsubsection): (False, None),
            # One key covering several states -- presumably expanded per
            # state by set_transitions (confirm against FSMParser).
            (("body", "section", "subsection", "subsubsection"), is_paragraph): (make_paragraph, None)
        })
        p.initial_state = "body"
        p.initial_constructor = make_body
        # Debug output is switched on through the environment.
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Ejemplo n.º 7
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal', 'decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha', 'upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman', 'upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc', 'circle', 'square', 'dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")

        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol, state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack:  # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            else:
                pass
            return (constructor, newstate)

        # CONSTRUCTORS
        @newstate('body')
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)

        @newstate('section')
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])


#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        @newstate('ol-decimal')
        def make_ol_decimal(parser):
            return make_orderedlist(parser, "decimal", "ol-decimal")

        @newstate('ol-alpha')
        def make_ol_alpha(parser):
            return make_orderedlist(parser, "lower-alpha", "ol-alpha")

        @newstate('ol-roman')
        def make_ol_roman(parser):
            return make_orderedlist(parser, "lower-roman", "ol-roman")

        @newstate('listitem')
        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)

        # NOTE: no @newstate decorator for these -- we transition from
        # one state to the next, not push a new state onto the stack
        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-a")

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-b")

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-c")

        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and len(list(filter(None, s.split(".")))))

        def make_orderedlist(parser, listtype, childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None, chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match('^(\d+)([\.\)]) +', chunk)
            if m:
                if chunk.startswith("0"):
                    listtype = "decimal-leading-zero"
                else:
                    listtype = "decimal"
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            if chunk.startswith("* "):
                return ("disc", None, None, chunk)
            if chunk.startswith("- "):
                return ("dash", None, None, chunk)

            return (listtype, ordinal, separator, chunk)  # None * 3

        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal, is_li_roman, is_li_alpha, is_header,
                          is_section, is_subsection, is_subsubsection,
                          is_preformatted, is_definition, is_description,
                          is_state_a, is_state_b, is_state_c, is_paragraph)
        p.set_transitions({
            ("body", is_paragraph): (make_paragraph, None),
            ("body", is_section): (make_section, "section"),
            ("body", is_state_a): (make_state_a, "state-a"),
            ("state-a", is_state_b): (make_state_b, "state-b"),
            ("state-b", is_state_c): (make_state_c, "state-c"),
            ("state-c", is_section): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("subsection", is_paragraph): (make_paragraph, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_state_a): (False, "body"),
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_paragraph): (make_paragraph, None),
            ("subsubsection", is_section): (False, None),
            ("subsection", is_section): (False, None),
            ("section", is_section): (False, None),
            ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
            ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
            ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
            ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
            ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
            ("ol-roman", is_li_roman): (make_listitem, "listitem"),
            ("ol-roman", is_li_alpha): (False, None),
            ("ol-alpha", is_li_decimal): (False, None),
            ("listitem", is_li_alpha):
            sublist_or_parent,
            ("listitem", is_li_roman):
            sublist_or_parent,
            ("listitem", is_li_decimal):
            sublist_or_parent,
        })

        p.debug = debug

        tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b