Esempio n. 1
0
def testparser(testcase, parser, filename):
    """Run an :py:class:`~ferenda.FSMParser` based parser against a fixture.

    The paragraphs of ``filename`` are fed to ``parser.parse`` and the
    serialized result is compared, through ``testcase.assertEqualXML``,
    with the contents of the "want" file (``filename`` with ``.txt``
    replaced by ``.xml``).

    Parser debugging is switched on when the want file is missing or
    when ``FERENDA_FSMDEBUG`` is present in the environment; in the
    latter case the serialized result is printed as well.

    :raises AssertionError: if the want file does not exist (the message
        carries the serialized parse result).
    """
    fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    wantfilename = filename.replace(".txt", ".xml")
    if fsmdebug or not os.path.exists(wantfilename):
        parser.debug = True

    reader = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    body = parser.parse(reader.getiterator(reader.readparagraph))

    if fsmdebug:
        print(elements.serialize(body))
    testcase.maxDiff = 4096

    # Without a want file there is nothing to compare against; fail with
    # the parse result in the message instead.
    if not os.path.exists(wantfilename):
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(body))

    with codecs.open(wantfilename, encoding="utf-8") as fp:
        want = fp.read().strip()
    got = elements.serialize(body).strip()
    testcase.assertEqualXML(want, got)
Esempio n. 2
0
    def parametric_test(self, filename):
        """Parse the fixture ``filename`` and compare it with the expected XML.

        The text file is read as iso-8859-1 with DOS line endings, parsed
        with the parser obtained from ``self.p.get_parser``, post-processed
        (ids, definitions, references) and serialized. The serialization
        is compared with the contents of the sibling ``.xml`` file; if
        that file does not exist, the serialization is compared with the
        empty string.

        ``self.p.current_section`` and ``self.p.current_headline_level``
        are always reset afterwards, even when parsing or an assertion
        fails, so that a failing test cannot leak state into the next one.

        :param filename: Path to the ``.txt`` fixture to parse
        """
        self.maxDiff = None
        reader = TextReader(filename=filename, encoding='iso-8859-1',
                            linesep=TextReader.DOS)
        reader.autostrip = True
        try:
            # p.lagrum_parser = FakeParser()
            parser = self.p.get_parser("9999:998", reader)
            b = parser(reader)
            elements = self.p._count_elements(b)

            # FIXME: How was this used? Where should we plug
            # skipfragments?
            if 'K' in elements and elements['K'] > 1 and elements['P1'] < 2:
                self.p.skipfragments = [
                    ('rinfoex:avdelningnummer', 'rpubl:kapitelnummer'),
                    ('rpubl:kapitelnummer', 'rpubl:paragrafnummer')]
            else:
                self.p.skipfragments = [('rinfoex:avdelningnummer',
                                         'rpubl:kapitelnummer')]

            # NB: _construct_ids won't look for references
            self.p.visit_node(b, self.p.construct_id,
                              {'basefile': '9999:998', 'uris': set()})
            self.p.visit_node(b, self.p.find_definitions, False, debug=False)
            self.p.lagrum_parser.parse_recursive(b)
            self._remove_uri_for_testcases(b)
            resultfilename = filename.replace(".txt", ".xml")
            if os.path.exists(resultfilename):
                with codecs.open(resultfilename, encoding="utf-8") as fp:
                    result = fp.read().strip()
                self.assertEqual(result, serialize(b).strip())
            else:
                # No expected result on disk: compare with the empty
                # string, so that the test passes only if the
                # serialization is empty as well.
                self.assertEqual("", serialize(b).strip())
        finally:
            # reset the state of the repo...
            self.p.current_section = '0'
            self.p.current_headline_level = 0
Esempio n. 3
0
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        Every file returned by ``util.list_dirs(archivedir, ".html")``
        whose path starts with ``downloaded/sfs`` is matched against the
        regexes in ``self.templ``; the first regex that matches decides
        how the file is handled. Matching files are copied to their
        place in ``self.store``, and a :class:`DocumentEntry` is saved
        for each file that is not marked as an archival version.

        NOTE: ``self.document_url_template`` is interpolated with
        ``locals()``, so the names of the local variables in this method
        may be referenced by that template -- do not rename them
        without checking the template.

        :param archivedir: Directory passed to ``util.list_dirs``
        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if not f.startswith("downloaded/sfs"):  # sfst or sfsr
                continue
            for regex in self.templ:
                m = re.match(regex, f)
                if not m:
                    continue
                if "vcheck" in m.groupdict():  # silently ignore
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version.
                # Read raw bytes and decode explicitly: text-mode
                # read() returns str on Python 3, which has no
                # .decode(), and the with-block makes sure the file is
                # closed again.
                # text = t.extractfile(f).read(4000).decode("latin-1")
                with open(f, "rb") as fp:
                    text = fp.read(4000).decode("latin-1")
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        # the version encoded in the filename should
                        # agree with the one found inside the file
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    version = None
                    current += 1
                    de = DocumentEntry()
                    de.basefile = basefile
                    de.id = self.canonical_uri(basefile, updated_to)
                    # fudge timestamps best as we can
                    de.orig_created = datetime.fromtimestamp(
                        os.path.getctime(f))
                    # NOTE(review): orig_updated is assigned twice and
                    # the second assignment (now) wins. Possibly one of
                    # the two was meant for another attribute -- confirm.
                    de.orig_updated = datetime.fromtimestamp(
                        os.path.getmtime(f))
                    de.orig_updated = datetime.now()
                    de.orig_url = self.document_url_template % locals()
                    de.published = datetime.now()
                    de.url = self.generated_url(basefile)
                    de.title = "SFS %s" % basefile
                    # de.set_content()
                    # de.set_link()
                    de.save(self.store.documententry_path(basefile))
                # this yields more reasonable basefiles, but they are not
                # backwards compatible -- skip them for now
                # basefile = basefile.replace("_", "").replace(".", "")
                if "type" in m.groupdict() and m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile)
                    current -= 1  # to offset the previous increment
                else:
                    dest = self.store.downloaded_path(basefile, version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                # no regex in self.templ matched this file
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions" %
            (current, archived))
Esempio n. 4
0
File: rfc.py Progetto: zigit/ferenda
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF.

        Reads the downloaded text for ``doc.basefile`` paragraph by
        paragraph, runs it through the parser from ``self.get_parser``
        and stores the result in ``doc.body``. The first two parsed
        elements (header and title) and a "Table of Contents" preamble
        section, if present, are removed from the body. Metadata is
        recorded through a ``Describer`` on ``doc.meta``, and citations
        in the body are processed by the citation parser.

        Always returns ``True``.
        """
        textreader = TextReader(self.store.downloaded_path(doc.basefile),
                                linesep=TextReader.UNIX)

        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        def without_fauxbold(paragraphs):
            for paragraph in paragraphs:
                yield re.sub('.\b', '', paragraph)

        cleanparagraphs = without_fauxbold(
            textreader.getiterator(textreader.readparagraph))

        parser = self.get_parser(doc.basefile)

        # Only consult the environment when debugging hasn't already
        # been enabled through the configuration.
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        # The two leading elements are taken out of the body:
        rfcheader = doc.body.pop(0)  # body.findByClass(RFCHeader)
        titlechunk = doc.body.pop(0)  # body.findByClass(DocHeader)
        title = " ".join(titlechunk.split())

        # Drop the first "Table of Contents" preamble section, if any
        toc = next((part for part in doc.body
                    if isinstance(part, PreambleSection)
                    and part.title == "Table of Contents"), None)
        if toc is not None:
            doc.body.remove(toc)

        # create (RDF) metadata for document Note: The provided
        # basefile may be incorrect -- let whatever is in the header
        # override (falling back to the basefile, eg for RFC 100)
        realid = self.get_rfc_num(rfcheader) or doc.basefile
        doc.uri = self.canonical_uri(realid)
        describer = Describer(doc.meta, doc.uri)
        describer.value(self.ns['prov'].wasGeneratedBy,
                        self.qualified_class_name())
        describer.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(rfcheader, describer)
        # parse_header might have set .rdftype, but if not:
        try:
            describer.getrdftype()
        except KeyError:
            describer.rdftype(self.ns['rfc'].RFC)

        identifier = self.ns['dcterms'].identifier
        if not describer.getvalues(identifier):
            describer.value(identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle:
            fulltitle = describer.getvalue(self.ns['dcterms'].title)
            if fulltitle != shorttitle:
                describer.value(self.ns['bibo'].shortTitle, shorttitle,
                                lang="en")

        # process body - add good metadata
        doc.body = self.make_citation_parser().parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True
Esempio n. 5
0
    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
# Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three
                stack[-1].append(
                    s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
Esempio n. 6
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal', 'decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha', 'upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman', 'upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc', 'circle', 'square', 'dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")

        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol, state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack:  # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            else:
                pass
            return (constructor, newstate)

        # CONSTRUCTORS
        @newstate('body')
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)

        @newstate('section')
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])


#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        @newstate('ol-decimal')
        def make_ol_decimal(parser):
            return make_orderedlist(parser, "decimal", "ol-decimal")

        @newstate('ol-alpha')
        def make_ol_alpha(parser):
            return make_orderedlist(parser, "lower-alpha", "ol-alpha")

        @newstate('ol-roman')
        def make_ol_roman(parser):
            return make_orderedlist(parser, "lower-roman", "ol-roman")

        @newstate('listitem')
        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)

        # NOTE: no @newstate decorator for these -- we transition from
        # one state to the next, not push a new state onto the stack
        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-a")

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-b")

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-c")

        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and len(list(filter(None, s.split(".")))))

        def make_orderedlist(parser, listtype, childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None, chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match('^(\d+)([\.\)]) +', chunk)
            if m:
                if chunk.startswith("0"):
                    listtype = "decimal-leading-zero"
                else:
                    listtype = "decimal"
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            if chunk.startswith("* "):
                return ("disc", None, None, chunk)
            if chunk.startswith("- "):
                return ("dash", None, None, chunk)

            return (listtype, ordinal, separator, chunk)  # None * 3

        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal, is_li_roman, is_li_alpha, is_header,
                          is_section, is_subsection, is_subsubsection,
                          is_preformatted, is_definition, is_description,
                          is_state_a, is_state_b, is_state_c, is_paragraph)
        p.set_transitions({
            ("body", is_paragraph): (make_paragraph, None),
            ("body", is_section): (make_section, "section"),
            ("body", is_state_a): (make_state_a, "state-a"),
            ("state-a", is_state_b): (make_state_b, "state-b"),
            ("state-b", is_state_c): (make_state_c, "state-c"),
            ("state-c", is_section): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("subsection", is_paragraph): (make_paragraph, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_state_a): (False, "body"),
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_paragraph): (make_paragraph, None),
            ("subsubsection", is_section): (False, None),
            ("subsection", is_section): (False, None),
            ("section", is_section): (False, None),
            ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
            ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
            ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
            ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
            ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
            ("ol-roman", is_li_roman): (make_listitem, "listitem"),
            ("ol-roman", is_li_alpha): (False, None),
            ("ol-alpha", is_li_decimal): (False, None),
            ("listitem", is_li_alpha):
            sublist_or_parent,
            ("listitem", is_li_roman):
            sublist_or_parent,
            ("listitem", is_li_decimal):
            sublist_or_parent,
        })

        p.debug = debug

        tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b