Esempio n. 1
0
 def parametric_test(self, filename):
     resultfilename = filename.replace(".txt", ".xml")
     debug = not os.path.exists(resultfilename)
     p, b = self.run_test_file(filename, debug)
     self.maxDiff = 4096
     if os.path.exists(resultfilename):
         with codecs.open(resultfilename, encoding="utf-8") as fp:
             result = fp.read().strip()
         # print(elements.serialize(b))
         if result != elements.serialize(b).strip():
             # re-run the parse but with debugging on
             print("============DEBUG OUTPUT================")
             p.debug = True
             tr = TextReader(filename,
                             encoding="utf-8",
                             linesep=TextReader.UNIX)
             b = p.parse(tr.getiterator(tr.readparagraph))
             print("===============RESULT===================")
             print(elements.serialize(b))
             self.fail("========See output above=======")
         else:
             self.assertEqual(result, elements.serialize(b).strip())
     else:
         print("\nResult:\n" + elements.serialize(b))
         self.fail()
Esempio n. 2
0
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x) for x in
                           reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if isinstance(part, PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for document Note: The provided
        # basefile may be incorrect -- let whatever is in the header
        # override
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.ns['rfc'].RFC)
        desc.value(self.ns['dct'].title, title, lang="en")
        self.parse_header(header, desc)
        if not desc.getvalues(self.ns['dct'].identifier):
            desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dct'].title) != shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
Esempio n. 3
0
 def download(self):
     self.log.debug("download: Start at %s" %  self.start_url)
     indextext = requests.get(self.start_url).text
     reader = TextReader(string=indextext)  # see TextReader class
     iterator = reader.getiterator(reader.readparagraph)
     if not isinstance(self.config.downloadmax, (int, type(None))):
         self.config.downloadmax = int(self.config.downloadmax)
         
     for basefile in self.download_get_basefiles(iterator):
         self.download_single(basefile)
Esempio n. 4
0
    def download(self):
        self.log.debug("download: Start at %s" % self.start_url)
        indextext = requests.get(self.start_url).text
        reader = TextReader(string=indextext)  # see TextReader class
        iterator = reader.getiterator(reader.readparagraph)
        if not isinstance(self.config.downloadmax, (int, type(None))):
            self.config.downloadmax = int(self.config.downloadmax)

        for basefile in self.download_get_basefiles(iterator):
            self.download_single(basefile)
Esempio n. 5
0
 def download(self, basefile=None):
     """Download rfcs starting from http://www.ietf.org/download/rfc-index.txt"""
     if basefile and self.document_url_template:
         return self.download_single(basefile)
     res = requests.get(self.start_url)
     indextext = res.text
     reader = TextReader(string=indextext, linesep=TextReader.UNIX)  # see TextReader class
     iterator = reader.getiterator(reader.readparagraph)
     for (basefile, url) in self.download_get_basefiles(iterator):
         try:
             if not os.path.exists(self.store.downloaded_path(basefile)):
                 self.download_single(basefile)
         except requests.exceptions.HTTPError as e:
             if e.response.status_code == 404:
                 # create a empty dummy file in order to
                 # avoid looking for it over and over again:
                 with open(self.store.downloaded_path(basefile), "w"):
                     pass
Esempio n. 6
0
File: rfc.py Progetto: zigit/ferenda
 def download(self, basefile=None):
     """Download rfcs starting from http://www.ietf.org/download/rfc-index.txt"""
     if basefile and self.document_url_template:
         return self.download_single(basefile)
     res = requests.get(self.start_url)
     indextext = res.text
     reader = TextReader(string=indextext,
                         linesep=TextReader.UNIX)  # see TextReader class
     iterator = reader.getiterator(reader.readparagraph)
     for (basefile, url) in self.download_get_basefiles(iterator):
         try:
             if not os.path.exists(self.store.downloaded_path(basefile)):
                 self.download_single(basefile)
         except requests.exceptions.HTTPError as e:
             if e.response.status_code == 404:
                 # create a empty dummy file in order to
                 # avoid looking for it over and over again:
                 with open(self.store.downloaded_path(basefile), "w"):
                     pass
Esempio n. 7
0
def testparser(testcase, parser, filename):
    """Helper function to test :py:class:`~ferenda.FSMParser` based parsers."""
    wantfilename = filename.replace(".txt", ".xml")
    if not os.path.exists(wantfilename) or 'FERENDA_FSMDEBUG' in os.environ:
        parser.debug = True

    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    b = parser.parse(tr.getiterator(tr.readparagraph))

    if 'FERENDA_FSMDEBUG' in os.environ:
        print(elements.serialize(b))
    testcase.maxDiff = 4096
    if os.path.exists(wantfilename):
        with codecs.open(wantfilename, encoding="utf-8") as fp:
            want = fp.read().strip()
        got = elements.serialize(b).strip()
        testcase.assertEqualXML(want, got)
    else:
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(b))
Esempio n. 8
0
def testparser(testcase, parser, filename):
    """Helper function to test :py:class:`~ferenda.FSMParser` based parsers."""
    wantfilename = filename.replace(".txt", ".xml")
    if not os.path.exists(wantfilename) or 'FERENDA_FSMDEBUG' in os.environ:
        parser.debug = True

    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    b = parser.parse(tr.getiterator(tr.readparagraph))

    if 'FERENDA_FSMDEBUG' in os.environ:
        print(elements.serialize(b))
    testcase.maxDiff = 4096
    if os.path.exists(wantfilename):
        with codecs.open(wantfilename, encoding="utf-8") as fp:
            want = fp.read().strip()
        got = elements.serialize(b).strip()
        testcase.assertEqualXML(want, got)
    else:
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(b))
Esempio n. 9
0
 def parametric_test(self, filename):
     resultfilename = filename.replace(".txt",".xml")
     debug = not os.path.exists(resultfilename)
     p, b = self.run_test_file(filename, debug)
     self.maxDiff = 4096
     if os.path.exists(resultfilename):
         with codecs.open(resultfilename,encoding="utf-8") as fp:
             result = fp.read().strip()
         # print(elements.serialize(b))
         if result != elements.serialize(b).strip():
             # re-run the parse but with debugging on
             print("============DEBUG OUTPUT================")
             p.debug = True
             tr=TextReader(filename,encoding="utf-8",linesep=TextReader.UNIX)
             b = p.parse(tr.getiterator(tr.readparagraph))
             print("===============RESULT===================")
             print(elements.serialize(b))
             self.fail("========See output above=======")
         else:
             self.assertEqual(result, elements.serialize(b).strip())
     else:
         print("\nResult:\n"+elements.serialize(b))
         self.fail()
Esempio n. 10
0
    def fsmparse(self, functionname, source):
        """Parse a list of text chunks using a named fsm parser and
        output the parse tree and final result to stdout.

        :param functionname: A function that returns a configured
                             :py:class:`~ferenda.FSMParser`
        :type  functionname: str
        :param source:       A file containing the text chunks, separated
                             by double newlines
        :type source:        str

        """
        modulename, classname, methodname = functionname.rsplit(".", 2)
        __import__(modulename)
        m = sys.modules[modulename]
        for name, cls in inspect.getmembers(m, inspect.isclass):
            if name == classname:
                break
        method = getattr(cls,methodname)
        parser = method()
        parser.debug = True
        tr = TextReader(source)
        b = parser.parse(tr.getiterator(tr.readparagraph))
        print(serialize(b))
Esempio n. 11
0
    def parse(self, doc):
        # some very simple heuristic rules for determining 
        # what an individual paragraph is
   
        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True
  
        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p
        
        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))
  
        # First paragraph of an RFC is always a header block 
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)
  
        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()
        
        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para])) 

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dct:title of the document and 
        # specify that it is in English
        desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

        # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
  
        # find and convert the publication date in the header to a datetime 
        # object, and set it as the dct:issued date for the document   
        re_date = re.compile("(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale(): 
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year,dt.month,dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dct'].issued, pubdate)
  
        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dct:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dct'].subject, cat_match.group(1))
            
        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs, 
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)
  
        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
# begin parse2                                   
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ",1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title, ordinal=ordinal, identifier=identifier)
                stack[1:] = [] # clear all but bottom element
                stack[0].append(s) # add new section to body
                stack.append(s)    # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[2:] = [] # clear all but bottom two elements
                stack[1].append(s) # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[3:] = [] # clear all but bottom three
                stack[-1].append(s) # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2                                   

# begin citation1                                   
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
# end citation1                                   

# begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                 uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                 uri += "#S" + parts['Sec']
            return uri
# end citation2                                   

# begin citation3
        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, 
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
Esempio n. 12
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal','decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha','upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman','upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc','circle','square','dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")
        
        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol,state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack: # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            else:
                pass
            return (constructor,newstate)
        
        # CONSTRUCTORS
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)
        setattr(make_body,'newstate','body')
        
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_section,'newstate','section')

        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsection,'newstate','subsection')

        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsubsection,'newstate','subsubsection')

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])

#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        def make_ol_decimal(parser):
            return make_orderedlist(parser,"decimal","ol-decimal")
        setattr(make_ol_decimal,'newstate','ol-decimal')

        def make_ol_alpha(parser):
            return make_orderedlist(parser,"lower-alpha", "ol-alpha")
        setattr(make_ol_alpha,'newstate','ol-alpha')

        def make_ol_roman(parser):
            return make_orderedlist(parser,"lower-roman", "ol-roman")
        setattr(make_ol_roman,'newstate','ol-romal')

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype,ordinal,separator,rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)
        setattr(make_listitem,'newstate','listitem')

        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-a")
        # setattr(make_state_a, 'newstate', 'state-a')

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-b")
        # setattr(make_state_b, 'newstate', 'state-b')

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-c")
        # setattr(make_state_c, 'newstate', 'state-c')
        
        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and 
                    len(list(filter(None,s.split(".")))))

        def make_orderedlist(parser,listtype,childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem,"listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match
        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None,chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match('^(\d+)([\.\)]) +',chunk)
            if m:
                if chunk.startswith("0"):
                    listtype="decimal-leading-zero"
                else:
                    listtype="decimal"
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            if chunk.startswith("* "):
                return ("disc",None,None,chunk)
            if chunk.startswith("- "):
                return ("dash",None,None,chunk)
                
            return (listtype,ordinal,separator,chunk) # None * 3

        
        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal,
                          is_li_roman, 
                          is_li_alpha,
                          is_header,
                          is_section,
                          is_subsection,
                          is_subsubsection,
                          is_preformatted,
                          is_definition,
                          is_description,
                          is_state_a,
                          is_state_b,
                          is_state_c,
                          is_paragraph)
        p.set_transitions({("body", is_paragraph): (make_paragraph, None),
                           ("body", is_section): (make_section,"section"),
                           ("body", is_state_a): (make_state_a, "state-a"),
                           ("state-a", is_state_b): (make_state_b, "state-b"),
                           ("state-b", is_state_c): (make_state_c, "state-c"),
                           ("state-c", is_section): (False, None),
                           ("section", is_paragraph): (make_paragraph, None),
                           ("section", is_subsection): (make_subsection, "subsection"),
                           ("subsection", is_paragraph): (make_paragraph,None),
                           ("subsection", is_subsection): (False,None),
                           ("subsection", is_state_a): (False,"body"), 
                           ("subsection", is_subsubsection): (make_subsubsection,"subsubsection"),
                           ("subsubsection", is_paragraph): (make_paragraph,None),
                           ("subsubsection", is_section): (False, None),
                           ("subsection", is_section): (False, None),
                           ("section", is_section): (False, None),
                           ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
                           ("ol-decimal",is_li_decimal):(make_listitem,"listitem"),
                           ("ol-decimal",is_li_alpha):(make_ol_alpha,"ol-alpha"),
                           ("ol-alpha",is_li_alpha):(make_listitem,"listitem"),
                           ("ol-alpha",is_li_roman):(make_ol_roman,"ol-roman"),
                           ("ol-roman",is_li_roman):(make_listitem,"listitem"),
                           ("ol-roman",is_li_alpha):(False,None),
                           ("ol-alpha",is_li_decimal):(False,None),
                           ("listitem",is_li_alpha):sublist_or_parent, 
                           ("listitem",is_li_roman):sublist_or_parent, 
                           ("listitem",is_li_decimal):sublist_or_parent, 
                           })

        p.debug = debug

        tr=TextReader(filename,encoding="utf-8",linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b
Esempio n. 13
0
File: rfc.py Progetto: zigit/ferenda
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x)
                           for x in reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(
            doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if isinstance(
                    part,
                    PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for document Note: The provided
        # basefile may be incorrect -- let whatever is in the header
        # override
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)
        # parse_header might have set .rdftype, but if not:
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        if not desc.getvalues(self.ns['dcterms'].identifier):
            desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dcterms'].title) !=
                           shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True
Esempio n. 14
0
    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
# Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three
                stack[-1].append(
                    s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
Esempio n. 15
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal', 'decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha', 'upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman', 'upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc', 'circle', 'square', 'dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")

        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol, state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack:  # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            else:
                pass
            return (constructor, newstate)

        # CONSTRUCTORS
        @newstate('body')
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)

        @newstate('section')
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])


#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        @newstate('ol-decimal')
        def make_ol_decimal(parser):
            return make_orderedlist(parser, "decimal", "ol-decimal")

        @newstate('ol-alpha')
        def make_ol_alpha(parser):
            return make_orderedlist(parser, "lower-alpha", "ol-alpha")

        @newstate('ol-roman')
        def make_ol_roman(parser):
            return make_orderedlist(parser, "lower-roman", "ol-roman")

        @newstate('listitem')
        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)

        # NOTE: no @newstate decorator for these -- we transition from
        # one state to the next, not push a new state onto the stack
        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-a")

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-b")

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-c")

        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and len(list(filter(None, s.split(".")))))

        def make_orderedlist(parser, listtype, childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None, chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match('^(\d+)([\.\)]) +', chunk)
            if m:
                if chunk.startswith("0"):
                    listtype = "decimal-leading-zero"
                else:
                    listtype = "decimal"
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            if chunk.startswith("* "):
                return ("disc", None, None, chunk)
            if chunk.startswith("- "):
                return ("dash", None, None, chunk)

            return (listtype, ordinal, separator, chunk)  # None * 3

        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal, is_li_roman, is_li_alpha, is_header,
                          is_section, is_subsection, is_subsubsection,
                          is_preformatted, is_definition, is_description,
                          is_state_a, is_state_b, is_state_c, is_paragraph)
        p.set_transitions({
            ("body", is_paragraph): (make_paragraph, None),
            ("body", is_section): (make_section, "section"),
            ("body", is_state_a): (make_state_a, "state-a"),
            ("state-a", is_state_b): (make_state_b, "state-b"),
            ("state-b", is_state_c): (make_state_c, "state-c"),
            ("state-c", is_section): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("subsection", is_paragraph): (make_paragraph, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_state_a): (False, "body"),
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_paragraph): (make_paragraph, None),
            ("subsubsection", is_section): (False, None),
            ("subsection", is_section): (False, None),
            ("section", is_section): (False, None),
            ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
            ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
            ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
            ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
            ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
            ("ol-roman", is_li_roman): (make_listitem, "listitem"),
            ("ol-roman", is_li_alpha): (False, None),
            ("ol-alpha", is_li_decimal): (False, None),
            ("listitem", is_li_alpha):
            sublist_or_parent,
            ("listitem", is_li_roman):
            sublist_or_parent,
            ("listitem", is_li_decimal):
            sublist_or_parent,
        })

        p.debug = debug

        tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b