Example 1
    def extract_head(self, fp, basefile):
        """Parsear ut det SFSR-registret som innehåller alla ändringar
        i lagtexten från HTML-filer"""

        # NB: We should really call self.store.register_path, but that
        # custom func isn't mocked by ferenda.testutil.RepoTester,
        # and downloaded_path is. So we call that one and munge it.
        filename = self.store.downloaded_path(basefile).replace(
            "/downloaded/", "/register/")
        with codecs.open(filename, encoding=self.source_encoding) as rfp:
            soup = bs4.BeautifulSoup(rfp.read(), "lxml")
        # do we really have a registry?
        notfound = soup.find(text="Sökningen gav ingen träff!")
        if notfound:
            raise InteExisterandeSFS(str(notfound))
        textheader = fp.read(2048)
        if not isinstance(textheader, str):
            # Depending on whether the fp is opened through standard
            # open() or bz2.BZ2File() in self.parse_open(), it might
            # return bytes or unicode strings. This seems to be a
            # problem in BZ2File (or in how we use it). Just roll with it.
            textheader = textheader.decode(self.source_encoding)
        idx = textheader.index("\r\n" * 4)
        fp.seek(idx + 8)
        reader = TextReader(string=textheader, linesep=TextReader.DOS)
        subreader = reader.getreader(reader.readchunk, reader.linesep * 4)
        return soup, subreader.getiterator(subreader.readparagraph)
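
TextReader is ferenda's cursor-based reader for plain text. The getreader/getiterator calls above carve the register header into chunks separated by runs of blank lines. A minimal standalone sketch of that chunking, using only the standard library (the real TextReader API is richer and may differ in detail):

    def iter_chunks(text, linesep="\r\n", blanklines=1):
        # Yield runs of text separated by `blanklines` blank lines. The code
        # above does the equivalent with getreader(readchunk, linesep * 4),
        # i.e. chunks separated by three blank lines.
        for chunk in text.split(linesep * (blanklines + 1)):
            if chunk.strip():
                yield chunk

    sample = "Header A\r\nline 2\r\n\r\nHeader B\r\nline 2"
    print(list(iter_chunks(sample)))
    # -> ['Header A\r\nline 2', 'Header B\r\nline 2']
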
Example 2
 def parametric_test(self, filename):
     resultfilename = filename.replace(".txt", ".xml")
     debug = not os.path.exists(resultfilename)
     p, b = self.run_test_file(filename, debug)
     self.maxDiff = 4096
     if os.path.exists(resultfilename):
         with codecs.open(resultfilename, encoding="utf-8") as fp:
             result = fp.read().strip()
         # print(elements.serialize(b))
         if result != elements.serialize(b).strip():
             # re-run the parse but with debugging on
             print("============DEBUG OUTPUT================")
             p.debug = True
             tr = TextReader(filename,
                             encoding="utf-8",
                             linesep=TextReader.UNIX)
             b = p.parse(tr.getiterator(tr.readparagraph))
             print("===============RESULT===================")
             print(elements.serialize(b))
             self.fail("========See output above=======")
         else:
             self.assertEqual(result, elements.serialize(b).strip())
     else:
         print("\nResult:\n" + elements.serialize(b))
         self.fail()
Example 3
 def extract_body(self, fp, basefile):
     bodystring = fp.read()
     # see comment in extract_head for why we must handle both
     # bytes- and str-files
     if not isinstance(bodystring, str):
         bodystring = bodystring.decode(self.source_encoding)
     reader = TextReader(string=bodystring, linesep=TextReader.DOS)
     reader.autostrip = True
     return reader
Example 4
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x) for x in
                           reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if isinstance(part, PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for the document. Note: the provided
        # basefile may be incorrect -- let whatever is in the header
        # override it
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.ns['rfc'].RFC)
        desc.value(self.ns['dct'].title, title, lang="en")
        self.parse_header(header, desc)
        if not desc.getvalues(self.ns['dct'].identifier):
            desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dct'].title) != shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
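
The overstrike cleanup is worth seeing in isolation: in old RFC renderings, bold was faked by printing a character, backspacing, and printing it again, so "R" became "R\bR". Deleting any character immediately followed by a backspace collapses such runs back to plain text:

    import re

    faux_bold = "R\bRF\bFC\bC 1149"      # "RFC" overstruck for faux-bold
    print(re.sub(".\b", "", faux_bold))  # -> "RFC 1149"
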
Example 5
    def download(self):
        self.log.debug("download: Start at %s" % self.start_url)
        indextext = requests.get(self.start_url).text
        reader = TextReader(string=indextext)  # see TextReader class
        iterator = reader.getiterator(reader.readparagraph)
        if not isinstance(self.config.downloadmax, (int, type(None))):
            self.config.downloadmax = int(self.config.downloadmax)

        for basefile in self.download_get_basefiles(iterator):
            self.download_single(basefile)
Example 6
 def _find_uppdaterad_tom(self, sfsnr, filename=None, reader=None):
     if not reader:
         reader = TextReader(filename, encoding=self.source_encoding)
     try:
         reader.cue("&Auml;ndring inf&ouml;rd:<b> t.o.m. SFS")
         line = reader.readline()
         m = re.search(r'(\d+:\s?\d+)', line)
         if m:
             return m.group(1)
         else:
             # if m is None, the SFS id is using a non-standard
             # formatting (eg 1996/613-first-version) -- interpret
             # it as if it didn't exist
             return sfsnr
     except IOError:
         return sfsnr  # the base SFS nr
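
TextReader.cue positions the cursor at a marker string and raises IOError when the marker is missing, which the except clause above turns into "return the base SFS number". A rough standard-library equivalent of the happy path (the HTML fragment is made up for illustration):

    import re

    html = '&Auml;ndring inf&ouml;rd:<b> t.o.m. SFS 2018:218</b>'
    marker = "&Auml;ndring inf&ouml;rd:<b> t.o.m. SFS"
    rest = html[html.index(marker) + len(marker):]  # .index raises ValueError, not IOError
    m = re.search(r'(\d+:\s?\d+)', rest.splitlines()[0])
    print(m.group(1) if m else None)  # -> 2018:218
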
Example 7
 def parametric_test(self, filename):
      # these options adjust the constructed URIs. By default, the
      # official rpubl URIs are minted.
     #
     # self.repo.config.localizeuri = True
     # self.repo.config.url = "http://example.org/"
     # self.repo.config.urlpath = ''
      # a few of the subclasses have specialized rules; make sure we
      # instantiate the correct class
     repo = os.path.basename(filename).split("-")[0]
     basefile = os.path.splitext(os.path.basename(filename))[0].replace(
         "-", "/", 1).replace("-", ":")
     repoclass = self.aliases[repo]
     self.repo = repoclass(
         datadir=self.datadir,
         storelocation=self.datadir + "/ferenda.sqlite",
         indexlocation=self.datadir + "/whoosh",
     )
     doc = self.repo.make_document(basefile)
     text = self.repo.sanitize_text(util.readfile(filename), basefile)
     reader = TextReader(string=text, encoding='utf-8')
     self.repo.parse_metadata_from_textreader(reader, doc)
     wantfile = filename.replace(".txt", ".n3")
     if os.path.exists(wantfile):
         self.assertEqualGraphs(wantfile, doc.meta, exact=False)
     else:
         self.fail(
             "Expected a %s with the following content:\n\n%s" %
             (wantfile, doc.meta.serialize(format="n3").decode("utf-8")))
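
Both the repo alias and the basefile are encoded in the test filename. Concretely, with a hypothetical file name:

    import os

    filename = "test/files/metadata/sfs-1998-204.txt"  # hypothetical example
    repo = os.path.basename(filename).split("-")[0]
    basefile = os.path.splitext(os.path.basename(filename))[0].replace(
        "-", "/", 1).replace("-", ":")
    print(repo, basefile)  # -> sfs sfs/1998:204
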
Example 8
    def parametric_test(self, filename):
        # these options adjust the constructed URIs. By default, the
        # official rpubl URIs are minted.
        #
        # self.repo.config.localizeuri = True
        # self.repo.config.url = "http://example.org/"
        # self.repo.config.urlpath = ''
        # a few of the subclasses have specialized rules; make sure we
        # instantiate the correct class
        repo, basefile = self.parse_filename(filename)
        doc = repo.make_document(basefile)
        text = repo.sanitize_text(util.readfile(filename), basefile)
        reader = TextReader(string=text, encoding='utf-8')
        props = repo.extract_metadata(reader, basefile)
        props = repo.sanitize_metadata(props, basefile)
        resource = repo.polish_metadata(props, basefile)
        repo.infer_metadata(resource, basefile)

        wantfile = filename.replace(".txt", ".n3")
        if os.path.exists(wantfile):
            self.assertEqualGraphs(wantfile, resource.graph, exact=False)
        else:
            self.fail(
                "Expected a %s with the following content:\n\n%s" %
                (wantfile, resource.graph.serialize(format="n3").decode("utf-8")))
Example 9
 def extract_body(self, fp, basefile):
     rawtext = fp.read().decode(self.source_encoding)
     # remove whitespace on otherwise empty lines
     rawtext = re.sub("\n\t\n", "\n\n", rawtext)
     reader = TextReader(string=rawtext,
                         linesep=TextReader.UNIX)
     return reader
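
Note that the substitution only removes lines consisting of exactly one tab; a whitespace-only line with any other content is left alone. Shown in isolation:

    import re

    raw = "first paragraph\n\t\nsecond paragraph"
    print(repr(re.sub("\n\t\n", "\n\n", raw)))
    # -> 'first paragraph\n\nsecond paragraph'
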
Example 10
 def find_version(f):
     # need to look at the file to find out its version
     encoding = self._sniff_encoding(f)
     raw = open(f, 'rb').read(8000)
     text = unescape(raw.decode(encoding, errors="replace"))
     reader = TextReader(string=text)
     updated_to = self._find_uppdaterad_tom(basefile, reader=reader)
     return updated_to
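
_sniff_encoding is defined elsewhere in the class; a minimal stand-in (an assumption about its behavior, not ferenda's actual logic) might look for a charset declaration near the top of the file and fall back to a default:

    import re

    def sniff_encoding(path, default="iso-8859-1"):
        # Hypothetical stand-in for self._sniff_encoding: scan the first
        # couple of kilobytes for an HTML charset declaration.
        with open(path, "rb") as fh:
            head = fh.read(2048)
        m = re.search(rb'charset=["\']?([\w-]+)', head)
        return m.group(1).decode("ascii") if m else default
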
Example 11
 def extract_body(self, fp, basefile):
     rawtext = fp.read()
     if isinstance(rawtext, bytes): # happens when creating the intermediate file
         rawtext = rawtext.decode(self.source_encoding)
     # remove whitespace on otherwise empty lines
     rawtext = re.sub("\n\t\n", "\n\n", rawtext)
     reader = TextReader(string=rawtext,
                         linesep=TextReader.UNIX)
     return reader
Example 12
 def download(self, basefile=None):
     """Download rfcs starting from http://www.ietf.org/download/rfc-index.txt"""
     if basefile and self.document_url_template:
         return self.download_single(basefile)
     res = requests.get(self.start_url)
     indextext = res.text
     reader = TextReader(string=indextext, linesep=TextReader.UNIX)  # see TextReader class
     iterator = reader.getiterator(reader.readparagraph)
     for (basefile, url) in self.download_get_basefiles(iterator):
         try:
             if not os.path.exists(self.store.downloaded_path(basefile)):
                 self.download_single(basefile)
         except requests.exceptions.HTTPError as e:
             if e.response.status_code == 404:
                 # create an empty dummy file in order to
                 # avoid looking for it over and over again:
                 with open(self.store.downloaded_path(basefile), "w"):
                     pass
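
download_get_basefiles is implemented elsewhere; given the layout of rfc-index.txt, where each entry paragraph starts with a zero-padded four-digit RFC number, a plausible sketch (names and the regex are assumptions) is:

    import re

    def download_get_basefiles_sketch(paragraphs):
        # Each index entry looks like "0001 Host Software. S. Crocker. ..."
        for para in paragraphs:
            m = re.match(r"(\d{4}) ", para)
            if m:
                basefile = m.group(1).lstrip("0")
                yield basefile, "http://www.ietf.org/rfc/rfc%s.txt" % basefile
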
Example 13
def testparser(testcase, parser, filename):
    """Helper function to test :py:class:`~ferenda.FSMParser` based parsers."""
    wantfilename = filename.replace(".txt", ".xml")
    if not os.path.exists(wantfilename) or 'FERENDA_FSMDEBUG' in os.environ:
        parser.debug = True

    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    b = parser.parse(tr.getiterator(tr.readparagraph))

    if 'FERENDA_FSMDEBUG' in os.environ:
        print(elements.serialize(b))
    testcase.maxDiff = 4096
    if os.path.exists(wantfilename):
        with codecs.open(wantfilename, encoding="utf-8") as fp:
            want = fp.read().strip()
        got = elements.serialize(b).strip()
        testcase.assertEqualXML(want, got)
    else:
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(b))
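
Hypothetical usage from a test suite (make_section_parser is an invented factory name; any callable returning a configured ferenda.FSMParser works):

    import unittest

    class TestSections(unittest.TestCase):
        def test_sections(self):
            parser = make_section_parser()  # hypothetical parser factory
            testparser(self, parser, "test/files/fsmparser/sections.txt")
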
Example 14
    def parse_basefile(self, basefile):
        # create a Document instance with an initialized doc.meta RDFLib graph
        doc = self.make_document()
        intermediate_path = self.generic_path(basefile, 'intermediate', '.txt')
        downloaded_path = self.downloaded_path(basefile)
        doc.uri = self.canonical_uri(basefile)
        doc.lang = "sv"
        html = codecs.open(downloaded_path, encoding="iso-8859-1").read()
        header_chunk = util.extract_text(
            html, '<pre>\n   <pre>', '<hr>', strip_tags=False)
        self.make_meta(header_chunk, doc.meta, doc.uri, basefile)
        util.writefile(intermediate_path, util.extract_text(
            html, '<pre>', '</pre>'), encoding="utf-8")
        reader = TextReader(intermediate_path, encoding="utf-8")
        reader.readparagraph()
        self.make_body(reader, doc.body)

        # Iterate through body tree and find things to link to (See
        # EurlexTreaties.process_body for inspiration)
        self.process_body(doc.body, '', doc.uri)
        return doc
Example 15
 def extract_body(self, fp, basefile):
     if util.name_from_fp(fp).endswith((".txt", ".txt.bz2")):
         bodystring = fp.read()
         if isinstance(bodystring, bytes):
             # fp is opened in bytestream mode
             bodystring = bodystring.decode("utf-8")
         return TextReader(string=bodystring)
     else:
         reader = super(PropTrips, self).extract_body(fp, basefile)
         pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
         for page in reader:
             page.src = pdffile
         return reader
Example 16
 def make_document(self, basefile=None):
     doc = super(SFS, self).make_document(basefile)
     if basefile:  # toc_generate_page calls this w/o basefile
         # We need to get the uppdaterad_tom field to create a proper
          # URI. First create a throwaway reader and make sure we have
          # the intermediate file ready
         # FIXME: this is broken
         fp = self.downloaded_to_intermediate(basefile)
         t = TextReader(string=fp.read(2048))
         fp.close()
         uppdaterad_tom = self._find_uppdaterad_tom(basefile, reader=t)
         doc.uri = self.canonical_uri(basefile, uppdaterad_tom)
     return doc
Example 17
    def parametric_test(self, filename):
        self.maxDiff = None
        reader = TextReader(filename=filename, encoding='iso-8859-1',
                            linesep=TextReader.DOS)
        reader.autostrip = True
        # p.lagrum_parser = FakeParser()
        parser = self.p.get_parser("9999:998", reader)
        b = parser(reader)
        elements = self.p._count_elements(b)

        # FIXME: How was this used? Where should we plug
        # skipfragments?
        if 'K' in elements and elements['K'] > 1 and elements['P1'] < 2:
            self.p.skipfragments = [
                ('rinfoex:avdelningnummer', 'rpubl:kapitelnummer'),
                ('rpubl:kapitelnummer', 'rpubl:paragrafnummer')]
        else:
            self.p.skipfragments = [('rinfoex:avdelningnummer',
                                     'rpubl:kapitelnummer')]

        # NB: _construct_ids won't look for references
        self.p.visit_node(b, self.p.construct_id, {'basefile': '9999:998',
                                                   'uris': set()})
        self.p.visit_node(b, self.p.find_definitions, False, debug=False)
        self.p.lagrum_parser.parse_recursive(b)
        self._remove_uri_for_testcases(b)
        resultfilename = filename.replace(".txt", ".xml")
        if os.path.exists(resultfilename):
            with codecs.open(resultfilename, encoding="utf-8") as fp:
                result = fp.read().strip()
            self.assertEqual(result, serialize(b).strip())
        else:
            self.assertEqual("", serialize(b).strip())
        # reset the state of the repo...
        self.p.current_section = '0'
        self.p.current_headline_level = 0
Example 18
    def fsmparse(self, functionname, source):
        """Parse a list of text chunks using a named fsm parser and
        output the parse tree and final result to stdout.

        :param functionname: A function that returns a configured
                             :py:class:`~ferenda.FSMParser`
        :type  functionname: str
        :param source:       A file containing the text chunks, separated
                             by double newlines
        :type source:        str

        """
        modulename, classname, methodname = functionname.rsplit(".", 2)
        __import__(modulename)
        m = sys.modules[modulename]
        for name, cls in inspect.getmembers(m, inspect.isclass):
            if name == classname:
                break
        method = getattr(cls, methodname)
        parser = method()
        parser.debug = True
        tr = TextReader(source)
        b = parser.parse(tr.getiterator(tr.readparagraph))
        print(serialize(b))
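
The module/class/method lookup above can be written more directly with importlib; a sketch of the equivalent resolution step:

    import importlib

    def resolve(functionname):
        # "pkg.module.Class.method" -> the method object
        modulename, classname, methodname = functionname.rsplit(".", 2)
        module = importlib.import_module(modulename)
        return getattr(getattr(module, classname), methodname)
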
Example 19
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if not f.startswith("downloaded/sfs"):  # sfst or sfsr
                continue
            for regex in self.templ:
                m = re.match(regex, f)
                if not m:
                    continue
                if "vcheck" in m.groupdict():  # silently ignore
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version
                # text = t.extractfile(f).read(4000).decode("latin-1")
                text = open(f, 'rb').read(4000).decode("latin-1")
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    version = None
                    current += 1
                    de = DocumentEntry()
                    de.basefile = basefile
                    de.id = self.canonical_uri(basefile, updated_to)
                    # fudge timestamps best as we can
                    de.orig_created = datetime.fromtimestamp(
                        os.path.getctime(f))
                    de.orig_updated = datetime.fromtimestamp(
                        os.path.getmtime(f))
                    de.orig_updated = datetime.now()
                    de.orig_url = self.document_url_template % locals()
                    de.published = datetime.now()
                    de.url = self.generated_url(basefile)
                    de.title = "SFS %s" % basefile
                    # de.set_content()
                    # de.set_link()
                    de.save(self.store.documententry_path(basefile))
                # this yields more reasonable basefiles, but they are not
                # backwards compatible -- skip them for now
                # basefile = basefile.replace("_", "").replace(".", "")
                if "type" in m.groupdict() and m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile)
                    current -= 1  # to offset the previous increment
                else:
                    dest = self.store.downloaded_path(basefile, version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions" %
            (current, archived))
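
The patterns in self.templ live elsewhere in the class; the group names used above (byear, bnum, vyear, vnum, vcheck, type) imply patterns roughly along these lines (an illustration, not the project's actual template):

    import re

    templ_example = (r"downloaded/sfs/(?P<type>sfst|sfsr)/"
                     r"(?P<byear>\d+)/(?P<bnum>\d+)"
                     r"(?:-(?P<vyear>\w+)-?(?P<vnum>\d*))?\.html")
    m = re.match(templ_example, "downloaded/sfs/sfst/1998/204-2010-110.html")
    print("%s:%s" % (m.group("byear"), m.group("bnum")))  # -> 1998:204
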
Example 20
    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # metadata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string of the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three
                stack[-1].append(
                    s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
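
A quick standalone check of the citation grammar defined above, run against a made-up sentence rather than a full document body:

    from pyparsing import CaselessLiteral, Word, nums

    section_citation = (CaselessLiteral("section") +
                        Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")

    hit = section_rfc_citation.searchString("See section 3.1 of [RFC2616].")[0]
    print(hit["Sec"], hit["RFC"])  # -> 3.1 2616
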
Example 21
 def downloaded_to_intermediate(self, basefile):
      # Check to see if this might not be a proper SFS at all
      # (from time to time, other agencies publish their stuff
      # in SFS -- this seems to be handled by giving those
      # documents an SFS number of the form "N1992:31"). Filter
      # these out.
     if basefile.startswith('N'):
         raise IckeSFS("%s is not a regular SFS" % basefile)
     filename = self.store.downloaded_path(basefile)
     try:
         t = TextReader(filename, encoding=self.source_encoding)
     except IOError:
         self.log.warning("%s: Fulltext is missing" % basefile)
         # FIXME: This code needs to be rewritten
         baseuri = self.canonical_uri(basefile)
         if baseuri in registry:
             title = registry[baseuri].value(URIRef(baseuri),
                                             self.ns['dcterms'].title)
             desc.value(self.ns['dcterms'].title, title)
         desc.rel(self.ns['dcterms'].publisher,
                  self.lookup_resource("Regeringskansliet"))
         desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
         doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')])
     # Check to see if the Författning has been revoked (using
     # plain fast string searching, no fancy HTML parsing and
     # traversing)
     if not self.config.keepexpired:
         try:
             t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
             datestr = t.readto('</i></b>')
             if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                 self.log.debug('%s: Expired' % basefile)
                 raise UpphavdForfattning(
                     "%s is an expired SFS" % basefile,
                     dummyfile=self.store.parsed_path(basefile))
             t.seek(0)
         except IOError:
             t.seek(0)
     t.cuepast('<pre>')
     # remove &auml; et al
     try:
         # this is the preferred way from py34 onwards. FIXME: Move
         # this to ferenda.compat
         import html
         txt = html.unescape(t.readto('</pre>'))
     except ImportError:
         # this is the old way.
         hp = HTMLParser()
         txt = hp.unescape(t.readto('</pre>'))
     if '\r\n' not in txt:
         txt = txt.replace('\n', '\r\n')
     re_tags = re.compile("</?\w{1,3}>")
     txt = re_tags.sub('', txt)
      # an ending CRLF helps produce better diffs
     txt += "\r\n"
     util.writefile(self.store.intermediate_path(basefile),
                    txt,
                    encoding=self.source_encoding)
     return codecs.open(self.store.intermediate_path(basefile),
                        encoding=self.source_encoding)
Example 22
File: rfc.py (project zigit/ferenda)
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x)
                           for x in reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(
            doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if isinstance(
                    part,
                    PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for the document. Note: the provided
        # basefile may be incorrect -- let whatever is in the header
        # override it
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)
        # parse_header might have set .rdftype, but if not:
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        if not desc.getvalues(self.ns['dcterms'].identifier):
            desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dcterms'].title) !=
                           shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True
Example 23
    def parse(self, doc):
        # some very simple heuristic rules for determining 
        # what an individual paragraph is
   
        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True
  
        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p
        
        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))
  
        # First paragraph of an RFC is always a header block 
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)
  
        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # metadata -- once is enough
        title = reader.readparagraph()
        
        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para])) 

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dct:title of the document and 
        # specify that it is in English
        desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

        # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
  
        # find and convert the publication date in the header to a datetime 
        # object, and set it as the dct:issued date for the document   
        re_date = re.compile("(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string of the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale(): 
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year,dt.month,dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dct'].issued, pubdate)
  
        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dct:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dct'].subject, cat_match.group(1))
            
        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs, 
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)
  
        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
# begin parse2                                   
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ",1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title, ordinal=ordinal, identifier=identifier)
                stack[1:] = [] # clear all but bottom element
                stack[0].append(s) # add new section to body
                stack.append(s)    # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[2:] = [] # clear all but bottom two elements
                stack[1].append(s) # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[3:] = [] # clear all but bottom three
                stack[-1].append(s) # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2                                   

# begin citation1                                   
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
# end citation1                                   

# begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2                                   

# begin citation3
        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, 
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
Example 24
 def downloaded_to_intermediate(self, basefile, attachment=None):
      # Check to see if this might not be a proper SFS at all
      # (from time to time, other agencies publish their stuff
      # in SFS -- this seems to be handled by giving those
      # documents an SFS number of the form "N1992:31"). Filter
      # these out.
     if basefile.startswith('N'):
         raise IckeSFS("%s is not a regular SFS" % basefile)
     filename = self.store.downloaded_path(basefile)
     try:
         t = TextReader(filename, encoding=self.source_encoding)
     except IOError:
         self.log.warning("%s: Fulltext is missing" % basefile)
         # FIXME: This code needs to be rewritten
         baseuri = self.canonical_uri(basefile)
         if baseuri in registry:
             title = registry[baseuri].value(URIRef(baseuri),
                                             self.ns['dcterms'].title)
             desc.value(self.ns['dcterms'].title, title)
         desc.rel(self.ns['dcterms'].publisher,
                  self.lookup_resource("Regeringskansliet"))
         desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
         doc.body = Forfattning([Stycke(['Lagtext saknas'],
                                        id='S1')])
     # Check to see if the Författning has been revoked (using
     # plain fast string searching, no fancy HTML parsing and
     # traversing)
     if not self.config.keepexpired:
         try:
             t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
             datestr = t.readto('</i></b>')
             if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                 self.log.debug('%s: Expired' % basefile)
                 raise UpphavdForfattning("%s is an expired SFS" % basefile,
                                          dummyfile=self.store.parsed_path(basefile))
             t.seek(0)
         except IOError:
             t.seek(0)
     t.cuepast('<pre>')
     # remove &auml; et al
     try:
         # this is the preferred way from py34 onwards. FIXME: Move
         # this to ferenda.compat
         import html
         txt = html.unescape(t.readto('</pre>'))
     except ImportError:
         # this is the old way.
         hp = HTMLParser()
         txt = hp.unescape(t.readto('</pre>'))
     if '\r\n' not in txt:
         txt = txt.replace('\n', '\r\n')
      re_tags = re.compile(r"</?\w{1,3}>")
     txt = re_tags.sub('', txt)
      # an ending CRLF helps produce better diffs
     txt += "\r\n"
     util.writefile(self.store.intermediate_path(basefile), txt,
                    encoding=self.source_encoding)
     return codecs.open(self.store.intermediate_path(basefile),
                        encoding=self.source_encoding)
Example 25
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal','decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha','upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman','upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc','circle','square','dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")
        
        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
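        # sublist_or_parent is a callable transition: given the matched
        # symbol and the current state stack, it either opens a nested
        # sublist (a constructor plus a new state) or returns
        # (False, None), closing the current list item so the parent
        # list can handle the chunk instead.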
        def sublist_or_parent(symbol,state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack: # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_decimal
                newstate = "ol-decimal"
            else:
                pass
            return (constructor,newstate)
        
        # CONSTRUCTORS
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)
        setattr(make_body,'newstate','body')
        
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_section,'newstate','section')

        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsection,'newstate','subsection')

        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsubsection,'newstate','subsubsection')

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])

#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        def make_ol_decimal(parser):
            return make_orderedlist(parser,"decimal","ol-decimal")
        setattr(make_ol_decimal,'newstate','ol-decimal')

        def make_ol_alpha(parser):
            return make_orderedlist(parser,"lower-alpha", "ol-alpha")
        setattr(make_ol_alpha,'newstate','ol-alpha')

        def make_ol_roman(parser):
            return make_orderedlist(parser,"lower-roman", "ol-roman")
        setattr(make_ol_roman,'newstate','ol-roman')

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype,ordinal,separator,rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)
        setattr(make_listitem,'newstate','listitem')

        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-a")
        # setattr(make_state_a, 'newstate', 'state-a')

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-b")
        # setattr(make_state_b, 'newstate', 'state-b')

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-c")
        # setattr(make_state_c, 'newstate', 'state-c')
        
        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and 
                    len(list(filter(None,s.split(".")))))
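        # NB: section_segments_count returns False (not 0) when s is
        # None; False compares unequal to 1, 2 and 3, so the is_*
        # recognizers above still evaluate correctly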

        def make_orderedlist(parser,listtype,childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem,"listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match
        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None,chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match(r'^(\d+)([\.\)]) +',chunk)
            if m:
                if chunk.startswith("0"):
                    listtype="decimal-leading-zero"
                else:
                    listtype="decimal"
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            if chunk.startswith("* "):
                return ("disc",None,None,chunk)
            if chunk.startswith("- "):
                return ("dash",None,None,chunk)
                
            return (listtype,ordinal,separator,chunk) # None * 3
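        # e.g. analyze_listitem("1. Foo")   -> ("decimal", "1", ".", "Foo")
        #      analyze_listitem("vii) bar") -> ("lower-roman", "vii", ")", "bar")
        #      analyze_listitem("- baz")    -> ("dash", None, None, "- baz")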

        
        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal,
                          is_li_roman, 
                          is_li_alpha,
                          is_header,
                          is_section,
                          is_subsection,
                          is_subsubsection,
                          is_preformatted,
                          is_definition,
                          is_description,
                          is_state_a,
                          is_state_b,
                          is_state_c,
                          is_paragraph)
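        # transition table: (current state, recognizer) -> (constructor,
        # new state); a False constructor ends the current state so the
        # chunk is handed back to the parent state's transitions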
        p.set_transitions({("body", is_paragraph): (make_paragraph, None),
                           ("body", is_section): (make_section,"section"),
                           ("body", is_state_a): (make_state_a, "state-a"),
                           ("state-a", is_state_b): (make_state_b, "state-b"),
                           ("state-b", is_state_c): (make_state_c, "state-c"),
                           ("state-c", is_section): (False, None),
                           ("section", is_paragraph): (make_paragraph, None),
                           ("section", is_subsection): (make_subsection, "subsection"),
                           ("subsection", is_paragraph): (make_paragraph,None),
                           ("subsection", is_subsection): (False,None),
                           ("subsection", is_state_a): (False,"body"), 
                           ("subsection", is_subsubsection): (make_subsubsection,"subsubsection"),
                           ("subsubsection", is_paragraph): (make_paragraph,None),
                           ("subsubsection", is_section): (False, None),
                           ("subsection", is_section): (False, None),
                           ("section", is_section): (False, None),
                           ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
                           ("ol-decimal",is_li_decimal):(make_listitem,"listitem"),
                           ("ol-decimal",is_li_alpha):(make_ol_alpha,"ol-alpha"),
                           ("ol-alpha",is_li_alpha):(make_listitem,"listitem"),
                           ("ol-alpha",is_li_roman):(make_ol_roman,"ol-roman"),
                           ("ol-roman",is_li_roman):(make_listitem,"listitem"),
                           ("ol-roman",is_li_alpha):(False,None),
                           ("ol-alpha",is_li_decimal):(False,None),
                           ("listitem",is_li_alpha):sublist_or_parent, 
                           ("listitem",is_li_roman):sublist_or_parent, 
                           ("listitem",is_li_decimal):sublist_or_parent, 
                           })

        p.debug = debug

        tr=TextReader(filename,encoding="utf-8",linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b
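
Esempio n. 32 attaches the follow-on parser state to each constructor via setattr(func, 'newstate', name); Esempio n. 33 below expresses the same thing with a @newstate decorator that the snippet itself never defines. A minimal sketch consistent with the setattr calls (the decorator is an assumption, not shown in the original source):

    # hypothetical @newstate decorator, equivalent to the setattr
    # calls in the previous example
    def newstate(name):
        def decorator(func):
            func.newstate = name
            return func
        return decorator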
Esempio n. 33
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal', 'decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha', 'upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman', 'upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc', 'circle', 'square', 'dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")

        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol, state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack:  # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_decimal
                newstate = "ol-decimal"
            else:
                pass
            return (constructor, newstate)

        # CONSTRUCTORS
        @newstate('body')
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)

        @newstate('section')
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])


#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        @newstate('ol-decimal')
        def make_ol_decimal(parser):
            return make_orderedlist(parser, "decimal", "ol-decimal")

        @newstate('ol-alpha')
        def make_ol_alpha(parser):
            return make_orderedlist(parser, "lower-alpha", "ol-alpha")

        @newstate('ol-roman')
        def make_ol_roman(parser):
            return make_orderedlist(parser, "lower-roman", "ol-roman")

        @newstate('listitem')
        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)

        # NOTE: no @newstate decorator for these -- we transition from
        # one state to the next, not push a new state onto the stack
        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-a")

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-b")

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-c")

        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and len(list(filter(None, s.split(".")))))

        def make_orderedlist(parser, listtype, childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None, chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match(r'^(\d+)([\.\)]) +', chunk)
            if m:
                if chunk.startswith("0"):
                    listtype = "decimal-leading-zero"
                else:
                    listtype = "decimal"
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            if chunk.startswith("* "):
                return ("disc", None, None, chunk)
            if chunk.startswith("- "):
                return ("dash", None, None, chunk)

            return (listtype, ordinal, separator, chunk)  # None * 3

        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal, is_li_roman, is_li_alpha, is_header,
                          is_section, is_subsection, is_subsubsection,
                          is_preformatted, is_definition, is_description,
                          is_state_a, is_state_b, is_state_c, is_paragraph)
        p.set_transitions({
            ("body", is_paragraph): (make_paragraph, None),
            ("body", is_section): (make_section, "section"),
            ("body", is_state_a): (make_state_a, "state-a"),
            ("state-a", is_state_b): (make_state_b, "state-b"),
            ("state-b", is_state_c): (make_state_c, "state-c"),
            ("state-c", is_section): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("subsection", is_paragraph): (make_paragraph, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_state_a): (False, "body"),
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_paragraph): (make_paragraph, None),
            ("subsubsection", is_section): (False, None),
            ("subsection", is_section): (False, None),
            ("section", is_section): (False, None),
            ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
            ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
            ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
            ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
            ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
            ("ol-roman", is_li_roman): (make_listitem, "listitem"),
            ("ol-roman", is_li_alpha): (False, None),
            ("ol-alpha", is_li_decimal): (False, None),
            ("listitem", is_li_alpha):
            sublist_or_parent,
            ("listitem", is_li_roman):
            sublist_or_parent,
            ("listitem", is_li_decimal):
            sublist_or_parent,
        })

        p.debug = debug

        tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b
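
A hedged usage sketch for either version of run_test_file; the sample file and the surrounding unittest-style class are assumptions, while run_test_file and elements.serialize come from the snippets themselves:

    # hypothetical driver, called from within the test class
    sample = ("1 Introduction\n\n"
              "A paragraph belonging to the section.\n\n"
              "1.1 Background\n\n"
              "Text inside the subsection.\n")
    with open("sample.txt", "w", encoding="utf-8") as f:
        f.write(sample)
    p, b = self.run_test_file("sample.txt", debug=True)
    print(elements.serialize(b))  # inspect the resulting element tree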