Esempio n. 1
0
 def postprocess_doc(self, doc):
     """Harvest title/decision date from ``doc.body`` and rebuild the body.

     Walks every item produced by ``doc.body.textboxes(...)`` (with a glue
     function that never merges textboxes, and with ``pageobjects=True``)
     and:

     * treats the item following a "Kommittédirektiv" item as the
       document title and records it as ``DCTERMS.title`` in ``doc.meta``;
     * parses the date following the "Beslut vid regeringssammanträde
       den " prefix with ``self.parse_swedish_date`` and records it as
       ``DCTERMS.issued`` (typed as ``XSD.date``);
     * copies every item to a new ``Body``, replacing ``Page`` instances
       with ``Sidbrytning`` markers carrying the page's number, size and
       source.

     When the walk is finished, ``doc.body`` is replaced by the new body.

     :param doc: document object exposing ``body``, ``meta`` and ``uri``.
     :returns: None; ``doc`` is modified in place.
     """
     title_marker = "Kommittédirektiv"
     decision_prefix = "Beslut vid regeringssammanträde den "

     def never_glue(x, y, z):
         # Always refuse to merge, so each textbox is seen on its own.
         return False

     next_is_title = False
     newbody = Body()
     for para in doc.body.textboxes(gluefunc=never_glue, pageobjects=True):
         strpara = str(para).strip()
         if strpara == title_marker:
             next_is_title = True
         elif next_is_title:
             doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(strpara)))
             next_is_title = False
         elif strpara.startswith(decision_prefix):
             # Derive the offset from the prefix itself rather than
             # hard-coding its length.
             datestr = strpara[len(decision_prefix):]
             if datestr.endswith("."):
                 datestr = datestr[:-1]
             doc.meta.add((URIRef(doc.uri), DCTERMS.issued,
                           Literal(self.parse_swedish_date(datestr),
                                   datatype=XSD.date)))
         if isinstance(para, Page):
             newbody.append(Sidbrytning(ordinal=para.number,
                                        width=para.width,
                                        height=para.height,
                                        src=para.src))
         else:
             newbody.append(para)
     # Swap in the rebuilt body once, after the walk is complete. Doing
     # this inside the loop re-assigned it on every iteration and skipped
     # the replacement entirely when there was nothing to iterate over.
     doc.body = newbody
Esempio n. 2
0
 def htmlparser(chunks):
     """Convert an iterable of HTML elements into a ``Body``.

     Each element's text nodes are joined and passed through
     ``util.normalize_space``. Elements named "pre" become
     ``Preformatted``, all others become ``Paragraph``. Elements whose
     normalized text is empty are not added. Every element is extracted
     from its tree as it is processed.

     :param chunks: iterable of elements exposing ``name``,
         ``findAll(text=True)`` and ``extract()``.
     :returns: a ``Body`` holding one element per non-empty chunk.
     """
     body = Body()
     for chunk in chunks:
         if chunk.name == "pre":
             element_class = Preformatted
         else:
             element_class = Paragraph
         text_nodes = chunk.findAll(text=True)
         text = util.normalize_space(''.join(text_nodes))
         # Detach the element so that it is not encountered again.
         chunk.extract()
         if not text:
             continue
         body.append(element_class([text]))
     return body
Esempio n. 3
0
 def htmlparser(chunks):
     """Build a ``Body`` of text elements from parsed HTML chunks.

     For every chunk, the concatenated text nodes are run through
     ``util.normalize_space``; a non-empty result is wrapped in
     ``Preformatted`` (for chunks named "pre") or ``Paragraph`` (for
     anything else) and appended to the result. Each chunk is extracted
     from its parent tree after its text has been read.

     :param chunks: iterable of elements with ``name``, ``findAll`` and
         ``extract``.
     :returns: the populated ``Body``.
     """
     result = Body()
     for element in chunks:
         wrapper = Paragraph
         if element.name == "pre":
             wrapper = Preformatted
         joined = ''.join(element.findAll(text=True))
         cleaned = util.normalize_space(joined)
         element.extract()  # remove it so later passes do not see it
         if cleaned:
             result.append(wrapper([cleaned]))
     return result
Esempio n. 4
0
    def test_serialize_roundtrip(self):
        # Create a elements object tree
        tree = Body([
            Section([Paragraph(["Hello"]),
                     Paragraph(["World"])],
                    ordinal="1",
                    title="Main section"),
            Section([
                42,
                date(2013, 11, 27),
                datetime(2013, 11, 27, 12, 0, 0), b'bytestring', {
                    'foo': 'bar',
                    'x': 'y'
                }
            ],
                    ordinal=2,
                    title="Native types")
        ])
        # roundtrip using the default XML format
        serialized = serialize(tree)
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, caller_globals=globals())
        self.assertEqual(tree, newtree)

        # make another section with special (but commonly used) types
        # and try to roundtrip them. The XML serialization format does
        # not support this.
        graph = Graph().parse(
            data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""",
            format="turtle")
        parseresult = urlparser.parseString("http://example.org/1")
        tree.append(Section([parseresult, graph], meta=graph))

        # roundtrip using JSON (which uses fully qualified classnames,
        # so we don't need to pass globals() into deserialize()
        serialized = serialize(tree, format="json")
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, format="json")

        # two pyparsing.ParseResult objects cannot be directly
        # compared (they don't implement __eq__), therefore we compare
        # their XML representations
        tree[2][0] = util.parseresults_as_xml(tree[2][0])
        newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
        self.assertEqual(tree, newtree)
Esempio n. 5
0
    def test_serialize_roundtrip(self):
        # Create a elements object tree
        tree = Body([Section([Paragraph(["Hello"]),
                              Paragraph(["World"])],
                             ordinal="1",
                             title="Main section"),
                     Section([42,
                              date(2013,11,27),
                              datetime(2013,11,27,12,0,0),
                              b'bytestring',
                              {'foo': 'bar',
                               'x': 'y'}],
                             ordinal=2,
                             title="Native types")
                 ])
        # roundtrip using the default XML format
        serialized = serialize(tree)
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, caller_globals=globals())
        self.assertEqual(tree, newtree)

        # make another section with special (but commonly used) types
        # and try to roundtrip them. The XML serialization format does
        # not support this.
        graph = Graph().parse(data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""", format="turtle")
        parseresult = urlparser.parseString("http://example.org/1")
        tree.append(Section([parseresult,
                             graph],
                            meta=graph))
        
        # roundtrip using JSON (which uses fully qualified classnames,
        # so we don't need to pass globals() into deserialize()
        serialized = serialize(tree, format="json")
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, format="json")

        # two pyparsing.ParseResult objects cannot be directly
        # compared (they don't implement __eq__), therefore we compare
        # their XML representations
        tree[2][0] = util.parseresults_as_xml(tree[2][0])
        newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
        self.assertEqual(tree, newtree)
Esempio n. 6
0
 def parse(tokenstream):
     """Group a stream of text tokens into a ``Body`` of typed elements.

     For each token, ``guess_type`` is asked for an element type, given
     the token and the type of the most recently created element.

     * ``None``: the token is skipped.
     * ``Continuation`` while the body is non-empty: the token is
       appended to the last element instead of starting a new one.
     * ``Continuation`` while the body is empty: treated as ``Paragraph``.
     * anything else: a new element of that type is created from the
       token and becomes the current type.

     :param tokenstream: iterable of tokens.
     :returns: the assembled ``Body``.
     """
     body = Body()
     current_type = None
     for token in tokenstream:
         guessed = guess_type(token, current_type)
         if guessed is None:
             continue
         if guessed == Continuation:
             if len(body) > 0:
                 # Extend the most recent text node rather than creating
                 # a new one.
                 last = body.pop()
                 last.append(token)
                 body.append(last)
                 continue
             # Nothing to continue yet: start an ordinary paragraph.
             guessed = Paragraph
         body.append(guessed([token]))
         current_type = guessed
     return body
Esempio n. 7
0
    def parse_pdfs(self, basefile, pdffiles):
        """Parse each named PDF attachment and collect the results.

        For every entry in ``pdffiles`` the PDF location is derived from
        the downloaded path of ``basefile`` (substituting the attachment
        name for "index.html"), and the directory of the corresponding
        intermediate path is used as the working directory for
        ``self.parse_pdf``. A ``ValueError`` raised while handling a PDF
        is logged as a warning and that PDF is skipped.

        :param basefile: identifier passed on to ``self.store``.
        :param pdffiles: iterable of attachment file names.
        :returns: a ``Body`` containing one parsed PDF per successfully
            handled attachment.
        """
        collected = Body()
        for pdffile in pdffiles:
            # FIXME: downloaded_path must be more fully mocked
            # (support attachments) by testutil.RepoTester. In the
            # meantime, we do some path munging ourselves
            source_path = self.store.downloaded_path(basefile).replace(
                "index.html", pdffile)
            workdir = os.path.dirname(
                self.store.intermediate_path(basefile, attachment=pdffile))
            try:
                parsed = self.parse_pdf(source_path, workdir)
                # Per-page cropping (left=50, top=0, bottom=900,
                # right=700) is currently disabled; the pages are still
                # iterated over.
                for _page in parsed:
                    pass
                collected.append(parsed)
            except ValueError as err:
                self.log.warning("Ignoring exception %s (%s), skipping PDF %s" %
                                 (type(err), err, pdffile))
        return collected