Beispiel #1
0
 def parse(self, doc):
     """Parse the downloaded Word file for ``doc`` and fill in the document.

     The downloaded file for ``doc.basefile`` is converted to an
     intermediate file by ``WordReader``, run through
     ``patch_if_needed``, and split into a raw header and a raw body.
     The results are stored as ``doc.uri`` and ``doc.body``; when a
     patch description is returned it is also added to ``doc.meta``.
     """
     # FIXME: don't create these if they already exist
     self.lagrum_parser = LegalRef(LegalRef.LAGRUM)
     self.rattsfall_parser = LegalRef(LegalRef.RATTSFALL)

     basefile = doc.basefile
     source_path = self.store.downloaded_path(basefile)
     target_path = self.store.intermediate_path(basefile)
     # read() reports the path it actually wrote and the detected file type.
     xml_path, filetype = WordReader().read(source_path, target_path)
     with codecs.open(xml_path, encoding="utf-8") as xml_file:
         raw_text = xml_file.read()
         patchedtext, patchdesc = self.patch_if_needed(basefile, raw_text)

     # Second step: turn the crude XML produced by antiword (docbook)
     # or Word 2007 (OOXML) into a pair of simpler structures. rawhead
     # is a plain dict that is later transformed into a rdflib Graph;
     # rawbody is a list of plaintext strings, one per paragraph.
     #
     # long-term FIXME: WordReader should expose a unified interface
     # for both kinds of word files so that parse_ooxml() and
     # parse_antiword_docbook() are not both needed. That might
     # require a tool other than antiword for old .doc files, since
     # antiword throws away a LOT of info.
     if filetype == "docx":
         extract = self.parse_ooxml
     else:
         extract = self.parse_antiword_docbook
     rawhead, rawbody = extract(patchedtext, basefile)

     doc.uri = self.polish_metadata(rawhead, doc)
     if patchdesc:
         # Record on the document itself that its source text was patched.
         subject = URIRef(doc.uri)
         predicate = self.ns['ferenda'].patchdescription
         doc.meta.add((subject, predicate, patchdesc))
     # FIXME: Write a ... (this note is truncated in the original source)
     doc.body = self.format_body(rawbody)
Beispiel #2
0
class Read(unittest.TestCase):
    """Exercise ``WordReader.read`` on the samples in ``test/files/wordreader``.

    Each test converts one sample into ``out.xml`` inside a per-test
    scratch directory and inspects the resulting XML.
    """

    # Namespace-qualified root tag expected when the output is OOXML.
    OOXML_ROOT_TAG = ("{http://schemas.openxmlformats.org/"
                      "wordprocessingml/2006/main}document")

    def setUp(self):
        """Create the scratch directory and the reader under test."""
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        # Register removal immediately: cleanups run even when a later
        # setUp step raises, whereas tearDown would be skipped and the
        # directory would be left behind.
        self.addCleanup(shutil.rmtree, self.datadir)
        self.reader = WordReader()

    def _assert_xml(self, path, root_tag, phrase):
        """Check that ``path`` exists and holds the expected XML.

        The file must parse, its root element must be ``root_tag`` and
        at least one element must match the ``contains(text(), ...)``
        XPath test for ``phrase``.
        """
        self.assertTrue(os.path.exists(path))
        tree = etree.parse(path)
        self.assertEqual(root_tag, tree.getroot().tag)
        xpath = '//*[contains(text(), "%s")]' % phrase
        self.assertTrue(tree.getroot().xpath(xpath))

    def test_doc(self):
        """A ``.doc`` file is reported as "doc" and yields a ``book`` root."""
        path = os.path.join(self.datadir, "out.xml")
        out, filetype = self.reader.read("test/files/wordreader/sample.doc",
                                         path)
        self.assertEqual(out, path)
        self.assertEqual(filetype, "doc")
        self._assert_xml(path, "book", "simple document in .doc format")

        # test that spaces in filename work (requires more cmdline quoting)
        os.unlink(path)
        out, filetype = self.reader.read(
            "test/files/wordreader/spaces in filename.doc", path)
        self.assertEqual(out, path)
        self.assertEqual(filetype, "doc")
        # The output was deleted above, so it must have been written anew;
        # without this check the second read is only verified by its
        # return values.
        self.assertTrue(os.path.exists(path))

    def test_docx(self):
        """A ``.docx`` file is reported as "docx" and yields OOXML."""
        path = os.path.join(self.datadir, "out.xml")
        out, filetype = self.reader.read("test/files/wordreader/sample.docx",
                                         path)
        self.assertEqual(out, path)
        self.assertEqual(filetype, "docx")
        self._assert_xml(path, self.OOXML_ROOT_TAG,
                         "simple document in OOXML (.docx) format")

    def test_mislabeled(self):
        """A file named ``.doc`` may still be reported and read as "docx"."""
        path = os.path.join(self.datadir, "out.xml")
        out, filetype = self.reader.read("test/files/wordreader/mislabeled.doc",
                                         path)
        self.assertEqual(out, path)
        self.assertEqual(filetype, "docx")
        self._assert_xml(path, self.OOXML_ROOT_TAG,
                         "mis-labeled as a .doc file")
Beispiel #3
0
 def setUp(self):
     """Create a scratch directory and a ``WordReader`` for each test."""
     # Imported locally because this block newly depends on shutil.
     import shutil

     self.maxDiff = None
     self.datadir = tempfile.mkdtemp()
     # Register removal right away: cleanups run even if the WordReader()
     # call below raises, in which case tearDown would not be invoked and
     # the directory would leak. ignore_errors=True keeps this harmless
     # when a tearDown elsewhere in the class already removed the directory.
     self.addCleanup(shutil.rmtree, self.datadir, ignore_errors=True)
     self.reader = WordReader()
Beispiel #4
0
class Read(unittest.TestCase):
    """Exercise ``WordReader.read`` on the samples in ``test/files/wordreader``.

    Each test lets the reader write into ``out.xml`` inside a per-test
    scratch directory and inspects the resulting XML.
    """

    # Namespace-qualified root tag expected when the output is OOXML.
    OOXML_ROOT_TAG = ("{http://schemas.openxmlformats.org/"
                      "wordprocessingml/2006/main}document")

    def setUp(self):
        """Create the scratch directory and the reader under test."""
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        # Register removal immediately: cleanups run even when a later
        # setUp step raises, whereas tearDown would be skipped and the
        # directory would be left behind.
        self.addCleanup(shutil.rmtree, self.datadir)
        self.reader = WordReader()

    def _read_or_skip(self, wordfile, path):
        """Convert ``wordfile`` into ``path`` and return the reported type.

        Only the conversion itself is guarded: if it raises
        ``ExternalCommandError`` the running test is skipped instead of
        failing.
        """
        try:
            with open(path, "wb") as fp:
                return self.reader.read(wordfile, fp)
        except ExternalCommandError:
            self.skipTest("Antiword does not seem to be installed")

    def _assert_xml(self, path, root_tag, phrase):
        """Check that ``path`` exists and holds the expected XML.

        The file must parse, its root element must be ``root_tag`` and
        at least one element must match the ``contains(text(), ...)``
        XPath test for ``phrase``.
        """
        self.assertTrue(os.path.exists(path))
        tree = etree.parse(path)
        self.assertEqual(root_tag, tree.getroot().tag)
        xpath = '//*[contains(text(), "%s")]' % phrase
        self.assertTrue(tree.getroot().xpath(xpath))

    def test_doc(self):
        """A ``.doc`` file is reported as "doc" and yields a ``book`` root."""
        path = os.path.join(self.datadir, "out.xml")
        filetype = self._read_or_skip("test/files/wordreader/sample.doc",
                                      path)
        self.assertEqual(filetype, "doc")
        self._assert_xml(path, "book", "simple document in .doc format")

        # test that spaces in filename work (requires more cmdline quoting)
        os.unlink(path)
        filetype = self._read_or_skip(
            "test/files/wordreader/spaces in filename.doc", path)
        self.assertEqual(filetype, "doc")

    def test_docx(self):
        """A ``.docx`` file is reported as "docx" and yields OOXML."""
        path = os.path.join(self.datadir, "out.xml")
        # Read directly rather than via _read_or_skip: an
        # ExternalCommandError here is reported as an error, not a skip.
        with open(path, "wb") as fp:
            filetype = self.reader.read("test/files/wordreader/sample.docx",
                                        fp)
        self.assertEqual(filetype, "docx")
        self._assert_xml(path, self.OOXML_ROOT_TAG,
                         "simple document in OOXML (.docx) format")

    def test_mislabeled(self):
        """A file named ``.doc`` may still be reported and read as "docx"."""
        path = os.path.join(self.datadir, "out.xml")
        # NOTE(review): silence() presumably suppresses output produced
        # while the reader handles the mislabeled file -- confirm.
        with silence():
            filetype = self._read_or_skip(
                "test/files/wordreader/mislabeled.doc", path)
        self.assertEqual(filetype, "docx")
        self._assert_xml(path, self.OOXML_ROOT_TAG,
                         "mis-labeled as a .doc file")
Beispiel #5
0
 def setUp(self):
     """Create a scratch directory and a ``WordReader`` for each test."""
     # Imported locally because this block newly depends on shutil.
     import shutil

     self.maxDiff = None
     self.datadir = tempfile.mkdtemp()
     # Register removal right away: cleanups run even if the WordReader()
     # call below raises, in which case tearDown would not be invoked and
     # the directory would leak. ignore_errors=True keeps this harmless
     # when a tearDown elsewhere in the class already removed the directory.
     self.addCleanup(shutil.rmtree, self.datadir, ignore_errors=True)
     self.reader = WordReader()
Beispiel #6
0
class Read(unittest.TestCase):
    """Exercise ``WordReader.read`` on the samples in ``test/files/wordreader``.

    Each test lets the reader write into ``out.xml`` inside a per-test
    scratch directory and inspects the resulting XML.
    """

    # Namespace-qualified root tag expected when the output is OOXML.
    OOXML_ROOT_TAG = ("{http://schemas.openxmlformats.org/"
                      "wordprocessingml/2006/main}document")

    def setUp(self):
        """Create the scratch directory and the reader under test."""
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        # Register removal immediately: cleanups run even when a later
        # setUp step raises, whereas tearDown would be skipped and the
        # directory would be left behind.
        self.addCleanup(shutil.rmtree, self.datadir)
        self.reader = WordReader()

    def _read_or_skip(self, wordfile, path):
        """Convert ``wordfile`` into ``path`` and return the reported type.

        Only the conversion itself is guarded: if it raises
        ``ExternalCommandError`` the running test is skipped instead of
        failing.
        """
        try:
            with open(path, "wb") as fp:
                return self.reader.read(wordfile, fp)
        except ExternalCommandError:
            self.skipTest("Antiword does not seem to be installed")

    def _assert_xml(self, path, root_tag, phrase):
        """Check that ``path`` exists and holds the expected XML.

        The file must parse, its root element must be ``root_tag`` and
        at least one element must match the ``contains(text(), ...)``
        XPath test for ``phrase``.
        """
        self.assertTrue(os.path.exists(path))
        tree = etree.parse(path)
        self.assertEqual(root_tag, tree.getroot().tag)
        xpath = '//*[contains(text(), "%s")]' % phrase
        self.assertTrue(tree.getroot().xpath(xpath))

    def test_doc(self):
        """A ``.doc`` file is reported as "doc" and yields a ``book`` root."""
        path = os.path.join(self.datadir, "out.xml")
        filetype = self._read_or_skip("test/files/wordreader/sample.doc",
                                      path)
        self.assertEqual(filetype, "doc")
        self._assert_xml(path, "book", "simple document in .doc format")

        # test that spaces in filename work (requires more cmdline quoting)
        os.unlink(path)
        filetype = self._read_or_skip(
            "test/files/wordreader/spaces in filename.doc", path)
        self.assertEqual(filetype, "doc")

    def test_docx(self):
        """A ``.docx`` file is reported as "docx" and yields OOXML."""
        path = os.path.join(self.datadir, "out.xml")
        # Read directly rather than via _read_or_skip: an
        # ExternalCommandError here is reported as an error, not a skip.
        with open(path, "wb") as fp:
            filetype = self.reader.read("test/files/wordreader/sample.docx",
                                        fp)
        self.assertEqual(filetype, "docx")
        self._assert_xml(path, self.OOXML_ROOT_TAG,
                         "simple document in OOXML (.docx) format")

    def test_mislabeled(self):
        """A file named ``.doc`` may still be reported and read as "docx"."""
        path = os.path.join(self.datadir, "out.xml")
        # NOTE(review): silence() presumably suppresses output produced
        # while the reader handles the mislabeled file -- confirm.
        with silence():
            filetype = self._read_or_skip(
                "test/files/wordreader/mislabeled.doc", path)
        self.assertEqual(filetype, "docx")
        self._assert_xml(path, self.OOXML_ROOT_TAG,
                         "mis-labeled as a .doc file")