def parse(self, doc):
    """Fill in ``doc`` from the downloaded Word file for ``doc.basefile``.

    The downloaded file is converted by ``WordReader`` into an
    intermediate file, whose text is run through
    ``self.patch_if_needed``. Depending on the file type reported by
    the reader, the patched text is handed to ``parse_ooxml`` or
    ``parse_antiword_docbook``. The resulting head is passed to
    ``polish_metadata`` (whose result becomes ``doc.uri``) and the
    resulting body to ``format_body`` (whose result becomes
    ``doc.body``).

    :param doc: document object; ``basefile`` and ``meta`` are read,
                ``uri`` and ``body`` are assigned.
    """
    # FIXME: don't create these if they already exist
    self.lagrum_parser = LegalRef(LegalRef.LAGRUM)
    self.rattsfall_parser = LegalRef(LegalRef.RATTSFALL)

    # First step: convert the downloaded Word file and read back the
    # (possibly patched) text of the intermediate file.
    basefile = doc.basefile
    source_path = self.store.downloaded_path(basefile)
    target_path = self.store.intermediate_path(basefile)
    converted_path, filetype = WordReader().read(source_path, target_path)
    with codecs.open(converted_path, encoding="utf-8") as stream:
        patchedtext, patchdesc = self.patch_if_needed(basefile,
                                                      stream.read())

    # The second step is to mangle the crappy XML produced by
    # antiword (docbook) or Word 2007 (OOXML) into a nice pair of
    # structures. rawhead is a simple dict that we'll later transform
    # into a rdflib Graph. rawbody is a list of plaintext strings,
    # each representing a paragraph.
    #
    # long-term FIXME: WordReader should expose a unified interface
    # for handling both kinds of word files so that we wouldn't need
    # both parse_ooxml() and parse_antiword_docbook(). This might
    # require some other tool than antiword for old .doc files, as
    # this throws away a LOT of info.
    if filetype == "docx":
        extract = self.parse_ooxml
    else:
        extract = self.parse_antiword_docbook
    rawhead, rawbody = extract(patchedtext, basefile)

    doc.uri = self.polish_metadata(rawhead, doc)
    if patchdesc:
        # Record the description of the applied patch in the metadata.
        doc.meta.add((URIRef(doc.uri),
                      self.ns['ferenda'].patchdescription,
                      patchdesc))
    doc.body = self.format_body(rawbody)
    # FIXME: Write a ...
    # NOTE(review): the original FIXME is cut off at this point in the
    # available source; whatever followed it is not visible -- confirm
    # against the full file.
class Read(unittest.TestCase):
    """Exercise ``WordReader.read`` against the sample Word fixtures.

    Every test converts a file below ``test/files/wordreader`` into
    ``out.xml`` inside a temporary directory that is created in
    ``setUp`` and removed in ``tearDown``.
    """

    # Root element tag expected in OOXML (.docx) output.
    _OOXML_DOCUMENT_TAG = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}document"

    def setUp(self):
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        self.reader = WordReader()

    def tearDown(self):
        shutil.rmtree(self.datadir)

    def _convert(self, source, expected_type):
        """Convert ``source`` to ``out.xml`` and check the reader's result.

        Asserts that the reader returns the requested output path and
        ``expected_type``; returns the output path.
        """
        path = os.path.join(self.datadir, "out.xml")
        out, filetype = self.reader.read(source, path)
        self.assertEqual(out, path)
        self.assertEqual(filetype, expected_type)
        return path

    def _assert_xml(self, path, root_tag, xpath):
        """Assert that ``path`` exists, has root ``root_tag`` and matches ``xpath``."""
        self.assertTrue(os.path.exists(path))
        root = etree.parse(path).getroot()
        self.assertEqual(root_tag, root.tag)
        self.assertTrue(root.xpath(xpath))

    def test_doc(self):
        path = self._convert("test/files/wordreader/sample.doc", "doc")
        self._assert_xml(
            path,
            "book",
            '//*[contains(text(), "simple document in .doc format")]')

        # test that spaces in filename work (requires more cmdline quoting)
        os.unlink(path)
        self._convert("test/files/wordreader/spaces in filename.doc", "doc")

    def test_docx(self):
        path = self._convert("test/files/wordreader/sample.docx", "docx")
        self._assert_xml(
            path,
            self._OOXML_DOCUMENT_TAG,
            '//*[contains(text(), "simple document in OOXML (.docx) format")]')

    def test_mislabeled(self):
        # The fixture has a .doc extension, but the reader is expected
        # to report (and produce) docx output for it.
        path = self._convert("test/files/wordreader/mislabeled.doc", "docx")
        self._assert_xml(
            path,
            self._OOXML_DOCUMENT_TAG,
            '//*[contains(text(), "mis-labeled as a .doc file")]')
def setUp(self):
    """Prepare the per-test fixture.

    Sets ``maxDiff`` to ``None``, creates a fresh temporary directory
    (stored as ``self.datadir``) and instantiates the ``WordReader``
    under test (stored as ``self.reader``).
    """
    # None removes unittest's length limit on assertion diffs.
    self.maxDiff = None

    # Scratch directory for the files written during the test.
    self.datadir = tempfile.mkdtemp()

    # The object under test.
    self.reader = WordReader()
class Read(unittest.TestCase):
    """Exercise ``WordReader.read`` against the sample Word fixtures.

    Every test lets the reader write into ``out.xml`` inside a
    temporary directory that is created in ``setUp`` and removed in
    ``tearDown``. Tests that catch ``ExternalCommandError`` are
    reported as skipped instead of failed.
    """

    # Root element tag expected in OOXML (.docx) output.
    _OOXML_DOCUMENT_TAG = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}document"
    # Reason given when a test is skipped because of ExternalCommandError.
    _SKIP_MESSAGE = "Antiword does not seem to be installed"

    def setUp(self):
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        self.reader = WordReader()

    def tearDown(self):
        shutil.rmtree(self.datadir)

    def _outpath(self):
        """Return the path of ``out.xml`` inside the temporary directory."""
        return os.path.join(self.datadir, "out.xml")

    def _read(self, source, path):
        """Have the reader write ``source`` to ``path``; return the file type.

        The output file is opened in binary mode and is closed again
        before this method returns.
        """
        with open(path, "wb") as fp:
            return self.reader.read(source, fp)

    def _assert_xml(self, path, root_tag, xpath):
        """Assert that ``path`` exists, has root ``root_tag`` and matches ``xpath``."""
        self.assertTrue(os.path.exists(path))
        root = etree.parse(path).getroot()
        self.assertEqual(root_tag, root.tag)
        self.assertTrue(root.xpath(xpath))

    def test_doc(self):
        path = self._outpath()
        try:
            filetype = self._read("test/files/wordreader/sample.doc", path)
            self.assertEqual(filetype, "doc")
            self._assert_xml(
                path,
                "book",
                '//*[contains(text(), "simple document in .doc format")]')

            # test that spaces in filename work (requires more cmdline quoting)
            os.unlink(path)
            filetype = self._read(
                "test/files/wordreader/spaces in filename.doc", path)
            self.assertEqual(filetype, "doc")
        except ExternalCommandError:
            raise unittest.SkipTest(self._SKIP_MESSAGE)

    def test_docx(self):
        path = self._outpath()
        filetype = self._read("test/files/wordreader/sample.docx", path)
        self.assertEqual(filetype, "docx")
        self._assert_xml(
            path,
            self._OOXML_DOCUMENT_TAG,
            '//*[contains(text(), "simple document in OOXML (.docx) format")]')

    def test_mislabeled(self):
        # The fixture has a .doc extension, but the reader is expected
        # to report (and produce) docx output for it.
        path = self._outpath()
        try:
            # Only the read itself runs inside silence().
            with silence():
                filetype = self._read(
                    "test/files/wordreader/mislabeled.doc", path)
            self.assertEqual(filetype, "docx")
            self._assert_xml(
                path,
                self._OOXML_DOCUMENT_TAG,
                '//*[contains(text(), "mis-labeled as a .doc file")]')
        except ExternalCommandError:
            raise unittest.SkipTest(self._SKIP_MESSAGE)