def distill_test(self, downloaded_file, rdf_file, docroot):
    """Parse one downloaded file and compare its distilled RDF with ``rdf_file``.

    The basefile is derived from the part of ``downloaded_file`` that lies
    below ``docroot + "/downloaded/"``. If that derivation fails,
    ``self.filename_to_basefile`` is used as a fallback. During the parse,
    ``downloaded_path`` on the repo's document store class is patched to
    return ``downloaded_file``.

    If ``FERENDA_SET_TESTFILES`` is present in the environment, ``rdf_file``
    is overwritten with a Turtle serialization of the distilled graph
    before the comparison (the previous file is renamed to
    ``rdf_file + "~"``).

    :param downloaded_file: path of the downloaded source document
    :param rdf_file: path of the file holding the expected graph
    :param docroot: directory containing the ``downloaded`` tree
    :returns: whatever ``self.assertEqualGraphs`` returns
    """
    try:
        prefixlen = len(docroot + "/downloaded/")
        if self.repo.storage_policy == "dir":
            # Strip the file name plus the separator in front of it.
            suffixlen = len(downloaded_file.split(os.sep)[-1]) + 1
        else:
            suffixlen = len(os.path.splitext(downloaded_file)[1])
        # Use an explicit end index: "[prefixlen:-suffixlen]" would give an
        # empty string when suffixlen is 0 (file without extension).
        end = len(downloaded_file) - suffixlen
        pathfrag = downloaded_file[prefixlen:end]
        basefile = self.repo.store.pathfrag_to_basefile(pathfrag)
    except Exception:
        # Best-effort fallback, but let KeyboardInterrupt/SystemExit through.
        basefile = self.filename_to_basefile(downloaded_file)

    with patch.object(self.repo.documentstore_class, 'downloaded_path',
                      return_value=downloaded_file):
        # self.repo.config.fsmdebug = True
        self.repo.parse(basefile)

    if 'FERENDA_SET_TESTFILES' in os.environ:
        print("Overwriting %r with result of parse (%r)" % (rdf_file, basefile))
        g = rdflib.Graph()
        g.parse(data=util.readfile(self.repo.store.distilled_path(basefile)))
        # Serialize before touching rdf_file so a serialization error
        # leaves the existing expected file in place.
        data = g.serialize(format="turtle")
        if not isinstance(data, bytes):
            # Newer rdflib returns str; the file is opened in binary mode.
            data = data.encode("utf-8")
        util.robust_rename(rdf_file, rdf_file + "~")
        with open(rdf_file, "wb") as fp:
            fp.write(data)
    return self.assertEqualGraphs(rdf_file,
                                  self.repo.store.distilled_path(basefile),
                                  exact=False)
def parse_test(self, downloaded_file, xhtml_file, docroot):
    """This test is run once for each basefile found in
    docroot/downloaded. It performs a full parse, and verifies that
    the resulting XHTML document is equal to the XHTML file placed in
    docroot/parsed/.

    Environment variables that change the behaviour:

    * ``FERENDA_LOG_TEST``: attach a stream handler to the root logger
      while parsing, using the named level (unknown names mean INFO).
    * ``FERENDA_PROFILE_TEST``: run the parse under ``cProfile``.
    * ``FERENDA_SET_TESTFILE``: overwrite ``xhtml_file`` with the parse
      result before comparing; the old file is kept as
      ``xhtml_file + "~"``.

    :param downloaded_file: path of the downloaded source document
    :param xhtml_file: path of the expected XHTML file
    :param docroot: not used by this method
    :returns: whatever ``self.assertEqualXML`` returns
    """
    basefile = self.filename_to_basefile(downloaded_file)

    def runtest():
        handler = None
        if "FERENDA_LOG_TEST" in os.environ:
            loglevel = {
                "DEBUG": logging.DEBUG,
                "INFO": logging.INFO,
                "WARNING": logging.WARNING,
                "ERROR": logging.ERROR,
                "CRITICAL": logging.CRITICAL,
            }.get(os.environ["FERENDA_LOG_TEST"], logging.INFO)
            logformat = "%(asctime)s %(name)s %(levelname)s %(message)s"
            datefmt = "%H:%M:%S"
            handler = logging.StreamHandler()
            handler.setLevel(loglevel)
            handler.setFormatter(
                logging.Formatter(logformat, datefmt=datefmt))
            logger = logging.getLogger()
            logger.setLevel(loglevel)
            # shut some non-core loggers up
            for logname in [
                    'requests.packages.urllib3.connectionpool',
                    'rdflib.plugins.sleepycat',
                    'rdflib.plugins.parsers.pyRdfa',
                    'ferenda.thirdparty.patch'
            ]:
                log = logging.getLogger(logname)
                log.propagate = False
            logger.addHandler(handler)
        try:
            self.repo.parse(basefile)
        finally:
            # Detach the handler even when the parse raises, so it does
            # not stay on the root logger after this test.
            if handler is not None:
                logger.removeHandler(handler)

    # Call builtins.print explicitly instead of rebinding a local "print":
    # a local assignment further down would make the call below raise
    # UnboundLocalError.
    if "FERENDA_PROFILE_TEST" in os.environ:
        builtins.print("Profiling test")
        import cProfile
        cProfile.runctx("runtest()", globals(), locals(), sort="cumtime")
    else:
        runtest()

    if 'FERENDA_SET_TESTFILE' in os.environ:
        builtins.print("Overwriting '%s' with result of parse ('%s')" %
                       (xhtml_file, basefile))
        util.robust_rename(xhtml_file, xhtml_file + "~")
        shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
    return self.assertEqualXML(util.readfile(xhtml_file),
                               util.readfile(
                                   self.repo.store.parsed_path(basefile)),
                               tidy_xhtml=True)
def parse_test(self, downloaded_file, xhtml_file, docroot):
    """This test is run once for each basefile found in
    docroot/downloaded. It performs a full parse, and verifies that
    the resulting XHTML document is equal to the XHTML file placed in
    docroot/parsed/.

    The following environment variables are consulted:

    ``FERENDA_LOG_TEST``
        Name of a log level (DEBUG, INFO, WARNING, ERROR or CRITICAL;
        anything else is treated as INFO). When set, a stream handler is
        added to the root logger for the duration of the parse.
    ``FERENDA_PROFILE_TEST``
        When set, the parse is run through ``cProfile.runctx``.
    ``FERENDA_SET_TESTFILE``
        When set, ``xhtml_file`` is renamed to ``xhtml_file + "~"`` and
        replaced by a copy of the parsed document before the comparison.

    :param downloaded_file: path of the downloaded source document
    :param xhtml_file: path of the expected XHTML file
    :param docroot: not used by this method
    :returns: whatever ``self.assertEqualXML`` returns
    """
    basefile = self.filename_to_basefile(downloaded_file)
    levels = {"DEBUG": logging.DEBUG,
              "INFO": logging.INFO,
              "WARNING": logging.WARNING,
              "ERROR": logging.ERROR,
              "CRITICAL": logging.CRITICAL}
    # shut some non-core loggers up
    quiet_loggers = ['requests.packages.urllib3.connectionpool',
                     'rdflib.plugins.sleepycat',
                     'rdflib.plugins.parsers.pyRdfa',
                     'ferenda.thirdparty.patch']

    def attach_handler():
        # Configure root logging for this test; returns (logger, handler).
        loglevel = levels.get(os.environ["FERENDA_LOG_TEST"], logging.INFO)
        handler = logging.StreamHandler()
        handler.setLevel(loglevel)
        handler.setFormatter(logging.Formatter(
            "%(asctime)s %(name)s %(levelname)s %(message)s",
            datefmt="%H:%M:%S"))
        logger = logging.getLogger()
        logger.setLevel(loglevel)
        for logname in quiet_loggers:
            logging.getLogger(logname).propagate = False
        logger.addHandler(handler)
        return logger, handler

    def runtest():
        attached = None
        if "FERENDA_LOG_TEST" in os.environ:
            attached = attach_handler()
        try:
            self.repo.parse(basefile)
        finally:
            # Always detach, so a failing parse does not leave the handler
            # installed on the root logger.
            if attached is not None:
                logger, handler = attached
                logger.removeHandler(handler)

    # builtins.print is called directly: assigning to a local name "print"
    # anywhere in this function would make every earlier print() call in
    # it fail with UnboundLocalError.
    if "FERENDA_PROFILE_TEST" in os.environ:
        builtins.print("Profiling test")
        import cProfile
        cProfile.runctx("runtest()", globals(), locals(), sort="cumtime")
    else:
        runtest()

    if 'FERENDA_SET_TESTFILE' in os.environ:
        builtins.print("Overwriting '%s' with result of parse ('%s')"
                       % (xhtml_file, basefile))
        util.robust_rename(xhtml_file, xhtml_file + "~")
        shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
    return self.assertEqualXML(
        util.readfile(xhtml_file),
        util.readfile(self.repo.store.parsed_path(basefile)),
        tidy_xhtml=True)
def parse_test(self, downloaded_file, xhtml_file, docroot):
    """Parse ``downloaded_file`` and compare the result with ``xhtml_file``.

    While the repo parses, ``downloaded_path`` on its document store class
    is patched to return ``downloaded_file``. When ``FERENDA_SET_TESTFILES``
    is present in the environment, ``xhtml_file`` is first renamed to
    ``xhtml_file + "~"`` and replaced by a copy of the parsed document.

    ``docroot`` is accepted but not used by this method.
    """
    basefile = self.filename_to_basefile(downloaded_file)

    # Patch the method so we control where the downloaded doc is loaded
    # from.
    store_class = self.repo.documentstore_class
    fixed_download = patch.object(store_class, 'downloaded_path',
                                  return_value=downloaded_file)
    with fixed_download:
        self.repo.parse(basefile)

    regenerate = 'FERENDA_SET_TESTFILES' in os.environ
    if regenerate:
        notice = "Overwriting %r with result of parse (%r)" % (xhtml_file,
                                                               basefile)
        print(notice)
        backup = xhtml_file + "~"
        util.robust_rename(xhtml_file, backup)
        fresh = self.repo.store.parsed_path(basefile)
        shutil.copy2(fresh, xhtml_file)

    expected = util.readfile(xhtml_file)
    actual = util.readfile(self.repo.store.parsed_path(basefile))
    return self.assertEqualXML(expected, actual)
def distill_test(self, downloaded_file, rdf_file, docroot):
    """This test is run once for each basefile found in
    docroot/downloaded. It performs a full parse, and verifies that
    the distilled RDF metadata is equal to the TTL files placed in
    docroot/distilled/.

    If ``FERENDA_SET_TESTFILE`` is present in the environment,
    ``rdf_file`` is overwritten with a Turtle serialization of the
    distilled graph before the comparison (the previous file is renamed
    to ``rdf_file + "~"``).

    :param downloaded_file: path of the downloaded source document
    :param rdf_file: path of the file holding the expected graph
    :param docroot: not used by this method
    :returns: whatever ``self.assertEqualGraphs`` returns
    """
    basefile = self.filename_to_basefile(downloaded_file)
    self.repo.parse(basefile)
    print = builtins.print
    if 'FERENDA_SET_TESTFILE' in os.environ:
        print("Overwriting '%s' with result of parse ('%s')" % (rdf_file, basefile))
        g = rdflib.Graph()
        g.parse(data=util.readfile(self.repo.store.distilled_path(basefile)))
        # Serialize before touching rdf_file so that a serialization error
        # leaves the existing expected file in place.
        turtle = g.serialize(format="turtle")
        if not isinstance(turtle, bytes):
            # Newer rdflib returns str here; the file is opened in "wb".
            turtle = turtle.encode("utf-8")
        util.robust_rename(rdf_file, rdf_file + "~")
        with open(rdf_file, "wb") as fp:
            fp.write(turtle)
    return self.assertEqualGraphs(rdf_file,
                                  self.repo.store.distilled_path(basefile),
                                  exact=False)
def distill_test(self, downloaded_file, rdf_file, docroot):
    """This test is run once for each basefile found in
    docroot/downloaded. It performs a full parse, and verifies that
    the distilled RDF metadata is equal to the TTL files placed in
    docroot/distilled/.

    Setting ``FERENDA_SET_TESTFILE`` in the environment regenerates
    ``rdf_file`` from the distilled graph (as Turtle) before the
    comparison; the file it replaces is renamed to ``rdf_file + "~"``.

    :param downloaded_file: path of the downloaded source document
    :param rdf_file: path of the file holding the expected graph
    :param docroot: not used by this method
    :returns: whatever ``self.assertEqualGraphs`` returns
    """
    basefile = self.filename_to_basefile(downloaded_file)
    self.repo.parse(basefile)
    print = builtins.print
    if 'FERENDA_SET_TESTFILE' in os.environ:
        print("Overwriting '%s' with result of parse ('%s')" %
              (rdf_file, basefile))
        distilled = self.repo.store.distilled_path(basefile)
        g = rdflib.Graph()
        g.parse(data=util.readfile(distilled))
        payload = g.serialize(format="turtle")
        # serialize() yields bytes on older rdflib and str on newer
        # releases; normalise to bytes because the file is opened in
        # binary mode.
        if isinstance(payload, bytes):
            raw = payload
        else:
            raw = payload.encode("utf-8")
        util.robust_rename(rdf_file, rdf_file + "~")
        with open(rdf_file, "wb") as fp:
            fp.write(raw)
    return self.assertEqualGraphs(
        rdf_file,
        self.repo.store.distilled_path(basefile),
        exact=False)
def download_single(self, basefile, url):
    """Download ``basefile`` and rename the file if it is not really a PDF.

    After the parent class has downloaded the document, the first four
    bytes of the downloaded file are compared against a table of known
    signatures. If they identify a type other than PDF, the file's
    extension is replaced with the matching one. An unrecognised
    signature is logged as a warning and the file is left alone.

    :param basefile: identifier of the document to download
    :param url: location passed through to the parent implementation
    """
    super(ARN, self).download_single(basefile, url)
    # after downloading: see if our PDF in reality was something else
    # FIXME: we should do this prior to .download_if_needed...
    d = self.store.downloaded_path(basefile)
    if not os.path.exists(d):
        return

    # leading four bytes -> extension the file should carry
    signatures = {
        b'\xffWPC': ".wpd",
        b'\xd0\xcf\x11\xe0': ".doc",
        b'PK\x03\x04': ".docx",
        b'{\\rt': ".rtf",
        b'%PDF': ".pdf",
    }
    with open(d, "rb") as fp:
        sig = fp.read(4)

    doctype = signatures.get(sig)
    if doctype is None:
        self.log.warning(
            "%s has unknown signature %r -- don't know what kind of file it is",
            d, sig)
        doctype = ".pdf"  # don't do anything

    if doctype != '.pdf':
        # Swap only the extension; str.replace would also rewrite any
        # ".pdf" occurring earlier in the path (e.g. in a directory name).
        target = os.path.splitext(d)[0] + doctype
        if target != d:
            util.robust_rename(d, target)
def test_robust_rename(self):
    """Call ``util.robust_rename`` while ``shutil.move`` raises IOError.

    Only the IOError branch is exercised. The test makes no assertion
    of its own, so it passes as long as the call does not raise.
    """
    for path in (self.fname, self.fname2):
        util.writefile(path, "Hello")
    failing_move = patch('ferenda.util.shutil.move', side_effect=IOError)
    with failing_move:
        util.robust_rename(self.fname, self.fname2)