Beispiel #1
0
 def distill_test(self, downloaded_file, rdf_file, docroot):
     """Parse one downloaded file and check its distilled RDF.

     The basefile is derived from the part of ``downloaded_file`` that
     follows ``docroot + "/downloaded/"``; if that derivation fails for
     any reason, ``self.filename_to_basefile`` is used instead. The
     document store's ``downloaded_path`` is patched to return
     ``downloaded_file`` while ``self.repo.parse`` runs.

     If the environment variable ``FERENDA_SET_TESTFILES`` is set,
     ``rdf_file`` is overwritten (the previous version is renamed to
     ``rdf_file + "~"``) with a turtle serialization of the distilled
     graph and no comparison is made. Otherwise the distilled graph is
     compared to ``rdf_file`` with ``self.assertEqualGraphs``.

     :param downloaded_file: path of the downloaded test document
     :param rdf_file: path of the reference RDF file
     :param docroot: root directory containing ``downloaded/``
     """
     try:
         prefixlen = len(docroot + "/downloaded/")
         if self.repo.storage_policy == "dir":
             # strip the last path component and its leading separator
             suffixlen = len(downloaded_file.split(os.sep)[-1]) + 1
         else:
             # strip the file extension only (may be empty)
             suffixlen = len(os.path.splitext(downloaded_file)[1])
         # Use an explicit end index: with a zero-length suffix the
         # slice [prefixlen:-0] would be empty instead of running to
         # the end of the string.
         end = max(len(downloaded_file) - suffixlen, 0)
         pathfrag = downloaded_file[prefixlen:end]
         basefile = self.repo.store.pathfrag_to_basefile(pathfrag)
     except Exception:
         # Deliberate best-effort fallback, but let KeyboardInterrupt
         # and SystemExit through.
         basefile = self.filename_to_basefile(downloaded_file)
     with patch.object(self.repo.documentstore_class, 'downloaded_path',
                       return_value=downloaded_file):
         # self.repo.config.fsmdebug = True
         self.repo.parse(basefile)
     if 'FERENDA_SET_TESTFILES' in os.environ:
         print("Overwriting %r with result of parse (%r)" % (rdf_file, basefile))
         g = rdflib.Graph()
         g.parse(data=util.readfile(self.repo.store.distilled_path(basefile)))
         # Serialize before touching rdf_file, and normalize to bytes:
         # depending on the rdflib version serialize() returns either
         # bytes or str, and the file is opened in binary mode.
         turtle = g.serialize(format="turtle")
         if not isinstance(turtle, bytes):
             turtle = turtle.encode("utf-8")
         util.robust_rename(rdf_file, rdf_file + "~")
         with open(rdf_file, "wb") as fp:
             fp.write(turtle)
         return
     self.assertEqualGraphs(rdf_file,
                            self.repo.store.distilled_path(basefile),
                            exact=False)
Beispiel #2
0
    def parse_test(self, downloaded_file, xhtml_file, docroot):
        """This test is run once for each basefile found in
        docroot/downloaded. It performs a full parse, and verifies that
        the resulting XHTML document is equal to the XHTML file placed in
        docroot/parsed/.

        Environment variables consulted:

        * ``FERENDA_LOG_TEST``: if set, a stream handler is attached to
          the root logger while parsing. The value selects the level
          (DEBUG, INFO, WARNING, ERROR or CRITICAL; anything else
          means INFO).
        * ``FERENDA_PROFILE_TEST``: if set, the parse is run under
          cProfile and statistics sorted by cumulative time are shown.
        * ``FERENDA_SET_TESTFILE``: if set, ``xhtml_file`` is
          overwritten with the parse result (the old file is kept as
          ``xhtml_file + "~"``) and no comparison is made.

        :param downloaded_file: path of the downloaded test document
        :param xhtml_file: path of the reference XHTML file
        :param docroot: root directory of the test files (unused here)
        """
        # Must be bound before the first call: assigning ``print``
        # anywhere in this function makes it a local name for the whole
        # body, so calling it ahead of the assignment raises
        # UnboundLocalError.
        print = builtins.print
        basefile = self.filename_to_basefile(downloaded_file)

        def runtest():
            handler = None
            if "FERENDA_LOG_TEST" in os.environ:
                loglevel = {
                    "DEBUG": logging.DEBUG,
                    "INFO": logging.INFO,
                    "WARNING": logging.WARNING,
                    "ERROR": logging.ERROR,
                    "CRITICAL": logging.CRITICAL
                }.get(os.environ["FERENDA_LOG_TEST"], logging.INFO)
                logformat = "%(asctime)s %(name)s %(levelname)s %(message)s"
                datefmt = "%H:%M:%S"
                handler = logging.StreamHandler()
                handler.setLevel(loglevel)
                handler.setFormatter(
                    logging.Formatter(logformat, datefmt=datefmt))
                logger = logging.getLogger()
                logger.setLevel(loglevel)
                # shut some non-core loggers up
                for logname in [
                        'requests.packages.urllib3.connectionpool',
                        'rdflib.plugins.sleepycat',
                        'rdflib.plugins.parsers.pyRdfa',
                        'ferenda.thirdparty.patch'
                ]:
                    log = logging.getLogger(logname)
                    log.propagate = False
                logger.addHandler(handler)
            try:
                self.repo.parse(basefile)
            finally:
                # Detach the handler even when parse() raises, so that a
                # failing test does not leave it on the root logger.
                if handler is not None:
                    logger.removeHandler(handler)

        if "FERENDA_PROFILE_TEST" in os.environ:
            print("Profiling test")
            import cProfile
            cProfile.runctx("runtest()", globals(), locals(), sort="cumtime")
        else:
            runtest()
        if 'FERENDA_SET_TESTFILE' in os.environ:
            print("Overwriting '%s' with result of parse ('%s')" %
                  (xhtml_file, basefile))
            util.robust_rename(xhtml_file, xhtml_file + "~")
            shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
            return
        self.assertEqualXML(util.readfile(xhtml_file),
                            util.readfile(
                                self.repo.store.parsed_path(basefile)),
                            tidy_xhtml=True)
Beispiel #3
0
    def parse_test(self, downloaded_file, xhtml_file, docroot):
        """This test is run once for each basefile found in
        docroot/downloaded. It performs a full parse, and verifies that
        the resulting XHTML document is equal to the XHTML file placed in
        docroot/parsed/.

        Behaviour can be adjusted through environment variables:
        ``FERENDA_LOG_TEST`` attaches a stream handler to the root logger
        for the duration of the parse (its value names the log level,
        with INFO as the fallback), ``FERENDA_PROFILE_TEST`` runs the
        parse under cProfile, and ``FERENDA_SET_TESTFILE`` overwrites
        ``xhtml_file`` with the parse result (keeping the old file with a
        ``~`` suffix) instead of comparing against it.

        """
        # ``print`` is rebound below the profiling branch in the original
        # layout, which makes it a function-local name and turns the
        # earlier call into an UnboundLocalError; bind it up front.
        print = builtins.print
        basefile = self.filename_to_basefile(downloaded_file)

        def runtest():
            logger = logging.getLogger()
            handler = None
            if "FERENDA_LOG_TEST" in os.environ:
                loglevel = {
                    "DEBUG": logging.DEBUG,
                    "INFO": logging.INFO,
                    "WARNING": logging.WARNING,
                    "ERROR": logging.ERROR,
                    "CRITICAL": logging.CRITICAL
                    }.get(os.environ["FERENDA_LOG_TEST"], logging.INFO)
                logformat = "%(asctime)s %(name)s %(levelname)s %(message)s"
                datefmt = "%H:%M:%S"
                handler = logging.StreamHandler()
                handler.setLevel(loglevel)
                handler.setFormatter(logging.Formatter(logformat, datefmt=datefmt))
                logger.setLevel(loglevel)
                # shut some non-core loggers up
                for logname in ['requests.packages.urllib3.connectionpool',
                                'rdflib.plugins.sleepycat',
                                'rdflib.plugins.parsers.pyRdfa',
                                'ferenda.thirdparty.patch']:
                    log = logging.getLogger(logname)
                    log.propagate = False
                logger.addHandler(handler)
            try:
                self.repo.parse(basefile)
            finally:
                # always undo addHandler, also when the parse fails
                if handler is not None:
                    logger.removeHandler(handler)

        if "FERENDA_PROFILE_TEST" in os.environ:
            print("Profiling test")
            import cProfile
            cProfile.runctx("runtest()", globals(), locals(), sort="cumtime")
        else:
            runtest()
        if 'FERENDA_SET_TESTFILE' in os.environ:
            print("Overwriting '%s' with result of parse ('%s')" % (xhtml_file, basefile))
            util.robust_rename(xhtml_file, xhtml_file + "~")
            shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
            return
        self.assertEqualXML(util.readfile(xhtml_file),
                            util.readfile(self.repo.store.parsed_path(basefile)),
                            tidy_xhtml=True)
Beispiel #4
0
 def parse_test(self, downloaded_file, xhtml_file, docroot):
     """Parse one downloaded file and compare the result to ``xhtml_file``.

     While ``self.repo.parse`` runs, ``downloaded_path`` on the
     repository's document store class is patched to return
     ``downloaded_file``, so the test controls where the document is
     loaded from.

     When ``FERENDA_SET_TESTFILES`` is present in the environment, the
     parsed result replaces ``xhtml_file`` (the previous file is renamed
     with a trailing ``~``) and nothing is compared.
     """
     basefile = self.filename_to_basefile(downloaded_file)
     store_cls = self.repo.documentstore_class
     with patch.object(store_cls, 'downloaded_path',
                       return_value=downloaded_file):
         self.repo.parse(basefile)

     if 'FERENDA_SET_TESTFILES' not in os.environ:
         # normal mode: the parsed document must match the reference
         expected = util.readfile(xhtml_file)
         actual = util.readfile(self.repo.store.parsed_path(basefile))
         self.assertEqualXML(expected, actual)
         return

     # regeneration mode: keep a backup, then copy the fresh result over
     print("Overwriting %r with result of parse (%r)" % (xhtml_file, basefile))
     util.robust_rename(xhtml_file, xhtml_file + "~")
     shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
Beispiel #5
0
    def distill_test(self, downloaded_file, rdf_file, docroot):
        """This test is run once for each basefile found in
        docroot/downloaded. It performs a full parse, and verifies that
        the distilled RDF metadata is equal to the TTL files placed in
        docroot/distilled/.

        If the environment variable ``FERENDA_SET_TESTFILE`` is set, the
        reference file ``rdf_file`` is instead overwritten with a turtle
        serialization of the distilled graph (the previous file is kept
        as ``rdf_file + "~"``) and no comparison is made.

        """
        basefile = self.filename_to_basefile(downloaded_file)
        self.repo.parse(basefile)
        print = builtins.print
        if 'FERENDA_SET_TESTFILE' in os.environ:
            print("Overwriting '%s' with result of parse ('%s')" % (rdf_file, basefile))
            g = rdflib.Graph()
            g.parse(data=util.readfile(self.repo.store.distilled_path(basefile)))
            # Serialize before renaming so a serialization failure leaves
            # the reference file untouched. Depending on the rdflib
            # version serialize() returns bytes or str; the file is
            # opened in binary mode, so normalize to bytes.
            turtle = g.serialize(format="turtle")
            if not isinstance(turtle, bytes):
                turtle = turtle.encode("utf-8")
            util.robust_rename(rdf_file, rdf_file + "~")
            with open(rdf_file, "wb") as fp:
                fp.write(turtle)
            return
        self.assertEqualGraphs(rdf_file,
                               self.repo.store.distilled_path(basefile),
                               exact=False)
Beispiel #6
0
    def distill_test(self, downloaded_file, rdf_file, docroot):
        """This test is run once for each basefile found in
        docroot/downloaded. It performs a full parse, and verifies that
        the distilled RDF metadata is equal to the TTL files placed in
        docroot/distilled/.

        Setting the environment variable ``FERENDA_SET_TESTFILE`` switches
        to regeneration mode: ``rdf_file`` is replaced by a turtle
        serialization of the distilled graph, the old file is renamed to
        ``rdf_file + "~"``, and the comparison is skipped.

        """
        basefile = self.filename_to_basefile(downloaded_file)
        self.repo.parse(basefile)
        print = builtins.print
        if 'FERENDA_SET_TESTFILE' in os.environ:
            print("Overwriting '%s' with result of parse ('%s')" %
                  (rdf_file, basefile))
            g = rdflib.Graph()
            g.parse(
                data=util.readfile(self.repo.store.distilled_path(basefile)))
            # The target is opened in binary mode, but serialize() yields
            # str on some rdflib versions and bytes on others. Produce
            # the payload first (so errors happen before the old file is
            # moved away) and make sure it is bytes.
            payload = g.serialize(format="turtle")
            if not isinstance(payload, bytes):
                payload = payload.encode("utf-8")
            util.robust_rename(rdf_file, rdf_file + "~")
            with open(rdf_file, "wb") as fp:
                fp.write(payload)
            return
        self.assertEqualGraphs(rdf_file,
                               self.repo.store.distilled_path(basefile),
                               exact=False)
Beispiel #7
0
 def download_single(self, basefile, url):
     """Download a document, then fix its file extension if needed.

     After the superclass has downloaded the file, the first four bytes
     of the stored file are compared against a set of known signatures.
     If they identify the file as something other than PDF (``.wpd``,
     ``.doc``, ``.docx`` or ``.rtf``), the file is renamed so that its
     ``.pdf`` extension is replaced by the matching one. Files with an
     unknown signature are left alone and a warning is logged.

     :param basefile: identifier of the document to download
     :param url: location to download it from
     """
     super(ARN, self).download_single(basefile, url)
     # after downloading: see if our PDF in reality was something else
     # FIXME: we should do this prior to .download_if_needed...
     d = self.store.downloaded_path(basefile)
     if not os.path.exists(d):
         return
     # only the signature is needed; close the file before renaming it
     with open(d, "rb") as fp:
         sig = fp.read(4)
     doctype = {
         b'\xffWPC': ".wpd",
         b'\xd0\xcf\x11\xe0': ".doc",
         b'PK\x03\x04': ".docx",
         b'{\\rt': ".rtf",
         b'%PDF': ".pdf",
     }.get(sig)
     if doctype is None:
         self.log.warning(
             "%s has unknown signature %r -- don't know what kind of file it is" % (d, sig))
         return  # don't do anything
     # Swap only the trailing extension. A plain str.replace would also
     # rewrite any ".pdf" occurring earlier in the path (for example in
     # a directory name), and would rename the file onto itself when
     # the path has no ".pdf" at all.
     if doctype != '.pdf' and d.endswith(".pdf"):
         util.robust_rename(d, d[:-len(".pdf")] + doctype)
Beispiel #8
0
 def test_robust_rename(self):
     """Call ``util.robust_rename`` while ``shutil.move`` raises IOError.

     Both files are created first; the test contains no explicit
     assertion, so it passes as long as the call does not raise.
     """
     for path in (self.fname, self.fname2):
         util.writefile(path, "Hello")
     # only the IOError branch is exercised here
     failing_move = patch('ferenda.util.shutil.move', side_effect=IOError)
     with failing_move:
         util.robust_rename(self.fname, self.fname2)