Example #1
0
    def _run_test(self, extr, url, result):
        """Run a single extractor test case and check its expectations.

        Args:
            extr: Extractor class that is expected to be chosen for ``url``.
            url: URL passed to ``job.TestJob``.
            result: Mapping of expectations, or a falsy value when only the
                extractor selection should be verified.  Keys read here:
                ``options`` (iterable of ``(dotted_key, value)`` pairs),
                ``exception``, ``url``, ``keyword``, ``content``,
                ``count`` and ``pattern``.
        """
        if result:
            if "options" in result:
                # Apply per-test config overrides before the job is created.
                for key, value in result["options"]:
                    config.set(key.split("."), value)
            content = "content" in result
        else:
            content = False

        tjob = job.TestJob(url, content=content)
        self.assertEqual(extr, tjob.extractor.__class__)

        if not result:
            return
        if "exception" in result:
            # The job itself is expected to fail; nothing else to verify.
            self.assertRaises(result["exception"], tjob.run)
            return

        tjob.run()
        if "url" in result:
            self.assertEqual(result["url"], tjob.hash_url.hexdigest())
        if "keyword" in result:
            self.assertEqual(result["keyword"], tjob.hash_keyword.hexdigest())
        if "content" in result:
            self.assertEqual(result["content"], tjob.hash_content.hexdigest())
        if "count" in result:
            self.assertEqual(len(tjob.urllist), int(result["count"]))
        if "pattern" in result:
            # Without this guard an empty URL list would make the loop
            # below succeed without checking anything.
            self.assertGreater(len(tjob.urllist), 0)
            for found_url in tjob.urllist:
                self.assertRegex(found_url, result["pattern"])
Example #2
0
    def _run_test(self, extr, url, result):
        """Execute one extractor test case and verify the collected data.

        Args:
            extr: Extractor class that must be selected for ``url``.
            url: URL passed to ``job.TestJob``.
            result: Mapping of expectations, or a falsy value when only the
                extractor selection is checked.  Keys read here:
                ``options``, ``range``, ``exception``, ``url``, ``content``,
                ``keyword``, ``count`` and ``pattern``.
        """
        wants_content = False
        if result:
            if "options" in result:
                for option_path, option_value in result["options"]:
                    config.set(option_path.split("."), option_value)
            if "range" in result:
                # The same range applies to images and chapters alike.
                for range_key in ("image-range", "chapter-range"):
                    config.set((range_key, ), result["range"])
            wants_content = "content" in result

        tjob = job.TestJob(url, content=wants_content)
        self.assertEqual(extr, tjob.extractor.__class__)

        if not result:
            return

        if "exception" in result:
            self.assertRaises(result["exception"], tjob.run)
            return

        try:
            tjob.run()
        except exception.StopExtraction:
            pass
        except exception.HttpError as exc:
            # Errors whose message starts with a 5xx code skip the test;
            # any other HTTP error is re-raised.
            if re.match(r"5\d\d: ", str(exc)):
                self.skipTest(exc)
            raise

        # archive IDs must be unique
        archive_ids = tjob.list_archive
        self.assertEqual(len(set(archive_ids)), len(archive_ids))

        # compare extraction results with the expectations
        if "url" in result:
            self.assertEqual(result["url"], tjob.hash_url.hexdigest())

        if "content" in result:
            self.assertEqual(result["content"], tjob.hash_content.hexdigest())

        if "keyword" in result:
            expected_kw = result["keyword"]
            if not isinstance(expected_kw, dict):
                # anything that is not a dict is treated as a SHA1 hash
                self.assertEqual(expected_kw, tjob.hash_keyword.hexdigest())
            else:
                for kwdict in tjob.list_keyword:
                    self._test_kwdict(kwdict, expected_kw)

        if "count" in result:
            expected_count = result["count"]
            url_total = len(tjob.list_url)
            if isinstance(expected_count, str):
                # string form: "<comparison operator> <integer>"
                self.assertRegex(
                    expected_count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$")
                expr = "{} {}".format(url_total, expected_count)
                self.assertTrue(eval(expr), msg=expr)
            else:
                # anything else is treated as an integer
                self.assertEqual(url_total, expected_count)

        if "pattern" in result:
            self.assertGreater(len(tjob.list_url), 0)
            regex = result["pattern"]
            for found_url in tjob.list_url:
                self.assertRegex(found_url, regex)
Example #3
0
def main():
    """Run test jobs for the requested URLs and print their test data.

    Command-line arguments:
        --content: forwarded to ``job.TestJob`` as ``content``.
        --recreate: treat the positional arguments as extractor categories
            and use the first element of each of their ``test`` entries
            as the URL list.
        urls: URLs to process (or categories when ``--recreate`` is set).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--content", action="store_true")
    parser.add_argument("--recreate", action="store_true")
    parser.add_argument("urls", nargs="*")
    args = parser.parse_args()

    if not args.recreate:
        urls = args.urls
    else:
        urls = []
        for extr in extractor.extractors():
            if extr.category in args.urls:
                urls.extend(test[0] for test in extr.test)

    config.load()
    for url in urls:
        tjob = job.TestJob(url, content=args.content)
        try:
            tjob.run()
        except Exception as exc:
            # A failing job is recorded by the name of its exception class.
            fmt = TESTDATA_EXCEPTION_FMT
            data = (exc.__class__.__name__,)
        else:
            fmt = TESTDATA_FMT
            digests = (tjob.hash_url, tjob.hash_keyword, tjob.hash_content)
            data = tuple(digest.hexdigest() for digest in digests)
        print(tjob.extractor.__class__.__name__)
        print(fmt.format(url, *data))
Example #4
0
    def _run_test(self, extr, url, result):
        """Run a single extractor test case and check its expectations.

        Args:
            extr: Extractor class that is expected to be chosen for ``url``.
            url: URL passed to ``job.TestJob``.
            result: Mapping of expectations, or a falsy value when only the
                extractor selection should be verified.  Keys read here:
                ``options`` (iterable of ``(dotted_key, value)`` pairs),
                ``exception``, ``url``, ``content``, ``keyword`` (dict or
                hash string), ``count`` (integer or comparison string such
                as ``">= 5"``) and ``pattern``.

        Raises:
            exception.HttpError: Re-raised when the job fails with an HTTP
                error that is not identified as a 5xx response.
        """
        if result:
            if "options" in result:
                # Apply per-test config overrides before the job is created.
                for key, value in result["options"]:
                    config.set(key.split("."), value)
            content = "content" in result
        else:
            content = False

        tjob = job.TestJob(url, content=content)
        self.assertEqual(extr, tjob.extractor.__class__)

        if not result:
            return
        if "exception" in result:
            # The job itself is expected to fail; nothing else to verify.
            self.assertRaises(result["exception"], tjob.run)
            return

        try:
            tjob.run()
        except exception.HttpError as exc:
            try:
                # A 5xx status skips the test instead of failing it.
                if 500 <= exc.args[0].response.status_code < 600:
                    self.skipTest(exc)
            except (AttributeError, IndexError):
                # No args, or the first arg carries no response/status:
                # fall through and re-raise the original HTTP error.
                pass
            raise

        # test archive-id uniqueness
        self.assertEqual(len(set(tjob.list_archive)), len(tjob.list_archive))

        # test extraction results
        if "url" in result:
            self.assertEqual(result["url"], tjob.hash_url.hexdigest())

        if "content" in result:
            self.assertEqual(result["content"], tjob.hash_content.hexdigest())

        if "keyword" in result:
            keyword = result["keyword"]
            if isinstance(keyword, dict):
                for kwdict in tjob.list_keyword:
                    self._test_kwdict(kwdict, keyword)
            else:  # assume SHA1 hash
                self.assertEqual(keyword, tjob.hash_keyword.hexdigest())

        if "count" in result:
            count = result["count"]
            if isinstance(count, str):
                # The regex limits ``count`` to one comparison operator and
                # an integer, so the eval() below only ever sees
                # "<number> <operator> <number>".
                self.assertRegex(count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$")
                expr = "{} {}".format(len(tjob.list_url), count)
                self.assertTrue(eval(expr), msg=expr)
            else:  # assume integer
                self.assertEqual(len(tjob.list_url), count)

        if "pattern" in result:
            # Without this guard an empty URL list would make the loop
            # below succeed without checking anything.
            self.assertGreater(len(tjob.list_url), 0)
            for found_url in tjob.list_url:
                self.assertRegex(found_url, result["pattern"])
Example #5
0
 def _run_test(self, extr, url, result):
     """Run one extractor test case and compare its digests.

     Args:
         extr: Extractor class that must be selected for ``url``.
         url: URL passed to ``job.TestJob``.
         result: Mapping of expectations.  Keys read here: ``exception``,
             ``url``, ``keyword`` and ``content``.
     """
     check_content = "content" in result
     tjob = job.TestJob(url, check_content)
     self.assertEqual(extr, tjob.extractor.__class__)

     if "exception" in result:
         self.assertRaises(result["exception"], tjob.run)
         return

     tjob.run()
     # Each expectation key "<name>" is checked against tjob.hash_<name>.
     for name in ("url", "keyword", "content"):
         if name in result:
             digest = getattr(tjob, "hash_" + name)
             self.assertEqual(digest.hexdigest(), result[name])
Example #6
0
    def _run_test(self, extr, url, result):
        """Run a single extractor test case and check its expectations.

        Args:
            extr: Extractor class that is expected to be chosen for ``url``.
            url: URL passed to ``job.TestJob``.
            result: Mapping of expectations, or a falsy value when only the
                extractor selection should be verified.  Keys read here:
                ``options`` (iterable of ``(dotted_key, value)`` pairs),
                ``exception``, ``url``, ``keyword``, ``content``,
                ``count`` and ``pattern``.

        Raises:
            exception.HttpError: Re-raised when the job fails with an HTTP
                error that is not identified as a 5xx response.
        """
        if result:
            if "options" in result:
                # Apply per-test config overrides before the job is created.
                for key, value in result["options"]:
                    config.set(key.split("."), value)
            content = "content" in result
        else:
            content = False

        tjob = job.TestJob(url, content=content)
        self.assertEqual(extr, tjob.extractor.__class__)

        if not result:
            return
        if "exception" in result:
            # The job itself is expected to fail; nothing else to verify.
            self.assertRaises(result["exception"], tjob.run)
            return

        try:
            tjob.run()
        except exception.HttpError as exc:
            try:
                # A 5xx status skips the test instead of failing it.
                if 500 <= exc.args[0].response.status_code < 600:
                    self.skipTest(exc)
            except (AttributeError, IndexError):
                # No args, or the first arg carries no response/status:
                # fall through and re-raise the original HTTP error.
                pass
            raise

        if "url" in result:
            self.assertEqual(result["url"], tjob.hash_url.hexdigest())
        if "keyword" in result:
            self.assertEqual(result["keyword"], tjob.hash_keyword.hexdigest())
        if "content" in result:
            self.assertEqual(result["content"], tjob.hash_content.hexdigest())
        if "count" in result:
            self.assertEqual(len(tjob.urllist), int(result["count"]))
        if "pattern" in result:
            # Without this guard an empty URL list would make the loop
            # below succeed without checking anything.
            self.assertGreater(len(tjob.urllist), 0)
            for found_url in tjob.urllist:
                self.assertRegex(found_url, result["pattern"])