def test_xml_metadata_document_is_missing(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents.  The first landing
        page document references an XML document that is not present.  The 2nd
        document is good.

        EXPECTED RESULT:  The log records the XML retrieval failure,
        but also the successful ingest of the 2nd document.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1 (fails)
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_binary('tests.data.arm', 'sitemap2.xml'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            b'',
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm', 'nsanimfraod1michC2.c1.fixed.xml'),
        ]
        status_codes = [200, 200, 400, 200, 200]
        headers = [
            {
                'Content-Type': 'text/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/xml'
            },
        ]
        z = zip(contents, status_codes, headers)
        for content, status_code, headers in z:
            aioresp_mocker.get(self.pattern,
                               body=content,
                               status=status_code,
                               headers=headers)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=url)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            self.assertErrorLogMessage(cm.output, "Bad Request")
            self.assertSuccessfulDebugIngest(cm.output)
    def test__max_num_errors(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references several documents, one of which
        has invalid XML.  The max_num_errors setting has been lowered to one,
        which should trigger a shutdown.

        EXPECTED RESULT:  We should stop processing as soon as that first
        error is encountered.  There should be an INFO shutdown message.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap2.xml'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.invalid.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm',
                         'nsanimfraod1michC2.c1.fixed.xml'),  # noqa: E501
        ]
        status_codes = [200, 200, 200, 200, 200]
        headers = [
            {
                'Content-Type': 'text/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/xml'
            },
        ]
        z = zip(contents, status_codes, headers)
        for content, status_code, headers in z:
            aioresp_mocker.get(self.pattern,
                               body=content,
                               status=status_code,
                               headers=headers)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=url, max_num_errors=1)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            self.assertInfoLogMessage(cm.output, 'Shutting down')
    def test_limit_number_of_documents(self, aioresp_mocker):
        """
        SCENARIO:  We do not wish to go through the entire list of documents,
        so a limit is specified.  There are 3 records in the sitemap, only 2
        are to be processed.

        EXPECTED RESULT:  Three records are detected in the sitemap, but the
        log shows only two were processed.
        """
        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_binary('tests.data.arm', 'sitemap3.xml'),
            ir.read_binary('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_binary('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.xml'),
            ir.read_binary('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_binary('tests.data.arm', 'nsasondewnpnS01.b1.fixed.xml'),
        ]
        headers = [
            {
                'Content-Type': 'text/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/xml'
            },
        ]
        for content, headers in zip(contents, headers):
            aioresp_mocker.get(self.pattern, body=content, headers=headers)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=url, num_documents=2)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            # Only two records processed.
            self.assertSuccessfulDebugIngest(cm.output, n=2)

        # And just to show, there are 3 URLs in the sitemap.
        doc = lxml.etree.parse(io.BytesIO(contents[0]))
        urls = doc.xpath('.//sm:loc/text()',
                         namespaces=schema_org.core.SITEMAP_NS)
        self.assertEqual(len(urls), 3)
    def test_metadata_document_does_not_validate(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents.  The first landing
        page document references an XML document that does not validate.  The
        2nd document is good.

        EXPECTED RESULT:  The successful ingest of the 2nd document is
        reflected in the log.  The validation failure is also reflected,
        specifically the identity of the CI_Responsibility element.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap2.xml'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm', 'nsanimfraod1michC2.c1.fixed.xml'),
        ]
        headers = [
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
        ]
        for header, content in zip(headers, contents):
            aioresp_mocker.get(self.pattern, headers=header, body=content)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=url)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            expected_msgs = [
                'XML document does not validate', 'CI_ResponsibleParty'
            ]
            self.assertLogMessage(cm.output, expected_msgs)
            self.assertSuccessfulDebugIngest(cm.output)
    def test_metadata_document_is_invalid_xml(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents.  The first landing
        page document references an XML document that is not valid XML.  The
        2nd document is good.

        EXPECTED RESULT:  The log records the invalid XML for the first
        document, but also the successful ingest of the 2nd document.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap2.xml'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.invalid.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm',
                         'nsanimfraod1michC2.c1.fixed.xml'),  # noqa: E501
        ]
        headers = [
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
        ]
        for header, content in zip(headers, contents):
            aioresp_mocker.get(self.pattern, headers=header, body=content)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=url)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            self.assertLogMessage(cm.output, 'XMLMetadataParsingError')
            self.assertSuccessfulDebugIngest(cm.output)
    def test_landing_page_is_not_present(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents.  The first landing
        page document that is not present, but the next one is fine.

        EXPECTED RESULT:  The log record reflects the successful calls, but
        also failure to retrieve the HTML.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1 (fails)
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap2.xml'),
            b'',
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.xml'),
        ]
        headers = [
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
        ]
        status_codes = [200, 400, 200, 200]

        for content, header, status in zip(contents, headers, status_codes):
            aioresp_mocker.get(self.pattern,
                               body=content,
                               headers=header,
                               status=status)

        sitemap = 'https://www.archive.arm.gov/metadata/adc/sitemap_not.xml'
        obj = D1CheckSitemap(sitemap_url=sitemap)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            self.assertErrorLogMessage(cm.output, "Bad Request")
            self.assertSuccessfulDebugIngest(cm.output)
    def test_jsonld_script_elemement_is_not_valid_json(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents.  The first landing
        page document has a JSON-LD script element that is not valid JSON-LD.
        The 2nd document is good.

        EXPECTED RESULT:  The log records the JSON-LD failure, but also the
        successful ingest of the 2nd document.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1 (no XML document can be accessed)
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap2.xml'),
            ir.read_text(
                'tests.data.arm',
                'nsaqcrad1longC2.c2.invalid_jsonld.html'),  # noqa: E501
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm', 'nsanimfraod1michC2.c1.fixed.xml'),
        ]
        headers = [
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
        ]

        for header, content in zip(headers, contents):
            aioresp_mocker.get(self.pattern, headers=header, body=content)

        sitemap = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=sitemap)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            self.assertErrorLogMessage(cm.output, 'JSONDecodeError')
            self.assertSuccessfulDebugIngest(cm.output)
    def test_sitemap_is_gzipped(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents, both of which are
        good.  The sitemap is gzipped.

        EXPECTED RESULT:  The successful ingest of both documents is logged.
        """

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_binary('tests.data.arm', 'sitemap2.xml.gz'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm', 'nsanimfraod1michC2.c1.fixed.xml'),
        ]
        headers = [
            {
                'Content-Type': 'application/x-gzip'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
        ]

        for header, content in zip(headers, contents):
            aioresp_mocker.get(self.pattern, headers=header, body=content)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.xml'
        obj = D1CheckSitemap(sitemap_url=url)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())
            self.assertSuccessfulDebugIngest(cm.output, n=2)
    def test_no_site_map(self, aioresp_mocker):
        """
        SCENARIO:  The given URL for the sitemap does not seem to exist.

        EXPECTED RESULT:  Errors are recorded.
        """
        aioresp_mocker.get(self.pattern, status=400)

        sitemap = 'https://www.archive.arm.gov/metadata/adc/sitemap.txt'
        obj = D1CheckSitemap(sitemap_url=sitemap, verbosity='DEBUG')
        with self.assertLogs(logger=obj.logger, level='INFO') as cm:
            asyncio.run(obj.run())

            msgs = [SITEMAP_RETRIEVAL_FAILURE_MESSAGE, 'ClientResponseError']
            self.assertLogMessage(cm.output, msgs)
    def test_site_map_is_not_xml(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap document is not XML.

        EXPECTED RESULT:  There is an error stating that the URL may not be
        XML, plus an XMLSyntaxError is raised.
        """
        content = ir.read_text('tests.data.arm', 'sitemap.txt')
        headers = {'Content-Type': 'text/plain'}
        aioresp_mocker.get(self.pattern, body=content, headers=headers)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap.txt'
        obj = D1CheckSitemap(sitemap_url=url)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            # Verify the warning about the sitemap possibly not being XML.
            self.assertLogMessage(cm.output,
                                  SITEMAP_NOT_XML_MESSAGE,
                                  level='WARNING')

            # Verify the exception caught when trying to parse the sitemap.
            self.assertLogMessage(cm.output, 'XMLSyntaxError', level='ERROR')
    def test_multiple_workers(self, aioresp_mocker):
        """
        SCENARIO:  The sitemap references two documents, both of which are
        good.  More than one worker is employed.  Warning, this test seems
        brittle.

        EXPECTED RESULT:  The successful ingest of both documents is logged.
        Tasks can be seen as being created for both workers.  There should be
        a log message stating that two records were successfully processed.
        """

        # External calls to read the:
        # Have to have a lot of documents because just two might not be enough
        # to get more than one worker involved.
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #   3) HTML document for record 2
        #   4) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap2.xml'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm', 'nsanimfraod1michC2.c1.fixed.xml'),
        ]
        headers = [
            {
                'Content-Type': 'application/x-gzip'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
            {
                'Content-Type': 'text/html'
            },
            {
                'Content-Type': 'application/xml'
            },
        ]
        for header, content in zip(headers, contents):
            aioresp_mocker.get(self.pattern, headers=header, body=content)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap2.xml'
        obj = D1CheckSitemap(sitemap_url=url, num_workers=2)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())
            self.assertSuccessfulDebugIngest(cm.output, n=2)

            # If there was more than one worker, then there should be messages
            # logged that have the strings "consume(0)" and "consume(1)".
            expected_msgs = [
                f"create task for sitemap_consumer[{idx}]" for idx in range(2)
            ]
            self.assertLogMessage(cm.output, expected_msgs, level='DEBUG')

            expected = 'Successfully processed 2 records'
            self.assertInfoLogMessage(cm.output, expected)
    def test_sitemap_url_is_sitemap_index_file(self, aioresp_mocker):
        """
        SCENARIO:  The URL given is actually a sitemap index that references
        two sitemaps.  Both sitemaps have two documents that are both good.

        EXPECTED RESULT:  The successful ingest of all four documents is
        logged.
        """

        # External calls to read the:
        #
        #   1) sitemap index file
        #   2) sitemap 1
        #   3) HTML document for record 1
        #   4) XML document for record 1
        #   5) HTML document for record 2
        #   6) XML document for record 2
        #   7) sitemap 2
        #   8) HTML document for record 1
        #   9) XML document for record 1
        #   10) HTML document for record 2
        #   11) XML document for record 2
        #
        contents = [
            ir.read_text('tests.data.arm', 'sitemap_index_file.xml'),
            ir.read_binary('tests.data.arm', 'sitemap2.xml.gz'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm',
                         'nsanimfraod1michC2.c1.fixed.xml'),  # noqa: E501
            ir.read_binary('tests.data.arm', 'sitemap2.xml.gz'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.html'),
            ir.read_text('tests.data.arm', 'nsaqcrad1longC2.c2.fixed.xml'),
            ir.read_text('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_text('tests.data.arm',
                         'nsanimfraod1michC2.c1.fixed.xml'),  # noqa: E501
        ]
        header_types = [
            'application/xml',
            'application/x-gzip',
            'text/html',
            'application/xml',
            'text/html',
            'application/xml',
            'application/x-gzip',
            'text/html',
            'application/xml',
            'text/html',
            'application/xml',
        ]
        for header, content in zip(header_types, contents):
            aioresp_mocker.get(self.pattern,
                               headers={'Content-Type': header},
                               body=content)

        url = 'https://www.archive.arm.gov/metadata/adc/sitemap_index_file.xml'
        obj = D1CheckSitemap(sitemap_url=url)

        with self.assertLogs(logger=obj.logger, level='DEBUG') as cm:
            asyncio.run(obj.run())

            self.assertSuccessfulDebugIngest(cm.output, n=4)