Exemple #1
0
    def test_metadata_document_retrieval_failure(self, mock_harvest_time):
        """
        SCENARIO:  The XML metadata document retrieval fails.

        EXPECTED RESULT:  The failure count goes up by one.
        """

        mock_harvest_time.return_value = '1900-01-01T00:00:00Z'

        harvester = ARMHarvester(host=self.host, port=self.port)

        failed_count = harvester.failed_count

        # External calls to read the:
        #
        #   1) sitemap
        #   2) HTML document for record 1
        #   3) XML document for record 1
        #
        contents = [
            ir.read_binary('tests.data.arm', 'sitemap-1.xml'),
            ir.read_binary('tests.data.arm', 'nsanimfraod1michC2.c1.fixed.html'),  # noqa: E501
            ir.read_binary('tests.data.arm', 'nsanimfraod1michC2.c1.xml'),
        ]
        headers = [
            {'Content-Type': 'application/xml'},
            {'Content-Type': 'text/html'},
            {'Content-Type': 'application/xml'},
        ]
        status_codes = [200, 200, 400]

        with aioresponses() as m:
            z = zip(contents, headers, status_codes)
            for content, header, status_code in z:
                m.get(self.regex, body=content, headers=header,
                      status=status_code)

            with self.assertLogs(logger=harvester.logger, level='DEBUG') as cm:
                asyncio.run(harvester.run())

                self.assertErrorLogMessage(cm.output, "Bad Request")

        self.assertEqual(harvester.failed_count, failed_count + 1)
Exemple #2
0
    def test_load_run_with_jsonld_lasmod_prior_to_last_harvest(
        self,
        mock_harvest_time,
        mock_check_if_identifier_exists,
        mock_update_science_metadata,
        mock_load_science_metadata
    ):
        """
        SCENARIO:  We encounter a document where the sitemap last modification
        time is after the last harvest time, but the JSON-LD lastmod time is
        before the last harvest time.  We preferentially take the JSON-LD time.

        EXPECTED RESULT:  The document is not harvested.
        """

        last_harvest_time = dt.datetime(2019, 1, 1)
        mock_harvest_time.return_value = last_harvest_time.strftime(DATETIME_FORMAT)  # noqa : E501

        mock_check_if_identifier_exists.return_value = {
            'outcome': 'yes',
            'record_date': last_harvest_time,
            'current_version_id': 1,
        }

        # None of these should actually be called.
        mock_update_science_metadata.return_value = True
        mock_load_science_metadata.return_value = True

        # Setup the sitemap to show that the lastmod time there is after the
        # last harvest time.  This should normally cause the document to be
        # harvested.
        data = """
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
          <url>
            <loc>https://www.archive.arm.gov/metadata/adc/html/nsasondewnpnS01.b1.html</loc>
            <lastmod>2019-01-02</lastmod>
          </url>
        </urlset>
        """
        sitemap_content = data.encode('utf-8')

        # External calls to read the:
        #
        #   1) sitemap
        #   2) Remote HTML document for record 1
        #   3) Remote XML document for record 1
        #
        contents = [
            sitemap_content,
            ir.read_binary('tests.data.arm', 'nsasondewnpnS01.b1.fixed.lastmod_before_harvest.html'),  # noqa : E501
            ir.read_binary('tests.data.arm', 'nsasondewnpnS01.b1.fixed.xml'),
        ]
        status_codes = [200, 200, 200]
        headers = [
            {'Content-Type': 'application/xml'},
            {'Content-Type': 'text/html'},
            {'Content-Type': 'application/xml'},
        ]
        regex = [
            re.compile('https://www.archive.arm.gov/metadata/adc'),
            re.compile('https://www.archive.arm.gov/metadata/adc'),
            re.compile('https://www.archive.arm.gov/metadata/adc'),
        ]

        with aioresponses() as m:
            z = zip(regex, contents, status_codes, headers)
            for regex, content, status_code, headers in z:
                m.get(regex, body=content, status=status_code, headers=headers)

            harvester = ARMHarvester(host=self.host, port=self.port)

            with self.assertLogs(logger=harvester.logger, level='DEBUG'):
                asyncio.run(harvester.run())

        self.assertEqual(mock_check_if_identifier_exists.call_count, 1),

        # These are the critical ones.  None of them should have been called.
        self.assertEqual(mock_load_science_metadata.call_count, 0),
        self.assertEqual(mock_update_science_metadata.call_count, 0),
Exemple #3
0
    def test_load_run(self,
                      mock_harvest_time,
                      mock_check_if_identifier_exists,
                      mock_update_science_metadata,
                      mock_load_science_metadata):
        """
        SCENARIO:  We encounter a document that has not yet been harvested.

        EXPECTED RESULT:  The document is loaded for the first time, not
        updated.  The update occurs with the PID set to the checksum of the
        XML document and the SID set to the DOI.
        """

        mock_harvest_time.return_value = '1900-01-01T00:00:00Z'
        mock_check_if_identifier_exists.return_value = {'outcome': 'no'}
        mock_update_science_metadata.return_value = True
        mock_load_science_metadata.return_value = True

        harvester = ARMHarvester(host=self.host, port=self.port)

        # External calls to read the:
        #
        #   1) sitemap
        #   2) Remote HTML document for record 1
        #   3) Remote XML document for record 1
        #   4) Existing XML document for record 1 (retrieved from the member
        #      node)
        #
        contents = [
            ir.read_binary('tests.data.arm', 'sitemap-1.xml'),
            ir.read_binary('tests.data.arm', 'nsasondewnpnS01.b1.fixed.html'),
            ir.read_binary('tests.data.arm', 'nsasondewnpnS01.b1.fixed.xml'),
        ]
        status_codes = [200, 200, 200]
        headers = [
            {'Content-Type': 'application/xml'},
            {'Content-Type': 'text/html'},
            {'Content-Type': 'application/xml'},
        ]
        regex = [
            re.compile('https://www.archive.arm.gov/metadata/adc'),
            re.compile('https://www.archive.arm.gov/metadata/adc'),
            re.compile('https://www.archive.arm.gov/metadata/adc'),
        ]

        with aioresponses() as m:
            z = zip(regex, contents, status_codes, headers)
            for regex, content, status_code, headers in z:
                m.get(regex, body=content, status=status_code, headers=headers)

            with self.assertLogs(logger=harvester.logger, level='DEBUG'):
                asyncio.run(harvester.run())

        self.assertEqual(mock_load_science_metadata.call_count, 1),
        self.assertEqual(mock_update_science_metadata.call_count, 0),

        # Verify the PID and SID
        args, kwargs = mock_load_science_metadata.call_args_list[0]

        # Verify the PID
        actual = kwargs['system_metadata'].identifier.value()
        expected = 'b96feb9f87705bb03d466ad44289cb11'
        self.assertEqual(actual, expected)

        # Verify the SID
        actual = kwargs['system_metadata'].seriesId.value()
        expected = 'doi:10.5439/1021460'
        self.assertEqual(actual, expected)