Esempio n. 1
0
    def test_retrieve_record__bad_metadata_document(self):
        """
        SCENARIO:  We have a URL for a landing page for a PUBLISHED document.
        The metadata document inside the zip archive, however, is not XML.

        EXPECTED RESULT:  retrieve_record raises an XMLMetadataParsingError.
        """
        url = ('https://www.hydroshare.org'
               '/resource/81e947faccf04de59392dddaac77bc75/')

        # External I/O, mocked below in this order:
        #
        # 1st:  landing page
        # 2nd:  zip archive containing data and metadata
        package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        landing_page = ir.read_text(package, 'landing_page.html')

        # Switch out the metadata document for something that is NOT xml.
        # The context manager closes the archive even if writestr raises.
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode='w') as zf:
            zf.writestr(
                '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                b'not xml'
            )
        zip_archive = buffer.getvalue()

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='INFO'):
            with aioresponses() as m:
                m.get(self.regex, body=landing_page)
                m.get(self.regex, body=zip_archive)

                with self.assertRaises(XMLMetadataParsingError):
                    asyncio.run(harvester.retrieve_record(url))
Esempio n. 2
0
    def test_landing_page_is_published(self):
        """
        SCENARIO:  A landing page indicates that it is published.  This
        matters because only PUBLISHED documents are to be harvested.

        EXPECTED RESULT:  The landing page is parsed and returned; it
        serializes identically to the same HTML parsed directly with lxml.
        """
        html = ir.read_text(
            'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
            'landing_page.html'
        )

        harvester = CUAHSIHarvester()

        expected_doc = lxml.etree.HTML(html)

        with self.assertLogs(logger=harvester.logger, level='DEBUG'), \
                aioresponses() as m:
            # The mocked GET answers with the canned page as text/html.
            m.get(self.regex, body=html,
                  headers={'Content-Type': 'text/html'})

            actual_doc = asyncio.run(
                harvester.retrieve_landing_page_content(
                    'https://www.hydroshare.org/something.html'
                )
            )

        self.assertEqual(lxml.etree.tostring(actual_doc),
                         lxml.etree.tostring(expected_doc))
Esempio n. 3
0
    def test_retrieve_record__no_url_for_zip_archive(self):
        """
        SCENARIO:  The landing page belongs to a PUBLISHED document, yet it
        carries no proper URL for the bagit zip archive.  This does occur in
        practice.

        EXPECTED RESULT:  retrieve_record raises a SkipError.
        """
        record_url = ('https://www.hydroshare.org'
                      '/resource/81e947faccf04de59392dddaac77bc75/')

        # Only one external request is mocked:  the landing page itself.
        landing_page = ir.read_text(
            'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
            'landing_page.no_zip_url.html'
        )

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='INFO'), \
                aioresponses() as m:
            m.get(self.regex, body=landing_page)

            with self.assertRaises(SkipError):
                asyncio.run(harvester.retrieve_record(record_url))
Esempio n. 4
0
    def test__harvest__but_landing_page_has_no_json_ld(self,
                                                       harvest_time_mocker):
        """
        SCENARIO:  We have a valid sitemap but one of the landing pages does
        not have any JSON-LD.

        EXPECTED RESULT:  A SkipError is issued and caught.  The log shows
        zero records processed, a warning about the missing JSON-LD, and an
        error mentioning the SkipError.

        harvest_time_mocker is presumably injected by a patch decorator that
        sits outside this block; its return value is set to a fixed
        timestamp.
        """

        harvest_time_mocker.return_value = '1900-01-01T00:00:00Z'

        host, port = 'www.hydroshare.org', 443
        harvester = CUAHSIHarvester(host=host, port=port)

        # One (body, status, headers) tuple per external call, in the order
        # the calls are mocked.  Keeping each response together avoids
        # parallel lists and a loop variable that shadows one of them.
        record_pkg = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        responses = [
            # 1) sitemap
            (
                ir.read_binary(
                    'tests.data.cuahsi',
                    'sitemap.81e947faccf04de59392dddaac77bc75.xml'
                ),
                200,
                {'Content-Type': 'application/xml'},
            ),
            # 2) HTML document for record 1
            (
                ir.read_binary(record_pkg, 'landing_page.no_json_ld.html'),
                200,
                {'Content-Type': 'application/html'},
            ),
        ]

        with aioresponses() as m:
            for body, status, response_headers in responses:
                m.get(self.regex,
                      body=body,
                      status=status,
                      headers=response_headers)

            with self.assertLogs(logger=harvester.logger, level='DEBUG') as cm:
                asyncio.run(harvester.run())

                self.assertInfoLogMessage(
                    cm.output, "Successfully processed 0 records."
                )
                self.assertWarningLogMessage(
                    cm.output, "No JSON-LD <SCRIPT> elements were located"
                )
                self.assertErrorLogMessage(cm.output, "SkipError")
Esempio n. 5
0
    def test_landing_page_is_not_published(self):
        """
        SCENARIO:  The landing page states that the document is not
        published.

        EXPECTED RESULT:  preprocess_landing_page raises a SkipError.
        """
        html = ir.read_text(
            'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
            'landing_page.not_published.html'
        )
        landing_page = lxml.etree.HTML(html)

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='DEBUG'), \
                self.assertRaises(SkipError):
            harvester.preprocess_landing_page(landing_page)
Esempio n. 6
0
    def test_landing_page_jsonld(self):
        """
        SCENARIO:  The landing page contains a JSON-LD <SCRIPT> element.

        EXPECTED RESULT:  get_jsonld returns something that can be
        serialized as JSON.
        """
        html = ir.read_text(
            'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
            'landing_page.html'
        )
        landing_page = lxml.etree.HTML(html)

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='DEBUG'):
            jsonld = harvester.get_jsonld(landing_page)

        # Serializing is the check:  json.dumps raises if the result is not
        # JSON-compatible.
        json.dumps(jsonld)
Esempio n. 7
0
    def test_landing_page_is_missing_jsonld(self):
        """
        SCENARIO:  The landing page contains no JSON-LD <SCRIPT> element.

        EXPECTED RESULT:  get_jsonld raises a JsonLdError.  Many CUAHSI
        documents don't have JSON-LD, and we don't want that counting
        against the failure count.

        NOTE(review):  this test was originally described as expecting a
        SkipError, but the assertion is on JsonLdError.  Presumably the two
        are related (subclass, or converted further up) -- confirm.
        """
        html = ir.read_text(
            'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
            'landing_page.no_json_ld.html'
        )
        landing_page = lxml.etree.HTML(html)

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='DEBUG'), \
                self.assertRaises(JsonLdError):
            harvester.get_jsonld(landing_page)
Esempio n. 8
0
    def test_landing_page_does_not_have_sharing_status_element(self):
        """
        SCENARIO:  The landing page lacks a sharing status element.  That
        element is required because only PUBLISHED documents are to be
        harvested.

        EXPECTED RESULT:  preprocess_landing_page raises a SkipError.
        """
        html = ir.read_text(
            'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
            'landing_page.no_sharing_status.html'
        )
        landing_page = lxml.etree.HTML(html)

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='DEBUG'), \
                self.assertRaises(SkipError):
            harvester.preprocess_landing_page(landing_page)
Esempio n. 9
0
    def test_retrieve_record(self):
        """
        SCENARIO:  We have a URL for a landing page for a PUBLISHED document.

        EXPECTED RESULT:  The series identifier is retrieved.  The lastMod
        time is None because this is only retrieved in schema.org.
        """
        url = ('https://www.hydroshare.org'
               '/resource/81e947faccf04de59392dddaac77bc75/')

        # External I/O, mocked below in this order:
        #
        # 1st:  landing page
        # 2nd:  zip archive containing data and metadata
        package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        landing_page = ir.read_text(package, 'landing_page.html')

        # Build the zip archive in memory.  The context manager closes the
        # archive even if writestr raises.
        metadata = ir.read_binary(package + '.data', 'resourcemetadata.xml')
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode='w') as zf:
            zf.writestr(
                '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                metadata
            )
        zip_archive = buffer.getvalue()

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='INFO'):
            with aioresponses() as m:
                m.get(self.regex, body=landing_page)
                m.get(self.regex, body=zip_archive)

                # Only the series ID and lastMod time are checked here.
                sid, _pid, lastmod, _doc = asyncio.run(
                    harvester.retrieve_record(url)
                )

        self.assertEqual(sid, '10.4211/hs.81e947faccf04de59392dddaac77bc75')
        self.assertIsNone(lastmod)
Esempio n. 10
0
    def test_new_run(self, mock_harvest_time, mock_check_if_identifier_exists,
                     mock_update_science_metadata, mock_load_science_metadata):
        """
        SCENARIO:  We have a valid sitemap for one valid document, which is a
        document that has not been seen before.

        EXPECTED RESULT:  The document is loaded, not updated.  Verify that the
        sid is a DOI and that the pid is the MD5SUM of the zip archive.

        The four mock arguments are presumably injected by patch decorators
        that sit outside this block.
        """

        mock_harvest_time.return_value = '1900-01-01T00:00:00Z'
        mock_check_if_identifier_exists.return_value = {'outcome': 'no'}
        mock_load_science_metadata.return_value = True
        mock_update_science_metadata.return_value = True

        harvester = CUAHSIHarvester(host=self.host, port=self.port)

        record_pkg = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'

        # Build the zip archive in memory.  The context manager closes the
        # archive even if writestr raises.
        metadata = ir.read_binary(record_pkg + '.data',
                                  'resourcemetadata.xml')
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode='w') as zf:
            zf.writestr(
                '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                metadata
            )
        zip_archive_content = buffer.getvalue()

        # One (url pattern, body, status, headers) tuple per external call,
        # in the order the calls are mocked.
        hydroshare = re.compile('https://www.hydroshare.org/')
        responses = [
            # 1) sitemap
            (
                hydroshare,
                ir.read_binary('tests.data.cuahsi', 'sitemap1.xml'),
                200,
                {'Content-Type': 'application/xml'},
            ),
            # 2) Remote HTML document for record 1
            (
                hydroshare,
                ir.read_binary(record_pkg, 'landing_page.html'),
                200,
                {'Content-Type': 'application/html'},
            ),
            # 3) Remote Zip archive (contains metadata)
            (
                hydroshare,
                zip_archive_content,
                200,
                {'Content-Type': 'application/zip'},
            ),
        ]

        with aioresponses() as m:
            for pattern, body, status, response_headers in responses:
                m.get(pattern, body=body, status=status,
                      headers=response_headers)

            with self.assertLogs(logger=harvester.logger, level='DEBUG'):
                asyncio.run(harvester.run())

        self.assertEqual(mock_load_science_metadata.call_count, 1)
        self.assertEqual(mock_update_science_metadata.call_count, 0)

        # The identifiers are read from the keyword arguments of the single
        # load call.
        _, kwargs = mock_load_science_metadata.call_args_list[0]
        system_metadata = kwargs['system_metadata']

        # Verify the PID
        self.assertEqual(system_metadata.identifier.value(),
                         hashlib.md5(zip_archive_content).hexdigest())

        # Verify the SID
        self.assertEqual(system_metadata.seriesId.value(),
                         '10.4211/hs.81e947faccf04de59392dddaac77bc75')
Esempio n. 11
0
    def test_update_run(self, mock_harvest_time,
                        mock_check_if_identifier_exists,
                        mock_update_science_metadata,
                        mock_load_science_metadata):
        """
        SCENARIO:  We have a valid sitemap for one valid document, which is to
        be updated.

        EXPECTED RESULT:  The document is updated, not loaded for the first
        time.  Verify that the sid is a DOI and that the pid is the MD5SUM of
        the zip archive.

        The four mock arguments are presumably injected by patch decorators
        that sit outside this block.
        """

        # The existing record is dated one day before the harvest time.
        record_date = dt.datetime(1949, 1, 1, tzinfo=dt.timezone.utc)
        mock_harvest_time.return_value = record_date.strftime(DATETIME_FORMAT)
        mock_check_if_identifier_exists.return_value = {
            'outcome': 'yes',
            'record_date': record_date - dt.timedelta(days=1),
            'current_version_id': 1,
        }
        mock_update_science_metadata.return_value = True
        mock_load_science_metadata.return_value = True

        harvester = CUAHSIHarvester(host=self.host, port=self.port)

        record_pkg = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        data_pkg = record_pkg + '.data'

        # Build the zip archive in memory.  The context manager closes the
        # archive even if writestr raises.
        metadata = ir.read_binary(data_pkg, 'resourcemetadata.xml')
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode='w') as zf:
            zf.writestr(
                '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                metadata
            )
        zip_archive_content = buffer.getvalue()

        # One (url pattern, body, status, headers) tuple per external call,
        # in the order the calls are mocked.
        hydroshare = re.compile('https://www.hydroshare.org/')
        member_node = re.compile('https://cuahsi.mn.org:443/mn/v2/')
        responses = [
            # 1) sitemap
            (
                hydroshare,
                ir.read_binary('tests.data.cuahsi', 'sitemap1.xml'),
                200,
                {'Content-Type': 'application/xml'},
            ),
            # 2) Remote HTML document for record 1
            (
                hydroshare,
                ir.read_binary(record_pkg, 'landing_page.html'),
                200,
                {'Content-Type': 'application/html'},
            ),
            # 3) Remote zip archive for record 1 (contains the metadata)
            (
                hydroshare,
                zip_archive_content,
                200,
                {'Content-Type': 'application/zip'},
            ),
            # 4) Existing XML document for record 1 (retrieved from the
            #    member node)
            (
                member_node,
                ir.read_binary(data_pkg, 'resourcemetadata.previous.xml'),
                200,
                {'Content-Type': 'application/xml'},
            ),
        ]

        with aioresponses() as m:
            for pattern, body, status, response_headers in responses:
                m.get(pattern, body=body, status=status,
                      headers=response_headers)

            with self.assertLogs(logger=harvester.logger, level='DEBUG'):
                asyncio.run(harvester.run())

        self.assertEqual(mock_load_science_metadata.call_count, 0)
        self.assertEqual(mock_update_science_metadata.call_count, 1)

        # The identifiers are read from the keyword arguments of the single
        # update call.
        _, kwargs = mock_update_science_metadata.call_args_list[0]
        system_metadata = kwargs['system_metadata']

        # Verify the PID
        self.assertEqual(system_metadata.identifier.value(),
                         hashlib.md5(zip_archive_content).hexdigest())

        # Verify the SID
        self.assertEqual(system_metadata.seriesId.value(),
                         '10.4211/hs.81e947faccf04de59392dddaac77bc75')
Esempio n. 12
0
    def test__harvest__landing_page_errors__succeeds_on_retry(
            self, harvest_time_mocker, mock_check_if_identifier_exists,
            mock_load_science_metadata):
        """
        SCENARIO:  We have a valid sitemap but the landing page errors out.
        A retry is specified, and it succeeds the next time.

        EXPECTED RESULT:  The document is successfully harvested.  The log
        still records the failed first attempt ("Bad Request" and
        "ClientResponseError").

        The three mock arguments are presumably injected by patch decorators
        that sit outside this block.
        """

        harvest_time_mocker.return_value = '1900-01-01T00:00:00Z'
        mock_check_if_identifier_exists.return_value = {'outcome': 'no'}
        mock_load_science_metadata.return_value = True

        host, port = 'www.hydroshare.org', 443
        harvester = CUAHSIHarvester(host=host, port=port, retry=1)

        record_pkg = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        landing_page = ir.read_binary(record_pkg, 'landing_page.html')

        # Build the zip archive in memory.  The context manager closes the
        # archive even if writestr raises.
        metadata = ir.read_binary(record_pkg + '.data',
                                  'resourcemetadata.xml')
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode='w') as zf:
            zf.writestr(
                '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                metadata
            )
        zip_archive = buffer.getvalue()

        # One (body, status, headers) tuple per external call, in the order
        # the calls are mocked.
        responses = [
            # 1) sitemap
            (
                ir.read_binary(
                    'tests.data.cuahsi',
                    'sitemap.81e947faccf04de59392dddaac77bc75.xml'
                ),
                200,
                {'Content-Type': 'application/xml'},
            ),
            # 2) HTML document for record 1 -- answered with a 400
            (landing_page, 400, {'Content-Type': 'application/html'}),
            # 3) HTML document for record 1, 2nd try
            (landing_page, 200, {'Content-Type': 'application/html'}),
            # 4) ZIP archive for record 1
            (zip_archive, 200, {'Content-Type': 'application/zip'}),
        ]

        with aioresponses() as m:
            for body, status, response_headers in responses:
                m.get(self.regex,
                      body=body,
                      status=status,
                      headers=response_headers)

            with self.assertLogs(logger=harvester.logger, level='DEBUG') as cm:
                asyncio.run(harvester.run())

                self.assertInfoLogMessage(
                    cm.output, "Successfully processed 1 records."
                )
                self.assertErrorLogMessage(cm.output, "Bad Request")
                self.assertErrorLogMessage(cm.output, "ClientResponseError")