Example #1
0
    def test_check_all_urls(self):
        """check_all_urls() should submit one check per dataset URI to
        the executor, whose number of workers depends on whether the
        provider is throttled
        """
        lock = mock.Mock()
        # (provider configuration, expected number of workers)
        scenarios = (
            ({'url': 'https://foo/'}, 50),  # no throttle
            ({'url': 'https://foo/', 'throttle': 1}, 1),  # with throttle
        )
        with mock.patch('geospaas_harvesting.verify_urls.Lock', return_value=lock), \
                mock.patch(
                    'geospaas_harvesting.verify_urls.BoundedThreadPoolExecutor') as pool_cls, \
                mock.patch('geospaas_harvesting.verify_urls.DatasetURI.objects') as manager, \
                mock.patch('concurrent.futures.as_completed'), \
                mock.patch('geospaas_harvesting.verify_urls.HTTPProvider'
                           '.check_and_write_stale_url') as check_and_write:
            executor = pool_cls.return_value.__enter__.return_value
            dataset_uri = mock.Mock()
            manager.filter.return_value.iterator.return_value = [dataset_uri]

            for provider_config, expected_workers in scenarios:
                provider = verify_urls.HTTPProvider('test', provider_config)
                with self.assertLogs(verify_urls.logger, level=logging.INFO):
                    provider.check_all_urls('output.txt')

                executor.submit.assert_called_once_with(
                    check_and_write, lock, 'output.txt', dataset_uri)
                pool_cls.assert_called_once_with(
                    max_workers=expected_workers, queue_limit=2000)

                # forget the recorded calls (the executor is a child of
                # the pool mock, so it is reset too)
                pool_cls.reset_mock()
Example #2
0
    def test_read_config(self):
        """read_config() should build one provider object per entry of
        the YAML configuration file, resolving the !ENV tags from the
        environment
        """
        yaml_config = textwrap.dedent('''---
        podaac:
          url: 'https://opendap.jpl.nasa.gov/opendap/'
        scihub:
          url: 'https://scihub.copernicus.eu/'
          username: !ENV 'COPERNICUS_OPEN_HUB_USERNAME'
          password: !ENV 'COPERNICUS_OPEN_HUB_PASSWORD'
        creodias:
          url: 'https://zipper.creodias.eu/'
          username: !ENV 'CREODIAS_USERNAME'
          password: !ENV 'CREODIAS_PASSWORD'
          token_url: 'https://auth.creodias.eu/auth/realms/DIAS/protocol/openid-connect/token'
          client_id: 'CLOUDFERRO_PUBLIC'
          throttle: 1
          auth_renew: 36000
        rtofs:
            url: 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/rtofs/prod/'
          ''')
        fake_environment = {
            'COPERNICUS_OPEN_HUB_USERNAME': '******',
            'COPERNICUS_OPEN_HUB_PASSWORD': '******',
            'CREODIAS_USERNAME': '******',
            'CREODIAS_PASSWORD': '******',
        }
        # the file contents and the environment are both faked, so
        # nothing is read from the disk or from the real environment
        fake_open = mock.mock_open(read_data=yaml_config)
        with mock.patch('geospaas_harvesting.verify_urls.open', fake_open), \
                mock.patch('os.environ', fake_environment):
            providers = verify_urls.read_config('foo.yml')

        expected_providers = [
            verify_urls.HTTPProvider('podaac', {
                'url': 'https://opendap.jpl.nasa.gov/opendap/',
            }),
            verify_urls.HTTPProvider('scihub', {
                'url': 'https://scihub.copernicus.eu/',
                'username': '******',
                'password': '******'
            }),
            verify_urls.HTTPProvider('creodias', {
                'url': 'https://zipper.creodias.eu/',
                'username': '******',
                'password': '******',
                'token_url': 'https://auth.creodias.eu/auth/realms/DIAS/protocol/'
                             'openid-connect/token',
                'client_id': 'CLOUDFERRO_PUBLIC',
                'throttle': 1,
                'auth_renew': 36000
            }),
            verify_urls.FTPProvider('rtofs', {
                'url': 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/rtofs/prod/'
            })
        ]
        self.assertListEqual(providers, expected_providers)
Example #3
0
    def test_check_providers(self):
        """check_providers() should submit one URL check per provider
        to the process pool executor and return True. If retrieving
        the result of a submitted check raises an exception, it should
        return False and log an error
        """
        scihub = verify_urls.HTTPProvider('scihub', {
            'url': 'https://scihub.copernicus.eu/',
            'username': '******',
            'password': '******',
            'throttle': 0
        })
        podaac = verify_urls.HTTPProvider('podaac', {
            'url': 'https://opendap.jpl.nasa.gov/opendap/',
            'username': '******',
            'password': '******',
            'throttle': 0
        })
        rtofs = verify_urls.FTPProvider('rtofs', {
            'url': 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/rtofs/prod/'
        })
        providers = [scihub, podaac, rtofs]

        with mock.patch('concurrent.futures.ProcessPoolExecutor') as mock_pool, \
                mock.patch('geospaas_harvesting.verify_urls.datetime') as mock_datetime, \
                mock.patch('geospaas_harvesting.verify_urls.'
                           'HTTPProvider.check_all_urls') as mock_http_check, \
                mock.patch('geospaas_harvesting.verify_urls.'
                           'FTPProvider.check_all_urls') as mock_ftp_check, \
                mock.patch('concurrent.futures.as_completed', iter):
            submit = mock_pool.return_value.__enter__.return_value.submit
            # freeze the timestamp used in the output file names
            mock_datetime.now.return_value.strftime.return_value = 'time'

            # nominal case
            self.assertTrue(verify_urls.check_providers('foo', providers))
            expected_submissions = [
                mock.call(check, os.path.join('foo', f'{name}_stale_urls_time.txt'))
                for check, name in (
                    (mock_http_check, 'scihub'),
                    (mock_http_check, 'podaac'),
                    (mock_ftp_check, 'rtofs'),
                )
            ]
            submit.assert_has_calls(expected_submissions, any_order=True)
            self.assertEqual(submit.call_count, 3)

            # error case: getting the result of a future fails
            submit.return_value.result.side_effect = AttributeError
            with self.assertLogs(verify_urls.logger, level=logging.ERROR):
                self.assertFalse(verify_urls.check_providers('foo', providers))
Example #4
0
 def test_auth_basic(self):
     """The auth property should return an HTTP basic authentication
     object built from the username and password found in the
     provider configuration
     """
     # the configured credentials must be the ones used to build the
     # expected object, otherwise the comparison can never succeed
     provider = verify_urls.HTTPProvider('test', {'username': 'user', 'password': 'pass'})
     self.assertEqual(
         provider.auth,
         requests.auth.HTTPBasicAuth('user', 'pass'))
Example #5
0
 def test_check_url_200(self):
     """check_url() should report the URL as present when the
     request is answered with a 200 status code
     """
     dataset_uri = mock.Mock(id=1, uri='https://foo')
     ok_response = mock.MagicMock(status_code=200, headers={})
     provider = verify_urls.HTTPProvider('test', {})
     with mock.patch('geospaas_harvesting.utils.http_request', return_value=ok_response):
         result = provider.check_url(dataset_uri)
         self.assertEqual(result, verify_urls.PRESENT)
Example #6
0
 def test_check_url_http_error(self):
     """check_url() should send a single request and return
     'http_<error_code>' when the response has an error status code
     (503 here)
     """
     dataset_uri = mock.Mock(id=1, uri='https://foo')
     error_response = mock.MagicMock(status_code=503, headers={})
     provider = verify_urls.HTTPProvider('test', {})
     with mock.patch('geospaas_harvesting.utils.http_request',
                     return_value=error_response) as mock_request:
         result = provider.check_url(dataset_uri)
         self.assertEqual(result, 'http_503')
         mock_request.assert_called_once()
Example #7
0
 def test_check_and_write_stale_url_valid(self):
     """The output file should not even be opened when the URL is
     found to be present
     """
     lock = mock.MagicMock()
     dataset_uri = mock.Mock()
     provider = verify_urls.HTTPProvider('test', {})
     with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url',
                     return_value=verify_urls.PRESENT), \
             mock.patch('geospaas_harvesting.verify_urls.open') as mock_open:
         provider.check_and_write_stale_url(lock, 'output.txt', dataset_uri)
         mock_open.assert_not_called()
Example #8
0
 def test_check_all_urls_thread_error(self):
     """An exception raised while checking a URL should be raised by
     check_all_urls()
     """
     provider = verify_urls.HTTPProvider('test', {'url': 'https://foo'})
     with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider'
                     '.check_and_write_stale_url', side_effect=ValueError), \
             mock.patch('geospaas_harvesting.verify_urls.DatasetURI.objects') as manager:
         # a single URI to check, whose check fails
         manager.filter.return_value.iterator.return_value = [mock.Mock()]
         with self.assertRaises(ValueError):
             with self.assertLogs(verify_urls.logger, level=logging.INFO):
                 provider.check_all_urls('out.txt')
Example #9
0
    def test_delete_stale_urls(self):
        """Only the URLs whose check returns ABSENT should be deleted,
        unless the force option is used, in which case all the URLs
        listed in the file are deleted. URIs which are not found in
        the database are skipped with a warning
        """
        provider = verify_urls.HTTPProvider('test', {
            'url': 'https://foo',
            'username': '******',
            'password': '******',
            'auth_renew': -1
        })
        stale_urls_file = (
            f'{verify_urls.ABSENT} 12 https://foo/bar\nhttp_500 13 https://foo/baz')
        check_results = (verify_urls.ABSENT, 'http_500')
        uris_by_id = {12: 'https://foo/bar', 13: 'https://foo/baz'}

        def fake_filter(id):
            # the parameter name is kept as in the original stub in
            # case filter() receives it as a keyword argument
            return [mock.Mock(uri=uris_by_id.get(id))]

        manager = mock.Mock()
        manager.filter.side_effect = fake_filter

        # (force, expected return value, URIs expected to be removed)
        scenarios = (
            # without force, only the absent URL must be deleted
            (False, (1, 1), ['https://foo/bar']),
            # with force, both URLs must be deleted
            (True, (2, 2), ['https://foo/bar', 'https://foo/baz']),
        )

        with mock.patch('geospaas_harvesting.verify_urls.find_provider', return_value=provider), \
             mock.patch('geospaas_harvesting.verify_urls.DatasetURI.objects', manager):

            for force, expected_result, expected_removed in scenarios:
                with mock.patch('geospaas_harvesting.verify_urls.open',
                                return_value=io.StringIO(stale_urls_file)), \
                        mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url',
                                   side_effect=check_results), \
                        mock.patch('geospaas_harvesting.verify_urls.remove_dataset_uri',
                                   return_value=(True, True)) as mock_remove:
                    self.assertEqual(
                        verify_urls.delete_stale_urls('', {}, force=force),
                        expected_result)
                    removed_uris = [
                        remove_call[0][0].uri for remove_call in mock_remove.call_args_list]
                    self.assertListEqual(removed_uris, expected_removed)

            # the URIs listed in the file do not exist in the database
            with mock.patch('geospaas_harvesting.verify_urls.open',
                            return_value=io.StringIO(stale_urls_file)):
                manager.filter.side_effect = None
                manager.filter.return_value = []
                with self.assertLogs(verify_urls.logger, level=logging.WARNING):
                    self.assertEqual(verify_urls.delete_stale_urls('', {}, force=False), (0, 0))
Example #10
0
    def test_find_provider(self):
        """find_provider() should return the provider matching the
        given file name, or None if there is no match
        """
        scihub = verify_urls.HTTPProvider('scihub', {
            'url': 'https://scihub.copernicus.eu/',
            'username': '******',
            'password': '******',
            'throttle': 0
        })
        podaac = verify_urls.HTTPProvider('podaac', {
            'url': 'https://opendap.jpl.nasa.gov/opendap/',
            'username': '******',
            'password': '******',
            'throttle': 0
        })
        providers = [scihub, podaac]

        # no provider matches this file name
        self.assertIsNone(verify_urls.find_provider('foo.txt', providers))

        matches = (
            ('scihub_stale_urls_2021-05-25T10:22:27.txt', scihub),
            ('podaac_stale_urls_2021-05-25T10:22:28.txt', podaac),
        )
        for file_name, expected_provider in matches:
            self.assertEqual(
                verify_urls.find_provider(file_name, providers),
                expected_provider)
Example #11
0
    def test_check_url_connection_error_retry(self):
        """check_url() should try again after a ConnectionError,
        waiting between the attempts
        """
        dataset_uri = mock.Mock(id=1, uri='https://foo')
        # two failures followed by a successful response
        request_outcomes = (
            requests.exceptions.ConnectionError,
            requests.exceptions.ConnectionError,
            mock.MagicMock(status_code=200, headers={})
        )
        provider = verify_urls.HTTPProvider('test', {})
        with mock.patch('geospaas_harvesting.utils.http_request',
                        side_effect=request_outcomes), \
             mock.patch('time.sleep') as mock_sleep:
            with self.assertLogs(verify_urls.logger, level=logging.ERROR):
                provider.check_url(dataset_uri, tries=5)

        expected_sleeps = [mock.call(delay) for delay in (5, 5, 0)]
        self.assertListEqual(mock_sleep.call_args_list, expected_sleeps)
Example #12
0
    def test_check_url_429_too_many_retries(self):
        """TooManyRequests should be raised when a 429 response is
        received and no tries are left
        """
        dataset_uri = mock.Mock(id=1, uri='https://foo')
        # the second response must never be requested
        responses = [
            mock.MagicMock(status_code=status, headers={}) for status in (429, 200)]
        provider = verify_urls.HTTPProvider('test', {})
        with mock.patch('geospaas_harvesting.utils.http_request',
                        side_effect=responses) as mock_request:

            with self.assertRaises(verify_urls.TooManyRequests):
                provider.check_url(dataset_uri, tries=1)
            mock_request.assert_called_once()
Example #13
0
 def test_check_and_write_stale_url_invalid(self):
     """A line containing the check result, the ID and the URI should
     be written to the output file when the URL is absent
     """
     dataset_uri = mock.Mock(id=1, uri='https://foo')
     output_file = mock.MagicMock()
     provider = verify_urls.HTTPProvider('test', {})
     with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url',
                     return_value=verify_urls.ABSENT), \
             mock.patch('geospaas_harvesting.verify_urls.open') as mock_open:
         # object obtained when the opened file is used in a `with`
         mock_open.return_value.__enter__.return_value = output_file
         provider.check_and_write_stale_url(mock.MagicMock(), 'output.txt', dataset_uri)
         output_file.write.assert_called_once_with(
             f"{verify_urls.ABSENT} 1 https://foo{os.linesep}")
Example #14
0
    def test_auth_oauth2(self):
        """The auth property should return the object built by
        build_oauth2() from the username, password, token URL and
        client ID found in the provider configuration
        """
        # the configured credentials must be the ones expected in the
        # build_oauth2() call, otherwise the assertion can never pass
        provider = verify_urls.HTTPProvider('test', {
            'username': 'user',
            'password': 'pass',
            'token_url': 'https://foo',
            'client_id': 'CLIENT'
        })

        mock_oauth2 = mock.Mock()
        with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.build_oauth2',
                        return_value=mock_oauth2) as mock_build_oauth2:
            self.assertEqual(
                provider.auth,
                mock_oauth2)
            mock_build_oauth2.assert_called_once_with('user', 'pass', 'https://foo', 'CLIENT')
Example #15
0
    def test_check_url_connection_error_too_many_retries(self):
        """check_url() should try again after a ConnectionError and
        let the exception through once the maximum number of tries is
        reached
        """
        dataset_uri = mock.Mock(id=1, uri='https://foo')
        # every attempt fails
        failures = (
            requests.exceptions.ConnectionError,
            requests.exceptions.ConnectionError,
        )
        provider = verify_urls.HTTPProvider('test', {})
        with mock.patch('geospaas_harvesting.utils.http_request', side_effect=failures), \
                mock.patch('time.sleep') as mock_sleep:
            with self.assertLogs(verify_urls.logger, level=logging.ERROR):
                with self.assertRaises(requests.exceptions.ConnectionError):
                    provider.check_url(dataset_uri, tries=2)

        self.assertListEqual(mock_sleep.call_args_list, [mock.call(5)])
Example #16
0
    def test_auth_renew(self):
        """The authentication object should be rebuilt only once the
        renewal delay has elapsed
        """
        provider = verify_urls.HTTPProvider('test', {
            'username': '******',
            'password': '******',
            'token_url': 'token',
            'client_id': 'ID',
            'auth_renew': 1
        })

        expected_auths = (
            'auth1',  # first access: first value from build_oauth2()
            'auth2',  # one second later: second value from build_oauth2()
            'auth2',  # less than one second later: the value is kept
        )
        with mock.patch('time.monotonic', side_effect=(1, 2, 2.1)), \
             mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.build_oauth2',
                        side_effect=('auth1', 'auth2', 'auth3')):
            for expected_auth in expected_auths:
                self.assertEqual(provider.auth, expected_auth)
Example #17
0
    def test_check_url_429_no_header(self):
        """When a 429 response without headers is received, the URL
        should be retried after a delay
        """
        dataset_uri = mock.Mock(id=1, uri='https://foo')
        # a 429 response, then the actual answer
        responses = [
            mock.MagicMock(status_code=status, headers={}) for status in (429, 404)]
        provider = verify_urls.HTTPProvider('test', {})
        with mock.patch('geospaas_harvesting.utils.http_request',
                        side_effect=responses) as mock_request, \
                mock.patch('time.sleep') as mock_sleep:

            with self.assertLogs(verify_urls.logger, level=logging.WARNING):
                result = provider.check_url(dataset_uri)
                self.assertEqual(result, verify_urls.ABSENT)

            self.assertEqual(mock_request.call_count, 2)
            expected_sleeps = [mock.call(60), mock.call(0)]
            self.assertListEqual(mock_sleep.call_args_list, expected_sleeps)
Example #18
0
 def test_instantiation(self):
     """The name and configuration given to the constructor should
     be stored as attributes, and _auth_start should start as None
     """
     provider = verify_urls.HTTPProvider('test', {'foo': 'bar'})
     self.assertEqual(provider.name, 'test')
     self.assertEqual(provider.config, {'foo': 'bar'})
     # identity check: the attribute must be None itself, not just
     # something which compares equal to it
     self.assertIsNone(provider._auth_start)
Example #19
0
 def test_auth_no_auth(self):
     """The auth property should be None for a provider whose
     configuration is empty
     """
     unauthenticated_provider = verify_urls.HTTPProvider('test', {})
     auth = unauthenticated_provider.auth
     self.assertIsNone(auth)