Exemple #1
0
    def test_hostname_with_leading_protocol(self):
        """
        Test hostnames that carry a leading protocol.

        An ``https://`` prefix (in either letter case used below) must be kept
        on ``hostname`` exactly as given while corpus/adapter path conversion
        still round-trips. An ``http://`` prefix and a malformed hostname must
        both be rejected with ``ValueError``.
        """
        adapter_path = 'https://storageaccount.dfs.core.windows.net/root-without-slash/a/1.csv'

        host_1 = 'https://storageaccount.dfs.core.windows.net'
        adls_adapter_1 = ADLSAdapter(hostname=host_1, root='root-without-slash', shared_key='')
        corpus_path_1 = adls_adapter_1.create_corpus_path(adapter_path)
        self.assertEqual(adls_adapter_1.hostname, 'https://storageaccount.dfs.core.windows.net')
        self.assertEqual(adls_adapter_1.root, '/root-without-slash')
        self.assertEqual(corpus_path_1, '/a/1.csv')
        self.assertEqual(adls_adapter_1.create_adapter_path(corpus_path_1), adapter_path)

        # Same host with a capitalised scheme: the hostname is stored as given,
        # but the generated adapter path is expected to use lowercase "https".
        host_2 = 'Https://storageaccount.dfs.core.windows.net'
        adls_adapter_2 = ADLSAdapter(hostname=host_2, root='root-without-slash', shared_key='')
        corpus_path_2 = adls_adapter_2.create_corpus_path(adapter_path)
        self.assertEqual(adls_adapter_2.hostname, 'Https://storageaccount.dfs.core.windows.net')
        self.assertEqual(adls_adapter_2.root, '/root-without-slash')
        self.assertEqual(corpus_path_2, '/a/1.csv')
        self.assertEqual(adls_adapter_2.create_adapter_path(corpus_path_2), adapter_path)

        # assertRaises is used instead of try/self.fail/except Exception: the
        # AssertionError raised by self.fail would itself be caught by a broad
        # except clause and reported as a confusing isinstance failure.
        with self.assertRaises(ValueError, msg='Expected Exception for using a http:// hostname.'):
            ADLSAdapter(hostname='http://storageaccount.dfs.core.windows.net',
                        root='root-without-slash',
                        shared_key='')

        with self.assertRaises(ValueError, msg='Expected Exception for using an invalid hostname.'):
            ADLSAdapter(hostname='https://bar:baz::]/foo/',
                        root='root-without-slash',
                        shared_key='')
Exemple #2
0
 def create_dummy_adapter(self):
     """Return an ADLSAdapter built from placeholder settings with number_of_retries set to 0."""
     dummy_settings = {
         'root': '/fs',
         'hostname': 'dummy.dfs.core.windows.net',
         'tenant': 'dummyTenant',
         'resource': 'dummyResource',
         'client_id': 'dummyClientId',
         'secret': 'dummySecret',
     }
     dummy_adapter = ADLSAdapter(**dummy_settings)
     dummy_adapter.number_of_retries = 0
     return dummy_adapter
Exemple #3
0
    def test_endpoint_missing_on_config(self):
        """Verify the adapter endpoint is the default when the config has no "endpoint" entry.

        Old config files may not include an "endpoint", so loading such a config
        must leave the adapter on AzureCloudEndpoint.AZURE_PUBLIC.
        """
        serialized_config = json.dumps({
            'hostname': 'hostname.dfs.core.windows.net',
            'root': 'root',
            'tenant': 'tenant',
            'clientId': 'clientId'
        })

        adapter = ADLSAdapter()
        adapter.update_config(serialized_config)

        self.assertEqual(AzureCloudEndpoint.AZURE_PUBLIC, adapter.endpoint)
Exemple #4
0
    def test_formatted_hostname_from_config(self):
        """Verify that an adapter configured via update_config can map an adapter path to a corpus path."""
        serialized_config = json.dumps({
            'hostname': 'hostname.dfs.core.windows.net',
            'root': 'root',
            'tenant': 'tenant',
            'clientId': 'clientId'
        })

        adapter = ADLSAdapter()
        adapter.update_config(serialized_config)

        # The URL uses the configured hostname and root, so only the remainder
        # of the path is expected back.
        adapter_path = 'https://hostname.dfs.core.windows.net/root/partitions/data.csv'
        self.assertEqual('/partitions/data.csv', adapter.create_corpus_path(adapter_path))
Exemple #5
0
 def setUp(self):
     """Create the adapter under test from placeholder settings and set number_of_retries to 0."""
     dummy_settings = {
         'root': '/fs',
         'hostname': 'dummy.dfs.core.windows.net',
         'tenant': 'dummyTenant',
         'resource': 'dummyResource',
         'client_id': 'dummyClientId',
         'secret': 'dummySecret',
     }
     self.adapter = ADLSAdapter(**dummy_settings)
     self.adapter.number_of_retries = 0
Exemple #6
0
    def create_adapter_with_shared_key(self, root_relative_path: str = None):
        """Create an ADLSAdapter that uses a shared key read from the environment.

        Reads ADLS_HOSTNAME, ADLS_ROOTPATH and ADLS_SHAREDKEY and asserts that
        StringUtils.is_null_or_white_space is false for each of them.

        :param root_relative_path: combined with ADLS_ROOTPATH through
            self.combine_path to form the adapter root.
        :return: the constructed ADLSAdapter.
        """
        env = os.environ

        hostname = env.get("ADLS_HOSTNAME")
        self.assertFalse(StringUtils.is_null_or_white_space(hostname), "ADLS_HOSTNAME environment variable not set up")

        root_path = env.get("ADLS_ROOTPATH")
        self.assertFalse(StringUtils.is_null_or_white_space(root_path), "ADLS_ROOTPATH environment variable not set up")

        shared_key = env.get("ADLS_SHAREDKEY")
        self.assertFalse(StringUtils.is_null_or_white_space(shared_key), "ADLS_SHAREDKEY environment variable not set up")

        adapter_root = self.combine_path(root_path, root_relative_path)
        return ADLSAdapter(hostname=hostname, root=adapter_root, shared_key=shared_key)
Exemple #7
0
    def create_adapter_with_client_id(self, root_relative_path: str = None):
        """Create an ADLSAdapter that uses tenant/client credentials read from the environment.

        Reads ADLS_HOSTNAME, ADLS_ROOTPATH, ADLS_TENANT, ADLS_CLIENTID and
        ADLS_CLIENTSECRET and asserts that StringUtils.is_null_or_white_space is
        false for each of them.

        :param root_relative_path: combined with ADLS_ROOTPATH through
            self.combine_path to form the adapter root.
        :return: the constructed ADLSAdapter.
        """
        env = os.environ

        hostname = env.get("ADLS_HOSTNAME")
        self.assertFalse(StringUtils.is_null_or_white_space(hostname), "ADLS_HOSTNAME environment variable not set up")

        root_path = env.get("ADLS_ROOTPATH")
        self.assertFalse(StringUtils.is_null_or_white_space(root_path), "ADLS_ROOTPATH environment variable not set up")

        tenant = env.get("ADLS_TENANT")
        self.assertFalse(StringUtils.is_null_or_white_space(tenant), "ADLS_TENANT environment variable not set up")

        client_id = env.get("ADLS_CLIENTID")
        self.assertFalse(StringUtils.is_null_or_white_space(client_id), "ADLS_CLIENTID environment variable not set up")

        client_secret = env.get("ADLS_CLIENTSECRET")
        self.assertFalse(StringUtils.is_null_or_white_space(client_secret), "ADLS_CLIENTSECRET environment variable not set up")

        adapter_root = self.combine_path(root_path, root_relative_path)
        return ADLSAdapter(hostname=hostname,
                           root=adapter_root,
                           tenant=tenant,
                           client_id=client_id,
                           secret=client_secret)
Exemple #8
0
    def test_config_and_update_config_without_secret(self):
        """
        The secret property is not saved to the config.json file for security reasons.
        When constructing an ADLS adapter from config, the user should be able to set
        the secret (or shared key) after the adapter is constructed.
        """
        adapter = ADLSAdapter()
        config_without_secret = {
            'root': 'root',
            'hostname': 'hostname',
            'tenant': 'tenant',
            'clientId': 'clientId',
        }

        try:
            adapter.update_config(json.dumps(config_without_secret))
            # Credentials are assigned only after the config has been applied.
            adapter.secret = 'secret'
            adapter.shared_key = 'sharedKey'
        except Exception:
            self.fail('adls_adapter initialized without secret shouldn\'t throw exception when updating config.')
Exemple #9
0
    def test_initialize_hostname_and_root(self):
        """
        Test how the ADLS adapter initializes its hostname and root.

        The assertions expect the root to be exposed with a leading slash and
        without a trailing slash, both for adapters constructed directly and
        for adapters mounted from a config file.
        """
        hostname = 'storageaccount.dfs.core.windows.net'

        # Root without surrounding slashes, single segment.
        flat_adapter = ADLSAdapter(hostname=hostname, root='root-without-slash', shared_key='')
        self.assertEqual(flat_adapter.hostname, 'storageaccount.dfs.core.windows.net')
        self.assertEqual(flat_adapter.root, '/root-without-slash')

        flat_adapter_path = 'https://storageaccount.dfs.core.windows.net/root-without-slash/a/1.csv'
        flat_corpus_path = flat_adapter.create_corpus_path(flat_adapter_path)
        self.assertEqual(flat_corpus_path, '/a/1.csv')
        self.assertEqual(flat_adapter.create_adapter_path(flat_corpus_path), flat_adapter_path)

        # Root without surrounding slashes, nested folders.
        nested_adapter = ADLSAdapter(hostname=hostname, root='root-without-slash/folder1/folder2', shared_key='')
        self.assertEqual(nested_adapter.root, '/root-without-slash/folder1/folder2')

        nested_adapter_path = 'https://storageaccount.dfs.core.windows.net/root-without-slash/folder1/folder2/a/1.csv'
        nested_corpus_path = nested_adapter.create_corpus_path(nested_adapter_path)
        self.assertEqual(nested_corpus_path, '/a/1.csv')
        self.assertEqual(nested_adapter.create_adapter_path(nested_corpus_path), nested_adapter_path)

        # Remaining slash placements: (root passed to the constructor, expected root).
        # NOTE(review): the last entry has no trailing slash, unlike its
        # single-segment counterpart - confirm whether that is intentional.
        slash_cases = (
            ('/root-starts-with-slash', '/root-starts-with-slash'),
            ('/root-starts-with-slash/folder1/folder2', '/root-starts-with-slash/folder1/folder2'),
            ('root-ends-with-slash/', '/root-ends-with-slash'),
            ('root-ends-with-slash/folder1/folder2/', '/root-ends-with-slash/folder1/folder2'),
            ('/root-with-slashes/', '/root-with-slashes'),
            ('/root-with-slashes/folder1/folder2', '/root-with-slashes/folder1/folder2'),
        )
        for given_root, expected_root in slash_cases:
            adapter = ADLSAdapter(hostname=hostname, root=given_root, shared_key='')
            self.assertEqual(adapter.root, expected_root)

        # Mount from config
        config = TestHelper.get_input_file_content(
            self.test_subpath, 'test_initialize_hostname_and_root', 'config.json')
        corpus = CdmCorpusDefinition()
        corpus.storage.mount_from_config(config)

        mounted_roots = (
            ('adlsadapter1', '/root-without-slash'),
            ('adlsadapter2', '/root-without-slash/folder1/folder2'),
            ('adlsadapter3', '/root-starts-with-slash/folder1/folder2'),
            ('adlsadapter4', '/root-ends-with-slash/folder1/folder2'),
            ('adlsadapter5', '/root-with-slashes/folder1/folder2'),
        )
        for namespace, expected_root in mounted_roots:
            self.assertEqual(corpus.storage.fetch_adapter(namespace).root, expected_root)
Exemple #10
0
    def test_create_corpus_and_adapter_path(self):
        """
        Test conversion between adapter paths (URLs) and corpus paths.

        Covers: dfs and blob hosts with and without the :443 port, round-trips
        back to the original adapter path, corpus paths carrying a namespace,
        percent-encoding of special characters, a None corpus path, and an
        adapter whose hostname includes a non-default port.
        """
        host_1 = 'storageaccount.dfs.core.windows.net'
        root = '/fs'
        adls_adapter = ADLSAdapter(root=root,
                                   hostname=host_1,
                                   tenant='dummyTenant',
                                   client_id='dummyClientId',
                                   secret='dummySecret')

        adapter_path_1 = 'https://storageaccount.dfs.core.windows.net/fs/a/1.csv'
        adapter_path_2 = 'https://storageaccount.dfs.core.windows.net:443/fs/a/2.csv'
        adapter_path_3 = 'https://storageaccount.blob.core.windows.net/fs/a/3.csv'
        adapter_path_4 = 'https://storageaccount.blob.core.windows.net:443/fs/a/4.csv'

        # All corpus paths are created before any adapter path is rebuilt; the
        # call order of the original test is preserved on purpose.
        corpus_path_1 = adls_adapter.create_corpus_path(adapter_path_1)
        corpus_path_2 = adls_adapter.create_corpus_path(adapter_path_2)
        corpus_path_3 = adls_adapter.create_corpus_path(adapter_path_3)
        corpus_path_4 = adls_adapter.create_corpus_path(adapter_path_4)

        self.assertEqual(corpus_path_1, '/a/1.csv')
        self.assertEqual(corpus_path_2, '/a/2.csv')
        self.assertEqual(corpus_path_3, '/a/3.csv')
        self.assertEqual(corpus_path_4, '/a/4.csv')

        # Each corpus path must map back to the exact adapter path it came from.
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_1),
                         adapter_path_1)
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_2),
                         adapter_path_2)
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_3),
                         adapter_path_3)
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_4),
                         adapter_path_4)

        # Check that an adapter path is correctly created from a corpus path with any namespace
        corpus_path_with_namespace_1 = 'adls:/test.json'
        corpus_path_with_namespace_2 = 'mylake:/test.json'
        expected_adapter_path = 'https://storageaccount.dfs.core.windows.net/fs/test.json'

        self.assertEqual(
            expected_adapter_path,
            adls_adapter.create_adapter_path(corpus_path_with_namespace_1))
        self.assertEqual(
            expected_adapter_path,
            adls_adapter.create_adapter_path(corpus_path_with_namespace_2))

        # Check that an adapter path is correctly created from a corpus path with colons
        corpus_path_with_colons = 'namespace:/a/path:with:colons/some-file.json'
        self.assertEqual(
            'https://storageaccount.dfs.core.windows.net/fs/a/path%3Awith%3Acolons/some-file.json',
            adls_adapter.create_adapter_path(corpus_path_with_colons))
        # Both upper- and lower-case percent escapes must decode to the same corpus path.
        self.assertEqual(
            '/a/path:with:colons/some-file.json',
            adls_adapter.create_corpus_path(
                'https://storageaccount.dfs.core.windows.net/fs/a/path%3Awith%3Acolons/some-file.json'
            ))
        self.assertEqual(
            '/a/path:with:colons/some-file.json',
            adls_adapter.create_corpus_path(
                'https://storageaccount.dfs.core.windows.net/fs/a/path%3awith%3acolons/some-file.json'
            ))

        # Check other special characters
        self.assertEqual(
            'https://storageaccount.dfs.core.windows.net/fs/a/path%20with%3Dspecial%3Dcharacters/some-file.json',
            adls_adapter.create_adapter_path(
                'namespace:/a/path with=special=characters/some-file.json'))
        self.assertEqual(
            '/a/path with=special=characters/some-file.json',
            adls_adapter.create_corpus_path(
                'https://storageaccount.dfs.core.windows.net/fs/a/path%20with%3dspecial%3dcharacters/some-file.json'
            ))
        self.assertEqual(
            '/a/path with=special=characters/some-file.json',
            adls_adapter.create_corpus_path(
                'https://storageaccount.dfs.core.windows.net/fs/a/path%20with%3dspecial%3Dcharacters/some-file.json'
            ))

        # Check that an adapter path is null if the corpus path provided is null
        self.assertIsNone(adls_adapter.create_adapter_path(None))

        # Adapter whose hostname carries an explicit, non-default port.
        host_2 = 'storageaccount.blob.core.windows.net:8888'
        adls_adapter = ADLSAdapter(
            root=root,
            hostname=host_2,
            tenant='11111111-1111-1111-1111-111111111111',
            client_id='dummyClientId',
            secret='dummySecret')
        adapter_path_5 = 'https://storageaccount.blob.core.windows.net:8888/fs/a/5.csv'
        adapter_path_6 = 'https://storageaccount.dfs.core.windows.net:8888/fs/a/6.csv'
        adapter_path_7 = 'https://storageaccount.blob.core.windows.net/fs/a/7.csv'

        self.assertEqual(adls_adapter.create_corpus_path(adapter_path_5),
                         '/a/5.csv')
        self.assertEqual(adls_adapter.create_corpus_path(adapter_path_6),
                         '/a/6.csv')
        # A URL without the configured port must not resolve to a corpus path.
        self.assertIsNone(adls_adapter.create_corpus_path(adapter_path_7))
Exemple #11
0
    def test_create_corpus_and_adapter_path(self):
        """
        Test conversion between adapter paths (URLs) and corpus paths.

        Covers dfs and blob hosts with and without the :443 port, round-trips
        back to the original adapter path, and an adapter whose hostname
        includes a non-default port.
        """
        host_1 = 'storageaccount.dfs.core.windows.net'
        root = '/fs'
        adls_adapter = ADLSAdapter(root=root,
                                   hostname=host_1,
                                   tenant='dummyTenant',
                                   resource='dummyResource',
                                   client_id='dummyClientId',
                                   secret='dummySecret')

        adapter_path_1 = 'https://storageaccount.dfs.core.windows.net/fs/a/1.csv'
        adapter_path_2 = 'https://storageaccount.dfs.core.windows.net:443/fs/a/2.csv'
        adapter_path_3 = 'https://storageaccount.blob.core.windows.net/fs/a/3.csv'
        adapter_path_4 = 'https://storageaccount.blob.core.windows.net:443/fs/a/4.csv'

        # All corpus paths are created before any adapter path is rebuilt; the
        # call order of the original test is preserved on purpose.
        corpus_path_1 = adls_adapter.create_corpus_path(adapter_path_1)
        corpus_path_2 = adls_adapter.create_corpus_path(adapter_path_2)
        corpus_path_3 = adls_adapter.create_corpus_path(adapter_path_3)
        corpus_path_4 = adls_adapter.create_corpus_path(adapter_path_4)

        self.assertEqual(corpus_path_1, '/a/1.csv')
        self.assertEqual(corpus_path_2, '/a/2.csv')
        self.assertEqual(corpus_path_3, '/a/3.csv')
        self.assertEqual(corpus_path_4, '/a/4.csv')

        # Each corpus path must map back to the exact adapter path it came from.
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_1),
                         adapter_path_1)
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_2),
                         adapter_path_2)
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_3),
                         adapter_path_3)
        self.assertEqual(adls_adapter.create_adapter_path(corpus_path_4),
                         adapter_path_4)

        # Adapter whose hostname carries an explicit, non-default port.
        host_2 = 'storageaccount.blob.core.windows.net:8888'
        adls_adapter = ADLSAdapter(root=root,
                                   hostname=host_2,
                                   tenant='dummyTenant',
                                   resource='dummyResource',
                                   client_id='dummyClientId',
                                   secret='dummySecret')
        adapter_path_5 = 'https://storageaccount.blob.core.windows.net:8888/fs/a/5.csv'
        adapter_path_6 = 'https://storageaccount.dfs.core.windows.net:8888/fs/a/6.csv'
        adapter_path_7 = 'https://storageaccount.blob.core.windows.net/fs/a/7.csv'

        self.assertEqual(adls_adapter.create_corpus_path(adapter_path_5),
                         '/a/5.csv')
        self.assertEqual(adls_adapter.create_corpus_path(adapter_path_6),
                         '/a/6.csv')
        # A URL without the configured port must not resolve to a corpus path.
        self.assertIsNone(adls_adapter.create_corpus_path(adapter_path_7))