def sanitize_path(path):
    """Clean up ``path`` according to its inferred storage protocol.

    * ``http`` / ``https``: the ``"<protocol>://"`` text is removed and the
      remainder is passed through ``os.path.normpath``.
    * ``file``: the path is passed through ``os.path.normpath``.
    * anything else: the path is returned untouched.
    """
    scheme = infer_storage_options(path)['protocol']

    if scheme == 'file':
        # Just removing trailing slashes from file paths.
        return os.path.normpath(path)

    if scheme in ('http', 'https'):
        # Most FSs remove the protocol but not HTTPFS. We need to strip
        # it to match properly.
        # NOTE: str.replace drops every occurrence of the prefix text,
        # not only the leading one.
        prefix = "{}://".format(scheme)
        return os.path.normpath(path.replace(prefix, ''))

    # Any other protocol: leave the path alone.
    return path
def sanitize_path(path):
    """Clean up ``path`` and return it in posix form.

    The storage protocol is inferred from ``path`` first:

    * ``http`` / ``https``: the ``"<protocol>://"`` text is removed and the
      remainder is passed through ``os.path.normpath``.
    * ``file``: the path is passed through ``os.path.normpath`` and its
      colons are removed.

    Whatever the protocol, the result is finally handed to
    ``make_path_posix``.
    """
    scheme = infer_storage_options(path)['protocol']

    if scheme == 'file':
        # normpath removes trailing slashes from file paths; the colons
        # are stripped afterwards.
        # NOTE(review): indentation was lost in this source; colon removal
        # is assumed to apply to the 'file' branch only -- confirm.
        path = os.path.normpath(path).replace(':', '')
    elif scheme in ('http', 'https'):
        # Most FSs remove the protocol but not HTTPFS. We need to strip
        # it to match properly.
        prefix = "{}://".format(scheme)
        path = os.path.normpath(path.replace(prefix, ''))

    # Otherwise we just make sure that path is posix
    return make_path_posix(path)
def test_infer_storage_options():
    """Check how ``infer_storage_options`` splits local paths and URLs.

    Covers plain POSIX and Windows-style local paths, a fully populated
    ``hdfs`` URL (credentials, host, port, query, fragment, inherited
    options), ``http`` URLs, bucket-style protocols, and the ``KeyError``
    raised when inherited options collide with inferred ones.
    """
    so = infer_storage_options('/mnt/datasets/test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert not so

    assert infer_storage_options('./test.csv')['path'] == './test.csv'
    assert infer_storage_options('../test.csv')['path'] == '../test.csv'

    so = infer_storage_options('C:\\test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == 'C:\\test.csv'
    assert not so

    assert infer_storage_options('d:\\test.csv')['path'] == 'd:\\test.csv'
    assert infer_storage_options('\\test.csv')['path'] == '\\test.csv'
    assert infer_storage_options('.\\test.csv')['path'] == '.\\test.csv'
    assert infer_storage_options('test.csv')['path'] == 'test.csv'

    # The credentials embedded in the URL must be the literal values that
    # the username/password assertions below expect.
    so = infer_storage_options(
        'hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm',
        inherit_storage_options={'extra': 'value'})
    assert so.pop('protocol') == 'hdfs'
    assert so.pop('username') == 'username'
    assert so.pop('password') == 'pwd'
    assert so.pop('host') == 'Node'
    assert so.pop('port') == 123
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert so.pop('url_query') == 'q=1'
    assert so.pop('url_fragment') == 'fragm'
    assert so.pop('extra') == 'value'
    assert not so

    # Mixed-case user and host: the assertions require case to be kept.
    so = infer_storage_options(
        'hdfs://User-name@Node-name.com/mnt/datasets/test.csv')
    assert so.pop('username') == 'User-name'
    assert so.pop('host') == 'Node-name.com'

    u = 'http://127.0.0.1:8080/test.csv'
    assert infer_storage_options(u) == {'protocol': 'http', 'path': u}

    # For s3 and gcs the netloc is actually the bucket name, so we want to
    # include it in the path. Test that:
    # - Parsing doesn't lowercase the bucket
    # - The bucket is included in path
    for protocol in ['s3', 'gcs', 'gs']:
        options = infer_storage_options(
            '%s://Bucket-name.com/test.csv' % protocol)
        assert options['path'] == 'Bucket-name.com/test.csv'

    # Inherited options may not override keys inferred from the URL.
    with pytest.raises(KeyError):
        infer_storage_options('file:///bucket/file.csv', {'path': 'collide'})
    with pytest.raises(KeyError):
        infer_storage_options('hdfs:///bucket/file.csv',
                              {'protocol': 'collide'})
def test_infer_storage_options_c(urlpath, expected_path):
    """Check that ``urlpath`` is inferred as a local file at ``expected_path``.

    NOTE(review): both arguments are presumably supplied by a parametrize
    decorator that is outside this view -- confirm.
    """
    inferred = infer_storage_options(urlpath)

    # The protocol is checked first, then the resolved path.
    assert inferred['protocol'] == 'file'
    assert inferred['path'] == expected_path
def _trim_filename(self, fn):
    """Return the ``'path'`` entry of the storage options inferred from ``fn``."""
    return infer_storage_options(fn)['path']
def test_infer_storage_options():
    """Check how ``infer_storage_options`` splits local paths and URLs.

    Covers plain POSIX and Windows-style local paths, a fully populated
    ``hdfs`` URL (credentials, host, port, query, fragment, inherited
    options), host extraction for ``s3`` and ``http`` URLs, and the
    ``KeyError`` raised when inherited options collide with inferred ones.
    """
    so = infer_storage_options('/mnt/datasets/test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert not so

    assert infer_storage_options('./test.csv')['path'] == './test.csv'
    assert infer_storage_options('../test.csv')['path'] == '../test.csv'

    so = infer_storage_options('C:\\test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == 'C:\\test.csv'
    assert not so

    assert infer_storage_options('d:\\test.csv')['path'] == 'd:\\test.csv'
    assert infer_storage_options('\\test.csv')['path'] == '\\test.csv'
    assert infer_storage_options('.\\test.csv')['path'] == '.\\test.csv'
    assert infer_storage_options('test.csv')['path'] == 'test.csv'

    # The credentials embedded in the URL must be the literal values that
    # the username/password assertions below expect.
    so = infer_storage_options(
        'hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm',
        inherit_storage_options={'extra': 'value'})
    assert so.pop('protocol') == 'hdfs'
    assert so.pop('username') == 'username'
    assert so.pop('password') == 'pwd'
    assert so.pop('host') == 'Node'
    assert so.pop('port') == 123
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert so.pop('url_query') == 'q=1'
    assert so.pop('url_fragment') == 'fragm'
    assert so.pop('extra') == 'value'
    assert not so

    # Mixed-case user and host: the assertions require case to be kept.
    so = infer_storage_options(
        'hdfs://User-name@Node-name.com/mnt/datasets/test.csv')
    assert so.pop('username') == 'User-name'
    assert so.pop('host') == 'Node-name.com'

    s3_options = infer_storage_options('s3://Bucket-name.com/test.csv')
    assert s3_options['host'] == 'Bucket-name.com'
    http_options = infer_storage_options('http://127.0.0.1:8080/test.csv')
    assert http_options['host'] == '127.0.0.1'

    # Inherited options may not override keys inferred from the URL.
    with pytest.raises(KeyError):
        infer_storage_options('file:///bucket/file.csv', {'path': 'collide'})
    with pytest.raises(KeyError):
        infer_storage_options('hdfs:///bucket/file.csv',
                              {'protocol': 'collide'})
def test_infer_storage_options():
    """Check how ``infer_storage_options`` splits local paths and URLs.

    Covers plain POSIX and Windows-style local paths, a fully populated
    ``hdfs`` URL (credentials, host, port, query, fragment, inherited
    options), ``http`` URLs, bucket-style protocols, and the ``KeyError``
    raised when inherited options collide with inferred ones.
    """
    so = infer_storage_options("/mnt/datasets/test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "/mnt/datasets/test.csv"
    assert not so

    assert infer_storage_options("./test.csv")["path"] == "./test.csv"
    assert infer_storage_options("../test.csv")["path"] == "../test.csv"

    so = infer_storage_options("C:\\test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "C:\\test.csv"
    assert not so

    assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
    assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
    assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
    assert infer_storage_options("test.csv")["path"] == "test.csv"

    # The credentials embedded in the URL must be the literal values that
    # the username/password assertions below expect.
    so = infer_storage_options(
        "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
        inherit_storage_options={"extra": "value"},
    )
    assert so.pop("protocol") == "hdfs"
    assert so.pop("username") == "username"
    assert so.pop("password") == "pwd"
    assert so.pop("host") == "Node"
    assert so.pop("port") == 123
    assert so.pop("path") == "/mnt/datasets/test.csv"
    assert so.pop("url_query") == "q=1"
    assert so.pop("url_fragment") == "fragm"
    assert so.pop("extra") == "value"
    assert not so

    # Mixed-case user and host: the assertions require case to be kept.
    so = infer_storage_options(
        "hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
    assert so.pop("username") == "User-name"
    assert so.pop("host") == "Node-name.com"

    u = "http://127.0.0.1:8080/test.csv"
    assert infer_storage_options(u) == {"protocol": "http", "path": u}

    # For s3 and gcs the netloc is actually the bucket name, so we want to
    # include it in the path. Test that:
    # - Parsing doesn't lowercase the bucket
    # - The bucket is included in path
    for protocol in ["s3", "gcs", "gs"]:
        options = infer_storage_options(
            "%s://Bucket-name.com/test.csv" % protocol)
        assert options["path"] == "Bucket-name.com/test.csv"

    # Inherited options may not override keys inferred from the URL.
    with pytest.raises(KeyError):
        infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
    with pytest.raises(KeyError):
        infer_storage_options("hdfs:///bucket/file.csv",
                              {"protocol": "collide"})
def test_infer_storage_options_c(urlpath, expected_path):
    """Check that ``urlpath`` is inferred as a local file at ``expected_path``.

    NOTE(review): both arguments are presumably supplied by a parametrize
    decorator that is outside this view -- confirm.
    """
    inferred = infer_storage_options(urlpath)

    # The protocol is checked first, then the resolved path.
    assert inferred["protocol"] == "file"
    assert inferred["path"] == expected_path
def test_infer_storage_options():
    """Check how ``infer_storage_options`` splits local paths and URLs.

    Covers plain POSIX and Windows-style local paths, a fully populated
    ``hdfs`` URL (credentials, host, port, query, fragment, inherited
    options), host extraction for ``s3`` and ``http`` URLs, and the
    ``KeyError`` raised when inherited options collide with inferred ones.
    """
    so = infer_storage_options('/mnt/datasets/test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert not so

    assert infer_storage_options('./test.csv')['path'] == './test.csv'
    assert infer_storage_options('../test.csv')['path'] == '../test.csv'

    so = infer_storage_options('C:\\test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == 'C:\\test.csv'
    assert not so

    assert infer_storage_options('d:\\test.csv')['path'] == 'd:\\test.csv'
    assert infer_storage_options('\\test.csv')['path'] == '\\test.csv'
    assert infer_storage_options('.\\test.csv')['path'] == '.\\test.csv'
    assert infer_storage_options('test.csv')['path'] == 'test.csv'

    # The credentials embedded in the URL must be the literal values that
    # the username/password assertions below expect.
    so = infer_storage_options(
        'hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm',
        inherit_storage_options={'extra': 'value'})
    assert so.pop('protocol') == 'hdfs'
    assert so.pop('username') == 'username'
    assert so.pop('password') == 'pwd'
    assert so.pop('host') == 'Node'
    assert so.pop('port') == 123
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert so.pop('url_query') == 'q=1'
    assert so.pop('url_fragment') == 'fragm'
    assert so.pop('extra') == 'value'
    assert not so

    # Mixed-case user and host: the assertions require case to be kept.
    so = infer_storage_options(
        'hdfs://User-name@Node-name.com/mnt/datasets/test.csv')
    assert so.pop('username') == 'User-name'
    assert so.pop('host') == 'Node-name.com'

    assert infer_storage_options(
        's3://Bucket-name.com/test.csv')['host'] == 'Bucket-name.com'
    assert infer_storage_options(
        'http://127.0.0.1:8080/test.csv')['host'] == '127.0.0.1'

    # Inherited options may not override keys inferred from the URL.
    with pytest.raises(KeyError):
        infer_storage_options('file:///bucket/file.csv', {'path': 'collide'})
    with pytest.raises(KeyError):
        infer_storage_options('hdfs:///bucket/file.csv',
                              {'protocol': 'collide'})