Example #1
def token_restore():
    cache = GCSFileSystem.tokens
    try:
        GCSFileSystem.tokens = {}
        yield
    finally:
        GCSFileSystem.tokens = cache
        GCSFileSystem._save_tokens()
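The snippet above is a bare generator; before use it would typically be wrapped, for example with contextlib.contextmanager or by registering it as a pytest fixture. A minimal, illustrative sketch of the context-manager form (the wrapping and usage shown here are an assumption, not part of the original code):

import contextlib

from gcsfs import GCSFileSystem

# hypothetical wrapping of the token_restore generator defined above
token_restore_cm = contextlib.contextmanager(token_restore)

with token_restore_cm():
    # inside the block the class-level token cache starts out empty
    assert not GCSFileSystem.tokens
# on exit the previous cache is restored and persisted via _save_tokens()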
Example #2
 def __init__(self, project_id: str, bucket_name: str,
              service_account_file: str):
     self._bucket_name = bucket_name
     credentials = service_account.Credentials.from_service_account_file(
         service_account_file)
     self._client = storage.Client(project=project_id, credentials=credentials)
     self._fs = GCSFileSystem(token=service_account_file,
                              check_connection=True)
Example #3
def test_raise_on_project_mismatch(mock_auth):
    mock_auth.default.return_value = (requests.Session(), "my_other_project")
    match = "'my_project' does not match the google default project 'my_other_project'"
    with pytest.raises(ValueError, match=match):
        GCSFileSystem(project="my_project", token="google_default")

    result = GCSFileSystem(token="google_default")
    assert result.project == "my_other_project"
Example #4
def token_restore():
    cache = GoogleCredentials.tokens
    try:
        GoogleCredentials.tokens = {}
        yield
    finally:
        GoogleCredentials.tokens = cache
        GoogleCredentials._save_tokens()
        GCSFileSystem.clear_instance_cache()
Example #5
def test_current():
    from google.auth import credentials

    with gcs_maker() as gcs:
        assert GCSFileSystem.current() is gcs
        gcs2 = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
        assert gcs2.session is gcs.session
        gcs2 = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, secure_serialize=False)
        assert isinstance(gcs2.token, credentials.Credentials)
Example #6
def test_request_header():
    with gcs_maker():
        gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, requester_pays=True)
        # test directly against `_call` to inspect the result
        r = gcs._call(
            "GET",
            "b/{}/o/",
            TEST_REQUESTER_PAYS_BUCKET,
            delimiter="/",
            prefix="test",
            maxResults=100,
            info_out=True,
        )
        assert r.headers["User-Agent"] == "python-gcsfs/" + version
Example #7
def test_request_user_project():
    with gcs_maker():
        gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, requester_pays=True)
        # test directly against `_call` to inspect the result
        r = gcs._call(
            "GET",
            "b/{}/o/",
            TEST_REQUESTER_PAYS_BUCKET,
            delimiter="/",
            prefix="test",
            maxResults=100,
        )
        qs = urlparse(r.request.url).query
        result = parse_qs(qs)
        assert result["userProject"] == [TEST_PROJECT]
Example #8
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    try:
        if not gcs.exists(TEST_BUCKET):
            gcs.mkdir(TEST_BUCKET)
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + '/' + fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        for f in gcs.walk(TEST_BUCKET):
            gcs.rm(f)
Example #9
def test_simple(token_restore):
    assert not GCSFileSystem.tokens
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    assert gcs.ls('')

    # token is now cached
    gcs = GCSFileSystem(TEST_PROJECT)
    assert gcs.ls('')
Example #10
def _load_fs_and_path(path, creds=None, session_creds=True, google_cloud_project=""):
    """Given url(path) and creds returns filesystem required for accessing that file + url's filepath in that filesystem"""
    if (
        path.startswith("./")
        or path.startswith("/")
        or path.startswith("../")
        or path.startswith("~/")
    ):
        return fsspec.filesystem("file"), os.path.expanduser(path.replace("fs://", ""))

    if (
        session_creds
        and creds is None
        and not path.startswith("s3://")
        and not path.startswith("gcs://")
    ):
        path, creds = _connect(path)

    if path.startswith("s3://"):
        path = path[5:]
        if creds is not None and session_creds:

            return (
                fsspec.filesystem(
                    "s3",
                    key=creds["access_key"],
                    secret=creds["secret_key"],
                    token=creds["session_token"],
                    client_kwargs={
                        "endpoint_url": creds["endpoint"],
                        "region_name": creds["region"],
                    },
                ),
                path,
            )
        elif creds is not None:
            return (
                fsspec.filesystem(
                    "s3",
                    key=creds.get("access_key"),
                    secret=creds.get("secret_key"),
                ),
                path,
            )
        else:
            return fsspec.filesystem("s3"), path
    elif path.startswith("gcs://"):
        return (
            GCSFileSystem(project=google_cloud_project, token=creds),
            path[6:],
        )
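A brief usage sketch for the dispatcher above. The bucket, project and credential-file names are hypothetical placeholders; the helper itself is the one defined in this example.

# local-looking paths short-circuit to a local filesystem
fs, local_path = _load_fs_and_path("./data/train.csv")
# -> (LocalFileSystem, "./data/train.csv")

# gcs:// URLs return a GCSFileSystem and strip the scheme prefix
fs, gcs_path = _load_fs_and_path(
    "gcs://my-bucket/data/train.csv",      # hypothetical bucket/object
    creds="service-account.json",          # hypothetical key file path
    google_cloud_project="my-project",     # hypothetical project id
)
# -> (GCSFileSystem, "my-bucket/data/train.csv")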
Example #11
class GoogleCloudStorage(BlobStorage):
    """Google Cloud Storage storage provider.

    Args:
        project_id: the ID of the Google Cloud project.
        bucket_name: the name of the Cloud Storage bucket to use for all blobs.
        service_account_file: the filename of the GCP service account JSON key 
            file.
    """
    def __init__(self, project_id: str, bucket_name: str,
                 service_account_file: str):
        self._bucket_name = bucket_name
        credentials = service_account.Credentials.from_service_account_file(
            service_account_file)
        self._client = storage.Client(project=project_id, credentials=credentials)
        self._fs = GCSFileSystem(token=service_account_file,
                                 check_connection=True)

    def get_object(self, blob_name):
        blob = self._client.bucket(self._bucket_name).get_blob(blob_name)
        if blob is None:
            raise ValueError("Cannot find blob: "+blob_name)
        return json.loads(blob.download_as_string().decode("utf-8"))

    @contextlib.contextmanager
    def get_file(self, blob_name):
        path = os.path.join(self._bucket_name, blob_name)
        # open outside the try block so a failed open does not reach the
        # finally clause with an unbound fileobj
        fileobj = self._fs.open(path, 'rb')
        try:
            yield fileobj
        finally:
            fileobj.close()

    def put_object(self, obj, blob_name):
        blob = self._client.bucket(self._bucket_name).blob(blob_name)
        data = json.dumps(obj).encode("utf-8")
        blob.upload_from_string(data, content_type="application/json")
        blob.reload()
        return Blob(blob_name, blob.size)

    def put_file(self, fileobj, blob_name):
        blob = self._client.bucket(self._bucket_name).blob(blob_name)
        blob.upload_from_file(fileobj)
        blob.reload()
        return Blob(blob_name, blob.size)
    
    def put_avro(self, schema, records, blob_name, codec='snappy'):
        path = os.path.join(self._bucket_name, blob_name)
        tmp_path = os.path.join(os.path.dirname(path),
                "~{}".format(os.path.basename(path)))
        with self._fs.open(tmp_path, "wb") as of:
            fastavro.writer(of, schema, records, codec)
        self._fs.mv(tmp_path, path)
        self._fs.setxattrs(path, content_type="avro/binary")
        blob = self._client.bucket(self._bucket_name).get_blob(blob_name)
        if blob is None:
            raise RuntimeError("Cannot find new avro blob: "+blob_name)
        return Blob(blob_name, blob.size)
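A short usage sketch for the provider above; the project, bucket and key-file names are hypothetical placeholders, not values taken from the original code.

store = GoogleCloudStorage(
    project_id="my-project",
    bucket_name="my-bucket",
    service_account_file="service-account.json",
)

# JSON round-trip through put_object / get_object
store.put_object({"status": "ok"}, "health/check.json")
assert store.get_object("health/check.json") == {"status": "ok"}

# stream the blob back without materialising it on disk
with store.get_file("health/check.json") as f:
    raw = f.read()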
Example #12
def test_many_connect():
    from multiprocessing.pool import ThreadPool

    GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)

    def task(i):
        GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN).ls("")
        return True

    pool = ThreadPool(processes=20)
    out = pool.map(task, range(40))
    assert all(out)
    pool.close()
    pool.join()
Example #13
def test_user_project_fallback_google_default(mock_auth):
    mock_auth.default.return_value = (requests.Session(), "my_default_project")
    fs = GCSFileSystem(token="google_default")
    assert fs.project == "my_default_project"
Example #14
def test_user_project_cat():
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, requester_pays=True)
    result = gcs.cat(TEST_REQUESTER_PAYS_BUCKET + "/foo.csv")
    assert len(result)
Example #15
 def task(i):
     GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN).ls("")
     return True
Example #16
def test_current():
    with gcs_maker() as gcs:
        assert GCSFileSystem.current() is gcs
        gcs2 = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
        assert gcs2.session is gcs.session
Example #17
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        try:
            gcs.mkdir(TEST_BUCKET,
                      default_acl="authenticatedread",
                      acl="publicReadWrite")
        except gcsfs.utils.HttpError:
            pass

        # ensure we're empty.
        gcs.rm(TEST_BUCKET, recursive=True)

        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except FileNotFoundError:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + "/" + fname, "wb") as f:
                        f.write(data)
        gcs.invalidate_cache()
        yield gcs
    finally:
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except:  # noqa: E722
                pass
Example #18
def test_simple():
    assert not GCSFileSystem.tokens
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.ls(TEST_BUCKET)  # no error
    gcs.ls("/" + TEST_BUCKET)  # OK to lead with '/'
Example #19
 def task(i):
     # first instance is made within thread - creating loop
     GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN).ls("")
     return True
Example #20
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        try:
            gcs.mkdir(TEST_BUCKET, default_acl="authenticatedread",
                      acl="publicReadWrite")
        except:
            pass
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET+'/'+fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except:
                pass
Example #21
parser.add_argument('--firecloud', action='store_true', help="Use logic to generate primary keys for Terra upload via Firecloud")
parser.add_argument('--debug', action='store_true', help="Write additional logs for debugging")
args = parser.parse_args()

log_level = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(level=log_level)

log = logging.getLogger(__name__)

TERRA_COLUMN_LIMIT = 1000

table_names = args.table or ['cslb', 'hles_cancer_condition', 'hles_dog', 'hles_health_condition', 'hles_owner',
                             'environment', 'sample', 'eols']
PRIMARY_KEY_PREFIX = 'entity'

gcs = GCSFileSystem()

# create a service object to handle all aspects of generating a primary key
@dataclass
class PrimaryKeyGenerator:
    table_name: str
    pk_name: str = field(init=False)
    firecloud: bool

    # this will calculate pk_name during init
    def __post_init__(self):
        # most tables should have "dog_id" as a key
        if self.table_name in {"hles_dog", "hles_cancer_condition", "hles_health_condition", "environment", "cslb", "eols"}:
            self.pk_name = 'dog_id'
        # owner table is linked to hles_dog via "owner_id"
        elif self.table_name == 'hles_owner':
Example #22
def test_validate_response():
    gcs = GCSFileSystem(token="anon")
    gcs.validate_response(200, None, None, "/path")

    # HttpError with no JSON body
    with pytest.raises(HttpError) as e:
        gcs.validate_response(503, b"", None, "/path")
    assert e.value.code == 503
    assert e.value.message == ""

    # HttpError with JSON body
    j = {"error": {"code": 503, "message": b"Service Unavailable"}}
    with pytest.raises(HttpError) as e:
        gcs.validate_response(503, None, j, "/path")
    assert e.value.code == 503
    assert e.value.message == b"Service Unavailable"

    # 403
    j = {"error": {"message": "Not ok"}}
    with pytest.raises(IOError, match="Forbidden: /path\nNot ok"):
        gcs.validate_response(403, None, j, "/path")

    # 404
    with pytest.raises(FileNotFoundError):
        gcs.validate_response(404, b"", None, "/path")

    # 502
    with pytest.raises(ProxyError):
        gcs.validate_response(502, b"", None, "/path")
Example #23
def gcs_maker(populate=False, **kwargs):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, **kwargs)
    gcs.invalidate_cache()
    try:
        # ensure we're empty.
        try:
            gcs.rm(TEST_BUCKET, recursive=True)
        except FileNotFoundError:
            pass
        try:
            gcs.mkdir(
                TEST_BUCKET, default_acl="authenticatedread", acl="publicReadWrite"
            )
        except Exception:
            pass

        if populate:
            gcs.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()})
        gcs.invalidate_cache()
        yield gcs
    finally:
        try:
            gcs.rm(gcs.find(TEST_BUCKET))
        except:  # noqa: E722
            pass
Example #24
def test_validate_response():
    gcs = GCSFileSystem(token="anon")
    gcs.validate_response(200, None, None, "/path")

    # HttpError with no JSON body
    with pytest.raises(HttpError) as e:
        gcs.validate_response(503, b"", None, "/path")
    assert e.value.code == 503
    assert e.value.message == ", 503"

    # HttpError with JSON body
    j = {"error": {"code": 503, "message": b"Service Unavailable"}}
    with pytest.raises(HttpError) as e:
        gcs.validate_response(503, None, j, "/path")
    assert e.value.code == 503
    assert e.value.message == b"Service Unavailable, 503"

    # 403
    j = {"error": {"message": "Not ok"}}
    with pytest.raises(IOError, match="Forbidden: /path\nNot ok"):
        gcs.validate_response(403, None, j, "/path")

    # 404
    with pytest.raises(FileNotFoundError):
        gcs.validate_response(404, b"", None, "/path")

    # 502
    with pytest.raises(ProxyError):
        gcs.validate_response(502, b"", None, "/path")

    # ChecksumError
    md5 = repr(base64.b64encode(hashlib.md5(b"foo").digest()))[2:-1]
    with pytest.raises(ChecksumError):
        gcs.validate_response(0, b"f", None, "/path",
                              {"X-Goog-Hash": f"md5={md5}"})
def test_current(token_restore):
    with gcs_maker() as gcs:
        assert GCSFileSystem.current() is gcs