def __init__(
    self,
    bucket_name,
    blob_name,
    read_size=QuerybookSettings.STORE_READ_SIZE,
    max_read_size=QuerybookSettings.STORE_MAX_READ_SIZE,
):
    from google.cloud import storage
    from google.auth.transport.requests import AuthorizedSession
    from google.resumable_media.requests import ChunkedDownload

    # First check for existence
    cred = get_google_credentials()
    client = storage.Client(project=cred.project_id, credentials=cred)
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    if not blob.exists():
        raise FileDoesNotExist("{}/{} does not exist".format(bucket_name, blob_name))

    # Start the transport process
    self._transport = AuthorizedSession(credentials=client._credentials)
    self._stream = BytesIO()
    download_url = (
        f"https://storage.googleapis.com/storage/v1/b/"
        f"{bucket_name}/o/{quote(blob_name, safe='')}?alt=media"
    )
    self._download = ChunkedDownload(download_url, read_size, self._stream)
    super(GoogleDownloadClient, self).__init__(read_size, max_read_size)
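# A hypothetical usage sketch (not part of the original class): assuming the
# base download client calls a read-style hook, each chunk can be pulled
# through the authorized session like this. The method name and buffer
# handling here are assumptions.
def read(self):
    if self._download.finished:
        return b""
    self._download.consume_next_chunk(self._transport)  # fetch up to read_size bytes
    chunk = self._stream.getvalue()
    self._stream.seek(0)
    self._stream.truncate(0)  # reset the buffer so the next chunk starts fresh
    return chunk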
def _download_blob(self, blob):
    destination_file = self._destination_file(blob.name)
    transport = _create_transport()
    chunk_size = 10 * 1024 * 1024  # 10MB
    download = ChunkedDownload(blob.media_link, chunk_size, destination_file)
    initial_bytes_downloaded = self._status.downloaded_bytes
    while not download.finished:
        download.consume_next_chunk(transport)
        self._update_status(
            downloaded_bytes=initial_bytes_downloaded + download.bytes_downloaded)
    self._update_status(downloaded_files=self._status.downloaded_files + 1)
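# `_create_transport()` is referenced above but not shown. A minimal sketch,
# assuming Application Default Credentials and a read-only storage scope
# (the helper name and scope choice are assumptions, not the original code):
import google.auth
from google.auth.transport.requests import AuthorizedSession

def _create_transport():
    credentials, _ = google.auth.default(
        scopes=("https://www.googleapis.com/auth/devstorage.read_only",))
    return AuthorizedSession(credentials)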
def main(args):
    file_location = Path(args.file_location)
    file_name = file_location.name
    local_file = file_location
    client = storage.Client()
    blob_folder = "word2vec_service/v2"
    bucket_name = "hutoma-datasets"
    bucket = client.get_bucket(bucket_name)
    blob_path = "{}/{}".format(blob_folder, file_name)
    blob = bucket.blob(blob_path)
    bytes_in_1MB = 1024 * 1024
    print("Operation {}: blob is {}, local file is {}".format(
        args.operation, blob_path, local_file))
    transport = g_requests.AuthorizedSession(credentials=client._credentials)
    if args.operation == "download":
        if not blob.exists():
            raise DataError("Blob {} doesn't exist".format(blob_path))
        if local_file.exists():
            confirm_prompt("File {} exists, overwrite?".format(local_file))
        url = ("https://www.googleapis.com/download/storage/v1/b/"
               "{bucket}/o/{blob_name}?alt=media").format(
                   bucket=bucket_name,
                   blob_name=urllib.parse.quote_plus(blob_path))
        chunk_size = bytes_in_1MB * 5  # 5MB
        with local_file.open("wb") as file_stream:
            download = ChunkedDownload(url, chunk_size, file_stream)
            download.consume_next_chunk(transport)
            if not download.finished:
                process_operation(transport, download)
    elif args.operation == "upload":
        if not local_file.exists():
            raise DataError("File {} doesn't exist".format(local_file))
        if blob.exists():
            confirm_prompt("Blob {} exists, overwrite?".format(blob_path))
        url = ("https://www.googleapis.com/upload/storage/v1/b/{bucket}"
               "/o?uploadType=resumable").format(bucket=bucket_name)
        chunk_size = bytes_in_1MB  # 1MB
        upload = ResumableUpload(url, chunk_size)
        metadata = {"name": blob_path}
        content_type = "application/octet-stream"
        with local_file.open("rb") as file_stream:
            response = upload.initiate(transport, file_stream, metadata, content_type)
            if response.status_code != 200:
                raise DataError("Failed to initiate upload")
            process_operation(transport, upload)
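# `process_operation` is referenced above but not shown. A plausible sketch:
# it drains whichever resumable-media operation it receives. ChunkedDownload
# advances with consume_next_chunk() and ResumableUpload with
# transmit_next_chunk(), so dispatching on the available method covers both
# cases (this dispatch is an assumption, not the original implementation):
def process_operation(transport, operation):
    while not operation.finished:
        if hasattr(operation, "consume_next_chunk"):
            operation.consume_next_chunk(transport)   # download path
        else:
            operation.transmit_next_chunk(transport)  # upload path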
def _download_from_gcs_bucket(bucket_name, bucket_path, local_path,
                              expected_size_in_bytes=None, progress_indicator=None):
    # pylint: disable=import-outside-toplevel
    # lazily initialize Google Cloud Storage support - we might not need it
    import google.auth
    import google.auth.transport.requests as tr_requests
    import google.oauth2.credentials
    # Using Google Resumable Media as the standard storage library doesn't support progress
    # (https://github.com/googleapis/python-storage/issues/27)
    from google.resumable_media.requests import ChunkedDownload

    ro_scope = "https://www.googleapis.com/auth/devstorage.read_only"
    access_token = os.environ.get("GOOGLE_AUTH_TOKEN")
    if access_token:
        credentials = google.oauth2.credentials.Credentials(
            token=access_token, scopes=(ro_scope,))
    else:
        # https://google-auth.readthedocs.io/en/latest/user-guide.html
        credentials, _ = google.auth.default(scopes=(ro_scope,))
    transport = tr_requests.AuthorizedSession(credentials)
    chunk_size = 50 * 1024 * 1024  # 50MB
    with open(local_path, "wb") as local_fp:
        media_url = _build_gcs_object_url(bucket_name, bucket_path)
        download = ChunkedDownload(media_url, chunk_size, local_fp)
        # consume the first chunk to allow us to calculate the total bytes
        download.consume_next_chunk(transport)
        if not expected_size_in_bytes:
            expected_size_in_bytes = download.total_bytes
        while not download.finished:
            if progress_indicator and download.bytes_downloaded and download.total_bytes:
                progress_indicator(download.bytes_downloaded, expected_size_in_bytes)
            download.consume_next_chunk(transport)
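# `_build_gcs_object_url` is referenced above but not shown. A minimal sketch,
# modeled on the media URLs used in the other snippets here (the exact URL
# shape is an assumption):
import urllib.parse

def _build_gcs_object_url(bucket_name, bucket_path):
    return ("https://www.googleapis.com/download/storage/v1/b/"
            "{bucket}/o/{object_path}?alt=media").format(
                bucket=bucket_name,
                object_path=urllib.parse.quote(bucket_path, safe=""))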
url_template = (u'https://www.googleapis.com/download/storage/v1/b/'
                u'{bucket}/o/{blob_name}?alt=media')
url_template_upload = (u'https://www.googleapis.com/upload/storage/v1/b/{bucket}/o?'
                       u'uploadType=resumable')
upload_url = url_template_upload.format(bucket=bucket_upload)
media_url = url_template.format(bucket=bucket, blob_name=blob_name)
chunk_size = 1 * 1024 * 1024  # 1MB
stream = io.BytesIO()
download = ChunkedDownload(media_url, chunk_size, stream)
upload = ResumableUpload(upload_url, chunk_size)

# download the source object chunk by chunk, replacing commas with pipes
data = []
while not download.finished:
    response = download.consume_next_chunk(transport)
    data.append(response.content.decode("utf-8").replace(',', '|'))
new_data = ''.join(data)

# re-upload the transformed content with a resumable upload
stream_upload = io.BytesIO(bytes(new_data, 'UTF-8'))
metadata = {u'name': blob_name_upload}
response_upload = upload.initiate(transport, stream_upload, metadata, content_type)
while not upload.finished:
    upload.transmit_next_chunk(transport)
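# The snippet above assumes `transport`, `bucket`, `blob_name`, `bucket_upload`,
# `blob_name_upload`, and `content_type` are already defined. A minimal setup
# sketch (all concrete values below are placeholders):
import google.auth
from google.auth.transport.requests import AuthorizedSession

credentials, _ = google.auth.default(
    scopes=("https://www.googleapis.com/auth/devstorage.read_write",))
transport = AuthorizedSession(credentials)
bucket = "source-bucket"          # placeholder
blob_name = "input.csv"           # placeholder
bucket_upload = "dest-bucket"     # placeholder
blob_name_upload = "output.csv"   # placeholder
content_type = "text/plain"       # placeholder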
class BlobReader(io.IOBase):
    """File-like, seekable read access to a GCS blob via resumable media."""

    def __init__(self, blob, start=0, end=None, client=None):
        self.blob = blob
        self.start = start
        self.end = end
        self.client = client
        self.download = None
        self.file_obj = None

    def _make_download(self):
        self.file_obj = io.BytesIO()
        download_url = self.blob._get_download_url()
        headers = storage.blob._get_encryption_headers(self.blob._encryption_key)
        headers['accept-encoding'] = 'gzip'
        if self.blob.chunk_size is None:
            # no chunk size configured: fetch the whole range in one request
            self.download = Download(
                download_url, stream=self.file_obj, headers=headers,
                start=self.start, end=self.end)
        else:
            self.download = ChunkedDownload(
                download_url, self.blob.chunk_size, self.file_obj,
                headers=headers,
                start=self.start if self.start else 0,
                end=self.end)

    def read(self, size=-1):
        # serve from the local buffer first, then fetch more if it runs short
        value = b''
        if self.file_obj.tell() < len(self.file_obj.getvalue()):
            value = self.file_obj.read(size)
        if len(value) < size or size < 0:
            self.start += self.file_obj.tell()
            self._make_download()
            transport = self.blob._get_transport(self.client)
            if self.blob.chunk_size is None:
                self.download.consume(transport)
            else:
                self.download.consume_next_chunk(transport)
            self.file_obj.seek(0)
            value += self.file_obj.read(size - len(value))
        return value

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        if whence == 0:
            # absolute seek: restart the download at the new offset
            self.start = offset or 0
            self._make_download()
            return self.start
        elif whence == 1:
            # relative seek: stay inside the current buffer when possible
            if self.file_obj is None:
                self.start += offset
                return self.start
            pos = self.file_obj.tell() + offset
            if pos < 0 or pos > len(self.file_obj.getvalue()):
                self.start += pos
                self._make_download()
                return self.start
            self.file_obj.seek(offset, 1)
            return self.start + self.file_obj.tell()
        else:
            assert False, "whence == 2 is not supported"
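# A usage sketch for BlobReader: random access into a large object without
# downloading it whole (the bucket and blob names are placeholders):
from google.cloud import storage

client = storage.Client()
blob = client.bucket("my-bucket").blob("large-file.bin")
blob.chunk_size = 1024 * 1024  # multiple of 256KB, selects ChunkedDownload

reader = BlobReader(blob, client=client)
reader.seek(0)            # initializes the underlying download at offset 0
header = reader.read(16)  # read the first 16 bytes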