def setup_bucket(project_id, service_account_key, bucket_name):
    credentials = service_account.Credentials.from_service_account_file(
        service_account_key, scopes=SCOPES)
    client = GSClient(project_id, credentials)
    bucket = client.bucket(bucket_name)
    needs_create = not bucket.exists()
    if needs_create:
        bucket.create()

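# A minimal usage sketch for setup_bucket above, assuming the surrounding
# module defines SCOPES and imports the storage client as GSClient; the
# project ID, key path, and bucket name are illustrative placeholders.
if __name__ == "__main__":
    # Idempotent: the bucket is only created when it does not already exist.
    setup_bucket("my-project", "/path/to/service-account.json", "my-bucket")
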
def get_bucket(service_account_b64_key, bucket_name, google_project):
    if service_account_b64_key:
        assert bucket_name and google_project, "missing required GCS configurations"
        credentials_info = json.loads(
            base64.decodebytes(service_account_b64_key.encode()))
        credentials = service_account.Credentials.from_service_account_info(
            credentials_info)
        gcs = Client(project=google_project, credentials=credentials)
        return gcs.bucket(bucket_name)
    else:
        return None

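# A hedged sketch of how the base64-encoded key expected by get_bucket above
# could be produced from a service-account key file; the helper name and file
# path are illustrative assumptions, not part of the original code.
import base64

def encode_service_account_key(key_path):
    # Read the JSON key file and base64-encode its bytes so it can be passed
    # around as a single opaque string (e.g. via an environment variable).
    with open(key_path, "rb") as f:
        return base64.encodebytes(f.read()).decode()

# bucket = get_bucket(encode_service_account_key("key.json"),
#                     "my-bucket", "my-project")
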
def test_upload_html_gcs(client: Client):
    id_ = str(uuid.uuid4())
    html = "hello"
    mime = "text/html"
    bucket_name = "vcm-ml-scratch"
    blob_name = f"testing/{id_}/index.html"
    url = f"gs://{bucket_name}/{blob_name}"
    upload(html, url, mime)

    bucket = client.bucket(bucket_name)
    blob = bucket.get_blob(blob_name)
    assert blob.content_type == "text/html"

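# The test above exercises an `upload` helper that is not shown here; this is
# a minimal sketch of what such a helper could look like, assuming it parses a
# gs:// URL and writes the payload with the given content type.
import re
from google.cloud.storage import Client

def upload(data: str, url: str, content_type: str):
    match = re.match(r"^gs://([^/]+)/(.*)$", url)
    assert match is not None, f"invalid remote path: {url}"
    bucket_name, blob_name = match.group(1), match.group(2)
    blob = Client().bucket(bucket_name).blob(blob_name)
    # upload_from_string sets the blob's content type on upload.
    blob.upload_from_string(data, content_type=content_type)
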
def retrieve_bucket(self):
    """Return bucket used for storing files.

    Returns:
        Bucket object.
    """
    project = self.env("GOOGLE_PROJECT")
    service_account_file = json.loads(
        self.env("GOOGLE_APPLICATION_CREDENTIALS"))
    credentials = service_account.Credentials.from_service_account_info(
        service_account_file)
    client = Client(project=project, credentials=credentials)
    return client.bucket("cs4teachers-static")

def get_bucket():
    global _bucket
    if _bucket is None:
        from google.cloud.storage.client import Client

        # A `None` project behaves differently with the client, so
        # we need to call it differently
        try:
            client = Client(project=os.environ["GOOGLE_CLOUD_PROJECT"])
        except KeyError:
            client = Client()
        _bucket = client.bucket(settings.GOOGLE_STORAGE_BUCKET)
    return _bucket

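# get_bucket above lazily caches a single Bucket in module-level state roughly
# like the following; the settings object is assumed to be a Django-style
# settings module exposing GOOGLE_STORAGE_BUCKET.
import os
# from myproject import settings  # assumed to define GOOGLE_STORAGE_BUCKET

_bucket = None  # cached Bucket instance, populated on first get_bucket() call
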
def get_food_recommender(request):
    """Get existing food recommender list."""
    list_id = request.args.get('listId')
    if not list_id:
        raise ValueError('Query parameter "listId" must be provided')

    if os.getenv('ENV', 'production') == 'local':
        expected_path = (Path(tempfile.gettempdir()) /
                         'jmyrberg-food-recommender' / 'data' /
                         f'{list_id}.json')
        if expected_path.exists():
            with open(expected_path, 'r') as f:
                data = json.load(f)
            return {
                'status': 'success',
                'message': 'Food recommender list fetched successfully',
                'data': data
            }, 200
        else:
            return {
                'status': 'error',
                'message': (f'Food recommender list "{list_id}"'
                            ' could not be found'),
                'data': None
            }, 404
    else:
        global storage_client
        if not storage_client:
            storage_client = Client()
        bucket_name = os.getenv('FOOD_RECOMMENDER_BUCKET_NAME',
                                'jmyrberg-food-recommender')
        blob_name = f'/data/{list_id}.json'
        blob = storage_client.bucket(bucket_name).get_blob(blob_name)
        if blob:
            data = json.loads(blob.download_as_string())
            return {
                'status': 'success',
                'message': 'Food recommender list fetched successfully',
                'data': data
            }, 200
        else:
            return {
                'status': 'error',
                'message': (f'Food recommender list "{list_id}"'
                            ' could not be found'),
                'data': None
            }, 404

def save_json(self, data, blob_name):
    if os.getenv('ENV', 'production') == 'local':
        expected_path = (Path(tempfile.gettempdir()) /
                         HOME_DATA_BUCKET_NAME / blob_name)
        expected_path.parent.mkdir(exist_ok=True, parents=True)
        with open(expected_path, 'w') as f:
            json.dump(data, f)
        print(f'Saved into {str(expected_path)}!')
    else:
        global storage_client
        if not storage_client:
            storage_client = Client()
        blob = storage_client.bucket(HOME_DATA_BUCKET_NAME).blob(blob_name)
        blob.upload_from_string(json.dumps(data),
                                content_type='application/json')
        print(f'Saved into {blob_name}!')

def get_previous_history_timestamp(self):
    timestamp = None
    if os.getenv('ENV', 'production') == 'local':
        expected_path = (Path(tempfile.gettempdir()) /
                         HOME_DATA_BUCKET_NAME /
                         'nibe/history/previous_timestamp.txt')
        if expected_path.exists():
            with open(expected_path, 'r') as f:
                timestamp = int(json.load(f)[0])
    else:
        global storage_client
        if not storage_client:
            storage_client = Client()
        blob_name = 'nibe/history/previous_timestamp.txt'
        blob = (storage_client.bucket(HOME_DATA_BUCKET_NAME)
                .get_blob(blob_name))
        if blob:
            timestamp = int(json.loads(blob.download_as_string())[0])
    return timestamp

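# A hedged companion to get_previous_history_timestamp above: the reader
# indexes [0] into a JSON list, so a writer could reuse save_json from the
# same class. The method name below is an illustrative assumption.
def save_previous_history_timestamp(self, timestamp):
    # Stored as a single-element JSON list to match the [0] indexing on read.
    self.save_json([int(timestamp)], 'nibe/history/previous_timestamp.txt')
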
def post_food_recommender(request):
    """Create or update a food recommender list."""
    data = request.get_json()['data']
    list_id = request.args.get('listId') or str(uuid.uuid4())
    data['listId'] = list_id

    if os.getenv('ENV', 'production') == 'local':
        expected_path = (Path(tempfile.gettempdir()) /
                         'jmyrberg-food-recommender' / 'data' /
                         f'{list_id}.json')
        expected_path.parent.mkdir(exist_ok=True, parents=True)
        with open(expected_path, 'w') as f:
            json.dump(data, f)
        return {
            'status': 'success',
            'message': 'Food recommender list saved successfully',
            'data': {
                'listId': list_id
            }
        }, 200
    else:
        global storage_client
        if not storage_client:
            storage_client = Client()
        bucket_name = os.getenv('FOOD_RECOMMENDER_BUCKET_NAME',
                                'jmyrberg-food-recommender')
        blob_name = f'/data/{list_id}.json'
        new_blob = storage_client.bucket(bucket_name).blob(blob_name)
        new_blob.upload_from_string(json.dumps(data))
        return {
            'status': 'success',
            'message': 'Food recommender list saved successfully',
            'data': {
                'listId': list_id
            }
        }, 200

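# A rough local-mode round trip of the two handlers above, using a minimal
# stand-in for the request object; FakeRequest is purely illustrative and not
# part of the original code.
import os

class FakeRequest:
    def __init__(self, args=None, json_body=None):
        self.args = args or {}
        self._json = json_body

    def get_json(self):
        return self._json

os.environ['ENV'] = 'local'
body, status = post_food_recommender(
    FakeRequest(json_body={'data': {'items': ['pizza', 'salad']}}))
list_id = body['data']['listId']
body, status = get_food_recommender(FakeRequest(args={'listId': list_id}))
assert status == 200 and body['data']['listId'] == list_id
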
class IO:
    def __init__(self, project, cas_url_prefix, credentials=None,
                 compute_hash=compute_hash):
        assert project is not None

        self.buckets = {}
        self.credentials = credentials
        self.project = project
        self.client = GSClient(project, credentials=credentials)
        if cas_url_prefix[-1] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix
        self.compute_hash = compute_hash

    def generate_signed_url(self, path, expiry=datetime.timedelta(days=30)):
        bucket, key = self._get_bucket_and_path(path)
        blob = bucket.get_blob(key)
        return blob.generate_signed_url(expiry)

    def bulk_get_as_str(self, paths):
        from multiprocessing.pool import ThreadPool
        import threading

        my = threading.local()

        def init_thread():
            my.client = GSClient(self.project, credentials=self.credentials)

        pool = ThreadPool(processes=10, initializer=init_thread)

        def get_as_str(url):
            m = re.match("^gs://([^/]+)/(.*)$", url)
            assert m is not None, "invalid remote path: {}".format(url)
            bucket_name = m.group(1)
            path = m.group(2)
            bucket = my.client.bucket(bucket_name)
            blob = bucket.blob(path)
            if not blob.exists():
                return (url, None)
            return (url, blob.download_as_string())

        result = dict(pool.map(get_as_str, paths))
        return result

    def bulk_exists_check(self, paths):
        from multiprocessing.pool import ThreadPool
        import threading

        my = threading.local()

        def init_thread():
            my.client = GSClient(self.project, credentials=self.credentials)

        pool = ThreadPool(processes=10, initializer=init_thread)

        def check(url):
            m = re.match("^gs://([^/]+)/(.*)$", url)
            assert m is not None, "invalid remote path: {}".format(url)
            bucket_name = m.group(1)
            path = m.group(2)
            bucket = my.client.bucket(bucket_name)
            blob = bucket.blob(path)
            return (url, blob.exists())

        result = dict(pool.map(check, paths))
        return result

    def _get_bucket_and_path(self, path):
        m = re.match("^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        path = m.group(2)
        if bucket_name in self.buckets:
            bucket = self.buckets[bucket_name]
        else:
            bucket = self.client.bucket(bucket_name)
        return bucket, path

    def exists(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        return blob.exists()

    def get_child_keys(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        keys = []
        # list_blobs returns an iterator that fetches additional pages
        # automatically, so no manual paging is needed here.
        for blob in bucket.list_blobs(prefix=path + "/"):
            keys.append("gs://" + bucket.name + "/" + blob.name)
        return keys

    def get(self, src_url, dst_filename, must=True):
        log.info("Downloading %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url, must=True, start=None):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            end = None
            if start is not None:
                blob.reload()
                end = blob.size
                if start == end:
                    return ""
            # log.warning("Downloading %s (%s, %s)", src_url, start, end)
            return blob.download_as_string(start=start, end=end).decode("utf8")
        else:
            assert not must, "Could not find {}".format(path)
            return None

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        if must:
            assert os.path.exists(src_filename), "{} does not exist".format(
                src_filename)

        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("Already in CAS cache, skipping upload of %s", src_filename)
            log.debug("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            # if greater than 10MB ask gsutil to upload for us
            if use_gustil and os.path.getsize(src_filename) > 10 * 1024 * 1024:
                import subprocess
                subprocess.check_call(["gsutil", "cp"] + [src_filename, dst_url])
            else:
                blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        return "gs://"

    def write_file_to_cas(self, filename):
        m = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                m.update(chunk)
        hash = m.hexdigest()
        dst_url = self.cas_url_prefix + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        text = text.encode("utf8")
        hash = hashlib.sha256(text).hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(text)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)

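# A brief usage sketch for the IO helper above, assuming default application
# credentials and an existing bucket; the project and bucket names are
# illustrative placeholders.
io = IO(project="my-project", cas_url_prefix="gs://my-bucket/cas/")
url = io.write_str_to_cas("hello world")   # content-addressed by sha256
print(io.get_as_str(url))                  # -> "hello world"
print(io.exists(url))                      # -> True
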
def __init__(self, bucket_name):
    storage_client = Client()
    self.bucket = storage_client.bucket(bucket_name)

class IO:
    def __init__(self, project, cas_url_prefix, credentials=None):
        assert project is not None

        self.buckets = {}
        self.client = GSClient(project, credentials=credentials)
        if cas_url_prefix[-1] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix

    def _get_bucket_and_path(self, path):
        m = re.match("^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        path = m.group(2)
        if bucket_name in self.buckets:
            bucket = self.buckets[bucket_name]
        else:
            bucket = self.client.bucket(bucket_name)
        return bucket, path

    def get(self, src_url, dst_filename, must=True):
        log.info("Downloading %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url, must=True):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            return blob.download_as_string().decode("utf8")
        else:
            assert not must, "Could not find {}".format(path)

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        if must:
            assert os.path.exists(src_filename), "{} does not exist".format(
                src_filename)
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("Already in CAS cache, skipping upload of %s", src_filename)
            log.debug("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        return "gs://"

    def write_file_to_cas(self, filename):
        m = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                m.update(chunk)
        hash = m.hexdigest()
        dst_url = self.cas_url_prefix + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        text = text.encode("utf8")
        hash = hashlib.sha256(text).hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(text)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)

class IO:
    def __init__(self, project, cas_url_prefix, credentials=None,
                 compute_hash=_compute_hash):
        assert project is not None

        self.buckets = {}
        self.client = GSClient(project, credentials=credentials)
        if cas_url_prefix[-1] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix
        self.compute_hash = compute_hash

    def _get_bucket_and_path(self, path):
        m = re.match("^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        path = m.group(2)
        if bucket_name in self.buckets:
            bucket = self.buckets[bucket_name]
        else:
            bucket = self.client.bucket(bucket_name)
        return bucket, path

    def exists(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        return blob.exists()

    def get_child_keys(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        keys = []
        # list_blobs returns an iterator that fetches additional pages
        # automatically, so no manual paging is needed here.
        for blob in bucket.list_blobs(prefix=path + "/"):
            keys.append("gs://" + bucket.name + "/" + blob.name)
        return keys

    def get(self, src_url, dst_filename, must=True):
        log.info("Downloading %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url, must=True):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            return blob.download_as_string().decode("utf8")
        else:
            assert not must, "Could not find {}".format(path)

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        if must:
            assert os.path.exists(src_filename), "{} does not exist".format(
                src_filename)
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("Already in CAS cache, skipping upload of %s", src_filename)
            log.debug("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            # if greater than 10MB ask gsutil to upload for us
            if os.path.getsize(src_filename) > 10 * 1024 * 1024:
                import subprocess
                subprocess.check_call(['gsutil', 'cp', src_filename, dst_url])
            else:
                blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        return "gs://"

    def write_file_to_cas(self, filename):
        m = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                m.update(chunk)
        hash = m.hexdigest()
        dst_url = self.cas_url_prefix + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        text = text.encode("utf8")
        hash = hashlib.sha256(text).hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(text)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)