def __init__(self, bucket_name, subdir='_/', location='EU'):
    """Bind this object to a storage bucket, creating the bucket if needed.

    Args:
        bucket_name: Name of the bucket to look up (or create).
        subdir: Stored unchanged on ``self.subdir``.
        location: Location assigned to the bucket when it has to be
            created. Defaults to ``'EU'``, the value that used to be
            hard-coded. It is ignored when the bucket already exists.

    Raises:
        Whatever ``Client()``, ``get_bucket`` or ``create`` raise, other
        than the ``NotFound`` that triggers bucket creation.
    """
    gcs = Client()
    try:
        self.bucket = gcs.get_bucket(bucket_name)
    except NotFound:
        # The bucket is missing: build a local handle, set its location
        # before the create() call, then create it.
        self.bucket = gcs.bucket(bucket_name)
        self.bucket.location = location
        self.bucket.create()
    self.subdir = subdir
class IO:
    """Helper for moving files and strings to and from ``gs://`` URLs.

    It also offers a small content-addressed store (CAS): payloads are
    uploaded under ``<cas_url_prefix>/<sha256 hex digest>``.
    """

    def __init__(self, project, cas_url_prefix):
        """Create the storage client and remember the CAS prefix.

        Args:
            project: Passed to ``GSClient``; must not be ``None``.
            cas_url_prefix: ``gs://bucket/path`` prefix for CAS objects. One
                trailing ``/`` is removed so the writers can always join
                with ``"/" + digest``.
        """
        assert project is not None
        # Bucket handles keyed by bucket name, filled lazily by
        # _get_bucket_and_path().
        self.buckets = {}
        self.client = GSClient(project)
        # endswith() (rather than indexing [-1]) tolerates an empty prefix.
        if cas_url_prefix.endswith("/"):
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix

    def _get_bucket_and_path(self, path):
        """Split a ``gs://bucket/object`` URL into (bucket handle, object path).

        Bucket handles are cached per bucket name so repeated calls reuse
        the same object.

        Raises:
            AssertionError: if ``path`` is not of the form ``gs://<bucket>/<path>``.
        """
        m = re.match(r"^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name, blob_path = m.groups()
        try:
            bucket = self.buckets[bucket_name]
        except KeyError:
            bucket = self.client.bucket(bucket_name)
            # Previously the cache was read but never written.
            self.buckets[bucket_name] = bucket
        return bucket, blob_path

    def get(self, src_url, dst_filename, must=True):
        """Download ``src_url`` to the local file ``dst_filename``.

        When the remote object does not exist, raises ``AssertionError`` if
        ``must`` is true and otherwise does nothing.
        """
        log.info("get %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url):
        """Return the content of ``src_url`` decoded as UTF-8 text."""
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        return blob.download_as_string().decode("utf8")

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        """Upload the local file ``src_filename`` to ``dst_url``.

        Args:
            src_filename: Local path to upload.
            dst_url: Destination ``gs://`` URL.
            must: When true, assert that ``src_filename`` exists first.
            skip_if_exists: When true, do not upload if the destination
                object already exists.
        """
        # NOTE(review): with must=False and a missing source file the upload
        # is still attempted (and will presumably fail) -- confirm whether a
        # silent skip was intended, mirroring get().
        if must:
            assert os.path.exists(src_filename), \
                "source file does not exist: {}".format(src_filename)
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        """Return the URL scheme prefix used when building result URLs."""
        return "gs://"

    def _cas_url(self, digest):
        """Return the CAS destination URL for a hex digest."""
        # cas_url_prefix has no trailing slash (see __init__), so the
        # separator must be added here for both CAS writers.
        return self.cas_url_prefix + "/" + digest

    def write_file_to_cas(self, filename):
        """Upload a local file under its SHA-256 digest and return its URL.

        The file is hashed in fixed-size chunks so it is never read into
        memory as a whole.
        """
        hasher = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                hasher.update(chunk)
        # Fixed: the separator between prefix and digest used to be missing
        # here, unlike in write_str_to_cas().
        dst_url = self._cas_url(hasher.hexdigest())
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        """Upload ``text`` (UTF-8 encoded) under its SHA-256 digest; return its URL."""
        payload = text.encode("utf8")
        dst_url = self._cas_url(hashlib.sha256(payload).hexdigest())
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(payload)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        """Serialize ``obj`` with ``json.dumps`` and store it via write_str_to_cas()."""
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)