Beispiel #1
0
    def __init__(self, bucket_name, subdir='_/'):
        """Bind this object to the storage bucket called ``bucket_name``.

        The bucket is looked up through a freshly created ``Client``. If the
        lookup raises ``NotFound``, a bucket of that name is created with its
        location set to ``'EU'``.

        Args:
            bucket_name: Name of the bucket to fetch or create.
            subdir: Stored unchanged on ``self.subdir``.
        """
        storage_client = Client()
        try:
            bucket = storage_client.get_bucket(bucket_name)
        except NotFound:
            # The bucket does not exist yet: create it, with the location
            # hardcoded to EU.
            bucket = storage_client.bucket(bucket_name)
            bucket.location = 'EU'
            bucket.create()

        self.bucket = bucket
        self.subdir = subdir
Beispiel #2
0
    def __init__(self, bucket_name, subdir='_/'):
        """Attach to the bucket ``bucket_name``, creating it if it is missing.

        A new ``Client`` is instantiated and asked for the bucket. When that
        request raises ``NotFound``, the bucket is created with location
        ``'EU'`` instead.

        Args:
            bucket_name: Name of the bucket to fetch or create.
            subdir: Value kept as-is on ``self.subdir``.
        """
        client = Client()
        try:
            found_or_created = client.get_bucket(bucket_name)
        except NotFound:
            # No such bucket: build a handle, pin the location to EU
            # (hardcoded), then create it remotely.
            found_or_created = client.bucket(bucket_name)
            found_or_created.location = 'EU'
            found_or_created.create()

        self.bucket = found_or_created
        self.subdir = subdir
Beispiel #3
0
class IO:
    """Helper for moving data to and from ``gs://bucket/path`` URLs.

    Besides plain get/put, it offers a small content-addressed store (CAS):
    objects written through the ``write_*_to_cas`` methods are named by the
    SHA-256 hex digest of their content, directly under ``cas_url_prefix``.
    """

    # Number of bytes read per iteration when hashing a file.
    _READ_CHUNK_SIZE = 10000

    def __init__(self, project, cas_url_prefix):
        """Create the storage client and remember the CAS prefix.

        Args:
            project: Passed to ``GSClient``; must not be ``None``.
            cas_url_prefix: ``gs://`` URL under which CAS objects are
                written. One trailing ``/`` is removed if present.
        """
        assert project is not None

        # Bucket handles already created, keyed by bucket name.
        self.buckets = {}
        self.client = GSClient(project)
        # Slice rather than index so an empty prefix does not raise
        # IndexError.
        if cas_url_prefix[-1:] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix

    def _get_bucket_and_path(self, path):
        """Split a ``gs://bucket/object`` URL into a bucket handle and path.

        Bucket handles are cached in ``self.buckets`` so repeated calls for
        the same bucket reuse one handle.

        Args:
            path: URL of the form ``gs://<bucket>/<object path>``.

        Returns:
            Tuple ``(bucket, object_path)``.

        Raises:
            AssertionError: If ``path`` does not match the expected form.
        """
        m = re.match(r"^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        object_path = m.group(2)

        bucket = self.buckets.get(bucket_name)
        if bucket is None:
            bucket = self.client.bucket(bucket_name)
            self.buckets[bucket_name] = bucket
        return bucket, object_path

    def get(self, src_url, dst_filename, must=True):
        """Download ``src_url`` to the local file ``dst_filename``.

        Args:
            src_url: ``gs://`` URL of the object to download.
            dst_filename: Local path to write to.
            must: When true, a missing object is an error; when false, a
                missing object is silently skipped.

        Raises:
            AssertionError: If the object does not exist and ``must`` is true.
        """
        log.info("get %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url):
        """Return the content of ``src_url`` decoded as UTF-8 text."""
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        return blob.download_as_string().decode("utf8")

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        """Upload the local file ``src_filename`` to ``dst_url``.

        Args:
            src_filename: Local path of the file to upload.
            dst_url: ``gs://`` URL of the destination object.
            must: When true, assert that ``src_filename`` exists first.
            skip_if_exists: When true, do nothing if the destination object
                already exists.
        """
        if must:
            assert os.path.exists(src_filename)

        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        """Return the URL scheme prefix used when building result URLs."""
        return "gs://"

    def _cas_url(self, digest):
        """Return the CAS URL for content whose SHA-256 hex digest is ``digest``."""
        # __init__ strips the trailing "/" from the prefix, so the separator
        # must always be added here.
        return self.cas_url_prefix + "/" + digest

    def write_file_to_cas(self, filename):
        """Upload a local file to the CAS, named by the SHA-256 of its content.

        Args:
            filename: Local path of the file to hash and upload.

        Returns:
            The ``gs://`` URL of the uploaded object.
        """
        hasher = hashlib.sha256()
        with open(filename, "rb") as fd:
            # Hash in chunks so the whole file is never held in memory.
            for chunk in iter(lambda: fd.read(self._READ_CHUNK_SIZE), b""):
                hasher.update(chunk)
        dst_url = self._cas_url(hasher.hexdigest())
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        """Upload ``text`` (UTF-8 encoded) to the CAS, named by its SHA-256.

        Args:
            text: String to store.

        Returns:
            The ``gs://`` URL of the uploaded object.
        """
        data = text.encode("utf8")
        dst_url = self._cas_url(hashlib.sha256(data).hexdigest())
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(data)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        """Serialize ``obj`` with ``json.dumps`` and store the result in the CAS.

        Returns:
            The ``gs://`` URL of the uploaded object.
        """
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)