Example #1
def setup_bucket(project_id, service_account_key, bucket_name):
    credentials = service_account.Credentials.from_service_account_file(
        service_account_key, scopes=SCOPES)

    client = GSClient(project_id, credentials)
    bucket = client.bucket(bucket_name)
    needs_create = not bucket.exists()

    if needs_create:
        bucket.create()
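The snippet above relies on names defined elsewhere in its module. A minimal sketch of that surrounding setup, assuming GSClient is an alias for the google-cloud-storage client and SCOPES grants storage access (both are assumptions, not part of the original example):

# Hypothetical surrounding setup for Example #1; the scope value and alias are assumptions.
from google.cloud.storage import Client as GSClient
from google.oauth2 import service_account

SCOPES = ["https://www.googleapis.com/auth/devstorage.read_write"]

# Create the bucket if it does not exist yet.
setup_bucket("my-project", "/path/to/service-account.json", "my-bucket")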
Example #2
def get_bucket(service_account_b64_key, bucket_name, google_project):
    if service_account_b64_key:
        assert bucket_name and google_project, "missing required GCS configurations"
        credentials_info = json.loads(
            base64.decodebytes(service_account_b64_key.encode()))
        credentials = service_account.Credentials.from_service_account_info(
            credentials_info)
        gcs = Client(project=google_project, credentials=credentials)
        return gcs.bucket(bucket_name)
    else:
        return None
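get_bucket expects the service-account JSON as a base64-encoded string. A hedged call sketch (the key file path and names are illustrative, not from the original):

# Hypothetical call site for Example #2.
import base64

with open("service-account.json", "rb") as f:
    b64_key = base64.b64encode(f.read()).decode()

bucket = get_bucket(b64_key, "my-bucket", "my-project")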
Example #3
def test_upload_html_gcs(client: Client):
    id_ = str(uuid.uuid4())
    html = "hello"
    mime = "text/html"
    bucket_name = "vcm-ml-scratch"
    blob = f"testing/{id_}/index.html"
    url = f"gs://{bucket_name}/{blob}"

    upload(html, url, mime)
    bucket = client.bucket(bucket_name)
    blob = bucket.get_blob(blob)
    blob.content_type == "text/html"
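The upload helper exercised by this test is not shown. A minimal sketch of what it might look like, assuming it writes a string to a gs:// URL with the given content type (this is not the original implementation):

# Hypothetical sketch of the upload helper assumed by Example #3.
import re
from google.cloud.storage import Client

def upload(content: str, url: str, content_type: str):
    # Split "gs://bucket/key" into bucket name and object key.
    match = re.match(r"^gs://([^/]+)/(.*)$", url)
    assert match is not None, f"invalid GCS url: {url}"
    bucket_name, key = match.group(1), match.group(2)
    blob = Client().bucket(bucket_name).blob(key)
    blob.upload_from_string(content, content_type=content_type)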
Example #4
    def retrieve_bucket(self):
        """Return bucket used for storing files.

        Returns:
            Bucket object.
        """
        project = self.env("GOOGLE_PROJECT")
        service_account_info = json.loads(
            self.env("GOOGLE_APPLICATION_CREDENTIALS"))
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info)
        client = Client(project=project, credentials=credentials)
        return client.bucket("cs4teachers-static")
Example #5
def get_bucket():
    global _bucket
    if _bucket is None:
        from google.cloud.storage.client import Client

        # Passing project=None to Client() behaves differently from omitting
        # the argument, so only pass the project when the env var is set.
        try:
            client = Client(project=os.environ["GOOGLE_CLOUD_PROJECT"])
        except KeyError:
            client = Client()
        _bucket = client.bucket(settings.GOOGLE_STORAGE_BUCKET)
    return _bucket
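Example #5 reads two names that are not defined in the snippet: a module-level _bucket cache and a settings object. A hedged sketch of that assumed context (the settings object stands in for whatever configuration the real module imports, e.g. a Django settings module):

# Assumed module-level context for Example #5; names other than _bucket and
# settings.GOOGLE_STORAGE_BUCKET are illustrative.
import os
import types

settings = types.SimpleNamespace(GOOGLE_STORAGE_BUCKET="my-bucket")
_bucket = None

# The first call builds the client and bucket handle; later calls reuse them.
blob = get_bucket().blob("some/object.txt")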
Example #6
def get_food_recommender(request):
    """Get existing food recommender list."""
    list_id = request.args.get('listId')
    if not list_id:
        raise ValueError('Query parameter "listId" must be provided')

    if os.getenv('ENV', 'production') == 'local':
        expected_path = (Path(tempfile.gettempdir()) /
                         'jmyrberg-food-recommender' / 'data' /
                         f'{list_id}.json')
        if expected_path.exists():
            with open(expected_path, 'r') as f:
                data = json.load(f)
            return {
                'status': 'success',
                'message': 'Food recommender list fetched successfully',
                'data': data
            }, 200
        else:
            return {
                'status': 'error',
                'message': (f'Food recommender list "{list_id}"'
                            ' could not be found'),
                'data': None
            }, 404
    else:
        global storage_client
        if not storage_client:
            storage_client = Client()
        bucket_name = os.getenv('FOOD_RECOMMENDER_BUCKET_NAME',
                                'jmyrberg-food-recommender')
        blob_name = f'/data/{list_id}.json'
        blob = storage_client.bucket(bucket_name).get_blob(blob_name)
        if blob:
            data = json.loads(blob.download_as_string())
            return {
                'status': 'success',
                'message': 'Food recommender list fetched successfully',
                'data': data
            }, 200
        else:
            return {
                'status': 'error',
                'message': (f'Food recommender list "{list_id}"'
                            ' could not be found'),
                'data': None
            }, 404
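Examples #6 through #9 appear to be Cloud Functions style handlers that share a lazily created module-level client. A minimal sketch of the module setup they assume (this is a sketch, not the original module header):

# Assumed module-level setup shared by Examples #6-#9.
import json
import os
import tempfile
import uuid
from pathlib import Path

from google.cloud.storage import Client

# Created on first use inside a handler, then reused across warm invocations.
storage_client = None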
Example #7
    def save_json(self, data, blob_name):
        if os.getenv('ENV', 'production') == 'local':
            expected_path = (Path(tempfile.gettempdir()) /
                             HOME_DATA_BUCKET_NAME / blob_name)
            expected_path.parent.mkdir(exist_ok=True, parents=True)
            with open(expected_path, 'w') as f:
                json.dump(data, f)
                print(f'Saved into {str(expected_path)}!')
        else:
            global storage_client
            if not storage_client:
                storage_client = Client()
            blob = storage_client.bucket(HOME_DATA_BUCKET_NAME).blob(blob_name)
            blob.upload_from_string(json.dumps(data),
                                    content_type='application/json')
            print(f'Saved into {blob_name}!')
Example #8
    def get_previous_history_timestamp(self):
        timestamp = None
        if os.getenv('ENV', 'production') == 'local':
            expected_path = (Path(tempfile.gettempdir()) /
                             HOME_DATA_BUCKET_NAME /
                             'nibe/history/previous_timestamp.txt')
            if expected_path.exists():
                with open(expected_path, 'r') as f:
                    timestamp = int(json.load(f)[0])
        else:
            global storage_client
            if not storage_client:
                storage_client = Client()
            blob_name = 'nibe/history/previous_timestamp.txt'
            blob = (storage_client.bucket(HOME_DATA_BUCKET_NAME).get_blob(
                blob_name))
            if blob:
                timestamp = int(json.loads(blob.download_as_string())[0])

        return timestamp
Example #9
def post_food_recommender(request):
    """Get existing food recommender list."""
    data = request.get_json()['data']
    list_id = request.args.get('listId') or str(uuid.uuid4())
    data['listId'] = list_id

    if os.getenv('ENV', 'production') == 'local':
        expected_path = (Path(tempfile.gettempdir()) /
                         'jmyrberg-food-recommender' / 'data' /
                         f'{list_id}.json')
        expected_path.parent.mkdir(exist_ok=True, parents=True)
        with open(expected_path, 'w') as f:
            json.dump(data, f)
        return {
            'status': 'success',
            'message': 'Food recommender list saved successfully',
            'data': {
                'listId': list_id
            }
        }, 200
    else:
        global storage_client
        if not storage_client:
            storage_client = Client()
        bucket_name = os.getenv('FOOD_RECOMMENDER_BUCKET_NAME',
                                'jmyrberg-food-recommender')
        blob_name = f'/data/{list_id}.json'
        new_blob = storage_client.bucket(bucket_name).blob(blob_name)
        new_blob.upload_from_string(json.dumps(data))
        return {
            'status': 'success',
            'message': 'Food recommender list saved successfully',
            'data': {
                'listId': list_id
            }
        }, 200
Example #10
class IO:
    def __init__(self,
                 project,
                 cas_url_prefix,
                 credentials=None,
                 compute_hash=compute_hash):
        assert project is not None

        self.buckets = {}
        self.credentials = credentials
        self.project = project
        self.client = GSClient(project, credentials=credentials)
        if cas_url_prefix[-1] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix
        self.compute_hash = compute_hash

    def generate_signed_url(self, path, expiry=datetime.timedelta(days=30)):
        bucket, key = self._get_bucket_and_path(path)
        blob = bucket.get_blob(key)
        return blob.generate_signed_url(expiry)

    def bulk_get_as_str(self, paths):
        from multiprocessing.pool import ThreadPool
        import threading

        my = threading.local()

        def init_thread():
            my.client = GSClient(self.project, credentials=self.credentials)

        pool = ThreadPool(processes=10, initializer=init_thread)

        def get_as_str(url):
            m = re.match("^gs://([^/]+)/(.*)$", url)
            assert m is not None, "invalid remote path: {}".format(url)
            bucket_name = m.group(1)
            path = m.group(2)
            bucket = my.client.bucket(bucket_name)
            blob = bucket.blob(path)
            if not blob.exists():
                return (url, None)
            return (url, blob.download_as_string())

        result = dict(pool.map(get_as_str, paths))
        return result

    def bulk_exists_check(self, paths):
        from multiprocessing.pool import ThreadPool
        import threading

        my = threading.local()

        def init_thread():
            my.client = GSClient(self.project, credentials=self.credentials)

        pool = ThreadPool(processes=10, initializer=init_thread)

        def check(url):
            m = re.match("^gs://([^/]+)/(.*)$", url)
            assert m is not None, "invalid remote path: {}".format(url)
            bucket_name = m.group(1)
            path = m.group(2)
            bucket = my.client.bucket(bucket_name)
            blob = bucket.blob(path)
            return (url, blob.exists())

        result = dict(pool.map(check, paths))
        return result

    def _get_bucket_and_path(self, path):
        m = re.match("^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        path = m.group(2)

        if bucket_name in self.buckets:
            bucket = self.buckets[bucket_name]
        else:
            bucket = self.client.bucket(bucket_name)
            self.buckets[bucket_name] = bucket
        return bucket, path

    def exists(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        return blob.exists()

    def get_child_keys(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        keys = []

        # The iterator returned by list_blobs fetches subsequent pages
        # automatically, so no manual paging is needed here.
        for blob in bucket.list_blobs(prefix=path + "/"):
            keys.append("gs://" + bucket.name + "/" + blob.name)

        return keys

    def get(self, src_url, dst_filename, must=True):
        log.info("Downloading %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url, must=True, start=None):

        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            end = None
            if start is not None:
                blob.reload()
                end = blob.size
                if start == end:
                    return ""
            # log.warning("Downloading %s (%s, %s)", src_url, start, end)
            return blob.download_as_string(start=start, end=end).decode("utf8")
        else:
            assert not must, "Could not find {}".format(path)
            return None

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        if must:
            assert os.path.exists(src_filename), "{} does not exist".format(
                src_filename)

        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("Already in CAS cache, skipping upload of %s",
                     src_filename)
            log.debug("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            # if greater than 10MB ask gsutil to upload for us
            if use_gustil and os.path.getsize(src_filename) > 10 * 1024 * 1024:
                import subprocess

                subprocess.check_call(["gsutil", "cp"] +
                                      [src_filename, dst_url])
            else:
                blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        return "gs://"

    def write_file_to_cas(self, filename):
        m = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                m.update(chunk)
        hash = m.hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        text = text.encode("utf8")
        hash = hashlib.sha256(text).hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(text)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)
Example #11
    def __init__(self, bucket_name):
        storage_client = Client()
        self.bucket = storage_client.bucket(bucket_name)
Example #12
class IO:
    def __init__(self, project, cas_url_prefix, credentials=None):
        assert project is not None

        self.buckets = {}
        self.client = GSClient(project, credentials=credentials)
        if cas_url_prefix[-1] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix

    def _get_bucket_and_path(self, path):
        m = re.match("^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        path = m.group(2)

        if bucket_name in self.buckets:
            bucket = self.buckets[bucket_name]
        else:
            bucket = self.client.bucket(bucket_name)
            self.buckets[bucket_name] = bucket
        return bucket, path

    def get(self, src_url, dst_filename, must=True):
        log.info("Downloading %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url, must=True):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            return blob.download_as_string().decode("utf8")
        else:
            assert not must, "Could not find {}".format(path)

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        if must:
            assert os.path.exists(src_filename), "{} does not exist".format(
                src_filename)

        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("Already in CAS cache, skipping upload of %s",
                     src_filename)
            log.debug("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        return "gs://"

    def write_file_to_cas(self, filename):
        m = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                m.update(chunk)
        hash = m.hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        text = text.encode("utf8")
        hash = hashlib.sha256(text).hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(text)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_json_to_cas(self, obj):
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)
Example #13
class IO:
    def __init__(self, project, cas_url_prefix, credentials=None, compute_hash=_compute_hash):
        assert project is not None

        self.buckets = {}
        self.client = GSClient(project, credentials=credentials)
        if cas_url_prefix[-1] == "/":
            cas_url_prefix = cas_url_prefix[:-1]
        self.cas_url_prefix = cas_url_prefix
        self.compute_hash = compute_hash

    def _get_bucket_and_path(self, path):
        m = re.match("^gs://([^/]+)/(.*)$", path)
        assert m is not None, "invalid remote path: {}".format(path)
        bucket_name = m.group(1)
        path = m.group(2)

        if bucket_name in self.buckets:
            bucket = self.buckets[bucket_name]
        else:
            bucket = self.client.bucket(bucket_name)
            self.buckets[bucket_name] = bucket
        return bucket, path

    def exists(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        return blob.exists()

    def get_child_keys(self, src_url):
        bucket, path = self._get_bucket_and_path(src_url)
        keys = []

        # The iterator returned by list_blobs fetches subsequent pages
        # automatically, so no manual paging is needed here.
        for blob in bucket.list_blobs(prefix=path + "/"):
            keys.append("gs://" + bucket.name + "/" + blob.name)

        return keys

    def get(self, src_url, dst_filename, must=True):
        log.info("Downloading %s -> %s", src_url, dst_filename)
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            blob.download_to_filename(dst_filename)
        else:
            assert not must, "Could not find {}".format(path)

    def get_as_str(self, src_url, must=True):
        bucket, path = self._get_bucket_and_path(src_url)
        blob = bucket.blob(path)
        if blob.exists():
            return blob.download_as_string().decode("utf8")
        else:
            assert not must, "Could not find {}".format(path)

    def put(self, src_filename, dst_url, must=True, skip_if_exists=False):
        if must:
            assert os.path.exists(src_filename), "{} does not exist".format(src_filename)

        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        if skip_if_exists and blob.exists():
            log.info("Already in CAS cache, skipping upload of %s", src_filename)
            log.debug("skipping put %s -> %s", src_filename, dst_url)
        else:
            log.info("put %s -> %s", src_filename, dst_url)
            # if greater than 10MB ask gsutil to upload for us
            if os.path.getsize(src_filename) > 10 * 1024 * 1024:
                import subprocess
                subprocess.check_call(['gsutil', 'cp', src_filename, dst_url])
            else:
                blob.upload_from_filename(src_filename)

    def _get_url_prefix(self):
        return "gs://"

    def write_file_to_cas(self, filename):
        m = hashlib.sha256()
        with open(filename, "rb") as fd:
            for chunk in iter(lambda: fd.read(10000), b""):
                m.update(chunk)
        hash = m.hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_filename(filename)
        return self._get_url_prefix() + bucket.name + "/" + path

    def write_str_to_cas(self, text):
        text = text.encode("utf8")
        hash = hashlib.sha256(text).hexdigest()
        dst_url = self.cas_url_prefix + "/" + hash
        bucket, path = self._get_bucket_and_path(dst_url)
        blob = bucket.blob(path)
        blob.upload_from_string(text)
        return self._get_url_prefix() + bucket.name + "/" + path
        
    def write_json_to_cas(self, obj):
        obj_str = json.dumps(obj)
        return self.write_str_to_cas(obj_str)
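A hedged usage sketch for the IO helper shown in Examples #10, #12 and #13; the project, bucket and object names are assumptions, and the class itself still needs the module-level imports (GSClient, re, hashlib, json, os, log) visible in the snippets:

# Hypothetical usage of the IO class.
io = IO(project="my-project", cas_url_prefix="gs://my-bucket/cas/")

# Content-addressed write: the object key is the SHA-256 of the text.
url = io.write_str_to_cas("hello world")
text = io.get_as_str(url)

# Plain object copies in both directions.
io.put("local.txt", "gs://my-bucket/uploads/local.txt")
io.get("gs://my-bucket/uploads/local.txt", "local-copy.txt")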