Example #1
def get_gcp_service_account_credentials(gcp_project_id):

    # Retrieve service account information corresponding to the GCP Project ID provided
    #
    bucket, blob_name = get_gcp_service_account_infos(gcp_project_id)

    if (bucket is None) or (blob_name is None):
        return None

    try:
        # Read the credentials from GCS
        #
        gcs_client = Client()
        bucket = gcs_client.get_bucket(bucket)
        blob = Blob(blob_name, bucket)
        json_credentials = json.loads(blob.download_as_string())

        # Build and return GCP Credentials
        #
        return service_account.Credentials.from_service_account_info(
            json_credentials)

    except Exception as ex:
        print("Cannot retrieve Service Account credentials.")
        print(ex)
        return None
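
A minimal usage sketch for the helper above; the GCP project id is a placeholder, and Client refers to google.cloud.storage.Client as in the snippet:

creds = get_gcp_service_account_credentials("my-gcp-project")
if creds is not None:
    # Reuse the service-account credentials with a scoped storage client.
    scoped_client = Client(project="my-gcp-project", credentials=creds)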
Example #2
    def lock(self):
        """
        This is the best we can do. It is impossible to acquire the lock reliably without
        using any additional services. test-and-set is impossible to implement.
        :return:
        """
        log = self._log
        log.info("Locking the bucket...")

        # Client should be imported here because grpc starts threads during import,
        # and if you call fork after that, a child process will hang during exit.
        from google.cloud.storage import Client

        if self.credentials:
            client = Client.from_service_account_json(self.credentials)
        else:
            client = Client()
        bucket = client.get_bucket(self.bucket_name)
        self._bucket = bucket
        sentinel = bucket.blob("index.lock")
        try:
            while sentinel.exists():
                log.warning("Failed to acquire the lock, waiting...")
                time.sleep(1)
            sentinel.upload_from_string(b"")
            # Several agents can get here. No test-and-set, sorry!
            yield None
        finally:
            self._bucket = None
            if sentinel is not None:
                try:
                    sentinel.delete()
                except Exception:
                    pass
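
The yield inside try/finally suggests lock() is used as a context manager (e.g. wrapped with contextlib.contextmanager); a hedged usage sketch, where agent and push_index() are placeholders:

with agent.lock():
    # The "index.lock" sentinel blob exists for the duration of this block;
    # as the docstring notes, this is best-effort, not a true test-and-set.
    push_index()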
Example #3
    def _write_async(self):
        if len(self.pending_records) == 0:
            return
        try:
            client = Client(project=DB_LOGGER_WRITE_TO_GCS_PROJECT_ID)
            bucket_path = f"{self.bucket_inner_path}/{self.filename}"
            if DB_LOGGER_WRITE_TO_GCS_MULTI_FILE_LOG:
                bucket_path = self._compose_progressing_log_file_name(
                    bucket_path)

            bucket = client.bucket(bucket_name=self.bucket_name)
            blob = bucket.blob(blob_name=bucket_path)

            records = self.pending_records
            self.pending_records = []

            if not DB_LOGGER_WRITE_TO_GCS_MULTI_FILE_LOG and blob.exists():
                current_log = blob.download_as_string().decode(
                    encoding="utf-8").strip()
                if current_log:
                    records.insert(0, current_log)
                # Reset the blob
                blob = bucket.blob(blob_name=bucket_path)

            blob.upload_from_string("\n".join(records))

        except Exception as err:
            airflow_db_logger_log.error(
                f"Failed to flush to bucket @ {self.bucket_name}/{self.bucket_inner_path}/{self.filename}"
            )
            airflow_db_logger_log.error(err)
Example #4
 def _store_in_thread(self, file):
     file.seek(0)
     from google.cloud.storage import Client
     client = Client(project=self.project_id)
     bucket = client.get_bucket(self.bucket_name)
     blob = bucket.blob(self.blob_name)
     blob.upload_from_file(file, predefined_acl=self.acl)
Example #5
 def open_gcs_url(config, logger, storage, url):
     reader_impl = SourceFile.extract_reader_impl(config)
     use_gcs_service_account = "service_account_json" in config["provider"] and storage == "gs://"
     file_to_close = None
     if reader_impl == "gcsfs":
         if use_gcs_service_account:
             try:
                 token_dict = json.loads(config["provider"]["service_account_json"])
             except json.decoder.JSONDecodeError as err:
                 logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                 raise err
         else:
             token_dict = "anon"
         fs = gcsfs.GCSFileSystem(token=token_dict)
         file_to_close = fs.open(f"gs://{url}")
         result = file_to_close
     else:
         if use_gcs_service_account:
             try:
                 credentials = json.dumps(json.loads(config["provider"]["service_account_json"]))
                 tmp_service_account = tempfile.NamedTemporaryFile(delete=False)
                 with open(tmp_service_account.name, "w") as f:
                     f.write(credentials)
                 tmp_service_account.close()
                 client = Client.from_service_account_json(tmp_service_account.name)
                 # This open() must accept transport_params (a smart_open-style open), not the builtin.
                 result = open(f"gs://{url}", transport_params=dict(client=client))
                 os.remove(tmp_service_account.name)
             except json.decoder.JSONDecodeError as err:
                 logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                 raise err
         else:
             client = Client.create_anonymous_client()
             result = open(f"{storage}{url}", transport_params=dict(client=client))
     return result, file_to_close
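
A hedged usage sketch for open_gcs_url; the config, logger, and object path are placeholders, and the second returned handle is closed when present:

fp, file_to_close = open_gcs_url(config, logger, storage="gs://", url="my-bucket/data.csv")
try:
    data = fp.read()
finally:
    if file_to_close is not None:
        file_to_close.close()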
Example #6
async def upload_picture(file: bytes = File(None, media_type="image/jpeg")):
    """
    Uploads image from phone to server and saves it to bucket
    Args:
        file (bytes): the image taken from camera app encoded in bytes

    Returns:
        response (str): JSON Response with uuid of the file uploaded
    """
    try:
        if file is None:
            raise HTTPException(status_code=422, detail="Empty image sent")

        else:
            # Initializes the Storage client
            storage_client = Client(project=creds.project_id)
            bucket = storage_client.get_bucket(creds.bucket_id)

            # Generates a unique identifier for storage
            img_uuid = str(uuid.uuid4())[0:6]
            blob = bucket.blob(img_uuid)

            # Decodes the base64-encoded bytes of the incoming image
            content = base64.b64decode(file)

            # Takes base64 decoded image and converts to image/jpeg
            blob.upload_from_string(data=content, content_type="image/jpeg")

            return {"detail": img_uuid}
    except GoogleCloudError as e:
        raise HTTPException(detail=str(e), status_code=500)
Example #7
    def _get_native_gcp_handle() -> typing.Any:
        if Config.BLOBSTORE_GS_MAX_CUMULATIVE_RETRY is not None:
            google.resumable_media.common.MAX_CUMULATIVE_RETRY = Config.BLOBSTORE_GS_MAX_CUMULATIVE_RETRY

        if Config.BLOBSTORE_CONNECT_TIMEOUT is None and Config.BLOBSTORE_READ_TIMEOUT is None:
            return Client.from_service_account_json(
                os.environ['GOOGLE_APPLICATION_CREDENTIALS'], )
        else:
            # GCP has no direct interface to configure retries and timeouts. However, it makes use of Python's
            # stdlib `requests` package, which has straightforward timeout usage.
            class SessionWithTimeouts(
                    google.auth.transport.requests.AuthorizedSession):
                def request(self, *args, **kwargs):
                    kwargs['timeout'] = (Config.BLOBSTORE_CONNECT_TIMEOUT,
                                         Config.BLOBSTORE_READ_TIMEOUT)
                    return super().request(*args, **kwargs)

            credentials = service_account.Credentials.from_service_account_file(
                os.environ['GOOGLE_APPLICATION_CREDENTIALS'],
                scopes=Client.SCOPE)

            # _http is a "private" parameter, and we may need to re-visit GCP timeout retry
            # strategies in the future.
            return Client(_http=SessionWithTimeouts(credentials),
                          credentials=credentials)
Example #8
    def _get_native_gcp_handle() -> typing.Any:
        if Config.BLOBSTORE_CONNECT_TIMEOUT is None and Config.BLOBSTORE_READ_TIMEOUT is None:
            client = Client.from_service_account_json(
                os.environ['GOOGLE_APPLICATION_CREDENTIALS'], )
        else:
            # GCP has no direct interface to configure retries and timeouts. However, it makes use of Python's
            # stdlib `requests` package, which has straightforward timeout usage.
            class SessionWithTimeouts(AuthorizedSession):
                def request(self, *args, **kwargs):
                    kwargs['timeout'] = (Config.BLOBSTORE_CONNECT_TIMEOUT,
                                         Config.BLOBSTORE_READ_TIMEOUT)
                    return super().request(*args, **kwargs)

            credentials = service_account.Credentials.from_service_account_file(
                os.environ['GOOGLE_APPLICATION_CREDENTIALS'],
                scopes=Client.SCOPE)

            # _http is a "private" parameter, and we may need to re-visit GCP timeout retry
            # strategies in the future.
            client = Client(_http=SessionWithTimeouts(credentials),
                            credentials=credentials)

        adapter_kwargs = dict(pool_maxsize=max(DEFAULT_POOLSIZE, 20))
        if Config.BLOBSTORE_RETRIES is not None:
            adapter_kwargs['max_retries'] = Retry(
                total=Config.BLOBSTORE_RETRIES,
                backoff_factor=0.3,
                status_forcelist=(500, 502, 504))
        adapter = HTTPAdapter(**adapter_kwargs)
        # _http is a "private" parameter, and we may need to re-visit GCP timeout retry
        # strategies in the future.
        client._http.mount('https://', adapter)
        client._http.mount('http://', adapter)
        return client
Example #9
    def open_fs(self, fs_url, parse_result, writeable, create, cwd):  # pylint: disable=no-self-use
        path_parts = iteratepath(parse_result.resource)

        bucket_name = path_parts[0]
        root_path = join(*path_parts[1:])

        if not bucket_name:
            raise OpenerError("invalid bucket name in '{}'".format(fs_url))

        if parse_result.params.get("strict") == "False":
            strict = False
        else:
            strict = True

        client = Client()
        project = parse_result.params.get("project")
        if project:
            client.project = project
        api_endpoint = parse_result.params.get("api_endpoint")
        if api_endpoint:
            client.client_options = {"api_endpoint": api_endpoint}

        return GCSFS(bucket_name,
                     root_path=root_path,
                     create=create,
                     client=client,
                     strict=strict)
Example #10
    def __init__(
        self,
        bucketname: str,  # Should be the bucket and/or blob name
        filename: str,
        store_user_data: bool = True,
        store_chat_data: bool = True,
        store_bot_data: bool = True,
        # If False, stores in chatID_user_data.json, chatID_chat_data.json, chatID_bot_data.json
        single_file: bool = True,
        on_flush: bool = False,
        storage_client: storage.Client = storage.Client()):
        super().__init__(
            store_user_data=store_user_data,
            store_chat_data=store_chat_data,
            store_bot_data=store_bot_data,
        )
        self.bucketname = bucketname
        self.filename = filename
        try:
            self.bucket = storage_client.get_bucket(bucketname)
        except Exception:
            # The bucket does not exist yet: create it and seed an empty JSON file.
            self.bucket = storage_client.create_bucket(bucketname)
            blob = self.bucket.blob(filename)
            blob.upload_from_string(json.dumps({}))
        self.storage_client = storage_client

        self.single_file = single_file
        self.on_flush = on_flush
        self.user_data: Optional[DefaultDict[int, Dict]] = None
        self.chat_data: Optional[DefaultDict[int, Dict]] = None
        self.bot_data: Optional[Dict] = None
        self.conversations: Optional[Dict[str, Dict[Tuple, object]]] = None
Example #11
 def _download_from_cloudstorage(self, blob_path: str,
                                 local_path: str) -> str:
     client = Client()
     bucket = client.get_bucket(self.BUCKET)
     blob = bucket.blob(blob_path)
     blob.download_to_filename(local_path)
     return local_path
Example #12
 def _remove_from_cloudstorage(self, blob_path: str):
     client = Client()
     bucket = client.bucket(self.BUCKET)
     try:  # don't fail entire task if this fails
         bucket.delete_blob(blob_path)
     except NotFound:
         print(f"{blob_path} not found")
Example #13
def hello(**kwargs):
    gcs = Client()
    bucket = gcs.bucket("data.visitdata.org")
    blob = bucket.blob("processed/hello/lastrun")
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    blob.upload_from_string(f"{timestamp}\n")
    print("Successfully wrote timestamp to bucket: {}".format(timestamp))
Example #14
    def _upload(self, payload: bytes, filename: str, bucket: str) -> None:
        """
        Upload a payload to GCS

        """

        client = Client(project=self.project_id)
        if self.use_encryption:
            # Encrypt once, before the retry loop, so a retry does not
            # re-encrypt an already encrypted payload.
            payload = self._encrypt(payload)
        count = 0
        while count < self.max_retries:
            try:
                bucket_obj = client.get_bucket(bucket)
                content = BytesIO(payload)
                blob = Blob(filename, bucket_obj)
                blob.upload_from_file(content)
                break
            except (
                InvalidResponse,
                GoogleAPICallError,
                InternalServerError,
                SSLError,
            ) as e:
                count += 1
                if count >= self.max_retries:
                    raise StoqPluginException(
                        f'Failed to upload {bucket}/{filename} to GCS: {str(e)}'
                    )
                sleep(randrange(0, 4))
Example #15
    def __init__(self, bucket_name, project=None, credentials=None):
        """
            Constructor

                :param bucket_name:
                    Name of the bucket that the files are on.

                :param project: the project which the client acts on behalf of. Will be
                                passed when creating a topic.  If not passed,
                                falls back to the default inferred from the environment.

                :param credentials: (Optional) The OAuth2 Credentials to use for this
                                    client. If not passed (and if no ``_http`` object is
                                    passed), falls back to the default inferred from the
                                    environment.

            Make sure the credentials have the correct permissions set up on
            Google Cloud or else GoogleStorage will return a 403 FORBIDDEN error.
        """
        if not Client:
            raise ValueError(
                'Could not import google.cloud.storage. You can install '
                'google.cloud.storage by using pip install google-cloud-storage'
            )

        connection = Client(project=project, credentials=credentials)
        self.bucket = connection.bucket(bucket_name)
Example #16
def test_extract_table(client, to_delete):
    DATASET_ID = 'export_data_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table('person_ages')
    table = client.create_table(bigquery.Table(table_ref, schema=SCHEMA))
    to_delete.insert(0, table)
    client.create_rows(table, ROWS)

    bucket_name = 'extract_person_ages_job_{}'.format(_millis())
    # [START extract_table]
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()
    bucket = storage_client.create_bucket(bucket_name)  # API request
    destination_blob_name = 'person_ages_out.csv'
    destination = bucket.blob(destination_blob_name)

    destination_uri = 'gs://{}/{}'.format(bucket_name, destination_blob_name)
    extract_job = client.extract_table(table_ref,
                                       destination_uri)  # API request
    extract_job.result(timeout=100)  # Waits for job to complete.

    got = destination.download_as_string().decode('utf-8')  # API request
    assert 'Bharney Rhubble' in got
    # [END extract_table]
    to_delete.append(bucket)
    to_delete.insert(0, destination)
Example #17
def bucket_object(storage_client: storage.Client) -> storage.Bucket:
    """ GCS Bucket from .env config """
    if not storage_client.lookup_bucket(TEST_BUCKET):
        bucket = storage_client.create_bucket(TEST_BUCKET)
    else:
        bucket = storage_client.get_bucket(TEST_BUCKET)
    yield bucket
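
Assuming bucket_object is registered as a pytest fixture (the yield suggests it), a test could consume it like this; the blob name is a placeholder:

def test_bucket_roundtrip(bucket_object: storage.Bucket):
    blob = bucket_object.blob("smoke-test.txt")
    blob.upload_from_string("hello")
    assert blob.download_as_string() == b"hello"
    blob.delete()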
Example #18
  def _create_connection(self):
    client = Client(
      credentials=self.credentials,
      project=self.project,
    )

    return client.bucket(self.bucket)
Example #19
File: gs.py Project: ofirbb/Hub
class GS(Base):
    _creds: service_account.Credentials = None
    _project: str = None
    _bucket: Bucket = None

    def __init__(self, bucket: str, creds_path: Optional[str] = None):
        super().__init__()
        if creds_path is not None:
            self._creds = service_account.Credentials.from_service_account_file(
                creds_path)
            with open(creds_path, 'rt') as f:
                self._project = json.loads(f.read())['project_id']

            self._bucket = Client(self._project, self._creds).bucket(bucket)
        else:
            self._bucket = Client().bucket(bucket)

    def get(self, path: str) -> bytes:
        return self._bucket.get_blob(path).download_as_string()

    def put(self, path: str, content: bytes):
        self._bucket.blob(path).upload_from_string(content)

    def exists(self, path: str) -> bool:
        return self._bucket.get_blob(path) is not None

    def delete(self, path: str):
        blobs = self._bucket.list_blobs(prefix=path)
        for blob in blobs:
            blob.delete()
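
A hedged usage sketch for the GS wrapper above; the bucket name, key path, and object path are placeholders:

store = GS("my-bucket", creds_path="service-account.json")
store.put("models/weights.bin", b"\x00\x01")      # upload raw bytes
assert store.exists("models/weights.bin")
data = store.get("models/weights.bin")            # download them back
store.delete("models/weights.bin")                # removes every blob under the prefix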
Example #20
    def gs(self):
        from google.cloud.storage import Client

        return (
            Client.from_service_account_json(self.credentialpath)
            if self.credentialpath
            else Client(self.projectname)
        )
Example #21
    def _client(self):
        from google.cloud.storage import Client

        if isinstance(self._credentials, str):
            return Client.from_service_account_json(self._credentials)
        else:
            return Client(credentials=self._credentials,
                          project=self.project_name)
Example #22
 def _create_default_client(self,
                            service_account_credentials_path=settings.
                            GCS_STORAGE_SERVICE_ACCOUNT_KEY_PATH):
     if service_account_credentials_path:
         return Client.from_service_account_json(
             service_account_credentials_path)
     else:
         return Client()
Example #23
 def create_client(self):
      # Client should be imported here because grpc starts threads during import,
      # and if you call fork after that, a child process will hang during exit.
     from google.cloud.storage import Client
     if self.credentials:
         client = Client.from_service_account_json(self.credentials)
     else:
         client = Client()
     return client
Example #24
 def create_auth(self):
     if self.auth_params['service_account']:
         self.client = Client.from_service_account_json(
             self.auth_params['service_account'])
     elif self.auth_params['token']:
         self.sess = Credentials(token=self.auth_params['token'])
         self.client = Client(credentials=self.sess)
     else:
         self.client = None
Example #25
class RetimoDataset:
    cache_path = '.cache'

    def __init__(self, config_records: List[RetimoDatasetConfigRecord]):
        self.config_records = config_records
        self.storage_client = Client()

    def load(self, to_shuffle=True):
        self.config_records = [self._download(config_record) for config_record in self.config_records]
        self.config_records = [self._unzip(config_record) for config_record in self.config_records]
        arrays = dict(self._load_to_nparray(config_record) for config_record in self.config_records)
        if to_shuffle:
            # Shuffle each record's images and labels together so they stay aligned.
            for collector in arrays.values():
                collector['dataset'], collector['label'] = sklearn.utils.shuffle(
                    collector['dataset'], collector['label'])
        return arrays

    def _download(self, config_record: RetimoDatasetConfigRecord) -> RetimoDatasetConfigRecord:
        local_path = f"{self.cache_path}/{config_record.dataset_name}/{config_record.name}"
        os.makedirs(local_path, exist_ok=True)
        raw_path = f"{local_path}/raw"
        if not path.exists(raw_path):
            with open(raw_path, "w") as f:
                f.write("")

            print(f"Downloading raw data from {config_record.gcs_path} to {raw_path}")
            with open(raw_path, 'wb') as file_obj:
                self.storage_client.download_blob_to_file(config_record.gcs_path, file_obj)
        else:
            print(f"Downloading raw data for '{config_record.name}' not needed. Using cache '{raw_path}'")
        return config_record.add_local_path(local_path)

    def _unzip(self, config_record: RetimoDatasetConfigRecord) -> RetimoDatasetConfigRecord:
        unzipped_path = f"{config_record.local_path}/unziped"
        if not path.exists(unzipped_path):
            print(f"Unzipping file {config_record.raw_path} to {unzipped_path}")
            with zipfile.ZipFile(config_record.raw_path, 'r') as zip_ref:
                zip_ref.extractall(unzipped_path)
        else:
            print(f"Unzipping file for '{config_record.name}' not needed. Using cache '{unzipped_path}'")
        return config_record.unzipped()

    def _load_to_nparray(self, config_record: RetimoDatasetConfigRecord) -> Tuple[str, Dict[str, Union[ndarray, Any]]]:
        collector = {}
        for label in config_record.labels.keys():
            directory = f"{os.getcwd()}/{config_record.unzipped_path}/{config_record.name}/{label}/"
            dataset = numpy.asarray(
                [asarray(Image.open(f"{directory}/{image}"), dtype=numpy.uint8) for image in os.listdir(directory) if
                 image.endswith('.jpg')])
            label_list = full((len(dataset)), fill_value=config_record.labels[label], dtype=numpy.uint8)
            collector['dataset'] = numpy.append(collector.get('dataset', numpy.empty(0)), dataset).reshape(
                dataset.shape[0] + collector.get('dataset', numpy.empty(0)).shape[0], *dataset.shape[1:])
            collector['label'] = numpy.append(collector.get('label', numpy.empty(0)), label_list)

        return config_record.name, collector
Example #26
def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing to avoid
    # numba JIT time if the arguments are invalid or the user asked for --help.
    import thor.utils.logging

    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(args.preprocessed_observations,
                                            index_col=False,
                                            dtype={"obs_id": str})

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)
    taskqueue_client = TaskQueueClient(bucket, queue)

    manifest = taskqueue_client.launch_job(config, preprocessed_observations,
                                           test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)
Example #27
    def __init__(
        self,
        application_credentials: Optional[Union[str, os.PathLike]] = None,
        credentials: Optional["Credentials"] = None,
        project: Optional[str] = None,
        storage_client: Optional["StorageClient"] = None,
        local_cache_dir: Optional[Union[str, os.PathLike]] = None,
    ):
        """Class constructor. Sets up a [`Storage
        Client`](https://googleapis.dev/python/storage/latest/client.html).
        Supports the following authentication methods of `Storage Client`.

        - Environment variable `"GOOGLE_APPLICATION_CREDENTIALS"` containing a
          path to a JSON credentials file for a Google service account. See
          [Authenticating as a Service
          Account](https://cloud.google.com/docs/authentication/production).
        - File path to a JSON credentials file for a Google service account.
        - OAuth2 Credentials object and a project name.
        - Instantiated and already authenticated `Storage Client`.

        If multiple methods are used, priority order is reverse of list above
        (later in list takes priority). If no authentication methods are used,
        then the client will be instantiated as anonymous, which will only have
        access to public buckets.

        Args:
            application_credentials (Optional[Union[str, os.PathLike]]): Path to Google service
                account credentials file.
            credentials (Optional[Credentials]): The OAuth2 Credentials to use for this client.
                See documentation for [`StorageClient`](
                https://googleapis.dev/python/storage/latest/client.html).
            project (Optional[str]): The project which the client acts on behalf of. See
                documentation for [`StorageClient`](
                https://googleapis.dev/python/storage/latest/client.html).
            storage_client (Optional[StorageClient]): Instantiated [`StorageClient`](
                https://googleapis.dev/python/storage/latest/client.html).
            local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
                for downloaded files. If None, will use a temporary directory.
        """
        if application_credentials is None:
            application_credentials = os.getenv(
                "GOOGLE_APPLICATION_CREDENTIALS")

        if storage_client is not None:
            self.client = storage_client
        elif credentials is not None:
            self.client = StorageClient(credentials=credentials,
                                        project=project)
        elif application_credentials is not None:
            self.client = StorageClient.from_service_account_json(
                application_credentials)
        else:
            self.client = StorageClient.create_anonymous_client()

        super().__init__(local_cache_dir=local_cache_dir)
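
A hedged sketch of the authentication priority described in the docstring; the class name GSClient, oauth2_credentials, and the file paths are assumptions, since the snippet only shows the constructor:

client_from_key_file = GSClient(application_credentials="service-account.json")
client_from_oauth2 = GSClient(credentials=oauth2_credentials, project="my-project")
client_prebuilt = GSClient(storage_client=StorageClient())  # takes priority over the rest
client_anonymous = GSClient()  # no method provided: anonymous, public buckets only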
Example #28
    def __init__(self,
                 bucket: str,
                 account_info: Union[Client, str, Dict],
                 timeout: int = 15):
        self.account_info = account_info
        self.timeout = timeout
        self._client, self._project = ((account_info, None) if isinstance(
            account_info, Client) else self._generate_client(account_info))

        self.client = Client(credentials=self._client, project=self._project)
        self.bucket = self.client.bucket(bucket)
Example #29
File: gs.py Project: ofirbb/Hub
    def __init__(self, bucket: str, creds_path: Optional[str] = None):
        super().__init__()
        if creds_path is not None:
            self._creds = service_account.Credentials.from_service_account_file(
                creds_path)
            with open(creds_path, 'rt') as f:
                self._project = json.loads(f.read())['project_id']

            self._bucket = Client(self._project, self._creds).bucket(bucket)
        else:
            self._bucket = Client().bucket(bucket)
Example #30
 def setUp(self):
     os.environ['FILESYSTEM_PUBLISH_ENABLED'] = '0'
     os.environ['FILESYSTEM_ENABLED'] = '0'
     os.environ['GOOGLE_PUBLISH_ENABLED'] = '1'
     os.environ['GOOGLE_ENABLED'] = '1'
     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'
     os.environ['CONFIG'] = './example/config.yml'
     self.store = Datastore()
     self.publish_config = self.store.config.publish['handlers']['gcloud']
     self.storage_config = self.store.config.storage['gcloud']
     self.dataset = self.store.datasets[0]
     self.client = Client()
Example #31
    def test_load_table_from_storage_then_dump_table(self):
        import csv
        import tempfile
        from google.cloud.storage import Client as StorageClient
        local_id = unique_resource_id()
        BUCKET_NAME = 'bq_load_test' + local_id
        BLOB_NAME = 'person_ages.csv'
        GS_URL = 'gs://%s/%s' % (BUCKET_NAME, BLOB_NAME)
        ROWS = [
            ('Phred Phlyntstone', 32),
            ('Bharney Rhubble', 33),
            ('Wylma Phlyntstone', 29),
            ('Bhettye Rhubble', 27),
        ]
        TABLE_NAME = 'test_table'

        s_client = StorageClient()

        # In the **very** rare case the bucket name is reserved, this
        # fails with a ConnectionError.
        bucket = s_client.create_bucket(BUCKET_NAME)
        self.to_delete.append(bucket)

        blob = bucket.blob(BLOB_NAME)

        with tempfile.TemporaryFile(mode='w+') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(('Full Name', 'Age'))
            writer.writerows(ROWS)
            blob.upload_from_file(
                csv_file, rewind=True, content_type='text/csv')

        self.to_delete.insert(0, blob)

        dataset = Config.CLIENT.dataset(
            _make_dataset_name('load_gcs_then_dump'))

        retry_403(dataset.create)()
        self.to_delete.append(dataset)

        full_name = bigquery.SchemaField('full_name', 'STRING',
                                         mode='REQUIRED')
        age = bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED')
        table = dataset.table(TABLE_NAME, schema=[full_name, age])
        table.create()
        self.to_delete.insert(0, table)

        job = Config.CLIENT.load_table_from_storage(
            'bq_load_storage_test_' + local_id, table, GS_URL)
        job.create_disposition = 'CREATE_NEVER'
        job.skip_leading_rows = 1
        job.source_format = 'CSV'
        job.write_disposition = 'WRITE_EMPTY'

        job.begin()

        def _job_done(instance):
            return instance.state in ('DONE', 'done')

        # Allow for 90 seconds of "warm up" before rows visible.  See:
        # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
        # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
        retry = RetryInstanceState(_job_done, max_tries=8)
        retry(job.reload)()

        rows, _, _ = table.fetch_data()
        by_age = operator.itemgetter(1)
        self.assertEqual(sorted(rows, key=by_age),
                         sorted(ROWS, key=by_age))