def __init__(self, options):
  """Initializes a Dataflow API client object."""
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)

  if _use_fnapi(options):
    self.environment_version = _FNAPI_ENVIRONMENT_MAJOR_VERSION
  else:
    self.environment_version = _LEGACY_ENVIRONMENT_MAJOR_VERSION

  if self.google_cloud_options.no_auth:
    credentials = None
  else:
    credentials = get_service_credentials()

  http_client = get_new_http()
  self._client = dataflow.DataflowV1b3(
      url=self.google_cloud_options.dataflow_endpoint,
      credentials=credentials,
      get_credentials=(not self.google_cloud_options.no_auth),
      http=http_client,
      response_encoding=get_response_encoding())
  self._storage_client = storage.StorageV1(
      url='https://www.googleapis.com/storage/v1',
      credentials=credentials,
      get_credentials=(not self.google_cloud_options.no_auth),
      http=http_client,
      response_encoding=get_response_encoding())


def assert_bucket_exists(bucket_name):
  # type: (str) -> None

  """Asserts that the GCS bucket named bucket_name exists.

  Logs an error and raises a ValueError if the bucket does not exist.
  Logs a warning if the bucket cannot be verified to exist.
  """
  try:
    from apitools.base.py.exceptions import HttpError
    storage_client = storage.StorageV1(
        credentials=auth.get_service_credentials(),
        get_credentials=False,
        http=get_new_http(),
        response_encoding='utf8')
    request = storage.StorageBucketsGetRequest(bucket=bucket_name)
    storage_client.buckets.Get(request)
  except ImportError:
    # Checked before HttpError: if the apitools import failed, the HttpError
    # name is unbound and evaluating that except clause would itself raise.
    _LOGGER.warning(
        'ImportError - unable to verify whether bucket %s exists',
        bucket_name)
  except HttpError as e:
    if e.status_code == 404:
      _LOGGER.error('%s bucket does not exist!', bucket_name)
      raise ValueError('Invalid GCS bucket provided!')
    else:
      _LOGGER.warning(
          'HttpError - unable to verify whether bucket %s exists',
          bucket_name)


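# A minimal usage sketch for assert_bucket_exists; the bucket name below is
# hypothetical. A 404 surfaces as ValueError, while any other HttpError (or a
# missing apitools dependency) only logs a warning and lets execution proceed.
assert_bucket_exists('my-staging-bucket')

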
def _download_file(from_url, to_path):
  """Downloads a file over http/https from a url or copies it from a remote
  path to a local path."""
  if from_url.startswith('http://') or from_url.startswith('https://'):
    # TODO(silviuc): We should cache downloads so we do not do it for every
    # job.
    try:
      # We check if the file is actually there because wget returns a file
      # even for a 404 response (file will contain the contents of the 404
      # response).
      # TODO(angoenka): Extract and use the filename when downloading file.
      response, content = get_new_http().request(from_url)
      if int(response['status']) >= 400:
        raise RuntimeError(
            'Artifact not found at %s (response: %s)' % (from_url, response))
      # httplib2 returns the response body as bytes, so write in binary mode.
      with open(to_path, 'wb') as f:
        f.write(content)
    except Exception:
      logging.info('Failed to download Artifact from %s', from_url)
      raise
  else:
    if not os.path.isdir(os.path.dirname(to_path)):
      logging.info('Creating folder: %s', os.path.dirname(to_path))
      os.mkdir(os.path.dirname(to_path))
    shutil.copyfile(from_url, to_path)


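# A hedged usage sketch for _download_file; the URL and local paths below are
# hypothetical. An http/https source is fetched via get_new_http() and written
# to disk; any other source path falls through to a plain shutil.copyfile.
_download_file('https://example.com/artifact.tar.gz', '/tmp/artifact.tar.gz')
_download_file('/mnt/shared/artifact.tar.gz', '/tmp/artifact-copy.tar.gz')

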
def __init__(self, client=None):
  self.client = client or bigquery.BigqueryV2(
      http=get_new_http(), credentials=auth.get_service_credentials())
  self._unique_row_id = 0
  # For testing scenarios where we pass in a client we do not want a
  # randomized prefix for row IDs.
  self._row_id_prefix = '' if client else uuid.uuid4()
  self._temporary_table_suffix = uuid.uuid4().hex


def __init__(self, storage_client=None):
  if storage_client is None:
    storage_client = storage.StorageV1(
        credentials=auth.get_service_credentials(),
        get_credentials=False,
        http=get_new_http(),
        response_encoding='utf8')
  self.client = storage_client
  self._rewrite_cb = None


def __init__(self, client=None):
  self.client = client or bigquery.BigqueryV2(
      http=get_new_http(),
      credentials=auth.get_service_credentials(),
      response_encoding=None if sys.version_info[0] < 3 else 'utf8')
  self._unique_row_id = 0
  # For testing scenarios where we pass in a client we do not want a
  # randomized prefix for row IDs.
  self._row_id_prefix = '' if client else uuid.uuid4()
  self._temporary_table_suffix = uuid.uuid4().hex


def __init__(self, options):
  super().__init__(options)
  self._google_cloud_options = options.view_as(GoogleCloudOptions)
  if self._google_cloud_options.no_auth:
    credentials = None
  else:
    credentials = get_service_credentials()

  self._storage_client = storage.StorageV1(
      url='https://www.googleapis.com/storage/v1',
      credentials=credentials,
      get_credentials=(not self._google_cloud_options.no_auth),
      http=get_new_http(),
      response_encoding='utf8')
  self._cloudbuild_client = cloudbuild.CloudbuildV1(
      credentials=credentials,
      get_credentials=(not self._google_cloud_options.no_auth),
      http=get_new_http(),
      response_encoding='utf8')
  if not self._docker_registry_push_url:
    self._docker_registry_push_url = (
        'gcr.io/%s/prebuilt_beam_sdk' % self._google_cloud_options.project)


def __init__(self, storage_client=None, pipeline_options=None):
  if storage_client is None:
    storage_client = storage.StorageV1(
        credentials=auth.get_service_credentials(pipeline_options),
        get_credentials=False,
        http=get_new_http(),
        response_encoding='utf8',
        additional_http_headers={
            'User-Agent': 'apache-beam-%s' % apache_beam.__version__
        })
  self.client = storage_client
  self._rewrite_cb = None
  self.bucket_to_project_number = {}


def _download_file(from_url, to_path):
  """Downloads a file over http/https from a url or copies it from a remote
  path to a local path."""
  if from_url.startswith('http://') or from_url.startswith('https://'):
    # TODO(silviuc): We should cache downloads so we do not do it for every
    # job.
    try:
      # We check if the file is actually there because wget returns a file
      # even for a 404 response (file will contain the contents of the 404
      # response).
      response, content = get_new_http().request(from_url)
      if int(response['status']) >= 400:
        raise RuntimeError(
            'Artifact not found at %s (response: %s)' % (from_url, response))
      with open(to_path, 'wb') as f:
        f.write(content)
    except Exception:
      _LOGGER.info('Failed to download Artifact from %s', from_url)
      raise
  else:
    try:
      read_handle = FileSystems.open(
          from_url, compression_type=CompressionTypes.UNCOMPRESSED)
      with read_handle as fin:
        with open(to_path, 'wb') as f:
          while True:
            chunk = fin.read(Stager._DEFAULT_CHUNK_SIZE)
            if not chunk:
              break
            f.write(chunk)
      _LOGGER.info('Copied remote file from %s to %s.', from_url, to_path)
      return
    except Exception as e:
      _LOGGER.info(
          'Failed to download file from %s via apache_beam.io.filesystems. '
          'Trying to copy directly. %s', from_url, repr(e))
    if not os.path.isdir(os.path.dirname(to_path)):
      _LOGGER.info('Creating folder: %s', os.path.dirname(to_path))
      os.mkdir(os.path.dirname(to_path))
    shutil.copyfile(from_url, to_path)


# The thread-local object must live at module (or class) scope: if it were
# created inside __new__, every call would see a fresh, empty object and the
# per-thread cache below would never get a hit.
_local_state = threading.local()


def __new__(cls, storage_client=None):
  if storage_client:
    # This path is only used for testing.
    return super(GcsIO, cls).__new__(cls)
  else:
    # Create a single storage client for each thread. We would like to avoid
    # creating more than one storage client for each thread, since each
    # initialization requires the relatively expensive step of initializing
    # credentials.
    if getattr(_local_state, 'gcsio_instance', None) is None:
      credentials = auth.get_service_credentials()
      storage_client = storage.StorageV1(
          credentials=credentials,
          get_credentials=False,
          http=get_new_http(),
          response_encoding=None if sys.version_info[0] < 3 else 'utf8')
      _local_state.gcsio_instance = super(GcsIO, cls).__new__(cls)
      _local_state.gcsio_instance.client = storage_client
    return _local_state.gcsio_instance


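# A self-contained sketch of the per-thread client-caching pattern used by
# __new__ above, assuming only the standard library; ExpensiveClient is a
# hypothetical stand-in for storage.StorageV1.
import threading


class ExpensiveClient(object):
  """Hypothetical stand-in for a client that is costly to construct."""


_thread_local = threading.local()  # Module scope, so the cache persists.


def get_client():
  # Each thread constructs ExpensiveClient at most once; subsequent calls on
  # the same thread return the cached instance.
  if getattr(_thread_local, 'client', None) is None:
    _thread_local.client = ExpensiveClient()
  return _thread_local.client


assert get_client() is get_client()  # Same thread: cached instance is reused.

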
def test_get_new_http_proxy_info(self):
  os.environ['http_proxy'] = 'localhost'
  # Restore the environment afterwards so the setting does not leak into
  # other tests.
  self.addCleanup(os.environ.pop, 'http_proxy', None)
  http = get_new_http()
  expected = ProxyInfo(3, 'localhost', 80)
  self.assertEqual(str(http.proxy_info), str(expected))


def test_get_new_http_proxy_info(self):
  with mock.patch.dict(os.environ, http_proxy='localhost'):
    http = get_new_http()
    expected = ProxyInfo(3, 'localhost', 80)
    self.assertEqual(str(http.proxy_info), str(expected))


def test_get_new_http_timeout(self):
  http = get_new_http()
  self.assertEqual(http.timeout, DEFAULT_HTTP_TIMEOUT_SECONDS)


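# The two tests above pin down the observable contract of get_new_http(): it
# returns an httplib2.Http whose proxy_info reflects the http_proxy
# environment variable and whose timeout equals DEFAULT_HTTP_TIMEOUT_SECONDS.
# Below is a minimal sketch consistent with those assertions, assuming
# httplib2 and a 60-second timeout constant; it is not necessarily the
# library's actual implementation.
import os

import httplib2

DEFAULT_HTTP_TIMEOUT_SECONDS = 60  # Assumed value; the tests only require
                                   # that the constant and the timeout match.


def get_new_http():
  proxy_url = os.environ.get('http_proxy')
  if proxy_url and not proxy_url.lower().startswith(('http://', 'https://')):
    # httplib2.proxy_info_from_url expects a scheme; a bare 'localhost' (as
    # used in the test) would not parse correctly without one.
    proxy_url = 'http://' + proxy_url
  proxy_info = (
      httplib2.proxy_info_from_url(proxy_url, method='http')
      if proxy_url else None)
  # For 'http://localhost' this yields ProxyInfo(3, 'localhost', 80),
  # matching the proxy-info test above.
  return httplib2.Http(
      proxy_info=proxy_info, timeout=DEFAULT_HTTP_TIMEOUT_SECONDS)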