def skip_blob(blob_name, bucket_name="project_vaxx"):
    """Mark a blob as skipped by renaming it with a "yyy-" prefix."""
    bucket_t = storage_client.bucket(bucket_name, user_project=None)
    blob = bucket_t.blob(blob_name)
    new_name = "yyy-" + blob_name
    blob = bucket_t.rename_blob(blob, new_name)
def test_copy_existing_file_with_user_project(self):
    new_bucket_name = 'copy-w-requester-pays' + unique_resource_id('-')
    created = Config.CLIENT.create_bucket(new_bucket_name, requester_pays=True)
    self.case_buckets_to_delete.append(new_bucket_name)
    self.assertEqual(created.name, new_bucket_name)
    self.assertTrue(created.requester_pays)

    to_delete = []
    blob = storage.Blob('simple', bucket=created)
    blob.upload_from_string(b'DEADBEEF')
    to_delete.append(blob)
    try:
        with_user_project = Config.CLIENT.bucket(
            new_bucket_name, user_project=USER_PROJECT)

        new_blob = retry_bad_copy(with_user_project.copy_blob)(
            blob, with_user_project, 'simple-copy')
        to_delete.append(new_blob)

        base_contents = blob.download_as_string()
        copied_contents = new_blob.download_as_string()
        self.assertEqual(base_contents, copied_contents)
    finally:
        for blob in to_delete:
            retry_429(blob.delete)()
def test_copy(self, mock_service, mock_bucket):
    source_bucket = 'test-source-bucket'
    source_object = 'test-source-object'
    destination_bucket = 'test-dest-bucket'
    destination_object = 'test-dest-object'

    destination_bucket_instance = mock_bucket
    source_blob = mock_bucket.blob(source_object)
    destination_blob = storage.Blob(bucket=destination_bucket_instance,
                                    name=destination_object)

    # Given
    bucket_mock = mock_service.return_value.bucket
    bucket_mock.return_value = mock_bucket
    copy_method = bucket_mock.return_value.copy_blob
    copy_method.return_value = destination_blob

    # When
    response = self.gcs_hook.copy(  # pylint:disable=assignment-from-no-return
        source_bucket=source_bucket,
        source_object=source_object,
        destination_bucket=destination_bucket,
        destination_object=destination_object)

    # Then
    self.assertEqual(response, None)
    copy_method.assert_called_once_with(
        blob=source_blob,
        destination_bucket=destination_bucket_instance,
        new_name=destination_object)
def get_ondemand_bot_log(intended_user, *, user_id):
    if user_id != intended_user:
        raise web_util.user_mismatch_error(
            message="Cannot get bot log for another user.")

    bucket = model.get_ondemand_replay_bucket()
    blob = gcloud_storage.Blob("ondemand_bot_log_{}".format(user_id), bucket,
                               chunk_size=262144)
    buffer = io.BytesIO()
    try:
        blob.download_to_file(buffer)
    except gcloud_exceptions.NotFound:
        raise util.APIError(404, message="Error log not found.")

    buffer.seek(0)
    response = web_util.no_cache(
        flask.make_response(
            flask.send_file(buffer,
                            mimetype="text/plain",
                            as_attachment=True,
                            attachment_filename="{}.log".format(user_id))))
    response.headers["Content-Length"] = str(buffer.getbuffer().nbytes)

    return response
def gcs_upload(local_path, gcs_path, project_id=None, force=False):
    bucket_path, filename = os.path.split(gcs_path)
    bucket_name = os.path.basename(bucket_path)
    if project_id is None:
        client = GcsClient.client
    else:
        client = storage.Client(project=project_id)
    try:
        result = gsutil_ls(bucket_name, filter=filename, project_id=project_id)
        if "BucketNotFoundException" in result:
            raise ValueError(
                "ERROR: bucket not found, path={}".format(bucket_name))
        if result and not force:
            raise Warning("WARNING: gcs file already exists, use force=True. "
                          "bucket={}".format(bucket_name))
        bucket = client.get_bucket(bucket_name)
        blob = storage.Blob(filename, bucket)
        print("uploading file={} ...".format(gcs_path))
        blob.upload_from_filename(local_path)
        return gcs_path
    except exceptions.NotFound:
        raise ValueError("BucketNotFoundException: GCS bucket not found, "
                         "path={}".format(bucket_path))
    except Exception as e:
        print(e)
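# Hedged usage sketch (not part of the original source): shows one way gcs_upload
# above might be called. The bucket, object, and project names are placeholders,
# and write access to the bucket is assumed.
def _example_gcs_upload():
    uploaded = gcs_upload("/tmp/report.csv", "gs://my-bucket/report.csv",
                          project_id="my-project", force=True)
    if uploaded:
        print("upload finished: {}".format(uploaded))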
def file_exists(self, filename):
    """
    Check if 'filename' file exists within bucket
    :param filename:
    :return: (Bool)
    """
    return storage.Blob(filename, self._bucket).exists(self._gcsclient)
def gcs_download(gcs_path, local_path, project_id=None, force=False):
    bucket_path, filename = os.path.split(gcs_path)
    bucket_name = os.path.basename(bucket_path)
    if os.path.isfile(local_path) and not force:
        raise Warning(
            "WARNING: local file already exists, use force=True. path={}".format(
                local_path))
    if project_id is None:
        client = GcsClient.client
    else:
        client = storage.Client(project=project_id)
    try:
        bucket = client.get_bucket(bucket_name)
        blob = storage.Blob(filename, bucket)
        print("downloading file={} ...".format(gcs_path))
        blob.download_to_filename(local_path)
        return local_path
    except exceptions.NotFound:
        raise ValueError(
            "BucketNotFoundException: GCS bucket not found, path={}".format(
                bucket_path))
    except Exception as e:
        print(e)
def blobExists(self, location):
    '''
    Check if file is located in GCP bucket
    :param location: string, the blob location string
    :return: boolean
    '''
    return storage.Blob(bucket=self.BUCKET, name=location).exists()
def write_config_to_bucket(activity_json):
    if len(activity_json) > 0:
        logger.log_text(
            f"{FUNCTION_NAME}: Updating configuration with epoch value of the current time: "
            f"{datetime.datetime.fromtimestamp(int(CALLED_EPOCH)).strftime('%Y-%m-%d %H:%M:%S')}"
        )
        CONFIGURATION.set('strava_client', 'strava_current_epoch', f'{CALLED_EPOCH}')
    else:
        logger.log_text(
            f'{FUNCTION_NAME}: No activities returned; so not updating the epoch'
        )

    GCS_BUCKET = os.environ.get(KEY_EV_GCS_BUCKET)
    CONFIG_FILE = os.environ.get(KEY_EV_CONFIG_FILE)

    localconfig = StringIO()
    CONFIGURATION.write(localconfig)
    localconfig.seek(0)
    logger.log_text(
        f'{FUNCTION_NAME}: Writing {CONFIG_FILE} to bucket: {localconfig.read()}'
    )

    try:
        client = storage.Client()
        bucket = client.get_bucket(GCS_BUCKET)
        blob = storage.Blob(CONFIG_FILE, bucket)
        localconfig.seek(0)
        blob.upload_from_file(localconfig)
    except:
        logger.log_text(
            f'{FUNCTION_NAME}: Error while transacting with GCS: {sys.exc_info()}'
        )
        raise RuntimeError(
            f'Error while transacting with GCS: {sys.exc_info()}')

    localconfig.close()
def check_schema_stg(self, tag):
    # Get schemas bucket from other project
    external_credentials = self.gcp_helper.request_auth_token()
    storage_client_external = storage.Client(
        credentials=external_credentials)
    storage_bucket = storage_client_external.get_bucket(
        config.SCHEMAS_BUCKET)

    # Get schema name from tag
    tag = tag.replace('/', '_')
    if not tag.endswith(".json"):
        tag = tag + ".json"
    blob_name = tag

    # Check if schema is in schema storage
    if storage.Blob(bucket=storage_bucket,
                    name=blob_name).exists(storage_client_external):
        # Get blob
        blob = storage_bucket.get_blob(blob_name)
        if blob:
            # Convert to string
            blob_json_string = blob.download_as_string()
            # Convert to json
            blob_json = json.loads(blob_json_string)
            # return blob in json format
            return blob_json

    return None
def _lock_down_bucket(spinner, cloud_logger, bucket, lock_file_name,
                      service_account_email):
    """Change the ACL/IAM on the bucket so that only the service account can access it.

    Args:
        spinner: The spinner displayed in the console
        cloud_logger: A GCP logging client instance
        bucket: The bucket object to lock down
        lock_file_name: The name of the lock file
        service_account_email: The email of the service account
    """
    if storage.Blob(lock_file_name, bucket).exists():
        spinner.fail('X')
        msg = 'The lock file exists in the source bucket, so we cannot continue'
        cloud_logger.log_text(msg)
        raise SystemExit(msg)

    spinner.ok(_CHECKMARK)
    msg = 'Locking down the bucket by revoking all ACLs/IAM policies'
    spinner.text = msg
    cloud_logger.log_text(msg)

    # Turn off any bucket ACLs
    bucket.acl.save_predefined('private')

    # Revoke all IAM access and only set the service account as an admin
    policy = api_core_iam.Policy()
    policy['roles/storage.admin'].add('serviceAccount:' + service_account_email)
    bucket.set_iam_policy(policy)
def read_block(self, offset):
    try:
        blob = storage.Blob(str(offset), self.gcp_bucket)
        block = blob.download_as_string()
        return bytearray(block)
    except NotFound:
        return bytearray([])
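# Hedged usage sketch (not part of the original source): assumes `store` is an
# instance of the class that defines read_block above, with blocks stored under
# stringified integer offsets. Missing blocks come back as empty bytearrays.
def _example_read_blocks(store, count=4):
    blocks = [store.read_block(offset) for offset in range(count)]
    return b"".join(bytes(block) for block in blocks)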
def check_bucket(file, bucketID):
    """ checks bucket for file and returns bool """
    bucket = storage_client.bucket(bucketID)
    check = storage.Blob(bucket=bucket, name=file).exists(storage_client)
    return check
def convert():
    blobs = bucket.list_blobs()
    counter = 0
    for blob in blobs:
        just_name = os.path.splitext(blob.name)[0]
        if just_name.startswith("ogg/") or just_name.startswith("wav/"):
            continue
        counter += 1
        if is_converted(blob):
            print("++++Already converted. Deleting.")
            blob.delete()
            print("++++Deleted")
            continue
        print(blob.name)
        ogg_file_name = './ogg/' + blob.name
        blob.download_to_filename(ogg_file_name)
        print("++++Downloaded")
        wav_file_name = './wav/' + just_name + ".wav"
        blob_wav_name = 'wav/' + just_name + ".wav"
        try:
            call(["ffmpeg", "-i", ogg_file_name, wav_file_name])
            print("++++Converted")
            blob_wav = storage.Blob(blob_wav_name, bucket)
            blob_wav.upload_from_filename(wav_file_name)
            print("++++Uploaded")
            bucket.copy_blob(blob, bucket, "ogg/" + blob.name)
            print("++++Copied")
        except IOError:
            continue
        if is_converted(blob):
            print("++++Successfully converted. Deleting.")
            blob.delete()
            print("++++Deleted")
    print(counter)
def _local_to_gs(self, configs):
    self._logger.debug('Starting local to gs...')
    start_timestamp = datetime.now()
    for config in configs:
        self.delete_in_gs(data_name=config.data_name)
    for config in configs:
        if not self.exist_in_local(data_name=config.data_name):
            raise ValueError('There is no data named {} in local'.format(
                config.data_name))
        for local_file_path in self.list_local_file_paths(
                data_name=config.data_name):
            basename = os.path.basename(local_file_path)
            if self._gs_dir_path_in_bucket is None:
                name = basename
            else:
                name = self._gs_dir_path_in_bucket + '/' + basename
            blob = storage.Blob(name=name,
                                bucket=self._bucket,
                                chunk_size=self._chunk_size)
            blob.upload_from_filename(filename=local_file_path)
    for config in configs:
        if config.delete_in_source:
            self.delete_in_local(data_name=config.data_name)
    end_timestamp = datetime.now()
    duration = (end_timestamp - start_timestamp).seconds
    self._logger.debug('Ended local to gs [{}s]'.format(duration))
    return duration
def read_config_from_bucket():
    global CONFIGURATION

    GCS_BUCKET = os.environ.get(KEY_EV_GCS_BUCKET)
    logger.log_text(
        f'{FUNCTION_NAME}: Environment variable GCS_BUCKET: {GCS_BUCKET}')
    CONFIG_FILE = os.environ.get(KEY_EV_CONFIG_FILE)
    logger.log_text(
        f'{FUNCTION_NAME}: Environment variable CONFIG_FILE: {CONFIG_FILE}')
    if GCS_BUCKET is None or CONFIG_FILE is None:
        logger.log_text(
            f'{FUNCTION_NAME}: Expected environment variables are missing; throwing RuntimeError'
        )
        raise RuntimeError('Expected environment variables are missing')

    try:
        client = storage.Client()
        bucket = client.get_bucket(GCS_BUCKET)
        blob = storage.Blob(CONFIG_FILE, bucket)
        localconfig = BytesIO()
        client.download_blob_to_file(blob, localconfig)
    except:
        logger.log_text(
            f'{FUNCTION_NAME}: Error while transacting with GCS: {sys.exc_info()}'
        )
        raise RuntimeError(
            f'Error while transacting with GCS: {sys.exc_info()}')

    localconfig.seek(0)
    logger.log_text(
        f'{FUNCTION_NAME}: Read {CONFIG_FILE} from bucket: {localconfig.read().decode("utf-8")}'
    )
    localconfig.seek(0)
    CONFIGURATION.read_string(localconfig.read().decode('utf-8'))
    localconfig.close()
def get_match_replay(intended_user, match_id):
    with model.engine.connect() as conn:
        match = conn.execute(
            sqlalchemy.sql.select([
                model.games.c.replay_name,
                model.games.c.replay_bucket,
            ]).where(model.games.c.id == match_id)).first()
        if not match:
            raise util.APIError(404, message="Match not found.")

        bucket = model.get_replay_bucket(match["replay_bucket"])
        blob = gcloud_storage.Blob(match["replay_name"], bucket,
                                   chunk_size=262144)
        buffer = io.BytesIO()
        blob.download_to_file(buffer)
        buffer.seek(0)

        response = flask.make_response(
            flask.send_file(buffer,
                            mimetype="application/x-halite-2-replay",
                            as_attachment=True,
                            attachment_filename=str(match_id) + ".hlt"))
        response.headers["Content-Length"] = str(buffer.getbuffer().nbytes)

        return response
def calc(btc_history, btc_score, btc_prediction_file):
    client = gcs.Client(project_name)
    blob = gcs.Blob(btc_prediction_file, client.get_bucket(bucket_name))
    content = blob.download_as_string()
    btc_prediction = json.loads(content)

    # calc
    today = (datetime.date.today() - datetime.timedelta(days=2)).strftime("%Y-%m-%d")
    latest_actual = btc_history['bpi'][today]
    latest_prediction = btc_prediction['prediction'][-1]
    before_prediction = btc_prediction['prediction'][-2]

    if latest_actual > before_prediction:
        a = before_prediction
        b = latest_actual
    else:
        a = latest_actual
        b = before_prediction
    r = a / b

    xs = btc_score
    return [
        xs[0] + (0 if 0.95 <= r and r > 0.98 else 1),
        xs[1] + (0 if 0.98 <= r and r > 0.99 else 1),
        xs[2] + (0 if r <= 0.99 else 1),
        latest_prediction
    ]
def download_from_cloud():
    client = storage.Client()
    bucket = client.get_bucket('spk_bucket1')
    blob = storage.Blob('asd/c1.txt', bucket)
    with open('/home/sujithpk/Desktop/d.csv', 'wb') as file_obj:
        blob.download_to_file(file_obj)
def get_exported_table_df(table_name):
    """Retrieve exported table file on GCS.

    Args:
        table_name (string): Name of the table to load.

    Returns:
        pandas.DataFrame
    """
    bucket = storage\
        .Client(get_config('gcp_project_name'))\
        .get_bucket(get_config('gcs_bucket_name'))
    key = \
        '{experiment_name}/exported_tables/{table_name}/' \
        '{date_descriptor}/out.csv.gzip'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name,
            date_descriptor='{{ ds_nodash }}')
    blob = storage.Blob(key, bucket)
    bio = io.BytesIO()
    blob.download_to_file(bio)
    bio.seek(0)
    return pd.read_csv(bio, compression='gzip')
def push_to_storage(file_path, bucket_name, blob_name=''):
    max_retry = 3
    fails = 0
    chunk_size = 1 << 23
    bck, err = get_gcs_bucket(bucket_name)
    if err:
        return ('', file_path, err)
    if blob_name:
        blob_name = '/'.join([blob_name, basename(file_path)])
    gs.blob._MAX_MULTIPART_SIZE = chunk_size
    blob = gs.Blob(blob_name, bck, chunk_size)
    while blob and fails < max_retry:
        try:
            with open(file_path, 'rb') as f:
                blob.upload_from_file(f)
        except Exception as e:
            err = str(e)
            fails += 1
        else:
            err = ''
            break
    blob_path = 'gs://{}/{}'.format(bck.name, blob.name) \
        if fails < max_retry else ''
    return (blob_path, file_path, err)
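# Hedged usage sketch (not part of the original source): the bucket name and blob
# prefix are placeholders. push_to_storage above returns a (blob_path, file_path,
# err) tuple, so the error string is checked before using the gs:// path.
def _example_push_to_storage():
    blob_path, local_path, err = push_to_storage(
        "/tmp/archive.tar.gz", "my-bucket", blob_name="backups")
    if err:
        print("upload failed for {}: {}".format(local_path, err))
    else:
        print("uploaded to {}".format(blob_path))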
def download_source_blob():
    """Retrieve the worker blob from object storage."""
    cached_blob = cache.get(config.WORKER_ARTIFACT_KEY)
    if cached_blob is None:
        print("Getting from GCloud", config.WORKER_ARTIFACT_KEY)
        # Retrieve from GCloud
        try:
            gcloud_blob = gcloud_storage.Blob(
                config.WORKER_ARTIFACT_KEY,
                model.get_deployed_artifacts_bucket(),
                chunk_size=262144)
            cached_blob = gcloud_blob.download_as_string()
            cache.set(config.WORKER_ARTIFACT_KEY, cached_blob)
        except gcloud_exceptions.NotFound:
            raise util.APIError(404, message="Worker blob not found.")

    if cached_blob is None:
        raise util.APIError(404, message="Worker blob not found.")

    print("Building buffer")
    buffer = io.BytesIO()
    buffer.write(cached_blob)
    buffer.seek(0)
    return flask.send_file(buffer, mimetype="application/gzip",
                           as_attachment=True,
                           attachment_filename="Halite.tgz")
def new_hires(data):
    df = data[data['DEPARTMENT_TITLE'] == 'CITY ATTORNEY']
    cols = ['RECORD_NBR', 'JOB_CLASS_PGRADE', 'PAY_YEAR', 'GENDER', 'ETHNICITY',
            'JOB_TITLE', 'JOB_STATUS', 'EMPLOYMENT_TYPE']
    df2 = df.groupby(cols)['REGULAR_PAY'].sum().reset_index()
    df2['JOB_CLASS_PGRADE_NUMERIC'] = df2['JOB_CLASS_PGRADE'].rank(
        method='dense', ascending=True).astype(int)
    df2['JOB_CLASS_PGRADE_RANK'] = df2.groupby('RECORD_NBR')[
        'JOB_CLASS_PGRADE_NUMERIC'].rank('dense').astype(int)
    df2['NEW_HIRE'] = df2.groupby('RECORD_NBR')['PAY_YEAR'].rank('dense').astype(int)
    df2['ETHNICITY'] = df2['ETHNICITY'].str.strip().replace(ETHNICITY_MAPPING)
    df2['GROUPING_01'] = df2['JOB_TITLE'].str.strip().replace(JOB_TITLE_MAPPING_01)
    df2['GROUPING_01'] = df2['GROUPING_01'].str.strip()
    df2['GROUPING_02'] = df2['JOB_TITLE'].str.strip().replace(JOB_TITLE_MAPPING_02)
    df2['GROUPING_02'] = df2['GROUPING_02'].str.strip()

    # drop 1st year
    new_hires = df2[(df2['PAY_YEAR'] != 2013) & (df2['NEW_HIRE'] == 1)]
    # drop duplicates
    new_hires = new_hires.drop_duplicates(subset='RECORD_NBR', keep='first')
    new_hires['PAY_YEAR'] = new_hires['PAY_YEAR'].apply(lambda x: f'{x}0101')

    bucket_name = os.getenv('GA_BUCKET')
    fname = 'new_hires.csv'
    ga_bucket_path = f'data/{fname}'
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = storage.Blob(ga_bucket_path, bucket)
    df_str = new_hires.to_csv(index=False, encoding='utf-8')
    blob.upload_from_string(df_str)
    print("uploaded new_hires.csv")
def download_blob(self, bucket_name, blob_name, local_path=None):
    """
    Gets a blob from a bucket

    `Args:`
        bucket_name: str
            The name of the bucket
        blob_name: str
            The name of the blob
        local_path: str
            The local path where the file will be downloaded. If not specified, a
            temporary file will be created and returned, and that file will be removed
            automatically when the script is done running.

    `Returns:`
        str
            The path of the downloaded file
    """
    if not local_path:
        local_path = files.create_temp_file_for_path('TEMPTHING')

    bucket = storage.Bucket(self.client, name=bucket_name)
    blob = storage.Blob(blob_name, bucket)

    logger.info(f'Downloading {blob_name} from {bucket_name} bucket.')
    with open(local_path, 'wb') as f:
        blob.download_to_file(f, client=self.client)
    logger.info(f'{blob_name} saved to {local_path}.')

    return local_path
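# Hedged usage sketch (not part of the original source): assumes `connector` is an
# instance of the class that defines download_blob above, with an authenticated
# `client` attribute. Bucket and blob names are placeholders.
def _example_download_blob(connector):
    path = connector.download_blob("my-bucket", "exports/data.csv",
                                   local_path="/tmp/data.csv")
    with open(path, "rb") as f:
        print("downloaded {} bytes".format(len(f.read())))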
def run(
    self,
    bucket_name: str = None,
    blob: str = None,
    project: str = None,
    wait_seconds: int = 0,
    fail_if_not_found: bool = True,
    credentials: dict = None,
    request_timeout: Union[float, Tuple[float, float]] = 60,
) -> bool:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be provided
    _either_ at initialization _or_ as arguments.

    Args:
        - bucket_name (str, optional): the bucket to check
        - blob (str, optional): object for which to search within the bucket
        - project (str, optional): default Google Cloud project to work within. If not
            provided, will be inferred from your Google Cloud credentials
        - wait_seconds (int, optional): retry until file is found or until wait_seconds,
            whichever is first. Defaults to 0
        - fail_if_not_found (bool, optional): Will raise Fail signal on task if blob is
            not found. Defaults to True
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream Secret
            task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - request_timeout (Union[float, Tuple[float, float]], optional): the number of
            seconds the transport should wait for the server response. Can also be
            passed as a tuple (connect_timeout, read_timeout).

    Returns:
        - bool: the object exists

    Raises:
        - ValueError: if `bucket_name` or `blob` are missing
        - FAIL: if object not found and fail_if_not_found is True
    """
    if None in [bucket_name, blob]:
        raise ValueError("Missing bucket_name or blob")

    # create client
    client = get_storage_client(project=project, credentials=credentials)
    bucket = client.bucket(bucket_name)

    # poll with exponential backoff until the blob exists or wait_seconds is exceeded
    blob_exists = None
    wait, n = 0, 1
    while wait <= wait_seconds and not blob_exists:
        sleep(n)
        wait += n
        n *= 2
        blob_exists = storage.Blob(bucket=bucket, name=blob).exists(client)

    if fail_if_not_found and not blob_exists:
        raise FAIL(message="Blob not found")

    return blob_exists
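# Hedged usage sketch (not part of the original source): assumes the run() method
# above belongs to a task class, named BlobCheckTask here only as a placeholder.
# Bucket and blob names are placeholders; credentials fall back to default client logic.
def _example_blob_check():
    task = BlobCheckTask()
    exists = task.run(bucket_name="my-bucket", blob="path/to/object.json",
                      wait_seconds=30, fail_if_not_found=False)
    print("blob exists: {}".format(exists))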
def file_exist_gs(self, gsPath):
    """
    Function to know if a file is contained in a GCS bucket
    Input  : gsPath : full path file
    Output : stat : True if present, False otherwise
    """
    try:
        bucketName, fileName = self.extractBucketFile(gsPath)
        bucket = self.get_bucket(bucketName)
        stat = storage.Blob(bucket=bucket, name=fileName).exists(self)
        self.logger.log_text(
            text=f"File {fileName} available in bucket {bucketName} : {stat}",
            severity="INFO",
            labels=self.labels,
        )
        return stat
    except Exception as e:
        # Log the full gsPath: fileName may be unbound if extractBucketFile failed.
        self.logger.log_text(
            text=f"Error in checking file {gsPath} in GS : {e}",
            severity="ERROR",
            labels=self.labels,
        )
        return False
def execute(self):
    series = []
    for fname in self.reg.files():
        with open(fname) as fp:
            data = json.loads(fp.read())
            series.append(data)
    series.sort(key=lambda x: x['refdate'])
    logging.debug('generating series file with %s rows', len(series))

    def dump_n_convert(data):
        s = json.dumps(data) + '\n'
        return s.encode('utf_8')

    temp = tempfile.TemporaryFile('wb+')
    lines = [dump_n_convert(data) for data in series]
    temp.writelines(lines)
    temp.seek(0)

    logging.debug('uploading series file to %s', self.get_output_blob_name())
    series_blob = storage.Blob(self.get_output_blob_name(), self.input_bucket)
    series_blob.upload_from_file(temp, client=self.client)
    self.reg.cleanup()
def get_last(channel_id):
    read_storage_client = storage.Client()
    bucket_name = 'airqo-bucket'
    filename = 'channel%s.json' % channel_id
    bucket = read_storage_client.get_bucket(bucket_name)
    stats = storage.Blob(bucket=bucket, name=filename).exists(read_storage_client)
    if not stats:
        last_id = 0
        last_time = None
    else:
        blob = bucket.get_blob(filename)
        json_data_string = blob.download_as_string()
        json_data = ndjson.loads(json_data_string)
        json_list = []
        for item in json_data:
            json_list.append(item)
        if len(json_list) != 0:
            last_id = json_list[-1]['entry_id']
            last_time = str_to_date(json_list[-1]['created_at'])
        else:
            last_id = None
            last_time = None
    return last_id, last_time
def ingest_last_week():
    try:
        # verify that this is a cron job request
        is_cron = flask.request.headers['X-Appengine-Cron']
        logging.info('Received cron request {}'.format(is_cron))

        # create png
        url = 'http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.csv'
        outfile = 'earthquakes.png'
        status = 'scheduled ingest of {} to {}'.format(url, outfile)
        logging.info(status)
        transform.create_png(url, outfile)

        # upload to cloud storage
        client = gcs.Client()
        bucket = client.get_bucket(CLOUD_STORAGE_BUCKET)
        blob = gcs.Blob('earthquakes/earthquakes.png', bucket)
        blob.upload_from_filename(outfile)

        # change permissions
        blob.make_public()
        status = 'uploaded {} to {}'.format(outfile, blob.name)
        logging.info(status)
    except KeyError as e:
        status = ('<html>Sorry, this capability is accessible only by the Cron service, '
                  'but I got a KeyError for {} -- try invoking it from <a href="{}"> '
                  'the GCP console / AppEngine / taskqueues </a></html>'.format(
                      e,
                      'http://console.cloud.google.com/appengine/taskqueues?tab=CRON'))
        logging.info('Rejected non-Cron request')
    return status
def download_blob(source_blob_name, destination_file_name, bucket_name="project_vaxx"):
    bucket_t = storage_client.bucket(bucket_name, user_project=None)
    temp = storage.Blob(source_blob_name, bucket_t)
    temp.download_to_filename(destination_file_name)