class BatchDeleteGCSFiles(beam.DoFn):
    """
    Batch delete processed objects from Google Cloud Storage.

    To avoid a REST API call for every object to be deleted, we create a
    buffer to hold the names of the objects to be deleted. When this buffer
    reaches its max size, we call
    `apache_beam.io.gcp.gcsio.GcsIO.delete_batch` to delete all objects in
    the buffer from Google Cloud Storage.

    Note: Google's batch API allows a maximum of 100 deletes per call.
    """

    def start_bundle(self):
        self.MAX_BUFFER_SIZE = 100
        self._buffer = []
        self.storage_client = GcsIO()

    def process(self, element: dict, *args, **kwargs) -> None:
        self._buffer.append(element)
        if len(self._buffer) == self.MAX_BUFFER_SIZE:
            self.storage_client.delete_batch(paths=self._buffer)
            self._buffer.clear()

    def finish_bundle(self):
        if len(self._buffer) > 0:
            self.storage_client.delete_batch(paths=self._buffer)
            self._buffer.clear()
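A minimal usage sketch for the DoFn above. The pipeline wiring and the object paths are illustrative assumptions, not part of the original example; elements are assumed to be GCS object paths of the kind `GcsIO.delete_batch` accepts.

import apache_beam as beam

# Hypothetical pipeline: feed GCS object paths into the batching DoFn.
with beam.Pipeline() as p:
    _ = (
        p
        | "Paths to delete" >> beam.Create([
            "gs://example-bucket/processed/part-00000.json",  # assumed paths
            "gs://example-bucket/processed/part-00001.json",
        ])
        | "Batch delete" >> beam.ParDo(BatchDeleteGCSFiles())
    )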
def __init__(self, hdf_bucket_path, req_tile_list="*"):
    gcs = GcsIO()
    self._existing = [
        os.path.basename(l) for l in list(gcs.list_prefix(hdf_bucket_path).keys())
    ]
    self._required_tiles = req_tile_list
def upload_file(gcspath, outpath):
    from apache_beam.io.gcp.gcsio import GcsIO

    gcs = GcsIO()
    # GcsIO writers expect bytes, so read the local file in binary mode.
    w = gcs.open(gcspath, "w")
    r = open(outpath, "rb")
    w.write(r.read())
    r.close()
    w.close()
def download_file(gcspath, inpath):
    from apache_beam.io.gcp.gcsio import GcsIO

    gcs = GcsIO()
    # GcsIO readers return bytes, so write the local file in binary mode.
    r = gcs.open(gcspath, "r")
    w = open(inpath, "wb")
    w.write(r.read())
    w.close()
    r.close()
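A short, hedged example of calling the two helpers above; the bucket and file names are made up for illustration.

# Assumed names: the gs:// and /tmp paths below are illustrative only.
download_file("gs://example-bucket/inputs/data.csv", "/tmp/data.csv")
# ... process /tmp/data.csv locally ...
upload_file("gs://example-bucket/outputs/data.csv", "/tmp/data.csv")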
def upload_file(self, path, out_path):
    from apache_beam.io.gcp.gcsio import GcsIO

    gcs = GcsIO()
    # GcsIO writers expect bytes, so read the local file in binary mode.
    w = gcs.open("gs://" + BUCKET_NAME + "/" + path, "w")
    r = open(out_path, "rb")
    w.write(r.read())
    r.close()
    w.close()
def load(path):
    """
    Receives an image path and returns a dictionary containing the image
    path and a resized version of the image as a np.array.
    """
    buf = GcsIO().open(path, mime_type="image/jpeg")
    img = Image.open(io.BytesIO(buf.read()))
    img = img.resize((IMAGE_HEIGHT, IMAGE_WIDTH), Image.ANTIALIAS)
    return {"path": path, "image": np.array(img)}
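A hedged sketch of applying `load` inside a pipeline; the image path and pipeline wiring are assumptions for illustration.

import apache_beam as beam

# Hypothetical wiring: map `load` over a PCollection of GCS image paths.
with beam.Pipeline() as p:
    images = (
        p
        | "Image paths" >> beam.Create(["gs://example-bucket/images/cat.jpg"])
        | "Load and resize" >> beam.Map(load)
    )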
def __init__(self, bucketpath, layertemplate, varname, tiles="*"):
    # import os
    # from apache_beam.io.gcp.gcsio import GcsIO
    self._bucketproductpath = bucketpath
    gcs = GcsIO()
    # List the objects that already exist under the bucket path passed in.
    self._existing = [
        l for l in list(gcs.list_prefix(bucketpath).keys())
    ]
    self._layertemplate = layertemplate
    self._tiles = tiles
    self._varname = varname
def localise_day_files(self, day):
    files = self.filter_to_required_files(self.get_tilenames_for_day(day))
    tempfolder = self.get_tmp_folder_for_day(day)
    localpaths = []
    gcs = GcsIO()
    if not os.path.isdir(tempfolder):
        os.makedirs(tempfolder)
    for f in files:
        localname = os.path.join(tempfolder, os.path.basename(f))
        if not os.path.exists(localname):
            # It might have already been copied if, say, day and night files,
            # which come from the same HDFs, are being made on the same worker.
            with gcs.open(f) as gcsfile, open(localname, 'wb') as localfile:
                localfile.write(gcsfile.read())
        localpaths.append(localname)
    return (day, localpaths)
def download_file(self, url):
    # import requests, tempfile, os
    # from apache_beam.io.gcp.gcsio import GcsIO
    req = self._session.request('get', url)
    resp = self._session.get(req.url, stream=True)
    product, datestr, fname = url.split('/')[-3:]
    bucketfilename = '/'.join(
        [self._hdf_bucket_path, product, datestr, fname])
    gcs = GcsIO()
    with gcs.open(bucketfilename, 'w') as fp:
        # with open(tempfilename, 'wb') as fp:
        for chunk in resp.iter_content(chunk_size=self._chunk_size):
            if chunk:
                fp.write(chunk)
                fp.flush()
                # os.fsync(fp)
    return bucketfilename
def test_pubsub_to_gcs():
    PubSubToGCS.run(
        input_topic="unused",  # mocked by TestStream
        output_path=f"gs://{BUCKET}/pubsub/{UUID}/output",
        window_size=1,  # 1 minute
        num_shards=1,
        pipeline_args=[
            "--project",
            PROJECT,
            "--temp_location",
            TempDir().get_path(),
        ],
    )

    # Check for output files on GCS.
    gcs_client = GcsIO()
    files = gcs_client.list_prefix(f"gs://{BUCKET}/pubsub/{UUID}")
    assert len(files) > 0

    # Clean up.
    gcs_client.delete_batch(list(files))
def start_bundle(self):
    self.MAX_BUFFER_SIZE = 100
    self._buffer = []
    self.storage_client = GcsIO()
    # Your project ID is required in order to run your pipeline on
    # the Google Cloud Dataflow Service.
    '--project=Big Data Training',
    # Your Google Cloud Storage path is required for staging local
    # files.
    '--staging_location=gs://big-data-training-julio/staging',
    # Your Google Cloud Storage path is required for temporary
    # files.
    '--temp_location=gs://big-data-training-julio/temp',
    '--job_name=parse-avro',
])
pipeline_options = PipelineOptions(pipeline_args)

avro_schema = get_schema(input_file)

# Debug
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(get_schema(input_file))

# Execute pipeline
with Pipeline(options=pipeline_options) as p:
    (
        p
        | 'Read Avro' >> ReadFromAvro(known_args.input)
        # WriteToAvro needs an explicit Avro schema; reuse the one read above.
        | 'Write Avro to GCS' >> WriteToAvro(known_args.output, schema=avro_schema)
    )

# Write schema to json
avro_schema_json = json.dumps(avro_schema)
gcs_client = GcsIO()
with gcs_client.open(
    "gs://big-data-training-julio/test-dataset_schema.json",
    mode='w',
    mime_type="application/json",
) as buffer:
    buffer.write(avro_schema_json.encode("utf-8"))