import apache_beam as beam
from apache_beam.io.gcp.gcsio import GcsIO


class BatchDeleteGCSFiles(beam.DoFn):
    """
    Batch delete processed objects from Google Cloud Storage.

    To avoid a separate REST API call for every object, we buffer the paths of the objects
    to be deleted. When the buffer reaches its maximum size, we call
    `apache_beam.io.gcp.gcsio.GcsIO.delete_batch` to delete every buffered object from
    Google Cloud Storage.

    Note: Google's batch API allows at most 100 deletes per call, so the buffer is capped at 100.
    """
    def start_bundle(self):
        self.MAX_BUFFER_SIZE = 100
        self._buffer = []
        self.storage_client = GcsIO()

    def process(self, element: str, *args, **kwargs) -> None:
        self._buffer.append(element)
        if len(self._buffer) == self.MAX_BUFFER_SIZE:
            self.storage_client.delete_batch(paths=self._buffer)
            self._buffer.clear()

    def finish_bundle(self):
        if len(self._buffer) > 0:
            self.storage_client.delete_batch(paths=self._buffer)
            self._buffer.clear()
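
A minimal usage sketch (the bucket path and step names below are illustrative, not from the original): the DoFn expects a PCollection of gs:// object paths, such as the keys returned by GcsIO.list_prefix.

import apache_beam as beam
from apache_beam.io.gcp.gcsio import GcsIO

with beam.Pipeline() as pipeline:
    (
        pipeline
        # list_prefix returns a dict mapping object path -> size; keep only the paths
        | "List processed objects" >> beam.Create(
            list(GcsIO().list_prefix("gs://my-bucket/processed/")))
        | "Batch delete" >> beam.ParDo(BatchDeleteGCSFiles())
    )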
Example 2
 def __init__(self, hdf_bucket_path, req_tile_list="*"):
     gcs = GcsIO()
     self._existing = [
         os.path.basename(l)
         for l in list(gcs.list_prefix(hdf_bucket_path).keys())
     ]
     self._required_tiles = req_tile_list
Example 3
def upload_file(gcspath, outpath):
    """Copy the local file at `outpath` to `gcspath` on Google Cloud Storage."""
    from apache_beam.io.gcp.gcsio import GcsIO
    gcs = GcsIO()
    # GcsIO file handles operate on bytes, so read the local file in binary mode.
    with gcs.open(gcspath, "w") as w, open(outpath, "rb") as r:
        w.write(r.read())
Example 4
def download_file(gcspath, inpath):
    """Copy the object at `gcspath` from Google Cloud Storage to the local file `inpath`."""
    from apache_beam.io.gcp.gcsio import GcsIO
    gcs = GcsIO()
    # GcsIO reads return bytes, so write the local file in binary mode.
    with gcs.open(gcspath, "r") as r, open(inpath, "wb") as w:
        w.write(r.read())
Example 5
 def upload_file(self, path, out_path):
     """Copy the local file at `out_path` to `path` in the configured bucket."""
     from apache_beam.io.gcp.gcsio import GcsIO
     gcs = GcsIO()
     # GcsIO file handles operate on bytes, so read the local file in binary mode.
     with gcs.open("gs://" + BUCKET_NAME + "/" + path, "w") as w, \
             open(out_path, "rb") as r:
         w.write(r.read())
Example 6
def load(path):
    """
    Receives an image path and returns a dictionary containing
    the image path and a resized version of the image as a np.array.
    """
    buf = GcsIO().open(path, mime_type="image/jpeg")
    img = Image.open(io.BytesIO(buf.read()))
    # Image.ANTIALIAS was removed in newer Pillow releases; LANCZOS is the same filter.
    img = img.resize((IMAGE_HEIGHT, IMAGE_WIDTH), Image.LANCZOS)
    return {"path": path, "image": np.array(img)}
Example 7
 def __init__(self, bucketpath, layertemplate, varname, tiles="*"):
     #import os
     #from apache_beam.io.gcp.gcsio import GcsIO
     self._bucketproductpath = bucketpath
     gcs = GcsIO()
     # list the objects already present under the given bucket path
     self._existing = list(gcs.list_prefix(bucketpath).keys())
     self._layertemplate = layertemplate
     self._tiles = tiles
     self._varname = varname
 def localise_day_files(self, day):
     files = self.filter_to_required_files(self.get_tilenames_for_day(day))
     tempfolder = self.get_tmp_folder_for_day(day)
     localpaths = []
     gcs = GcsIO()
     if not os.path.isdir(tempfolder):
         os.makedirs(tempfolder)
     for f in files:
         localname = os.path.join(tempfolder, os.path.basename(f))
         if not os.path.exists(localname):
             # it might have already been copied if say day and night files, which come
             # from the same HDFs, are being made on the same worker
             with gcs.open(f) as gcsfile, open(localname, 'wb') as localfile:
                 localfile.write(gcsfile.read())
         localpaths.append(localname)
     return (day, localpaths)
Example 9
 def download_file(self, url):
     #import requests, tempfile, os
     #from apache_beam.io.gcp.gcsio import GcsIO
     req = self._session.request('get', url)
     resp = self._session.get(req.url, stream=True)
     product, datestr, fname = url.split('/')[-3:]
     bucketfilename = '/'.join(
         [self._hdf_bucket_path, product, datestr, fname])
     gcs = GcsIO()
     with gcs.open(bucketfilename, 'w') as fp:
         # with open(tempfilename, 'wb') as fp:
         for chunk in resp.iter_content(chunk_size=self._chunk_size):
             if chunk:
                 fp.write(chunk)
         fp.flush()
         # os.fsync(fp)
     return bucketfilename
Example 10
def test_pubsub_to_gcs():
    PubSubToGCS.run(
        input_topic="unused",  # mocked by TestStream
        output_path=f"gs://{BUCKET}/pubsub/{UUID}/output",
        window_size=1,  # 1 minute
        num_shards=1,
        pipeline_args=[
            "--project",
            PROJECT,
            "--temp_location",
            TempDir().get_path(),
        ],
    )

    # Check for output files on GCS.
    gcs_client = GcsIO()
    files = gcs_client.list_prefix(f"gs://{BUCKET}/pubsub/{UUID}")
    assert len(files) > 0

    # Clean up.
    gcs_client.delete_batch(list(files))
Example 12
        # Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=Big Data Training',
        # Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://big-data-training-julio/staging',
        # Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://big-data-training-julio/temp',
        '--job_name=parse-avro',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    avro_schema = get_schema(input_file)

    # Debug
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(get_schema(input_file))

    # Execute pipeline
    with Pipeline(options=pipeline_options) as p:
        (
            p
            | 'Read Avro' >> ReadFromAvro(known_args.input)
            # WriteToAvro expects an Avro schema, so reuse the one read from the input file
            | 'Write Avro to GCS' >> WriteToAvro(known_args.output, schema=avro_schema)
        )

    # Write the schema to a JSON file on GCS
    avro_schema_json = json.dumps(avro_schema)
    gcs_client = GcsIO()
    with gcs_client.open("gs://big-data-training-julio/test-dataset_schema.json", mode='w', mime_type="application/json") as buffer:
        buffer.write(avro_schema_json.encode("utf-8"))
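
The snippet above begins partway through building pipeline_args; a hedged sketch of how such an argument list is commonly assembled with argparse (option names and the runner flag are illustrative, not taken from the original):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', help='Input Avro file or pattern.')
parser.add_argument('--output', dest='output', help='Output path prefix for the Avro files.')
known_args, pipeline_args = parser.parse_known_args()
pipeline_args.extend([
    '--runner=DataflowRunner',
    # ... plus the project, staging_location, temp_location and job_name flags shown above
])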