Example #1
def upload_file(gcspath, outpath):
    from apache_beam.io.gcp.gcsio import GcsIO
    gcs = GcsIO()
    # GcsIO file objects work on bytes, so read the local source file in binary mode.
    w = gcs.open(gcspath, "w")
    r = open(outpath, "rb")
    w.write(r.read())
    r.close()
    w.close()
Example #2
def download_file(gcspath, inpath):
    from apache_beam.io.gcp.gcsio import GcsIO
    gcs = GcsIO()
    r = gcs.open(gcspath, "r")
    # GcsIO returns bytes, so write the local copy in binary mode.
    w = open(inpath, "wb")
    w.write(r.read())
    w.close()
    r.close()
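Both helpers take a full gs:// object URI and a local path. A minimal usage sketch, with a hypothetical bucket and file names used purely for illustration:

# Hypothetical bucket and file names, for illustration only.
upload_file("gs://my-example-bucket/data/input.csv", "/tmp/input.csv")
download_file("gs://my-example-bucket/data/input.csv", "/tmp/input_copy.csv")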
Example #3
def upload_file(self, path, out_path):
    from apache_beam.io.gcp.gcsio import GcsIO
    gcs = GcsIO()
    # BUCKET_NAME is a constant defined elsewhere in the original module.
    w = gcs.open("gs://" + BUCKET_NAME + "/" + path, "w")
    # Read the local source in binary mode; GcsIO write streams expect bytes.
    r = open(out_path, "rb")
    w.write(r.read())
    r.close()
    w.close()
Example #4
def localise_day_files(self, day):
    # GcsIO and os are imported at module level in the original source.
    files = self.filter_to_required_files(self.get_tilenames_for_day(day))
    tempfolder = self.get_tmp_folder_for_day(day)
    localpaths = []
    gcs = GcsIO()
    if not os.path.isdir(tempfolder):
        os.makedirs(tempfolder)
    for f in files:
        localname = os.path.join(tempfolder, os.path.basename(f))
        if not os.path.exists(localname):
            # It might already have been copied if, say, day and night files,
            # which come from the same HDFs, are being made on the same worker.
            with gcs.open(f) as gcsfile, open(localname, 'wb') as localfile:
                localfile.write(gcsfile.read())
        localpaths.append(localname)
    return (day, localpaths)
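The with-statement copy used here is the more idiomatic form of the open/read/close pattern in the earlier examples. A minimal standalone sketch of the same idea, with a hypothetical source URI and destination path:

from apache_beam.io.gcp.gcsio import GcsIO

def copy_gcs_to_local(gcs_uri, local_path):
    # Copy a single GCS object to a local file; both handles are closed
    # automatically by the context managers, even if an error is raised.
    gcs = GcsIO()
    with gcs.open(gcs_uri) as src, open(local_path, "wb") as dst:
        dst.write(src.read())

# Hypothetical paths, for illustration only.
copy_gcs_to_local("gs://my-example-bucket/tiles/h17v07.hdf", "/tmp/h17v07.hdf")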
Example #5
def download_file(self, url):
    from apache_beam.io.gcp.gcsio import GcsIO
    # The first request resolves the final URL (after any redirects);
    # the second streams the actual content.
    req = self._session.request('get', url)
    resp = self._session.get(req.url, stream=True)
    product, datestr, fname = url.split('/')[-3:]
    bucketfilename = '/'.join(
        [self._hdf_bucket_path, product, datestr, fname])
    gcs = GcsIO()
    # Stream the response into the GCS object chunk by chunk instead of
    # buffering the whole file in memory.
    with gcs.open(bucketfilename, 'w') as fp:
        for chunk in resp.iter_content(chunk_size=self._chunk_size):
            if chunk:
                fp.write(chunk)
        fp.flush()
    return bucketfilename
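This method depends on class attributes (self._session, self._hdf_bucket_path, self._chunk_size) that are not shown in the snippet. A self-contained sketch of the same streaming pattern, assuming the requests library and a hypothetical URL and bucket path:

import requests
from apache_beam.io.gcp.gcsio import GcsIO

def stream_url_to_gcs(url, gcs_uri, chunk_size=1024 * 1024):
    # Hypothetical helper: stream an HTTP download straight into a GCS object
    # without writing a temporary file on the worker.
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    gcs = GcsIO()
    with gcs.open(gcs_uri, 'w') as fp:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            if chunk:  # skip keep-alive chunks
                fp.write(chunk)
    return gcs_uri

# Hypothetical arguments, for illustration only:
# stream_url_to_gcs("https://example.com/MOD11A1.hdf", "gs://my-example-bucket/hdf/MOD11A1.hdf")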
Example #6
        # Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=Big Data Training',
        # Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://big-data-training-julio/staging',
        # Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://big-data-training-julio/temp',
        '--job_name=parse-avro',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    avro_schema = get_schema(input_file)

    # Debug
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(get_schema(input_file))

    # Execute the pipeline.
    with Pipeline(options=pipeline_options) as p:
        (
            p
            | 'Read Avro' >> ReadFromAvro(known_args.input)
            # WriteToAvro expects an actual Avro schema, so reuse the one
            # extracted from the input file above.
            | 'Write Avro to GCS' >> WriteToAvro(known_args.output, schema=avro_schema)
        )

    # Write the schema to a JSON file in GCS.
    avro_schema_json = json.dumps(avro_schema)
    gcs_client = GcsIO()
    with gcs_client.open("gs://big-data-training-julio/test-dataset_schema.json",
                         mode='w', mime_type="application/json") as buffer:
        buffer.write(avro_schema_json.encode("utf-8"))
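The schema can be read back from GCS with the same client; a minimal sketch, assuming the object written above:

    # Read the schema JSON back; GcsIO returns bytes, so decode before parsing.
    with gcs_client.open("gs://big-data-training-julio/test-dataset_schema.json") as buffer:
        restored_schema = json.loads(buffer.read().decode("utf-8"))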