def pd_to_csv(self, path, df, index=True, header=True, mode='w'):
    local_csv_name = os.path.split(path)[-1]
    if mode == 'a' and self.path_exists(path):
        try:
            # Treat the header (if any) of the existing file as part of the dataset
            original_df = self.pd_read_csv(path, header=None)
            # Ignore the header of df so its columns line up with original_df
            df.columns = original_df.columns
            # Append the new rows to the end of original_df
            df = pd.concat([original_df, df], ignore_index=True)
            # Create a local CSV file with the aggregated data
            df.to_csv(local_csv_name, index=index, header=False, mode='w')
        except pd.errors.EmptyDataError:
            # If the file exists but is empty, appending is equivalent to writing
            df.to_csv(local_csv_name, index=index, header=header, mode='w')
    else:
        # Create a local CSV file
        df.to_csv(local_csv_name, index=index, header=header, mode=mode)

    # Define storage bucket
    mybucket = storage.Bucket(self.bucket_name)
    # Create storage bucket if it does not exist
    if not mybucket.exists():
        mybucket.create()
    # Upload the local CSV file to GCS
    tem_object = mybucket.object(path)
    with open(local_csv_name, 'rb') as f:
        tem_object.write_stream(bytearray(f.read()), 'application/octet-stream')
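# Hedged usage sketch for pd_to_csv() above. The wrapper instance, DataFrames and
# object path are hypothetical placeholders, not part of the original code:
#
#   store = GcsHelper()                       # assumed object that sets self.bucket_name
#   store.pd_to_csv('reports/daily.csv', df)                   # first write
#   store.pd_to_csv('reports/daily.csv', more_rows, mode='a')  # append further rows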
def test_object_deletion_consistency(self):
    b = storage.Bucket(self._test_bucket_name, context=self._context)
    b.create()
    o = b.object('sample')
    o.write_stream('contents', 'text/plain')
    o.delete()
    b.delete()
def UploadNewToEarthEngine():
    # Only the most recent file (3 months ago, due to Earth Engine delays)
    date_min = date.today() - timedelta(days=date.today().day + 1)
    date_min = date_min - timedelta(days=date_min.day + 1)
    date_min = date_min - timedelta(days=date_min.day + 1)
    date_min = date_min - timedelta(days=date_min.day - 1)
    date_min = str(date_min).replace('-', '')
    # Pull filenames from bucket + path
    filenames = [o.key for o in storage.Bucket('soli_ee_data').objects()
                 if o.key.startswith('earthengine/Transformed_assets/')]
    # Iterate over files in bucket
    for filename in filenames:
        # Asset ID, Filename
        xfer_file = filename.split('/')[-1]
        asset_id = xfer_file.split('.')[0]
        # Verify correct file
        if date_min in asset_id:
            print("Asset ID: " + asset_id)
            print("Filename: " + xfer_file)
            # Upload to Earth Engine
            try:
                os.system("earthengine upload image --asset_id=users/nvogler/soli/" + str(asset_id) +
                          " gs://soli_ee_data/earthengine/Transformed_assets/" + str(xfer_file))
            except Exception as e:
                print("Error uploading image to Earth Engine.\n" + str(e))
def deploy(self, name, dag_string):
    # maxsplit=3 so 'gs://<bucket>/<path>' splits into the bucket name and the file path
    bucket_name, file_path = self.gcs_dag_location.split('/', 3)[2:]
    file_name = '{0}{1}.py'.format(file_path, name)
    bucket = storage.Bucket(bucket_name)
    file_object = bucket.object(file_name)
    file_object.write_stream(dag_string, 'text/plain')
def deploy(self, name, dag_string):
    if self._gcs_dag_file_path != '' and not self._gcs_dag_file_path.endswith('/'):
        self._gcs_dag_file_path = self._gcs_dag_file_path + '/'
    file_name = '{0}{1}.py'.format(self._gcs_dag_file_path, name)
    bucket = storage.Bucket(self._gcs_dag_bucket)
    file_object = bucket.object(file_name)
    file_object.write_stream(dag_string, 'text/plain')
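# Hedged usage sketch for the deploy() methods above. The Deployer name, bucket
# and DAG string are hypothetical placeholders, not part of the original code:
#
#   deployer = Deployer()                         # assumed object exposing deploy()
#   deployer._gcs_dag_bucket = 'my-composer-bucket'
#   deployer._gcs_dag_file_path = 'dags'
#   deployer.deploy('hello_world', 'print("dag definition goes here")')
#   # -> writes gs://my-composer-bucket/dags/hello_world.py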
def dir_exists(self, path):
    # Only return True if that path exists and is a directory
    adj_path = os.path.join(path, '')  # make sure adj_path ends with '/'
    filtered_paths = [o.key for o in storage.Bucket(self.bucket_name).objects()
                      if o.key.startswith(adj_path)]
    return len(filtered_paths) != 0
def read_from_storage(object_name, delimiter=','):
    """
    object_name: full path of the file, e.g.
        'raw-data/Cause-of-Death/CDC_cause_of_death_by_demographics_and_state_20180606.xlsx'
    delimiter: based on the file, e.g. ',' (default), '\t', '|'
    """
    global data, uri
    bucket = storage.Bucket('opioid-care')
    data = bucket.object(object_name)
    uri = data.uri
    # Datalab line magic: reads the GCS object into the `data` variable
    get_ipython().run_line_magic('gcs', 'read --object $uri --variable data')
    if object_name.endswith('xlsx') or object_name.endswith('xls'):
        return pd.read_excel(BytesIO(data))
    return pd.read_csv(BytesIO(data), delimiter=delimiter)
def pickle_dump(self, obj, path):
    local_pkl_name = os.path.split(path)[-1]
    # Create a local pickle file
    with open(local_pkl_name, 'wb') as fs:
        pickle.dump(obj, fs)
    # Define storage bucket
    mybucket = storage.Bucket(self.bucket_name)
    # Create storage bucket if it does not exist
    if not mybucket.exists():
        mybucket.create()
    # Write pickle to GCS
    tem_object = mybucket.object(path)
    with open(local_pkl_name, 'rb') as f:
        tem_object.write_stream(bytearray(f.read()), 'application/octet-stream')
def get_gcsbucket(bucket_name):
    """Retrieves a Google Cloud Storage bucket.

    Args:
        bucket_name (str): name of the desired bucket

    Returns:
        bucket object

    Raises:
        RequestException: the service_account has inadequate permissions
        DefaultCredentialsError: environment json file not found, or error parsing the json file
        IsADirectoryError: a directory was supplied instead of a file
        AttributeError: not a valid service_account.Credentials object
    """
    bucket = storage.Bucket(bucket_name)
    return bucket
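# Hedged usage sketch for get_gcsbucket(). The bucket name below is a placeholder;
# listing a few object keys only illustrates that the return value behaves like a
# google.datalab.storage.Bucket.
bucket = get_gcsbucket('my-example-bucket')  # placeholder bucket name
for obj in list(bucket.objects())[:5]:
    print(obj.key)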
def UploadAllToEarthEngine():
    # Pull filenames from bucket + path
    filenames = [o.key for o in storage.Bucket('soli_ee_data').objects()
                 if o.key.startswith('earthengine/Transformed_assets/')]
    # Iterate over files in bucket
    for filename in filenames:
        # Asset ID, Filename
        xfer_file = filename.split('/')[-1]
        asset_id = xfer_file.split('.')[0]
        # Verify not null
        if asset_id != "":
            print("Asset ID: " + asset_id)
            print("Filename: " + xfer_file)
            # Upload to Earth Engine
            try:
                os.system("earthengine upload image --asset_id=users/nvogler/soli/" + str(asset_id) +
                          " gs://soli_ee_data/earthengine/Transformed_assets/" + str(xfer_file))
            except Exception as e:
                print("Error uploading image to Earth Engine.\n" + str(e))
def tearDownClass(cls):
    bucket = storage.Bucket(cls._bucket_name)
    for obj in bucket.objects():
        obj.delete()
    bucket.delete()
def setUpClass(cls):
    cls._bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
    cls._bucket_root = 'gs://%s' % cls._bucket_name
    storage.Bucket(cls._bucket_name).create()
import google.datalab.storage as storage
import pandas as pd
import simplejson as json
import numpy as np
import csv
import re
from io import BytesIO

# ------------------------------ Fetching the file from the Cloud Platform ------------------------------
# jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000
mybucket = storage.Bucket('fca-finance-origin')  # fca-finance
data = mybucket.object('Conciliacao/IF_BR_02_029_GAAP_20190605074552.txt')
# cadastro = storage.Object.read_lines(data, max_lines=100)
cadastro = storage.Object.read_stream(data, start_offset=0, byte_count=None)
# print(cadastro)

# ------------------------------ Formatting the file with replace (;) ------------------------------
tamanho = len(cadastro)
# print('\n' + str(tamanho) + '\n')
i = 1
# Insert ';' so the file can be processed and turned into a single line
val = str(cadastro).replace(
    'ADJCB029__G16420190605190646000177007906007904 ',
    'ADJCB029__G16420190605190646000177007906007904;')
# Insert ';' so the file can be processed and turned into a single line
val2 = val.replace('BRL', 'BRL;')
val3 = val2.replace('\r\n', '')
def pickle_load(self, path):
    mybucket = storage.Bucket(self.bucket_name)
    remote_pickle = mybucket.object(path).read_stream()
    return pickle.load(BytesIO(remote_pickle))
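# Hedged round-trip sketch for pickle_dump() / pickle_load() above. The wrapper
# instance, bucket contents and object path are hypothetical placeholders:
#
#   store = GcsPickleStore()              # assumed object that sets self.bucket_name
#   store.pickle_dump({'alpha': 1}, 'models/params.pkl')
#   params = store.pickle_load('models/params.pkl')
#   assert params == {'alpha': 1}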
KEY_PREFIX = 'BUCKET_NAME_'
DRC_BUCKET_NAME = parameters.DRC_BUCKET_NAME
bucket_keys = [
    key for key in app_env.keys()
    if key.startswith(KEY_PREFIX)
    and not app_env[key].startswith('test')
    and not app_env[key] == DRC_BUCKET_NAME
]
hpo_buckets = dict()
for bucket_key in bucket_keys:
    hpo_id = bucket_key.replace(KEY_PREFIX, '').lower()
    bucket = app_env[bucket_key]
    hpo_buckets[hpo_id] = bucket
drc_bucket = storage.Bucket(name=DRC_BUCKET_NAME)

# +
def hpo_ls(hpo_id, bucket):
    prefix = '%s/%s/' % (hpo_id, bucket)
    objs = list(drc_bucket.objects(prefix))
    return objs


def scan_obj(obj):
    comps = obj.key.split('/')
    if len(comps) != 4:
        return
    hpo_id, bucket, dir_name, file_name = comps
    local_dir = os.path.join(OUTPUT_DIR, hpo_id, bucket, dir_name)
def path_exists(self, path):
    filtered_paths = [o.key for o in storage.Bucket(self.bucket_name).objects()
                      if o.key.startswith(path)]
    return len(filtered_paths) != 0
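# Hedged usage sketch for path_exists() / dir_exists() above; the wrapper
# instance and paths are hypothetical placeholders:
#
#   store = GcsHelper()                      # assumed object that sets self.bucket_name
#   store.path_exists('reports/daily.csv')   # True if any object key starts with the path
#   store.dir_exists('reports')              # True only if keys exist under 'reports/'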
# Ingesting Data from BigQuery or GCS into Datalab or AI Notebooks

# ## GCS Bucket pull
import google.datalab.storage as storage
import pandas as pd
from io import BytesIO

mybucket = storage.Bucket('<bucket>')
data_csv = mybucket.object('<object>')
uri = data_csv.uri
get_ipython().run_line_magic('gcs', 'read --object $uri --variable data')

df = pd.read_csv(BytesIO(data)
                 # , skiprows=81  # if there are headers
                 )
df.head()

# ## BQ pull with SQL
# (Spin up a Datalab VM on Cloud Shell with a command similar to the following:)
# datalab create babyweight --zone us-central1-a --no-create-repository

# Python (will run into a credential issue if not run on a Datalab VM spun up on a project with BQ)
get_ipython().system('pip install --upgrade google-cloud-bigquery')
from google.cloud import bigquery

client = bigquery.Client()
sql = """
def test_local_bigquery_transform(self):
    """Test transform locally, but the data comes from BigQuery."""
    # Make a BQ table, and insert 1 row.
    try:
        bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        bucket_root = 'gs://%s' % bucket_name
        bucket = storage.Bucket(bucket_name)
        bucket.create()

        project_id = dl.Context.default().project_id
        dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
        table_name = 'tmp_table'
        dataset = bq.Dataset((project_id, dataset_name)).create()
        table = bq.Table((project_id, dataset_name, table_name))
        table.create([{'name': 'key_col', 'type': 'INTEGER'},
                      {'name': 'target_col', 'type': 'FLOAT'},
                      {'name': 'cat_col', 'type': 'STRING'},
                      {'name': 'num_col', 'type': 'FLOAT'},
                      {'name': 'img_col', 'type': 'STRING'}])

        img1_file = os.path.join(self.source_dir, 'img1.jpg')
        dest_file = os.path.join(bucket_root, 'img1.jpg')
        file_io.copy(img1_file, dest_file)

        data = [
            {
                'key_col': 1,
                'target_col': 1.0,
                'cat_col': 'Monday',
                'num_col': 23.0,
                'img_col': dest_file,
            },
        ]
        table.insert(data=data)

        cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
               '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
               '--analysis=' + self.analysis_dir,
               '--prefix=features',
               '--project-id=' + project_id,
               '--output=' + self.output_dir]
        print('cmd ', ' '.join(cmd))
        subprocess.check_call(' '.join(cmd), shell=True)

        # Read the tf record file. There should only be one file.
        record_filepath = os.path.join(self.output_dir,
                                       'features-00000-of-00001.tfrecord.gz')
        options = tf.python_io.TFRecordOptions(
            compression_type=tf.python_io.TFRecordCompressionType.GZIP)
        serialized_examples = list(tf.python_io.tf_record_iterator(record_filepath,
                                                                   options=options))
        self.assertEqual(len(serialized_examples), 1)

        example = tf.train.Example()
        example.ParseFromString(serialized_examples[0])
        transformed_number = example.features.feature['num_col'].float_list.value[0]
        self.assertAlmostEqual(transformed_number, 23.0)
        transformed_category = example.features.feature['cat_col'].int64_list.value[0]
        self.assertEqual(transformed_category, 2)
        image_bytes = example.features.feature['img_col'].float_list.value
        self.assertEqual(len(image_bytes), 2048)
        self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
        dataset.delete(delete_contents=True)
        for obj in bucket.objects():
            obj.delete()
        bucket.delete()
def pd_read_csv(self, path, skiprows=None, nrows=None, header='infer'):
    # `header` is passed through to pd.read_csv so callers (e.g. pd_to_csv's append
    # mode) can read the existing header row as data with header=None.
    mybucket = storage.Bucket(self.bucket_name)
    remote_file = mybucket.object(path).read_stream()
    return pd.read_csv(BytesIO(remote_file), skiprows=skiprows, nrows=nrows, header=header)
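# Hedged usage sketch for pd_read_csv() above; the wrapper instance and object
# path are hypothetical placeholders:
#
#   store = GcsHelper()                                        # assumed object that sets self.bucket_name
#   preview = store.pd_read_csv('reports/daily.csv', nrows=10) # read only the first 10 rows
#   full = store.pd_read_csv('reports/daily.csv')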
import subprocess
import google.datalab.storage as storage  # pip install datalab
import os

uganda2k = [o.key for o in storage.Bucket('african-seq-data').objects()
            if o.key.startswith('uganda2k')]

with open('inputSamplesFile.txt', 'w') as f:
    for x in uganda2k:
        if x.endswith(".bam") or x.endswith(".cram"):
            file_path = "gs://african-seq-data/{}".format(x)
            base = os.path.basename(file_path)  # get file name without path
            # Get file name without the .{bam,cram} extension: for the ubam file
            file_no_ext = os.path.splitext(base)[0]
            # Replace '#' with '_' because MergeVCFs in picard has trouble dealing with
            # special characters like '#'; e.g. EGAR00001140731_10256_1#73 becomes
            # EGAR00001140731_10256_1_73
            file_no_ext = file_no_ext.replace("#", "_")
            # Read the sample name (SM tag) from the read-group header; strip the
            # trailing newline from the shell output so the TSV line stays intact
            sample = subprocess.run(
                ("gsutil cat {} | samtools view -H | grep ^@RG | tr '\t' '\n' | "
                 "grep -m1 '^SM:' | cut -d ':' -f 2").format(file_path),
                shell=True, capture_output=True, encoding="utf-8").stdout.strip()
            f.write(file_path + "\t" + file_no_ext + "\t" + sample + "\n")
# Insert dataframe into BigQuery
def df_to_bq(insert_dataframe):
    table_schema = bq.Schema.from_data(insert_dataframe)
    table.create(schema=table_schema, overwrite=True)
    table.insert(insert_dataframe)


# BIG QUERY Upload
# =================================================
# stand alone:
# =====================
sample_bucket_name = 'gds-database'
sample_bucket_path = 'gs://' + sample_bucket_name
sample_bucket_object = sample_bucket_path + '/GDS_annual.txt'
bigquery_dataset_name = 'GDS'
bigquery_table_name = 'GDS2013'

sample_bucket = storage.Bucket(sample_bucket_name)
if not sample_bucket.exists():
    sample_bucket.create()

dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)
if not dataset.exists():
    dataset.create()
    dataset.location = 'EU'

# bq_df = pd.concat(global_dict['amp'], axis=1, ignore_index=False)
# bq_df_cleaned = clean_dataset(GDS2013)
df_to_bq(GDS2013_raw)