Example #1
    def pd_to_csv(self, path, df, index=True, header=True, mode='w'):
        local_csv_name = os.path.split(path)[-1]
        if mode == 'a' and self.path_exists(path):
            try:
                original_df = self.pd_read_csv(
                    path, header=None
                )  # treat the header (if any) as part of dataset
                df.columns = original_df.columns  # to ignore the header of df
                df = pd.concat(
                    [original_df, df],
                    ignore_index=True)  # append to the end of original_df

                # Create a local CSV file with the aggregated data
                df.to_csv(local_csv_name, index=index, header=False, mode='w')
            except pd.errors.EmptyDataError:
                # if the file exists but is empty, appending is equivalent to writing
                df.to_csv(local_csv_name, index=index, header=header, mode='w')
        else:
            # Create a local CSV file
            df.to_csv(local_csv_name, index=index, header=header, mode=mode)

        # Define storage bucket
        mybucket = storage.Bucket(self.bucket_name)
        # Create storage bucket if it does not exist
        if not mybucket.exists():
            mybucket.create()

        # Write the CSV to GCS
        tem_object = mybucket.object(path)
        with open(local_csv_name, 'rb') as f:
            tem_object.write_stream(bytearray(f.read()),
                                    'application/octet-stream')
Example #2
 def test_object_deletion_consistency(self):
     b = storage.Bucket(self._test_bucket_name, context=self._context)
     b.create()
     o = b.object('sample')
     o.write_stream('contents', 'text/plain')
     o.delete()
     b.delete()
Example #3
def UploadNewToEarthEngine():
  # Only the most recent file (3 months ago due to Earth Engine delays):
  # the chained subtractions below step back to the first day of the month three months ago
  date_min = date.today() - timedelta(days=date.today().day + 1)
  date_min = date_min - timedelta(days=date_min.day + 1)
  date_min = date_min - timedelta(days=date_min.day + 1)
  date_min = date_min - timedelta(days=date_min.day - 1)
  date_min = str(date_min).replace('-', '')
  
  # Pull filenames from bucket + path
  filenames = [o.key for o in storage.Bucket('soli_ee_data').objects()
    if o.key.startswith('earthengine/Transformed_assets/')]

  # Iterate files in bucket
  for filename in filenames:
    # Asset ID, Filename
    xfer_file = filename.split('/')[-1]
    asset_id = xfer_file.split('.')[0]
    # Verify correct file
    if date_min in asset_id:
      print ("Asset ID: " + asset_id)
      print ("Filename: " + xfer_file)
      # Upload to Earth Engine
      try:
        os.system("earthengine upload image --asset_id=users/nvogler/soli/'$asset_id' gs://soli_ee_data/earthengine/Transformed_assets/'$xfer_file")
      except(e):
        print ("Error uploading image to Earth Engine.\n" + str(e))
Example #4
  def deploy(self, name, dag_string):
    bucket_name, file_path = self.gcs_dag_location.split('/', 3)[2:]  # setting maxsplit to 3
    file_name = '{0}{1}.py'.format(file_path, name)

    bucket = storage.Bucket(bucket_name)
    file_object = bucket.object(file_name)
    file_object.write_stream(dag_string, 'text/plain')
Example #5
  def deploy(self, name, dag_string):
    if self._gcs_dag_file_path != '' and not self._gcs_dag_file_path.endswith('/'):
      self._gcs_dag_file_path = self._gcs_dag_file_path + '/'
    file_name = '{0}{1}.py'.format(self._gcs_dag_file_path, name)

    bucket = storage.Bucket(self._gcs_dag_bucket)
    file_object = bucket.object(file_name)
    file_object.write_stream(dag_string, 'text/plain')
Example #6
 def dir_exists(self, path):
     # only return True if that path exists and is a directory
     adj_path = os.path.join(
         path, '')  # ensure adj_path ends with '/'
     filtered_paths = [
         o.key for o in storage.Bucket(self.bucket_name).objects()
         if o.key.startswith(adj_path)
     ]
     return len(filtered_paths) != 0
Example #7
def read_from_storage(object_name, delimiter=','):
  """
  object_name: full path of the file, e.g. 'raw-data/Cause-of-Death/CDC_cause_of_death_by_demographics_and_state_20180606.xlsx'
  delimiter: field delimiter for the file, e.g. ',' (default), '\t', or '|'
  """
  global data, uri
  bucket = storage.Bucket('opioid-care')
  data = bucket.object(object_name)
  uri = data.uri
  %gcs read --object $uri --variable data
  # assumes pandas is imported as pd elsewhere in the notebook
  if object_name.endswith('xlsx') or object_name.endswith('xls'):
    return pd.read_excel(BytesIO(data))
  return pd.read_csv(BytesIO(data), delimiter=delimiter)
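A usage sketch for the helper above, reusing the path from its docstring (it assumes the cell runs inside Datalab so the %gcs magic is available, and that an Excel engine such as openpyxl or xlrd is installed):

df = read_from_storage('raw-data/Cause-of-Death/CDC_cause_of_death_by_demographics_and_state_20180606.xlsx')
df.head()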
Example #8
    def pickle_dump(self, obj, path):
        local_pkl_name = os.path.split(path)[-1]
        # Create a local pickle file
        with open(local_pkl_name, 'wb') as fs:
            pickle.dump(obj, fs)

        # Define storage bucket
        mybucket = storage.Bucket(self.bucket_name)
        # Create storage bucket if it does not exist
        if not mybucket.exists():
            mybucket.create()

        # Write pickle to GCS
        tem_object = mybucket.object(path)
        with open(local_pkl_name, 'rb') as f:
            tem_object.write_stream(bytearray(f.read()),
                                    'application/octet-stream')
Example #9
def get_gcsbucket(bucket_name):
    """Retrieves Google Cloud storage bucket

    Args:
        bucket_name (str): name of desired bucket

    Returns:
        bucket object

    Raises:
        RequestException: The service_account has inadequate permissions
        DefaultCredentialsError: Environment json file not found or
            Error parsing Json File
        IsADirectoryError: When directory supplied instead of file
        AttributeError: Not a valid service_account.Credentials object
    """
    bucket = storage.Bucket(bucket_name)
    return bucket
Example #10
def UploadAllToEarthEngine():
  # Pull filenames from bucket + path
  filenames = [o.key for o in storage.Bucket('soli_ee_data').objects()
    if o.key.startswith('earthengine/Transformed_assets/')]

  # Iterate files in bucket
  for filename in filenames:
    # Asset ID, Filename
    xfer_file = filename.split('/')[-1]
    asset_id = xfer_file.split('.')[0]
    # Verify not null
    if asset_id != "":
      print ("Asset ID: " + asset_id)
      print ("Filename: " + xfer_file)
      # Upload to Earth Engine
      try:
        os.system("earthengine upload image --asset_id=users/nvogler/soli/" + str(asset_id) + "gs://soli_ee_data/earthengine/Transformed_assets/" + str(xfer_file))
      except(e):
        print ("Error uploading image to Earth Engine.\n" + str(e))
Example #11
 def tearDownClass(cls):
   bucket = storage.Bucket(cls._bucket_name)
   for obj in bucket.objects():
     obj.delete()
   bucket.delete()
Example #12
 def setUpClass(cls):
   cls._bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
   cls._bucket_root = 'gs://%s' % cls._bucket_name
   storage.Bucket(cls._bucket_name).create()
Example #13
import google.datalab.storage as storage
import pandas as pd
import simplejson as json
import numpy as np
import csv
import re
from io import BytesIO

# -------------------------------------------------- Fetching the file from the Cloud Platform --------------------------------------------------
# jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000
mybucket = storage.Bucket('fca-finance-origin')  #fca-finance
data = mybucket.object('Conciliacao/IF_BR_02_029_GAAP_20190605074552.txt')

##cadastro = storage.Object.read_lines(data,max_lines=100)
cadastro = storage.Object.read_stream(data, start_offset=0, byte_count=None)

#print (cadastro)

# -------------------------------------------------- Formatting the file with replace (;) --------------------------------------------------
tamanho = len(cadastro)
#print '\n'+ str(tamanho) + '\n'
i = 1

val = str(cadastro).replace(
    'ADJCB029__G16420190605190646000177007906007904          ',
    'ADJCB029__G16420190605190646000177007906007904;'
)  # Adding a ';' so the file can be processed into a single line
val2 = val.replace(
    'BRL', 'BRL;'
)  # Adding a ';' so the file can be processed into a single line
val3 = val2.replace('\r\n', '')
Example #14
 def pickle_load(self, path):
     mybucket = storage.Bucket(self.bucket_name)
     remote_pickle = mybucket.object(path).read_stream()
     return pickle.load(BytesIO(remote_pickle))
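Examples #8 and #14 form a round trip through GCS; a minimal usage sketch, again assuming a hypothetical wrapper instance `gcs` that exposes both methods:

payload = {'model': 'v1', 'scores': [0.1, 0.9]}
gcs.pickle_dump(payload, 'artifacts/payload.pkl')    # write a local pickle and upload it to the bucket (Example #8)
restored = gcs.pickle_load('artifacts/payload.pkl')  # stream the object back and unpickle it (Example #14)
assert restored == payload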
Example #15
KEY_PREFIX = 'BUCKET_NAME_'
DRC_BUCKET_NAME = parameters.DRC_BUCKET_NAME

bucket_keys = [
    key for key in app_env.keys()
    if key.startswith(KEY_PREFIX) and not app_env[key].startswith('test')
    and not app_env[key] == DRC_BUCKET_NAME
]

hpo_buckets = dict()
for bucket_key in bucket_keys:
    hpo_id = bucket_key.replace(KEY_PREFIX, '').lower()
    bucket = app_env[bucket_key]
    hpo_buckets[hpo_id] = bucket

drc_bucket = storage.Bucket(name=DRC_BUCKET_NAME)


# +
def hpo_ls(hpo_id, bucket):
    prefix = '%s/%s/' % (hpo_id, bucket)
    objs = list(drc_bucket.objects(prefix))
    return objs


def scan_obj(obj):
    comps = obj.key.split('/')
    if len(comps) != 4:
        return
    hpo_id, bucket, dir_name, file_name = comps
    local_dir = os.path.join(OUTPUT_DIR, hpo_id, bucket, dir_name)
Example #16
 def path_exists(self, path):
     filtered_paths = [
         o.key for o in storage.Bucket(self.bucket_name).objects()
         if o.key.startswith(path)
     ]
     return len(filtered_paths) != 0
Example #17
# Ingesting data from BigQuery or GCS into Datalab or AI Notebooks
## GCS Bucket pull

import google.datalab.storage as storage
import pandas as pd
from io import BytesIO

mybucket = storage.Bucket('<bucket>')
data_csv = mybucket.object('<object>')

uri = data_csv.uri
get_ipython().run_line_magic('gcs', 'read --object $uri --variable data')

df = pd.read_csv(BytesIO(data)
                 #,skiprows=81 #If there are headers
                 )
df.head()

## BQ pull with SQL

# (Spin up Datalab VM on Cloud Shell with a command similar to the following:)
#datalab create babyweight --zone us-central1-a --no-create-repository

# Python (this will hit a credential issue unless run on a Datalab VM spun up in a project with BigQuery access)
get_ipython().system('pip install --upgrade google-cloud-bigquery')

from google.cloud import bigquery

client = bigquery.Client()

sql = """
Example #18
  def test_local_bigquery_transform(self):
    """Test transfrom locally, but the data comes from bigquery."""

    # Make a BQ table, and insert 1 row.
    try:
      bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
      bucket_root = 'gs://%s' % bucket_name
      bucket = storage.Bucket(bucket_name)
      bucket.create()

      project_id = dl.Context.default().project_id

      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'key_col', 'type': 'INTEGER'},
                    {'name': 'target_col', 'type': 'FLOAT'},
                    {'name': 'cat_col', 'type': 'STRING'},
                    {'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])

      img1_file = os.path.join(self.source_dir, 'img1.jpg')
      dest_file = os.path.join(bucket_root, 'img1.jpg')
      file_io.copy(img1_file, dest_file)

      data = [
          {
           'key_col': 1,
           'target_col': 1.0,
           'cat_col': 'Monday',
           'num_col': 23.0,
           'img_col': dest_file,
          },
      ]
      table.insert(data=data)

      cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
             '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analysis=' + self.analysis_dir,
             '--prefix=features',
             '--project-id=' + project_id,
             '--output=' + self.output_dir]
      print('cmd ', ' '.join(cmd))
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(self.output_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_examples = list(tf.python_io.tf_record_iterator(record_filepath, options=options))
      self.assertEqual(len(serialized_examples), 1)

      example = tf.train.Example()
      example.ParseFromString(serialized_examples[0])

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 23.0)
      transformed_category = example.features.feature['cat_col'].int64_list.value[0]
      self.assertEqual(transformed_category, 2)
      image_bytes = example.features.feature['img_col'].float_list.value
      self.assertEqual(len(image_bytes), 2048)
      self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
      dataset.delete(delete_contents=True)

      for obj in bucket.objects():
        obj.delete()
      bucket.delete()
Example #19
 def pd_read_csv(self, path, skiprows=None, nrows=None):
     mybucket = storage.Bucket(self.bucket_name)
     remote_file = mybucket.object(path).read_stream()
     return pd.read_csv(BytesIO(remote_file),
                        skiprows=skiprows,
                        nrows=nrows)
Example #20
import subprocess
import google.datalab.storage as storage  # pip install datalab
import os

uganda2k = [
    o.key for o in storage.Bucket('african-seq-data').objects()
    if o.key.startswith('uganda2k')
]

with open('inputSamplesFile.txt', 'w') as f:
    for x in uganda2k:
        if x.endswith(".bam") or x.endswith(".cram"):
            file_path = "gs://african-seq-data/{}".format(x)
            base = os.path.basename(file_path)  # get file name without path
            # get file name without .{bam,cram} extension: for ubam file
            file_no_ext = os.path.splitext(base)[0]
            file_no_ext = file_no_ext.replace(
                "#", "_"
            )  # replace '#' with '_' because MergeVCFs in Picard has trouble dealing with special characters like '#'
            # e.g. before this change, one of the file names was EGAR00001140731_10256_1#73; it becomes EGAR00001140731_10256_1_73
            sample = subprocess.run((
                "gsutil cat {} | samtools view -H | grep ^@RG | tr '\t' '\n' | grep -m1 '^SM:' | cut -d ':' -f 2"
            ).format(file_path),
                                    shell=True,
                                    capture_output=True,
                                    encoding="utf-8").stdout
            f.write(file_path + "\t" + file_no_ext + "\t" + sample + "\n")
Example #21
# Insert a DataFrame into BigQuery
def df_to_bq(insert_dataframe):
    table_schema = bq.Schema.from_data(insert_dataframe)
    table.create(schema = table_schema, overwrite = True)
    table.insert(insert_dataframe)
   
#BIG QUERY Upload
#=================================================
#stand alone:
#=====================
sample_bucket_name='gds-database'
sample_bucket_path = 'gs://' + sample_bucket_name
sample_bucket_object = sample_bucket_path + '/GDS_annual.txt'
bigquery_dataset_name = 'GDS'
bigquery_table_name = 'GDS2013'
sample_bucket = storage.Bucket(sample_bucket_name)
if not sample_bucket.exists():
    sample_bucket.create()
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)
if not dataset.exists():
    dataset.create()
dataset.location = 'EU'
    
#bq_df=pd.concat(global_dict['amp'], axis=1, ignore_index=False)
#bq_df_cleaned=clean_dataset(GDS2013)
df_to_bq(GDS2013_raw)