def __init__(self, cfg_file="config.yml"):
    self.cfg = get_cfg(cfg_file)
    self.method = self.cfg['backend']
    # Set up the cloud backend if needed
    if self.method == 'gcp':
        project = self.cfg['data_gcp']['project']
        bucket_name = self.cfg['data_gcp']['bucket']
        if self.cfg['gcp_local_auth'] == 1:
            # Running locally: authenticate with a service-account JSON key
            gs_token = self.cfg['data_gcp']['json_key']
            self.gcs_fs = gcsfs.GCSFileSystem(project=project, token=gs_token)
            self.storage_client = storage.Client.from_service_account_json(gs_token)
            self.bucket = self.storage_client.get_bucket(bucket_name)  # creates the bucket object
        else:
            # Running on native GCP: use the default credentials
            self.storage_client = storage.Client()
            self.bucket = self.storage_client.get_bucket(bucket_name)  # creates the bucket object
            self.gcs_fs = gcsfs.GCSFileSystem(project=project)
        # Manual way of connecting to GCS:
        # blob = bucket.blob(bucket_folder + file)
    self.dfLoc = None  # placeholder for existing data; nothing is loaded yet
    self.dfNew = None  # placeholder for new data coming in
def get_gcs_root(project):
    """Get object map of a root GCS bucket"""
    fs = gcsfs.GCSFileSystem(project=project, token='cache')
    token = fs.session.credentials
    gcsfs_root = gcsfs.GCSFileSystem(project=project, token=token)
    return gcsfs_root
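# --- Usage sketch (not part of the original source): a minimal example of calling
# get_gcs_root, assuming a hypothetical project id and bucket name. The returned
# filesystem reuses the cached token, so it can be handed to processes that cannot
# complete an interactive auth flow.
def _example_get_gcs_root():
    fs_root = get_gcs_root("example-project")   # hypothetical project id
    print(fs_root.ls("example-bucket"))         # hypothetical bucket name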
def get_fs_and_path(url: str, token=None, public=True) -> Tuple[fsspec.AbstractFileSystem, str]:
    if url.startswith("s3://"):
        token = token or dict()
        token = read_aws_creds(token) if isinstance(token, str) else token
        return (
            S3FileSystemReplacement(
                key=token.get("aws_access_key_id"),
                secret=token.get("aws_secret_access_key"),
                token=token.get("aws_session_token"),
                client_kwargs={
                    "endpoint_url": token.get("endpoint_url"),
                    "region_name": token.get("region"),
                },
            ),
            url[5:],
        )
    elif url.startswith("gcs://"):
        return gcsfs.GCSFileSystem(token=token), url[6:]
    elif url.find("blob.core.windows.net/") != -1:
        account_name = url.split(".")[0]
        account_name = account_name[8:] if url.startswith("https://") else account_name
        return (
            AzureBlobFileSystem(
                account_name=account_name,
                account_key=token.get("account_key"),
            ),
            url[url.find("blob.core.windows.net/") + 22:],
        )
    elif (url.startswith("../") or url.startswith("./")
          or url.startswith("/") or url.startswith("~/")):
        return fsspec.filesystem("file"), url
    elif re.search("^[A-Za-z]:", url):  # windows local file system
        return fsspec.filesystem("file"), url
    else:
        # TODO: check if url is username/dataset:version
        if url.split("/")[0] == "google":
            org_id, ds_name = url.split("/")
            token, url = HubControlClient().get_dataset_credentials(org_id, ds_name)
            fs = gcsfs.GCSFileSystem(token=token)
            url = url[6:]
        else:
            url, creds = _connect(url, public=public)
            fs = S3FileSystemReplacement(
                expiration=creds["expiration"],
                key=creds["access_key"],
                secret=creds["secret_key"],
                token=creds["session_token"],
                client_kwargs={
                    "endpoint_url": creds["endpoint"],
                    "region_name": creds["region"],
                },
            )
        return (fs, url)
def get_temp_filepath(self):
    if self.backend == 'POSIX':
        self.temp_dir = tempfile.mkdtemp()
        self.dir_store = os.path.join(
            self.temp_dir, 'temp-%s%s' % (next(_counter), self.suffix))
        # Saving dask objects as Zarr requires more than just a filehandle
        if not self.dask:
            self.storage_obj = self.dir_store
        else:
            self.storage_obj = zarr.create(shape=self.shape,
                                           chunks=self.chunksize,
                                           store=self.dir_store,
                                           dtype=self.dtype,
                                           overwrite=True)
    elif self.backend == 'GCS':
        if not self.gcs_zarr:
            raise NotImplementedError("Missing config for GCP test")
        # HACK in order to give worker pods read/write to storage
        fs = gcsfs.GCSFileSystem(project=self.gcp_project_name, token='cache')
        token = fs.session.credentials
        self.gcp_project = gcsfs.GCSFileSystem(project=self.gcp_project_name,
                                               token=token)
        self.gcsfsmap = gcsfs.mapping.GCSMap(self.gcs_zarr,
                                             gcs=self.gcp_project,
                                             check=True,
                                             create=False)
        if not self.dask:
            gsutil_arg = "gs://%s" % self.gcs_zarr
            call(["gsutil", "-q", "-m", "rm", "-r", gsutil_arg])
            self.storage_obj = self.gcsfsmap
        else:
            self.storage_obj = zarr.create(shape=self.shape,
                                           chunks=self.chunksize,
                                           store=self.gcsfsmap,
                                           dtype=self.dtype,
                                           overwrite=True)
    elif self.backend == 'FUSE':
        if not self.gcs_zarr_fuse:
            raise NotImplementedError("Missing config for FUSE test")
        self.temp_dir = tempfile.mkdtemp()
        self.dir_store = self.temp_dir + self.gcs_zarr_fuse
        call([GCSFUSE, self.gcs_bucket, self.temp_dir])
        # Remove previous test runs
        if os.path.exists(self.dir_store):
            shutil.rmtree(self.dir_store)
        os.makedirs(self.dir_store)
        # Return the path if this isn't Dask
        # TODO: This should be a function
        if not self.dask:
            self.storage_obj = self.dir_store
        else:
            self.storage_obj = zarr.create(shape=self.shape,
                                           chunks=self.chunksize,
                                           store=self.dir_store,
                                           dtype=self.dtype,
                                           overwrite=True)
    else:
        raise NotImplementedError("Storage backend not implemented.")
def __init__(self, project=None):
    super(CloudService, self).__init__()
    self.project = project
    if project is not None:
        self.client = bigquery.Client(project=project)
        self.storage_client = storage.Client(project=project)
        self.fs = gcsfs.GCSFileSystem(project=project)
    else:
        self.client = bigquery.Client()
        self.storage_client = storage.Client()
        self.fs = gcsfs.GCSFileSystem()
def create_gcsfs(bucket_id=None, token_loc=None):
    """
    Create a GCSFileSystem (Google Cloud Storage File System), given the
    'bucket_id' and 'token_loc' parameters. If either is omitted, the default
    configuration is used.
    """
    if bucket_id is None:
        bucket_id = GLOBAL_BUCKET_ID
    if token_loc is None:
        return gcsfs.GCSFileSystem(
            bucket_id,
            token=f'{HOME_ENV}/gcloud/application_default_credentials.json')
    return gcsfs.GCSFileSystem(bucket_id, token=token_loc)
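# --- Usage sketch (not part of the original source): assuming GLOBAL_BUCKET_ID and
# HOME_ENV are defined elsewhere in this module, create a filesystem with the
# default application credentials and open a hypothetical CSV object from the bucket.
def _example_create_gcsfs():
    fs = create_gcsfs()  # falls back to GLOBAL_BUCKET_ID and the default credentials file
    with fs.open(f"{GLOBAL_BUCKET_ID}/data/example.csv") as f:  # hypothetical object path
        print(f.readline())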
def read_parquet(uri):
    parsed_uri = urlparse(uri)
    if parsed_uri.scheme == "file":
        return pd.read_parquet(parsed_uri.path)
    elif parsed_uri.scheme == "gs":
        fs = gcsfs.GCSFileSystem()
        files = ["gs://" + path for path in fs.glob(uri + "/part-*")]
        ds = parquet.ParquetDataset(files, filesystem=fs)
        return ds.read().to_pandas()
    else:
        raise ValueError("Unsupported scheme")
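# --- Usage sketch (not part of the original source): the URIs below are hypothetical.
# read_parquet dispatches on the URI scheme, so a local parquet file and a GCS
# "directory" of part-* files are handled through the same call site.
def _example_read_parquet():
    local_df = read_parquet("file:///tmp/example.parquet")   # hypothetical local file
    remote_df = read_parquet("gs://example-bucket/table")    # hypothetical GCS dataset of part-* files
    return local_df, remote_df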
def multi_open(
    filename,
    mode,
    use_gcs=True,
    use_http=True,
    use_file=True,
    use_gzip=True,
    token=None,
    **kwargs,
):
    if use_gcs and (filename.startswith("gcs://") or filename.startswith("gc://")):
        token = token or os.environ.get("FTM_PREDICT_GCS_TOKEN")
        logging.debug(f"Using GCSFS to open file: {filename}:{mode}")
        fs = gcsfs.GCSFileSystem(token=token)
        return fs.open(filename, mode=mode, **kwargs)
    elif use_http and (filename.startswith("http://") or filename.startswith("https://")):
        if not mode.startswith("r"):
            raise ValueError("HTTP File-type only supports read modes")
        kwargs.setdefault("method", "GET")
        kwargs.setdefault("url", filename)
        kwargs["stream"] = True
        response = requests.request(**kwargs)
        return response.raw
    elif use_gzip and (filename.endswith(".gz") or filename.endswith(".gzip")):
        logging.debug(f"Using GZIP to open file: {filename}:{mode}")
        return gzip.open(filename, mode, **kwargs)
    elif use_file:
        return open(filename, mode, **kwargs)
    raise ValueError(f"Unable to open file: {filename}:{mode}")
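# --- Usage sketch (not part of the original source): the paths below are hypothetical.
# multi_open picks a backend by URL prefix or file extension, so one call site can
# read from GCS, from gzip-compressed local files, or from the plain filesystem.
def _example_multi_open():
    with multi_open("gcs://example-bucket/data.json", "rb") as fh:  # hypothetical GCS path
        gcs_bytes = fh.read()
    with multi_open("data/local.csv.gz", "rt") as fh:               # hypothetical gzipped local file
        first_line = fh.readline()
    return gcs_bytes, first_line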
def users_converter():
    json_gcs = []
    gcs_file_system = gcsfs.GCSFileSystem(project="sirapob-bluepi-de-exam",
                                          token="cloud")
    gcs_json_path = "gs://airflow-postgres/users"
    with gcs_file_system.open(gcs_json_path) as f:
        gcs_string_data = json.loads(json.dumps(f.read().decode('utf-8')))
    gcs = gcs_string_data.splitlines()
    for g in gcs:
        gcs = json.loads(g)
        gcs['created_at'] = dt.datetime.fromtimestamp(
            gcs['created_at']) + dt.timedelta(hours=7)
        gcs['updated_at'] = dt.datetime.fromtimestamp(
            gcs['updated_at']) + dt.timedelta(hours=7)
        json_gcs.append(gcs)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket("airflow-postgres")
    blob = bucket.blob("users.csv")
    df = pd.DataFrame(data=json_gcs).to_csv(sep=",",
                                            header=False,
                                            index=False,
                                            quotechar='"',
                                            quoting=csv.QUOTE_ALL,
                                            encoding='utf-8')
    blob.upload_from_string(data=df)
def sample_qc_zarr(input_path: str, output_path: str, remote: bool):
    """Convert sample QC csv to zarr"""
    import gcsfs
    import pandas as pd

    logger.info("Converting to Xarray")
    df = pd.read_csv(input_path, sep="\t")
    pc_vars = df.filter(regex="^genetic_principal_component").columns.tolist()
    ds = (df[[c for c in df if c not in pc_vars]]
          .rename_axis("samples", axis="rows")
          .to_xarray()
          .drop_vars("samples"))
    pcs = (df[pc_vars]
           .rename_axis("samples", axis="rows")
           .to_xarray()
           .drop_vars("samples")
           .to_array(dim="principal_components").T)
    ds = ds.assign(
        genotype_measurement_plate=ds.genotype_measurement_plate.astype("S"),
        genotype_measurement_well=ds.genotype_measurement_well.astype("S"),
        principal_component=pcs.drop_vars("principal_components"),
    )
    # Rechunk to enforce stricter dtypes as well as ease
    # downstream loading/processing of PC array
    ds = ds.chunk("auto")
    store = output_path
    if remote:
        gcs = gcsfs.GCSFileSystem()
        store = gcsfs.GCSMap(output_path, gcs=gcs, check=False, create=True)
    logger.info(f"Sample QC dataset:\n{ds}")
    logger.info(f"Saving zarr archive at {output_path}")
    ds.to_zarr(store, mode="w", consolidated=True)
def clean_bucket(bucket: str, name: str, project: str) -> None:
    """
    Find all the file names that do not have one of the 3 labels of age,
    gender and accent.

    :param bucket: name of Google Cloud bucket
    :param name: name of file or google cloud blob
    :param project: unique google cloud project name

    Example:
        file_list = ['validated', 'train', 'test', 'dev', 'other', 'invalidated']
        for file in file_list:
            clean_bucket(bucket=config.Bucket.META_DATA, name=file,
                         project='commonvoice-voice-voice-270516')
    """
    fs = gcsfs.GCSFileSystem(project=project)
    with fs.open("{}/{}.tsv".format(bucket, name)) as f:
        data = pd.read_csv(f, delimiter="\t")
    data = data[["path", "age", "gender", "accent"]]
    print("There are {} audio files in the development set".format(data.shape[0]))
    columns = ["gender", "age", "accent"]
    clean_files = []
    for column in columns:
        mp3 = data[~data[column].isna()]["path"]
        clean_files.extend(mp3)
    clean_files = set(clean_files)
    path = data["path"]
    mp3_to_remove = collections.deque()
    for mp3 in tqdm(path):
        if mp3 not in clean_files:
            mp3_to_remove.append(mp3)
    clean_files = pd.DataFrame(list(mp3_to_remove))
    clean_files.to_csv("remove-{}.csv".format(name))
    upload_blob(
        bucket_name=config.Bucket.META_DATA,
        source_file_name="remove-{}.csv".format(name),
        destination_blob_name="subject_to_removal/{}".format(name),
    )
    print(
        "{} mp3s do not have labels, leaving {} labeled mp3s in {}".format(
            len(clean_files), data.shape[0] - len(clean_files), name
        )
    )
    print(
        "Removed {}% of the data".format(
            round((len(mp3_to_remove) / data.shape[0]) * 100, 2)
        )
    )
def gcs_to_dataframe(data, context):
    '''Background Cloud Function to be triggered by Cloud Storage.
    This function loads a GCS CSV file into a pandas dataframe.

    Args:
        data (dict): The Cloud Functions event payload.
        context (google.cloud.functions.Context): Metadata of triggering event.
    Returns:
        pandas.DataFrame with the file contents, or None if the file is not
        found; event details are written to Stackdriver Logging.
    References:
        https://gcsfs.readthedocs.io/en/latest/
        https://github.com/pandas-dev/pandas/pull/26221#issuecomment-487393880
    '''
    print(f"Event ID: {context.event_id}, Event type: {context.event_type}")
    print(f"Bucket: {data['bucket']}, Metageneration: {data['metageneration']}, "
          f"File: {data['name']}, Created: {data['timeCreated']}, Updated: {data['updated']}")
    try:
        # If the GCS file is updated with the same filename, you may need to set
        # cache_timeout=0 to avoid FileNotFoundError
        fs = gcsfs.GCSFileSystem(cache_timeout=0)
        with fs.open(f"{data['bucket']}/{data['name']}", "rb") as fh:
            dataframe = pandas.read_csv(fh)
        return dataframe
    except FileNotFoundError:
        print("FileNotFoundError")
        return None
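# --- Usage sketch (not part of the original source): a hypothetical event payload
# and context object for exercising the Cloud Function locally. The field names
# mirror the GCS object-finalize event fields that the function body reads.
class _FakeContext:
    event_id = "1234567890"                          # hypothetical event id
    event_type = "google.storage.object.finalize"

def _example_gcs_to_dataframe():
    event = {
        "bucket": "example-bucket",                  # hypothetical bucket
        "name": "uploads/example.csv",               # hypothetical object name
        "metageneration": "1",
        "timeCreated": "2021-01-01T00:00:00Z",
        "updated": "2021-01-01T00:00:00Z",
    }
    return gcs_to_dataframe(event, _FakeContext())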
def __init__(self, path):
    if SARModel.__model is not None and SARModel.__path == path:
        self.model = SARModel.__model
        return

    # find the .sar.related & .sar.offsets files
    if path.startswith("gs:"):
        fs = gcsfs.GCSFileSystem(project='maga-bigdata')
        sar_file = fs.glob(f'{path}/*.sar')[0]
        fs.get(sar_file, 'sarplus_cache.sar')
        all_files = './sarplus_cache.sar'
    else:
        # bad hack but oh well
        raise ValueError("Please use a gcs file.")
        # all_files = os.listdir(path)

    # def find_or_raise(extension):
    #     files = [f for f in all_files if f.endswith(extension)]
    #     log.info(f"files are {files}")
    #     if len(files) != 1:
    #         raise ValueError(
    #             "Directory '%s' must contain exactly 1 file ending in '%s'"
    #             % (path, extension)
    #         )
    #     return path + "/" + files[0]

    def find_or_raise(extension):
        log.info(f"file is {all_files}")
        return all_files

    # instantiate C++ backend
    SARModel.__model = self.model = pysarplus_cpp.SARModelCpp(
        find_or_raise(".sar"))
    SARModel.__path = path
def _add_qc(samples: List[Sample], namespace: str,
            overwrite_multiqc: bool) -> Tuple[str, str]:
    """
    Populates s.qc_values for each Sample object.
    Returns paths to MultiQC html and json files.
    """
    multiqc_html_path = join(
        f'gs://cpg-{NAGIM_PROJ_ID}-{namespace}-web/qc/multiqc.html')
    multiqc_json_path = join(
        f'gs://cpg-{NAGIM_PROJ_ID}-{namespace}-analysis/qc/multiqc_data.json')

    if 'QC' in SOURCES_TO_PROCESS:
        logger.info('Running MultiQC on QC files')
        parsed_json_fpath = _run_multiqc(
            samples,
            multiqc_html_path,
            multiqc_json_path,
            tmp_bucket=f'gs://cpg-{NAGIM_PROJ_ID}-{namespace}-tmp/qc',
            namespace=namespace,
            overwrite=overwrite_multiqc,
        )
        gfs = gcsfs.GCSFileSystem()
        with gfs.open(parsed_json_fpath) as f:
            row_by_sample = json.load(f)
        for s in samples:
            if s.nagim_id in row_by_sample:
                s.qc_values = row_by_sample[s.nagim_id]

    return multiqc_html_path, multiqc_json_path
def online_main(request):
    # STATIC DATA
    the_project = 'autoinsight-258217'
    with open('catboostworkshop-e91a753d9550.json', 'rb') as rfile:
        token_dic = json.load(rfile)
    bucket = 'catboost-workshop'

    # input_data = request.get_json().get('columns')  # dictionary
    input_data = request['columns']  # dictionary
    df = pd.DataFrame.from_dict(input_data)
    print(df)

    # READ METADATA
    fs = gcsfs.GCSFileSystem(project=the_project, token=token_dic)
    model_path = '{0}/models/final_model_amazon.pickle'.format(bucket)
    with open('final_model_amazon.pickle', 'rb') as rfile:
        model = pickle.load(rfile)
    # model = read_model(model_path, fs)
    preds_probas = model.predict(df, prediction_type='Probability')
    print(preds_probas)
    result = {
        'probability_0': preds_probas[0][0],
        'probability_1': preds_probas[0][1]
    }
def function_handler(request):
    request_json = request.get_json(silent=True)
    dataset_bucket = request_json['dataset_bucket']
    dataset_blob_name = request_json['dataset_blob_name']
    model_bucket = request_json['model_bucket']
    model_blob_name = request_json['model_blob_name']

    fs = gcsfs.GCSFileSystem(project='Serverless-faas-workbench')
    with fs.open(dataset_bucket + '/' + dataset_blob_name) as f:
        df = pd.read_csv(f)

    start = time()
    df['train'] = df['Text'].apply(cleanup)

    tfidf_vect = TfidfVectorizer(min_df=100).fit(df['train'])
    train = tfidf_vect.transform(df['train'])

    model = LogisticRegression()
    model.fit(train, df['Score'])
    latency = time() - start
    print(latency)

    model_file_path = "/tmp/" + model_blob_name
    joblib.dump(model, model_file_path)

    storage_client = storage.Client()
    m_bucket = storage_client.get_bucket(model_bucket)
    m_blob = m_bucket.blob(model_blob_name)
    upload_blob(model_bucket, m_blob, model_file_path)
    return "latency : " + str(latency)
def parse_to_csv(bucket, source, filename):
    """Ingests covid sources"""
    if not bucket or not source or not filename:
        raise CovidIngestError(
            "All of source, bucket, and filename must be provided")

    all_sources = {'prison': None, 'ucla': None, 'recidiviz_manual': None}
    project_id = os.environ.get('GCP_PROJECT')
    path = os.path.join(bucket, source, filename)

    # Don't use the gcsfs cache
    fs = gcsfs.GCSFileSystem(project=project_id, cache_timeout=-1)
    logging.info("The path to download from is %s", path)
    bucket_path = os.path.join(bucket, source)
    logging.info("The files in the directory are:")
    logging.info(fs.ls(bucket_path))

    # Next we try to find the latest version of all three sources; if for
    # whatever reason a source folder is completely empty, we abort the
    # stitching process.
    for covid_source in all_sources:
        all_sources[covid_source] = _get_latest_source_file(
            fs, bucket, covid_source)

    # Once we have the latest file for each source, start stitching
    return _stitch_and_upload(fs, all_sources)
def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")
    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"
        gtfs_files.append(fs.glob(gtfs_url))

    res = (success[["itp_id", "url_number"]]
           .assign(gtfs_file=gtfs_files)
           .explode("gtfs_file")
           .loc[lambda d: d.gtfs_file != "processed"])

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
def list_gcs_objs(bucket_path, pattern=None, output_url=False, project=None):
    """Function to list objects in a Google Cloud Storage bucket

    args:
        bucket_path (str): Google Cloud Storage bucket name
        pattern (str | None, optional): regex pattern to search in the bucket.
            Can search folders by adding folder names (i.e. pattern =
            'subfolder/*.txt'). If None then no search pattern is used.
            default = None
        output_url (bool, optional): boolean switch to output a Google Cloud
            Storage http url instead of a Google Cloud Storage object uri.
            If False, outputs the gcs uri. default = False
        project (str | None): Cloud project name to use when initializing the
            file system. If None then the default gcloud config is used.
            default = None

    returns:
        list[str]: List of objects in the bucket that match the pattern
    """
    fs = gcsfs.GCSFileSystem(project=project)

    if pattern is not None:
        bucket_path = (bucket_path + "/"
                       if not bucket_path.endswith("/") else bucket_path)
        blobs = fs.glob(f"{bucket_path}{pattern}")
    else:
        blobs = fs.ls(bucket_path)

    base = "https://storage.cloud.google.com/{0}" if output_url else "gs://{0}"
    return [base.format(blob) for blob in blobs]
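# --- Usage sketch (not part of the original source): the bucket name and pattern are
# hypothetical. Shows globbing a sub-folder and switching between gs:// URIs and
# browsable https URLs via the output_url flag.
def _example_list_gcs_objs():
    uris = list_gcs_objs("example-bucket", pattern="exports/*.txt")
    urls = list_gcs_objs("example-bucket", pattern="exports/*.txt", output_url=True)
    return uris, urls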
def __enter__(self):
    gcs_filesystem = gcsfs.GCSFileSystem(project=self.project_id)
    h1 = gcs_filesystem.open(self.gcs_full_path, 'wb')
    h = GzipFile(fileobj=h1, mode='wb')
    self.set_file_handle(h)
    self.add_file_to_registry()
    return self
def open_gcs_url(config, logger, storage, url):
    reader_impl = SourceFile.extract_reader_impl(config)
    use_gcs_service_account = "service_account_json" in config["provider"] and storage == "gs://"
    file_to_close = None
    if reader_impl == "gcsfs":
        if use_gcs_service_account:
            try:
                token_dict = json.loads(config["provider"]["service_account_json"])
            except json.decoder.JSONDecodeError as err:
                logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                raise err
        else:
            token_dict = "anon"
        fs = gcsfs.GCSFileSystem(token=token_dict)
        file_to_close = fs.open(f"gs://{url}")
        result = file_to_close
    else:
        if use_gcs_service_account:
            try:
                credentials = json.dumps(json.loads(config["provider"]["service_account_json"]))
                tmp_service_account = tempfile.NamedTemporaryFile(delete=False)
                with open(tmp_service_account, "w") as f:
                    f.write(credentials)
                tmp_service_account.close()
                client = Client.from_service_account_json(tmp_service_account.name)
                result = open(f"gs://{url}", transport_params=dict(client=client))
                os.remove(tmp_service_account.name)
            except json.decoder.JSONDecodeError as err:
                logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                raise err
        else:
            client = Client.create_anonymous_client()
            result = open(f"{storage}{url}", transport_params=dict(client=client))
    return result, file_to_close
def save_da_to_zarr(da, zarr_bucket, dim_order=['time', 'x', 'y', 'variable'], zarr_mode='a'):
    da = da.transpose(*dim_order)
    da['time'] = get_time_as_unix(da)
    _, y_size, x_size, _ = da.shape
    out_store = gcsfs.GCSMap(root=zarr_bucket, gcs=gcsfs.GCSFileSystem())
    chunks = (36, y_size, x_size, 1)

    ds = xr.Dataset({'stacked_eumetsat_data': da.chunk(chunks)})

    zarr_mode_to_extra_kwargs = {
        'a': {
            'append_dim': 'time'
        },
        'w': {
            'encoding': {
                'stacked_eumetsat_data': {
                    'compressor': numcodecs.Blosc(cname='zstd', clevel=5),
                    'chunks': chunks
                }
            }
        }
    }

    assert zarr_mode in ['a', 'w'], '`zarr_mode` must be one of: `a`, `w`'
    extra_kwargs = zarr_mode_to_extra_kwargs[zarr_mode]

    ds.to_zarr(out_store, mode=zarr_mode, consolidated=True, **extra_kwargs)
    print('Saved file to zarr bucket')
    return ds
def _output_results(self, results: List[str], project: str, email: str,
                    file: str = None, gcs_stored: bool = False) -> None:
    """Write the process results to a file.

    Args:
        results (List[str]): the results.
        project (str): project id
        email (str): OAuth email
        file (str, optional): file to process. Defaults to None.
        gcs_stored (bool, optional): write to GCS? Defaults to False.
    """
    def _send():
        for result in results:
            print(result, file=outfile)

    output_name = f'{file}.results'
    if gcs_stored:
        fs = gcsfs.GCSFileSystem(project=project)
        with fs.open(f'{self.bucket}/{output_name}', 'w') as outfile:
            _send()
    else:
        with open(output_name, 'w') as outfile:
            _send()
def __init__(
    self,
    *,
    region: Region,
    fs: DirectIngestGCSFileSystem,
    ingest_directory_path: GcsfsDirectoryPath,
    temp_output_directory_path: GcsfsDirectoryPath,
    big_query_client: BigQueryClient,
    region_raw_file_config: Optional[DirectIngestRegionRawFileConfig] = None,
    upload_chunk_size: int = _DEFAULT_BQ_UPLOAD_CHUNK_SIZE,
):
    self.region = region
    self.fs = fs
    self.ingest_directory_path = ingest_directory_path
    self.temp_output_directory_path = temp_output_directory_path
    self.big_query_client = big_query_client
    self.region_raw_file_config = (
        region_raw_file_config
        if region_raw_file_config
        else DirectIngestRegionRawFileConfig(
            region_code=self.region.region_code,
            region_module=self.region.region_module,
        ))
    self.upload_chunk_size = upload_chunk_size
    self.csv_reader = GcsfsCsvReader(
        gcsfs.GCSFileSystem(project=metadata.project_id(),
                            cache_timeout=GCSFS_NO_CACHING))
    self.raw_table_migrations = DirectIngestRawTableMigrationCollector(
        region_code=self.region.region_code,
        regions_module_override=self.region.region_module,
    ).collect_raw_table_migration_queries()
def get_data_nhs_region(date_today):
    'get the prevalence data for the LADs within a specific window'
    # get most recent uploaded map
    fs = gcsfs.GCSFileSystem()
    final = date_today
    month = "0" + str(final.month) if final.month < 10 else str(final.month)
    day = final.day
    year = final.year
    start_date = datetime(2020, 6, 12)
    end_date = datetime.strptime(f"{year}{month}{day}", "%Y%m%d")

    # declare the different maps
    maps = []
    end_date_str = datetime.strftime(end_date, '%Y%m%d')
    for day in pd.date_range(start_date, end_date, freq="24H"):
        # get the right format
        date_file = str(day).split(" ")[0].replace("-", "")
        with fs.open(
                os.path.join(
                    f'covid-internal-data/covid-predictions/extrapolations/prevalence_history_{end_date_str}/corrected_prevalence_{date_file}.csv'
                )) as fileptr:
            # read the file
            file_prev = pd.read_csv(fileptr).groupby('nhser19nm')[
                'respondent_count', 'predicted_covid_positive_count',
                'population', 'corrected_covid_positive'].sum().reset_index()
            # create the date
            file_prev['day_updated_at'] = str(day).split(" ")[0]
            maps.append(file_prev)
    return pd.concat(maps)
def load_trusted(message):
    # Normalize json and rename columns
    data = flatten(message)
    df = pd.DataFrame(data, index=[0])
    df_result = df.rename(
        columns={
            'content_column_1': 'column_1',
            'content_column_2': 'column_2',
            'content_column_3': 'column_3'
        })

    # Load data to a trusted folder
    tz = pytz.timezone('America/Sao_Paulo')
    now = datetime.now()
    aware = tz.localize(now, is_dst=None)

    client = storage.Client()
    bucket = client.get_bucket('bexs_trusted_data')

    dt_processamento = 'dt=' + aware.strftime("%Y-%m-%d")
    partition_dir = 'test/test/profile/' + dt_processamento + '/'
    file_name = 'profile-' + aware.strftime("%Y-%m-%d_%H:%M:%S") + '.parquet'

    gcs = gcsfs.GCSFileSystem(project='example_staging', token=None)
    df_result.to_parquet('gs://bucket_name/' + partition_dir + file_name,
                         compression='SNAPPY')
def delete_extra_file(name: str, bucket: str) -> None:
    """
    Loads the file lists stored in the delete folder and removes the listed
    files by name from the raw folders. Those files are removed because they
    are not labeled with Gender, Age, or Country of Origin.
    The files in the delete bucket are saved in one of 6 folders:
    'validated', 'train', 'test', 'dev', 'other', 'invalidated'.
    They mirror the original folders in the commonvoice-voice-voice dataset.

    :param name: File name to upload
    :param bucket: Bucket name

    Example:
        file_list = ['validated', 'train', 'test', 'dev', 'other', 'invalidated']
        for file in file_list:
            delete_extra_file(name=file, bucket=config.Bucket.RAW_DATA)
    """
    storage_client = storage.Client()
    project = storage_client.project
    fs = gcsfs.GCSFileSystem(project=project)
    with fs.open("{}/delete/{}".format(config.Bucket.META_DATA, name)) as f:
        data = pd.read_csv(f)
    mp3_to_remove = data.iloc[:, 1].to_list()
    bucket = storage_client.bucket(bucket)
    delete_mp3_from_bucket(file_list=mp3_to_remove, bucket=bucket)
def get_service_configs(service=None, project_name=None):
    """Utility function to set configurations for the service."""
    print("get_service_configs()")
    if service == "clearbit":
        try:
            # set configs from the env vars of the machine
            PROJECT_ID = os.environ["GCP_PROJECT_ID"]
            LOCATION_ID = os.environ["GCP_LOCATION_ID"]
            KEYRING_ID = os.environ["GCP_KEYRING_ID"]
            CRYPTO_KEY_ID = os.environ["GCP_KEY_ID"]
            CIPHERTEXT_BLOB = os.environ["GCP_CIPHERTEXT_BLOB"]
            BUCKET_NAME = os.environ["GCS_BUCKET"]

            # DEPRECATED
            # set clearbit api versions
            # clearbit.Person.set_version("2018-06-06")
            # clearbit.Company.set_version("2017-09-12")
            # clearbit.Reveal.set_version("2018-03-28")
            # # not in use
            # clearbit.Watchlist.set_version("2015-11-13")
            # # not in use
            # clearbit.Prospector.set_version("2016-10-04")

            fs = gcsfs.GCSFileSystem(
                project=project_name,
                access="full_control",
                token="cloud",
                # consistency="md5",
                cache_timeout=None,
                secure_serialize=True,
                check_connection=True)
            fs.retries = 7
            fs.connect(method="cloud")
            cipher_string = fs.cat(BUCKET_NAME + "/" + CIPHERTEXT_BLOB)

            # DEPRECATED
            # download the file as a string in-memory
            # st_client = storage.Client()
            # bucket = st_client.get_bucket(BUCKET_NAME)
            # cipher_blob = bucket.blob(CIPHERTEXT_BLOB)
            # cipher_string = cipher_blob.download_as_string()

            # decrypt the kms key stored in gcs and set the key attr
            clearbit.key = decrypt_with_kms(project_id=PROJECT_ID,
                                            location_id=LOCATION_ID,
                                            key_ring_id=KEYRING_ID,
                                            crypto_key_id=CRYPTO_KEY_ID,
                                            ciphertext_string=cipher_string)
            print("configs_set: True")
        # catch-all
        except Exception as e:
            print("config_set: False")
            error = {"error_message": e, "status_code": "Unknown"}
            print(error)

    # flags for services
    if service == "crawler":
        # NOT IMPLEMENTED/NEEDED
        pass
def rechunk_dataset(
    ds: Dataset,
    output: str,
    contig: Contig,
    fn: Callable,
    chunks: Tuple[int, int],
    max_mem: str,
    progress_update_seconds: int = 60,
    remote: bool = True,
    **kwargs,
) -> Dataset:
    logger.info(f"Rechunking dataset for contig {contig} "
                f"to {output} (chunks = {chunks}):\n{ds}")
    if remote:
        gcs = gcsfs.GCSFileSystem()
        output = gcsfs.GCSMap(output, gcs=gcs, check=False, create=False)
    # Save to local zarr store with desired sample chunking
    with ProgressBar(dt=progress_update_seconds):
        res = fn(
            ds,
            output=output,
            chunk_length=chunks[0],
            chunk_width=chunks[1],
            max_mem=max_mem,
            **kwargs,
        )
    logger.info(f"Rechunked dataset:\n{res}")
    return res
def read(rem_result, json_serializable=True):
    # compute studio results have public read access.
    fs = gcsfs.GCSFileSystem(token="anon")
    s = time.time()
    RemoteResult().load(rem_result)
    read = {"renderable": [], "downloadable": []}
    for category in rem_result:
        with fs.open(f"{BUCKET}/{rem_result[category]['ziplocation']}", "rb") as f:
            res = f.read()

        buff = io.BytesIO(res)
        zipfileobj = zipfile.ZipFile(buff)

        for rem_output in rem_result[category]["outputs"]:
            ser = get_serializer(rem_output["media_type"])
            rem_data = ser.deserialize(
                zipfileobj.read(rem_output["filename"]), json_serializable
            )
            read[category].append(
                {
                    "id": rem_output.get("id", None),
                    "title": rem_output["title"],
                    "media_type": rem_output["media_type"],
                    "data": rem_data,
                }
            )
    f = time.time()
    print(f"Read finished in {f-s}s")
    return read
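# --- Usage sketch (not part of the original source): the bucket layout, object keys,
# and field values are hypothetical, but the shape of `rem_result` follows what read()
# indexes into: a zip location per category plus a list of output descriptors.
def _example_read():
    rem_result = {
        "renderable": {
            "ziplocation": "results/abc123/renderable.zip",   # hypothetical object key
            "outputs": [
                {"id": "out-1", "title": "Summary table",
                 "media_type": "CSV", "filename": "summary.csv"},
            ],
        },
        "downloadable": {
            "ziplocation": "results/abc123/downloadable.zip",  # hypothetical object key
            "outputs": [],
        },
    }
    return read(rem_result)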