def find_tiles_for_bounding_box(min_lat, max_lat, min_lon, max_lon):
    """
    Return a list of 10x10 degree tile names covering the bounding box.
    The tile names are in the format {lat}_{lon}, where lat, lon represent
    the upper left corner. Ocean tiles are removed.
    """
    fs = GCSFileSystem(cache_timeout=0)
    folder = 'gs://carbonplan-climatetrace/intermediates/ecoregions_mask/'
    available_tiles = [
        os.path.splitext(os.path.split(path)[-1])[0]
        for path in fs.ls(folder)
        if not path.endswith('/')
    ]

    step = 10
    lat_start = math.ceil(min_lat / step) * step
    lat_stop = math.ceil(max_lat / step) * step
    all_lat_tiles = np.arange(start=lat_start, stop=lat_stop + 1, step=step)
    if min_lat == lat_start:
        all_lat_tiles = all_lat_tiles[1:]

    lon_start = math.floor(min_lon / step) * step
    lon_stop = math.floor(max_lon / step) * step
    all_lon_tiles = np.arange(start=lon_start, stop=lon_stop + 1, step=step)
    if max_lon == lon_stop:
        all_lon_tiles = all_lon_tiles[:-1]

    out = []
    for lat in all_lat_tiles:
        for lon in all_lon_tiles:
            lat_tag, lon_tag = get_lat_lon_tags_from_bounding_box(lat, lon)
            fn = f'{lat_tag}_{lon_tag}'
            if fn in available_tiles:
                out.append(fn)
    return out
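# A possible usage sketch for find_tiles_for_bounding_box. The bounding box
# values are illustrative, and the exact tile tags depend on what
# get_lat_lon_tags_from_bounding_box returns and on which land tiles exist
# in the ecoregions_mask folder.
#
# tiles = find_tiles_for_bounding_box(min_lat=-5, max_lat=12, min_lon=30, max_lon=45)
# print(tiles)  # e.g. something like ['00N_030E', '10N_030E', ...]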
def open_and_combine_lat_lon_data(folder, tiles=None):
    """
    Load lat lon data stored as 10x10 degree tiles in folder.
    If tiles is None, load all data available.
    If no file is available, return None.
    """
    fs = GCSFileSystem(cache_timeout=0)
    if not tiles:
        tiles = [
            os.path.splitext(os.path.split(path)[-1])[0]
            for path in fs.ls(folder)
            if not path.endswith('/')
        ]
    uris = [f'{folder}{tile}.zarr' for tile in tiles]
    ds_list = []
    for uri in uris:
        if fs.exists(uri):
            da = open_zarr_file(uri)
            # Ensure coordinates are monotonically increasing before combining
            if da.lat[0] > da.lat[-1]:
                da = da.reindex(lat=da.lat[::-1])
            if da.lon[0] > da.lon[-1]:
                da = da.reindex(lon=da.lon[::-1])
            ds_list.append(da)
    if len(ds_list) > 0:
        ds = xr.combine_by_coords(ds_list, combine_attrs="drop_conflicts").chunk(
            {'lat': 2000, 'lon': 2000}
        )
        return ds

    # print(f'No data available at {folder} for tiles {tiles}')
    return None
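# A possible usage sketch for open_and_combine_lat_lon_data. The folder and the
# tile names below are illustrative and assume the corresponding 10x10 degree
# zarr tiles exist in that folder.
#
# ds = open_and_combine_lat_lon_data(
#     'gs://carbonplan-climatetrace/intermediates/ecoregions_mask/',
#     tiles=['50N_130W', '40N_130W'],
# )
# if ds is not None:
#     print(ds.dims)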
def validated_gcs_bucket_name(self) -> str:
    if self._validated_gcs_bucket_name is None:
        if self.gcs_bucket_name is not None:
            bucket = self.gcs_bucket_name
        else:
            # Open the key to get the project id
            with open(self.google_credentials_file, "r") as open_resource:
                creds = json.load(open_resource)
                project_id = creds["project_id"]

            # Default to the project's App Engine bucket
            bucket = f"{project_id}.appspot.com"

        # Validate
        fs = GCSFileSystem(token=self.google_credentials_file)
        try:
            fs.ls(bucket)
            self._validated_gcs_bucket_name = bucket
        except FileNotFoundError:
            raise ValueError(
                f"Provided or inferred GCS bucket name does not exist. ('{bucket}')"
            )

    return self._validated_gcs_bucket_name
def __init__(self, path='.', gcs=None, **fsargs):
    if gcs is None:
        self.gcs = GCSFileSystem(**fsargs)
    else:
        self.gcs = gcs
    self.cache = {}
    self.counter = 0
    self.root = path
def main():
    # Make spark session global
    global spark
    spark = (
        pyspark.sql.SparkSession.builder
        # .config("parquet.summary.metadata.level", "ALL")
        .config("parquet.summary.metadata.level", "NONE")
        .getOrCreate()
    )

    start_time = datetime.now()

    # Load all molecular trait sumstats. This has to be done separately,
    # followed by unionByName, as the hive partitions differ across datasets
    # due to different tissues (bio_features) and chromosomes.
    strip_path_mol = udf(lambda x: x.replace('file:', ''), StringType())
    mol_dfs = []
    mol_pattern = 'gs://genetics-portal-sumstats-b38/unfiltered/molecular_trait/'
    fs = GCSFileSystem()
    # List files; remove trailing '/' and deduplicate
    paths = list(set([s.rstrip('/') for s in fs.glob(mol_pattern)]))
    for inf in paths:
        if fs.isdir(inf):
            print("gs://" + inf)
            df = (
                spark.read.parquet("gs://" + inf)
                .withColumn('input_name', strip_path_mol(lit(inf)))
            )
            mol_dfs.append(df)

    # Take union
    sumstats = functools.reduce(
        functools.partial(pyspark.sql.DataFrame.unionByName, allowMissingColumns=True),
        mol_dfs
    )

    cols_to_keep = ['study_id', 'bio_feature', 'gene_id', 'chrom', 'pos', 'ref', 'alt', 'pval']

    # Calculate the number of tests and min pval per gene ----------
    min_pvals = (
        sumstats
        .select(*cols_to_keep)
        .groupby('study_id', 'bio_feature', 'gene_id')
        .agg(count(col('pval')).alias('num_tests'),
             min(col('pval')).alias('min_pval'))
        .orderBy('study_id', 'bio_feature', 'min_pval')
    )

    # Collect all data and write using pandas
    min_pvals.toPandas().to_csv(
        'gs://genetics-portal-dev-analysis/js29/molecular_trait/min_pvals_per_gene_old_2002.csv.gz',
        index=False)

    print('Time taken: {}'.format(datetime.now() - start_time))

    return 0
def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
    if gcs is None:
        # minimum block size: still read on 5MB boundaries.
        self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                 cache_timeout=6000, **fsargs)
    else:
        self.gcs = gcs
    self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
    self.write_cache = {}
    self.counter = 0
    self.root = path
def load_model_from_path(path, project_name=None, key=None):
    if path[:5] == 'gs://':
        if project_name is None:
            fs = GCSFileSystem()
        else:
            fs = GCSFileSystem(project_name)
        file = fs.open(path)
    else:
        file = path
    return load_model(
        file,
        custom_objects={'Swish': Swish, 'InstanceNormalization': InstanceNormalization},
    )
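# A possible usage sketch for load_model_from_path. The bucket, path, and
# project name are illustrative; Swish and InstanceNormalization must be
# importable in the calling module for the custom objects to resolve.
#
# model = load_model_from_path(
#     'gs://my-model-bucket/checkpoints/model.h5',  # hypothetical path
#     project_name='my-gcp-project',                # hypothetical project
# )
# model.summary()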
def open_glah01_data():
    fs = GCSFileSystem(cache_timeout=0)
    uris = [
        f'gs://{f}'
        for f in fs.ls('gs://carbonplan-climatetrace/intermediates/glah01/')
        if not f.endswith('/')
    ]
    ds_list = [open_zarr_file(uri) for uri in uris]
    ds = xr.concat(ds_list, dim='record_index').chunk({'record_index': 2000})
    for k in ds:
        _ = ds[k].encoding.pop('chunks', None)
    return ds
class GcsUnstructuredProvider(UnstructuredStorageProvider):
    """This class allows you to upload arbitrary bytes to GCS.
    They will be stored under bucket_name/base_path/filename
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: str = None,
    ) -> None:
        super().__init__()
        self.project = project
        self.bucket_name = bucket_name
        self.base_path = base_path
        self.token = token
        self.base_path = f"{bucket_name}/{base_path}/{{filename}}"

        self.file_name_cache: Set[str] = set()
        """The set of all filenames ever uploaded, checked before uploading"""
        self.logger = logging.getLogger("openwpm")

    async def init(self) -> None:
        await super(GcsUnstructuredProvider, self).init()
        self.file_system = GCSFileSystem(
            project=self.project, token=self.token, access="read_write"
        )

    async def store_blob(
        self, filename: str, blob: bytes, overwrite: bool = False
    ) -> None:
        target_path = self.base_path.format(filename=filename)
        if not overwrite and (
            filename in self.file_name_cache or self.file_system.exists(target_path)
        ):
            self.logger.info("Not saving out file %s as it already exists", filename)
            return

        with self.file_system.open(target_path, mode="wb") as f:
            f.write(blob)
        self.file_name_cache.add(filename)

    async def flush_cache(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass
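# A possible usage sketch for GcsUnstructuredProvider. The project, bucket, and
# file contents are illustrative, and init()/store_blob() must be awaited inside
# an event loop because the provider API is async.
#
# import asyncio
#
# async def demo():
#     provider = GcsUnstructuredProvider(
#         project="my-gcp-project", bucket_name="my-bucket", base_path="crawl-data"
#     )
#     await provider.init()
#     await provider.store_blob("page_source.html", b"<html></html>")
#
# asyncio.run(demo())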
class GcsStructuredProvider(ArrowProvider):
    """This class allows you to upload Parquet files to GCS.
    This might not actually be the thing that we want to do
    long term but seeing as GCS is the S3 equivalent of GCP
    it is the easiest way forward.

    Inspired by the old S3Aggregator structure the GcsStructuredProvider
    will by default store into base_path/visits/table_name in the given bucket.

    Pass a different sub_dir to change this.
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: str = None,
        sub_dir: str = "visits",
    ) -> None:
        super().__init__()
        self.project = project
        self.token = token
        self.base_path = f"{bucket_name}/{base_path}/{sub_dir}/{{table_name}}"

    def __str__(self) -> str:
        return f"GCS:{self.base_path.removesuffix('/{table_name}')}"

    async def init(self) -> None:
        await super(GcsStructuredProvider, self).init()
        self.file_system = GCSFileSystem(
            project=self.project, token=self.token, access="read_write"
        )

    async def write_table(self, table_name: TableName, table: Table) -> None:
        self.file_system.start_transaction()
        pq.write_to_dataset(
            table,
            self.base_path.format(table_name=table_name),
            filesystem=self.file_system,
        )
        self.file_system.end_transaction()

    async def shutdown(self) -> None:
        pass
def _get_file_to_upload(
    path: str,
    fs: gcsfs.GCSFileSystem,
    url: str,
    pdf_name: str,
    always_download: bool,
    post_data: Dict,
    verify_ssl: bool,
) -> Optional[str]:
    """This function checks first whether it needs to download, and then
    returns the locally downloaded pdf"""
    # First check if the path doesn't exist at all
    path_to_download = None
    if always_download or not fs.exists(path):
        if post_data:
            response = requests.post(url, data=post_data, verify=verify_ssl)
        else:
            response = requests.get(url, verify=verify_ssl)
        if response.status_code == 200:
            path_to_download = os.path.join(tempfile.gettempdir(), pdf_name)
            with open(path_to_download, "wb") as f:
                # Need to use content since PDF needs to write raw bytes.
                f.write(response.content)
        else:
            raise ScrapeAggregateError(
                "Could not download file {}".format(pdf_name))
    return path_to_download
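# A possible usage sketch for _get_file_to_upload. The GCS path, URL, and file
# name are illustrative; a None return means the file already exists in GCS and
# no download was needed.
#
# fs = gcsfs.GCSFileSystem()
# local_pdf = _get_file_to_upload(
#     path="my-bucket/reports/report.pdf",      # hypothetical GCS path
#     fs=fs,
#     url="https://example.com/report.pdf",     # hypothetical source URL
#     pdf_name="report.pdf",
#     always_download=False,
#     post_data={},
#     verify_ssl=True,
# )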
def _clean_cdp_filestore(google_creds_path: Path) -> None:
    # Connect to the filestore
    fs = GCSFileSystem(token=str(google_creds_path))

    # Open the key to get the project id
    with open(google_creds_path, "r") as open_resource:
        creds = json.load(open_resource)
        project_id = creds["project_id"]

    # Remove all files in bucket
    bucket = f"{project_id}.appspot.com"
    log.info(f"Cleaning bucket: {bucket}")
    try:
        fs.rm(f"{bucket}/*")
    # Handle empty bucket
    except FileNotFoundError:
        pass

    log.info("Filestore cleaning complete")
def resource_exists(uri: Optional[str], **kwargs: str) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.

    Returns
    -------
    status: bool
        The validation status.
    """
    if uri is None:
        return True

    if uri.startswith("gs://") or uri.startswith("https://storage.googleapis"):
        # Convert to gsutil form if necessary
        if uri.startswith("https://storage.googleapis"):
            uri = convert_gcs_json_url_to_gsutil_form(uri)

            # If uri is not convertible to gsutil form we can't confirm
            if uri == "":
                return False

        if kwargs.get("google_credentials_file"):
            fs = GCSFileSystem(token=str(kwargs.get("google_credentials_file", "anon")))
            return fs.exists(uri)

        # Can't check GCS resources without creds file
        else:
            try:
                anon_fs = GCSFileSystem(token="anon")
                return anon_fs.exists(uri)
            except Exception:
                return False

    # Is HTTP remote resource
    elif uri.startswith("http"):
        try:
            # Use HEAD request to check if remote resource exists
            r = requests.head(uri)
            return r.status_code == requests.codes.ok
        except requests.exceptions.SSLError:
            return False

    # Get any filesystem and try
    try:
        fs, path = url_to_fs(uri)
        return fs.exists(path)
    except Exception:
        return False
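# A possible usage sketch for resource_exists. The URIs are illustrative; only
# the google_credentials_file keyword is read from kwargs for GCS paths.
#
# resource_exists(None)                                # True by definition
# resource_exists("https://example.com/file.json")     # checked with a HEAD request
# resource_exists(
#     "gs://my-bucket/file.json",                      # hypothetical URI
#     google_credentials_file="/path/to/creds.json",   # hypothetical creds file
# )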
def main(month, type_, outfile):
    spark = build_spark()

    raw_dat = spark.read.parquet('gs://spain-tweets/rehydrated/lake').where(f'month = {month}')
    dat = get_dat(spark, raw_dat)
    tweets = get_tweets(dat)

    if type_ == 'tweets':
        nodes, edges = build_tweet_graph(tweets, dat)
        G = create_graph(nodes, edges, 'id_str')
    elif type_ == 'users':
        nodes, edges = build_user_graph(tweets)
        G = create_graph(nodes, edges, 'user')
    else:
        raise TypeError(f'Unrecognized type_ parameter: {type_}')

    fs = GCSFileSystem(project='trollhunters')
    with fs.open(outfile, 'wb') as f:
        nx.write_graphml(G, f)
def load_npz(path, project_name=None, key=None):
    if path[:5] == 'gs://':
        if project_name is None:
            fs = GCSFileSystem(token=key)
        else:
            fs = GCSFileSystem(project_name, token=key)
        file = fs.open(path)
    else:
        file = path
    print(f'Loading file {path.rsplit("/", 1)[-1]}')
    with np.load(file, allow_pickle=True) as npz:
        print(f'Available files: {npz.files}')
        X = npz[npz.files[0]]
        X = np.expand_dims(X, -1)[0]['sunset_ims']
    return X
def setUpClass(self):
    self.path = f"tests/{str(uuid.uuid4())}/table1"
    self.spark = (
        pyspark.sql.SparkSession.builder.appName("deltalake")
        .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        .getOrCreate()
    )
    df = (
        self.spark.range(0, 1000)
        .withColumn("number", rand())
        .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
    )
    for i in range(12):
        df.write.partitionBy("number2").format("delta").mode("append").save(self.path)

    self.fs = GCSFileSystem(project=GCP_PROJECT_ID)
    self.fs.upload(self.path, f"{GCP_BUCKET}/{self.path}", recursive=True)

    self.table = DeltaTable(f"{GCP_BUCKET}/{self.path}", file_system=self.fs)
def initialize_gcs_file_system(credentials_file: str) -> GCSFileSystem:
    """
    Initializes an instance of a GCSFileSystem.

    Parameters
    ----------
    credentials_file: str
        The path to the Google Service Account credentials JSON file.

    Returns
    -------
    file_system: GCSFileSystem
        An initialized GCSFileSystem.
    """
    return GCSFileSystem(token=str(credentials_file))
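# A possible usage sketch for initialize_gcs_file_system; the credentials path
# and the bucket listed below are illustrative.
#
# fs = initialize_gcs_file_system("/path/to/service-account.json")
# print(fs.ls("my-bucket"))  # hypothetical bucket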
class DeltaReaderAppendTest(TestCase):
    @classmethod
    def setUpClass(self):
        self.path = f"tests/{str(uuid.uuid4())}/table1"
        self.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake")
            .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config(
                "spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog",
            )
            .getOrCreate()
        )
        df = (
            self.spark.range(0, 1000)
            .withColumn("number", rand())
            .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
        )
        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode("append").save(self.path)

        self.fs = GCSFileSystem(project=GCP_PROJECT_ID)
        self.fs.upload(self.path, f"{GCP_BUCKET}/{self.path}", recursive=True)

        self.table = DeltaTable(f"{GCP_BUCKET}/{self.path}", file_system=self.fs)

    @classmethod
    def tearDownClass(self):
        # remove folder when we are done with the test
        self.fs.rm(f"{GCP_BUCKET}/{self.path}", recursive=True)
        shutil.rmtree(self.path)

    def test_paths(self):
        assert self.table.path == f"{GCP_BUCKET}/{self.path}"
        assert self.table.log_path == f"{GCP_BUCKET}/{self.path}/_delta_log"

    def test_versions(self):
        assert self.table.checkpoint == 10
        assert self.table.version == 11

    def test_data(self):
        # read the parquet files using pandas
        df_pandas = self.table.to_pandas()
        # read the table using spark
        df_spark = self.spark.read.format("delta").load(self.path).toPandas()

        # compare dataframes. The index may not be the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_version(self):
        # read the parquet files using pandas
        df_pandas = self.table.as_version(5, inplace=False).to_pandas()
        # read the table using spark
        df_spark = (
            self.spark.read.format("delta")
            .option("versionAsOf", 5)
            .load(self.path)
            .toPandas()
        )

        # compare dataframes. The index may not be the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_partitioning(self):
        # Partition pruning should halve the number of rows
        assert self.table.to_table(filter=ds.field("number2") == 0).num_rows == 6000

    def test_predicate_pushdown(self):
        # number is random 0-1, so we should have fewer than 12000 rows no matter what
        assert self.table.to_table(filter=ds.field("number") < 0.5).num_rows < 12000

    def test_column_pruning(self):
        t = self.table.to_table(columns=["number", "number2"])
        assert t.column_names == ["number", "number2"]
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            self.gcs.dirs['bucket'].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        fn = ''.join([self.root, path])
        f = self.cache[fn]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
def read_schema(path):
    fs = GCSFileSystem(project='trollhunters')
    with fs.open(path, 'rb') as f:
        schema = pickle.load(f)
    return schema
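# A possible usage sketch for read_schema; the path is illustrative and assumes
# the object at that location was previously written with pickle.dump.
#
# schema = read_schema("gs://some-bucket/schemas/tweets.pickle")  # hypothetical path
# print(schema)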
def _get_client(self):
    if self._client is None:
        self._client = GCSFileSystem()
    return self._client
async def init(self) -> None:
    await super(GcsUnstructuredProvider, self).init()
    self.file_system = GCSFileSystem(
        project=self.project, token=self.token, access="read_write"
    )
# auto-generate some GCS metrics
from gcsfs import GCSFileSystem

fs = GCSFileSystem('pangeo-181919')


# https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


# get disk usage of each folder in gs://pangeo-data
with open('du-pangeo-data.csv', 'w') as f:
    f.write('directory, size, nbytes\n')
    print('directory, size, nbytes')
    for folder in fs.ls('pangeo-data'):
        nbytes = fs.du(folder)
        f.write(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}\n')
        print(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}')

# upload CSV to gs://pangeo-data
fs.put('du-pangeo-data.csv', 'pangeo-data/du-pangeo-data.csv')
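# A small example of what sizeof_fmt returns for a few byte counts:
# sizeof_fmt(512)          -> '512.0B'
# sizeof_fmt(1536)         -> '1.5KiB'
# sizeof_fmt(3 * 1024**3)  -> '3.0GiB'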
import os
from collections import defaultdict

import dask
import fsspec
import xarray as xr
import zarr
from dask.distributed import Client
from gcsfs import GCSFileSystem

from carbonplan_trace.v1.glas_extract import extract_GLAH01_data, extract_GLAH14_data

fs = GCSFileSystem()

skip_existing = True
chunksize = 2000
drop_keys = {
    'GLAH01': ['rec_bin', 'shot_number', 'tx_bin'],
    'GLAH14': ['n_gaussian_peaks', 'shot_number'],
}


def get_mapper(uri):
    key = os.path.splitext(os.path.split(uri)[-1])[0]
    muri = f'gs://carbonplan-scratch/glas-zarr-cache/{key}.zarr'
    mapper = fsspec.get_mapper(muri)
    return mapper


@dask.delayed
def read_transcripts_and_generate_grams(
    event_transcripts: EventTranscripts, n_grams: int, credentials_file: str
) -> List[ContextualizedGram]:
    """
    Parse all documents and create a list of contextualized grams for
    later weighting.

    Parameters
    ----------
    event_transcripts: EventTranscripts
        The EventTranscripts object to parse all transcripts for.
    n_grams: int
        N number of terms to act as a unique entity.
    credentials_file: str
        Path to Google Service Account Credentials JSON file.

    Returns
    -------
    grams: List[ContextualizedGram]
        All grams found in all transcripts provided.
    """
    fs = GCSFileSystem(token=credentials_file)

    # Store all n_gram results
    event_n_grams: List[ContextualizedGram] = []

    # Iter over each transcript
    for transcript_db_file in event_transcripts.transcript_db_files:
        with TemporaryDirectory() as temp_dir:
            temp_dir_path = Path(temp_dir)
            local_transcript_filepath = temp_dir_path / transcript_db_file.name

            # Download transcript
            fs.get(
                rpath=transcript_db_file.uri,
                lpath=str(local_transcript_filepath),
            )

            # Init transcript
            with open(local_transcript_filepath, "r") as open_f:
                transcript = Transcript.from_json(open_f.read())  # type: ignore

            # Get cleaned sentences by removing stop words
            cleaned_sentences: List[SentenceManager] = [
                SentenceManager(
                    original_details=sentence,
                    cleaned_text=string_utils.clean_text(
                        sentence.text,
                        clean_stop_words=True,
                    ),
                    n_grams=[],
                )
                for sentence in transcript.sentences
            ]

            # Filter any empty sentences
            cleaned_sentences = [
                sm for sm in cleaned_sentences if len(sm.cleaned_text) > 1
            ]

            # Get all n_grams for each sentence
            for sm in cleaned_sentences:
                sm.n_grams = [*ngrams(sm.cleaned_text.split(), n_grams)]

            # Init stemmer and stem all grams
            stemmer = SnowballStemmer("english")
            for sm in cleaned_sentences:
                for n_gram in sm.n_grams:
                    # Join into a single n gram
                    unstemmed_n_gram = " ".join(n_gram)

                    # Join, lower, and stem the n gram
                    stemmed_n_gram = " ".join(
                        [stemmer.stem(term.lower()) for term in n_gram]
                    )

                    # Get context span
                    # Because ngrams function, cleaning, and split may affect the exact
                    # matchup of the term, use fuzzy diff to find closest
                    closest_term = ""
                    closest_term_score = 0.0
                    for term in sm.original_details.text.split():
                        similarity = rapidfuzz.fuzz.QRatio(term, n_gram[0])
                        if similarity > closest_term_score:
                            closest_term = term
                            closest_term_score = similarity

                    # Get surrounding terms
                    terms = sm.original_details.text.split()
                    target_term_index = terms.index(closest_term)

                    # Get left and right indices
                    left_i = 0 if target_term_index - 8 < 0 else target_term_index - 8
                    right_i = (
                        None
                        if target_term_index + 7 >= len(terms) - 1
                        else target_term_index + 7
                    )
                    context_span = " ".join(terms[left_i:right_i])

                    # Append ellipsis
                    if left_i != 0:
                        context_span = f"... {context_span}"
                    if right_i is not None:
                        context_span = f"{context_span}..."

                    # Append to event list
                    event_n_grams.append(
                        ContextualizedGram(
                            event_id=event_transcripts.event_id,
                            event_datetime=event_transcripts.event_datetime,
                            unstemmed_gram=unstemmed_n_gram,
                            stemmed_gram=stemmed_n_gram,
                            context_span=context_span,
                        )
                    )

    return event_n_grams
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from gcsfs import GCSFileSystem
from flask import Flask, request

app = Flask(__name__)
fs = GCSFileSystem(project='blog-180218')
warnings.simplefilter(action='ignore', category=FutureWarning)


def get_current_dt():
    """
    Current datetime in CST
    """
    return pytz.utc.localize(datetime.datetime.utcnow(), is_dst=None).astimezone(
        pytz.timezone('America/Chicago'))


def rotowire_scrape(dt=None, test=False):
    """
def build(cls) -> DirectIngestGCSFileSystem:
    return DirectIngestGCSFileSystemImpl(
        GCSFileSystem(project=metadata.project_id(), cache_timeout=GCSFS_NO_CACHING))
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
def fs(self):
    from gcsfs import GCSFileSystem

    return GCSFileSystem(**self.fs_args)
def fs(self):
    from gcsfs import GCSFileSystem

    return GCSFileSystem(**self.login_info, consistency=None)