def resource_exists(uri: Optional[str], **kwargs: str) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existance for.

    Returns
    -------
    status: bool
        The validation status.
    """
    # A missing URI is valid by contract.
    if uri is None:
        return True

    # Google Cloud Storage resource (gsutil form or JSON API URL)
    if uri.startswith(("gs://", "https://storage.googleapis")):
        if uri.startswith("https://storage.googleapis"):
            uri = convert_gcs_json_url_to_gsutil_form(uri)
            # Empty string means the URL was not convertible to gsutil
            # form, so existence cannot be confirmed.
            if uri == "":
                return False

        if kwargs.get("google_credentials_file"):
            creds_fs = GCSFileSystem(
                token=str(kwargs.get("google_credentials_file", "anon")))
            return creds_fs.exists(uri)

        # No credentials supplied: attempt anonymous access and treat any
        # failure as "does not exist".
        try:
            return GCSFileSystem(token="anon").exists(uri)
        except Exception:
            return False

    # Generic HTTP(S) remote resource: probe with a HEAD request.
    if uri.startswith("http"):
        try:
            head_response = requests.head(uri)
            return head_response.status_code == requests.codes.ok
        except requests.exceptions.SSLError:
            return False

    # Fall back to whatever filesystem fsspec resolves for this URI.
    try:
        resolved_fs, resolved_path = url_to_fs(uri)
        return resolved_fs.exists(resolved_path)
    except Exception:
        return False
def _get_file_to_upload(
    path: str,
    fs: gcsfs.GCSFileSystem,
    url: str,
    pdf_name: str,
    always_download: bool,
    post_data: Dict,
    verify_ssl: bool,
) -> Optional[str]:
    """This function checks first whether it needs to download, and then
    returns the locally downloaded pdf"""
    local_path = None

    # Skip the download entirely when the file already exists remotely,
    # unless the caller forces a re-download.
    if always_download or not fs.exists(path):
        # POST when form data is supplied; otherwise a plain GET.
        if post_data:
            response = requests.post(url, data=post_data, verify=verify_ssl)
        else:
            response = requests.get(url, verify=verify_ssl)

        if response.status_code != 200:
            raise ScrapeAggregateError(
                "Could not download file {}".format(pdf_name))

        local_path = os.path.join(tempfile.gettempdir(), pdf_name)
        # Need to use content since PDF needs to write raw bytes.
        with open(local_path, "wb") as out_file:
            out_file.write(response.content)

    return local_path
def open_and_combine_lat_lon_data(folder, tiles=None):
    """
    Load lat lon data stored as 10x10 degree tiles in folder.

    If tiles is None, load all data available.
    If no file is available, return None.

    Parameters
    ----------
    folder : str
        GCS prefix containing ``<tile>.zarr`` stores.
    tiles : list of str, optional
        Tile names to load; when falsy, every entry listed under
        ``folder`` is used.

    Returns
    -------
    xarray.Dataset or None
        Combined, rechunked dataset, or None when no tile exists.
    """
    fs = GCSFileSystem(cache_timeout=0)
    if not tiles:
        # Derive tile names from the store listing (strip dir entries
        # and the .zarr extension).
        tiles = [
            os.path.splitext(os.path.split(path)[-1])[0]
            for path in fs.ls(folder)
            if not path.endswith('/')
        ]

    uris = [f'{folder}{tile}.zarr' for tile in tiles]
    ds_list = []
    for uri in uris:
        if fs.exists(uri):
            da = open_zarr_file(uri)
            # Normalize both coordinates to ascending order so that
            # combine_by_coords can align the tiles.
            if da.lat[0] > da.lat[-1]:
                da = da.reindex(lat=da.lat[::-1])
            if da.lon[0] > da.lon[-1]:
                # BUG FIX: previously this reindexed ``lat`` with lon
                # values (``reindex(lat=da.lon[::-1])``), corrupting the
                # dataset; the longitude axis must be flipped instead.
                da = da.reindex(lon=da.lon[::-1])
            ds_list.append(da)

    if len(ds_list) > 0:
        ds = xr.combine_by_coords(
            ds_list, combine_attrs="drop_conflicts"
        ).chunk({'lat': 2000, 'lon': 2000})
        return ds

    # print(f'No data available at {folder} for tiles {tiles}')
    return None
class GcsUnstructuredProvider(UnstructuredStorageProvider):
    """This class allows you to upload arbitrary bytes to GCS.
    They will be stored under bucket_name/base_path/filename
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: Optional[str] = None,
    ) -> None:
        """
        Parameters
        ----------
        project:
            GCP project that owns the bucket.
        bucket_name:
            Target GCS bucket.
        base_path:
            Prefix under which all blobs are stored.
        token:
            Credentials token forwarded to GCSFileSystem; None selects
            the library's default credential resolution.
        """
        super().__init__()
        self.project = project
        self.bucket_name = bucket_name
        self.base_path = base_path
        self.token = token
        # Path template consumed by store_blob via str.format().
        # BUG FIX: the braces must be escaped ({{filename}}) so the
        # f-string emits a literal "{filename}" placeholder rather than
        # trying to evaluate an undefined name at construction time.
        self.base_path = f"{bucket_name}/{base_path}/{{filename}}"

        # The set of all filenames ever uploaded, checked before uploading
        self.file_name_cache: Set[str] = set()

        self.logger = logging.getLogger("openwpm")

    async def init(self) -> None:
        """Open the GCS filesystem handle with read/write access."""
        await super(GcsUnstructuredProvider, self).init()
        self.file_system = GCSFileSystem(
            project=self.project, token=self.token, access="read_write"
        )

    async def store_blob(
        self, filename: str, blob: bytes, overwrite: bool = False
    ) -> None:
        """Upload ``blob`` to ``bucket_name/base_path/filename``.

        Skips the upload when the filename was already uploaded in this
        session or the target object exists remotely, unless
        ``overwrite`` is True.
        """
        target_path = self.base_path.format(filename=filename)
        if not overwrite and (
            filename in self.file_name_cache
            or self.file_system.exists(target_path)
        ):
            self.logger.info(
                "Not saving out file %s as it already exists", filename
            )
            return
        self.file_system.start_transaction()
        with self.file_system.open(target_path, mode="wb") as f:
            f.write(blob)
        self.file_system.end_transaction()
        self.file_name_cache.add(filename)

    async def flush_cache(self) -> None:
        """No-op: blobs are written straight through in store_blob."""
        pass

    async def shutdown(self) -> None:
        """No-op: no persistent resources are held by this provider."""
        pass