Example #1
from typing import Optional

import requests
from fsspec.core import url_to_fs
from gcsfs import GCSFileSystem

# convert_gcs_json_url_to_gsutil_form is a project-local helper, assumed in scope.


def resource_exists(uri: Optional[str], **kwargs: str) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.

    Returns
    -------
    status: bool
        The validation status.
    """

    if uri is None:
        return True

    if uri.startswith("gs://") or uri.startswith("https://storage.googleapis"):
        # Convert to gsutil form if necessary
        if uri.startswith("https://storage.googleapis"):
            uri = convert_gcs_json_url_to_gsutil_form(uri)

            # If uri is not convertible to gsutil form we can't confirm
            if uri == "":
                return False

        if kwargs.get("google_credentials_file"):
            fs = GCSFileSystem(token=str(kwargs["google_credentials_file"]))
            return fs.exists(uri)

        # Can't check GCS resources without creds file
        else:
            try:
                anon_fs = GCSFileSystem(token="anon")
                return anon_fs.exists(uri)
            except Exception:
                return False

    # Is HTTP remote resource
    elif uri.startswith("http"):
        try:
            # Use HEAD request to check if remote resource exists
            r = requests.head(uri)

            return r.status_code == requests.codes.ok
        except requests.exceptions.SSLError:
            return False

    # Get any filesystem and try
    try:
        fs, path = url_to_fs(uri)
        return fs.exists(path)
    except Exception:
        return False
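
A minimal usage sketch for context; the local path, URL, bucket, and credentials file below are hypothetical:

assert resource_exists(None)                         # None is explicitly valid
print(resource_exists("/tmp/data.csv"))              # local file via fsspec
print(resource_exists("https://example.com/x.csv"))  # HTTP HEAD check
print(resource_exists(
    "gs://my-bucket/data.csv",
    google_credentials_file="/path/to/creds.json",   # hypothetical credentials
))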
Example #2
import os
import tempfile
from typing import Dict, Optional

import gcsfs
import requests

# ScrapeAggregateError is a project-local exception type, assumed in scope.


def _get_file_to_upload(
    path: str,
    fs: gcsfs.GCSFileSystem,
    url: str,
    pdf_name: str,
    always_download: bool,
    post_data: Dict,
    verify_ssl: bool,
) -> Optional[str]:
    """This function checks first whether it needs to download, and then
    returns the locally downloaded pdf"""
    # Download when forced, or when the file is not already present in GCS
    path_to_download = None
    if always_download or not fs.exists(path):
        if post_data:
            response = requests.post(url, data=post_data, verify=verify_ssl)
        else:
            response = requests.get(url, verify=verify_ssl)
        if response.status_code == 200:
            path_to_download = os.path.join(tempfile.gettempdir(), pdf_name)
            with open(path_to_download, "wb") as f:
                # Need to use content since PDF needs to write raw bytes.
                f.write(response.content)
        else:
            raise ScrapeAggregateError(
                "Could not download file {}".format(pdf_name))
    return path_to_download
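
A hedged usage sketch: download if needed, then upload with gcsfs. The project, bucket path, and source URL are illustrative assumptions; fs.put is the standard fsspec upload call.

fs = gcsfs.GCSFileSystem(project="my-project")      # hypothetical project
local_path = _get_file_to_upload(
    path="my-bucket/reports/report.pdf",            # hypothetical GCS path
    fs=fs,
    url="https://example.com/report.pdf",           # hypothetical source URL
    pdf_name="report.pdf",
    always_download=False,
    post_data={},
    verify_ssl=True,
)
if local_path is not None:
    fs.put(local_path, "my-bucket/reports/report.pdf")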
Example #3
import os

import xarray as xr
from gcsfs import GCSFileSystem

# open_zarr_file is a project-local helper, assumed in scope.


def open_and_combine_lat_lon_data(folder, tiles=None):
    """
    Load lat lon data stored as 10x10 degree tiles in folder
    If tiles is none, load all data available
    If no file is available, return None
    """
    fs = GCSFileSystem(cache_timeout=0)
    if not tiles:
        tiles = [
            os.path.splitext(os.path.split(path)[-1])[0]
            for path in fs.ls(folder) if not path.endswith('/')
        ]

    uris = [f'{folder}{tile}.zarr' for tile in tiles]
    ds_list = []
    for uri in uris:
        if fs.exists(uri):
            da = open_zarr_file(uri)
            if da.lat[0] > da.lat[-1]:
                da = da.reindex(lat=da.lat[::-1])
            if da.lon[0] > da.lon[-1]:
                da = da.reindex(lon=da.lon[::-1])
            ds_list.append(da)

    if len(ds_list) > 0:
        ds = xr.combine_by_coords(
            ds_list, combine_attrs="drop_conflicts"
        ).chunk({'lat': 2000, 'lon': 2000})
        return ds
    # print(f'No data available at {folder} for tiles {tiles}')
    return None
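
A short usage sketch; the folder URI and tile name are hypothetical:

ds = open_and_combine_lat_lon_data("gs://my-bucket/tiles/", tiles=["10N_050W"])
if ds is not None:
    subset = ds.sel(lat=slice(0, 10), lon=slice(-50, -40))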
Example #4
import logging
from typing import Optional, Set

from gcsfs import GCSFileSystem

# UnstructuredStorageProvider is a project-local base class, assumed in scope.


class GcsUnstructuredProvider(UnstructuredStorageProvider):
    """This class allows you to upload arbitrary bytes to GCS.
    They will be stored under bucket_name/base_path/filename
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: Optional[str] = None,
    ) -> None:
        super().__init__()
        self.project = project
        self.bucket_name = bucket_name
        self.token = token
        # Path template, expanded per upload to bucket_name/base_path/filename
        self.base_path = f"{bucket_name}/{base_path}/{{filename}}"

        self.file_name_cache: Set[str] = set()
        """The set of all filenames ever uploaded, checked before uploading"""
        self.logger = logging.getLogger("openwpm")

    async def init(self) -> None:
        await super().init()
        self.file_system = GCSFileSystem(project=self.project,
                                         token=self.token,
                                         access="read_write")

    async def store_blob(self,
                         filename: str,
                         blob: bytes,
                         overwrite: bool = False) -> None:
        target_path = self.base_path.format(filename=filename)
        if not overwrite and (filename in self.file_name_cache
                              or self.file_system.exists(target_path)):
            self.logger.info("Not saving out file %s as it already exists",
                             filename)
            return
        self.file_system.start_transaction()

        with self.file_system.open(target_path, mode="wb") as f:
            f.write(blob)

        self.file_system.end_transaction()

        self.file_name_cache.add(filename)

    async def flush_cache(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass
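
A hedged usage sketch, driven by asyncio; the project, bucket, and token path are hypothetical:

import asyncio

async def main() -> None:
    provider = GcsUnstructuredProvider(
        project="my-project",           # hypothetical GCP project
        bucket_name="my-bucket",        # hypothetical bucket
        base_path="blobs",
        token="/path/to/creds.json",    # hypothetical service-account key
    )
    await provider.init()
    await provider.store_blob("page.html", b"<html></html>")
    await provider.shutdown()

asyncio.run(main())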