def __download_climate_observations_data(remote_file: str) -> bytes:
    """
    Download a station data archive and return its product file as bytes.

    The archive is fetched with ``download_file_from_dwd`` and scanned for the
    first member whose name starts with ``PRODUCT_FILE_IDENTIFIER``.

    Args:
        remote_file: remote path of the zip archive to download

    Returns:
        raw content of the first matching archive member

    Raises:
        InvalidURL: the download itself raised ``InvalidURL``
        FailedDownload: the download failed for any other reason
        ProductFileNotFound: no archive member starts with the product identifier
        BadZipFile: the payload could not be read as a zip archive
    """
    try:
        zip_file = download_file_from_dwd(remote_file)
    except InvalidURL as e:
        raise InvalidURL(
            f"Error: the station data {remote_file} could not be reached."
        ) from e
    except Exception as e:
        # Keep the underlying error as explicit cause for easier debugging.
        raise FailedDownload(f"Download failed for {remote_file}") from e

    try:
        # Context managers close the archive and the member handle on every
        # path, including the "not found" and the error paths.
        with ZipFile(zip_file) as zip_file_opened:
            for file in zip_file_opened.namelist():
                # First member carrying the product prefix wins.
                if file.startswith(PRODUCT_FILE_IDENTIFIER):
                    with zip_file_opened.open(file) as product_file:
                        return product_file.read()

        # If whatsoever no file was found and returned already throw exception
        raise ProductFileNotFound(
            f"The archive of {remote_file} does not hold a 'produkt' file."
        )
    except BadZipFile as e:
        raise BadZipFile(
            f"The archive of {remote_file} seems to be corrupted."
        ) from e
def _download_stations() -> Tuple[BytesIO, int]:
    """
    Download the ECCC station list.

    The Google Drive copy is tried first. Only when that attempt raises, the
    gzip-compressed copy hosted on GitHub is downloaded and decompressed.

    :return: CSV payload, source identifier (0 = Google Drive, 1 = GitHub)
    :raises FailedDownload: when no payload could be acquired
    """
    http_url = (
        "https://github.com/earthobservations/testdata/raw/main/ftp.tor.ec.gc.ca/Pub/"
        "Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv.gz"
    )
    gdrive_url = "https://drive.google.com/uc?id=1HDRnj41YBWpMioLPwAFiLlK4SK8NV72C"

    source = None
    payload = None

    try:
        payload = download_file(gdrive_url, CacheExpiry.METAINDEX)
    except Exception:
        log.exception(f"Unable to access Google drive server at {gdrive_url}")

        # Primary source failed: use the compressed mirror instead.
        try:
            compressed = download_file(http_url, CacheExpiry.METAINDEX)
            with gzip.open(compressed, mode="rb") as unpacked:
                payload = BytesIO(unpacked.read())
        except Exception:
            log.exception(f"Unable to access HTTP server at {http_url}")
        else:
            source = 1
    else:
        source = 0

    if payload is None:
        raise FailedDownload("Unable to acquire ECCC stations_result list")

    return payload, source
def _extract_radolan_data(
    date_time: datetime, archive_in_bytes: BytesIO
) -> RadarResult:
    """
    Function used to extract RADOLAN_CDC file for the requested datetime
    from the downloaded archive.

    The payload is first treated as a gzip-compressed tar archive (case for
    historical data) and searched for a member whose name contains the
    formatted datetime. If the decompressed payload cannot be read as a tar
    archive, the decompressed payload itself is returned (case for recent data).

    Args:
        date_time: requested datetime of RADOLAN
        archive_in_bytes: downloaded archive of RADOLAN file

    Returns:
        RadarResult holding the file content, the requested datetime and
        the name of the extracted file

    Raises:
        FileNotFoundError: the tar archive holds no usable member for the datetime
        FailedDownload: an EOFError occurred while reading the archive
    """
    # Need string of datetime to check if one of the files in the archive contains
    # the requested datetime
    date_time_string = date_time.strftime(DatetimeFormat.ymdhm.value)

    # First try to unpack archive from archive (case for historical data)
    try:
        # Have to seek(0) as the archive might be reused
        archive_in_bytes.seek(0)

        with gzip.GzipFile(fileobj=archive_in_bytes, mode="rb") as gz_file:
            file_in_archive = BytesIO(gz_file.read())

            with tarfile.open(fileobj=file_in_archive) as tar_file:
                for file in tar_file.getmembers():
                    if date_time_string not in file.name:
                        continue

                    extracted = tar_file.extractfile(file)
                    if extracted is None:
                        # extractfile() yields None for members that are not
                        # regular files or links (e.g. directories); skip them
                        # instead of failing with an AttributeError.
                        continue

                    return RadarResult(
                        data=BytesIO(extracted.read()),
                        timestamp=date_time,
                        filename=file.name,
                    )

                raise FileNotFoundError(
                    f"RADOLAN file for {date_time_string} not found."
                )  # pragma: no cover

    except EOFError as ex:
        raise FailedDownload(
            f"RADOLAN file for {date_time_string} is invalid: {ex}"
        ) from ex  # pragma: no cover

    # Otherwise if there's an error the data is from recent time period and only has to
    # be unpacked once
    except tarfile.ReadError:
        # Seek again for reused purpose
        archive_in_bytes.seek(0)

        with gzip.GzipFile(fileobj=archive_in_bytes, mode="rb") as gz_file:
            return RadarResult(
                data=BytesIO(gz_file.read()),
                timestamp=date_time,
                filename=gz_file.name,
            )
def _download_climate_observations_data_parallel(
    remote_file: Union[str, Path]
) -> BytesIO:
    """
    Download a station data archive and return its product file in memory.

    The archive is fetched with ``download_file_from_dwd`` for
    ``DWDCDCBase.CLIMATE_OBSERVATIONS`` and scanned for the first member whose
    name starts with ``PRODUCT_FILE_IDENTIFIER``.

    Args:
        remote_file: path of the zip archive that should be downloaded

    Returns:
        content of the first matching archive member wrapped in a BytesIO

    Raises:
        InvalidURL: the download itself raised ``InvalidURL``
        FailedDownload: the download failed for any other reason
        ProductFileNotFound: no archive member starts with the product identifier
        BadZipFile: the payload could not be read as a zip archive
    """
    try:
        zip_file = download_file_from_dwd(remote_file, DWDCDCBase.CLIMATE_OBSERVATIONS)
    except InvalidURL as e:
        raise InvalidURL(
            f"Error: the station data {remote_file} couldn't be reached."
        ) from e
    except Exception as e:
        # Keep the underlying error as explicit cause for easier debugging.
        raise FailedDownload(f"Download failed for {remote_file}") from e

    try:
        # Context managers close the archive and the member handle on every
        # path, including the "not found" and the error paths.
        with ZipFile(zip_file) as zip_file_opened:
            for file in zip_file_opened.namelist():
                # First member carrying the product prefix wins.
                if file.startswith(PRODUCT_FILE_IDENTIFIER):
                    with zip_file_opened.open(file) as product_file:
                        return BytesIO(product_file.read())

        # If whatsoever no file was found and returned already throw exception
        raise ProductFileNotFound(
            f"The archive of {remote_file} does not hold a 'produkt' file."
        )
    except BadZipFile as e:
        raise BadZipFile(
            f"The archive of {remote_file} seems to be corrupted."
        ) from e
def __download_climate_observations_data(remote_file: str) -> bytes:
    """
    Download a station data archive and return its single product file as bytes.

    The archive is fetched with ``download_file`` (cache TTL
    ``CacheExpiry.FIVE_MINUTES``), opened as a ``ZipFileSystem`` and searched
    with the glob pattern ``produkt*``.

    Args:
        remote_file: URL of the zip archive to download

    Returns:
        raw content of the product file

    Raises:
        InvalidURL: the download itself raised ``InvalidURL``
        FailedDownload: the download failed for any other reason
        BadZipFile: the payload could not be opened as a zip archive
        ProductFileNotFound: the glob did not match exactly one file
    """
    try:
        file = download_file(remote_file, ttl=CacheExpiry.FIVE_MINUTES)
    except InvalidURL as e:
        raise InvalidURL(f"Error: the station data {remote_file} could not be reached.") from e
    except Exception as e:
        # Keep the underlying error as explicit cause for easier debugging.
        raise FailedDownload(f"Download failed for {remote_file}") from e

    try:
        zfs = ZipFileSystem(file)
    except BadZipFile as e:
        raise BadZipFile(f"The archive of {remote_file} seems to be corrupted.") from e

    product_file = zfs.glob("produkt*")

    # Zero matches and multiple matches are both rejected with the same error.
    if len(product_file) != 1:
        raise ProductFileNotFound(f"The archive of {remote_file} does not hold a 'produkt' file.")

    # Close the member handle deterministically instead of leaving it to the GC.
    with zfs.open(product_file[0]) as product:
        return product.read()
def _download_stations() -> bytes:
    """
    Download station list from ECCC FTP server.

    The FTP server is tried first. Only when that attempt raises, the
    gzip-compressed copy hosted on GitHub is downloaded and decompressed.

    :return: CSV payload
    :raises FailedDownload: when neither source delivered a payload
    """

    ftp_url = (
        "ftp://*****:*****@ftp.tor.ec.gc.ca"
        "/Pub/Get_More_Data_Plus_de_donnees/Station Inventory EN.csv"
    )

    http_url = (
        "https://raw.githubusercontent.com/earthobservations/testdata"
        "/main/ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees"
        "/Station%20Inventory%20EN.csv.gz"
    )

    payload = None

    # Try original source.
    try:
        # Session creation lives inside the try block so that a failure while
        # setting it up also triggers the fallback.
        session = FTPSession()
        response = session.retr(ftp_url)
        payload = response.content
    except Exception:
        log.exception(f"Unable to access FTP server at {ftp_url}")

        # Fall back to different source.
        try:
            # Without a timeout, requests may block forever on a stalled server.
            response = requests.get(http_url, timeout=30)
            response.raise_for_status()
            with gzip.open(BytesIO(response.content), mode="rb") as f:
                payload = f.read()
        except Exception:
            log.exception(f"Unable to access HTTP server at {http_url}")

    if payload is None:
        raise FailedDownload("Unable to acquire ECCC stations list")

    return payload