def query(self, start_date: str, end_date: str, product: str,
          provider: str = 'LPDAAC_ECS', bbox: List = None) -> List[Dict]:
    """
    Search the CMR database for spectral MODIS tiles matching a temporal range,
    defined by a start date and an end date. Returns metadata containing the URL
    of each image.

    Parameters
    ----------
    start_date: string
        Start date yyyy-mm-dd
    end_date: string
        End date yyyy-mm-dd
    product: string
        Product name
    provider: string
        Provider (default is 'LPDAAC_ECS')
    bbox: List[float]
        Bounding box [lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat]

    Returns
    -------
    List[Dict]
        List of granules
    """
    q = GranuleQuery()
    # `products` is assumed to be a module-level dict mapping product names to
    # metadata, e.g. {'MCD43A4': {'products': ['MCD43A4.006'], 'day_offset': 8}}
    prod, ver = products[product]['products'][0].split('.')
    q.short_name(prod).version(ver)
    q.temporal(f"{start_date}T00:00:00Z", f"{end_date}T23:59:59Z")
    if bbox is not None and len(bbox) >= 4:
        q.bounding_box(*bbox[:4])
    _granules = q.get_all()

    # filter dates
    day_offset = products[product].get('day_offset', 0)

    granules = []
    for gran in _granules:
        # CMR uses day 1 of window - correct this to be middle of window
        date = (dateparser(gran['time_start'].split('T')[0])
                + datetime.timedelta(days=day_offset)).date()
        if dateparser(start_date).date() <= date <= dateparser(end_date).date():
            granules.append(gran)

    logger.info("%s granules found within %s - %s", len(granules), start_date, end_date)
    return granules
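# Usage sketch for `query` (assumptions: `searcher` is a hypothetical instance
# of the class owning this method, and the module-level `products` dict holds
# the queried product; product name, dates, and bbox are illustrative):
#
#     granules = searcher.query(
#         start_date="2020-01-01",
#         end_date="2020-01-31",
#         product="MCD43A4",
#         bbox=[33.0, -3.0, 42.0, 5.0],
#     )
#     urls = [g["links"][0]["href"] for g in granules]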
class ModisQuery(object):
    """Class for querying and downloading MODIS data.

    The user can create a query and send it to NASA's CMR servers.
    The response can either be printed to the console or passed to
    the `download` method to fetch the resulting HDF files to local disk.
    """

    def __init__(self,
                 products: List[str],
                 aoi: List[Union[float, int]] = None,
                 begindate: datetime = None,
                 enddate: datetime = None,
                 tile_filter: List[str] = None,
                 version: str = "006") -> None:
        """Initialize instance of ModisQuery class.

        This creates an instance of `ModisQuery` with the basic query
        parameters. The `aoi` needs to be a list of either 2 or 4 coordinates
        as `float` or `int`, depending on whether a point or a bounding box
        is requested. The `tile_filter` needs to be specified as a list of
        MODIS tile IDs in the format hXXvXX (e.g. `h20v08`).

        Args:
            products (List[str]): List of product codes to be queried / downloaded.
            aoi (List[Union[float, int]]): Area of interest (point as lon/lat
                or bounding box as xmin, ymin, xmax, ymax).
            begindate (datetime): Start date for query.
            enddate (datetime): End date for query.
            tile_filter (List[str]): List of tiles to be queried / downloaded
                (refines initial results).
            version (str): MODIS collection version.

        Raises:
            AssertionError: If no product code is supplied.
            ValueError: If `len(aoi)` is not in `[2, 4]`.
        """

        assert products, "No product IDs supplied!"

        self.begin = begindate
        self.end = enddate
        self.tile_filter = tile_filter
        self.api = GranuleQuery()

        # construct query
        self.api.parameters(
            short_name=products,
            version=version,
            temporal=(begindate, enddate)
        )

        if aoi is not None:
            if len(aoi) == 2:
                self.api.point(*aoi)
            elif len(aoi) == 4:
                self.api.bounding_box(*aoi)
            else:
                raise ValueError("Expected point or bounding box as AOI")

    def search(self, match_begin: bool = True) -> None:
        """Send query to MODIS CMR servers.

        Constructs the query from parameters passed to `__init__`
        and sends it to the NASA servers. The returned results are
        stored in a class variable.

        To deal with overlapping date ranges of composite products,
        the specified start date can be matched to the MODIS native timestamp.

        Args:
            match_begin (bool): Flag to match begin date with native MODIS
                timestamp (no data with a timestamp earlier than begindate
                is allowed).
        """

        # init results dict
        self.results = {}

        # if no dates were supplied, there is nothing to match against
        if self.begin is None and self.end is None:
            match_begin = False

        log.debug("Starting query")

        # get all results
        results_all = self.api.get_all()

        log.debug("Query complete, filtering results")

        for result in self._parse_response(results_all):

            # skip tiles outside of filter
            if self.tile_filter and result["tile"]:
                if result["tile"] not in self.tile_filter:
                    continue

            # enforce dates if required
            if match_begin:
                if self.begin is not None:
                    if result["time_start"] < self.begin.date():
                        continue
                if self.end is not None:
                    if result["time_start"] > self.end.date():
                        continue

            filename = result["filename"]
            del result["filename"]
            self.results.update({filename: result})

        # final results
        self.nresults = len(self.results)

        log.debug("Search complete. Total results: %s, filtered: %s",
                  len(results_all), self.nresults)

    @staticmethod
    def _parse_response(query: List[dict]) -> dict:
        """Generator for parsing the API response.

        Args:
            query (List[dict]): Query returned by CMR API.

        Yields:
            dict: Parsed entry as dict.
""" tile_regxp = re.compile(r".+(h\d+v\d+).+") for entry in query: entry_parsed = dict( filename=entry["producer_granule_id"], time_start=pd.Timestamp(entry["time_start"]).date(), time_end=pd.Timestamp(entry["time_end"]).date(), updated=entry["updated"], link=entry["links"][0]["href"], ) try: tile = tile_regxp.search(entry_parsed["filename"]).group(1) except AttributeError: tile = None entry_parsed.update({"tile": tile}) yield entry_parsed @staticmethod def _parse_hdfxml(response): result = {} tree = ElementTree.fromstring(response.content) for entry in tree.iter(tag='GranuleURMetaData'): for datafile in entry.iter(tag="DataFiles"): for datafilecont in datafile.iter(tag="DataFileContainer"): for content in datafilecont: if content.tag in ["Checksum", "FileSize"]: result.update({content.tag: int(content.text)}) return result def _fetch(self, session: SessionWithHeaderRedirection, url: str, destination: Path, overwrite: bool, check: bool, ) -> Tuple[str, Union[None, Exception]]: """Helper function to fetch HDF files Args: session (SessionWithHeaderRedirection): requests session to fetch file. url (str): URL for file. destination (Path): Target directory. overwrite (bool): Overwrite existing. check (bool): Check file size and checksum. Returns: Tuple[str, Union[None, Exception]]: Returns tuple with either (filename, None) for success and (URL, Exception) for error. """ filename = url.split("/")[-1] filename_full = destination.joinpath(filename) if not exists(filename_full) or overwrite: filename_temp = filename_full.with_suffix(".modapedl") try: with session.get(url, stream=True, allow_redirects=True) as response: response.raise_for_status() with open(filename_temp, "wb") as openfile: shutil.copyfileobj(response.raw, openfile, length=16*1024*1024) if check: with session.get(url + ".xml", allow_redirects=True) as response: response.raise_for_status() file_metadata = self._parse_hdfxml(response) # check filesize assert filename_temp.stat().st_size == file_metadata["FileSize"] with open(filename_temp, "rb") as openfile: checksum = cksum(openfile) # check checksum assert checksum == file_metadata["Checksum"] shutil.move(filename_temp, filename_full) except (HTTPError, ConnectionError, AssertionError, FileNotFoundError) as e: try: filename_temp.unlink() except FileNotFoundError: pass return (filename, e) else: log.info("%s exists in target. Please set overwrite to True.", filename_full) return (filename, None) def download(self, targetdir: Path, username: str, password: str, overwrite: bool = False, multithread: bool = False, nthreads: int = 4, max_retries: int = -1, robust: bool = False, ) -> List: """Download MODIS HDF files. This method downloads the MODIS HDF files contained in the server response to the `search` call. This requires NASA earthdata credentials. To speed up download, multiple threads can be used. Args: targetdir (Path): Target directory for files being downloaded. username (str): Earthdata username. password (str): Earthdata password. overwrite (bool): Replace existing files. multithread (bool): Use multiple threads for downloading. nthreads (int): Number of threads. max_retries (int): Maximum number of retries for failed downloads (for no max, set -1). robust (bool): Perform robust downloading (checks file size and checksum). Raises: DownloadError: If one or more errors were encountered during downloading. Returns: List of downloaded MODIS HDF filenames. 
""" # make sure target directory is dir and exists assert targetdir.is_dir() assert targetdir.exists() retry_count = 0 to_download = self.results.copy() downloaded = [] while True: with SessionWithHeaderRedirection(username, password) as session: backoff = min(450, 2**retry_count) retries = Retry(total=5, backoff_factor=backoff, status_forcelist=[502, 503, 504]) session.mount( "https://", HTTPAdapter(pool_connections=nthreads, pool_maxsize=nthreads*2, max_retries=retries) ) if multithread: log.debug("Multithreaded download using %s threads. Warming up connection pool.", nthreads) # warm up pool _ = session.get(list(to_download.values())[0]["link"], stream=True, allow_redirects=True) with ThreadPoolExecutor(nthreads) as executor: futures = [executor.submit(self._fetch, session, values["link"], targetdir, overwrite, robust) for key, values in to_download.items()] downloaded_temp = [x.result() for x in futures] else: log.debug("Serial download") downloaded_temp = [] for _, values in to_download.items(): downloaded_temp.append( self._fetch(session, values["link"], targetdir, overwrite, robust) ) # check if downloads are OK for fid, err in downloaded_temp: if err is None: del to_download[fid] downloaded.append(fid) if to_download: if retry_count < max_retries or max_retries == -1: retry_count += 1 log.debug("Retrying downloads! Files left: %s", len(to_download)) if max_retries > 0: log.debug("Try %s of %s", retry_count, max_retries) continue raise DownloadError(list(to_download.keys())) break return downloaded