def _get_all_granule_pages(session, url, entry_map, max_paging_depth=1000000):
    """Helper function for searching through all pages for a collection.

    Parameters
    -----------
    session: :class:`requests.Session`, optional
        An authenticated Earthdata login session
    url: str
        URL to website
    entry_map: function
        Function for mapping the entries to a desired format
    max_paging_depth: int, optional
        Maximum number of entries to page through before stopping
    """
    page_size = int([q for q in url.split("?")[1].split("&") if "page_size" in q][0].split("=")[1])
    max_pages = int(max_paging_depth / page_size)

    pydict = _get_from_url(url, session).json()
    entries = list(map(entry_map, pydict["feed"]["entry"]))

    for i in range(1, max_pages):
        page_url = url + "&page_num=%d" % (i + 1)
        page_entries = _get_from_url(page_url, session).json()["feed"]["entry"]
        if not page_entries:
            break
        entries.extend(list(map(entry_map, page_entries)))
    return entries
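# Illustrative usage sketch, not part of the original source: it assumes an
# authenticated Earthdata `requests.Session` and a CMR granule query that already
# carries a `page_size` parameter, since the helper parses page_size back out of
# the query string to bound how many pages it will request.
def _example_page_through_granules(session):
    url = CMR_URL + "granules.json?short_name=SPL2SMAP_S&page_size=2000"
    # keep only each granule entry's title
    return _get_all_granule_pages(session, url, entry_map=lambda entry: entry["title"])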
def from_url(cls, url):
    """
    Create podpac Node from a WMS/WCS request.

    Arguments
    ---------
    url : str, dict
        The raw WMS/WCS request url, or a dictionary of query parameters

    Returns
    -------
    :class:`Node`
        A full Node with sub-nodes based on the definition of the node from the URL

    Notes
    -------
    The request can specify the PODPAC node by four different mechanisms:

    * Direct node name: PODPAC will look for an appropriate node in podpac.datalib
    * JSON definition passed using the 'PARAMS' query string: Need to specify the special
      LAYER/COVERAGE value of "%PARAMS%"
    * By pointing at the JSON definition retrievable with an http GET request:
      e.g. by setting LAYER/COVERAGE value to https://my-site.org/pipeline_definition.json
    * By pointing at the JSON definition retrievable from an S3 bucket that the user has access to:
      e.g. by setting LAYER/COVERAGE value to s3://my-bucket-name/pipeline_definition.json
    """
    params = _get_query_params_from_url(url)

    if _get_param(params, "SERVICE") == "WMS":
        layer = _get_param(params, "LAYERS")
    elif _get_param(params, "SERVICE") == "WCS":
        layer = _get_param(params, "COVERAGE")

    d = None
    if layer.startswith("https://"):
        d = _get_from_url(layer).json()
    elif layer.startswith("s3://"):
        parts = layer.split("/")
        bucket = parts[2]
        key = "/".join(parts[3:])
        s3 = S3CacheStore(s3_bucket=bucket)
        s = s3._load(key)
    elif layer == "%PARAMS%":
        s = _get_param(params, "PARAMS")
    else:
        p = _get_param(params, "PARAMS")
        if p is None:
            p = "{}"
        d = OrderedDict({layer.replace(".", "-"): {"node": layer, "attrs": json.loads(p)}})

    if d is None:
        d = json.loads(s, object_pairs_hook=OrderedDict)

    return cls.from_definition(d)
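# Illustrative usage sketch, not part of the original source: builds a Node via
# the "direct node name" mechanism from the Notes above. The layer name below is
# a placeholder, not a guaranteed podpac.datalib node path.
def _example_node_from_wms_url():
    import podpac

    url = "https://example.com/wms?SERVICE=WMS&REQUEST=GetMap&LAYERS=datalib.smap.SMAP"
    return podpac.Node.from_url(url)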
def calibration_data(self):
    cd = _get_from_url(self.station_calibration_url)
    if cd is None:
        raise ConnectionError(
            "COSMOS data cannot be retrieved. Is the site {} down?".format(self.station_calibration_url)
        )
    cd = cd.json()
    cd["items"] = [_convert_str_to_vals(i) for i in cd["items"]]
    return cd
def _property_source_default(self):
    v = _infer_SMAP_product_version("SPL4SMLM", SMAP_BASE_URL(), self.session)
    url = SMAP_BASE_URL() + "/SPL4SMLM.%03d/2015.03.31/" % (v)
    r = _get_from_url(url, session=self.session)
    if not r:
        return "None"
    n = self.file_url_re.search(r.text).group()
    return url + n
def search_granule_json(session=None, entry_map=None, **kwargs):
    """Search for specific files from NASA CMR for a particular collection

    Parameters
    -----------
    session: :class:`requests.Session`, optional
        An authenticated Earthdata login session
    entry_map: function
        A function applied to each individual entry. Could be used to filter out certain data in an entry
    **kwargs: dict
        Additional query string parameters.
        At minimum the provider, provider_id, concept_id, collection_concept_id, short_name, version, or
        entry_title need to be provided for a granule search.

    Returns
    ---------
    list
        Entries for each granule in the collection based on the search terms
    """
    base_url = CMR_URL + "granules.json?"

    if not np.any(
        [
            m in kwargs
            for m in [
                "provider",
                "provider_id",
                "concept_id",
                "collection_concept_id",
                "short_name",
                "version",
                "entry_title",
            ]
        ]
    ):
        raise ValueError(
            "Need to provide either"
            " provider, provider_id, concept_id, collection_concept_id, short_name, version or entry_title"
            " for granule search."
        )

    if "page_size" not in kwargs:
        kwargs["page_size"] = "2000"

    if entry_map is None:
        entry_map = lambda x: x

    query_string = "&".join([k + "=" + str(v) for k, v in kwargs.items()])

    # use the generic requests module as a stand-in session if none is provided
    if session is None:
        session = requests

    url = base_url + query_string
    if "page_num" not in kwargs:
        entries = _get_all_granule_pages(session, url, entry_map)
    else:
        pydict = _get_from_url(url, session).json()
        entries = list(map(entry_map, pydict["feed"]["entry"]))

    return entries
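# Illustrative usage sketch, not part of the original source: search a collection
# by short_name and reduce each granule entry to its title. "SPL2SMAP_S" and the
# "title" key are examples of a valid short_name and a standard CMR granule entry
# field, not requirements of this function.
def _example_search_granules():
    return search_granule_json(short_name="SPL2SMAP_S", entry_map=lambda entry: entry["title"])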
def raw_data(self):
    _logger.info("Downloading station data from {}".format(self.station_data_url))

    r = _get_from_url(self.station_data_url)
    if r is None:
        raise ConnectionError(
            "COSMOS data cannot be retrieved. Is the site {} down?".format(self.station_data_url)
        )
    return r.text
def _stations_data_raw(self):
    url = self.url + self.stations_url
    r = _get_from_url(url)
    t = r.text

    # Fix the JSON
    t_f = re.sub(r':\s?",', ': "",', t)  # value with a missing quote (': ",' --> ': "",')
    if t_f[-5:] == ",\n]}\n":  # errant comma before the closing "]}"
        t_f = t_f[:-5] + "\n]}\n"

    return t_f
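# Worked sketch, not part of the original source, of what the JSON repair above
# guards against: a value missing its opening quote and a trailing comma before
# the closing "]}" of the feed. The sample text and keys are invented.
def _example_fix_cosmos_json():
    import json
    import re

    broken = '{"items": [\n{"label": ",\n"sitenumber": "101"},\n]}\n'
    fixed = re.sub(r':\s?",', ': "",', broken)   # '"label": ",'  -->  '"label": "",'
    if fixed[-5:] == ",\n]}\n":                  # drop the errant trailing comma
        fixed = fixed[:-5] + "\n]}\n"
    return json.loads(fixed)                     # now parses cleanly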
def site_properties(self):
    r = _get_from_url(self.station_properties_url)
    soup = bs4.BeautifulSoup(r.text, "lxml")
    regex = re.compile("Soil Organic Carbon")
    loc = soup.body.findAll(text=regex)[0].parent.parent
    label, value = loc.findAll("div")
    labels = [l.strip() for l in label.children if "br" not in str(l)]
    values = [l.strip() for l in value.children if "br" not in str(l) and l.strip() != ""]
    properties = {k: v for k, v in zip(labels, values)}

    return _convert_str_to_vals(properties)
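# Illustrative sketch, not part of the original source, of the HTML shape the
# scrape above assumes: two sibling <div>s, one holding <br>-separated labels and
# one holding the matching <br>-separated values. The markup is invented.
def _example_scrape_paired_divs():
    import re
    import bs4

    html = (
        "<html><body><table><tr><td>"
        "<div>Soil Organic Carbon<br/>Site Name</div>"
        "<div>2.5 %<br/>Example Site</div>"
        "</td></tr></table></body></html>"
    )
    soup = bs4.BeautifulSoup(html, "lxml")
    loc = soup.body.findAll(text=re.compile("Soil Organic Carbon"))[0].parent.parent
    label, value = loc.findAll("div")
    labels = [l.strip() for l in label.children if "br" not in str(l)]
    values = [l.strip() for l in value.children if "br" not in str(l) and l.strip() != ""]
    return dict(zip(labels, values))  # -> {"Soil Organic Carbon": "2.5 %", "Site Name": "Example Site"}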
def _stations_data_raw(self):
    url = self.url + self.stations_url
    r = _get_from_url(url)
    if r is None:
        raise ConnectionError(
            "COSMOS data cannot be retrieved. Is the site {} down?".format(url)
        )
    t = r.text

    # Fix the JSON
    t_f = re.sub(r':\s?",', ': "",', t)  # value with a missing quote (': ",' --> ': "",')
    if t_f[-5:] == ",\n]}\n":  # errant comma before the closing "]}"
        t_f = t_f[:-5] + "\n]}\n"

    return t_f
def get_collection_entries(session=None, short_name=None, keyword=None, **kwargs):
    """Uses NASA CMR to retrieve metadata about a collection

    Parameters
    -----------
    session: :class:`requests.Session`, optional
        An authenticated Earthdata login session
    short_name: str, optional
        The short name of the dataset
    keyword: str, optional
        Any keyword search parameters
    **kwargs: str, optional
        Any additional query parameters

    Returns
    ---------
    list
        A list of collection metadata dictionaries

    Examples:
    -----------
    >>> # This makes the following request https://cmr.earthdata.nasa.gov/search/collections.json?short_name=SPL2SMAP_S
    >>> get_collection_id(short_name='SPL2SMAP_S')
    ['C1522341104-NSIDC_ECS']
    """
    base_url = CMR_URL + "collections.json?"

    if short_name is not None:
        kwargs["short_name"] = short_name
    if keyword is not None:
        kwargs["keyword"] = keyword

    query_string = "&".join([k + "=" + v for k, v in kwargs.items()])

    # use generic requests session if `session` is not defined
    if session is None:
        session = requests

    pydict = _get_from_url(base_url + query_string, session).json()

    entries = pydict["feed"]["entry"]

    return entries
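# Minimal sketch, not part of the original source, of how the ids shown in the
# docstring example could be pulled from the entries returned here, assuming the
# standard CMR collection entry layout where each entry carries an "id" field.
def _example_collection_ids(short_name="SPL2SMAP_S"):
    entries = get_collection_entries(short_name=short_name)
    return [entry["id"] for entry in entries]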
def available_dates(self):
    """Available dates in SMAP date format, sorted."""
    url = "/".join([self.base_url, "%s.%03d" % (self.product, self.version)])
    r = _get_from_url(url, self.session)
    if r is None:
        _logger.warning("Could not contact {} to retrieve source coordinates".format(url))
        return []
    soup = BeautifulSoup(r.text, "lxml")
    matches = [self.date_url_re.match(a.get_text()) for a in soup.find_all("a")]
    dates = [m.group() for m in matches if m]
    return dates
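# Illustrative sketch, not part of the original source: the product page is a
# directory listing whose link text is a date folder such as "2015.03.31/".
# date_url_re is not shown here, so a plausible pattern is assumed for the demo.
def _example_parse_date_listing():
    import re
    from bs4 import BeautifulSoup

    html = '<a href="2015.03.31/">2015.03.31/</a><a href="README.txt">README.txt</a>'
    date_url_re = re.compile(r"\d{4}\.\d{2}\.\d{2}")  # assumed pattern
    soup = BeautifulSoup(html, "lxml")
    matches = [date_url_re.match(a.get_text()) for a in soup.find_all("a")]
    return [m.group() for m in matches if m]  # -> ["2015.03.31"]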
def site_properties(self):
    r = _get_from_url(self.station_properties_url)
    if r is None:
        raise ConnectionError(
            "COSMOS data cannot be retrieved. Is the site {} down?".format(self.station_properties_url)
        )
    soup = bs4.BeautifulSoup(r.text, "lxml")
    regex = re.compile("Soil Organic Carbon")
    loc = soup.body.findAll(text=regex)[0].parent.parent
    label, value = loc.findAll("div")
    labels = [l.strip() for l in label.children if "br" not in str(l)]
    values = [l.strip() for l in value.children if "br" not in str(l) and l.strip() != ""]
    properties = {k: v for k, v in zip(labels, values)}

    return _convert_str_to_vals(properties)
def _infer_SMAP_product_version(product, base_url, session):
    """Helper function to automatically infer the version number of SMAP products in case user did not
    specify a version, or the version changed

    Parameters
    ------------
    product: str
        Name of the SMAP product (e.g. one of SMAP_PRODUCT_DICT.keys())
    base_url: str
        URL to base SMAP product page
    session: :class:`requests.Session`
        Authenticated EDS session. Generally returned from :class:`SMAPSessionMixin`.
    """
    r = _get_from_url(base_url, session=session)
    if r:
        # the product listing contains entries like "<product>.004"; take the 3 digits after the "."
        m = re.search(product, r.text)
        return int(r.text[m.end() + 1:m.end() + 4])
    return int(SMAP_PRODUCT_MAP.sel(product=product, attr="default_version").item())
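# Worked sketch, not part of the original source, of the version slicing above:
# the product listing contains names like "SPL4SMLM.004", so the three characters
# after the "." that follows the product name give the zero-padded version.
def _example_infer_version():
    import re

    text = '<a href="SPL4SMLM.004/">SPL4SMLM.004/</a>'  # invented listing snippet
    m = re.search("SPL4SMLM", text)
    return int(text[m.end() + 1:m.end() + 4])  # -> 4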
def available_coords_sources(self):
    """Read NSIDC site for available coordinate sources

    Returns
    -------
    np.ndarray
        Available times of sources in the folder
    np.ndarray
        Available lat lon coordinates of sources in the folder, None if empty
    np.ndarray
        The urls of the sources

    Raises
    ------
    RuntimeError
        If the NSIDC website cannot be accessed
    """
    r = _get_from_url(self.folder_url, self.session)
    if r is None:
        _logger.warning("Could not contact {} to retrieve source coordinates".format(self.folder_url))
        return np.array([]), None, np.array([])
    soup = BeautifulSoup(r.text, "lxml")
    a = soup.find_all("a")
    file_regex = self.file_url_re
    file_regex2 = self.file_url_re2
    date_time_regex = self.date_time_url_re
    date_regex = self.date_url_re
    latlon_regex = self.latlon_url_re

    times = []
    latlons = []
    sources = []
    for aa in a:
        t = aa.get_text().strip("\n")
        if "h5.iso.xml" in t:
            continue
        m = file_regex.match(t)
        m2 = file_regex2.match(t)

        lonlat = None
        if m:
            date_time = date_time_regex.search(m.group()).group()
            times.append(smap2np_date(date_time))
        elif m2:
            m = m2
            date = date_regex.search(m.group()).group()
            times.append(smap2np_date(date))
        if m:
            sources.append(m.group())
            lonlat = latlon_regex.search(m.group())
        if lonlat:
            lonlat = lonlat.group()
            latlons.append(
                (
                    float(lonlat[4:6]) * (1 - 2 * (lonlat[6] == "S")),
                    float(lonlat[:3]) * (1 - 2 * (lonlat[3] == "W")),
                )
            )

    times = np.atleast_1d(np.array(times).squeeze())
    latlons = np.array(latlons)
    sources = np.array(sources)
    I = np.argsort(times)
    if latlons.shape[0] == times.size:
        return times[I], latlons[I], sources[I]
    return times[I], None, sources[I]
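# Worked sketch, not part of the original source, of the lat/lon token decode in
# the loop above: a 7-character token laid out as DDD[E|W]DD[N|S]. The token
# "085W36N" is invented for the demo.
def _example_decode_latlon_token(lonlat="085W36N"):
    lat = float(lonlat[4:6]) * (1 - 2 * (lonlat[6] == "S"))
    lon = float(lonlat[:3]) * (1 - 2 * (lonlat[3] == "W"))
    return lat, lon  # -> (36.0, -85.0)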
def calibration_data(self):
    cd = _get_from_url(self.station_calibration_url).json()
    cd["items"] = [_convert_str_to_vals(i) for i in cd["items"]]
    return cd