Example #1
def _get_all_granule_pages(session, url, entry_map, max_paging_depth=1000000):
    """Helper function for searching through all pages for a collection.

    Parameters
    -----------
    session: :class:`requests.Session`, optional
        An authenticated Earthdata login session
    url: str
        URL to website
    entry_map: function
        Function for mapping the entries to a desired format
    max_paging_depth: int, optional
        Maximum total number of granule entries to page through across all pages (default 1,000,000)
    """
    page_size = int([
        q for q in url.split("?")[1].split("&") if "page_size" in q
    ][0].split("=")[1])
    max_pages = int(max_paging_depth / page_size)

    pydict = _get_from_url(url, session).json()
    entries = list(map(entry_map, pydict["feed"]["entry"]))

    for i in range(1, max_pages):
        page_url = url + "&page_num=%d" % (i + 1)
        page_entries = _get_from_url(page_url, session).json()["feed"]["entry"]
        if not page_entries:
            break
        entries.extend(list(map(entry_map, page_entries)))
    return entries
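
The helper above assumes the URL already carries a page_size query parameter. A minimal sketch of the parsing and paging it performs, using a made-up CMR-style URL (illustration only, not part of the library):

# Hypothetical URL; any URL with a page_size query parameter would work the same way.
url = "https://cmr.earthdata.nasa.gov/search/granules.json?short_name=SPL4SMGP&page_size=2000"

# Same extraction as in _get_all_granule_pages: take the query string,
# find the page_size parameter, and read its value.
page_size = int([q for q in url.split("?")[1].split("&") if "page_size" in q][0].split("=")[1])
print(page_size)            # 2000
print(url + "&page_num=2")  # the URL requested for the second page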
Example #2
    def from_url(cls, url):
        """
        Create podpac Node from a WMS/WCS request.

        Arguments
        ---------
        url : str, dict
            The raw WMS/WCS request url, or a dictionary of query parameters

        Returns
        -------
        :class:`Node`
            A full Node with sub-nodes based on the definition of the node from the URL

        Notes
        -------
        The request can specify the PODPAC node through four different mechanisms:

        * Direct node name: PODPAC will look for an appropriate node in podpac.datalib
        * JSON definition passed using the 'PARAMS' query string: requires setting the special LAYER/COVERAGE value
          "%PARAMS%"
        * By pointing at the JSON definition retrievable with an HTTP GET request:
          e.g. by setting LAYER/COVERAGE value to https://my-site.org/pipeline_definition.json
        * By pointing at the JSON definition retrievable from an S3 bucket that the user has access to:
          e.g. by setting LAYER/COVERAGE value to s3://my-bucket-name/pipeline_definition.json
        """
        params = _get_query_params_from_url(url)

        if _get_param(params, "SERVICE") == "WMS":
            layer = _get_param(params, "LAYERS")
        elif _get_param(params, "SERVICE") == "WCS":
            layer = _get_param(params, "COVERAGE")
        else:
            raise ValueError("Unsupported SERVICE in request; expected WMS or WCS.")

        d = None
        if layer.startswith("https://"):
            d = _get_from_url(layer).json()
        elif layer.startswith("s3://"):
            parts = layer.split("/")
            bucket = parts[2]
            key = "/".join(parts[3:])
            s3 = S3CacheStore(s3_bucket=bucket)
            s = s3._load(key)
        elif layer == "%PARAMS%":
            s = _get_param(params, "PARAMS")
        else:
            p = _get_param(params, "PARAMS")
            if p is None:
                p = "{}"
            d = OrderedDict({
                layer.replace(".", "-"): {
                    "node": layer,
                    "attrs": json.loads(p)
                }
            })

        if d is None:
            d = json.loads(s, object_pairs_hook=OrderedDict)

        return cls.from_definition(d)
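
For the direct-node-name mechanism, from_url builds a one-entry pipeline definition from the LAYERS/COVERAGE value. A minimal sketch of that branch in isolation; the node name and PARAMS value below are made up for illustration:

import json
from collections import OrderedDict

layer = "podpac.data.Rasterio"   # hypothetical node name taken from the LAYERS parameter
p = '{"source": "image.tif"}'    # hypothetical value of the PARAMS query parameter

d = OrderedDict({layer.replace(".", "-"): {"node": layer, "attrs": json.loads(p)}})
# d == {"podpac-data-Rasterio": {"node": "podpac.data.Rasterio",
#                                "attrs": {"source": "image.tif"}}}
# from_url() then hands a definition like this to cls.from_definition(d).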
Example #3
    def calibration_data(self):
        cd = _get_from_url(self.station_calibration_url)
        if cd is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(
                    self.station_calibration_url))
        cd = cd.json()
        cd["items"] = [_convert_str_to_vals(i) for i in cd["items"]]
        return cd
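
The examples in this listing all treat _get_from_url as a thin wrapper that returns a requests.Response on success and None on failure. A rough sketch of that contract, for orientation only; this is an assumption, not the actual podpac implementation:

import requests

def _get_from_url(url, session=None):
    # Sketch only: GET the URL with the given session (or plain requests)
    # and return the Response, or None if the request fails.
    session = session or requests
    try:
        r = session.get(url)
        r.raise_for_status()
    except requests.RequestException:
        return None
    return r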
Example #4
    def _property_source_default(self):
        v = _infer_SMAP_product_version("SPL4SMLM", SMAP_BASE_URL(),
                                        self.session)
        url = SMAP_BASE_URL() + "/SPL4SMLM.%03d/2015.03.31/" % (v)
        r = _get_from_url(url, session=self.session)
        if not r:
            return "None"
        n = self.file_url_re.search(r.text).group()
        return url + n
Example #5
def search_granule_json(session=None, entry_map=None, **kwargs):
    """Search for specific files from NASA CMR for a particular collection

    Parameters
    -----------
    session: :class:`requests.Session`, optional
        An authenticated Earthdata login session
    entry_map: function
        A function applied to each individual entry. Could be used to filter out certain data in an entry
    **kwargs: dict
        Additional query string parameters.
        At minimum, one of provider, provider_id, concept_id, collection_concept_id, short_name, version, or
        entry_title needs to be provided for a granule search.

    Returns
    ---------
    list
        Entries for each granule in the collection based on the search terms
    """
    base_url = CMR_URL + "granules.json?"

    if not np.any([
            m in kwargs for m in [
                "provider",
                "provider_id",
                "concept_id",
                "collection_concept_id",
                "short_name",
                "version",
                "entry_title",
            ]
    ]):
        raise ValueError(
            "Need to provide either"
            " provider, provider_id, concept_id, collection_concept_id, short_name, version or entry_title"
            " for granule search.")

    if "page_size" not in kwargs:
        kwargs["page_size"] = "2000"

    if entry_map is None:
        entry_map = lambda x: x

    query_string = "&".join([k + "=" + str(v) for k, v in kwargs.items()])

    if session is None:
        session = requests

    url = base_url + query_string
    if "page_num" not in kwargs:
        entries = _get_all_granule_pages(session, url, entry_map)
    else:
        pydict = _get_from_url(url, session).json()
        entries = list(map(entry_map, pydict["feed"]["entry"]))

    return entries
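
A hedged usage sketch of search_granule_json; the short name and the field pulled out by entry_map are illustrative assumptions about the CMR response, not guaranteed by the library:

# Reduce each CMR granule entry to its title (assumed to be present in the granule JSON).
titles = search_granule_json(
    short_name="SPL4SMGP",                       # hypothetical product short name
    entry_map=lambda entry: entry.get("title"),
)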
Example #6
    def raw_data(self):
        _logger.info("Downloading station data from {}".format(
            self.station_data_url))

        r = _get_from_url(self.station_data_url)
        if r is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(
                    self.station_data_url))
        return r.text
Example #7
    def _stations_data_raw(self):
        url = self.url + self.stations_url
        r = _get_from_url(url)
        t = r.text

        # Fix the JSON
        t_f = re.sub(r':\s?",', ': "",', t)  # missing opening quote in empty string values
        if t_f[-5:] == ",\n]}\n":  # errant comma
            t_f = t_f[:-5] + "\n]}\n"

        return t_f
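
A small illustration of what the regex repair does to the malformed station JSON; the input fragment is fabricated to show the pattern being fixed:

import re

broken = '"name": ",'                      # fabricated fragment: the value lost its opening quote
fixed = re.sub(r':\s?",', ': "",', broken)
print(fixed)                               # "name": "",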
Example #8
    def site_properties(self):
        r = _get_from_url(self.station_properties_url)
        soup = bs4.BeautifulSoup(r.text, "lxml")
        regex = re.compile("Soil Organic Carbon")
        loc = soup.body.findAll(text=regex)[0].parent.parent
        label, value = loc.findAll("div")
        labels = [l.strip() for l in label.children if "br" not in str(l)]
        values = [l.strip() for l in value.children if "br" not in str(l) and l.strip() != ""]

        properties = {k: v for k, v in zip(labels, values)}

        return _convert_str_to_vals(properties)
Example #9
    def _stations_data_raw(self):
        url = self.url + self.stations_url
        r = _get_from_url(url)
        if r is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(
                    url))

        t = r.text

        # Fix the JSON
        t_f = re.sub(r':\s?",', ': "",', t)  # missing opening quote in empty string values
        if t_f[-5:] == ",\n]}\n":  # errant comma
            t_f = t_f[:-5] + "\n]}\n"

        return t_f
Example #10
def get_collection_entries(session=None,
                           short_name=None,
                           keyword=None,
                           **kwargs):
    """Uses NASA CMR to retrieve metadata about a collection

    Parameters
    -----------
    session: :class:`requests.Session`, optional
        An authenticated Earthdata login session
    short_name: str, optional
        The short name of the dataset
    keyword: str, optional
        Any keyword search parameters
    **kwargs: str, optional
        Any additional query parameters

    Returns
    ---------
    list:
        A list of collection metadata dictionaries

    Examples
    ---------
    >>> # This makes the following request:
    >>> # https://cmr.earthdata.nasa.gov/search/collections.json?short_name=SPL2SMAP_S
    >>> entries = get_collection_entries(short_name='SPL2SMAP_S')
    >>> entries[0]['id']
    'C1522341104-NSIDC_ECS'
    """

    base_url = CMR_URL + "collections.json?"
    if short_name is not None:
        kwargs["short_name"] = short_name
    if keyword is not None:
        kwargs["keyword"] = keyword

    query_string = "&".join([k + "=" + v for k, v in kwargs.items()])

    # use generic requests session if `session` is not defined
    if session is None:
        session = requests

    pydict = _get_from_url(base_url + query_string, session).json()

    entries = pydict["feed"]["entry"]

    return entries
Example #11
    def available_dates(self):
        """Available dates in SMAP date format, sorted."""
        url = "/".join(
            [self.base_url,
             "%s.%03d" % (self.product, self.version)])
        r = _get_from_url(url, self.session)
        if r is None:
            _logger.warning(
                "Could not contact {} to retrieve source coordinates".format(
                    url))
            return []
        soup = BeautifulSoup(r.text, "lxml")
        matches = [
            self.date_url_re.match(a.get_text()) for a in soup.find_all("a")
        ]
        dates = [m.group() for m in matches if m]
        return dates
Example #12
    def site_properties(self):
        r = _get_from_url(self.station_properties_url)
        if r is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(
                    self.station_properties_url))
        soup = bs4.BeautifulSoup(r.text, "lxml")
        regex = re.compile("Soil Organic Carbon")
        loc = soup.body.findAll(text=regex)[0].parent.parent
        label, value = loc.findAll("div")
        labels = [l.strip() for l in label.children if "br" not in str(l)]
        values = [
            l.strip() for l in value.children
            if "br" not in str(l) and l.strip() != ""
        ]

        properties = {k: v for k, v in zip(labels, values)}

        return _convert_str_to_vals(properties)
Example #13
def _infer_SMAP_product_version(product, base_url, session):
    """Helper function to automatically infer the version number of SMAP
    products in case the user did not specify a version, or the version has changed

    Parameters
    ------------
    product: str
        Name of the SMAP product (e.g. one of SMAP_PRODUCT_DICT.keys())
    base_url: str
        URL to base SMAP product page
    session: :class:`requests.Session`
        Authenticated EDS session. Generally returned from :class:`SMAPSessionMixin`.

    Returns
    ------------
    int
        The inferred product version, or the default version from SMAP_PRODUCT_MAP if the page cannot be reached.
    """

    r = _get_from_url(base_url, session=session)
    if r:
        m = re.search(product, r.text)
        return int(r.text[m.end() + 1:m.end() + 4])
    return int(
        SMAP_PRODUCT_MAP.sel(product=product, attr="default_version").item())
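
The version number is sliced out of the product listing page immediately after the product name. A toy illustration of that slice; the HTML fragment is invented:

import re

text = '<a href="SPL4SMLM.004/">SPL4SMLM.004/</a>'  # invented fragment of the listing page
m = re.search("SPL4SMLM", text)
version = int(text[m.end() + 1:m.end() + 4])        # the three characters after "SPL4SMLM." -> "004"
print(version)                                      # 4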
Example #14
    def available_coords_sources(self):
        """Read NSIDC site for available coordinate sources

        Returns
        -------
        np.ndarray
            Available times of sources in the folder
        np.ndarray
            Available lat lon coordinates of sources in the folder, None if empty
        np.ndarray
            The URLs of the sources

        Notes
        -----
        If the NSIDC website cannot be accessed, a warning is logged and empty
        arrays are returned instead of raising.
        """
        r = _get_from_url(self.folder_url, self.session)
        if r is None:
            _logger.warning(
                "Could not contact {} to retrieve source coordinates".format(
                    self.folder_url))
            return np.array([]), None, np.array([])
        soup = BeautifulSoup(r.text, "lxml")
        a = soup.find_all("a")
        file_regex = self.file_url_re
        file_regex2 = self.file_url_re2
        date_time_regex = self.date_time_url_re
        date_regex = self.date_url_re
        latlon_regex = self.latlon_url_re
        times = []
        latlons = []
        sources = []
        for aa in a:
            t = aa.get_text().strip("\n")
            if "h5.iso.xml" in t:
                continue
            m = file_regex.match(t)
            m2 = file_regex2.match(t)

            lonlat = None
            if m:
                date_time = date_time_regex.search(m.group()).group()
                times.append(smap2np_date(date_time))

            elif m2:
                m = m2
                date = date_regex.search(m.group()).group()
                times.append(smap2np_date(date))
            if m:
                sources.append(m.group())
                lonlat = latlon_regex.search(m.group())
            if lonlat:
                lonlat = lonlat.group()
                latlons.append((
                    float(lonlat[4:6]) * (1 - 2 * (lonlat[6] == "S")),
                    float(lonlat[:3]) * (1 - 2 * (lonlat[3] == "W")),
                ))

        times = np.atleast_1d(np.array(times).squeeze())
        latlons = np.array(latlons)
        sources = np.array(sources)
        I = np.argsort(times)
        if latlons.shape[0] == times.size:
            return times[I], latlons[I], sources[I]
        return times[I], None, sources[I]
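
The latitude/longitude block in each file name is read by position, with "S" and "W" flipping the sign. A toy illustration with a fabricated token of the form the slices above expect (three longitude digits, hemisphere letter, two latitude digits, hemisphere letter):

lonlat = "085W32N"                                        # fabricated lat/lon token
lat = float(lonlat[4:6]) * (1 - 2 * (lonlat[6] == "S"))   # 32.0  ("N" keeps the sign)
lon = float(lonlat[:3]) * (1 - 2 * (lonlat[3] == "W"))    # -85.0 ("W" flips the sign)
print(lat, lon)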
Example #15
    def calibration_data(self):
        cd = _get_from_url(self.station_calibration_url).json()
        cd["items"] = [_convert_str_to_vals(i) for i in cd["items"]]
        return cd