Example #1
    def __init__(self, dataset_url):
        # Get an etree object
        r = requests.get(dataset_url)
        tree = etree.XML(r.text.encode('utf-8'))

        dataset = tree.find("{%s}dataset" % INV_NS)
        self.id = dataset.get("ID")
        self.name = dataset.get("name")
        self.metadata = dataset.find("{%s}metadata" % INV_NS)
        self.catalog_url = dataset_url.split("?")[0]
        service_tag = dataset.find("{%s}serviceName" % INV_NS)
        if service_tag is None:
            service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
        service_name = service_tag.text

        self.services = []

        for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
            if service.get("serviceType") == "Compound":
                for s in service.findall("{%s}service" % INV_NS):
                    url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                    if s.get("suffix") is not None:
                        url += s.get("suffix")
                    self.services.append({'name': s.get('name'), 'service': s.get('serviceType'), 'url': url})
            else:
                url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix","")
                self.services.append({'name': service.get('name'), 'service': service.get('serviceType'), 'url': url})
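A minimal usage sketch for the constructor above, assuming it belongs to a LeafDataset-style class (the class name, and the INV_NS and construct_url module globals it relies on, are not shown in this example):

# Hypothetical usage; "LeafDataset" is an assumed class name.
d = LeafDataset("http://thredds.example.com/thredds/catalog.xml?dataset=my/dataset/id")
print(d.id, d.name)
for svc in d.services:
    # Each entry holds the service name, service type, and resolved access URL.
    print(svc['name'], svc['service'], svc['url'])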
Example #2
    def __init__(self, dataset_url):

        self.services    = []
        self.id          = None
        self.name        = None
        self.metadata    = None
        self.catalog_url = None
        self.data_size   = None

        # Get an etree object
        r = requests.get(dataset_url)
        try:
            tree = etree.XML(r.text.encode('utf-8'))
        except etree.XMLSyntaxError:
            logger.error("Error procesing %s, invalid XML" % dataset_url)
        else:
            dataset = tree.find("{%s}dataset" % INV_NS)
            self.id = dataset.get("ID")
            self.name = dataset.get("name")
            self.metadata = dataset.find("{%s}metadata" % INV_NS)
            self.catalog_url = dataset_url.split("?")[0]

            # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
            data_size = dataset.find("{%s}dataSize" % INV_NS)
            if data_size is not None:
                self.data_size = float(data_size.text)
                data_units = data_size.get('units')
                # Convert to MB
                if data_units == "bytes":
                    self.data_size *= 1e-6
                elif data_units == "Kbytes":
                    self.data_size *= 0.001
                elif data_units == "Gbytes":
                    self.data_size /= 0.001
                elif data_units == "Tbytes":
                    self.data_size /= 1e-6

            # Services
            service_tag = dataset.find("{%s}serviceName" % INV_NS)
            if service_tag is None:
                service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
            service_name = service_tag.text

            for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
                if service.get("serviceType") == "Compound":
                    for s in service.findall("{%s}service" % INV_NS):
                        url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                        if s.get("suffix") is not None:
                            url += s.get("suffix")
                        # ISO like services need additional parameters
                        if s.get('name') in ["iso", "ncml", "uddc"]:
                            url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                        self.services.append({'name': s.get('name'), 'service': s.get('serviceType'), 'url': url})
                else:
                    url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                    # ISO like services need additional parameters
                    if service.get('name') in ["iso", "ncml", "uddc"]:
                        url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                    self.services.append({'name': service.get('name'), 'service': service.get('serviceType'), 'url': url})
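The dataSize block above converts every unit the THREDDS InvCatalog spec allows into megabytes. The same conversion, factored into a standalone helper (an illustrative sketch; Mbytes needs no scaling, which is why the in-line version has no branch for it):

# Equivalent MB conversion factored into one function (illustrative sketch).
_TO_MB = {"bytes": 1e-6, "Kbytes": 1e-3, "Mbytes": 1.0, "Gbytes": 1e3, "Tbytes": 1e6}

def data_size_in_mb(value, units):
    # Returns None for units outside the InvCatalog dataSize vocabulary.
    factor = _TO_MB.get(units)
    return value * factor if factor is not None else None

print(data_size_in_mb(2048.0, "Kbytes"))  # -> 2.048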
Example #3
    def __init__(self, dataset_url):

        self.services    = []
        self.id          = None
        self.name        = None
        self.metadata    = None
        self.catalog_url = None

        # Get an etree object
        r = requests.get(dataset_url)
        try:
            tree = etree.XML(r.text.encode('utf-8'))
        except etree.XMLSyntaxError:
            logger.error("Error procesing %s, invalid XML" % dataset_url)
        else:
            dataset = tree.find("{%s}dataset" % INV_NS)
            self.id = dataset.get("ID")
            self.name = dataset.get("name")
            self.metadata = dataset.find("{%s}metadata" % INV_NS)
            self.catalog_url = dataset_url.split("?")[0]
            service_tag = dataset.find("{%s}serviceName" % INV_NS)
            if service_tag is None:
                service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
            service_name = service_tag.text

            for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
                if service.get("serviceType") == "Compound":
                    for s in service.findall("{%s}service" % INV_NS):
                        url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                        if s.get("suffix") is not None:
                            url += s.get("suffix")
                        # ISO like services need additional parameters
                        if s.get('name') in ["iso", "ncml", "uddc"]:
                            url += "?dataset=%s&catalog=%s" % (self.id, urllib.quote_plus(self.catalog_url))
                        self.services.append({'name': s.get('name'), 'service': s.get('serviceType'), 'url': url})
                else:
                    url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                    # ISO like services need additional parameters
                    if service.get('name') in ["iso", "ncml", "uddc"]:
                        url += "?dataset=%s&catalog=%s" % (self.id, urllib.quote_plus(self.catalog_url))
                    self.services.append({'name': service.get('name'), 'service': service.get('serviceType'), 'url': url})
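For the iso, ncml, and uddc endpoints the TDS expects the dataset ID and the URL-encoded catalog URL as query parameters, which is what the branches above append. A sketch of the resulting URL shape, with made-up values (this example's code uses Python 2's urllib.quote_plus; the import below is the Python 3 equivalent):

# Illustrative only: the shape of the query string added for ISO-like services.
from urllib.parse import quote_plus

dataset_id = "my/dataset/id"
catalog_url = "http://thredds.example.com/thredds/catalog.xml"
url = "http://thredds.example.com/thredds/iso/" + dataset_id
url += "?dataset=%s&catalog=%s" % (dataset_id, quote_plus(catalog_url))
print(url)
# http://thredds.example.com/thredds/iso/my/dataset/id?dataset=my/dataset/id&catalog=http%3A%2F%2Fthredds.example.com%2Fthredds%2Fcatalog.xml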
Example #4
    def _run(self, url):
        if url in self.visited:
            logger.debug("Skipping %s (already crawled)" % url)
            return
        self.visited.append(url)

        logger.info("Crawling: %s" % url)

        u = urlparse.urlsplit(url)
        name, ext = os.path.splitext(u.path)
        if ext == ".html":
            u = urlparse.urlsplit(url.replace(".html", ".xml"))
        url = u.geturl()
        # Get an etree object
        try:
            r = requests.get(url)
            tree = etree.XML(r.text.encode('utf-8'))
        except Exception:
            logger.error("Skipping %s (error getting or parsing XML)" % url)
            return

        # Crawl the catalogRefs:
        for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
            # Check skips
            title = ref.get("{%s}title" % XLINK_NS)
            if not any([x.match(title) for x in self.skip]):
                for ds in self._run(url=construct_url(url, ref.get("{%s}href" % XLINK_NS))):
                    yield ds
            else:
                logger.info("Skipping catalogRef based on 'skips'.  Title: %s" % title)
                continue

        # Get the leaf datasets
        ds = []
        for leaf in tree.findall('.//{%s}dataset[@urlPath]' % INV_NS):
            # Subset by the skips
            name = leaf.get("name")
            if any([x.match(name) for x in self.skip]):
                logger.info("Skipping dataset based on 'skips'.  Name: %s" % name)
                continue

            # Subset by the Selects defined
            gid = leaf.get('ID')
            if self.select is not None:
                if gid is not None and any([x.match(gid) for x in self.select]):
                    logger.debug("Processing %s" % gid)
                    yield "%s?dataset=%s" % (url, gid)
                else:
                    logger.info("Ignoring dataset based on 'selects'.  ID: %s" % gid)
                    continue
            else:
                logger.debug("Processing %s" % gid)
                yield "%s?dataset=%s" % (url, gid)
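A driver sketch for the recursive generator above, assuming it is a method of a Crawl-style class whose constructor populates the visited, skip, and select attributes the method reads (none of which are shown here):

import re

# Hypothetical setup; the class name "Crawl" and its constructor are assumed.
crawler = Crawl("http://thredds.example.com/thredds/catalog.xml")
crawler.visited = []
crawler.skip = [re.compile(r'.*latest.*')]      # catalogRef titles / dataset names to skip
crawler.select = [re.compile(r'.*forecast.*')]  # dataset IDs to keep, or None for all

for dataset_url in crawler._run("http://thredds.example.com/thredds/catalog.xml"):
    # Each yield is "<catalog url>?dataset=<ID>", ready for the dataset
    # constructors shown in the earlier examples.
    print(dataset_url)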
Example #5
    def _compile_references(self, url, tree):
        '''
        Returns a list of catalog reference URLs for the current catalog
        :param str url: URL for the current catalog
        :param lxml.etree.Element tree: Current XML Tree
        '''
        references = []
        for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
            # Check skips
            title = ref.get("{%s}title" % XLINK_NS)
            if any([x.match(title) for x in self.skip]):
                logger.info("Skipping catalogRef based on 'skips'.  Title: %s" % title)
                continue
            references.append(construct_url(url, ref.get("{%s}href" % XLINK_NS)))
        return references
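Both _compile_references variants (this one and the reflowed copy in Example #6) lean on module-level namespace constants and a construct_url helper that the snippets do not show. Plausible definitions, assuming the code targets the THREDDS InvCatalog 1.0 schema:

from urllib.parse import urljoin

# Assumed values: the standard InvCatalog 1.0 and XLink namespace URIs.
INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
XLINK_NS = "http://www.w3.org/1999/xlink"

def construct_url(current_url, relative):
    # Minimal stand-in: resolve a catalogRef href (often relative) against
    # the URL of the catalog that referenced it. The real helper may also
    # normalize paths or swap .html for .xml.
    return urljoin(current_url, relative)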
Example #6
    def _compile_references(self, url, tree):
        '''
        Returns a list of catalog reference URLs for the current catalog
        :param str url: URL for the current catalog
        :param lxml.etree.Element tree: Current XML Tree
        '''
        references = []
        for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
            # Check skips
            title = ref.get("{%s}title" % XLINK_NS)
            if any([x.match(title) for x in self.skip]):
                logger.info(
                    "Skipping catalogRef based on 'skips'.  Title: %s" % title)
                continue
            references.append(
                construct_url(url, ref.get("{%s}href" % XLINK_NS)))
        return references
Example #7
    def __init__(self, dataset_url, auth=None):

        self.services    = []
        self.id          = None
        self.name        = None
        self.catalog_url = None
        self.data_size   = None

        # Get an etree object
        r = requests.get(dataset_url, auth=auth, verify=False)
        try:
            tree = etree.XML(r.text.encode('utf-8'))
        except etree.XMLSyntaxError:
            logger.error("Error procesing %s, invalid XML" % dataset_url)
        else:
            try:
                dataset = tree.find("{%s}dataset" % INV_NS)
                self.id = dataset.get("ID")
                self.name = dataset.get("name")
                metadata = dataset.find("{%s}metadata" % INV_NS)
                self.catalog_url = dataset_url.split("?")[0]

                # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
                data_size = dataset.find("{%s}dataSize" % INV_NS)
                if data_size is not None:
                    self.data_size = float(data_size.text)
                    data_units = data_size.get('units')
                    # Convert to MB
                    if data_units == "bytes":
                        self.data_size *= 1e-6
                    elif data_units == "Kbytes":
                        self.data_size *= 0.001
                    elif data_units == "Gbytes":
                        self.data_size /= 0.001
                    elif data_units == "Tbytes":
                        self.data_size /= 1e-6

                # Services
                service_tag = dataset.find("{%s}serviceName" % INV_NS)
                if service_tag is None:
                    if metadata is not None:
                        service_tag = metadata.find("{%s}serviceName" % INV_NS)

                if service_tag is None:
                    # Use services found in the file. FMRC aggs do this.
                    services = tree.findall(".//{%s}service[@serviceType='Compound']" % INV_NS)
                else:
                    # Use specific named services
                    services = tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_tag.text))

                for service in services:
                    if service.get("serviceType") == "Compound":
                        for s in service.findall("{%s}service" % INV_NS):
                            url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                            if s.get("suffix") is not None:
                                url += s.get("suffix")
                            # ISO like services need additional parameters
                            if s.get('name') in ["iso", "ncml", "uddc"]:
                                url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                            self.services.append({'name': s.get('name'), 'service': s.get('serviceType'), 'url': url})
                    else:
                        url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                        # ISO like services need additional parameters
                        if service.get('name') in ["iso", "ncml", "uddc"]:
                            url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                        self.services.append({'name': service.get('name'), 'service': service.get('serviceType'), 'url': url})

                # Element objects are not picklable, so store the metadata as a string
                try:
                    self.metadata = etree.tostring(metadata)
                except TypeError:
                    self.metadata = None
            except Exception as e:
                logger.exception('Could not process {}. {}.'.format(dataset_url, e))
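This last variant threads an optional requests auth object through to the catalog fetch (note that verify=False also disables TLS certificate checks). A usage sketch, again with an assumed LeafDataset-style class name and made-up credentials:

from requests.auth import HTTPBasicAuth

# Hypothetical usage; class name and credentials are illustrative.
d = LeafDataset("https://thredds.example.com/thredds/catalog.xml?dataset=my/id",
                auth=HTTPBasicAuth("user", "password"))
print(d.data_size)   # size in MB, or None if the catalog has no <dataSize>
print(d.metadata)    # serialized metadata element, or None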