def read_component_sitemap(self, sitemapindex_uri, sitemap_uri, sitemap, sitemapindex_is_file):
        """Read a component sitemap of a Resource List with index.

        Each component must be a sitemap with the 
        """
        if (sitemapindex_is_file):
            if (not self.is_file_uri(sitemap_uri)):
                # Attempt to map URI to local file
                remote_uri = sitemap_uri
                sitemap_uri = self.mapper.src_to_dst(remote_uri)
                self.logger.info("Mapped %s to local file %s" % (remote_uri, sitemap_uri))
            else:
                # The individual sitemaps should be at a URL (scheme/server/path)
                # that the sitemapindex URL can speak authoritatively about
                if (self.check_url_authority and
                    not UrlAuthority(sitemapindex_uri).has_authority_over(sitemap_uri)):
                    raise ListBaseIndexError("The sitemapindex (%s) refers to sitemap at a location it does not have authority over (%s)" % (sitemapindex_uri,sitemap_uri))
        try:
            fh = URLopener().open(sitemap_uri)
            self.num_files += 1
        except IOError as e:
            raise ListBaseIndexError("Failed to load sitemap from %s listed in sitemap index %s (%s)" % (sitemap_uri,sitemapindex_uri,str(e)))
        # Get the Content-Length if we can (works fine for local files)
        try:
            self.content_length = int(fh.info()['Content-Length'])
            self.bytes_read += self.content_length
        except KeyError:
            # If we don't get a length then c'est la vie
            pass
        self.logger.info( "Reading sitemap from %s (%d bytes)" % (sitemap_uri,self.content_length) )
        component = sitemap.parse_xml( fh=fh, sitemapindex=False )
        # Copy resources into self, check any metadata
        for r in component:
            self.resources.add(r)
Example #2
0
    def read(self, uri=None, resources=None, index_only=False):
        """Read sitemap from a URI including handling sitemapindexes.

        If index_only is True then individual sitemaps references in a sitemapindex
        will not be read. This will result in no resources being returned and is
        useful only to read the metadata and links listed in the sitemapindex.

        Includes the subtlety that if the input URI is a local file and is a
        sitemapindex which contains URIs for the individual sitemaps, then these
        are mapped to the filesystem also.
        """
        try:
            fh = URLopener().open(uri)
            self.num_files += 1
        except IOError as e:
            raise IOError(
                "Failed to load sitemap/sitemapindex from %s (%s)" %
                (uri, str(e)))
        # Get the Content-Length if we can (works fine for local files)
        try:
            self.content_length = int(fh.info()['Content-Length'])
            self.bytes_read += self.content_length
            self.logger.debug(
                "Read %d bytes from %s" %
                (self.content_length, uri))
        except KeyError:
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
            pass
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        s = self.new_sitemap()
        s.parse_xml(fh=fh, resources=self, capability=self.capability_name)
        # what did we read? sitemap or sitemapindex?
        if (s.parsed_index):
            # sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError(
                    "Got sitemapindex from %s but support for sitemapindex disabled" %
                    (uri))
            self.logger.info(
                "Parsed as sitemapindex, %d sitemaps" %
                (len(
                    self.resources)))
            sitemapindex_is_file = self.is_file_uri(uri)
            if (index_only):
                # don't read the component sitemaps
                self.sitemapindex = True
                return
            # now loop over all entries to read each sitemap and add to
            # resources
            sitemaps = self.resources
            self.resources = self.resources_class()
            self.logger.info("Now reading %d sitemaps" % len(sitemaps.uris()))
            for sitemap_uri in sorted(sitemaps.uris()):
                self.read_component_sitemap(
                    uri, sitemap_uri, s, sitemapindex_is_file)
        else:
            # sitemap
            self.logger.info("Parsed as sitemap, %d resources" %
                             (len(self.resources)))
Example #3
0
    def read(self, uri=None, resources=None, index_only=False):
        """Read sitemap from a URI including handling sitemapindexes.

        If index_only is True then individual sitemaps references in a sitemapindex
        will not be read. This will result in no resources being returned and is
        useful only to read the metadata and links listed in the sitemapindex.

        Includes the subtlety that if the input URI is a local file and is a
        sitemapindex which contains URIs for the individual sitemaps, then these
        are mapped to the filesystem also.
        """
        try:
            fh = URLopener().open(uri)
            self.num_files += 1
        except IOError as e:
            raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" %
                          (uri, str(e)))
        # Get the Content-Length if we can (works fine for local files)
        try:
            self.content_length = int(fh.info()['Content-Length'])
            self.bytes_read += self.content_length
            self.logger.debug("Read %d bytes from %s" %
                              (self.content_length, uri))
        except KeyError:
            # If we don't get a length then c'est la vie
            self.logger.debug("Read ????? bytes from %s" % (uri))
            pass
        self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
        s = self.new_sitemap()
        s.parse_xml(fh=fh, resources=self, capability=self.capability_name)
        # what did we read? sitemap or sitemapindex?
        if (s.parsed_index):
            # sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError(
                    "Got sitemapindex from %s but support for sitemapindex disabled"
                    % (uri))
            self.logger.info("Parsed as sitemapindex, %d sitemaps" %
                             (len(self.resources)))
            sitemapindex_is_file = self.is_file_uri(uri)
            if (index_only):
                # don't read the component sitemaps
                self.sitemapindex = True
                return
            # now loop over all entries to read each sitemap and add to
            # resources
            sitemaps = self.resources
            self.resources = self.resources_class()
            self.logger.info("Now reading %d sitemaps" % len(sitemaps.uris()))
            for sitemap_uri in sorted(sitemaps.uris()):
                self.read_component_sitemap(uri, sitemap_uri, s,
                                            sitemapindex_is_file)
        else:
            # sitemap
            self.logger.info("Parsed as sitemap, %d resources" %
                             (len(self.resources)))