def write(self, basename='/tmp/sitemap.xml', **kwargs):
        """Write one sitemap, or a sitemapindex plus a set of sitemaps, to disk

        self.resources is accessed only through an iterator, so it may be
        a generator: data is read as needed and the number of entries is
        known only at the end.

        basename is used as the name of the single sitemap file or of the
        sitemapindex for a set of sitemap files. Component sitemaps are
        named by inserting a five digit sequence number before a trailing
        ".xml" in basename, or by appending the number and ".xml" if
        basename does not end in ".xml".

        Any extra keyword arguments are passed to the Sitemap constructor.

        The first chunk is obtained from self.get_resources_chunk(). If
        that call reports that further resources remain then a set of
        sitemap files, with a sitemapindex, is written provided that
        self.allow_multifile is set true; otherwise ListBaseIndexError is
        raised.

        Every output file is opened in a with block so that the file
        handle is closed even if writing the XML raises an exception.
        """
        # Access resources through iterator only
        resources_iter = iter(self.resources)
        (chunk, pending) = self.get_resources_chunk(resources_iter)
        s = Sitemap(**kwargs)
        if (pending is not None):
            # Have more than self.max_sitemap_entries => sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
            # Work out how to name the sitemaps: put %05d before a
            # trailing ".xml" if there is one, else append number + ".xml"
            sitemap_suffix = '.xml'
            sitemap_prefix = basename
            if basename.endswith(sitemap_suffix):
                sitemap_prefix = basename[:-len(sitemap_suffix)]
            # Use iterator over all resources and count off sets of
            # max_sitemap_entries to go into each sitemap, store the
            # names of the sitemaps as we go
            sitemaps = ListBase()
            while (len(chunk) > 0):
                part_file = sitemap_prefix + ("%05d" % (len(sitemaps))) + sitemap_suffix
                self.logger.info("Writing sitemap %s..." % (part_file))
                with open(part_file, 'w') as fh:
                    s.resources_as_xml(chunk, fh=fh)
                # Record information about this sitemap for index. The
                # file must be closed first so that the mtime and md5
                # are taken from the complete file
                r = Resource(uri=self.mapper.dst_to_src(part_file),
                             path=part_file,
                             timestamp=os.stat(part_file).st_mtime,
                             md5=compute_md5_for_file(part_file))
                sitemaps.add(r)
                # Get next chunk, starting with the look-ahead item
                (chunk, pending) = self.get_resources_chunk(resources_iter, pending)
            self.logger.info("Wrote %d sitemaps" % (len(sitemaps)))
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemapindex %s..." % (basename))
                s.resources_as_xml(resources=sitemaps, sitemapindex=True, fh=fh)
            self.logger.info("Wrote sitemapindex %s" % (basename))
        else:
            # Everything fitted in the first chunk => single sitemap
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=fh)
            self.logger.info("Wrote sitemap %s" % (basename))
    def get_resources_chunk(self, resource_iter, first=None):
        """Return next chunk of resources from resource_iter, and next item

        resource_iter must be an iterator (not merely an iterable) because
        it is advanced here and is expected to be passed in again for the
        following chunk.

        If first parameter is specified then this is added to the chunk
        before anything is read from resource_iter.

        Returns a tuple (chunk, next_item). chunk is a new ListBase with
        capability_name and capability_md copied from self. It will
        contain self.max_sitemap_entries resources if the iterator returns
        that many. next_item is the next value read from the iterator, or
        None if the iterator is exhausted, providing an indication of
        whether more is available. Use this as first when asking for the
        following chunk.
        """
        chunk = ListBase()
        chunk.capability_name = self.capability_name
        chunk.capability_md = self.capability_md
        chunk.default_capability_and_modified()
        if (first is not None):
            chunk.add(first)
        # NOTE(review): the size test is made only after adding an item
        # from the iterator, so with first given and max_sitemap_entries
        # of 1 the chunk would hold two entries - confirm whether that
        # configuration needs to be supported
        for r in resource_iter:
            chunk.add(r)
            if (len(chunk) >= self.max_sitemap_entries):
                break
        # Look ahead one item to see whether there are more resources.
        # The built-in next() is used instead of resource_iter.next(),
        # which exists only on Python 2 iterators; the default of None
        # replaces the explicit StopIteration handling
        next_item = next(resource_iter, None)
        return (chunk, next_item)
    def get_resources_chunk(self, resource_iter, first=None):
        """Return next chunk of resources from resource_iter, and next item

        resource_iter must be an iterator (not merely an iterable) because
        it is advanced here and is expected to be passed in again for the
        following chunk.

        If first parameter is specified then this is added to the chunk
        before anything is read from resource_iter.

        Returns a tuple (chunk, next_item). chunk is a new ListBase built
        with a copy of self.md and of self.ln, and with capability_name
        taken from self. It will contain self.max_sitemap_entries
        resources if the iterator returns that many. next_item is the next
        value read from the iterator, or None if the iterator is
        exhausted, providing an indication of whether more is available.
        Use this as first when asking for the following chunk.
        """
        chunk = ListBase(md=self.md.copy(), ln=list(self.ln))
        chunk.capability_name = self.capability_name
        chunk.default_capability()
        if (first is not None):
            chunk.add(first)
        # NOTE(review): the size test is made only after adding an item
        # from the iterator, so with first given and max_sitemap_entries
        # of 1 the chunk would hold two entries - confirm whether that
        # configuration needs to be supported
        for r in resource_iter:
            chunk.add(r)
            if (len(chunk) >= self.max_sitemap_entries):
                break
        # Look ahead one item to see whether there are more resources.
        # The built-in next() is used instead of resource_iter.next(),
        # which exists only on Python 2 iterators; the default of None
        # replaces the explicit StopIteration handling
        next_item = next(resource_iter, None)
        return (chunk, next_item)
Exemple #4
0
 def as_xml_part(self, basename="/tmp/sitemap.xml", part_number=0):
     """Return a string of component sitemap part_number for a large list that is split

     basename is used to create "index" links to the sitemapindex; it is
     set as the index attribute of the part that is serialized.

     part_number counts from zero. Part n covers the entries from
     position n * self.max_sitemap_entries up to, but not including,
     the next multiple of self.max_sitemap_entries or the end of the
     list, whichever comes first.

     Raises ListBaseIndexError if self.requires_multifile() is false, or
     if part_number is so high that the part would start at or beyond
     the end of the list and so would contain no entries.

     Q - what timestamp should be used?
     """
     if (not self.requires_multifile()):
         raise ListBaseIndexError("Request for component sitemap for list with only %d entries when max_sitemap_entries is set to %s" % (len(self),str(self.max_sitemap_entries)))
     start = part_number * self.max_sitemap_entries
     # Entries are at positions 0 .. len(self)-1, so a part that would
     # start at len(self) is already past the end. Testing with >=
     # rather than > stops an empty sitemap being returned in that case
     if (start >= len(self)):
         raise ListBaseIndexError("Request for component sitemap with part_number too high, would start at entry %d yet the list has only %d entries" % (start,len(self)))
     # The last part may be short
     stop = min(start + self.max_sitemap_entries, len(self))
     part = ListBase(itertools.islice(self.resources, start, stop))
     part.capability_name = self.capability_name
     part.default_capability()
     part.index = basename
     s = self.new_sitemap()
     return s.resources_as_xml(part)
 def as_xml_part(self, basename="/tmp/sitemap.xml", part_number=0):
     """Return a string of component sitemap part_number for a large list that is split

     basename is used to create "index" links to the sitemapindex; it is
     set as the index attribute of the part that is serialized.

     part_number counts from zero. Part n covers the entries from
     position n * self.max_sitemap_entries up to, but not including,
     the next multiple of self.max_sitemap_entries or the end of the
     list, whichever comes first.

     Raises ListBaseIndexError if self.requires_multifile() is false, or
     if part_number is so high that the part would start at or beyond
     the end of the list and so would contain no entries.

     Q - what timestamp should be used?
     """
     if (not self.requires_multifile()):
         raise ListBaseIndexError(
             "Request for component sitemap for list with only %d entries when max_sitemap_entries is set to %s"
             % (len(self), str(self.max_sitemap_entries)))
     start = part_number * self.max_sitemap_entries
     # Entries are at positions 0 .. len(self)-1, so a part that would
     # start at len(self) is already past the end. Testing with >=
     # rather than > stops an empty sitemap being returned in that case
     if (start >= len(self)):
         raise ListBaseIndexError(
             "Request for component sitemap with part_number too high, would start at entry %d yet the list has only %d entries"
             % (start, len(self)))
     # The last part may be short
     stop = min(start + self.max_sitemap_entries, len(self))
     part = ListBase(itertools.islice(self.resources, start, stop))
     part.capability_name = self.capability_name
     part.default_capability()
     part.index = basename
     s = self.new_sitemap()
     return s.resources_as_xml(part)
    def as_xml_index(self, basename="/tmp/sitemap.xml"):
        """Return a string of the sitemapindex for a large list that is split

        Nothing is read from the component sitemaps themselves: the
        number of parts is taken from self.requires_multifile() and the
        URI of each part is generated by self.part_name(basename, n).

        Raises ListBaseIndexError if self.requires_multifile() gives a
        false value.

        Q - should there be a flag to select generation of each component
        sitemap in order to calculate the md5sum?

        Q - what timestamp should be used?
        """
        part_count = self.requires_multifile()
        if part_count:
            # Build a list with one entry per component sitemap
            sitemap_index = ListBase()
            sitemap_index.sitemapindex = True
            sitemap_index.capability_name = self.capability_name
            sitemap_index.default_capability()
            for part_number in range(part_count):
                sitemap_index.add(
                    Resource(uri=self.part_name(basename, part_number)))
            return sitemap_index.as_xml()
        # A false part count means that no index is called for
        message = "Request for sitemapindex for list with only %d entries when max_sitemap_entries is set to %s"
        raise ListBaseIndexError(
            message % (len(self), str(self.max_sitemap_entries)))
Exemple #7
0
    def as_xml_index(self, basename="/tmp/sitemap.xml"):
        """Return a string of the sitemapindex for a large list that is split

        Nothing is read from the component sitemaps themselves: the
        number of parts is taken from self.requires_multifile() and the
        URI of each part is generated by self.part_name(basename, n).

        Raises ListBaseIndexError if self.requires_multifile() gives a
        false value.

        Q - should there be a flag to select generation of each component
        sitemap in order to calculate the md5sum?

        Q - what timestamp should be used?
        """
        part_count = self.requires_multifile()
        if part_count:
            # Build a list with one entry per component sitemap
            sitemap_index = ListBase()
            sitemap_index.sitemapindex = True
            sitemap_index.capability_name = self.capability_name
            sitemap_index.default_capability()
            for part_number in range(part_count):
                part_uri = self.part_name(basename, part_number)
                sitemap_index.add(Resource(uri=part_uri))
            return sitemap_index.as_xml()
        # A false part count means that no index is called for
        message = "Request for sitemapindex for list with only %d entries when max_sitemap_entries is set to %s"
        raise ListBaseIndexError(message % (len(self), str(self.max_sitemap_entries)))
Exemple #8
0
    def write(self, basename='/tmp/sitemap.xml'):
        """Write one sitemap, or a sitemapindex plus a set of sitemaps, to disk

        self.resources is accessed only through an iterator, so it may be
        a generator: data is read as needed and the number of entries is
        known only at the end.

        basename is used as the name of the single sitemap file or of the
        sitemapindex for a set of sitemap files. Names for the component
        sitemaps come from self.part_name(), and self.mapper is used to
        turn file names into the URIs recorded in the sitemapindex and in
        the link from each sitemap up to the index.

        The first chunk is obtained from self.get_resources_chunk(). If
        that call reports that further resources remain then a set of
        sitemap files, with a sitemapindex, is written provided that
        self.allow_multifile is set true.

        Raises ListBaseIndexError if multiple files are needed but
        self.allow_multifile is not set, or if the mapper cannot map the
        sitemapindex or a sitemap file name to a URI.

        Every output file is opened in a with block so that the file
        handle is closed even if writing the XML raises an exception.
        """
        # Access resources through iterator only
        resources_iter = iter(self.resources)
        (chunk, pending) = self.get_resources_chunk(resources_iter)
        s = self.new_sitemap()
        if (pending is not None):
            # Have more than self.max_sitemap_entries => sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
            # Work out URI of sitemapindex so that we can link up to
            # it from the individual sitemap files
            try:
                index_uri = self.mapper.dst_to_src(basename)
            except MapperError as e:
                raise ListBaseIndexError("Cannot map sitemapindex filename to URI (%s)" % str(e))
            # Use iterator over all resources and count off sets of
            # max_sitemap_entries to go into each sitemap, store the
            # names of the sitemaps as we go. Copy md from self into
            # the index and use this for all chunks also
            index = ListBase(md=self.md.copy(), ln=list(self.ln))
            index.capability_name = self.capability_name
            index.default_capability()
            while (len(chunk) > 0):
                part_file = self.part_name(basename, len(index))
                # Check that we can map the filename of this sitemap into
                # URI space for the sitemapindex, before writing anything
                try:
                    uri = self.mapper.dst_to_src(part_file)
                except MapperError as e:
                    raise ListBaseIndexError("Cannot map sitemap filename to URI (%s)" % str(e))
                self.logger.info("Writing sitemap %s..." % (part_file))
                chunk.index = index_uri
                chunk.md = index.md
                with open(part_file, 'w') as fh:
                    s.resources_as_xml(chunk, fh=fh)
                # Record information about this sitemap for index. The
                # file must be closed first so that the mtime and md5
                # are taken from the complete file
                r = Resource(uri=uri,
                             timestamp=os.stat(part_file).st_mtime,
                             md5=compute_md5_for_file(part_file))
                index.add(r)
                # Get next chunk, starting with the look-ahead item
                (chunk, pending) = self.get_resources_chunk(resources_iter, pending)
            self.logger.info("Wrote %d sitemaps" % (len(index)))
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemapindex %s..." % (basename))
                s.resources_as_xml(index, sitemapindex=True, fh=fh)
            self.logger.info("Wrote sitemapindex %s" % (basename))
        else:
            # Everything fitted in the first chunk => single sitemap
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=fh)
            self.logger.info("Wrote sitemap %s" % (basename))
    def write(self, basename='/tmp/sitemap.xml'):
        """Write one sitemap, or a sitemapindex plus a set of sitemaps, to disk

        self.resources is accessed only through an iterator, so it may be
        a generator: data is read as needed and the number of entries is
        known only at the end.

        basename is used as the name of the single sitemap file or of the
        sitemapindex for a set of sitemap files. Names for the component
        sitemaps come from self.part_name(), and self.mapper is used to
        turn file names into the URIs recorded in the sitemapindex and in
        the link from each sitemap up to the index.

        The first chunk is obtained from self.get_resources_chunk(). If
        that call reports that further resources remain then a set of
        sitemap files, with a sitemapindex, is written provided that
        self.allow_multifile is set true.

        Raises ListBaseIndexError if multiple files are needed but
        self.allow_multifile is not set, or if the mapper cannot map the
        sitemapindex or a sitemap file name to a URI.

        Every output file is opened in a with block so that the file
        handle is closed even if writing the XML raises an exception.
        """
        # Access resources through iterator only
        resources_iter = iter(self.resources)
        (chunk, pending) = self.get_resources_chunk(resources_iter)
        s = self.new_sitemap()
        if (pending is not None):
            # Have more than self.max_sitemap_entries => sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError(
                    "Too many entries for a single sitemap but multifile disabled"
                )
            # Work out URI of sitemapindex so that we can link up to
            # it from the individual sitemap files
            try:
                index_uri = self.mapper.dst_to_src(basename)
            except MapperError as e:
                raise ListBaseIndexError(
                    "Cannot map sitemapindex filename to URI (%s)" % str(e))
            # Use iterator over all resources and count off sets of
            # max_sitemap_entries to go into each sitemap, store the
            # names of the sitemaps as we go. Copy md from self into
            # the index and use this for all chunks also
            index = ListBase(md=self.md.copy(), ln=list(self.ln))
            index.capability_name = self.capability_name
            index.default_capability()
            while (len(chunk) > 0):
                part_file = self.part_name(basename, len(index))
                # Check that we can map the filename of this sitemap into
                # URI space for the sitemapindex, before writing anything
                try:
                    uri = self.mapper.dst_to_src(part_file)
                except MapperError as e:
                    raise ListBaseIndexError(
                        "Cannot map sitemap filename to URI (%s)" % str(e))
                self.logger.info("Writing sitemap %s..." % (part_file))
                chunk.index = index_uri
                chunk.md = index.md
                with open(part_file, 'w') as fh:
                    s.resources_as_xml(chunk, fh=fh)
                # Record information about this sitemap for index. The
                # file must be closed first so that the mtime and md5
                # are taken from the complete file
                r = Resource(uri=uri,
                             timestamp=os.stat(part_file).st_mtime,
                             md5=compute_md5_for_file(part_file))
                index.add(r)
                # Get next chunk, starting with the look-ahead item
                (chunk, pending) = self.get_resources_chunk(resources_iter,
                                                            pending)
            self.logger.info("Wrote %d sitemaps" % (len(index)))
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemapindex %s..." % (basename))
                s.resources_as_xml(index, sitemapindex=True, fh=fh)
            self.logger.info("Wrote sitemapindex %s" % (basename))
        else:
            # Everything fitted in the first chunk => single sitemap
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=fh)
            self.logger.info("Wrote sitemap %s" % (basename))