def write(self, basename='/tmp/sitemap.xml', **kwargs):
    """Write one or a set of sitemap files to disk.

    Resources are taken from self.resources through an iterator only, so
    self.resources may be a generator: data is read as needed and the
    number of entries is only known at the end.

    basename is used as the name of the single sitemap file, or of the
    sitemapindex when a set of sitemap files is written. Component
    sitemaps are named by inserting a zero-padded five digit sequence
    number before a trailing ".xml" in basename; if basename does not
    end in ".xml" the number and ".xml" are appended.

    Any extra keyword arguments are passed to the Sitemap constructor.

    The chunking done by self.get_resources_chunk() decides whether
    everything fits in one sitemap. If there are more entries and
    self.allow_multifile is true then a set of sitemap files, with a
    sitemapindex, is written.

    Raises ListBaseIndexError if more than one sitemap is required but
    self.allow_multifile is not set.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    (chunk, next_resource) = self.get_resources_chunk(resources_iter)
    s = Sitemap(**kwargs)

    if next_resource is None:
        # Everything fitted in the first chunk => single sitemap
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemap %s..." % (basename))
            s.resources_as_xml(chunk, fh=fh)
        self.logger.info("Wrote sitemap %s" % (basename))
        return

    # Have more than one chunk => sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")

    # Work out how to name the sitemaps, attempt to add %05d before
    # ".xml$", else append
    sitemap_prefix = basename
    sitemap_suffix = '.xml'
    if basename.endswith('.xml'):
        sitemap_prefix = basename[:-4]

    # Use iterator over all resources and count off one chunk per
    # sitemap, storing a record of each sitemap as we go
    sitemaps = ListBase()
    while len(chunk) > 0:
        sitemap_file = sitemap_prefix + ("%05d" % (len(sitemaps))) + sitemap_suffix
        self.logger.info("Writing sitemap %s..." % (sitemap_file))
        # The context manager guarantees the file is closed (and so
        # flushed) before it is stat'ed and checksummed below, and also
        # if writing the XML raises
        with open(sitemap_file, 'w') as fh:
            s.resources_as_xml(chunk, fh=fh)
        # Record information about this sitemap for index
        r = Resource(uri=self.mapper.dst_to_src(sitemap_file),
                     path=sitemap_file,
                     timestamp=os.stat(sitemap_file).st_mtime,
                     md5=compute_md5_for_file(sitemap_file))
        sitemaps.add(r)
        # Get next chunk, seeded with the item already read from the iterator
        (chunk, next_resource) = self.get_resources_chunk(resources_iter, next_resource)
    self.logger.info("Wrote %d sitemaps" % (len(sitemaps)))

    with open(basename, 'w') as fh:
        self.logger.info("Writing sitemapindex %s..." % (basename))
        s.resources_as_xml(resources=sitemaps, sitemapindex=True, fh=fh)
    self.logger.info("Wrote sitemapindex %s" % (basename))
def get_resources_chunk(self, resource_iter, first=None):
    """Return next chunk of resources from resource_iter, and next item.

    If first is specified then it is added to the chunk before anything
    is read from resource_iter.

    Items are read from resource_iter until the chunk holds at least
    self.max_sitemap_entries entries or the iterator is exhausted.

    Returns a tuple (chunk, next) where chunk is a new ListBase that
    carries this object's capability_name and capability_md, and next is
    the following value from the iterator, or None if there are no more.
    Pass next back in as first when asking for the following chunk.
    """
    chunk = ListBase()
    chunk.capability_name = self.capability_name
    chunk.capability_md = self.capability_md
    chunk.default_capability_and_modified()
    if first is not None:
        chunk.add(first)
    for r in resource_iter:
        chunk.add(r)
        if len(chunk) >= self.max_sitemap_entries:
            break
    # Get next to see whether there are more resources. The built-in
    # next() works with both Python 2.6+ and Python 3 iterators (the
    # .next() method exists only in Python 2); the None default stands
    # in for catching StopIteration.
    next_resource = next(resource_iter, None)
    return (chunk, next_resource)
def get_resources_chunk(self, resource_iter, first=None):
    """Return next chunk of resources from resource_iter, and next item.

    If first is specified then it is added to the chunk before anything
    is read from resource_iter.

    Items are read from resource_iter until the chunk holds at least
    self.max_sitemap_entries entries or the iterator is exhausted.

    Returns a tuple (chunk, next) where chunk is a new ListBase built
    with a copy of self.md and of self.ln and with this object's
    capability_name, and next is the following value from the iterator,
    or None if there are no more. Pass next back in as first when asking
    for the following chunk.
    """
    chunk = ListBase(md=self.md.copy(), ln=list(self.ln))
    chunk.capability_name = self.capability_name
    chunk.default_capability()
    if first is not None:
        chunk.add(first)
    for r in resource_iter:
        chunk.add(r)
        if len(chunk) >= self.max_sitemap_entries:
            break
    # Get next to see whether there are more resources. The built-in
    # next() works with both Python 2.6+ and Python 3 iterators (the
    # .next() method exists only in Python 2); the None default stands
    # in for catching StopIteration.
    next_resource = next(resource_iter, None)
    return (chunk, next_resource)
def as_xml_part(self, basename="/tmp/sitemap.xml", part_number=0):
    """Return a string of component sitemap part_number for a large list that is split.

    basename is used to create "index" links to the sitemapindex: it is
    stored as the index attribute of the part before serialization.

    part_number counts from zero; the part covers entries
    part_number * self.max_sitemap_entries up to, but not including, the
    following multiple (or the end of the list if that comes first).

    Raises ListBaseIndexError if self.requires_multifile() is false for
    this list, or if part_number is too high so that the part would
    start at or beyond the end of the list.

    Q - what timestamp should be used?
    """
    if not self.requires_multifile():
        raise ListBaseIndexError("Request for component sitemap for list with only %d entries when max_sitemap_entries is set to %s" % (len(self), str(self.max_sitemap_entries)))
    total = len(self)
    start = part_number * self.max_sitemap_entries
    # Entries are indexed from zero, so a part starting at index
    # len(self) would be empty: reject it as well as anything beyond.
    # NOTE(review): assumes requires_multifile() is never true for an
    # empty list - confirm.
    if start >= total:
        raise ListBaseIndexError("Request for component sitemap with part_number too high, would start at entry %d yet the list has only %d entries" % (start, total))
    stop = min(start + self.max_sitemap_entries, total)
    part = ListBase(itertools.islice(self.resources, start, stop))
    part.capability_name = self.capability_name
    part.default_capability()
    part.index = basename
    s = self.new_sitemap()
    return s.resources_as_xml(part)
def as_xml_part(self, basename="/tmp/sitemap.xml", part_number=0):
    """Return a string of component sitemap part_number for a large list that is split.

    basename is used to create "index" links to the sitemapindex: it is
    stored as the index attribute of the part before serialization.

    part_number counts from zero; the part covers entries
    part_number * self.max_sitemap_entries up to, but not including, the
    following multiple (or the end of the list if that comes first).

    Raises ListBaseIndexError if self.requires_multifile() is false for
    this list, or if part_number is too high so that the part would
    start at or beyond the end of the list.

    Q - what timestamp should be used?
    """
    if not self.requires_multifile():
        raise ListBaseIndexError(
            "Request for component sitemap for list with only %d entries when max_sitemap_entries is set to %s"
            % (len(self), str(self.max_sitemap_entries)))
    num_entries = len(self)
    start = part_number * self.max_sitemap_entries
    # Entries are indexed from zero, so a part starting at index
    # len(self) would be empty: reject it as well as anything beyond.
    # NOTE(review): assumes requires_multifile() is never true for an
    # empty list - confirm.
    if start >= num_entries:
        raise ListBaseIndexError(
            "Request for component sitemap with part_number too high, would start at entry %d yet the list has only %d entries"
            % (start, num_entries))
    stop = min(start + self.max_sitemap_entries, num_entries)
    part = ListBase(itertools.islice(self.resources, start, stop))
    part.capability_name = self.capability_name
    part.default_capability()
    part.index = basename
    s = self.new_sitemap()
    return s.resources_as_xml(part)
def as_xml_index(self, basename="/tmp/sitemap.xml"):
    """Return a string of the sitemapindex for a large list that is split.

    The value of self.requires_multifile() is used as the number of
    component sitemaps, and the URI of each one is generated from
    basename and its sequence number with self.part_name().

    Raises ListBaseIndexError if self.requires_multifile() is false.

    Q - should there be a flag to select generation of each component
    sitemap in order to calculate the md5sum?

    Q - what timestamp should be used?
    """
    part_count = self.requires_multifile()
    if part_count:
        sitemapindex = ListBase()
        sitemapindex.sitemapindex = True
        sitemapindex.capability_name = self.capability_name
        sitemapindex.default_capability()
        # One entry per component sitemap, carrying only its URI
        for part_number in range(part_count):
            sitemapindex.add(Resource(uri=self.part_name(basename, part_number)))
        return sitemapindex.as_xml()
    # Not split across multiple files, so there is no index to build
    details = (len(self), str(self.max_sitemap_entries))
    raise ListBaseIndexError("Request for sitemapindex for list with only %d entries when max_sitemap_entries is set to %s" % details)
def as_xml_index(self, basename="/tmp/sitemap.xml"):
    """Return a string of the sitemapindex for a large list that is split.

    The value of self.requires_multifile() is used as the number of
    component sitemaps, and the URI of each one is generated from
    basename and its sequence number with self.part_name().

    Raises ListBaseIndexError if self.requires_multifile() is false.

    Q - should there be a flag to select generation of each component
    sitemap in order to calculate the md5sum?

    Q - what timestamp should be used?
    """
    how_many = self.requires_multifile()
    if not how_many:
        entries = len(self)
        limit = str(self.max_sitemap_entries)
        raise ListBaseIndexError(
            "Request for sitemapindex for list with only %d entries when max_sitemap_entries is set to %s" % (entries, limit)
        )

    listing = ListBase()
    listing.sitemapindex = True
    listing.capability_name = self.capability_name
    listing.default_capability()

    # Add the component sitemaps in order, identified by URI alone
    seq = 0
    while seq < how_many:
        part_uri = self.part_name(basename, seq)
        listing.add(Resource(uri=part_uri))
        seq += 1

    xml = listing.as_xml()
    return xml
def write(self, basename='/tmp/sitemap.xml'):
    """Write one or a set of sitemap files to disk.

    Resources are taken from self.resources through an iterator only, so
    self.resources may be a generator: data is read as needed and the
    number of entries is only known at the end.

    basename is used as the name of the single sitemap file, or of the
    sitemapindex when a set of sitemap files is written. The names of
    the component sitemaps come from self.part_name().

    The chunking done by self.get_resources_chunk() decides whether
    everything fits in one sitemap. If there are more entries and
    self.allow_multifile is true then a set of sitemap files, with a
    sitemapindex, is written.

    Raises ListBaseIndexError if more than one sitemap is required but
    self.allow_multifile is not set, or if the filename of the
    sitemapindex or of a component sitemap cannot be mapped to a URI.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    (chunk, next_resource) = self.get_resources_chunk(resources_iter)
    s = self.new_sitemap()

    if next_resource is None:
        # Everything fitted in the first chunk => single sitemap
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemap %s..." % (basename))
            s.resources_as_xml(chunk, fh=fh)
        self.logger.info("Wrote sitemap %s" % (basename))
        return

    # Have more than one chunk => sitemapindex
    if not self.allow_multifile:
        raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
    # Work out URI of sitemapindex so that we can link up to
    # it from the individual sitemap files
    try:
        index_uri = self.mapper.dst_to_src(basename)
    except MapperError as e:
        raise ListBaseIndexError("Cannot map sitemapindex filename to URI (%s)" % str(e))
    # Use iterator over all resources and count off one chunk per
    # sitemap, storing a record of each sitemap as we go. Copy md
    # from self into the index and use this for all chunks also
    index = ListBase(md=self.md.copy(), ln=list(self.ln))
    index.capability_name = self.capability_name
    index.default_capability()
    while len(chunk) > 0:
        part_file = self.part_name(basename, len(index))
        # Check that we can map the filename of this sitemap into
        # URI space for the sitemapindex
        try:
            uri = self.mapper.dst_to_src(part_file)
        except MapperError as e:
            raise ListBaseIndexError("Cannot map sitemap filename to URI (%s)" % str(e))
        self.logger.info("Writing sitemap %s..." % (part_file))
        # The context manager guarantees the file is closed (and so
        # flushed) before it is stat'ed and checksummed below, and also
        # if writing the XML raises
        with open(part_file, 'w') as fh:
            chunk.index = index_uri
            chunk.md = index.md
            s.resources_as_xml(chunk, fh=fh)
        # Record information about this sitemap for index
        r = Resource(uri=uri,
                     timestamp=os.stat(part_file).st_mtime,
                     md5=compute_md5_for_file(part_file))
        index.add(r)
        # Get next chunk, seeded with the item already read from the iterator
        (chunk, next_resource) = self.get_resources_chunk(resources_iter, next_resource)
    self.logger.info("Wrote %d sitemaps" % (len(index)))

    with open(basename, 'w') as fh:
        self.logger.info("Writing sitemapindex %s..." % (basename))
        s.resources_as_xml(index, sitemapindex=True, fh=fh)
    self.logger.info("Wrote sitemapindex %s" % (basename))
def write(self, basename='/tmp/sitemap.xml'):
    """Write one or a set of sitemap files to disk.

    Resources are taken from self.resources through an iterator only, so
    self.resources may be a generator: data is read as needed and the
    number of entries is only known at the end.

    basename is used as the name of the single sitemap file, or of the
    sitemapindex when a set of sitemap files is written. The names of
    the component sitemaps come from self.part_name().

    The chunking done by self.get_resources_chunk() decides whether
    everything fits in one sitemap. If there are more entries and
    self.allow_multifile is true then a set of sitemap files, with a
    sitemapindex, is written.

    Raises ListBaseIndexError if more than one sitemap is required but
    self.allow_multifile is not set, or if the filename of the
    sitemapindex or of a component sitemap cannot be mapped to a URI.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    (chunk, upcoming) = self.get_resources_chunk(resources_iter)
    s = self.new_sitemap()
    if upcoming is not None:
        # Have more than one chunk => sitemapindex
        if not self.allow_multifile:
            raise ListBaseIndexError(
                "Too many entries for a single sitemap but multifile disabled"
            )
        # Work out URI of sitemapindex so that we can link up to
        # it from the individual sitemap files
        try:
            index_uri = self.mapper.dst_to_src(basename)
        except MapperError as e:
            raise ListBaseIndexError(
                "Cannot map sitemapindex filename to URI (%s)" % str(e))
        # Use iterator over all resources and count off one chunk per
        # sitemap, storing a record of each sitemap as we go. Copy md
        # from self into the index and use this for all chunks also
        index = ListBase(md=self.md.copy(), ln=list(self.ln))
        index.capability_name = self.capability_name
        index.default_capability()
        while len(chunk) > 0:
            sitemap_path = self.part_name(basename, len(index))
            # Check that we can map the filename of this sitemap into
            # URI space for the sitemapindex
            try:
                uri = self.mapper.dst_to_src(sitemap_path)
            except MapperError as e:
                raise ListBaseIndexError(
                    "Cannot map sitemap filename to URI (%s)" % str(e))
            self.logger.info("Writing sitemap %s..." % (sitemap_path))
            # Using a context manager means the file is closed (and so
            # flushed) before it is stat'ed and checksummed below, and
            # is not leaked if writing the XML raises
            with open(sitemap_path, 'w') as fh:
                chunk.index = index_uri
                chunk.md = index.md
                s.resources_as_xml(chunk, fh=fh)
            # Record information about this sitemap for index
            r = Resource(uri=uri,
                         timestamp=os.stat(sitemap_path).st_mtime,
                         md5=compute_md5_for_file(sitemap_path))
            index.add(r)
            # Get next chunk, seeded with the item already read from
            # the iterator
            (chunk, upcoming) = self.get_resources_chunk(resources_iter, upcoming)
        self.logger.info("Wrote %d sitemaps" % (len(index)))
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemapindex %s..." % (basename))
            s.resources_as_xml(index, sitemapindex=True, fh=fh)
        self.logger.info("Wrote sitemapindex %s" % (basename))
    else:
        # Everything fitted in the first chunk => single sitemap
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemap %s..." % (basename))
            s.resources_as_xml(chunk, fh=fh)
        self.logger.info("Wrote sitemap %s" % (basename))