Beispiel #1
0
 def add_file(self, resource_list=None, dir=None, file=None):
     """Create a Resource for one file on disk and add it to resource_list.

     Nothing is added when self.exclude_file() rejects the file, when the
     path is not a regular file, or when it is a symlink and
     self.include_symlinks is not set. If dir is given it is joined in
     front of file to form the local path.

     The Resource takes its URI from self.mapper.dst_to_src() and its
     timestamp from the modification time reported by os.stat(). The
     object settings set_path, set_md5 and set_length control whether the
     local path, md5 and length are recorded as well.

     An OSError raised during these checks is reported on stderr and the
     file is ignored. An Exception is raised if the mapper returns None.
     """
     local_path = file
     try:
         if self.exclude_file(local_path):
             self.logger.debug("Excluding file %s" % (local_path))
             return
         # Build the local filename, from which the URI is derived
         if dir is not None:
             local_path = os.path.join(dir, local_path)
         if not os.path.isfile(local_path):
             return
         # Symlinks are only followed when explicitly enabled
         if not self.include_symlinks and os.path.islink(local_path):
             return
         resource_uri = self.mapper.dst_to_src(local_path)
         if resource_uri is None:
             raise Exception("Internal error, mapping failed")
         stat_info = os.stat(local_path)
     except OSError as err:
         sys.stderr.write("Ignoring file %s (error: %s)" % (local_path, str(err)))
         return
     resource = Resource(uri=resource_uri, timestamp=stat_info.st_mtime)
     if self.set_path:
         # record full local path
         resource.path = local_path
     if self.set_md5:
         # record md5 of the file content
         resource.md5 = compute_md5_for_file(local_path)
     if self.set_length:
         # record size in bytes as reported by stat
         resource.length = stat_info.st_size
     resource_list.add(resource)
 def add_file(self, resource_list=None, dir=None, file=None):
     """Add a single file to resource_list

     The file is ignored if self.exclude_file() matches it, if it is not
     a regular file, or if it is a symlink while self.include_symlinks is
     false. When dir is supplied the file name is taken relative to it.

     Follows object settings of set_path, set_md5 and set_length to
     decide which optional attributes are stored on the new Resource.
     OSError is caught and reported on stderr; a mapping that yields
     None raises Exception.
     """
     target = file
     try:
         excluded = self.exclude_file(target)
         if excluded:
             self.logger.debug("Excluding file %s" % (target))
             return
         # work out the filename on disk and the matching URI
         if dir is not None:
             target = os.path.join(dir, target)
         is_wanted = (os.path.isfile(target) and
                      (self.include_symlinks or not os.path.islink(target)))
         if not is_wanted:
             return
         uri = self.mapper.dst_to_src(target)
         if uri is None:
             raise Exception("Internal error, mapping failed")
         info = os.stat(target)
     except OSError as exc:
         sys.stderr.write("Ignoring file %s (error: %s)" % (target, str(exc)))
         return
     mtime = info.st_mtime  # UTC
     entry = Resource(uri=uri, timestamp=mtime)
     if self.set_path:
         entry.path = target  # full local path
     if self.set_md5:
         entry.md5 = compute_md5_for_file(target)
     if self.set_length:
         entry.length = info.st_size
     resource_list.add(entry)
    def write(self, basename='/tmp/sitemap.xml', **kwargs):
        """Write one or a set of sitemap files to disk

        The entries of self.resources are accessed through an iterator
        only, so self.resources may be a generator that is read as needed.

        basename is used as the name of the single sitemap file or the
        sitemapindex for a set of sitemap files. Any extra keyword
        arguments are passed to the Sitemap constructor.

        self.get_resources_chunk() splits the entries into chunks
        (presumably of at most self.max_sitemap_entries -- confirm against
        that method). If everything fits in the first chunk a single
        sitemap is written to basename. Otherwise, provided
        self.allow_multifile is true, each chunk is written to its own
        numbered file (a five digit number inserted before a trailing
        ".xml", else appended with ".xml") and a sitemapindex listing
        those files is written to basename.

        Raises ListBaseIndexError if there is more than one chunk but
        self.allow_multifile is false.
        """
        # Access resources through iterator only
        resources_iter = iter(self.resources)
        (chunk, pending) = self.get_resources_chunk(resources_iter)
        s = Sitemap(**kwargs)
        if pending is None:
            # Everything fits in a single sitemap
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=fh)
            self.logger.info("Wrote sitemap %s" % (basename))
            return
        # More than one chunk => sitemapindex
        if not self.allow_multifile:
            raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
        # Work out how to name the sitemaps, attempt to add %05d before ".xml$", else append
        sitemap_suffix = '.xml'
        if basename.endswith(sitemap_suffix):
            sitemap_prefix = basename[:-len(sitemap_suffix)]
        else:
            sitemap_prefix = basename
        # Use iterator over all resources and count off one chunk per
        # sitemap, recording each sitemap written for the index
        sitemaps = ListBase()
        while len(chunk) > 0:
            part_file = sitemap_prefix + ("%05d" % (len(sitemaps))) + sitemap_suffix
            self.logger.info("Writing sitemap %s..." % (part_file))
            # with-block guarantees the handle is closed (and data flushed)
            # even if serialization fails part way through
            with open(part_file, 'w') as fh:
                s.resources_as_xml(chunk, fh=fh)
            # Record information about this sitemap for index; must happen
            # after the file is closed so stat and md5 see the final content
            r = Resource(uri=self.mapper.dst_to_src(part_file),
                         path=part_file,
                         timestamp=os.stat(part_file).st_mtime,
                         md5=compute_md5_for_file(part_file))
            sitemaps.add(r)
            # Get next chunk
            (chunk, pending) = self.get_resources_chunk(resources_iter, pending)
        self.logger.info("Wrote %d sitemaps" % (len(sitemaps)))
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemapindex %s..." % (basename))
            s.resources_as_xml(resources=sitemaps, sitemapindex=True, fh=fh)
        self.logger.info("Wrote sitemapindex %s" % (basename))
    def from_disk_add_map(self, resource_list=None, map=None, set_path=False):
        """Add to resource_list with resources from a disk scan based on one map

        Walks the directory tree rooted at map.dst_path and adds one
        Resource to resource_list for every regular file found. A file is
        skipped when self.exclude_file() matches its name, or when it is a
        symlink and self.include_symlinks is not set. Directories named in
        self.exclude_dirs are not descended into.

        Each Resource has its URI from map.dst_to_src() and its timestamp
        from the file modification time. If set_path is True then the path
        attribute will be set with the local path for each Resource.
        self.do_md5 and self.do_length control whether md5 and length are
        recorded.

        Raises ValueError if resource_list or map is None, and Exception
        if the map returns None for a file. An OSError while examining a
        file is reported on stderr and that file is ignored.
        """
        # sanity
        if (resource_list is None or map is None):
            raise ValueError("Must specify resource_list and map")
        path = map.dst_path
        # for each file: create Resource object, add, increment counter
        num_files = 0
        for dirpath, dirs, files in os.walk(path, topdown=True):
            for file_in_dirpath in files:
                num_files += 1
                if (num_files % 50000 == 0):
                    # periodic progress report for very large trees
                    self.logger.info("ResourceListBuilder.from_disk_add_map: %d files..." % (num_files))
                # Build the full filename before the try block so that the
                # error handler always reports the file being processed
                file_path = os.path.join(dirpath, file_in_dirpath)
                try:
                    if self.exclude_file(file_in_dirpath):
                        self.logger.debug("Excluding file %s" % (file_in_dirpath))
                        continue
                    if (not os.path.isfile(file_path) or
                            not (self.include_symlinks or not os.path.islink(file_path))):
                        continue
                    uri = map.dst_to_src(file_path)
                    if (uri is None):
                        raise Exception("Internal error, mapping failed")
                    file_stat = os.stat(file_path)
                except OSError as e:
                    sys.stderr.write("Ignoring file %s (error: %s)" % (file_path, str(e)))
                    continue
                timestamp = file_stat.st_mtime  # UTC
                r = Resource(uri=uri, timestamp=timestamp)
                if (set_path):
                    r.path = file_path
                if (self.do_md5):
                    # add md5
                    r.md5 = compute_md5_for_file(file_path)
                if (self.do_length):
                    # add length
                    r.length = file_stat.st_size
                resource_list.add(r)
            # prune list of dirs based on self.exclude_dirs; with
            # topdown=True removing entries in place stops os.walk from
            # descending into them
            for exclude in self.exclude_dirs:
                if exclude in dirs:
                    self.logger.debug("Excluding dir %s" % (exclude))
                    dirs.remove(exclude)
Beispiel #5
0
    def write(self, basename='/tmp/sitemap.xml'):
        """Write one or a set of sitemap files to disk

        The entries of self.resources are accessed through an iterator
        only, so self.resources may be a generator that is read as needed.

        basename is used as the name of the single sitemap file or the
        sitemapindex for a set of sitemap files.

        self.get_resources_chunk() splits the entries into chunks
        (presumably of at most self.max_sitemap_entries -- confirm against
        that method). If everything fits in the first chunk a single
        sitemap is written to basename. Otherwise, provided
        self.allow_multifile is true, each chunk is written to the file
        named by self.part_name() and a sitemapindex listing those files
        is written to basename.

        Raises ListBaseIndexError if there is more than one chunk but
        self.allow_multifile is false, or if the sitemapindex or a sitemap
        filename cannot be mapped to a URI by self.mapper.
        """
        # Access resources through iterator only
        resources_iter = iter(self.resources)
        (chunk, pending) = self.get_resources_chunk(resources_iter)
        s = self.new_sitemap()
        if pending is None:
            # Everything fits in a single sitemap; the with-block closes
            # the file even if serialization raises
            with open(basename, 'w') as fh:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=fh)
            self.logger.info("Wrote sitemap %s" % (basename))
            return
        # More than one chunk => sitemapindex
        if not self.allow_multifile:
            raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
        # Work out URI of sitemapindex so that we can link up to
        # it from the individual sitemap files
        try:
            index_uri = self.mapper.dst_to_src(basename)
        except MapperError as e:
            raise ListBaseIndexError("Cannot map sitemapindex filename to URI (%s)" % str(e))
        # Use iterator over all resources and count off one chunk per
        # sitemap, recording each sitemap written in the index. Copy md
        # from self into the index and use this for all chunks also
        index = ListBase(md=self.md.copy(), ln=list(self.ln))
        index.capability_name = self.capability_name
        index.default_capability()
        while len(chunk) > 0:
            part_file = self.part_name(basename, len(index))
            # Check that we can map the filename of this sitemap into
            # URI space for the sitemapindex
            try:
                uri = self.mapper.dst_to_src(part_file)
            except MapperError as e:
                raise ListBaseIndexError("Cannot map sitemap filename to URI (%s)" % str(e))
            self.logger.info("Writing sitemap %s..." % (part_file))
            with open(part_file, 'w') as fh:
                chunk.index = index_uri
                chunk.md = index.md
                s.resources_as_xml(chunk, fh=fh)
            # Record information about this sitemap for index; done after
            # the file is closed so stat and md5 see the final content
            r = Resource(uri=uri,
                         timestamp=os.stat(part_file).st_mtime,
                         md5=compute_md5_for_file(part_file))
            index.add(r)
            # Get next chunk
            (chunk, pending) = self.get_resources_chunk(resources_iter, pending)
        self.logger.info("Wrote %d sitemaps" % (len(index)))
        with open(basename, 'w') as fh:
            self.logger.info("Writing sitemapindex %s..." % (basename))
            s.resources_as_xml(index, sitemapindex=True, fh=fh)
        self.logger.info("Wrote sitemapindex %s" % (basename))
    def write(self, basename='/tmp/sitemap.xml'):
        """Write one or a set of sitemap files to disk

        self.resources is only ever read through an iterator, so it may
        be a generator whose data is produced as needed.

        basename is used as the name of the single sitemap file or the
        sitemapindex for a set of sitemap files.

        The entries are split into chunks by self.get_resources_chunk()
        (presumably limited by self.max_sitemap_entries -- confirm against
        that method). A single chunk is written as one sitemap to
        basename. With more than one chunk, and self.allow_multifile set
        true, every chunk goes to the file named by self.part_name() and
        a sitemapindex referencing those files is written to basename.

        Raises ListBaseIndexError if multiple files are needed but
        self.allow_multifile is false, or if self.mapper cannot map the
        sitemapindex or a sitemap filename to a URI.
        """
        # Access resources through iterator only
        resources_iter = iter(self.resources)
        (chunk, remainder) = self.get_resources_chunk(resources_iter)
        s = self.new_sitemap()
        if (remainder is not None):
            # Have more than one chunk => sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError(
                    "Too many entries for a single sitemap but multifile disabled"
                )
            # Work out URI of sitemapindex so that we can link up to
            # it from the individual sitemap files
            try:
                index_uri = self.mapper.dst_to_src(basename)
            except MapperError as e:
                raise ListBaseIndexError(
                    "Cannot map sitemapindex filename to URI (%s)" % str(e))
            # Use iterator over all resources and count off one chunk
            # per sitemap, storing the sitemaps in the index as we go.
            # Copy md from self into the index and use this for all
            # chunks also
            index = ListBase(md=self.md.copy(), ln=list(self.ln))
            index.capability_name = self.capability_name
            index.default_capability()
            while (len(chunk) > 0):
                sitemap_file = self.part_name(basename, len(index))
                # Check that we can map the filename of this sitemap into
                # URI space for the sitemapindex
                try:
                    uri = self.mapper.dst_to_src(sitemap_file)
                except MapperError as e:
                    raise ListBaseIndexError(
                        "Cannot map sitemap filename to URI (%s)" % str(e))
                self.logger.info("Writing sitemap %s..." % (sitemap_file))
                # Context manager closes the handle even when writing
                # the XML fails
                with open(sitemap_file, 'w') as out:
                    chunk.index = index_uri
                    chunk.md = index.md
                    s.resources_as_xml(chunk, fh=out)
                # Record information about this sitemap for index, after
                # the file is closed so stat and md5 see the final content
                r = Resource(uri=uri,
                             timestamp=os.stat(sitemap_file).st_mtime,
                             md5=compute_md5_for_file(sitemap_file))
                index.add(r)
                # Get next chunk
                (chunk, remainder) = self.get_resources_chunk(resources_iter,
                                                              remainder)
            self.logger.info("Wrote %d sitemaps" % (len(index)))
            with open(basename, 'w') as out:
                self.logger.info("Writing sitemapindex %s..." % (basename))
                s.resources_as_xml(index, sitemapindex=True, fh=out)
            self.logger.info("Wrote sitemapindex %s" % (basename))
        else:
            # Single sitemap holds everything
            with open(basename, 'w') as out:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=out)
            self.logger.info("Wrote sitemap %s" % (basename))