def from_disk(self, resource_list=None, paths=None):
        """Create or extend resource_list with resources from disk scan

        Assumes very simple disk path to URL mapping (in self.mapping): chop 
        path and replace with url_path. Returns the new or extended ResourceList
        object.

        If a resource_list is specified then items are added to that rather
        than creating a new one.

        If paths is specified then these are used instead of the set 
        of local paths in self.mapping.

        Example usage with mapping start paths:
        
        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk()

        Example usage with explicit paths:

        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk(paths=['/path/to/files/a','/path/to/files/b'])
        """
        num = 0
        # Either use resource_list passed in or make a new one
        if (resource_list is None):
            resource_list = ResourceList()
        # Compile exclude pattern matches
        self.compile_excludes()
        # Work out start paths from map if not explicitly specified
        if (paths is None):
            paths = []
            for map in self.mapper.mappings:
                paths.append(map.dst_path)
        # Set start time unless already set (perhaps because building in chunks)
        if (resource_list.md_at is None):
            resource_list.md_at = datetime_to_str()
        # Run for each map in the mappings
        for path in paths:
            self.logger.info("Scanning disk from %s" % (path))
            self.from_disk_add_path(path=path, resource_list=resource_list)
        # Set end time
        resource_list.md_completed = datetime_to_str()
        return (resource_list)
Exemple #2
0
    def from_disk(self, resource_list=None, paths=None):
        """Create or extend resource_list with resources from disk scan

        Assumes very simple disk path to URL mapping (in self.mapping): chop 
        path and replace with url_path. Returns the new or extended ResourceList
        object.

        If a resource_list is specified then items are added to that rather
        than creating a new one.

        If paths is specified then these are used instead of the set 
        of local paths in self.mapping.

        Example usage with mapping start paths:
        
        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk()

        Example usage with explicit paths:

        mapper=Mapper('http://example.org/path','/path/to/files')
        rlb = ResourceListBuilder(mapper=mapper)
        m = rlb.from_disk(paths=['/path/to/files/a','/path/to/files/b'])
        """
        num=0
        # Either use resource_list passed in or make a new one
        if (resource_list is None):
            resource_list = ResourceList()
        # Compile exclude pattern matches
        self.compile_excludes()
        # Work out start paths from map if not explicitly specified
        if (paths is None):
            paths=[]
            for map in self.mapper.mappings:
                paths.append(map.dst_path)
        # Set start time unless already set (perhaps because building in chunks)
        if (resource_list.md_at is None):
            resource_list.md_at = datetime_to_str()
        # Run for each map in the mappings
        for path in paths:
            self.logger.info("Scanning disk from %s" % (path))
            self.from_disk_add_path(path=path, resource_list=resource_list)
        # Set end time
        resource_list.md_completed = datetime_to_str()
        return(resource_list)
Exemple #3
0
 def _str_datetime_now(self, x=None):
     """Return datetime string for use with time attributes
     
     Handling depends on input:
       'now'   - returns datetime for now
       number  - assume datetime values, generate string
       other   - no change, return same value
     """
     if (x == 'now'):
         # Now, this is wht datetime_to_str() with no arg gives
         return( datetime_to_str() )
     try:
         # Test for number
         junk = x + 0.0
         return datetime_to_str(x)
     except TypeError:
         # Didn't look like a number, treat as string
         return x
 def _str_datetime_now(self, x=None):
     """Return datetime string for use with time attributes
     
     Handling depends on input:
       'now'   - returns datetime for now
       number  - assume datetime values, generate string
       other   - no change, return same value
     """
     if (x == 'now'):
         # Now, this is wht datetime_to_str() with no arg gives
         return (datetime_to_str())
     try:
         # Test for number
         junk = x + 0.0
         return datetime_to_str(x)
     except TypeError:
         # Didn't look like a number, treat as string
         return x
    def default_capability_and_modified(self):
        """Set capability name and modified time in md

        Every ResourceSync document should have these two top-level
        metadata attributes.
        """
        if ('capability' not in self.md and self.capability_md is not None):
            self.md['capability']=self.capability_md
        if ('modified' not in self.md):
            self.md['modified']=datetime_to_str(no_fractions=True)
Exemple #6
0
 def head_on_file(self,file):
     """Mock up requests.head(..) response on local file
     """
     response = HeadResponse()
     if (not os.path.isfile(file)):
         response.status_code='404'
     else:
         response.status_code='200'
         response.headers['last-modified']=datetime_to_str(os.path.getmtime(file))
         response.headers['content-length']=os.path.getsize(file)
     return(response)
Exemple #7
0
    def incremental(self,
                    allow_deletion=False,
                    change_list_uri=None,
                    from_datetime=None):
        """Incremental synchronization

        Use Change List to do incremental sync
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (self.mapper.unsafe()):
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        from_timestamp = None
        if (from_datetime is not None):
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" %
                                       from_datetime)
    ### 1. Work out where to start from
        if (from_timestamp is None):
            from_timestamp = ClientState().get_state(self.sitemap)
            if (from_timestamp is None):
                raise ClientFatalError(
                    "Cannot do incremental sync. No stored timestamp for this site, and no explicit --from."
                )
    ### 2. Get URI of change list, from sitemap or explicit
        if (change_list_uri):
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
    ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source change list from %s (%s)" %
                (change_list, str(e)))
        self.logger.info("Read source change list, %d changes listed" %
                         (len(src_change_list)))
        #if (len(src_change_list)==0):
        #    raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_change_list.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source change list"
            )
    # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if (resource.timestamp is None):
                raise ClientFatalError(
                    "Aborting - missing timestamp for change in %s" % (uri))
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
    ### 4. Check that the change list has authority over URIs listed
    # FIXME - What does authority mean for change list? Here use both the
    # change list URI and, if we used it, the sitemap URI
        if (not self.noauth):
            uauth_cs = UrlAuthority(change_list, self.strictauth)
            if (not change_list_uri):
                uauth_sm = UrlAuthority(self.sitemap)
                for resource in src_change_list:
                    if (not uauth_cs.has_authority_over(resource.uri) and
                        (change_list_uri
                         or not uauth_sm.has_authority_over(resource.uri))):
                        raise ClientFatalError(
                            "Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                            % (change_list, resource.uri))
    ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if (num_skipped > 0):
            self.logger.info("Skipped %d changes before %s" %
                             (num_skipped, datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if (num_dupes > 0):
            self.logger.info("Removed %d prior changes" % (num_dupes))
    # Review and log status before
    # FIXME - should at this stage prune the change list to pick out
    # only the last change for each resource
        to_update = 0
        to_create = 0
        to_delete = 0
        for resource in src_change_list:
            if (resource.change == 'updated'):
                to_update += 1
            elif (resource.change == 'created'):
                to_create += 1
            elif (resource.change == 'deleted'):
                to_delete += 1
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
    # Log status based on what we know from the Change List. Exit if
    # either there are no changes or if there are only deletions and
    # we don't allow deletion
        in_sync = ((to_update + to_delete + to_create) == 0)
        self.log_status(in_sync=in_sync,
                        incremental=True,
                        created=to_create,
                        updated=to_update,
                        deleted=to_delete)
        if (in_sync or ((to_update + to_create) == 0 and not allow_deletion)):
            self.logger.debug("Completed incremental")
            return
    ### 6. Apply changes at same time or after from_timestamp
        delete_msg = (", and delete %d resources" %
                      to_delete) if (allow_deletion) else ''
        self.logger.warning("Will apply %d changes%s" %
                            (len(src_change_list), delete_msg))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.change == 'updated'):
                self.logger.info("updated: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'updated')
                num_updated += 1
            elif (resource.change == 'created'):
                self.logger.info("created: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'created')
                num_created += 1
            elif (resource.change == 'deleted'):
                num_deleted += self.delete_resource(resource, file,
                                                    allow_deletion)
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
    ### 7. Report status and planned actions
        self.log_status(incremental=True,
                        created=num_created,
                        updated=num_updated,
                        deleted=num_deleted,
                        to_delete=to_delete)
        ### 8. Record last timestamp we have seen
        if (self.last_timestamp > 0):
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))

    ### 9. Done
        self.logger.debug("Completed incremental sync")
Exemple #8
0
    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ('audit' if (audit_only) else 'baseline sync')
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if (len(self.mapper) < 1):
            raise ClientFatalError(
                "No source to destination mapping specified")
        if (not audit_only and self.mapper.unsafe()):
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        ### 1. Get inventories from both src and dst
        # 1.a source resource list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(
                allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source resource list from %s (%s)" %
                (self.sitemap, str(e)))
        self.logger.info("Read source resource list, %d resources listed" %
                         (len(src_resource_list)))
        if (len(src_resource_list) == 0):
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source resource list"
            )
        # 1.b destination resource list mapped back to source URIs
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource lists respecting any comparison options
        (same, updated, deleted,
         created) = dst_resource_list.compare(src_resource_list)
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated) + len(deleted) +
                                 len(created) == 0),
                        audit=True,
                        same=len(same),
                        created=len(created),
                        updated=len(updated),
                        deleted=len(deleted))
        if (audit_only or len(created) + len(updated) + len(deleted) == 0):
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        if (not self.noauth):
            uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
            for resource in src_resource_list:
                if (not uauth.has_authority_over(resource.uri)):
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" %
                      len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" %
                            (len(created) + len(updated), delete_msg))
        self.last_timestamp = 0
        num_created = 0
        num_updated = 0
        num_deleted = 0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            num_created += self.update_resource(resource, file, 'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            num_updated += self.update_resource(resource, file, 'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted += self.delete_resource(resource, file, allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp > 0):
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated) + len(deleted) +
                                 len(created) == 0),
                        same=len(same),
                        created=num_created,
                        updated=num_updated,
                        deleted=num_deleted,
                        to_delete=len(deleted))
        self.logger.debug("Completed %s" % (action))
Exemple #9
0
 def lastmod(self):
     """The Last-Modified data in W3C Datetime syntax, Z notation"""
     if self.timestamp is None:
         return None
     return datetime_to_str(self.timestamp)
Exemple #10
0
    def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
	"""Incremental synchronization

        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        from_timestamp = None
        if (from_datetime is not None):
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
        ### 1. Work out where to start from
        if (from_timestamp is None):
            from_timestamp=ClientState().get_state(self.sitemap)
            if (from_timestamp is None):
                raise ClientFatalError("No stored timestamp for this site, and no explicit --from")
        ### 2. Get URI of change list, from sitemap or explicit
        if (change_list_uri):
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
        ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError("Can't read source change list from %s (%s)" % (change_list,str(e)))
        self.logger.info("Read source change list, %d changes listed" % (len(src_change_list)))
        #if (len(src_change_list)==0):
        #    raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_change_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source change list")
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if (resource.timestamp is None):
                raise ClientFatalError("Aborting - missing timestamp for change in %s" % (uri))
            if (resource.timestamp > self.last_timestamp):
                self.last_timestamp = resource.timestamp
        ### 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(change_list)
        if (not change_list_uri):
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_change_list:
            if (not uauth_cs.has_authority_over(resource.uri) and 
                (change_list_uri or not uauth_sm.has_authority_over(resource.uri))):
                if (self.noauth):
                    #self.logger.info("Change list (%s) mentions resource at a location it does not have authority over (%s)" % (change_list,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list,resource.uri))
        ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if (num_skipped>0):
            self.logger.info("Skipped %d changes before %s" % (num_skipped,datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if (num_dupes>0):
            self.logger.info("Removed %d prior changes" % (num_dupes))
        ### 6. Apply changes at same time or after from_timestamp
        self.logger.info("Applying %d changes" % (len(src_change_list)))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if (resource.change == 'updated'):
                self.logger.info("updated: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'updated')
                num_updated+=1
            elif (resource.change == 'created'):
                self.logger.info("created: %s -> %s" % (uri,file))
                self.update_resource(resource,file,'created')
                num_created+=1
            elif (resource.change == 'deleted'):
                self.delete_resource(resource,file,allow_deletion)
                num_deleted+=1
            else:
                raise ClientError("Unknown change type %s" % (resource.change) )
        ### 7. Report status and planned actions
        self.log_status(in_sync=((num_updated+num_deleted+num_created)==0),
                        incremental=True,created=num_created, updated=num_updated, 
                        deleted=num_deleted)
        ### 8. Record last timestamp we have seen
        if (self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 9. Done
        self.logger.debug("Completed incremental sync")
Exemple #11
0
    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchonization or audit

	Both functions implemented in this routine because audit is a prerequisite
	for a baseline sync. In the case of baseline sync the last timestamp seen
        is recorded as client state.
	"""
        action = ( 'audit' if (audit_only) else 'baseline sync' ) 
        self.logger.debug("Starting "+action)
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst 
        # 1.a source resource_list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resource_list from %s (%s)" % (self.sitemap,str(e)))
        self.logger.info("Read source resource_list, %d resources listed" % (len(src_resource_list)))
        if (len(src_resource_list)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_resource_list.has_md5()):
            self.checksum=False
            self.logger.info("Not calculating checksums on destination as not present in source resource_list")
        # 1.b destination resource_list mapped back to source URIs
        rlb = ResourceListBuilder(mapper=self.mapper)
        rlb.do_md5=self.checksum
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource_lists respecting any comparison options
        (same,updated,deleted,created)=dst_resource_list.compare(src_resource_list)   
        ### 3. Report status and planned actions
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        audit=True,same=len(same),created=len(created),
                        updated=len(updated),deleted=len(deleted))
        if (audit_only or len(created)+len(updated)+len(deleted)==0):
            self.logger.debug("Completed "+action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_resource_list:
            if (not uauth.has_authority_over(resource.uri)):
                if (self.noauth):
                    #self.logger.info("Sitemap (%s) mentions resource at a location it does not have authority over (%s)" % (self.sitemap,resource.uri))
                    pass
                else:
                    raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap,resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" % len(deleted)) if (allow_deletion) else ''
        self.logger.warning("Will GET %d resources%s" % (len(created)+len(updated),delete_msg))
        self.last_timestamp = 0
        num_created=0
        num_updated=0
        num_deleted=0
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri,file))
            num_created+=self.update_resource(resource,file,'created')
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri,file))
            num_updated+=self.update_resource(resource,file,'updated')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            num_deleted+=self.delete_resource(resource,file,allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if (not audit_only and self.last_timestamp>0):
            ClientState().set_state(self.sitemap,self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(len(updated)+len(deleted)+len(created)==0),
                        same=len(same),created=num_created,
                        updated=num_updated,deleted=num_deleted)
        self.logger.debug("Completed %s" % (action))
Exemple #12
0
 def md_until(self):
     """md_until value in W3C Datetime syntax, Z notation"""
     return datetime_to_str(self._get_extra('ts_until'))
Exemple #13
0
 def md_from(self):
     """md_from value in W3C Datetime syntax, Z notation"""
     return datetime_to_str(self._get_extra('ts_from'))
Exemple #14
0
 def md_completed(self):
     """md_completed value in W3C Datetime syntax, Z notation"""
     return datetime_to_str(self._get_extra('ts_completed'))
Exemple #15
0
 def md_at(self):
     """md_at values in W3C Datetime syntax, Z notation"""
     return datetime_to_str(self._get_extra('ts_at'))
Exemple #16
0
 def lastmod(self):
     """The Last-Modified data in W3C Datetime syntax, Z notation"""
     return datetime_to_str(self.timestamp)