def test07_no_init_data(self):
    """Check a UrlAuthority built with no URL: scheme 'none', no authority."""
    authority = UrlAuthority()
    self.assertEqual(authority.master_scheme, 'none')
    # With no initial URL neither the host nor a subdomain is covered
    for url in ('http://a.example.org/sitemap.xml',
                'http://sub.a.example.org/sitemap.xml'):
        self.assertFalse(authority.has_authority_over(url))
def test02_strict_no_authority(self):
    """Check URLs rejected when the second constructor argument is True.

    The sitemap lives in /dir/; a URL above that directory, on a
    subdomain, or with a different scheme must not be covered.
    """
    authority = UrlAuthority('http://example.org/dir/sitemap.xml', True)
    rejected = (
        'http://example.org/sitemap.xml',
        'http://sub.example.org/sitemap.xml',
        'https://example.org/dir/sitemap.xml',
        'unknown://example.org/dir/sitemap.xml',
    )
    for url in rejected:
        self.assertFalse(authority.has_authority_over(url))
def test06_lax_domains(self):
    """Check the host and its subdomains are covered, sibling hosts are not."""
    authority = UrlAuthority('http://a.example.org/sitemap.xml')
    # Same host and a subdomain of it
    for url in ('http://a.example.org/sitemap.xml',
                'http://sub.a.example.org/sitemap.xml'):
        self.assertTrue(authority.has_authority_over(url))
    # Sibling host and a subdomain of the sibling
    for url in ('http://b.example.org/sitemap.xml',
                'http://sub.b.example.org/sitemap.xml'):
        self.assertFalse(authority.has_authority_over(url))
def test04_lax_authority(self):
    """Check URLs covered by a sitemap in /dir/ with default arguments."""
    # Default is server check only
    authority = UrlAuthority('http://example.org/dir/sitemap.xml')
    accepted = (
        'http://example.org/sitemap.xml',
        'http://example.org/sitemap.xml?anything',
        'http://example.org/sitemap.xml#frag',
        'http://example.org/dir/same_level',
        'http://example.org/dir/one/deeper',
        'http://example.org/dir/one/two/deeper',
        'http://example.org/shallower',
        'http://example.org/',
        'http://sub.example.org/subdomain',
        'http://sub.sub.example.org/subsubdomain',
    )
    for url in accepted:
        self.assertTrue(authority.has_authority_over(url))
def test1(self):
    """Check URLs covered by a sitemap located at the server root."""
    authority = UrlAuthority("http://example.org/sitemap.xml")
    accepted = (
        "http://example.org/sitemap.xml",
        "http://example.org/sitemap.xml?anything",
        "http://example.org/sitemap.xml#frag",
        "http://example.org/same_level",
        "http://example.org/one/deeper",
        "http://example.org/one/two/deeper",
        "http://example.org/",
        "http://sub.example.org/subdomain",
        "http://sub.sub.example.org/subsubdomain",
    )
    for url in accepted:
        self.assertTrue(authority.has_authority_over(url))
def test3_domains(self):
    """Check the host and its subdomains are covered, sibling hosts are not."""
    authority = UrlAuthority('http://a.example.org/sitemap.xml')
    expectations = (
        ('http://a.example.org/sitemap.xml', self.assertTrue),
        ('http://sub.a.example.org/sitemap.xml', self.assertTrue),
        ('http://b.example.org/sitemap.xml', self.assertFalse),
        ('http://sub.b.example.org/sitemap.xml', self.assertFalse),
    )
    for url, check in expectations:
        check(authority.has_authority_over(url))
def test2_no_authority(self):
    """Check URLs that a sitemap in /dir/ must not have authority over."""
    authority = UrlAuthority('http://example.org/dir/sitemap.xml')
    rejected = (
        'http://example.org/sitemap.xml',
        'http://sub.example.org/sitemap.xml',
        'https://example.org/dir/sitemap.xml',
        'unknown://example.org/dir/sitemap.xml',
    )
    for url in rejected:
        self.assertFalse(authority.has_authority_over(url))
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Baseline synchronization or audit.

    Both functions are implemented in this routine because an audit is a
    prerequisite for a baseline sync: the source resourcelist is read from
    self.sitemap, a destination resourcelist is built from disk, and the
    two are compared. With audit_only set the method returns after the
    status report; otherwise it checks URL authority, fetches the updated
    and created resources, hands each deleted resource to
    self.delete_resource() together with allow_deletion, and finally
    resets the stored incremental status for the site.

    Raises ClientFatalError when no mappings are configured, when the
    source resourcelist cannot be read or is empty, or when the sitemap
    lists a URI it has no authority over and self.noauth is not set.
    """
    action = 'audit' if audit_only else 'baseline sync'
    self.logger.debug("Starting "+action)

    ### 0. Sanity checks
    if len(self.mappings) == 0:
        raise ClientFatalError("No source to destination mapping specified")

    ### 1. Get inventories from both src and dst
    # 1.a source resourcelist
    dst_builder = ResourceListBuilder(mapper=self.mapper)
    try:
        self.logger.info("Reading sitemap %s" % self.sitemap)
        sitemap_reader = Sitemap(allow_multifile=self.allow_multifile,
                                 mapper=self.mapper)
        src_resourcelist = sitemap_reader.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError("Can't read source resourcelist from %s (%s)" % (self.sitemap, str(e)))
    num_listed = len(src_resourcelist)
    self.logger.info("Read source resourcelist, %d resources listed" % num_listed)
    if num_listed == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_resourcelist.has_md5():
        # No digests in the source, so computing them locally is pointless
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source resourcelist")
    # 1.b destination resourcelist mapped back to source URIs
    dst_builder.do_md5 = self.checksum
    dst_resourcelist = dst_builder.from_disk()

    ### 2. Compare these resourcelists respecting any comparison options
    same, updated, deleted, created = dst_resourcelist.compare(src_resourcelist)

    ### 3. Report status and planned actions
    out_of_sync = len(updated) > 0 or len(deleted) > 0 or len(created) > 0
    status = "NOT IN SYNC" if out_of_sync else " IN SYNC "
    self.logger.warning(
        "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)"
        % (status, len(same), len(updated), len(deleted), len(created)))
    if audit_only:
        self.logger.debug("Completed "+action)
        return

    ### 4. Check that sitemap has authority over URIs listed
    uauth = UrlAuthority(self.sitemap)
    for resource in src_resourcelist:
        # Violations are silently tolerated when self.noauth is set
        if uauth.has_authority_over(resource.uri) or self.noauth:
            continue
        raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap, resource.uri))

    ### 5. Grab files to do sync
    for resource in updated:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("updated: %s -> %s" % (src_uri, dst_file))
        self.update_resource(resource, dst_file, 'updated')
    for resource in created:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("created: %s -> %s" % (src_uri, dst_file))
        self.update_resource(resource, dst_file, 'created')
    for resource in deleted:
        dst_file = self.mapper.src_to_dst(resource.uri)
        self.delete_resource(resource, dst_file, allow_deletion)

    ### 6. For sync reset any incremental status for site
    if not audit_only:
        links = self.extract_links(src_resourcelist)
        if 'next' in links:
            next_uri = links['next']
            self.write_incremental_status(self.sitemap, next_uri)
            self.logger.info("Written config with next incremental at %s" % next_uri)
        else:
            self.write_incremental_status(self.sitemap)
    self.logger.debug("Completed "+action)
def incremental(self, allow_deletion=False, changelist_uri=None):
    """Incremental synchronization.

    The changelist location is taken, in order of preference, from the
    stored incremental status for self.sitemap, from changelist_uri
    (translated with self.sitemap_changelist_uri), or from the 'current'
    link extracted from the sitemap. Each change listed is then applied:
    'updated' and 'created' go to self.update_resource(), 'deleted' goes
    to self.delete_resource() together with allow_deletion. When at least
    one change was applied, the 'next' link of the changelist (if any) is
    stored for the following run.

    Raises ClientFatalError when no mappings are configured, when the
    sitemap or changelist cannot be read, when no changelist location can
    be extracted, or when the changelist lists a URI outside its authority
    and self.noauth is not set. Raises ClientError on an unknown change
    type.
    """
    self.logger.debug("Starting incremental sync")

    ### 0. Sanity checks
    if (len(self.mappings) < 1):
        raise ClientFatalError("No source to destination mapping specified")
    # Get current config
    inc_config_next = self.read_incremental_status(self.sitemap)

    ### 1. Get URI of changelist, from sitemap or explicit
    if (inc_config_next is not None):
        # We have config from last run for this site
        changelist = inc_config_next
        self.logger.info("ChangeList location from last incremental run %s" % (changelist))
    elif (changelist_uri):
        # Translate as necessary using maps
        changelist = self.sitemap_changelist_uri(changelist_uri)
    else:
        # Get sitemap
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resourcelist = src_sitemap.read(uri=self.sitemap, index_only=True)
            self.logger.debug("Finished reading sitemap/sitemapindex")
        except Exception as e:
            raise ClientFatalError("Can't read source sitemap from %s (%s)" % (self.sitemap, str(e)))
        # Extract changelist location
        # FIXME - need to completely rework the way we handle/store capabilities
        links = self.extract_links(src_resourcelist)
        if ('current' not in links):
            raise ClientFatalError("Failed to extract changelist location from sitemap %s" % (self.sitemap))
        changelist = links['current']

    ### 2. Read changelist from source
    # (the previously constructed but never used ResourceListBuilder is gone)
    try:
        self.logger.info("Reading changelist %s" % (changelist))
        src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
        src_changelist = src_sitemap.read(uri=changelist, changelist=True)
        self.logger.debug("Finished reading changelist")
    except Exception as e:
        raise ClientFatalError("Can't read source changelist from %s (%s)" % (changelist, str(e)))
    self.logger.info("Read source changelist, %d resources listed" % (len(src_changelist)))
    # An empty changelist is not treated as an error here
    if (self.checksum and not src_changelist.has_md5()):
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source resourcelist")

    ### 3. Check that sitemap has authority over URIs listed
    # FIXME - What does authority mean for changelist? Here use both the
    # changelist URI and, if we used it, the sitemap URI
    uauth_cs = UrlAuthority(changelist)
    if (not changelist_uri):
        uauth_sm = UrlAuthority(self.sitemap)
    for resource in src_changelist:
        # uauth_sm exists only when changelist_uri is falsy; the
        # short-circuit on changelist_uri keeps it from being touched otherwise
        if (not uauth_cs.has_authority_over(resource.uri) and
                (changelist_uri or not uauth_sm.has_authority_over(resource.uri))):
            if (not self.noauth):
                raise ClientFatalError("Aborting as changelist (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (changelist, resource.uri))

    ### 4. Apply changes
    num_updated = 0
    num_deleted = 0
    num_created = 0
    for resource in src_changelist:
        uri = resource.uri
        dst_file = self.mapper.src_to_dst(uri)
        if (resource.change == 'updated'):
            self.logger.info("updated: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, 'updated')
            num_updated += 1
        elif (resource.change == 'created'):
            self.logger.info("created: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, 'created')
            num_created += 1
        elif (resource.change == 'deleted'):
            self.delete_resource(resource, dst_file, allow_deletion)
            num_deleted += 1
        else:
            raise ClientError("Unknown change type %s" % (resource.change))

    ### 5. Report status and planned actions
    num_changes = num_updated + num_deleted + num_created
    status = "NO CHANGES"
    if (num_changes > 0):
        status = " CHANGES "
    self.logger.warning("Status: %s (updated=%d, deleted=%d, created=%d)" %
                        (status, num_updated, num_deleted, num_created))

    ### 6. Store next link if available
    if (num_changes > 0):
        links = self.extract_links(src_changelist)
        if ('next' in links):
            self.write_incremental_status(self.sitemap, links['next'])
            self.logger.info("Written config with next incremental at %s" % (links['next']))
        else:
            self.logger.warning("Failed to extract next changelist location from changelist %s" % (changelist))

    ### 7. Done
    self.logger.debug("Completed incremental sync")
def sync_or_audit(self, allow_deletion=False, audit_only=False):
    """Audit the destination against the source inventory, then optionally sync.

    The source inventory is read from self.sitemap, the destination
    inventory is built from disk, and the two are compared. With
    audit_only set the method returns after the status report. Otherwise
    URL authority is checked, updated and created resources are fetched
    with self.update_resource(), and deleted resources are unlinked only
    when allow_deletion is set and self.dryrun is not.

    Raises ClientFatalError when no mappings are configured, when the
    source inventory cannot be read or is empty, or when the sitemap lists
    a URI it has no authority over and self.noauth is not set.
    """
    action = 'audit' if audit_only else 'sync'
    self.logger.debug("Starting " + action)

    ### 0. Sanity checks
    if len(self.mappings) == 0:
        raise ClientFatalError("No source to destination mapping specified")

    ### 1. Get inventories from both src and dst
    # 1.a source inventory
    dst_builder = InventoryBuilder(mapper=self.mapper)
    try:
        self.logger.info("Reading sitemap %s" % self.sitemap)
        sitemap_reader = Sitemap(allow_multifile=self.allow_multifile,
                                 mapper=self.mapper)
        src_inventory = sitemap_reader.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError("Can't read source inventory from %s (%s)" % (self.sitemap, str(e)))
    num_listed = len(src_inventory)
    self.logger.info("Read source inventory, %d resources listed" % num_listed)
    if num_listed == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_inventory.has_md5():
        # No digests in the source, so computing them locally is pointless
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source inventory")
    # 1.b destination inventory mapped back to source URIs
    dst_builder.do_md5 = self.checksum
    dst_inventory = dst_builder.from_disk()

    ### 2. Compare these inventories respecting any comparison options
    same, updated, deleted, created = dst_inventory.compare(src_inventory)

    ### 3. Report status and planned actions
    out_of_sync = len(updated) > 0 or len(deleted) > 0 or len(created) > 0
    status = "NOT IN SYNC" if out_of_sync else " IN SYNC "
    self.logger.warning(
        "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)"
        % (status, len(same), len(updated), len(deleted), len(created)))
    if audit_only:
        self.logger.debug("Completed " + action)
        return

    ### 4. Check that sitemap has authority over URIs listed
    uauth = UrlAuthority(self.sitemap)
    for resource in src_inventory:
        if uauth.has_authority_over(resource.uri):
            continue
        if not self.noauth:
            raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap, resource.uri))
        # Authority check overridden: report and carry on
        self.logger.info("Sitemap (%s) mentions resource at a location it does not have authority over (%s)" % (self.sitemap, resource.uri))

    ### 5. Grab files to do sync
    for resource in updated:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("updated: %s -> %s" % (src_uri, dst_file))
        self.update_resource(resource, dst_file, 'UPDATED')
    for resource in created:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("created: %s -> %s" % (src_uri, dst_file))
        self.update_resource(resource, dst_file, 'CREATED')
    for resource in deleted:
        src_uri = resource.uri
        if not allow_deletion:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % src_uri)
            continue
        dst_file = self.mapper.src_to_dst(src_uri)
        if self.dryrun:
            self.logger.info("dryrun: would delete %s -> %s" % (src_uri, dst_file))
        else:
            os.unlink(dst_file)
            self.logger.info("deleted: %s -> %s" % (src_uri, dst_file))
            self.log_event(ResourceChange(resource=resource, changetype="DELETED"))
    self.logger.debug("Completed " + action)
def incremental(self, allow_deletion=False, changeset_uri=None):
    """Incremental synchronization driven by a changeset.

    The changeset location is changeset_uri (translated with
    self.sitemap_changeset_uri) when given, otherwise the 'current' link
    extracted from the capabilities of the sitemap at self.sitemap. Each
    listed change is applied: 'UPDATED' and 'CREATED' go to
    self.update_resource(); 'DELETED' unlinks the destination file only
    when allow_deletion is set and self.dryrun is not.

    Raises ClientFatalError when no mappings are configured, when the
    sitemap or changeset cannot be read, when no changeset location can be
    extracted, when the changeset is empty, or when it lists a URI outside
    its authority and self.noauth is not set. Raises ClientError on an
    unknown change type.
    """
    self.logger.debug("Starting incremental sync")

    ### 0. Sanity checks
    if (len(self.mappings) < 1):
        raise ClientFatalError("No source to destination mapping specified")

    ### 1. Get URI of changeset, from sitemap or explicit
    if (changeset_uri):
        # Translate as necessary using maps
        changeset = self.sitemap_changeset_uri(changeset_uri)
    else:
        # Get sitemap
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_inventory = src_sitemap.read(uri=self.sitemap, index_only=True)
            self.logger.debug("Finished reading sitemap/sitemapindex")
        except Exception as e:
            raise ClientFatalError("Can't read source sitemap from %s (%s)" % (self.sitemap, str(e)))
        # Extract changeset location
        # FIXME - need to completely rework the way we handle/store capabilities
        links = self.extract_links(src_inventory.capabilities)
        if ('current' not in links):
            raise ClientFatalError("Failed to extract changeset location from sitemap %s" % (self.sitemap))
        changeset = links['current']

    ### 2. Read changeset from source
    # (the previously constructed but never used InventoryBuilder is gone)
    try:
        self.logger.info("Reading changeset %s" % (changeset))
        src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                              mapper=self.mapper)
        src_changeset = src_sitemap.read(uri=changeset, changeset=True)
        self.logger.debug("Finished reading changeset")
    except Exception as e:
        raise ClientFatalError("Can't read source changeset from %s (%s)" % (changeset, str(e)))
    self.logger.info("Read source changeset, %d resources listed" % (len(src_changeset)))
    if (len(src_changeset) == 0):
        raise ClientFatalError("Aborting as there are no resources to sync")
    if (self.checksum and not src_changeset.has_md5()):
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source inventory")

    ### 3. Check that sitemap has authority over URIs listed
    # FIXME - What does authority mean for changeset? Here use both the
    # changeset URI and, if we used it, the sitemap URI
    uauth_cs = UrlAuthority(changeset)
    if (not changeset_uri):
        uauth_sm = UrlAuthority(self.sitemap)
    for resource in src_changeset:
        # uauth_sm exists only when changeset_uri is falsy; the
        # short-circuit on changeset_uri keeps it from being touched otherwise
        if (not uauth_cs.has_authority_over(resource.uri) and
                (changeset_uri or not uauth_sm.has_authority_over(resource.uri))):
            if (self.noauth):
                self.logger.warning("Changeset (%s) mentions resource at a location it does not have authority over (%s)" % (changeset, resource.uri))
            else:
                raise ClientFatalError("Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (changeset, resource.uri))

    ### 4. Apply changes
    for resource in src_changeset:
        uri = resource.uri
        # Destination path is computed once and reused by every branch
        dst_file = self.mapper.src_to_dst(uri)
        if (resource.changetype == 'UPDATED'):
            self.logger.info("updated: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, 'UPDATED')
        elif (resource.changetype == 'CREATED'):
            self.logger.info("created: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, 'CREATED')
        elif (resource.changetype == 'DELETED'):
            if (allow_deletion):
                if (self.dryrun):
                    self.logger.info("dryrun: would delete %s -> %s" % (uri, dst_file))
                else:
                    os.unlink(dst_file)
                    self.logger.info("deleted: %s -> %s" % (uri, dst_file))
                    self.log_event(ResourceChange(resource=resource, changetype="DELETED"))
            else:
                self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        else:
            raise ClientError("Unknown change type %s" % (resource.changetype))
    self.logger.debug("Completed incremental stuff")
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Baseline synchronization or audit.

    Both functions are implemented in this routine because an audit is a
    prerequisite for a baseline sync. In the case of a baseline sync the
    last timestamp seen is recorded as client state.

    With audit_only set, or when the comparison finds nothing created,
    updated or deleted, the method returns after the status report.
    allow_deletion is handed to self.delete_resource() for each deleted
    resource.

    Raises ClientFatalError when the mapper is empty, when a sync is
    requested with unsafe mappings, when the source resource list cannot
    be read or is empty, or when the sitemap lists a URI it has no
    authority over and self.noauth is not set.
    """
    action = 'audit' if audit_only else 'baseline sync'
    self.logger.debug("Starting " + action)

    ### 0. Sanity checks
    if len(self.mapper) == 0:
        raise ClientFatalError("No source to destination mapping specified")
    if not audit_only and self.mapper.unsafe():
        raise ClientFatalError("Source to destination mappings unsafe: %s" % str(self.mapper))

    ### 1. Get inventories from both src and dst
    # 1.a source resource list
    try:
        self.logger.info("Reading sitemap %s" % self.sitemap)
        src_resource_list = ResourceList(allow_multifile=self.allow_multifile,
                                         mapper=self.mapper)
        src_resource_list.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError("Can't read source resource list from %s (%s)" % (self.sitemap, str(e)))
    num_listed = len(src_resource_list)
    self.logger.info("Read source resource list, %d resources listed" % num_listed)
    if num_listed == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_resource_list.has_md5():
        # No digests in the source, so computing them locally is pointless
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source resource list")
    # 1.b destination resource list mapped back to source URIs
    dst_builder = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
    dst_resource_list = dst_builder.from_disk()

    ### 2. Compare these resource lists respecting any comparison options
    same, updated, deleted, created = dst_resource_list.compare(src_resource_list)

    ### 3. Report status and planned actions
    num_pending = len(updated) + len(deleted) + len(created)
    self.log_status(in_sync=(num_pending == 0), audit=True,
                    same=len(same), created=len(created),
                    updated=len(updated), deleted=len(deleted))
    if audit_only or num_pending == 0:
        self.logger.debug("Completed " + action)
        return

    ### 4. Check that sitemap has authority over URIs listed
    if not self.noauth:
        uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
        for resource in src_resource_list:
            if uauth.has_authority_over(resource.uri):
                continue
            raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap, resource.uri))

    ### 5. Grab files to do sync
    if allow_deletion:
        delete_msg = ", and delete %d resources" % len(deleted)
    else:
        delete_msg = ''
    self.logger.warning("Will GET %d resources%s" % (len(created) + len(updated), delete_msg))
    self.last_timestamp = 0
    num_created = 0
    num_updated = 0
    num_deleted = 0
    for resource in created:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("created: %s -> %s" % (src_uri, dst_file))
        num_created += self.update_resource(resource, dst_file, 'created')
    for resource in updated:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("updated: %s -> %s" % (src_uri, dst_file))
        num_updated += self.update_resource(resource, dst_file, 'updated')
    for resource in deleted:
        dst_file = self.mapper.src_to_dst(resource.uri)
        num_deleted += self.delete_resource(resource, dst_file, allow_deletion)

    ### 6. Store last timestamp to allow incremental sync
    if not audit_only and self.last_timestamp > 0:
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" % datetime_to_str(self.last_timestamp))

    ### 7. Done
    self.log_status(in_sync=(len(updated) + len(deleted) + len(created) == 0),
                    same=len(same), created=num_created,
                    updated=num_updated, deleted=num_deleted,
                    to_delete=len(deleted))
    self.logger.debug("Completed %s" % action)
def incremental(self, allow_deletion=False, changeset_uri=None):
    """Incremental synchronization driven by a changeset.

    The changeset location is changeset_uri (translated with
    self.sitemap_changeset_uri) when given, otherwise the "current" link
    extracted from the capabilities of the sitemap at self.sitemap. Each
    listed change is applied: "UPDATED" and "CREATED" go to
    self.update_resource(); "DELETED" unlinks the destination file only
    when allow_deletion is set and self.dryrun is not.

    Raises ClientFatalError when no mappings are configured, when the
    sitemap or changeset cannot be read, when no changeset location can be
    extracted, when the changeset is empty, or when it lists a URI outside
    its authority and self.noauth is not set. Raises ClientError on an
    unknown change type.
    """
    self.logger.debug("Starting incremental sync")

    ### 0. Sanity checks
    if len(self.mappings) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    ### 1. Get URI of changeset, from sitemap or explicit
    if changeset_uri:
        # Translate as necessary using maps
        changeset = self.sitemap_changeset_uri(changeset_uri)
    else:
        # Get sitemap
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_inventory = src_sitemap.read(uri=self.sitemap, index_only=True)
            self.logger.debug("Finished reading sitemap/sitemapindex")
        except Exception as e:
            raise ClientFatalError("Can't read source sitemap from %s (%s)" % (self.sitemap, str(e)))
        # Extract changeset location
        # FIXME - need to completely rework the way we handle/store capabilities
        links = self.extract_links(src_inventory.capabilities)
        if "current" not in links:
            raise ClientFatalError("Failed to extract changeset location from sitemap %s" % (self.sitemap))
        changeset = links["current"]

    ### 2. Read changeset from source
    # (the previously constructed but never used InventoryBuilder is gone)
    try:
        self.logger.info("Reading changeset %s" % (changeset))
        src_sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
        src_changeset = src_sitemap.read(uri=changeset, changeset=True)
        self.logger.debug("Finished reading changeset")
    except Exception as e:
        raise ClientFatalError("Can't read source changeset from %s (%s)" % (changeset, str(e)))
    self.logger.info("Read source changeset, %d resources listed" % (len(src_changeset)))
    if len(src_changeset) == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_changeset.has_md5():
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source inventory")

    ### 3. Check that sitemap has authority over URIs listed
    # FIXME - What does authority mean for changeset? Here use both the
    # changeset URI and, if we used it, the sitemap URI
    uauth_cs = UrlAuthority(changeset)
    if not changeset_uri:
        uauth_sm = UrlAuthority(self.sitemap)
    for resource in src_changeset:
        # uauth_sm exists only when changeset_uri is falsy; the
        # short-circuit on changeset_uri keeps it from being touched otherwise
        if not uauth_cs.has_authority_over(resource.uri) and (
            changeset_uri or not uauth_sm.has_authority_over(resource.uri)
        ):
            if self.noauth:
                self.logger.warning(
                    "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                    % (changeset, resource.uri)
                )
            else:
                raise ClientFatalError(
                    "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                    % (changeset, resource.uri)
                )

    ### 4. Apply changes
    for resource in src_changeset:
        uri = resource.uri
        # Destination path is computed once and reused by every branch
        dst_file = self.mapper.src_to_dst(uri)
        if resource.changetype == "UPDATED":
            self.logger.info("updated: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, "UPDATED")
        elif resource.changetype == "CREATED":
            self.logger.info("created: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, "CREATED")
        elif resource.changetype == "DELETED":
            if allow_deletion:
                if self.dryrun:
                    self.logger.info("dryrun: would delete %s -> %s" % (uri, dst_file))
                else:
                    os.unlink(dst_file)
                    self.logger.info("deleted: %s -> %s" % (uri, dst_file))
                    self.log_event(ResourceChange(resource=resource, changetype="DELETED"))
            else:
                self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        else:
            raise ClientError("Unknown change type %s" % (resource.changetype))
    self.logger.debug("Completed incremental stuff")
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Baseline synchronization or audit.

    Both functions are implemented in this routine because an audit is a
    prerequisite for a baseline sync. In the case of a baseline sync the
    last timestamp seen is recorded as client state.

    With audit_only set, or when the comparison finds nothing created,
    updated or deleted, the method returns after the status report.
    allow_deletion is handed to self.delete_resource() for each deleted
    resource.

    Raises ClientFatalError when no mappings are configured, when the
    source resource_list cannot be read or is empty, or when the sitemap
    lists a URI it has no authority over and self.noauth is not set.
    """
    action = 'audit' if audit_only else 'baseline sync'
    self.logger.debug("Starting "+action)

    ### 0. Sanity checks
    if len(self.mappings) == 0:
        raise ClientFatalError("No source to destination mapping specified")

    ### 1. Get inventories from both src and dst
    # 1.a source resource_list
    try:
        self.logger.info("Reading sitemap %s" % self.sitemap)
        src_resource_list = ResourceList(allow_multifile=self.allow_multifile,
                                         mapper=self.mapper)
        src_resource_list.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError("Can't read source resource_list from %s (%s)" % (self.sitemap, str(e)))
    num_listed = len(src_resource_list)
    self.logger.info("Read source resource_list, %d resources listed" % num_listed)
    if num_listed == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_resource_list.has_md5():
        # No digests in the source, so computing them locally is pointless
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source resource_list")
    # 1.b destination resource_list mapped back to source URIs
    dst_builder = ResourceListBuilder(mapper=self.mapper)
    dst_builder.do_md5 = self.checksum
    dst_resource_list = dst_builder.from_disk()

    ### 2. Compare these resource_lists respecting any comparison options
    same, updated, deleted, created = dst_resource_list.compare(src_resource_list)

    ### 3. Report status and planned actions
    num_pending = len(updated) + len(deleted) + len(created)
    self.log_status(in_sync=(num_pending == 0), audit=True,
                    same=len(same), created=len(created),
                    updated=len(updated), deleted=len(deleted))
    if audit_only or num_pending == 0:
        self.logger.debug("Completed "+action)
        return

    ### 4. Check that sitemap has authority over URIs listed
    uauth = UrlAuthority(self.sitemap)
    for resource in src_resource_list:
        # Violations are silently tolerated when self.noauth is set
        if uauth.has_authority_over(resource.uri) or self.noauth:
            continue
        raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap, resource.uri))

    ### 5. Grab files to do sync
    if allow_deletion:
        delete_msg = ", and delete %d resources" % len(deleted)
    else:
        delete_msg = ''
    self.logger.warning("Will GET %d resources%s" % (len(created) + len(updated), delete_msg))
    self.last_timestamp = 0
    num_created = 0
    num_updated = 0
    num_deleted = 0
    for resource in created:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("created: %s -> %s" % (src_uri, dst_file))
        num_created += self.update_resource(resource, dst_file, 'created')
    for resource in updated:
        src_uri = resource.uri
        dst_file = self.mapper.src_to_dst(src_uri)
        self.logger.info("updated: %s -> %s" % (src_uri, dst_file))
        num_updated += self.update_resource(resource, dst_file, 'updated')
    for resource in deleted:
        dst_file = self.mapper.src_to_dst(resource.uri)
        num_deleted += self.delete_resource(resource, dst_file, allow_deletion)

    ### 6. Store last timestamp to allow incremental sync
    if not audit_only and self.last_timestamp > 0:
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" % datetime_to_str(self.last_timestamp))

    ### 7. Done
    self.log_status(in_sync=(len(updated) + len(deleted) + len(created) == 0),
                    same=len(same), created=num_created,
                    updated=num_updated, deleted=num_deleted)
    self.logger.debug("Completed %s" % action)
def test1(self):
    """Check URLs covered by a sitemap located at the server root."""
    authority = UrlAuthority('http://example.org/sitemap.xml')
    accepted = (
        'http://example.org/sitemap.xml',
        'http://example.org/sitemap.xml?anything',
        'http://example.org/sitemap.xml#frag',
        'http://example.org/same_level',
        'http://example.org/one/deeper',
        'http://example.org/one/two/deeper',
        'http://example.org/',
        'http://sub.example.org/subdomain',
        'http://sub.sub.example.org/subsubdomain',
    )
    for url in accepted:
        self.assertTrue(authority.has_authority_over(url))
def test01_strict_authority(self):
    """Check URLs covered by a root-level sitemap when the second argument is True.

    The True argument is presumably the strict flag (going by the test
    name); the URLs below are all expected to be covered.
    """
    authority = UrlAuthority('http://example.org/sitemap.xml', True)
    accepted = (
        'http://example.org/sitemap.xml',
        'http://example.org/sitemap.xml?anything',
        'http://example.org/sitemap.xml#frag',
        'http://example.org/same_level',
        'http://example.org/one/deeper',
        'http://example.org/one/two/deeper',
        'http://example.org/',
        'http://sub.example.org/subdomain',
        'http://sub.sub.example.org/subsubdomain',
    )
    for url in accepted:
        self.assertTrue(authority.has_authority_over(url))
def test05_lax_no_authority(self):
    """Check that another domain and an unknown scheme are rejected by default."""
    authority = UrlAuthority('http://example.org/dir/sitemap.xml')
    for url in ('http://other.org/sitemap.xml',
                'unknown://example.org/dir/sitemap.xml'):
        self.assertFalse(authority.has_authority_over(url))
def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
    """Incremental synchronization driven by a change list.

    Reads a change list from the source, checks it, prunes entries that
    precede the starting timestamp and then applies the remaining
    created/updated/deleted changes to the destination.

    Parameters:
        allow_deletion: passed through to delete_resource() for each
            'deleted' change.
        change_list_uri: explicit change list URI; when not given, the
            default self.change_list_name is used. Either value is
            translated with self.sitemap_uri().
        from_datetime: datetime string to start from; when None the
            timestamp stored by ClientState for self.sitemap is used.

    Raises:
        ClientFatalError: no mappings, bad --from value, no starting
            timestamp available, unreadable change list, a change
            without a timestamp, or (unless self.noauth) a resource
            outside the authority of the change list / sitemap.
        ClientError: a change has an unknown change type.
    """
    self.logger.debug("Starting incremental sync")
    ### 0. Sanity checks
    if (len(self.mappings) < 1):
        raise ClientFatalError("No source to destination mapping specified")
    from_timestamp = None
    if (from_datetime is not None):
        try:
            from_timestamp = str_to_datetime(from_datetime)
        except ValueError:
            raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
    ### 1. Work out where to start from
    if (from_timestamp is None):
        from_timestamp = ClientState().get_state(self.sitemap)
        if (from_timestamp is None):
            raise ClientFatalError("No stored timestamp for this site, and no explicit --from")
    ### 2. Get URI of change list, from sitemap or explicit
    if (change_list_uri):
        # Translate as necessary using maps
        change_list = self.sitemap_uri(change_list_uri)
    else:
        # Try default name
        change_list = self.sitemap_uri(self.change_list_name)
    ### 3. Read change list from source
    try:
        self.logger.info("Reading change list %s" % (change_list))
        src_change_list = ChangeList()
        src_change_list.read(uri=change_list)
        self.logger.debug("Finished reading change list")
    except Exception as e:
        raise ClientFatalError("Can't read source change list from %s (%s)" % (change_list, str(e)))
    self.logger.info("Read source change list, %d changes listed" % (len(src_change_list)))
    # An empty change list is not an error: there is simply nothing to apply
    if (self.checksum and not src_change_list.has_md5()):
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source change list")
    # Check all changes have timestamp and record last
    self.last_timestamp = 0
    for resource in src_change_list:
        if (resource.timestamp is None):
            # Report the URI of the offending entry. The previous code
            # formatted the name `uri`, which is not bound at this point.
            raise ClientFatalError("Aborting - missing timestamp for change in %s" % (resource.uri))
        if (resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
    ### 4. Check that the change list has authority over URIs listed
    # FIXME - What does authority mean for change list? Here use both the
    # change list URI and, if we used it, the sitemap URI
    uauth_cs = UrlAuthority(change_list)
    if (not change_list_uri):
        uauth_sm = UrlAuthority(self.sitemap)
    for resource in src_change_list:
        # uauth_sm only exists when no explicit change list URI was given;
        # the `or` short-circuits so it is never touched otherwise.
        if (not uauth_cs.has_authority_over(resource.uri) and
                (change_list_uri or not uauth_sm.has_authority_over(resource.uri))):
            if (self.noauth):
                # --noauth: tolerate resources outside our authority silently
                pass
            else:
                raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list, resource.uri))
    ### 5. Prune entries before starting timestamp and dupe changes for a resource
    num_skipped = src_change_list.prune_before(from_timestamp)
    if (num_skipped > 0):
        self.logger.info("Skipped %d changes before %s" % (num_skipped, datetime_to_str(from_timestamp)))
    num_dupes = src_change_list.prune_dupes()
    if (num_dupes > 0):
        self.logger.info("Removed %d prior changes" % (num_dupes))
    ### 6. Apply changes at same time or after from_timestamp
    self.logger.info("Applying %d changes" % (len(src_change_list)))
    num_updated = 0
    num_deleted = 0
    num_created = 0
    for resource in src_change_list:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        if (resource.change == 'updated'):
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'updated')
            num_updated += 1
        elif (resource.change == 'created'):
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'created')
            num_created += 1
        elif (resource.change == 'deleted'):
            self.delete_resource(resource, file, allow_deletion)
            num_deleted += 1
        else:
            raise ClientError("Unknown change type %s" % (resource.change))
    ### 7. Report status and planned actions
    self.log_status(in_sync=((num_updated + num_deleted + num_created) == 0),
                    incremental=True, created=num_created,
                    updated=num_updated, deleted=num_deleted)
    ### 8. Record last timestamp we have seen
    if (self.last_timestamp > 0):
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
    ### 9. Done
    self.logger.debug("Completed incremental sync")
def sync_or_audit(self, allow_deletion=False, audit_only=False):
    """Baseline sync of the destination against the source sitemap, or audit.

    Reads the source sitemap, builds the destination inventory from
    disk, compares the two and logs the status. With audit_only the
    method stops after reporting; otherwise it checks authority over
    the listed URIs and then updates, creates and (when allow_deletion
    is set) deletes destination files.

    Raises ClientFatalError when there are no mappings, the sitemap
    cannot be read or is empty, or (unless self.noauth) a resource lies
    outside the sitemap's authority.
    """
    action = "audit" if (audit_only) else "sync"
    self.logger.debug("Starting " + action)

    ### 0. Sanity checks
    if len(self.mappings) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    ### 1. Get inventories from both src and dst
    # 1.a source inventory
    builder = InventoryBuilder(mapper=self.mapper)
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        reader = Sitemap(allow_multifile=self.allow_multifile,
                         mapper=self.mapper)
        src_inventory = reader.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as err:
        raise ClientFatalError(
            "Can't read source inventory from %s (%s)"
            % (self.sitemap, str(err)))
    self.logger.info(
        "Read source inventory, %d resources listed" % (len(src_inventory)))
    if len(src_inventory) == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_inventory.has_md5():
        # No point hashing local files if the source gives nothing to compare
        self.checksum = False
        self.logger.info("Not calculating checksums on destination as not present in source inventory")
    # 1.b destination inventory mapped back to source URIs
    builder.do_md5 = self.checksum
    dst_inventory = builder.from_disk()

    ### 2. Compare these inventorys respecting any comparison options
    same, updated, deleted, created = dst_inventory.compare(src_inventory)

    ### 3. Report status and planned actions
    out_of_sync = (len(updated) > 0 or len(deleted) > 0
                   or len(created) > 0)
    status = "NOT IN SYNC" if out_of_sync else " IN SYNC "
    self.logger.warning(
        "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)"
        % (status, len(same), len(updated), len(deleted), len(created)))
    if audit_only:
        self.logger.debug("Completed " + action)
        return

    ### 4. Check that sitemap has authority over URIs listed
    uauth = UrlAuthority(self.sitemap)
    for resource in src_inventory:
        if uauth.has_authority_over(resource.uri):
            continue
        if not self.noauth:
            raise ClientFatalError(
                "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                % (self.sitemap, resource.uri))
        self.logger.info(
            "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
            % (self.sitemap, resource.uri))

    ### 5. Grab files to do sync
    # Updates are fetched before creations, as a (resources, log format,
    # change type) table so both share one loop.
    fetch_plan = (
        (updated, "updated: %s -> %s", "UPDATED"),
        (created, "created: %s -> %s", "CREATED"),
    )
    for batch, log_format, change_type in fetch_plan:
        for resource in batch:
            src_uri = resource.uri
            dst_path = self.mapper.src_to_dst(src_uri)
            self.logger.info(log_format % (src_uri, dst_path))
            self.update_resource(resource, dst_path, change_type)

    for resource in deleted:
        src_uri = resource.uri
        if not allow_deletion:
            self.logger.info(
                "nodelete: would delete %s (--delete to enable)" % src_uri)
            continue
        dst_path = self.mapper.src_to_dst(src_uri)
        if self.dryrun:
            self.logger.info(
                "dryrun: would delete %s -> %s" % (src_uri, dst_path))
            continue
        os.unlink(dst_path)
        self.logger.info("deleted: %s -> %s" % (src_uri, dst_path))
        self.log_event(ResourceChange(resource=resource,
                                      changetype="DELETED"))

    self.logger.debug("Completed " + action)
def test2_no_authority(self):
    """A sitemap in /dir/ has no authority over any of the listed URIs.

    Checked cases: a shallower path, a sub-domain, a different scheme
    (https) and an unknown scheme.
    """
    uauth = UrlAuthority("http://example.org/dir/sitemap.xml")
    rejected_uris = (
        "http://example.org/sitemap.xml",
        "http://sub.example.org/sitemap.xml",
        "https://example.org/dir/sitemap.xml",
        "unknown://example.org/dir/sitemap.xml",
    )
    for candidate in rejected_uris:
        self.assertFalse(uauth.has_authority_over(candidate))
def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
    """Incremental synchronization.

    Use Change List to do incremental sync: read the change list,
    check it, prune entries before the starting timestamp, log what
    would be done and then apply the created/updated/deleted changes.

    Parameters:
        allow_deletion: passed through to delete_resource(); when False
            and the change list contains only deletions, the method
            returns after logging status without applying anything.
        change_list_uri: explicit change list URI; when not given, the
            default self.change_list_name is used. Either value is
            translated with self.sitemap_uri().
        from_datetime: datetime string to start from; when None the
            timestamp stored by ClientState for self.sitemap is used.

    Raises:
        ClientFatalError: empty or unsafe mapper, bad --from value, no
            starting timestamp available, unreadable change list, a
            change without a timestamp, or (unless self.noauth) a
            resource outside the authority of the change list / sitemap.
        ClientError: a change has an unknown change type.
    """
    self.logger.debug("Starting incremental sync")
    ### 0. Sanity checks
    if (len(self.mapper) < 1):
        raise ClientFatalError(
            "No source to destination mapping specified")
    if (self.mapper.unsafe()):
        raise ClientFatalError(
            "Source to destination mappings unsafe: %s" % str(self.mapper))
    from_timestamp = None
    if (from_datetime is not None):
        try:
            from_timestamp = str_to_datetime(from_datetime)
        except ValueError:
            raise ClientFatalError("Bad datetime in --from (%s)" %
                                   from_datetime)
    ### 1. Work out where to start from
    if (from_timestamp is None):
        from_timestamp = ClientState().get_state(self.sitemap)
        if (from_timestamp is None):
            raise ClientFatalError(
                "Cannot do incremental sync. No stored timestamp for this site, and no explicit --from."
            )
    ### 2. Get URI of change list, from sitemap or explicit
    if (change_list_uri):
        # Translate as necessary using maps
        change_list = self.sitemap_uri(change_list_uri)
    else:
        # Try default name
        change_list = self.sitemap_uri(self.change_list_name)
    ### 3. Read change list from source
    try:
        self.logger.info("Reading change list %s" % (change_list))
        src_change_list = ChangeList()
        src_change_list.read(uri=change_list)
        self.logger.debug("Finished reading change list")
    except Exception as e:
        raise ClientFatalError(
            "Can't read source change list from %s (%s)" %
            (change_list, str(e)))
    self.logger.info("Read source change list, %d changes listed" %
                     (len(src_change_list)))
    # An empty change list is not an error: step 5 reports "in sync"
    if (self.checksum and not src_change_list.has_md5()):
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source change list"
        )
    # Check all changes have timestamp and record last
    self.last_timestamp = 0
    for resource in src_change_list:
        if (resource.timestamp is None):
            # Report the URI of the offending entry. The previous code
            # formatted the name `uri`, which is not bound at this point.
            raise ClientFatalError(
                "Aborting - missing timestamp for change in %s" %
                (resource.uri))
        if (resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
    ### 4. Check that the change list has authority over URIs listed
    # FIXME - What does authority mean for change list? Here use both the
    # change list URI and, if we used it, the sitemap URI
    if (not self.noauth):
        uauth_cs = UrlAuthority(change_list, self.strictauth)
        if (not change_list_uri):
            # NOTE(review): unlike uauth_cs this is built without
            # self.strictauth - confirm whether that is intentional
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_change_list:
            # uauth_sm only exists when no explicit change list URI was
            # given; the `or` short-circuits so it is never touched otherwise
            if (not uauth_cs.has_authority_over(resource.uri) and
                    (change_list_uri or
                     not uauth_sm.has_authority_over(resource.uri))):
                raise ClientFatalError(
                    "Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                    % (change_list, resource.uri))
    ### 5. Prune entries before starting timestamp and dupe changes for a resource
    num_skipped = src_change_list.prune_before(from_timestamp)
    if (num_skipped > 0):
        self.logger.info("Skipped %d changes before %s" %
                         (num_skipped, datetime_to_str(from_timestamp)))
    num_dupes = src_change_list.prune_dupes()
    if (num_dupes > 0):
        self.logger.info("Removed %d prior changes" % (num_dupes))
    # Review and log status before
    # FIXME - should at this stage prune the change list to pick out
    # only the last change for each resource
    to_update = 0
    to_create = 0
    to_delete = 0
    for resource in src_change_list:
        if (resource.change == 'updated'):
            to_update += 1
        elif (resource.change == 'created'):
            to_create += 1
        elif (resource.change == 'deleted'):
            to_delete += 1
        else:
            raise ClientError("Unknown change type %s" % (resource.change))
    # Log status based on what we know from the Change List. Exit if
    # either there are no changes or if there are only deletions and
    # we don't allow deletion
    in_sync = ((to_update + to_delete + to_create) == 0)
    self.log_status(in_sync=in_sync, incremental=True, created=to_create,
                    updated=to_update, deleted=to_delete)
    if (in_sync or ((to_update + to_create) == 0 and not allow_deletion)):
        self.logger.debug("Completed incremental")
        return
    ### 6. Apply changes at same time or after from_timestamp
    delete_msg = (", and delete %d resources" %
                  to_delete) if (allow_deletion) else ''
    self.logger.warning("Will apply %d changes%s" %
                        (len(src_change_list), delete_msg))
    num_updated = 0
    num_deleted = 0
    num_created = 0
    for resource in src_change_list:
        uri = resource.uri
        file = self.mapper.src_to_dst(uri)
        if (resource.change == 'updated'):
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'updated')
            num_updated += 1
        elif (resource.change == 'created'):
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'created')
            num_created += 1
        elif (resource.change == 'deleted'):
            # The return value of delete_resource() is added to the count
            num_deleted += self.delete_resource(resource, file,
                                                allow_deletion)
        else:
            raise ClientError("Unknown change type %s" % (resource.change))
    ### 7. Report status and planned actions
    self.log_status(incremental=True, created=num_created,
                    updated=num_updated, deleted=num_deleted,
                    to_delete=to_delete)
    ### 8. Record last timestamp we have seen
    if (self.last_timestamp > 0):
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" %
                         (datetime_to_str(self.last_timestamp)))
    ### 9. Done
    self.logger.debug("Completed incremental sync")
def test04_lax_authority(self):
    """Default (lax) mode: a sitemap in /dir/ covers the whole server.

    No strict flag is passed, so paths at the same level, deeper and
    shallower than the sitemap, plus sub-domains, are all expected to
    be within its authority.
    """
    uauth = UrlAuthority('http://example.org/dir/sitemap.xml')
    covered_uris = (
        'http://example.org/sitemap.xml',
        'http://example.org/sitemap.xml?anything',
        'http://example.org/sitemap.xml#frag',
        'http://example.org/dir/same_level',
        'http://example.org/dir/one/deeper',
        'http://example.org/dir/one/two/deeper',
        'http://example.org/shallower',
        'http://example.org/',
        'http://sub.example.org/subdomain',
        'http://sub.sub.example.org/subsubdomain',
    )
    for candidate in covered_uris:
        self.assertTrue(uauth.has_authority_over(candidate))