def test1_simple_output(self):
    """Check the exact sitemap XML produced for the dir1 test data.

    Builds an inventory from resync/test/testdata/dir1 mapped to
    http://example.org/t and compares its XML serialization with the
    expected document listing file_a and file_b.
    """
    builder = InventoryBuilder()
    inventory = builder.from_disk('resync/test/testdata/dir1',
                                  'http://example.org/t')
    actual_xml = Sitemap().inventory_as_xml(inventory)
    expected_xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1"><url><loc>http://example.org/t/file_a</loc><lastmod>2012-03-14T17:46:04</lastmod><rs:size>20</rs:size></url><url><loc>http://example.org/t/file_b</loc><lastmod>2012-03-14T17:46:25</lastmod><rs:size>45</rs:size></url></urlset>'
    self.assertEqual(actual_xml, expected_xml)
def test1_simple_output(self):
    """Check the exact sitemap XML produced for the dir1 test data.

    The builder is given a Mapper from http://example.org/t to
    resync/test/testdata/dir1, and the XML serialization of the
    resulting inventory is compared with the expected document
    listing file_a and file_b.
    """
    builder = InventoryBuilder(verbose=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    actual_xml = Sitemap().resources_as_xml(inventory)
    expected_xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/"><url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url><url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url></urlset>'
    self.assertEqual(actual_xml, expected_xml)
def inventory(self):
    """Return inventory on disk based on current mappings

    Visits the base paths in self.mappings in sorted order and passes
    the inventory accumulated so far to each from_disk() call, along
    with the base path and its mapped base URI. The result of the
    last call is returned (an empty Inventory if there are no
    mappings).
    """
    builder = InventoryBuilder(do_md5=self.checksum)
    combined = Inventory()
    for base_path in sorted(self.mappings):
        combined = builder.from_disk(base_path,
                                     self.mappings[base_path],
                                     inventory=combined)
    return combined
def test4_data(self):
    """Check the attributes recorded for file_a when md5 is enabled.

    Builds the dir1 inventory through a Mapper and verifies the
    resource count plus the uri, lastmod, md5 and file values of the
    file_a entry.
    """
    file_a_uri = 'http://example.org/t/file_a'
    builder = InventoryBuilder(do_md5=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    self.assertEqual(len(inventory), 2)
    resource = inventory.resources.get(file_a_uri)
    self.assertTrue(resource is not None)
    self.assertEqual(resource.uri, file_a_uri)
    self.assertEqual(resource.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(resource.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
    self.assertEqual(resource.file, 'resync/test/testdata/dir1/file_a')
def test4_data(self):
    """Check the attributes recorded for file_a when md5 is enabled.

    Builds the dir1 inventory through a Mapper and verifies the
    resource count plus the uri, lastmod, md5 and file values of the
    file_a entry.
    """
    file_a_uri = 'http://example.org/t/file_a'
    builder = InventoryBuilder(do_md5=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    self.assertEqual(len(inventory), 2)
    resource = inventory.resources.get(file_a_uri)
    self.assertTrue(resource is not None)
    self.assertEqual(resource.uri, file_a_uri)
    self.assertEqual(resource.lastmod, '2012-03-14T17:46:04')
    self.assertEqual(resource.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
    self.assertEqual(resource.file, 'resync/test/testdata/dir1/file_a')
def inventory(self):
    """Return inventory on disk based on current mappings

    Builds the inventory with an InventoryBuilder configured from
    self.checksum, self.mapper and self.exclude_patterns.

    Raises ClientFatalError if self.mappings is empty.
    """
    # Refuse to run without at least one source-to-destination mapping
    if len(self.mappings) == 0:
        raise ClientFatalError(
            "No source to destination mapping specified")
    # Build from disk, honouring the configured exclude patterns
    builder = InventoryBuilder(do_md5=self.checksum, mapper=self.mapper)
    builder.add_exclude_files(self.exclude_patterns)
    return builder.from_disk()
def test3_with_md5(self):
    """Check that size and md5 are written for each file in dir1.

    Builds the dir1 inventory with md5 calculation enabled, serializes
    it to XML and searches for the expected <url> content of file_a
    and file_b. The lastmod value is matched loosely since only size
    and checksum are under test here.
    """
    ib = InventoryBuilder(do_md5=True)
    i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
    s = Sitemap()
    xml = s.inventory_as_xml(i)
    # Raw strings keep the regex escapes (\w, \:, \-) intact without
    # relying on Python passing unknown string escapes through.
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',
            xml),
        'size/checksum for file_a')
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',
            xml),
        'size/checksum for file_b')
def test3_with_md5(self):
    """Check that size and md5 fixity are written for each file in dir1.

    Builds the dir1 inventory through a Mapper with md5 calculation
    enabled, serializes it to XML and searches for the expected <url>
    content of file_a and file_b. The lastmod value is matched loosely
    (any timestamp ending in Z) since only size and checksum are under
    test here.
    """
    ib = InventoryBuilder(do_md5=True)
    ib.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    xml = s.resources_as_xml(i)
    # Raw strings keep the regex escapes (\w, \:, \-, \+) intact without
    # relying on Python passing unknown string escapes through.
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>',
            xml))  # must escape + in md5
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>',
            xml))
def sync_or_audit(self, src_uri, dst_path, allow_deletion=False,
                  audit_only=False):
    """Compare the inventory at src_uri with dst_path and optionally sync.

    The source inventory is read from src_uri and a destination
    inventory is built from dst_path, mapped back onto the URI prefix
    obtained by dropping the last path segment of src_uri. A status
    line is always printed. With audit_only the method stops there;
    otherwise changed and added resources are fetched through
    update_resource() and deleted ones are unlinked when
    allow_deletion is set.

    Raises ClientFatalError if the source inventory cannot be read or
    lists no resources.
    """
    # NOTE: print is called with one pre-formatted argument throughout,
    # which the Python 2 print statement outputs unchanged.

    ### 1. Get inventories from both src and dst
    # 1.a source inventory
    builder = InventoryBuilder()
    try:
        src_inventory = builder.get(src_uri)
    except IOError as e:
        raise ClientFatalError("Can't read source inventory (%s)" % str(e))
    if self.verbose:
        print("Read src inventory from %s, %d resources listed" % (
            src_uri, len(src_inventory)))
    if len(src_inventory) == 0:
        raise ClientFatalError(
            "Aborting as there are no resources to sync")
    if self.checksum and not src_inventory.has_md5():
        self.checksum = False
        print("Not calculating checksums on destination as not present in source inventory")
    # 1.b destination inventory mapped back to source URIs
    # (URI prefix is src_uri without its final '/'-separated segment)
    url_prefix = '/'.join(src_uri.split('/')[:-1])
    builder.do_md5 = self.checksum
    dst_inventory = builder.from_disk(dst_path, url_prefix)

    ### 2. Compare these inventories respecting any comparison options
    (num_same, changed, deleted, added) = dst_inventory.compare(src_inventory)

    ### 3. Report status and planned actions
    if any(len(uris) > 0 for uris in (changed, deleted, added)):
        status = "NOT IN SYNC"
    else:
        status = " IN SYNC "
    print("Status: %s (same=%d, changed=%d, deleted=%d, added=%d)" %
          (status, num_same, len(changed), len(deleted), len(added)))
    if audit_only:
        return

    ### 4. Grab files to do sync
    mapper = Mapper(url_prefix, dst_path)
    # Changed resources are handled before added ones; both are fetched
    # the same way and differ only in the verbose message.
    for message, uris in (("changed: %s -> %s", changed),
                          ("added: %s -> %s", added)):
        for uri in uris:
            local_path = mapper.src_to_dst(uri)
            if self.verbose:
                print(message % (uri, local_path))
            self.update_resource(uri, local_path,
                                 src_inventory.resources[uri].timestamp)
    for uri in deleted:
        if not allow_deletion:
            if self.verbose:
                print("would delete %s (--delete to enable)" % uri)
            continue
        local_path = mapper.src_to_dst(uri)
        if self.verbose:
            print("deleted: %s -> %s" % (uri, local_path))
        os.unlink(local_path)
def sync_or_audit(self, allow_deletion=False, audit_only=False):
    """Compare the source sitemap with the local copy and optionally sync.

    Reads the source inventory from self.sitemap, builds the
    destination inventory from disk via self.mapper, and logs a status
    summary. With audit_only the method returns after the summary.
    Otherwise it checks that the sitemap has authority over every
    listed URI, fetches updated and created resources through
    update_resource(), and handles deleted resources according to
    allow_deletion and self.dryrun.

    Raises ClientFatalError if there are no mappings, the sitemap
    cannot be read, it lists no resources, or (unless self.noauth is
    set) it lists a URI outside its authority.
    """
    action = 'audit' if audit_only else 'sync'
    self.logger.debug("Starting " + action)

    ### 0. Sanity checks
    if len(self.mappings) == 0:
        raise ClientFatalError(
            "No source to destination mapping specified")

    ### 1. Get inventories from both src and dst
    # 1.a source inventory
    builder = InventoryBuilder(mapper=self.mapper)
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                              mapper=self.mapper)
        src_inventory = src_sitemap.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError(
            "Can't read source inventory from %s (%s)" %
            (self.sitemap, str(e)))
    self.logger.info("Read source inventory, %d resources listed" %
                     (len(src_inventory)))
    if len(src_inventory) == 0:
        raise ClientFatalError(
            "Aborting as there are no resources to sync")
    if self.checksum and not src_inventory.has_md5():
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source inventory"
        )
    # 1.b destination inventory mapped back to source URIs
    builder.do_md5 = self.checksum
    dst_inventory = builder.from_disk()

    ### 2. Compare these inventories respecting any comparison options
    (same, updated, deleted, created) = dst_inventory.compare(src_inventory)

    ### 3. Report status and planned actions
    if any(len(group) > 0 for group in (updated, deleted, created)):
        status = "NOT IN SYNC"
    else:
        status = " IN SYNC "
    self.logger.warning(
        "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %
        (status, len(same), len(updated), len(deleted), len(created)))
    if audit_only:
        self.logger.debug("Completed " + action)
        return

    ### 4. Check that sitemap has authority over URIs listed
    uauth = UrlAuthority(self.sitemap)
    for resource in src_inventory:
        if uauth.has_authority_over(resource.uri):
            continue
        if not self.noauth:
            raise ClientFatalError(
                "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                % (self.sitemap, resource.uri))
        self.logger.info(
            "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
            % (self.sitemap, resource.uri))

    ### 5. Grab files to do sync
    # Updated resources are handled before created ones; both go
    # through update_resource() and differ only in message and type.
    for message, changetype, resources in (
            ("updated: %s -> %s", 'UPDATED', updated),
            ("created: %s -> %s", 'CREATED', created)):
        for resource in resources:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            self.logger.info(message % (uri, dst_file))
            self.update_resource(resource, dst_file, changetype)
    for resource in deleted:
        uri = resource.uri
        if not allow_deletion:
            self.logger.info(
                "nodelete: would delete %s (--delete to enable)" % uri)
            continue
        dst_file = self.mapper.src_to_dst(uri)
        if self.dryrun:
            self.logger.info("dryrun: would delete %s -> %s" %
                             (uri, dst_file))
            continue
        os.unlink(dst_file)
        self.logger.info("deleted: %s -> %s" % (uri, dst_file))
        self.log_event(
            ResourceChange(resource=resource, changetype="DELETED"))
    self.logger.debug("Completed " + action)
def incremental(self, allow_deletion=False, changeset_uri=None):
    """Apply the changes listed in a source changeset to the local copy.

    Parameters:
      allow_deletion - if true, DELETED entries unlink the local file
                       (unless self.dryrun is set); otherwise they are
                       only logged
      changeset_uri  - explicit changeset location; if not given, the
                       location is taken from the 'current' link in the
                       capabilities of the sitemap at self.sitemap

    Raises ClientFatalError if there are no mappings, the sitemap or
    changeset cannot be read, no changeset location can be found, the
    changeset is empty, or (unless self.noauth is set) the changeset
    lists a URI it has no authority over. Raises ClientError for a
    resource with an unknown change type.
    """
    self.logger.debug("Starting incremental sync")

    ### 0. Sanity checks
    if (len(self.mappings) < 1):
        raise ClientFatalError(
            "No source to destination mapping specified")

    ### 1. Get URI of changeset, from sitemap or explicit
    if (changeset_uri):
        # Translate as necessary using maps
        changeset = self.sitemap_changeset_uri(changeset_uri)
    else:
        # Get sitemap
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_inventory = src_sitemap.read(uri=self.sitemap,
                                             index_only=True)
            self.logger.debug("Finished reading sitemap/sitemapindex")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source sitemap from %s (%s)" %
                (self.sitemap, str(e)))
        # Extract changeset location
        # FIXME - need to completely rework the way we handle/store capabilities
        links = self.extract_links(src_inventory.capabilities)
        if ('current' not in links):
            raise ClientFatalError(
                "Failed to extract changeset location from sitemap %s" %
                (self.sitemap))
        changeset = links['current']

    ### 2. Read changeset from source
    try:
        self.logger.info("Reading changeset %s" % (changeset))
        src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                              mapper=self.mapper)
        src_changeset = src_sitemap.read(uri=changeset, changeset=True)
        self.logger.debug("Finished reading changeset")
    except Exception as e:
        raise ClientFatalError(
            "Can't read source changeset from %s (%s)" %
            (changeset, str(e)))
    self.logger.info("Read source changeset, %d resources listed" %
                     (len(src_changeset)))
    if (len(src_changeset) == 0):
        raise ClientFatalError(
            "Aborting as there are no resources to sync")
    if (self.checksum and not src_changeset.has_md5()):
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source inventory"
        )

    ### 3. Check that sitemap has authority over URIs listed
    # FIXME - What does authority mean for changeset? Here use both the
    # changeset URI and, if we used it, the sitemap URI
    uauth_cs = UrlAuthority(changeset)
    # Sitemap authority only applies when the changeset location was
    # discovered via the sitemap; None means "not applicable".
    uauth_sm = None if changeset_uri else UrlAuthority(self.sitemap)
    for resource in src_changeset:
        if (not uauth_cs.has_authority_over(resource.uri) and
                (uauth_sm is None or
                 not uauth_sm.has_authority_over(resource.uri))):
            if (self.noauth):
                self.logger.warning(
                    "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                    % (changeset, resource.uri))
            else:
                raise ClientFatalError(
                    "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                    % (changeset, resource.uri))

    ### 4. Apply changes
    for resource in src_changeset:
        uri = resource.uri
        dst_file = self.mapper.src_to_dst(uri)
        if (resource.changetype == 'UPDATED'):
            self.logger.info("updated: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, 'UPDATED')
        elif (resource.changetype == 'CREATED'):
            self.logger.info("created: %s -> %s" % (uri, dst_file))
            self.update_resource(resource, dst_file, 'CREATED')
        elif (resource.changetype == 'DELETED'):
            if (allow_deletion):
                if (self.dryrun):
                    self.logger.info("dryrun: would delete %s -> %s" %
                                     (uri, dst_file))
                else:
                    os.unlink(dst_file)
                    self.logger.info("deleted: %s -> %s" %
                                     (uri, dst_file))
                    self.log_event(
                        ResourceChange(resource=resource,
                                       changetype="DELETED"))
            else:
                self.logger.info(
                    "nodelete: would delete %s (--delete to enable)" % uri)
        else:
            raise ClientError("Unknown change type %s" %
                              (resource.changetype))
    self.logger.debug("Completed incremental stuff")
def sync_or_audit(self, allow_deletion=False, audit_only=False):
    """Compare the source sitemap with the local copy and optionally sync.

    Reads the source inventory from self.sitemap, builds the
    destination inventory from disk via self.mapper, and prints a
    status summary. With audit_only the method returns after the
    summary. Otherwise updated and created resources are fetched
    through update_resource() with the source timestamp, and deleted
    resources are unlinked when allow_deletion is set and self.dryrun
    is not.

    Raises ClientFatalError if there are no mappings, the sitemap
    cannot be read, or it lists no resources.
    """
    # NOTE: print is called with one pre-formatted argument throughout,
    # which the Python 2 print statement outputs unchanged.

    ### 0. Sanity checks
    if (len(self.mappings) < 1):
        raise ClientFatalError(
            "No source to destination mapping specified")

    ### 1. Get inventories from both src and dst
    # 1.a source inventory
    ib = InventoryBuilder(verbose=self.verbose, mapper=self.mapper)
    try:
        if (self.verbose):
            print("Reading sitemap %s ..." % (self.sitemap))
        src_inventory = ib.get(self.sitemap)
    except IOError as e:
        raise ClientFatalError(
            "Can't read source inventory from %s (%s)" %
            (self.sitemap, str(e)))
    if (self.verbose):
        print("Read source inventory, %d resources listed" % (
            len(src_inventory)))
    if (len(src_inventory) == 0):
        raise ClientFatalError(
            "Aborting as there are no resources to sync")
    if (self.checksum and not src_inventory.has_md5()):
        self.checksum = False
        print("Not calculating checksums on destination as not present in source inventory")
    # 1.b destination inventory mapped back to source URIs
    ib.do_md5 = self.checksum
    dst_inventory = ib.from_disk()

    ### 2. Compare these inventories respecting any comparison options
    (num_same, updated, deleted, created) = dst_inventory.compare(src_inventory)

    ### 3. Report status and planned actions
    status = " IN SYNC "
    if (len(updated) > 0 or len(deleted) > 0 or len(created) > 0):
        status = "NOT IN SYNC"
    print("Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %
          (status, num_same, len(updated), len(deleted), len(created)))
    if (audit_only):
        return

    ### 4. Grab files to do sync
    for uri in updated:
        dst_file = self.mapper.src_to_dst(uri)
        if (self.verbose):
            print("updated: %s -> %s" % (uri, dst_file))
        self.update_resource(uri, dst_file,
                             src_inventory.resources[uri].timestamp)
    for uri in created:
        dst_file = self.mapper.src_to_dst(uri)
        # Report created resources in verbose mode, matching the
        # reporting of updated and deleted resources.
        if (self.verbose):
            print("created: %s -> %s" % (uri, dst_file))
        self.update_resource(uri, dst_file,
                             src_inventory.resources[uri].timestamp)
    for uri in deleted:
        if (allow_deletion):
            dst_file = self.mapper.src_to_dst(uri)
            if (self.dryrun):
                print("dryrun: would delete %s -> %s" % (uri, dst_file))
            else:
                os.unlink(dst_file)
                if (self.verbose):
                    print("deleted: %s -> %s" % (uri, dst_file))
        else:
            if (self.verbose):
                print("nodelete: would delete %s (--delete to enable)" % uri)