def test05_from_disk_paths(self):
    """Check from_disk() with explicit path lists under two different mappers.

    Covers an empty path list, a single directory, two directories and a
    single file, then inspects the metadata of the single-file resource.
    """
    rlb = ResourceListBuilder()
    rlb.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    # no path, should get no resources
    rl = rlb.from_disk(paths=[])
    self.assertEqual(len(rl), 0)
    # full path, 2 resources
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
    self.assertEqual(len(rl), 2)
    # new object with mapper covering larger space of disk
    rlb = ResourceListBuilder(set_path=True)
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata'])
    # same path with 2 resources
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
    self.assertEqual(len(rl), 2)
    # two directories, 3 resources in total
    rl = rlb.from_disk(
        paths=['resync/test/testdata/dir1', 'resync/test/testdata/dir2'])
    self.assertEqual(len(rl), 3)
    # path that is just a single file
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1/file_a'])
    self.assertEqual(len(rl), 1)
    # Use the next() builtin: iterator.next() only exists on Python 2
    r = next(iter(rl))
    self.assertTrue(r is not None)
    self.assertEqual(r.uri, 'http://example.org/t/dir1/file_a')
    self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, 20)
    # path is populated because this builder was created with set_path=True
    self.assertEqual(r.path, 'resync/test/testdata/dir1/file_a')
def test02_mapper_dst_to_src(self):
    """Translate destination paths back to source URIs for a single mapping."""
    mapper = Mapper(['http://e.org/p/', '/tmp/q/'])
    # Paths at or below the mapped destination translate to the source URI
    translations = (
        ('/tmp/q/', 'http://e.org/p/'),
        ('/tmp/q/bb', 'http://e.org/p/bb'),
        ('/tmp/q/bb/cc', 'http://e.org/p/bb/cc'),
    )
    for dst, src in translations:
        self.assertEqual(mapper.dst_to_src(dst), src)
    # Anything that does not start with '/tmp/q/' is expected to be rejected
    for unmapped in ('/tmp/q', '/tmp/qa', 'nomatch'):
        self.assertRaises(MapperError, mapper.dst_to_src, unmapped)
def test01_mapper_src_to_dst(self):
    """Translate source URIs to destination paths for a single mapping."""
    mapper = Mapper(['http://e.org/p/', '/tmp/q/'])
    # URIs at or below the mapped source translate to the destination path
    translations = (
        ('http://e.org/p/', '/tmp/q/'),
        ('http://e.org/p/aa', '/tmp/q/aa'),
        ('http://e.org/p/aa/bb', '/tmp/q/aa/bb'),
        ('http://e.org/p/aa/bb/', '/tmp/q/aa/bb/'),
    )
    for src, dst in translations:
        self.assertEqual(mapper.src_to_dst(src), dst)
    # Anything that does not start with 'http://e.org/p/' must be rejected
    for unmapped in ('http://e.org/p', 'http://e.org/pa', 'nomatch'):
        self.assertRaises(MapperError, mapper.src_to_dst, unmapped)
def test05_path_from_uri(self):
    """Derive a local path name from plain paths and from http(s) URIs."""
    mapper = Mapper()
    # (input, expected path) pairs, checked in this order
    cases = (
        ('a_file', 'a_file'),
        ('some_path/a_file', 'some_path/a_file'),
        ('http://localhost/p', 'localhost_p'),
        ('http://localhost:8888/p', 'localhost_8888_p'),
        ('https://localhost:8888/p', 'localhost_8888_p'),
        ('http://example.com', 'example.com'),
        ('http://example.com/', 'example.com'),
        ('http://example.com/ex1', 'example.com_ex1'),
        ('http://example.com/ex1/', 'example.com_ex1'),
    )
    for uri, expected_path in cases:
        self.assertEqual(mapper.path_from_uri(uri), expected_path)
def sync_or_audit(self, src_uri, dst_path, allow_deletion=False, audit_only=False):
    """Compare a source inventory with files on disk and optionally sync them.

    Parameters:
        src_uri - URI of the source inventory; everything before its last
            '/' is used as the URI prefix for the destination files
        dst_path - local directory scanned to build the destination inventory
        allow_deletion - when true, unlink local files reported as deleted
        audit_only - when true, return after printing the status line

    Side effect: self.checksum is switched off when the source inventory
    carries no md5 values.

    Raises ClientFatalError if the source inventory cannot be read or if it
    lists no resources.
    """
    ### 1. Get inventorys from both src and dst
    # 1.a source inventory
    ib = InventoryBuilder()
    try:
        src_inventory = ib.get(src_uri)
    except IOError as e:
        raise ClientFatalError("Can't read source inventory (%s)" % str(e))
    # print(...) with a single argument behaves the same on Python 2 and 3
    if (self.verbose):
        print("Read src inventory from %s, %d resources listed" % (src_uri,len(src_inventory)))
    if (len(src_inventory)==0):
        raise ClientFatalError("Aborting as there are no resources to sync")
    if (self.checksum and not src_inventory.has_md5()):
        self.checksum=False
        print("Not calculating checksums on destination as not present in source inventory")
    # 1.b destination inventory mapped back to source URIs
    # Drop the last path segment of src_uri to get the common URI prefix
    segments = src_uri.split('/')
    segments.pop()
    url_prefix='/'.join(segments)
    ib.do_md5=self.checksum
    dst_inventory = ib.from_disk(dst_path,url_prefix)
    ### 2. Compare these inventorys respecting any comparison options
    (num_same,changed,deleted,added)=dst_inventory.compare(src_inventory)
    ### 3. Report status and planned actions
    status = " IN SYNC "
    if (len(changed)>0 or len(deleted)>0 or len(added)>0):
        status = "NOT IN SYNC"
    print("Status: %s (same=%d, changed=%d, deleted=%d, added=%d)" %
          (status,num_same,len(changed),len(deleted),len(added)))
    if (audit_only):
        return
    ### 4. Grab files to do sync
    mapper = Mapper(url_prefix,dst_path)
    for uri in changed:
        local_file = mapper.src_to_dst(uri)
        if (self.verbose):
            print("changed: %s -> %s" % (uri,local_file))
        self.update_resource(uri,local_file,src_inventory.resources[uri].timestamp)
    for uri in added:
        local_file = mapper.src_to_dst(uri)
        if (self.verbose):
            print("added: %s -> %s" % (uri,local_file))
        self.update_resource(uri,local_file,src_inventory.resources[uri].timestamp)
    for uri in deleted:
        if (allow_deletion):
            local_file = mapper.src_to_dst(uri)
            if (self.verbose):
                print("deleted: %s -> %s" % (uri,local_file))
            os.unlink(local_file)
        else:
            if (self.verbose):
                print("would delete %s (--delete to enable)" % uri)
def test06_mapper_unsafe(self):
    """Check which mapping configurations unsafe() reports as unsafe."""
    uri_to_local = Mapper(['http://example.com/=/tmp/a'])
    self.assertFalse(uri_to_local.unsafe())

    uri_to_uri = Mapper(['http://example.com/=http://example.com/'])
    self.assertFalse(uri_to_uri.unsafe())

    uri_with_default = Mapper(['http://example.com/'], use_default_path=True)
    self.assertFalse(uri_with_default.unsafe())

    # Following hits case of single local arg supplied
    local_with_default = Mapper(['/tmp/a'], use_default_path=True)
    self.assertTrue(local_with_default.unsafe())

    # One good, one bad -> bad
    good_and_bad = Mapper(['http://example.com/=/tmp/a', '/tmp/a=/tmp'])
    self.assertTrue(good_and_bad.unsafe())
def test00_mapper_creation(self):
    """Build mappers from pair and 'src=dst' forms and compare them."""
    # Pair form, with and without trailing slashes, gives the same mapping
    with_slashes = Mapper(['http://e.org/p/', '/tmp/q/'])
    self.assertEqual(len(with_slashes), 1)
    without_slashes = Mapper(mappings=['http://e.org/p', '/tmp/q'])
    self.assertEqual(len(without_slashes), 1)
    self.assertEqual(str(with_slashes), str(without_slashes))

    # 'src=dst' form is equivalent to the pair form
    equals_form = Mapper(['http://e.org/p/=/tmp/q/'])
    self.assertEqual(len(equals_form), 1)
    self.assertEqual(str(with_slashes), str(equals_form))

    # Two mappings: the order they are given in shows up in str()
    p_then_r = Mapper(['http://e.org/p/=/tmp/q/', 'http://e.org/r/=/tmp/s/'])
    r_then_p = Mapper(['http://e.org/r/=/tmp/s/', 'http://e.org/p/=/tmp/q/'])
    self.assertEqual(len(p_then_r), 2)
    self.assertEqual(len(r_then_p), 2)
    self.assertNotEqual(str(p_then_r), str(r_then_p))
def test_src_to_dst(self):
    """Translate source URIs to destination paths (two-argument Mapper)."""
    mapper = Mapper('http://e.org/p','/tmp/q')
    cases = (
        ('http://e.org/p', '/tmp/q'),
        ('http://e.org/p/aa', '/tmp/q/aa'),
        ('http://e.org/p/aa/bb', '/tmp/q/aa/bb'),
        ('http://e.org/p/aa/bb/', '/tmp/q/aa/bb/'),
        # should throw error, but the test currently expects a translation
        ('http://e.org/pa', '/tmp/qa'),
    )
    for src, dst in cases:
        self.assertEqual( mapper.src_to_dst(src), dst )
def test1_simple_output(self):
    """Serialize the dir1 inventory and compare with the exact expected XML."""
    builder = InventoryBuilder(verbose=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    # Adjacent literals are joined at compile time into one XML string
    expected_xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
        ' xmlns:rs="http://resourcesync.org/change/0.1">'
        '<url><loc>http://example.org/t/file_a</loc>'
        '<lastmod>2012-03-14T17:46:04</lastmod>'
        '<rs:size>20</rs:size></url>'
        '<url><loc>http://example.org/t/file_b</loc>'
        '<lastmod>2012-03-14T17:46:25</lastmod>'
        '<rs:size>45</rs:size></url>'
        '</urlset>')
    actual_xml = Sitemap().inventory_as_xml(inventory)
    self.assertEqual(actual_xml, expected_xml)
def test1_simple_output(self):
    """Serialize the dir1 resources and compare with the exact expected XML."""
    builder = InventoryBuilder(verbose=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    # Adjacent literals are joined at compile time into one XML string
    expected_xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
        ' xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<url><loc>http://example.org/t/file_a</loc>'
        '<lastmod>2012-07-25T17:13:46Z</lastmod>'
        '<rs:size>20</rs:size></url>'
        '<url><loc>http://example.org/t/file_b</loc>'
        '<lastmod>2001-09-09T01:46:40Z</lastmod>'
        '<rs:size>45</rs:size></url>'
        '</urlset>')
    actual_xml = Sitemap().resources_as_xml(inventory)
    self.assertEqual(actual_xml, expected_xml)
def test04_data(self):
    """Scan dir1 with path and md5 hash enabled and inspect file_a."""
    builder = ResourceListBuilder(set_path=True, set_hashes=['md5'])
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    listing = builder.from_disk()
    self.assertEqual(len(listing), 2)

    # Look the resource up by URI rather than relying on iteration order
    file_a_uri = 'http://example.org/t/file_a'
    file_a = listing.resources.get(file_a_uri)
    self.assertTrue(file_a is not None)
    self.assertEqual(file_a.uri, 'http://example.org/t/file_a')
    self.assertEqual(file_a.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(file_a.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
    self.assertEqual(file_a.path, 'tests/testdata/dir1/file_a')
def test_11_write_multifile(self):
    """Write a 4-entry list as multiple sitemaps and read the pieces back.

    With max_sitemap_entries set to 2 the write must fail while
    allow_multifile is False and must produce two component sitemaps plus
    an index when it is True. The temporary directory is removed even if
    an assertion fails.
    """
    tempdir = tempfile.mkdtemp(prefix='test_resource_list_multifile')
    try:
        rl = ResourceList()
        rl.mapper = Mapper(['http://localhost/=%s/' % (tempdir)])
        rl.add(Resource(uri='http://localhost/a'))
        rl.add(Resource(uri='http://localhost/b'))
        rl.add(Resource(uri='http://localhost/c'))
        rl.add(Resource(uri='http://localhost/d'))
        rl.max_sitemap_entries = 2
        # first try writing without multifile allowed
        rl.allow_multifile = False
        self.assertRaises(ListBaseIndexError, rl.write,
                          basename=os.path.join(tempdir, 'sitemap.xml'))
        # second actually do it
        rl.allow_multifile = True
        rl.write(basename=os.path.join(tempdir, 'sitemap.xml'))
        # check the two component sitemaps
        rl1 = ResourceList()
        rl1.read(os.path.join(tempdir, 'sitemap00000.xml'))
        self.assertEqual(len(rl1), 2)
        self.assertEqual(rl1.capability, 'resourcelist')
        self.assertFalse(rl1.sitemapindex)
        i = iter(rl1)
        self.assertEqual(next(i).uri, 'http://localhost/a')
        self.assertEqual(next(i).uri, 'http://localhost/b')
        rl2 = ResourceList()
        rl2.read(os.path.join(tempdir, 'sitemap00001.xml'))
        self.assertEqual(len(rl2), 2)
        i = iter(rl2)
        self.assertEqual(next(i).uri, 'http://localhost/c')
        self.assertEqual(next(i).uri, 'http://localhost/d')
        # check the sitemapindex (read just as index)
        rli = ResourceList()
        rli.read(os.path.join(tempdir, 'sitemap.xml'), index_only=True)
        self.assertEqual(len(rli), 2)
        i = iter(rli)
        self.assertEqual(rli.capability, 'resourcelist')
        self.assertTrue(rli.sitemapindex)
        self.assertEqual(next(i).uri, 'http://localhost/sitemap00000.xml')
        self.assertEqual(next(i).uri, 'http://localhost/sitemap00001.xml')
        # check the sitemapindex and components
        rli = ResourceList(mapper=rl.mapper)
        rli.read(os.path.join(tempdir, 'sitemap.xml'))
        self.assertEqual(len(rli), 4)
        self.assertEqual(rli.capability, 'resourcelist')
        self.assertFalse(rli.sitemapindex)
        i = iter(rli)
        self.assertEqual(next(i).uri, 'http://localhost/a')
        self.assertEqual(next(i).uri, 'http://localhost/b')
        self.assertEqual(next(i).uri, 'http://localhost/c')
        self.assertEqual(next(i).uri, 'http://localhost/d')
    finally:
        # cleanup tempdir, also when an assertion above has failed
        shutil.rmtree(tempdir)
def test04_data(self):
    """Scan dir1 with path and md5 enabled and inspect file_a."""
    builder = ResourceListBuilder(set_path=True, set_md5=True)
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    listing = builder.from_disk()
    self.assertEqual(len(listing), 2)

    # Look the resource up by URI rather than relying on iteration order
    file_a_uri = 'http://example.org/t/file_a'
    file_a = listing.resources.get(file_a_uri)
    self.assertTrue(file_a is not None)
    self.assertEqual(file_a.uri, 'http://example.org/t/file_a')
    self.assertEqual(file_a.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(file_a.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
    self.assertEqual(file_a.path, 'tests/testdata/dir1/file_a')
def test_02_read_with_mapper(self):
    """Read a sitemap index through a mapper and spot-check the sorted URIs."""
    listing = ResourceList()
    listing.mapper = Mapper(['http://localhost/=tests/testdata/sitemapindex2/'])
    listing.read('tests/testdata/sitemapindex2/sitemap_mapper.xml')
    self.assertEqual(len(listing.resources), 17,
                     '17 resources from 3 sitemaps listed')
    sorted_uris = sorted(listing.uris())
    # (position in sorted order, expected URI)
    spot_checks = (
        (0, 'http://localhost:8888/resources/1'),
        (1, 'http://localhost:8888/resources/10'),
        (2, 'http://localhost:8888/resources/100'),
        (3, 'http://localhost:8888/resources/1000'),
        (16, 'http://localhost:8888/resources/826'),
    )
    for position, uri in spot_checks:
        self.assertEqual(sorted_uris[position], uri)
def test4_data(self):
    """Build the dir1 inventory with md5 enabled and inspect file_a."""
    builder = InventoryBuilder(do_md5=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    self.assertEqual(len(inventory), 2)

    # Look the resource up by URI rather than relying on iteration order
    file_a_uri = 'http://example.org/t/file_a'
    file_a = inventory.resources.get(file_a_uri)
    self.assertTrue(file_a is not None)
    self.assertEqual(file_a.uri, 'http://example.org/t/file_a')
    self.assertEqual(file_a.lastmod, '2012-03-14T17:46:04')
    self.assertEqual(file_a.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
    self.assertEqual(file_a.file, 'resync/test/testdata/dir1/file_a')
def test4_data(self):
    """Build the dir1 inventory with md5 enabled and inspect file_a."""
    builder = InventoryBuilder(do_md5=True)
    builder.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    inventory = builder.from_disk()
    self.assertEqual(len(inventory), 2)

    # Look the resource up by URI rather than relying on iteration order
    file_a_uri = 'http://example.org/t/file_a'
    file_a = inventory.resources.get(file_a_uri)
    self.assertTrue(file_a is not None)
    self.assertEqual(file_a.uri, 'http://example.org/t/file_a')
    self.assertEqual(file_a.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(file_a.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
    self.assertEqual(file_a.file, 'resync/test/testdata/dir1/file_a')
def __init__(self, checksum=False, verbose=False, dryrun=False):
    """Store the constructor flags and set every other option to its default."""
    super(Client, self).__init__()

    # Options supplied by the caller
    self.checksum, self.verbose, self.dryrun = checksum, verbose, dryrun

    # Collaborators
    self.logger = logging.getLogger('resync.client')
    self.mapper = Mapper()

    # Document names
    self.resource_list_name = 'resourcelist.xml'
    self.change_list_name = 'changelist.xml'
    self.sitemap_name = None

    # Behaviour switches and limits
    self.dump_format = None
    self.exclude_patterns = []
    self.allow_multifile = True
    self.noauth = False
    self.strictauth = False
    self.max_sitemap_entries = None
    self.ignore_failures = False
    self.pretty_xml = True

    # Default file names
    self.status_file = '.resync-client-status.cfg'
    self.default_resource_dump = 'resourcedump.zip'
    self.default_change_dump = 'changedump.zip'
def test01_mapper_src_to_dst(self):
    """Check src_to_dst() translations and rejections for one mapping."""
    translate = Mapper(['http://e.org/p/', '/tmp/q/']).src_to_dst
    # Suffixes that are appended to both the source URI and destination path
    for suffix in ('', 'aa', 'aa/bb', 'aa/bb/'):
        self.assertEqual(translate('http://e.org/p/' + suffix),
                         '/tmp/q/' + suffix)
    # URIs outside 'http://e.org/p/' are expected to raise MapperError
    rejected = ['http://e.org/p', 'http://e.org/pa', 'nomatch']
    for uri in rejected:
        self.assertRaises(MapperError, translate, uri)
def test3_with_md5(self):
    """Check that size and md5 appear in the inventory XML for both files.

    The lastmod value is matched loosely because it depends on the file
    timestamps on disk.
    """
    ib = InventoryBuilder(do_md5=True)
    ib.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    xml = s.inventory_as_xml(i)
    # Raw strings keep the regex escapes (\w \: \-) out of Python's own
    # string-escape processing; the pattern text itself is unchanged
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',
            xml),
        'size/checksum for file_a')
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',
            xml),
        'size/checksum for file_b')
def test06_odd_file_names(self):
    """Verify we can read unusual and unicode file names properly."""
    builder = ResourceListBuilder()
    builder.mapper = Mapper(['x:', 'tests/testdata/odd_file_names'])
    listing = builder.from_disk(paths=['tests/testdata/odd_file_names'])
    # Collect every URI once, then test names by membership
    uris = [resource.uri for resource in listing]
    for plain_name in ('x:/not_odd.txt',
                       'x:/with&ersand.txt',
                       'x:/with spaces.txt'):
        self.assertTrue(plain_name in uris)
    # File names for accented chars represented with combining chars:
    # each pair is (decomposed form expected, precomposed form not expected)
    accented = (
        (u'x:/Pi\u006e\u0303a_Colada.txt', u'x:/Pi\u00f1a_Colada.txt'),
        (u'x:/A_\u0041\u0303_tilde.txt', u'x:/A_\u00c3_tilde.txt'),
    )
    for decomposed, precomposed in accented:
        self.assertTrue(decomposed in uris)
        self.assertFalse(precomposed in uris)
    # Snowman is single char
    # NOTE(review): this URI has no '/' after 'x:' unlike the others - confirm
    # that asserting its absence is what was intended
    self.assertFalse(u'x:snowman_\u2603.txt' in uris)
def test3_with_md5(self):
    """Check that size and md5 fixity appear in the XML for both files.

    The lastmod value is matched loosely (any timestamp ending in 'Z')
    because it depends on the file timestamps on disk.
    """
    ib = InventoryBuilder(do_md5=True)
    ib.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    xml = s.resources_as_xml(i)
    # Raw strings keep the regex escapes (\w \: \- \+) out of Python's own
    # string-escape processing; the pattern text itself is unchanged
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>',
            xml))  # must escape + in md5
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>',
            xml))
def test03_set_md5(self):
    """Scan dir1 with set_md5=True and check both resources in order."""
    builder = ResourceListBuilder(set_md5=True)
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    listing = builder.from_disk()
    self.assertEqual(len(listing), 2)
    # (uri, lastmod, md5, length) for each resource, in iteration order
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z',
         'a/Jv1mYBtSjS4LR+qoft/Q==', 20),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z',
         'RS5Uva4WJqxdbnvoGzneIQ==', 45),
    )
    resources = iter(listing)
    for uri, lastmod, md5, length in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        self.assertEqual(resource.md5, md5)
        self.assertEqual(resource.length, length)
        self.assertEqual(resource.path, None)
def test02_no_length(self):
    """Scan dir1 with set_length=False and check that no length is recorded."""
    builder = ResourceListBuilder(set_length=False)
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    listing = builder.from_disk()
    self.assertEqual(len(listing), 2)
    # (uri, lastmod) for each resource, in iteration order
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z'),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z'),
    )
    resources = iter(listing)
    for uri, lastmod in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        self.assertEqual(resource.md5, None)
        self.assertEqual(resource.length, None)
        self.assertEqual(resource.path, None)
def test03_set_hashes(self):
    """Scan dir1 with set_hashes=['md5'] and check both resources in order."""
    builder = ResourceListBuilder(set_hashes=['md5'])
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    listing = builder.from_disk()
    self.assertEqual(len(listing), 2)
    # (uri, lastmod, md5, length) for each resource, in iteration order
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z',
         '6bf26fd66601b528d2e0b47eaa87edfd', 20),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z',
         '452e54bdae1626ac5d6e7be81b39de21', 45),
    )
    resources = iter(listing)
    for uri, lastmod, md5, length in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        self.assertEqual(resource.md5, md5)
        self.assertEqual(resource.length, length)
        self.assertEqual(resource.path, None)
def test01_simple_scan(self):
    """Scan dir1 with default settings and check both resources in order."""
    builder = ResourceListBuilder()
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    listing = builder.from_disk()
    self.assertEqual(len(listing), 2)
    # (uri, lastmod, length) for each resource, in iteration order
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z', 20),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z', 45),
    )
    resources = iter(listing)
    for uri, lastmod, length in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        self.assertEqual(resource.md5, None)
        self.assertEqual(resource.length, length)
        self.assertEqual(resource.path, None)
    # Make sure at and completed were set
    self.assertTrue(listing.md_at is not None)
    self.assertTrue(listing.md_completed is not None)
def test00_mapper_creation(self):
    """Build mappers from pair and 'src=dst' forms, then check parse errors."""
    # Pair form, with and without trailing slashes, gives the same mapping
    with_slashes = Mapper(['http://e.org/p/', '/tmp/q/'])
    self.assertEqual(len(with_slashes), 1)
    without_slashes = Mapper(mappings=['http://e.org/p', '/tmp/q'])
    self.assertEqual(len(without_slashes), 1)
    self.assertEqual(str(with_slashes), str(without_slashes))

    # 'src=dst' form is equivalent to the pair form
    equals_form = Mapper(['http://e.org/p/=/tmp/q/'])
    self.assertEqual(len(equals_form), 1)
    self.assertEqual(str(with_slashes), str(equals_form))

    # Two mappings: the order they are given in shows up in str()
    p_then_r = Mapper(['http://e.org/p/=/tmp/q/', 'http://e.org/r/=/tmp/s/'])
    r_then_p = Mapper(['http://e.org/r/=/tmp/s/', 'http://e.org/p/=/tmp/q/'])
    self.assertEqual(len(p_then_r), 2)
    self.assertEqual(len(r_then_p), 2)
    self.assertNotEqual(str(p_then_r), str(r_then_p))

    # error cases, all fed to the same empty mapper in this order
    empty = Mapper()
    bad_mappings = (
        ['a=b=c'],        # too many equals
        ['a=b=c=d'],
        ['a=b', 'a=c'],   # dupes
        ['x=z', 'y=z'],
    )
    for mappings in bad_mappings:
        self.assertRaises(MapperError, empty.parse, mappings)
def test03_mapper2_src_to_dst(self):
    """Translate source URIs to destination paths with two mappings defined."""
    mapper = Mapper(['http://e.org/p=/tmp/q', 'http://e.org/r=/tmp/s'])
    # One directory and one file case for each of the two mappings
    cases = (
        ('http://e.org/p/', '/tmp/q/'),
        ('http://e.org/p/aa', '/tmp/q/aa'),
        ('http://e.org/r/', '/tmp/s/'),
        ('http://e.org/r/aa', '/tmp/s/aa'),
    )
    for src, dst in cases:
        self.assertEqual(mapper.src_to_dst(src), dst)
class Client(object):
    """Implementation of a ResourceSync client

    Holds the source-to-destination mappings and the options that control
    sync/audit runs and sitemap reading and writing. Progress and results
    are written to stdout with print().
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        """Store the option flags; no mapper is set until set_mappings()."""
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.mapper = None
        self.sitemap_name = 'sitemap.xml'
        self.dump_format = None
        self.allow_multifile = False
        self.max_sitemap_entries = None

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object

        Raises ClientFatalError if no mapper has been set.
        """
        if (self.mapper is None):
            raise ClientFatalError("No mappings specified")
        return self.mapper.mappings

    # @mappings.setter
    def set_mappings(self,mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        if (re.match(r"\w+:",self.sitemap_name)):
            # looks like URI
            return self.sitemap_name
        elif (re.match(r"/",self.sitemap_name)):
            # looks like full path
            return self.sitemap_name
        else:
            # build from mapping with name appended
            return self.mappings[0].src_uri + '/' + self.sitemap_name

    @property
    def inventory(self):
        """Return inventory on disk based on current mappings

        Return inventory. Uses existing self.mapper settings.
        Raises ClientFatalError if there are no mappings.
        """
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        ib = InventoryBuilder(do_md5=self.checksum,verbose=self.verbose,mapper=self.mapper)
        return ib.from_disk()

    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        """Compare the source sitemap with the files on disk, optionally syncing

        allow_deletion - when true, unlink local files reported as deleted
        audit_only - when true, return after printing the status line

        Side effect: self.checksum is switched off when the source inventory
        carries no md5 values. Raises ClientFatalError if there are no
        mappings, the source inventory cannot be read, or it is empty.
        """
        ### 0. Sanity checks
        if (len(self.mappings)<1):
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder(verbose=self.verbose,mapper=self.mapper)
        if (self.verbose):
            print("Reading sitemap %s ..." % (self.sitemap))
        try:
            src_inventory = ib.get(self.sitemap)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory from %s (%s)" % (self.sitemap,str(e)))
        if (self.verbose):
            print("Read source inventory, %d resources listed" % (len(src_inventory)))
        if (len(src_inventory)==0):
            raise ClientFatalError("Aborting as there are no resources to sync")
        if (self.checksum and not src_inventory.has_md5()):
            self.checksum=False
            print("Not calculating checksums on destination as not present in source inventory")
        # 1.b destination inventory mapped back to source URIs
        ib.do_md5=self.checksum
        dst_inventory = ib.from_disk()
        ### 2. Compare these inventorys respecting any comparison options
        (num_same,updated,deleted,created)=dst_inventory.compare(src_inventory)
        ### 3. Report status and planned actions
        status = " IN SYNC "
        if (len(updated)>0 or len(deleted)>0 or len(created)>0):
            status = "NOT IN SYNC"
        print("Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %
              (status,num_same,len(updated),len(deleted),len(created)))
        if (audit_only):
            return
        ### 4. Grab files to do sync
        for uri in updated:
            file = self.mapper.src_to_dst(uri)
            if (self.verbose):
                print("updated: %s -> %s" % (uri,file))
            self.update_resource(uri,file,src_inventory.resources[uri].timestamp)
        for uri in created:
            file = self.mapper.src_to_dst(uri)
            self.update_resource(uri,file,src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if (allow_deletion):
                file = self.mapper.src_to_dst(uri)
                if (self.dryrun):
                    print("dryrun: would delete %s -> %s" % (uri,file))
                else:
                    os.unlink(file)
                    if (self.verbose):
                        print("deleted: %s -> %s" % (uri,file))
            else:
                if (self.verbose):
                    print("nodelete: would delete %s (--delete to enable)" % uri)

    def update_resource(self, uri, file, timestamp=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource and store it in file, creating the parent
           directory if needed
        2. set the file mtime to timestamp, if one is given

        In dryrun mode only the intended GET is reported and nothing on disk
        is touched. It would probably be better to use the Last-Modified
        value from the GET response, perhaps warning if it differs from the
        lastmod expected from the inventory.
        """
        if (self.dryrun):
            # A dry run must not create directories or change mtimes
            print("dryrun: would GET %s --> %s" % (uri,file))
            return
        path = os.path.dirname(file)
        # path is '' for a file in the current directory: nothing to create
        if (path and not os.path.isdir(path)):
            os.makedirs(path)
        urllib.urlretrieve(uri,file)
        if (self.verbose):
            print("created: %s -> %s" % (uri,file))
        if (timestamp is not None):
            unixtime=int(timestamp) #get rid of any fractional seconds
            os.utime(file,(unixtime,unixtime))

    def _print_first_entries(self, inventory, num_entries):
        """Print the leading entries of inventory, at most 100 by default

        The limit is self.max_sitemap_entries when that is set.
        """
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if (self.max_sitemap_entries):
            to_show = self.max_sitemap_entries
            override_str = ''
        if (num_entries>to_show):
            print("Showing first %d entries sorted by URI%s..." % (to_show,override_str))
        for n, r in enumerate(inventory.resource_uris(), 1):
            print(inventory.resources[r])
            if (n >= to_show):
                break

    def parse_sitemap(self):
        """Read the sitemap given by self.sitemap and report what it contains"""
        s=Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile)
        # Resolve once via the property; a bare `sitemap` name is undefined here
        sitemap = self.sitemap
        if (self.verbose):
            print("Reading sitemap(s) from %s ..." % (sitemap))
        i = s.read(sitemap)
        num_entries = len(i)
        print("Read sitemap with %d entries in %d sitemaps" % (num_entries,s.sitemaps_created))
        if (self.verbose):
            self._print_first_entries(i, num_entries)

    def write_sitemap(self,outfile=None,capabilities=None,dump=None):
        """Write a sitemap for the inventory on disk

        The XML is printed when outfile is None, otherwise written using
        outfile as basename. A dump is also written when dump is given.
        """
        # Set up base_path->base_uri mappings, get inventory from disk
        i = self.inventory
        i.capabilities = capabilities
        s=Sitemap(verbose=self.verbose, pretty_xml=True, allow_multifile=self.allow_multifile,
                  mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print(s.inventory_as_xml(i))
        else:
            s.write(i,basename=outfile)
        self.write_dump_if_requested(i,dump)

    def changeset_sitemap(self,outfile=None,ref_sitemap=None,capabilities=None,
                          dump=None):
        """Write a changeset sitemap: disk inventory compared to ref_sitemap

        The XML is printed when outfile is None, otherwise written using
        outfile as basename. A dump is also written when dump is given.
        """
        # 1. Get and parse reference sitemap
        rs = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile,
                     mapper=self.mapper)
        if (self.verbose):
            print("Reading sitemap(s) from %s ..." % (ref_sitemap))
        ri = rs.read(ref_sitemap)
        num_entries = len(ri)
        print("Read reference sitemap with %d entries in %d sitemaps" % (num_entries,rs.sitemaps_created))
        if (self.verbose):
            self._print_first_entries(ri, num_entries)
        # 2. Set up base_path->base_uri mappings, get inventory from disk
        disk_inventory = self.inventory
        # 3. Calculate changeset
        (num_same,updated,deleted,created)=ri.compare(disk_inventory)
        changeset = Inventory()
        changeset.capabilities = capabilities
        changeset.add( disk_inventory.changeset( updated, changetype='updated' ) )
        # deleted resources exist only in the reference inventory
        changeset.add( ri.changeset( deleted, changetype='deleted' ) )
        changeset.add( disk_inventory.changeset( created, changetype='created' ) )
        # 4. Write out changeset
        s = Sitemap(verbose=self.verbose, pretty_xml=True, allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if (self.max_sitemap_entries is not None):
            s.max_sitemap_entries = self.max_sitemap_entries
        if (outfile is None):
            print(s.inventory_as_xml(changeset))
        else:
            s.write(changeset,basename=outfile)
        self.write_dump_if_requested(changeset,dump)

    def write_dump_if_requested(self,inventory,dump):
        """Write inventory as a dump to the file dump; no-op if dump is None"""
        if (dump is None):
            return
        if (self.verbose):
            print("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(inventory=inventory,dumpfile=dump)
def set_mappings(self,mappings):
    """Replace self.mapper with a new Mapper built from mappings"""
    new_mapper = Mapper(mappings)
    self.mapper = new_mapper
def test04_mapper2_dst_to_src(self):
    """Translate destination paths to source URIs with two mappings defined."""
    mapper = Mapper(['http://e.org/p=/tmp/q', 'http://e.org/r=/tmp/s'])
    # One directory and one file case for each of the two mappings
    cases = (
        ('/tmp/q/', 'http://e.org/p/'),
        ('/tmp/q/bb', 'http://e.org/p/bb'),
        ('/tmp/s/', 'http://e.org/r/'),
        ('/tmp/s/bb', 'http://e.org/r/bb'),
    )
    for dst, src in cases:
        self.assertEqual(mapper.dst_to_src(dst), src)
class Client(object):
    """Implementation of a ResourceSync client.

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        """Set up a client with default settings and an empty Mapper.

        checksum - compare/calculate MD5 checksums where available
        verbose  - print extra information in the document display methods
        dryrun   - log what would be fetched/deleted without doing it
        """
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('resync.client')
        self.mapper = Mapper()
        self.resource_list_name = 'resourcelist.xml'
        self.change_list_name = 'changelist.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.sitemap_name = None
        self.allow_multifile = True
        self.noauth = False
        self.strictauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.pretty_xml = True
        # Latest resource timestamp seen; update_resource/delete_resource
        # compare against it, so it must exist before any sync is started
        self.last_timestamp = 0
        # Default file names
        self.status_file = '.resync-client-status.cfg'
        self.default_resource_dump = 'resourcedump.zip'
        self.default_change_dump = 'changedump.zip'

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings, use_default_path=True)

    def sitemap_uri(self, basename):
        """Get full URI (filepath) for sitemap based on basename

        A basename that looks like a URI (scheme prefix) or like an
        absolute path is returned unchanged, otherwise it is appended to
        the default source URI of the current mappings.
        """
        if re.match(r"\w+:", basename) or re.match(r"/", basename):
            return basename
        return self.mapper.default_src_uri() + '/' + basename

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        if self.sitemap_name is not None:
            return self.sitemap_name
        return self.sitemap_uri(self.resource_list_name)

    def build_resource_list(self, paths=None, set_path=False):
        """Return a resource list for files on local disk

        The set of files is taken by disk scan from the paths specified or
        else defaults to the paths specified in the current mappings.

        paths    - comma separated string of paths, overrides the paths
                   from the mappings if specified
        set_path - set true to set the path information for each resource
                   included. This is used to build a resource list as the
                   basis for creating a dump.

        Returns a ResourceList, uses existing self.mapper settings. Raises
        ClientFatalError if no mapping has been specified.
        """
        # 0. Sanity checks, parse paths if specified
        if len(self.mapper) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        if paths is not None:
            # Expect comma separated list of paths
            paths = paths.split(',')
        # 1. Build from disk
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        rlb.set_path = set_path
        rlb.add_exclude_files(self.exclude_patterns)
        rl = rlb.from_disk(paths=paths)
        # 2. Set defaults and overrides
        rl.allow_multifile = self.allow_multifile
        rl.pretty_xml = self.pretty_xml
        rl.mapper = self.mapper
        if self.max_sitemap_entries is not None:
            rl.max_sitemap_entries = self.max_sitemap_entries
        return rl

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchronization or audit

        Both functions are implemented in this routine because audit is a
        prerequisite for a baseline sync. In the case of baseline sync the
        last timestamp seen is recorded as client state.

        allow_deletion - actually delete local files no longer at the source
        audit_only     - report differences only, change nothing

        Raises ClientFatalError on missing/unsafe mappings, an unreadable or
        empty source resource list, or a failed authority check.
        """
        action = 'audit' if audit_only else 'baseline sync'
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if len(self.mapper) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        if not audit_only and self.mapper.unsafe():
            raise ClientFatalError("Source to destination mappings unsafe: %s" % str(self.mapper))
        ### 1. Get inventories from both src and dst
        # 1.a source resource list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(allow_multifile=self.allow_multifile,
                                             mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resource list from %s (%s)" % (self.sitemap, str(e)))
        self.logger.info("Read source resource list, %d resources listed" % (len(src_resource_list)))
        if len(src_resource_list) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_resource_list.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source resource list")
        # 1.b destination resource list mapped back to source URIs
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource lists respecting any comparison options
        (same, updated, deleted, created) = dst_resource_list.compare(src_resource_list)
        ### 3. Report status and planned actions
        num_changes = len(created) + len(updated) + len(deleted)
        self.log_status(in_sync=(num_changes == 0),
                        audit=True, same=len(same), created=len(created),
                        updated=len(updated), deleted=len(deleted))
        if audit_only or num_changes == 0:
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        if not self.noauth:
            uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
            for resource in src_resource_list:
                if not uauth.has_authority_over(resource.uri):
                    raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" % len(deleted)) if allow_deletion else ''
        self.logger.warning("Will GET %d resources%s" % (len(created) + len(updated), delete_msg))
        self.last_timestamp = 0
        num_created = 0
        num_updated = 0
        num_deleted = 0
        for resource in created:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, dst_file))
            num_created += self.update_resource(resource, dst_file, 'created')
        for resource in updated:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, dst_file))
            num_updated += self.update_resource(resource, dst_file, 'updated')
        for resource in deleted:
            dst_file = self.mapper.src_to_dst(resource.uri)
            num_deleted += self.delete_resource(resource, dst_file, allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if not audit_only and self.last_timestamp > 0:
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(num_changes == 0),
                        same=len(same), created=num_created,
                        updated=num_updated, deleted=num_deleted, to_delete=len(deleted))
        self.logger.debug("Completed %s" % (action))

    def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
        """Incremental synchronization

        Use a Change List to do incremental sync.

        allow_deletion  - actually delete local files for 'deleted' changes
        change_list_uri - explicit change list location, else the default
                          change list name resolved via the mappings
        from_datetime   - datetime string to start from, else the timestamp
                          stored by a previous sync is used

        Raises ClientFatalError on bad configuration or input and
        ClientError on an unknown change type.
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if len(self.mapper) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        if self.mapper.unsafe():
            raise ClientFatalError("Source to destination mappings unsafe: %s" % str(self.mapper))
        from_timestamp = None
        if from_datetime is not None:
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
        ### 1. Work out where to start from
        if from_timestamp is None:
            from_timestamp = ClientState().get_state(self.sitemap)
            if from_timestamp is None:
                raise ClientFatalError("Cannot do incremental sync. No stored timestamp for this site, and no explicit --from.")
        ### 2. Get URI of change list, explicit (translated as necessary
        #      using maps) or else the default name
        change_list = self.sitemap_uri(change_list_uri or self.change_list_name)
        ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError("Can't read source change list from %s (%s)" % (change_list, str(e)))
        self.logger.info("Read source change list, %d changes listed" % (len(src_change_list)))
        if self.checksum and not src_change_list.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source change list")
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if resource.timestamp is None:
                # Report the offending resource (no local `uri` exists here)
                raise ClientFatalError("Aborting - missing timestamp for change in %s" % (resource.uri))
            if resource.timestamp > self.last_timestamp:
                self.last_timestamp = resource.timestamp
        ### 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        if not self.noauth:
            uauth_cs = UrlAuthority(change_list, self.strictauth)
            if not change_list_uri:
                uauth_sm = UrlAuthority(self.sitemap)
            for resource in src_change_list:
                # uauth_sm is only consulted when no explicit change list
                # URI was given, which is exactly when it has been created
                if (not uauth_cs.has_authority_over(resource.uri) and
                        (change_list_uri or not uauth_sm.has_authority_over(resource.uri))):
                    raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list, resource.uri))
        ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if num_skipped > 0:
            self.logger.info("Skipped %d changes before %s" % (num_skipped, datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if num_dupes > 0:
            self.logger.info("Removed %d prior changes" % (num_dupes))
        # Review and log status before
        # FIXME - should at this stage prune the change list to pick out
        # only the last change for each resource
        to_update = 0
        to_create = 0
        to_delete = 0
        for resource in src_change_list:
            if resource.change == 'updated':
                to_update += 1
            elif resource.change == 'created':
                to_create += 1
            elif resource.change == 'deleted':
                to_delete += 1
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
        # Log status based on what we know from the Change List. Exit if
        # either there are no changes or if there are only deletions and
        # we don't allow deletion
        in_sync = ((to_update + to_delete + to_create) == 0)
        self.log_status(in_sync=in_sync, incremental=True, created=to_create,
                        updated=to_update, deleted=to_delete)
        if in_sync or ((to_update + to_create) == 0 and not allow_deletion):
            self.logger.debug("Completed incremental")
            return
        ### 6. Apply changes at same time or after from_timestamp
        delete_msg = (", and delete %d resources" % to_delete) if allow_deletion else ''
        self.logger.warning("Will apply %d changes%s" % (len(src_change_list), delete_msg))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            if resource.change == 'updated':
                self.logger.info("updated: %s -> %s" % (uri, dst_file))
                self.update_resource(resource, dst_file, 'updated')
                num_updated += 1
            elif resource.change == 'created':
                self.logger.info("created: %s -> %s" % (uri, dst_file))
                self.update_resource(resource, dst_file, 'created')
                num_created += 1
            elif resource.change == 'deleted':
                num_deleted += self.delete_resource(resource, dst_file, allow_deletion)
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
        ### 7. Report status and planned actions
        self.log_status(incremental=True, created=num_created, updated=num_updated,
                        deleted=num_deleted, to_delete=to_delete)
        ### 8. Record last timestamp we have seen
        if self.last_timestamp > 0:
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" % (datetime_to_str(self.last_timestamp)))
        ### 9. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means three things:
        1. GET the resource
        2. set the local mtime from resource.timestamp when there is one
        3. check that the download matches the expected length and, when
           checksums are enabled, the expected MD5 (mismatches are logged)

        Also updates self.last_timestamp if the timestamp of this resource
        is later than the current value. In dryrun mode only a log message
        is written.

        Returns the number of resources updated/created (0 or 1). A failed
        GET returns 0 when self.ignore_failures is set, otherwise raises
        ClientFatalError.
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        num_updated = 0
        if self.dryrun:
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
        else:
            # 1. GET
            try:
                urllib.urlretrieve(resource.uri, file)
                num_updated += 1
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
                if self.ignore_failures:
                    self.logger.warning(msg)
                    # Callers add the return value to a counter, so this
                    # must be an int (0 here) and never None
                    return num_updated
                else:
                    raise ClientFatalError(msg)
            # 2. set timestamp if we have one
            if resource.timestamp is not None:
                unixtime = int(resource.timestamp)  # no fractional
                os.utime(file, (unixtime, unixtime))
                if resource.timestamp > self.last_timestamp:
                    self.last_timestamp = resource.timestamp
            self.log_event(Resource(resource=resource, change=change))
            # 3. sanity check
            length = os.stat(file).st_size
            if resource.length != length:
                self.logger.info("Downloaded size for %s of %d bytes does not match expected %d bytes" % (resource.uri, length, resource.length))
            if self.checksum and resource.md5 is not None:
                file_md5 = compute_md5_for_file(file)
                if resource.md5 != file_md5:
                    self.logger.info("MD5 mismatch for %s, got %s but expected %s bytes" % (resource.uri, file_md5, resource.md5))
        return num_updated

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system

        Will only actually do the deletion if allow_deletion is True.
        Regardless of whether the deletion occurs, self.last_timestamp
        will be updated if the resource.timestamp is later than the
        current value.

        Returns the number of files actually deleted (0 or 1).
        """
        num_deleted = 0
        uri = resource.uri
        if (resource.timestamp is not None and
                resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
        if allow_deletion:
            if self.dryrun:
                self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
            else:
                try:
                    os.unlink(file)
                    num_deleted += 1
                except OSError as e:
                    # A failed delete is only warned about, it never aborts
                    # the sync and is not subject to self.ignore_failures
                    msg = "Failed to DELETE %s -> %s : %s" % (uri, file, str(e))
                    self.logger.warning(msg)
                # NOTE(review): the "deleted" message and event are written
                # even when the unlink failed - confirm this is intended
                self.logger.info("deleted: %s -> %s" % (uri, file))
                self.log_event(Resource(resource=resource, change="deleted"))
        else:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        return num_deleted

    def parse_document(self):
        """Parse any ResourceSync document and show information

        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied. Raises ClientFatalError if
        the document cannot be read.
        """
        s = Sitemap()
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        try:
            doc = s.parse_xml(urllib.urlopen(self.sitemap))
        except IOError as e:
            raise ClientFatalError("Cannot read document (%s)" % str(e))
        num_entries = len(doc.resources)
        capability = '(unknown capability)'
        if 'capability' in doc.md:
            capability = doc.md['capability']
        print("Parsed %s document with %d entries" % (capability, num_entries))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                print("Showing first %d entries sorted by URI%s..." % (to_show, override_str))
            n = 0
            for resource in doc:
                print('[%d] %s' % (n, str(resource)))
                n += 1
                if n >= to_show:
                    break

    def explore(self):
        """Explore capabilities of a server interactively

        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied. Raises ClientFatalError if
        neither is available.
        """
        uri = None
        if self.sitemap_name is not None:
            uri = self.sitemap
            print("Taking location from --sitemap option")
            acceptable_capabilities = None  # ie. any
        elif len(self.mapper) > 0:
            pu = urlparse.urlparse(self.mapper.default_src_uri())
            uri = urlparse.urlunparse([pu[0], pu[1], '/.well-known/resourcesync', '', '', ''])
            print("Will look for discovery information based on mappings")
            acceptable_capabilities = ['capabilitylist', 'capabilitylistindex']
        else:
            raise ClientFatalError("Neither explicit sitemap nor mapping specified")
        history = []
        inp = None
        checks = None
        while inp != 'q':
            print("")
            if inp == 'b':
                if len(history) < 2:
                    break  # can't do this, exit
                history.pop()  # throw away current
                uri = history.pop()
                acceptable_capabilities = None
            history.append(uri)
            (uri, checks, acceptable_capabilities, inp) = self.explore_uri(
                uri, checks, acceptable_capabilities, len(history) > 1)
        print("--explore done, bye...")

    def explore_uri(self, uri, checks, caps, show_back=True):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of type listed in caps.
        When caps is 'resource' a HEAD request is shown instead of parsing.

        Returns a tuple (uri, checks, caps, inp) describing what to follow
        next, or ('', '', '', inp) when the user asks to quit or go back
        (a read/parse failure is treated as going back).
        """
        s = Sitemap()
        print("Reading %s" % (uri))
        options = {}
        capability = None
        try:
            if caps == 'resource':
                self.explore_show_head(uri, check_headers=checks)
            else:
                doc = s.parse_xml(urllib.urlopen(uri))
                (options, capability) = self.explore_show_summary(doc, s.parsed_index, caps)
        except IOError as e:
            print("Cannot read %s (%s)\nGoing back" % (uri, str(e)))
            return ('', '', '', 'b')
        except Exception as e:
            print("Cannot parse %s (%s)\nGoing back" % (uri, str(e)))
            return ('', '', '', 'b')
        while True:
            # don't offer number option for no resources/capabilities
            num_prompt = '' if len(options) == 0 else 'number, '
            up_prompt = 'b(ack), ' if show_back else ''
            inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
            if inp in options:
                break
            if inp == 'q' or inp == 'b':
                return ('', '', '', inp)
        chosen = options[inp]
        checks = {}
        if chosen.capability is None:
            if capability == 'capabilitylistindex':
                # all links should be to capabilitylist documents
                caps = ['capabilitylist']
            elif capability in ['resourcelist', 'changelist',
                                'resourcedump', 'changedump']:
                caps = 'resource'
        else:
            caps = [chosen.capability]
            if chosen.length is not None:
                checks['content-length'] = chosen.length
            if chosen.lastmod is not None:
                checks['last-modified'] = chosen.lastmod
            # FIXME - could do sanity check here and issue warnings if odd
        return (chosen.uri, checks, caps, inp)

    def explore_show_summary(self, list, parsed_index, caps):
        """Show summary of one capability document

        Used as part of --explore. Prints up to 20 entries (all of them
        when there are 21 or fewer) and returns (options, capability)
        where options maps the key a user may type to the entry to follow.

        FIXME - should look for <rs:ln rel="up"...> link and show that
        """
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if 'capability' in list.md:
            capability = list.md['capability']
        if parsed_index:
            capability += 'index'
        print("Parsed %s document with %d entries:" % (capability, num_entries))
        if caps is not None and capability not in caps:
            print("WARNING - expected a %s document" % (','.join(caps)))
        to_show = num_entries
        if num_entries > 21:
            to_show = 20
        # What entries are allowed?
        # FIXME - not complete
        entry_caps = []
        if capability == 'capabilitylistindex':
            entry_caps = ['capabilitylist']
        elif capability == 'capabilitylist':
            entry_caps = ['resourcelist', 'changelist', 'resourcedump', 'changedump', 'changelistindex']
        elif capability == 'changelistindex':
            entry_caps = ['changelist']
        options = {}
        n = 0
        if 'up' in list.ln:
            options['up'] = list.ln['up']
            print("[%s] %s" % ('up', list.ln['up'].uri))
        for r in list.resources:
            if n >= to_show:
                print("(not showing remaining %d entries)" % (num_entries - n))
                break
            n += 1
            options[str(n)] = r
            print("[%d] %s" % (n, r.uri))
            if r.capability is not None:
                warning = ''
                if r.capability not in entry_caps:
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print(" %s%s" % (r.capability, warning))
            elif len(entry_caps) == 1:
                # Only one capability is possible here so fill it in
                r.capability = entry_caps[0]
                print(" capability not specified, should be %s" % (r.capability))
        return (options, capability)

    def explore_show_head(self, uri, check_headers=None):
        """Do HEAD on uri and show information

        Will also check headers against any values specified in
        check_headers, flagging each as matching or showing the value
        that was expected.
        """
        print("HEAD %s" % (uri))
        response = requests.head(uri)
        print(" status: %s" % (response.status_code))
        # print some of the headers
        for header in ['content-length', 'last-modified', 'lastmod', 'content-type', 'etag']:
            if header in response.headers:
                check_str = ''
                if check_headers is not None and header in check_headers:
                    if response.headers[header] == check_headers[header]:
                        check_str = ' MATCHES EXPECTED VALUE'
                    else:
                        check_str = ' EXPECTED %s' % (check_headers[header])
                print(" %s: %s%s" % (header, response.headers[header], check_str))

    def write_resource_list(self, paths=None, outfile=None, links=None, dump=None):
        """Write a Resource List or a Resource Dump for files on local disk

        Set of resources included is based on paths setting or else the
        mappings. Optionally links can be added. Output will be to stdout
        unless outfile is specified.

        If dump is true then a Resource Dump is written instead of a
        Resource List. If outfile is not set then
        self.default_resource_dump will be used.
        """
        rl = self.build_resource_list(paths=paths, set_path=dump)
        if links is not None:
            rl.ln = links
        if dump:
            if outfile is None:
                outfile = self.default_resource_dump
            # Report the file actually written, not the dump flag
            self.logger.info("Writing resource dump to %s..." % (outfile))
            d = Dump(format=self.dump_format)
            d.write(resource_list=rl, dumpfile=outfile)
        else:
            if outfile is None:
                try:
                    print(rl.as_xml())
                except ListBaseIndexError as e:
                    raise ClientFatalError("%s. Use --output option to specify base name for output files." % str(e))
            else:
                rl.write(basename=outfile)

    def write_change_list(self, paths=None, outfile=None, ref_sitemap=None, newref_sitemap=None,
                          empty=None, links=None, dump=None):
        """Write a change list

        Unless both ref_sitemap and newref_sitemap are specified the
        Change List is calculated between the reference and the current
        state of files on disk. The files on disk are scanned based either
        on the paths setting or else on the mappings. If empty is true no
        comparison is done and an empty Change List is written.
        """
        cl = ChangeList(ln=links)
        if not empty:
            # 1. Get and parse reference sitemap
            old_rl = self.read_reference_resource_list(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either read that
            # or build resource list from files on disk
            if newref_sitemap is None:
                # Get resource list from disk
                new_rl = self.build_resource_list(paths=paths, set_path=dump)
            else:
                new_rl = self.read_reference_resource_list(newref_sitemap, name='new reference')
            # 3. Calculate change list
            (same, updated, deleted, created) = old_rl.compare(new_rl)
            cl.add_changed_resources(updated, change='updated')
            cl.add_changed_resources(deleted, change='deleted')
            cl.add_changed_resources(created, change='created')
        # 4. Write out change list
        cl.mapper = self.mapper
        cl.pretty_xml = self.pretty_xml
        if self.max_sitemap_entries is not None:
            cl.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print(cl.as_xml())
        else:
            cl.write(basename=outfile)
        self.write_dump_if_requested(cl, dump)

    def write_capability_list(self, capabilities=None, outfile=None, links=None):
        """Write a Capability List to outfile or STDOUT

        capabilities - dict of capability name -> URI
        """
        capl = CapabilityList(ln=links)
        capl.pretty_xml = self.pretty_xml
        if capabilities is not None:
            for name in capabilities.keys():
                capl.add_capability(name=name, uri=capabilities[name])
        if outfile is None:
            print(capl.as_xml())
        else:
            capl.write(basename=outfile)

    def write_source_description(self, capability_lists=None, outfile=None, links=None):
        """Write a ResourceSync Description document to outfile or STDOUT

        capability_lists - iterable of Capability List URIs to include
        """
        rsd = SourceDescription(ln=links)
        rsd.pretty_xml = self.pretty_xml
        if capability_lists is not None:
            for uri in capability_lists:
                rsd.add_capability_list(uri)
        if outfile is None:
            print(rsd.as_xml())
        else:
            rsd.write(basename=outfile)

    def write_dump_if_requested(self, resource_list, dump):
        """Write a dump to the file dump

        Currently a placeholder: nothing is written whatever the value
        of dump.
        """
        if dump is None:
            return

    def read_reference_resource_list(self, ref_sitemap, name='reference'):
        """Read reference resource list and return the ResourceList object

        The name parameter is just used in output messages to say what
        type of resource list is being read.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." % (name, ref_sitemap))
        rl.mapper = self.mapper
        rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info("Read %s resource list with %d entries in %d sitemaps" % (name, num_entries, rl.num_files))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                print("Showing first %d entries sorted by URI%s..." % (to_show, override_str))
            n = 0
            for r in rl.resources:
                print(r)
                n += 1
                if n >= to_show:
                    break
        return rl

    def log_status(self, in_sync=True, incremental=False, audit=False,
                   same=None, created=0, updated=0, deleted=0, to_delete=0):
        """Write log message regarding status in standard form

        Split this off so all messages from baseline/audit/incremental
        are written in a consistent form.
        """
        if audit:
            words = {'created': 'to create',
                     'updated': 'to update',
                     'deleted': 'to delete'}
        else:
            words = {'created': 'created',
                     'updated': 'updated',
                     'deleted': 'deleted'}
        if in_sync:
            # status rather than action
            status = "NO CHANGES" if incremental else "IN SYNC"
        elif audit:
            status = "NOT IN SYNC"
        elif to_delete > deleted:
            # will need --delete
            status = "PART APPLIED" if incremental else "PART SYNCED"
            words['deleted'] = 'to delete (--delete)'
            deleted = to_delete
        else:
            status = "CHANGES APPLIED" if incremental else "SYNCED"
        same_str = "" if same is None else ("same=%d, " % same)
        self.logger.warning("Status: %15s (%s%s=%d, %s=%d, %s=%d)" %
                            (status, same_str, words['created'], created,
                             words['updated'], updated, words['deleted'], deleted))
def set_mappings(self, mappings):
    """Replace self.mapper with a Mapper constructed from mappings.

    The Mapper is created with use_default_path=True.
    """
    mapper = Mapper(mappings, use_default_path=True)
    self.mapper = mapper
def sync_or_audit(self, src_uri, dst_path, allow_deletion=False,
                  audit_only=False):
    """Audit, and optionally sync, dst_path against the inventory at src_uri

    src_uri        - URI the source inventory is read from; everything
                     before its last '/' is used as the URL prefix that
                     maps to dst_path
    dst_path       - local directory holding the destination copy
    allow_deletion - remove local files that are not in the source
    audit_only     - stop after reporting status, change nothing

    Raises ClientFatalError if the source inventory cannot be read or
    lists no resources.
    """
    # 1. Get inventories from both src and dst
    # 1.a source inventory
    builder = InventoryBuilder()
    try:
        src_inventory = builder.get(src_uri)
    except IOError as e:
        raise ClientFatalError("Can't read source inventory (%s)" % str(e))
    if self.verbose:
        print("Read src inventory from %s, %d resources listed" % (
            src_uri, len(src_inventory)))
    if len(src_inventory) == 0:
        raise ClientFatalError(
            "Aborting as there are no resources to sync")
    if self.checksum and not src_inventory.has_md5():
        self.checksum = False
        print("Not calculating checksums on destination as not present in source inventory")
    # 1.b destination inventory mapped back to source URIs, the prefix
    # is the source URI with its final path segment removed
    url_prefix = '/'.join(src_uri.split('/')[:-1])
    builder.do_md5 = self.checksum
    dst_inventory = builder.from_disk(dst_path, url_prefix)

    # 2. Compare these inventories respecting any comparison options
    num_same, changed, deleted, added = dst_inventory.compare(src_inventory)

    # 3. Report status and planned actions
    out_of_sync = len(changed) > 0 or len(deleted) > 0 or len(added) > 0
    status = "NOT IN SYNC" if out_of_sync else " IN SYNC "
    print("Status: %s (same=%d, changed=%d, deleted=%d, added=%d)" %
          (status, num_same, len(changed), len(deleted), len(added)))
    if audit_only:
        return

    # 4. Grab files to do sync
    mapper = Mapper(url_prefix, dst_path)
    for uri in changed:
        target = mapper.src_to_dst(uri)
        if self.verbose:
            print("changed: %s -> %s" % (uri, target))
        self.update_resource(uri, target,
                             src_inventory.resources[uri].timestamp)
    for uri in added:
        target = mapper.src_to_dst(uri)
        if self.verbose:
            print("added: %s -> %s" % (uri, target))
        self.update_resource(uri, target,
                             src_inventory.resources[uri].timestamp)
    for uri in deleted:
        if not allow_deletion:
            if self.verbose:
                print("would delete %s (--delete to enable)" % uri)
            continue
        target = mapper.src_to_dst(uri)
        if self.verbose:
            print("deleted: %s -> %s" % (uri, target))
        os.unlink(target)
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        """Set up a client with default settings

        checksum - compute and compare MD5 sums where the source has them
        verbose  - print extra detail in the document display methods
        dryrun   - report what would be fetched/deleted without doing it
        """
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('resync.client')
        self.mapper = Mapper()
        self.resource_list_name = 'resourcelist.xml'
        self.change_list_name = 'changelist.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.sitemap_name = None
        self.allow_multifile = True
        self.noauth = False
        self.strictauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.pretty_xml = True
        # Default file names
        self.status_file = '.resync-client-status.cfg'
        self.default_resource_dump = 'resourcedump.zip'
        self.default_change_dump = 'changedump.zip'

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings, use_default_path=True)

    def sitemap_uri(self, basename):
        """Get full URI (filepath) for sitemap based on basename

        A basename that starts with a URI scheme or with '/' is returned
        unchanged; anything else is appended to the default source URI of
        the current mappings.
        """
        if re.match(r"\w+:", basename):
            # looks like URI
            return basename
        if re.match(r"/", basename):
            # looks like full path
            return basename
        # build from mapping with name appended
        return self.mapper.default_src_uri() + '/' + basename

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        if self.sitemap_name is not None:
            return self.sitemap_name
        return self.sitemap_uri(self.resource_list_name)

    def build_resource_list(self, paths=None, set_path=False):
        """Return a resource list for files on local disk

        The set of files is taken by disk scan from the paths specified or
        else defaults to the paths specified in the current mappings.

        paths    - comma separated string of paths overriding those from
                   the mappings, if specified
        set_path - set true to record the path information for each
                   resource included. This is used to build a resource
                   list as the basis for creating a dump.

        Returns a ResourceList. Uses existing self.mapper settings.
        Raises ClientFatalError if no mappings are set.
        """
        # 0. Sanity checks, parse paths if specified
        if len(self.mapper) < 1:
            raise ClientFatalError(
                "No source to destination mapping specified")
        if paths is not None:
            # Expect comma separated list of paths
            paths = paths.split(',')
        # 1. Build from disk
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        rlb.set_path = set_path
        rlb.add_exclude_files(self.exclude_patterns)
        rl = rlb.from_disk(paths=paths)
        # 2. Set defaults and overrides
        rl.allow_multifile = self.allow_multifile
        rl.pretty_xml = self.pretty_xml
        rl.mapper = self.mapper
        if self.max_sitemap_entries is not None:
            rl.max_sitemap_entries = self.max_sitemap_entries
        return rl

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchronization or audit

        Both functions are implemented in this routine because audit is a
        prerequisite for a baseline sync. In the case of baseline sync the
        last timestamp seen is recorded as client state.

        allow_deletion - actually delete local files no longer at source
        audit_only     - report differences only, change nothing

        Raises ClientFatalError on missing/unsafe mappings, an unreadable
        or empty source resource list, or an authority violation.
        """
        action = 'audit' if audit_only else 'baseline sync'
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if len(self.mapper) < 1:
            raise ClientFatalError(
                "No source to destination mapping specified")
        if not audit_only and self.mapper.unsafe():
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        ### 1. Get inventories from both src and dst
        # 1.a source resource list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(
                allow_multifile=self.allow_multifile, mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source resource list from %s (%s)" %
                (self.sitemap, str(e)))
        self.logger.info("Read source resource list, %d resources listed" %
                         (len(src_resource_list)))
        if len(src_resource_list) == 0:
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if self.checksum and not src_resource_list.has_md5():
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source resource list"
            )
        # 1.b destination resource list mapped back to source URIs
        rlb = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
        dst_resource_list = rlb.from_disk()
        ### 2. Compare these resource lists respecting any comparison options
        (same, updated, deleted, created) = \
            dst_resource_list.compare(src_resource_list)
        ### 3. Report status and planned actions
        num_changes = len(created) + len(updated) + len(deleted)
        self.log_status(in_sync=(num_changes == 0), audit=True,
                        same=len(same), created=len(created),
                        updated=len(updated), deleted=len(deleted))
        if audit_only or num_changes == 0:
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        if not self.noauth:
            uauth = UrlAuthority(self.sitemap, strict=self.strictauth)
            for resource in src_resource_list:
                if not uauth.has_authority_over(resource.uri):
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        delete_msg = (", and delete %d resources" %
                      len(deleted)) if allow_deletion else ''
        self.logger.warning("Will GET %d resources%s" %
                            (len(created) + len(updated), delete_msg))
        self.last_timestamp = 0
        num_created = 0
        num_updated = 0
        num_deleted = 0
        for resource in created:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, dst_file))
            num_created += self.update_resource(resource, dst_file, 'created')
        for resource in updated:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, dst_file))
            num_updated += self.update_resource(resource, dst_file, 'updated')
        for resource in deleted:
            dst_file = self.mapper.src_to_dst(resource.uri)
            num_deleted += self.delete_resource(resource, dst_file,
                                                allow_deletion)
        ### 6. Store last timestamp to allow incremental sync
        if not audit_only and self.last_timestamp > 0:
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        ### 7. Done
        self.log_status(in_sync=(num_changes == 0), same=len(same),
                        created=num_created, updated=num_updated,
                        deleted=num_deleted, to_delete=len(deleted))
        self.logger.debug("Completed %s" % (action))

    def incremental(self, allow_deletion=False, change_list_uri=None,
                    from_datetime=None):
        """Incremental synchronization

        Use a Change List to apply changes made at the source since the
        last recorded timestamp (or since from_datetime if given).

        allow_deletion  - actually delete local files deleted at source
        change_list_uri - explicit change list location, else the default
                          name relative to the mappings is used
        from_datetime   - datetime string overriding the stored state

        Raises ClientFatalError on bad configuration or unreadable input,
        and ClientError on an unknown change type.
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if len(self.mapper) < 1:
            raise ClientFatalError(
                "No source to destination mapping specified")
        if self.mapper.unsafe():
            raise ClientFatalError(
                "Source to destination mappings unsafe: %s" % str(self.mapper))
        from_timestamp = None
        if from_datetime is not None:
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" %
                                       from_datetime)
        ### 1. Work out where to start from
        if from_timestamp is None:
            from_timestamp = ClientState().get_state(self.sitemap)
            if from_timestamp is None:
                raise ClientFatalError(
                    "Cannot do incremental sync. No stored timestamp for this site, and no explicit --from."
                )
        ### 2. Get URI of change list, from sitemap or explicit
        if change_list_uri:
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
        ### 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source change list from %s (%s)" %
                (change_list, str(e)))
        self.logger.info("Read source change list, %d changes listed" %
                         (len(src_change_list)))
        # An empty change list is not an error: it simply means no changes
        if self.checksum and not src_change_list.has_md5():
            self.checksum = False
            self.logger.info(
                "Not calculating checksums on destination as not present in source change list"
            )
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if resource.timestamp is None:
                # Report the offending resource (previously used an
                # undefined name here, which raised NameError instead)
                raise ClientFatalError(
                    "Aborting - missing timestamp for change in %s" %
                    (resource.uri))
            if resource.timestamp > self.last_timestamp:
                self.last_timestamp = resource.timestamp
        ### 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        if not self.noauth:
            uauth_cs = UrlAuthority(change_list, self.strictauth)
            if not change_list_uri:
                uauth_sm = UrlAuthority(self.sitemap)
            for resource in src_change_list:
                # uauth_sm is only consulted when it was created above
                if (not uauth_cs.has_authority_over(resource.uri) and
                        (change_list_uri or
                         not uauth_sm.has_authority_over(resource.uri))):
                    raise ClientFatalError(
                        "Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (change_list, resource.uri))
        ### 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if num_skipped > 0:
            self.logger.info("Skipped %d changes before %s" %
                             (num_skipped, datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if num_dupes > 0:
            self.logger.info("Removed %d prior changes" % (num_dupes))
        # Review and log status before applying anything
        to_update = 0
        to_create = 0
        to_delete = 0
        for resource in src_change_list:
            if resource.change == 'updated':
                to_update += 1
            elif resource.change == 'created':
                to_create += 1
            elif resource.change == 'deleted':
                to_delete += 1
            else:
                raise ClientError("Unknown change type %s" %
                                  (resource.change))
        # Log status based on what we know from the Change List. Exit if
        # either there are no changes or if there are only deletions and
        # we don't allow deletion
        in_sync = ((to_update + to_delete + to_create) == 0)
        self.log_status(in_sync=in_sync, incremental=True, created=to_create,
                        updated=to_update, deleted=to_delete)
        if in_sync or ((to_update + to_create) == 0 and not allow_deletion):
            self.logger.debug("Completed incremental")
            return
        ### 6. Apply changes at same time or after from_timestamp
        delete_msg = (", and delete %d resources" %
                      to_delete) if allow_deletion else ''
        self.logger.warning("Will apply %d changes%s" %
                            (len(src_change_list), delete_msg))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            dst_file = self.mapper.src_to_dst(uri)
            if resource.change == 'updated':
                self.logger.info("updated: %s -> %s" % (uri, dst_file))
                self.update_resource(resource, dst_file, 'updated')
                num_updated += 1
            elif resource.change == 'created':
                self.logger.info("created: %s -> %s" % (uri, dst_file))
                self.update_resource(resource, dst_file, 'created')
                num_created += 1
            elif resource.change == 'deleted':
                num_deleted += self.delete_resource(resource, dst_file,
                                                    allow_deletion)
            else:
                raise ClientError("Unknown change type %s" %
                                  (resource.change))
        ### 7. Report status
        # We only get here when there were changes, so say so explicitly
        # rather than relying on log_status's in_sync=True default (which
        # would report "NO CHANGES" after applying changes).
        self.log_status(in_sync=False, incremental=True, created=num_created,
                        updated=num_updated, deleted=num_deleted,
                        to_delete=to_delete)
        ### 8. Record last timestamp we have seen
        if self.last_timestamp > 0:
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        ### 9. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means three things:
        1. GET the resource
        2. set mtime of the local file to the resource timestamp (if any)
        3. check that the download matches the expected length and, when
           checksums are enabled, the expected MD5 (mismatches are logged)

        Also updates self.last_timestamp if the timestamp of this resource
        is later than the current value. In dryrun mode only the intended
        GET is logged.

        Returns the number of resources updated/created (0 or 1); always
        an integer so that callers can safely accumulate it.
        Raises ClientFatalError on a failed GET unless self.ignore_failures.
        """
        path = os.path.dirname(file)
        distutils.dir_util.mkpath(path)
        num_updated = 0
        if self.dryrun:
            self.logger.info("dryrun: would GET %s --> %s" %
                             (resource.uri, file))
        else:
            # 1. GET
            try:
                urllib.urlretrieve(resource.uri, file)
                num_updated += 1
            except IOError as e:
                msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
                if self.ignore_failures:
                    self.logger.warning(msg)
                    # Return the count (0), not None: callers add it up
                    return num_updated
                raise ClientFatalError(msg)
            # 2. set timestamp if we have one
            if resource.timestamp is not None:
                unixtime = int(resource.timestamp)  # no fractional
                os.utime(file, (unixtime, unixtime))
                if resource.timestamp > self.last_timestamp:
                    self.last_timestamp = resource.timestamp
            self.log_event(Resource(resource=resource, change=change))
            # 3. sanity check
            length = os.stat(file).st_size
            if resource.length != length:
                self.logger.info(
                    "Downloaded size for %s of %d bytes does not match expected %d bytes"
                    % (resource.uri, length, resource.length))
            if self.checksum and resource.md5 is not None:
                file_md5 = compute_md5_for_file(file)
                if resource.md5 != file_md5:
                    self.logger.info(
                        "MD5 mismatch for %s, got %s but expected %s bytes" %
                        (resource.uri, file_md5, resource.md5))
        return num_updated

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system

        Will only actually do the deletion if allow_deletion is True and
        the client is not in dryrun mode. Regardless of whether the
        deletion occurs, self.last_timestamp will be updated if the
        resource.timestamp is later than the current value.

        A failed deletion is logged as a warning and is not fatal; the
        "deleted" message and event are only logged when the file was
        actually removed.

        Returns the number of files actually deleted (0 or 1).
        """
        num_deleted = 0
        uri = resource.uri
        if (resource.timestamp is not None and
                resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
        if not allow_deletion:
            self.logger.info("nodelete: would delete %s (--delete to enable)" %
                             uri)
        elif self.dryrun:
            self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
        else:
            try:
                os.unlink(file)
            except OSError as e:
                # Best effort: warn and carry on with remaining resources
                self.logger.warning("Failed to DELETE %s -> %s : %s" %
                                    (uri, file, str(e)))
            else:
                num_deleted += 1
                self.logger.info("deleted: %s -> %s" % (uri, file))
                self.log_event(Resource(resource=resource, change="deleted"))
        return num_deleted

    def parse_document(self):
        """Parse any ResourceSync document and show information

        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied. With self.verbose set, the
        first entries (100, or self.max_sitemap_entries) are printed.

        Raises ClientFatalError if the document cannot be read.
        """
        s = Sitemap()
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        try:
            doc = s.parse_xml(urllib.urlopen(self.sitemap))
        except IOError as e:
            raise ClientFatalError("Cannot read document (%s)" % str(e))
        num_entries = len(doc.resources)
        capability = '(unknown capability)'
        if 'capability' in doc.md:
            capability = doc.md['capability']
        print("Parsed %s document with %d entries" % (capability, num_entries))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                print("Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str))
            n = 0
            for resource in doc:
                print('[%d] %s' % (n, str(resource)))
                n += 1
                if n >= to_show:
                    break

    def explore(self):
        """Explore capabilities of a server interactively

        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied (in which case the
        /.well-known/resourcesync location on that host is used).

        Raises ClientFatalError if neither is available.
        """
        uri = None
        if self.sitemap_name is not None:
            uri = self.sitemap
            print("Taking location from --sitemap option")
            acceptable_capabilities = None  # ie. any
        elif len(self.mapper) > 0:
            pu = urlparse.urlparse(self.mapper.default_src_uri())
            uri = urlparse.urlunparse(
                [pu[0], pu[1], '/.well-known/resourcesync', '', '', ''])
            print("Will look for discovery information based on mappings")
            acceptable_capabilities = ['capabilitylist', 'capabilitylistindex']
        else:
            raise ClientFatalError(
                "Neither explicit sitemap nor mapping specified")
        history = []
        inp = None
        checks = None
        while inp != 'q':
            print("")
            if inp == 'b':
                if len(history) < 2:
                    break  # can't do this, exit
                history.pop()  # throw away current
                uri = history.pop()
                acceptable_capabilities = None
            history.append(uri)
            (uri, checks, acceptable_capabilities, inp) = self.explore_uri(
                uri, checks, acceptable_capabilities, len(history) > 1)
        print("--explore done, bye...")

    def explore_uri(self, uri, checks, caps, show_back=True):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of a type listed in
        caps. If caps is the string 'resource' a HEAD request is made
        instead of parsing, and headers are compared with checks.

        Returns a tuple (uri, checks, caps, inp) describing the next
        location to explore and the user's input; on 'q' or 'b' (or on a
        read/parse failure, which is treated as 'b') the first three
        elements are empty strings.
        """
        s = Sitemap()
        print("Reading %s" % (uri))
        options = {}
        capability = None
        try:
            if caps == 'resource':
                self.explore_show_head(uri, check_headers=checks)
            else:
                doc = s.parse_xml(urllib.urlopen(uri))
                (options, capability) = self.explore_show_summary(
                    doc, s.parsed_index, caps)
        except IOError as e:
            print("Cannot read %s (%s)\nGoing back" % (uri, str(e)))
            return ('', '', '', 'b')
        except Exception as e:
            print("Cannot parse %s (%s)\nGoing back" % (uri, str(e)))
            return ('', '', '', 'b')
        while True:
            # don't offer number option for no resources/capabilities
            num_prompt = '' if (len(options) == 0) else 'number, '
            up_prompt = 'b(ack), ' if show_back else ''
            inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
            if inp in options:
                break
            if inp == 'q' or inp == 'b':
                return ('', '', '', inp)
        checks = {}
        # Bind the chosen entry up front so the header checks below apply
        # to every entry, including plain resources with no capability
        r = options[inp]
        if r.capability is None:
            if capability == 'capabilitylistindex':
                # all links should be to capabilitylist documents
                caps = ['capabilitylist']
            elif capability in ['resourcelist', 'changelist',
                                'resourcedump', 'changedump']:
                caps = 'resource'
        else:
            caps = [r.capability]
        if r.length is not None:
            checks['content-length'] = r.length
        if r.lastmod is not None:
            checks['last-modified'] = r.lastmod
        # FIXME - could do sanity check here and issue warnings if odd
        return (r.uri, checks, caps, inp)

    def explore_show_summary(self, list, parsed_index, caps):
        """Show summary of one capability document

        Used as part of --explore. Prints up to 20 entries (21 if that is
        all there are) and returns a tuple (options, capability) where
        options maps the strings the user may enter ('up', '1', '2', ...)
        to the corresponding entries.
        """
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if 'capability' in list.md:
            capability = list.md['capability']
        if parsed_index:
            capability += 'index'
        print("Parsed %s document with %d entries:" % (capability, num_entries))
        if caps is not None and capability not in caps:
            print("WARNING - expected a %s document" % (','.join(caps)))
        to_show = num_entries
        if num_entries > 21:
            to_show = 20
        # What entries are allowed?
        # FIXME - not complete
        entry_caps = []
        if capability == 'capabilitylistindex':
            entry_caps = ['capabilitylist']
        elif capability == 'capabilitylist':
            entry_caps = [
                'resourcelist', 'changelist', 'resourcedump', 'changedump',
                'changelistindex'
            ]
        elif capability == 'changelistindex':
            entry_caps = ['changelist']
        options = {}
        n = 0
        if 'up' in list.ln:
            options['up'] = list.ln['up']
            print("[%s] %s" % ('up', list.ln['up'].uri))
        for r in list.resources:
            if n >= to_show:
                print("(not showing remaining %d entries)" % (num_entries - n))
                break
            n += 1
            options[str(n)] = r
            print("[%d] %s" % (n, r.uri))
            if r.capability is not None:
                warning = ''
                if r.capability not in entry_caps:
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print(" %s%s" % (r.capability, warning))
            elif len(entry_caps) == 1:
                r.capability = entry_caps[0]
                print(" capability not specified, should be %s" % (
                    r.capability))
        return (options, capability)

    def explore_show_head(self, uri, check_headers=None):
        """Do HEAD on uri and show information

        Will also check headers against any values specified in
        check_headers, noting for each whether it matches the expected
        value. Values are compared as strings because response headers
        are strings while expected values may not be.
        """
        print("HEAD %s" % (uri))
        response = requests.head(uri)
        print(" status: %s" % (response.status_code))
        # print some of the headers
        for header in ['content-length', 'last-modified', 'lastmod',
                       'content-type', 'etag']:
            if header in response.headers:
                check_str = ''
                if check_headers is not None and header in check_headers:
                    if (str(response.headers[header]) ==
                            str(check_headers[header])):
                        check_str = ' MATCHES EXPECTED VALUE'
                    else:
                        check_str = ' EXPECTED %s' % (check_headers[header])
                print(" %s: %s%s" % (header, response.headers[header],
                                     check_str))

    def write_resource_list(self, paths=None, outfile=None, links=None,
                            dump=None):
        """Write a Resource List or a Resource Dump for files on local disk

        Set of resources included is based on paths setting or else the
        mappings. Optionally links can be added. Output will be to stdout
        unless outfile is specified.

        If dump is true then a Resource Dump is written instead of a
        Resource List. If outfile is not set then
        self.default_resource_dump will be used.

        Raises ClientFatalError if the list needs multiple files but is
        being written to stdout.
        """
        rl = self.build_resource_list(paths=paths, set_path=dump)
        if links is not None:
            rl.ln = links
        if dump:
            if outfile is None:
                outfile = self.default_resource_dump
            # Report the file being written, not the dump flag
            self.logger.info("Writing resource dump to %s..." % (outfile))
            d = Dump(format=self.dump_format)
            d.write(resource_list=rl, dumpfile=outfile)
        elif outfile is None:
            try:
                print(rl.as_xml())
            except ListBaseIndexError as e:
                raise ClientFatalError(
                    "%s. Use --output option to specify base name for output files."
                    % str(e))
        else:
            rl.write(basename=outfile)

    def write_change_list(self, paths=None, outfile=None, ref_sitemap=None,
                          newref_sitemap=None, empty=None, links=None,
                          dump=None):
        """Write a change list

        Unless both ref_sitemap and newref_sitemap are specified the
        Change List is calculated between the reference and the current
        state of files on disk. The files on disk are scanned based either
        on the paths setting or else on the mappings. If empty is set an
        empty Change List is written. Output goes to stdout unless outfile
        is specified.
        """
        cl = ChangeList(ln=links)
        if not empty:
            # 1. Get and parse reference sitemap
            old_rl = self.read_reference_resource_list(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either
            # read that or build resource list from files on disk
            if newref_sitemap is None:
                # Get resource list from disk
                new_rl = self.build_resource_list(paths=paths, set_path=dump)
            else:
                new_rl = self.read_reference_resource_list(
                    newref_sitemap, name='new reference')
            # 3. Calculate change list
            (same, updated, deleted, created) = old_rl.compare(new_rl)
            cl.add_changed_resources(updated, change='updated')
            cl.add_changed_resources(deleted, change='deleted')
            cl.add_changed_resources(created, change='created')
        # 4. Write out change list
        cl.mapper = self.mapper
        cl.pretty_xml = self.pretty_xml
        if self.max_sitemap_entries is not None:
            cl.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print(cl.as_xml())
        else:
            cl.write(basename=outfile)
        self.write_dump_if_requested(cl, dump)

    def write_capability_list(self, capabilities=None, outfile=None,
                              links=None):
        """Write a Capability List to outfile or STDOUT

        capabilities - dict mapping capability name to URI
        """
        capl = CapabilityList(ln=links)
        capl.pretty_xml = self.pretty_xml
        if capabilities is not None:
            for name in capabilities:
                capl.add_capability(name=name, uri=capabilities[name])
        if outfile is None:
            print(capl.as_xml())
        else:
            capl.write(basename=outfile)

    def write_source_description(self, capability_lists=None, outfile=None,
                                 links=None):
        """Write a ResourceSync Description document to outfile or STDOUT

        capability_lists - iterable of Capability List URIs to include
        """
        rsd = SourceDescription(ln=links)
        rsd.pretty_xml = self.pretty_xml
        if capability_lists is not None:
            for uri in capability_lists:
                rsd.add_capability_list(uri)
        if outfile is None:
            print(rsd.as_xml())
        else:
            rsd.write(basename=outfile)

    def write_dump_if_requested(self, resource_list, dump):
        """Write a dump to the file dump

        NOTE: currently a placeholder, nothing is written whatever the
        value of dump.
        """
        if dump is None:
            return

    def read_reference_resource_list(self, ref_sitemap, name='reference'):
        """Read reference resource list and return the ResourceList object

        The name parameter is just used in output messages to say what
        type of resource list is being read. With self.verbose set, the
        first entries (100, or self.max_sitemap_entries) are printed.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." %
                         (name, ref_sitemap))
        rl.mapper = self.mapper
        rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info(
            "Read %s resource list with %d entries in %d sitemaps" %
            (name, num_entries, rl.num_files))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                print("Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str))
            n = 0
            for r in rl.resources:
                print(r)
                n += 1
                if n >= to_show:
                    break
        return rl

    def log_status(self, in_sync=True, incremental=False, audit=False,
                   same=None, created=0, updated=0, deleted=0, to_delete=0):
        """Write log message regarding status in standard form

        Split this off so all messages from baseline/audit/incremental
        are written in a consistent form.

        in_sync     - true if there is nothing to do / nothing was done
        incremental - use incremental rather than baseline wording
        audit       - counts are planned actions rather than completed ones
        same        - number of unchanged resources, omitted if None
        created, updated, deleted - counts to report
        to_delete   - number of deletions requested by the source; if more
                      than deleted then --delete is still needed
        """
        if audit:
            words = {
                'created': 'to create',
                'updated': 'to update',
                'deleted': 'to delete'
            }
        else:
            words = {
                'created': 'created',
                'updated': 'updated',
                'deleted': 'deleted'
            }
        if in_sync:
            # status rather than action
            status = "NO CHANGES" if incremental else "IN SYNC"
        elif audit:
            status = "NOT IN SYNC"
        elif to_delete > deleted:
            # will need --delete
            status = "PART APPLIED" if incremental else "PART SYNCED"
            words['deleted'] = 'to delete (--delete)'
            deleted = to_delete
        else:
            status = "CHANGES APPLIED" if incremental else "SYNCED"
        same = "" if (same is None) else ("same=%d, " % same)
        self.logger.warning("Status: %15s (%s%s=%d, %s=%d, %s=%d)" %
                            (status, same, words['created'], created,
                             words['updated'], updated,
                             words['deleted'], deleted))
def test05_path_from_uri(self):
    """Check Mapper.path_from_uri for plain paths and http(s) URIs"""
    # (input uri, expected local path), checked in this order
    cases = [
        ('a_file', 'a_file'),
        ('some_path/a_file', 'some_path/a_file'),
        ('http://localhost/p', 'localhost_p'),
        ('http://localhost:8888/p', 'localhost_8888_p'),
        ('https://localhost:8888/p', 'localhost_8888_p'),
        ('http://example.com', 'example.com'),
        ('http://example.com/', 'example.com'),
        ('http://example.com/ex1', 'example.com_ex1'),
        ('http://example.com/ex1/', 'example.com_ex1'),
    ]
    mapper = Mapper()
    for uri, expected_path in cases:
        self.assertEqual(mapper.path_from_uri(uri), expected_path)
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        """Set up a client with default settings.

        checksum - compare/calculate md5 checksums when True
        verbose  - print the first entries of sitemaps that are read
        dryrun   - log what would be done without changing local files
        """
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger("client")
        self.mapper = None
        self.sitemap_name = "sitemap.xml"
        self.dump_format = None
        self.exclude_patterns = []
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object

        Raises ClientFatalError if no mapper has been set yet.
        """
        if self.mapper is None:
            raise ClientFatalError("No mappings specified")
        return self.mapper.mappings

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    def sitemap_changeset_uri(self, basename):
        """Get full URI (filepath) for sitemap/changeset based on basename

        A basename that already looks like a URI (scheme prefix) or like an
        absolute path is returned unchanged; anything else is appended to
        the source URI of the first mapping.
        """
        if re.match(r"\w+:", basename):
            # looks like URI
            return basename
        if re.match(r"/", basename):
            # looks like full path
            return basename
        # build from mapping with name appended
        return self.mappings[0].src_uri + "/" + basename

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        return self.sitemap_changeset_uri(self.sitemap_name)

    @property
    def inventory(self):
        """Return inventory on disk based on current mappings

        Uses existing self.mapper settings. Raises ClientFatalError when
        there are no mappings.
        """
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        ib = InventoryBuilder(do_md5=self.checksum, mapper=self.mapper)
        ib.add_exclude_files(self.exclude_patterns)
        return ib.from_disk()

    def log_event(self, change):
        """Log a ResourceChange object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        """Synchronize the destination with the source sitemap, or audit it

        The audit (comparison of source and destination inventories) is
        always done; with audit_only the method returns after reporting
        the status. Otherwise updated and created resources are fetched
        and, when allow_deletion is set, deleted resources are removed.

        Raises ClientFatalError on missing mappings, an unreadable or empty
        source inventory, or an authority violation without noauth.
        """
        action = "audit" if (audit_only) else "sync"
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst
        # 1.a source inventory
        ib = InventoryBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_inventory = src_sitemap.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source inventory from %s (%s)"
                % (self.sitemap, str(e)))
        self.logger.info("Read source inventory, %d resources listed"
                         % (len(src_inventory)))
        if len(src_inventory) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_inventory.has_md5():
            # No point hashing local files if there is nothing to compare with
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source inventory")
        # 1.b destination inventory mapped back to source URIs
        ib.do_md5 = self.checksum
        dst_inventory = ib.from_disk()
        ### 2. Compare these inventorys respecting any comparison options
        (same, updated, deleted, created) = dst_inventory.compare(src_inventory)
        ### 3. Report status and planned actions
        status = " IN SYNC "
        if len(updated) > 0 or len(deleted) > 0 or len(created) > 0:
            status = "NOT IN SYNC"
        self.logger.warning(
            "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)"
            % (status, len(same), len(updated), len(deleted), len(created)))
        if audit_only:
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_inventory:
            if not uauth.has_authority_over(resource.uri):
                if self.noauth:
                    self.logger.info(
                        "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
                        % (self.sitemap, resource.uri))
                else:
                    raise ClientFatalError(
                        "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, "UPDATED")
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, "CREATED")
        for resource in deleted:
            uri = resource.uri
            if allow_deletion:
                file = self.mapper.src_to_dst(uri)
                if self.dryrun:
                    self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
                else:
                    os.unlink(file)
                    self.logger.info("deleted: %s -> %s" % (uri, file))
                    self.log_event(ResourceChange(resource=resource,
                                                  changetype="DELETED"))
            else:
                self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
        self.logger.debug("Completed " + action)

    def incremental(self, allow_deletion=False, changeset_uri=None):
        """Incremental synchronization driven by a changeset

        The changeset is taken from changeset_uri when given, otherwise
        from the "current" link found in the sitemap. Every change listed
        is then applied to the local copy.

        Raises ClientFatalError on missing mappings, unreadable sitemap or
        changeset, an empty changeset, or an authority violation without
        noauth; raises ClientError on an unknown change type.
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get URI of changeset, from sitemap or explicit
        if changeset_uri:
            # Translate as necessary using maps
            changeset = self.sitemap_changeset_uri(changeset_uri)
        else:
            # Get sitemap
            try:
                self.logger.info("Reading sitemap %s" % (self.sitemap))
                src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                      mapper=self.mapper)
                src_inventory = src_sitemap.read(uri=self.sitemap, index_only=True)
                self.logger.debug("Finished reading sitemap/sitemapindex")
            except Exception as e:
                raise ClientFatalError(
                    "Can't read source sitemap from %s (%s)"
                    % (self.sitemap, str(e)))
            # Extract changeset location
            # FIXME - need to completely rework the way we handle/store capabilities
            # extract_links() reads the .capabilities attribute itself, so it
            # must be given the inventory and not the capabilities mapping.
            links = self.extract_links(src_inventory)
            if "current" not in links:
                raise ClientFatalError(
                    "Failed to extract changeset location from sitemap %s"
                    % (self.sitemap))
            changeset = links["current"]
        ### 2. Read changeset from source
        try:
            self.logger.info("Reading changeset %s" % (changeset))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_changeset = src_sitemap.read(uri=changeset, changeset=True)
            self.logger.debug("Finished reading changeset")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source changeset from %s (%s)"
                % (changeset, str(e)))
        self.logger.info("Read source changeset, %d resources listed"
                         % (len(src_changeset)))
        if len(src_changeset) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_changeset.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source inventory")
        ### 3. Check that sitemap has authority over URIs listed
        # FIXME - What does authority mean for changeset? Here use both the
        # changeset URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(changeset)
        # Sitemap authority only applies when the changeset was found via it
        uauth_sm = None if changeset_uri else UrlAuthority(self.sitemap)
        for resource in src_changeset:
            if not uauth_cs.has_authority_over(resource.uri) and (
                    changeset_uri
                    or not uauth_sm.has_authority_over(resource.uri)):
                if self.noauth:
                    self.logger.warning(
                        "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                        % (changeset, resource.uri))
                else:
                    raise ClientFatalError(
                        "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (changeset, resource.uri))
        ### 4. Apply changes
        for resource in src_changeset:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if resource.changetype == "UPDATED":
                self.logger.info("updated: %s -> %s" % (uri, file))
                self.update_resource(resource, file, "UPDATED")
            elif resource.changetype == "CREATED":
                self.logger.info("created: %s -> %s" % (uri, file))
                self.update_resource(resource, file, "CREATED")
            elif resource.changetype == "DELETED":
                if allow_deletion:
                    if self.dryrun:
                        self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
                    else:
                        os.unlink(file)
                        self.logger.info("deleted: %s -> %s" % (uri, file))
                        self.log_event(ResourceChange(resource=resource,
                                                      changetype="DELETED"))
                else:
                    self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
            else:
                raise ClientError("Unknown change type %s" % (resource.changetype))
        self.logger.debug("Completed incremental stuff")

    def update_resource(self, resource, file, changetype=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource
        2. set the mtime of the local file to the resource timestamp

        In dryrun mode nothing is changed on disk and only the intended GET
        is logged. A failed GET raises ClientFatalError unless
        ignore_failures is set, in which case it is logged and skipped.

        FIXME - should perhaps warn if the LastModified from the GET
        response differs from (or is just earlier than) the lastmod we
        expected from the inventory.
        """
        if self.dryrun:
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
            return
        # Only create the directory when really writing: a dry run must not
        # touch the local filesystem.
        distutils.dir_util.mkpath(os.path.dirname(file))
        try:
            urllib.urlretrieve(resource.uri, file)
        except IOError as e:
            msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
            if self.ignore_failures:
                self.logger.warning(msg)
                return
            raise ClientFatalError(msg)
        if resource.timestamp is not None:
            unixtime = int(resource.timestamp)  # no fractional
            os.utime(file, (unixtime, unixtime))
        self.log_event(ResourceChange(resource=resource, changetype=changetype))

    def _print_first_entries(self, entries, num_entries):
        """Print the first entries of a listing (verbose console output)

        At most max_sitemap_entries entries are shown (100 if that is not
        set), preceded by a notice when the listing is truncated.
        """
        to_show = 100
        override_str = " (override with --max-sitemap-entries)"
        if self.max_sitemap_entries:
            to_show = self.max_sitemap_entries
            override_str = ""
        if num_entries > to_show:
            print("Showing first %d entries sorted by URI%s..."
                  % (to_show, override_str))
        for n, r in enumerate(entries, 1):
            print(r)
            if n >= to_show:
                break

    def parse_sitemap(self):
        """Read the sitemap, report its size and optionally list entries"""
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        i = s.read(self.sitemap)
        num_entries = len(i)
        self.logger.warning("Read sitemap with %d entries in %d sitemaps"
                            % (num_entries, s.sitemaps_created))
        if self.verbose:
            self._print_first_entries(i, num_entries)

    def explore_links(self):
        """Explore links from sitemap and between changesets"""
        seen = dict()
        is_changeset, links = self.explore_links_get(self.sitemap, seen=seen)
        starting_changeset = self.sitemap
        if not is_changeset:
            if "current" in links:
                starting_changeset = links["current"]
                is_changeset, links = self.explore_links_get(links["current"],
                                                             seen=seen)
        # Can we go backward?
        if "prev" in links and not links["prev"] in seen:
            self.logger.warning("Will follow links backwards...")
            while "prev" in links and not links["prev"] in seen:
                self.logger.warning('Following "prev" link')
                is_changeset, links = self.explore_links_get(links["prev"],
                                                             seen=seen)
        else:
            self.logger.warning("No links backwards")
        # Can we go forward?
        links = seen[starting_changeset]
        if "next" in links and not links["next"] in seen:
            self.logger.warning("Will follow links forwards...")
            while "next" in links and not links["next"] in seen:
                self.logger.warning('Following "next" link')
                is_changeset, links = self.explore_links_get(links["next"],
                                                             seen=seen)
        else:
            self.logger.warning("No links forwards")

    def explore_links_get(self, uri, seen=None):
        """Read the document at uri and record the links found in seen

        seen is a dict mapping each URI already read to its links; it is
        updated in place. When omitted a fresh dict is used for this call.

        Returns a tuple (whether a changeset was read, links dict).
        """
        if seen is None:
            # A new dict per call: a shared default would leak state between
            # calls, and a list (the old default) cannot be indexed by URI.
            seen = {}
        # Check we haven't been here before
        # NOTE(review): this only warns, the document is still read again;
        # callers in this class check `seen` before calling.
        if uri in seen:
            self.logger.warning("Already see %s, skipping" % (uri))
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap from %s ..." % (uri))
        i = s.read(uri, index_only=True)
        self.logger.warning("Read %s from %s" % (s.read_type, uri))
        links = self.extract_links(i, verbose=True)
        if "next" in links and links["next"] == uri:
            self.logger.warning('- self reference "next" link')
        seen[uri] = links
        return (s.changeset_read, links)

    def write_sitemap(self, outfile=None, capabilities=None, dump=None):
        """Write a sitemap for the inventory on disk

        The XML is printed when outfile is None, otherwise written using
        outfile as basename. A dump is also written when dump is given.
        """
        # Set up base_path->base_uri mappings, get inventory from disk
        i = self.inventory
        i.capabilities = capabilities
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if self.max_sitemap_entries is not None:
            s.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print(s.resources_as_xml(i, capabilities=i.capabilities))
        else:
            s.write(i, basename=outfile)
        self.write_dump_if_requested(i, dump)

    def changeset_sitemap(self, outfile=None, ref_sitemap=None,
                          newref_sitemap=None, empty=None,
                          capabilities=None, dump=None):
        """Write a changeset describing changes since a reference sitemap

        Unless empty is set, the reference sitemap is compared with either
        newref_sitemap or, if that is None, the inventory on disk. The XML
        is printed when outfile is None, otherwise written to outfile.
        """
        changeset = ChangeSet()
        changeset.capabilities = capabilities
        if not empty:
            # 1. Get and parse reference sitemap
            old_inv = self.read_reference_sitemap(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either
            # read that or build inventory from files on disk
            if newref_sitemap is None:
                # Get inventory from disk
                new_inv = self.inventory
            else:
                new_inv = self.read_reference_sitemap(newref_sitemap,
                                                      name="new reference")
            # 3. Calculate changeset
            (same, updated, deleted, created) = old_inv.compare(new_inv)
            changeset.add_changed_resources(updated, changetype="UPDATED")
            changeset.add_changed_resources(deleted, changetype="DELETED")
            changeset.add_changed_resources(created, changetype="CREATED")
        # 4. Write out changeset
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if self.max_sitemap_entries is not None:
            s.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print(s.resources_as_xml(changeset, changeset=True))
        else:
            s.write(changeset, basename=outfile, changeset=True)
        self.write_dump_if_requested(changeset, dump)

    def write_dump_if_requested(self, inventory, dump):
        """Write a dump of inventory to the file dump, unless dump is None"""
        if dump is None:
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(inventory=inventory, dumpfile=dump)

    def read_reference_sitemap(self, ref_sitemap, name="reference"):
        """Read reference sitemap and return the inventory

        The name parameter is just used in output messages to say what
        type of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile,
                          mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." % (name, ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning(
            "Read %s sitemap with %d entries in %d sitemaps"
            % (name, num_entries, sitemap.sitemaps_created))
        if self.verbose:
            self._print_first_entries(i, num_entries)
        return i

    def extract_links(self, rc, verbose=False):
        """Extract links from capabilities of an inventory or changeset

        rc is an object with a capabilities mapping of href to a dict that
        may have an "attributes" entry. Returns a dict from link type
        ("next", "prev", "current") to href for changeset capabilities.
        Raises ClientFatalError if a link type occurs more than once.

        FIXME - when we finalize the form of links this should probably
        go along with other capabilities functions somewhere general.
        """
        links = dict()
        for href in rc.capabilities.keys():
            capability = rc.capabilities[href]
            atts = capability.get("attributes")
            self.logger.debug("Capability: %s" % (str(capability)))
            if atts is None:
                continue
            # check is changeset rel and direction
            if "http://www.openarchives.org/rs/changeset" not in atts:
                continue
            for linktype in ["next", "prev", "current"]:
                if linktype in atts:
                    if linktype in links:
                        raise ClientFatalError(
                            "Duplicate link type %s, links to %s and %s"
                            % (linktype, links[linktype], href))
                    links[linktype] = href
                    if verbose:
                        self.logger.warning('- got "%s" link to %s'
                                            % (linktype, href))
        return links
def set_mappings(self, mappings):
    """Create a Mapper from the given mappings and store it on the client.

    The Mapper is built with use_default_path=True and replaces any
    mapper previously held in self.mapper.
    """
    mapper = Mapper(mappings, use_default_path=True)
    self.mapper = mapper
def test07_default_src_uri(self):
    """Check Mapper.default_src_uri with one, several and no mappings.

    The source URI of the first mapping is expected as the default; a
    Mapper without mappings is expected to raise MapperError.
    """
    for mapping_specs in (['a=b'], ['a=b', 'b=c']):
        mapper = Mapper(mapping_specs)
        self.assertEqual(mapper.default_src_uri(), 'a')
    empty_mapper = Mapper()
    self.assertRaises(MapperError, empty_mapper.default_src_uri)
class Client(object):
    """Implementation of a ResourceSync client

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        """Set up a client with default settings.

        checksum - compare/calculate md5 checksums when True
        verbose  - print the first entries of sitemaps that are read
        dryrun   - log what would be done without changing local files
        """
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('client')
        self.mapper = None
        self.sitemap_name = 'sitemap.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.status_file = '.resync-client-status.cfg'

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object

        Raises ClientFatalError if no mapper has been set yet.
        """
        if self.mapper is None:
            raise ClientFatalError("No mappings specified")
        return self.mapper.mappings

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings"""
        self.mapper = Mapper(mappings)

    def sitemap_changelist_uri(self, basename):
        """Get full URI (filepath) for sitemap/changelist based on basename

        A basename that already looks like a URI (scheme prefix) or like an
        absolute path is returned unchanged; anything else is appended to
        the source URI of the first mapping.
        """
        if re.match(r"\w+:", basename):
            # looks like URI
            return basename
        if re.match(r"/", basename):
            # looks like full path
            return basename
        # build from mapping with name appended
        return self.mappings[0].src_uri + '/' + basename

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings"""
        return self.sitemap_changelist_uri(self.sitemap_name)

    @property
    def resourcelist(self):
        """Return resourcelist on disk based on current mappings

        Uses existing self.mapper settings. Raises ClientFatalError when
        there are no mappings.
        """
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Build from disk
        ib = ResourceListBuilder(do_md5=self.checksum, mapper=self.mapper)
        ib.add_exclude_files(self.exclude_patterns)
        return ib.from_disk()

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis"""
        self.logger.debug("Event: " + repr(change))

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchronization or audit

        Both functions implemented in this routine because audit is a
        prerequisite for a baseline sync. With audit_only the method
        returns after reporting the status. After a sync the stored
        incremental status for the site is reset (or set to the "next"
        link if the source resourcelist has one).

        Raises ClientFatalError on missing mappings, an unreadable or empty
        source resourcelist, or an authority violation without noauth.
        """
        action = 'audit' if (audit_only) else 'baseline sync'
        self.logger.debug("Starting " + action)
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        ### 1. Get inventories from both src and dst
        # 1.a source resourcelist
        ib = ResourceListBuilder(mapper=self.mapper)
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_resourcelist = src_sitemap.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source resourcelist from %s (%s)"
                % (self.sitemap, str(e)))
        self.logger.info("Read source resourcelist, %d resources listed"
                         % (len(src_resourcelist)))
        if len(src_resourcelist) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_resourcelist.has_md5():
            # No point hashing local files if there is nothing to compare with
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source resourcelist")
        # 1.b destination resourcelist mapped back to source URIs
        ib.do_md5 = self.checksum
        dst_resourcelist = ib.from_disk()
        ### 2. Compare these resourcelists respecting any comparison options
        (same, updated, deleted, created) = dst_resourcelist.compare(src_resourcelist)
        ### 3. Report status and planned actions
        status = " IN SYNC "
        if len(updated) > 0 or len(deleted) > 0 or len(created) > 0:
            status = "NOT IN SYNC"
        self.logger.warning(
            "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)"
            % (status, len(same), len(updated), len(deleted), len(created)))
        if audit_only:
            self.logger.debug("Completed " + action)
            return
        ### 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_resourcelist:
            # With noauth set, resources outside the sitemap's authority are
            # accepted silently.
            if not uauth.has_authority_over(resource.uri) and not self.noauth:
                raise ClientFatalError(
                    "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                    % (self.sitemap, resource.uri))
        ### 5. Grab files to do sync
        for resource in updated:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'updated')
        for resource in created:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, file))
            self.update_resource(resource, file, 'created')
        for resource in deleted:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            self.delete_resource(resource, file, allow_deletion)
        ### 6. For sync reset any incremental status for site
        # (audit_only has already returned above)
        links = self.extract_links(src_resourcelist)
        if 'next' in links:
            self.write_incremental_status(self.sitemap, links['next'])
            self.logger.info("Written config with next incremental at %s"
                             % (links['next']))
        else:
            self.write_incremental_status(self.sitemap)
        self.logger.debug("Completed " + action)

    def incremental(self, allow_deletion=False, changelist_uri=None):
        """Incremental synchronization

        The changelist location is taken, in order of preference, from the
        status stored by the last run for this site, from changelist_uri,
        or from the "current" link in the sitemap. All changes listed are
        applied and, if anything changed, the "next" link of the changelist
        is stored for the following run.

        Raises ClientFatalError on missing mappings, unreadable sitemap or
        changelist, or an authority violation without noauth; raises
        ClientError on an unknown change type.
        """
        self.logger.debug("Starting incremental sync")
        ### 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        # Get current config
        inc_config_next = self.read_incremental_status(self.sitemap)
        ### 1. Get URI of changelist, from sitemap or explicit
        if inc_config_next is not None:
            # We have config from last run for this site
            changelist = inc_config_next
            self.logger.info("ChangeList location from last incremental run %s"
                             % (changelist))
        elif changelist_uri:
            # Translate as necessary using maps
            changelist = self.sitemap_changelist_uri(changelist_uri)
        else:
            # Get sitemap
            try:
                self.logger.info("Reading sitemap %s" % (self.sitemap))
                src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                      mapper=self.mapper)
                src_resourcelist = src_sitemap.read(uri=self.sitemap,
                                                    index_only=True)
                self.logger.debug("Finished reading sitemap/sitemapindex")
            except Exception as e:
                raise ClientFatalError(
                    "Can't read source sitemap from %s (%s)"
                    % (self.sitemap, str(e)))
            # Extract changelist location
            # FIXME - need to completely rework the way we handle/store capabilities
            links = self.extract_links(src_resourcelist)
            if 'current' not in links:
                raise ClientFatalError(
                    "Failed to extract changelist location from sitemap %s"
                    % (self.sitemap))
            changelist = links['current']
        ### 2. Read changelist from source
        try:
            self.logger.info("Reading changelist %s" % (changelist))
            src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
            src_changelist = src_sitemap.read(uri=changelist, changelist=True)
            self.logger.debug("Finished reading changelist")
        except Exception as e:
            raise ClientFatalError(
                "Can't read source changelist from %s (%s)"
                % (changelist, str(e)))
        self.logger.info("Read source changelist, %d resources listed"
                         % (len(src_changelist)))
        # An empty changelist is not an error: it is reported as "NO CHANGES"
        if self.checksum and not src_changelist.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source resourcelist")
        ### 3. Check that sitemap has authority over URIs listed
        # FIXME - What does authority mean for changelist? Here use both the
        # changelist URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(changelist)
        uauth_sm = None if changelist_uri else UrlAuthority(self.sitemap)
        for resource in src_changelist:
            if (not uauth_cs.has_authority_over(resource.uri) and
                    (changelist_uri or
                     not uauth_sm.has_authority_over(resource.uri))):
                # With noauth set the violation is accepted silently
                if not self.noauth:
                    raise ClientFatalError(
                        "Aborting as changelist (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                        % (changelist, resource.uri))
        ### 4. Apply changes
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_changelist:
            uri = resource.uri
            file = self.mapper.src_to_dst(uri)
            if resource.change == 'updated':
                self.logger.info("updated: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'updated')
                num_updated += 1
            elif resource.change == 'created':
                self.logger.info("created: %s -> %s" % (uri, file))
                self.update_resource(resource, file, 'created')
                num_created += 1
            elif resource.change == 'deleted':
                self.delete_resource(resource, file, allow_deletion)
                num_deleted += 1
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
        ### 5. Report status and planned actions
        num_changes = num_updated + num_deleted + num_created
        status = "NO CHANGES"
        if num_changes > 0:
            status = " CHANGES "
        self.logger.warning("Status: %s (updated=%d, deleted=%d, created=%d)"
                            % (status, num_updated, num_deleted, num_created))
        ### 6. Store next link if available
        if num_changes > 0:
            links = self.extract_links(src_changelist)
            if 'next' in links:
                self.write_incremental_status(self.sitemap, links['next'])
                self.logger.info("Written config with next incremental at %s"
                                 % (links['next']))
            else:
                self.logger.warning(
                    "Failed to extract next changelist location from changelist %s"
                    % (changelist))
        ### 7. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource
        2. set the mtime of the local file to the resource timestamp

        In dryrun mode nothing is changed on disk and only the intended GET
        is logged. A failed GET raises ClientFatalError unless
        ignore_failures is set, in which case it is logged and skipped.
        A size mismatch after download is only logged.

        FIXME - should perhaps warn if the LastModified from the GET
        response differs from (or is just earlier than) the lastmod we
        expected from the resourcelist.
        """
        if self.dryrun:
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
            return
        # Only create the directory when really writing: a dry run must not
        # touch the local filesystem.
        distutils.dir_util.mkpath(os.path.dirname(file))
        try:
            urllib.urlretrieve(resource.uri, file)
        except IOError as e:
            msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
            if self.ignore_failures:
                self.logger.warning(msg)
                return
            raise ClientFatalError(msg)
        # sanity check, skipped if the source gave no size (the %d format
        # below would fail on None)
        size = os.stat(file).st_size
        if resource.size is not None and resource.size != size:
            self.logger.info(
                "Downloaded size for %s of %d bytes does not match expected %d bytes"
                % (resource.uri, size, resource.size))
        # set timestamp if we have one
        if resource.timestamp is not None:
            unixtime = int(resource.timestamp)  # no fractional
            os.utime(file, (unixtime, unixtime))
        self.log_event(Resource(resource=resource, change=change))

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system

        Nothing is deleted unless allow_deletion is set, and in dryrun mode
        the deletion is only logged. A failed delete raises
        ClientFatalError unless ignore_failures is set.
        """
        uri = resource.uri
        if not allow_deletion:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
            return
        if self.dryrun:
            self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
            return
        try:
            os.unlink(file)
        except OSError as e:
            msg = "Failed to DELETE %s -> %s : %s" % (uri, file, str(e))
            if self.ignore_failures:
                self.logger.warning(msg)
                return
            raise ClientFatalError(msg)
        self.logger.info("deleted: %s -> %s" % (uri, file))
        self.log_event(Resource(resource=resource, change="deleted"))

    def _print_first_entries(self, entries, num_entries):
        """Print the first entries of a listing (verbose console output)

        At most max_sitemap_entries entries are shown (100 if that is not
        set), preceded by a notice when the listing is truncated.
        """
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if self.max_sitemap_entries:
            to_show = self.max_sitemap_entries
            override_str = ''
        if num_entries > to_show:
            print("Showing first %d entries sorted by URI%s..."
                  % (to_show, override_str))
        for n, r in enumerate(entries, 1):
            print(r)
            if n >= to_show:
                break

    def parse_sitemap(self):
        """Read the sitemap, report its size and optionally list entries"""
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        i = s.read(self.sitemap)
        num_entries = len(i)
        self.logger.warning("Read sitemap with %d entries in %d sitemaps"
                            % (num_entries, s.sitemaps_created))
        if self.verbose:
            self._print_first_entries(i, num_entries)

    def explore_links(self):
        """Explore links from sitemap and between changelists"""
        seen = dict()
        is_changelist, links = self.explore_links_get(self.sitemap, seen=seen)
        starting_changelist = self.sitemap
        if not is_changelist:
            if 'current' in links:
                starting_changelist = links['current']
                is_changelist, links = self.explore_links_get(links['current'],
                                                              seen=seen)
        # Can we go backward?
        if 'prev' in links and not links['prev'] in seen:
            self.logger.warning("Will follow links backwards...")
            while 'prev' in links and not links['prev'] in seen:
                self.logger.warning("Following \"prev\" link")
                is_changelist, links = self.explore_links_get(links['prev'],
                                                              seen=seen)
        else:
            self.logger.warning("No links backwards")
        # Can we go forward?
        links = seen[starting_changelist]
        if 'next' in links and not links['next'] in seen:
            self.logger.warning("Will follow links forwards...")
            while 'next' in links and not links['next'] in seen:
                self.logger.warning("Following \"next\" link")
                is_changelist, links = self.explore_links_get(links['next'],
                                                              seen=seen)
        else:
            self.logger.warning("No links forwards")

    def explore_links_get(self, uri, seen=None):
        """Read the document at uri and record the links found in seen

        seen is a dict mapping each URI already read to its links; it is
        updated in place. When omitted a fresh dict is used for this call.

        Returns a tuple (whether a changelist was read, links dict).
        """
        if seen is None:
            # A new dict per call: a shared default would leak state between
            # calls, and a list (the old default) cannot be indexed by URI.
            seen = {}
        # Check we haven't been here before
        # NOTE(review): this only warns, the document is still read again;
        # callers in this class check `seen` before calling.
        if uri in seen:
            self.logger.warning("Already see %s, skipping" % (uri))
        s = Sitemap(allow_multifile=self.allow_multifile)
        self.logger.info("Reading sitemap from %s ..." % (uri))
        i = s.read(uri, index_only=True)
        self.logger.warning("Read %s from %s" % (s.read_type, uri))
        links = self.extract_links(i, verbose=True)
        if 'next' in links and links['next'] == uri:
            self.logger.warning("- self reference \"next\" link")
        seen[uri] = links
        return (s.changelist_read, links)

    def write_sitemap(self, outfile=None, capabilities=None, dump=None):
        """Write a sitemap for the resourcelist on disk

        The XML is printed when outfile is None, otherwise written using
        outfile as basename. A dump is also written when dump is given.
        """
        # Set up base_path->base_uri mappings, get resourcelist from disk
        i = self.resourcelist
        i.capabilities = capabilities
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if self.max_sitemap_entries is not None:
            s.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print(s.resources_as_xml(i, capabilities=i.capabilities))
        else:
            s.write(i, basename=outfile)
        self.write_dump_if_requested(i, dump)

    def changelist_sitemap(self, outfile=None, ref_sitemap=None,
                           newref_sitemap=None, empty=None,
                           capabilities=None, dump=None):
        """Write a changelist describing changes since a reference sitemap

        Unless empty is set, the reference sitemap is compared with either
        newref_sitemap or, if that is None, the resourcelist on disk. The
        XML is printed when outfile is None, otherwise written to outfile.
        """
        changelist = ChangeList()
        changelist.capabilities = capabilities
        if not empty:
            # 1. Get and parse reference sitemap
            old_inv = self.read_reference_sitemap(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either
            # read that or build resourcelist from files on disk
            if newref_sitemap is None:
                # Get resourcelist from disk
                new_inv = self.resourcelist
            else:
                new_inv = self.read_reference_sitemap(newref_sitemap,
                                                      name='new reference')
            # 3. Calculate changelist
            (same, updated, deleted, created) = old_inv.compare(new_inv)
            changelist.add_changed_resources(updated, change='updated')
            changelist.add_changed_resources(deleted, change='deleted')
            changelist.add_changed_resources(created, change='created')
        # 4. Write out changelist
        s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile,
                    mapper=self.mapper)
        if self.max_sitemap_entries is not None:
            s.max_sitemap_entries = self.max_sitemap_entries
        if outfile is None:
            print(s.resources_as_xml(changelist, changelist=True))
        else:
            s.write(changelist, basename=outfile, changelist=True)
        self.write_dump_if_requested(changelist, dump)

    def write_dump_if_requested(self, resourcelist, dump):
        """Write a dump of resourcelist to the file dump, unless dump is None"""
        if dump is None:
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(resourcelist=resourcelist, dumpfile=dump)

    def read_reference_sitemap(self, ref_sitemap, name='reference'):
        """Read reference sitemap and return the resourcelist

        The name parameter is just used in output messages to say what
        type of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile,
                          mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." % (name, ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning(
            "Read %s sitemap with %d entries in %d sitemaps"
            % (name, num_entries, sitemap.sitemaps_created))
        if self.verbose:
            self._print_first_entries(i, num_entries)
        return i

    def extract_links(self, rc, verbose=False):
        """Extract links from capabilities of a resourcelist or changelist

        rc is an object with a capabilities mapping of href to a dict that
        may have an 'attributes' entry. Returns a dict from link type
        ('next', 'prev', 'current') to href for changelist capabilities.
        Raises ClientFatalError if a link type occurs more than once.

        FIXME - when we finalize the form of links this should probably
        go along with other capabilities functions somewhere general.
        """
        links = dict()
        for href in rc.capabilities.keys():
            capability = rc.capabilities[href]
            atts = capability.get('attributes')
            self.logger.debug("Capability: %s" % (str(capability)))
            if atts is None:
                continue
            # check is changelist rel and direction
            if 'http://www.openarchives.org/rs/changelist' not in atts:
                continue
            for linktype in ['next', 'prev', 'current']:
                if linktype in atts:
                    if linktype in links:
                        raise ClientFatalError(
                            "Duplicate link type %s, links to %s and %s"
                            % (linktype, links[linktype], href))
                    links[linktype] = href
                    if verbose:
                        self.logger.warning("- got \"%s\" link to %s"
                                            % (linktype, href))
        return links

    def write_incremental_status(self, site, next=None):
        """Store (or clear) the next changelist location for site

        The status file is read, the entry for site in its 'incremental'
        section is set to next (or removed when next is None), and the
        file is written back.

        FIXME - should have some file lock to avoid race
        """
        parser = ConfigParser.SafeConfigParser()
        parser.read(self.status_file)
        status_section = 'incremental'
        if not parser.has_section(status_section):
            parser.add_section(status_section)
        option = self.config_site_to_name(site)
        if next is None:
            parser.remove_option(status_section, option)
        else:
            parser.set(status_section, option, next)
        # The with statement closes the file, no explicit close() needed
        with open(self.status_file, 'wb') as configfile:
            parser.write(configfile)

    def read_incremental_status(self, site):
        """Read client status file and return next changelist URI for site

        Returns None when the status file has no 'incremental' section or
        no entry for this site.
        """
        parser = ConfigParser.SafeConfigParser()
        status_section = 'incremental'
        parser.read(self.status_file)
        try:
            return parser.get(status_section, self.config_site_to_name(site))
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            return None

    def config_site_to_name(self, name):
        """Return name with every non-word character replaced by '_'

        Used to turn a site URI into a config file option name.
        """
        return re.sub(r"[^\w]", '_', name)
class Client(object):
    """Implementation of a ResourceSync client.

    Logging is used for both console output and for detailed logs for
    automated analysis. Levels used:
      warning - usually shown to user
      info    - verbose output
      debug   - very verbose for automated analysis
    """

    def __init__(self, checksum=False, verbose=False, dryrun=False):
        """Create a client with default settings.

        checksum - request MD5 checksums when building lists from disk
        verbose  - print extra detail in the listing/reading helpers
        dryrun   - log what would be fetched/deleted instead of doing it
        """
        super(Client, self).__init__()
        self.checksum = checksum
        self.verbose = verbose
        self.dryrun = dryrun
        self.logger = logging.getLogger('client')
        self.mapper = None
        self.resource_list_name = 'resourcelist.xml'
        self.change_list_name = 'changelist.xml'
        self.dump_format = None
        self.exclude_patterns = []
        self.sitemap_name = None
        self.allow_multifile = True
        self.noauth = False
        self.max_sitemap_entries = None
        self.ignore_failures = False
        self.status_file = '.resync-client-status.cfg'
        # update_resource() and delete_resource() compare against this, so
        # make sure it exists even when they are called before a sync run
        self.last_timestamp = 0

    @property
    def mappings(self):
        """Provide access to mappings list within Mapper object.

        Raises ClientFatalError if no mapper has been set.
        """
        if self.mapper is None:
            raise ClientFatalError("No mappings specified")
        return self.mapper.mappings

    def set_mappings(self, mappings):
        """Build and set Mapper object based on input mappings."""
        self.mapper = Mapper(mappings, use_default_path=True)

    def sitemap_uri(self, basename):
        """Get full URI (filepath) for sitemap based on basename.

        A basename that looks like a URI (scheme prefix) or like a full
        path (leading '/') is returned unchanged. Anything else is appended
        to the source URI of the first mapping.
        """
        if re.match(r"\w+:", basename) or re.match(r"/", basename):
            return basename
        # build from mapping with name appended
        return self.mappings[0].src_uri + '/' + basename

    @property
    def sitemap(self):
        """Return the sitemap URI based on maps or explicit settings."""
        if self.sitemap_name is not None:
            return self.sitemap_name
        return self.sitemap_uri(self.resource_list_name)

    @property
    def resource_list(self):
        """Return resource_list on disk based on current mappings.

        Uses existing self.mapper settings and self.exclude_patterns.
        Raises ClientFatalError if there are no mappings.
        """
        # 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        # 1. Build from disk
        rlb = ResourceListBuilder(do_md5=self.checksum, mapper=self.mapper)
        rlb.add_exclude_files(self.exclude_patterns)
        return rlb.from_disk()

    def log_event(self, change):
        """Log a Resource object as an event for automated analysis."""
        self.logger.debug("Event: " + repr(change))

    def baseline_or_audit(self, allow_deletion=False, audit_only=False):
        """Baseline synchronization or audit.

        Both functions are implemented in this routine because audit is a
        prerequisite for a baseline sync. In the case of baseline sync the
        last timestamp seen is recorded as client state.

        allow_deletion - actually delete local files missing from source
        audit_only     - report differences and return without changes

        Raises ClientFatalError if there are no mappings, the source
        resource list cannot be read or is empty, or (unless self.noauth)
        the sitemap lists a resource it has no authority over.
        """
        action = 'audit' if audit_only else 'baseline sync'
        self.logger.debug("Starting " + action)
        # 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        # 1. Get inventories from both src and dst
        # 1.a source resource_list
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            src_resource_list = ResourceList(allow_multifile=self.allow_multifile,
                                             mapper=self.mapper)
            src_resource_list.read(uri=self.sitemap)
            self.logger.debug("Finished reading sitemap")
        except Exception as e:
            raise ClientFatalError("Can't read source resource_list from %s (%s)" %
                                   (self.sitemap, str(e)))
        self.logger.info("Read source resource_list, %d resources listed" %
                         (len(src_resource_list)))
        if len(src_resource_list) == 0:
            raise ClientFatalError("Aborting as there are no resources to sync")
        if self.checksum and not src_resource_list.has_md5():
            # Note: this switches checksums off for the rest of this object's life
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source resource_list")
        # 1.b destination resource_list mapped back to source URIs
        rlb = ResourceListBuilder(mapper=self.mapper)
        rlb.do_md5 = self.checksum
        dst_resource_list = rlb.from_disk()
        # 2. Compare these resource_lists respecting any comparison options
        (same, updated, deleted, created) = dst_resource_list.compare(src_resource_list)
        # 3. Report status and planned actions
        num_changes = len(created) + len(updated) + len(deleted)
        self.log_status(in_sync=(num_changes == 0),
                        audit=True, same=len(same), created=len(created),
                        updated=len(updated), deleted=len(deleted))
        if audit_only or num_changes == 0:
            self.logger.debug("Completed " + action)
            return
        # 4. Check that sitemap has authority over URIs listed
        uauth = UrlAuthority(self.sitemap)
        for resource in src_resource_list:
            # With self.noauth set, out-of-authority resources are accepted silently
            if not uauth.has_authority_over(resource.uri) and not self.noauth:
                raise ClientFatalError("Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (self.sitemap, resource.uri))
        # 5. Grab files to do sync
        delete_msg = (", and delete %d resources" % len(deleted)) if allow_deletion else ''
        self.logger.warning("Will GET %d resources%s" %
                            (len(created) + len(updated), delete_msg))
        self.last_timestamp = 0
        num_created = 0
        num_updated = 0
        num_deleted = 0
        for resource in created:
            uri = resource.uri
            dst_path = self.mapper.src_to_dst(uri)
            self.logger.info("created: %s -> %s" % (uri, dst_path))
            num_created += self.update_resource(resource, dst_path, 'created')
        for resource in updated:
            uri = resource.uri
            dst_path = self.mapper.src_to_dst(uri)
            self.logger.info("updated: %s -> %s" % (uri, dst_path))
            num_updated += self.update_resource(resource, dst_path, 'updated')
        for resource in deleted:
            dst_path = self.mapper.src_to_dst(resource.uri)
            num_deleted += self.delete_resource(resource, dst_path, allow_deletion)
        # 6. Store last timestamp to allow incremental sync
        if not audit_only and self.last_timestamp > 0:
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        # 7. Done
        self.log_status(in_sync=(num_changes == 0),
                        same=len(same), created=num_created,
                        updated=num_updated, deleted=num_deleted)
        self.logger.debug("Completed %s" % (action))

    def incremental(self, allow_deletion=False, change_list_uri=None, from_datetime=None):
        """Incremental synchronization.

        allow_deletion  - actually delete local files for 'deleted' changes
        change_list_uri - explicit change list location; otherwise the
                          default change list name is resolved via mappings
        from_datetime   - datetime string to start from; otherwise the
                          stored client state for this sitemap is used

        Raises ClientFatalError for bad or missing starting point, an
        unreadable change list, a change without timestamp, or (unless
        self.noauth) a change outside the authority of the change list.
        Raises ClientError for an unknown change type.
        """
        self.logger.debug("Starting incremental sync")
        # 0. Sanity checks
        if len(self.mappings) < 1:
            raise ClientFatalError("No source to destination mapping specified")
        from_timestamp = None
        if from_datetime is not None:
            try:
                from_timestamp = str_to_datetime(from_datetime)
            except ValueError:
                raise ClientFatalError("Bad datetime in --from (%s)" % from_datetime)
        # 1. Work out where to start from
        if from_timestamp is None:
            from_timestamp = ClientState().get_state(self.sitemap)
            if from_timestamp is None:
                raise ClientFatalError("No stored timestamp for this site, and no explicit --from")
        # 2. Get URI of change list, from sitemap or explicit
        if change_list_uri:
            # Translate as necessary using maps
            change_list = self.sitemap_uri(change_list_uri)
        else:
            # Try default name
            change_list = self.sitemap_uri(self.change_list_name)
        # 3. Read change list from source
        try:
            self.logger.info("Reading change list %s" % (change_list))
            src_change_list = ChangeList()
            src_change_list.read(uri=change_list)
            self.logger.debug("Finished reading change list")
        except Exception as e:
            raise ClientFatalError("Can't read source change list from %s (%s)" %
                                   (change_list, str(e)))
        self.logger.info("Read source change list, %d changes listed" %
                         (len(src_change_list)))
        if self.checksum and not src_change_list.has_md5():
            self.checksum = False
            self.logger.info("Not calculating checksums on destination as not present in source change list")
        # Check all changes have timestamp and record last
        self.last_timestamp = 0
        for resource in src_change_list:
            if resource.timestamp is None:
                # Report the URI of the offending change (previously this
                # referenced an undefined local and raised NameError)
                raise ClientFatalError("Aborting - missing timestamp for change in %s" %
                                       (resource.uri))
            if resource.timestamp > self.last_timestamp:
                self.last_timestamp = resource.timestamp
        # 4. Check that the change list has authority over URIs listed
        # FIXME - What does authority mean for change list? Here use both the
        # change list URI and, if we used it, the sitemap URI
        uauth_cs = UrlAuthority(change_list)
        uauth_sm = None
        if not change_list_uri:
            uauth_sm = UrlAuthority(self.sitemap)
        for resource in src_change_list:
            if uauth_cs.has_authority_over(resource.uri):
                continue
            if uauth_sm is not None and uauth_sm.has_authority_over(resource.uri):
                continue
            # With self.noauth set, out-of-authority resources are accepted silently
            if not self.noauth:
                raise ClientFatalError("Aborting as change list (%s) mentions resource at a location it does not have authority over (%s), override with --noauth" % (change_list, resource.uri))
        # 5. Prune entries before starting timestamp and dupe changes for a resource
        num_skipped = src_change_list.prune_before(from_timestamp)
        if num_skipped > 0:
            self.logger.info("Skipped %d changes before %s" %
                             (num_skipped, datetime_to_str(from_timestamp)))
        num_dupes = src_change_list.prune_dupes()
        if num_dupes > 0:
            self.logger.info("Removed %d prior changes" % (num_dupes))
        # 6. Apply changes at same time or after from_timestamp
        self.logger.info("Applying %d changes" % (len(src_change_list)))
        num_updated = 0
        num_deleted = 0
        num_created = 0
        for resource in src_change_list:
            uri = resource.uri
            dst_path = self.mapper.src_to_dst(uri)
            if resource.change == 'updated':
                self.logger.info("updated: %s -> %s" % (uri, dst_path))
                self.update_resource(resource, dst_path, 'updated')
                num_updated += 1
            elif resource.change == 'created':
                self.logger.info("created: %s -> %s" % (uri, dst_path))
                self.update_resource(resource, dst_path, 'created')
                num_created += 1
            elif resource.change == 'deleted':
                self.delete_resource(resource, dst_path, allow_deletion)
                num_deleted += 1
            else:
                raise ClientError("Unknown change type %s" % (resource.change))
        # 7. Report status and planned actions
        self.log_status(in_sync=((num_updated + num_deleted + num_created) == 0),
                        incremental=True, created=num_created,
                        updated=num_updated, deleted=num_deleted)
        # 8. Record last timestamp we have seen
        if self.last_timestamp > 0:
            ClientState().set_state(self.sitemap, self.last_timestamp)
            self.logger.info("Written last timestamp %s for incremental sync" %
                             (datetime_to_str(self.last_timestamp)))
        # 9. Done
        self.logger.debug("Completed incremental sync")

    def update_resource(self, resource, file, change=None):
        """Update resource from uri to file on local system.

        Update means three things:
        1. GET the resource into file (creating parent directories)
        2. set the mtime of file from resource.timestamp, if there is one
        3. check that the download matches the expected length and,
           when checksums are enabled, the expected MD5

        Also updates self.last_timestamp if the timestamp of this resource
        is later than the current value.

        In dry-run mode nothing on disk is touched: the intended GET is
        logged and 0 is returned.

        Returns the number of resources updated/created (0 or 1). Raises
        ClientFatalError if the GET fails and self.ignore_failures is not
        set; with self.ignore_failures the failure is logged and 0 returned.
        """
        if self.dryrun:
            self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
            return 0
        # 1. GET
        distutils.dir_util.mkpath(os.path.dirname(file))
        try:
            urllib.urlretrieve(resource.uri, file)
        except IOError as e:
            msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
            if self.ignore_failures:
                self.logger.warning(msg)
                # Callers add the return value to a counter, so it must be a number
                return 0
            raise ClientFatalError(msg)
        # 2. set timestamp if we have one
        if resource.timestamp is not None:
            unixtime = int(resource.timestamp)  # no fractional
            os.utime(file, (unixtime, unixtime))
            if resource.timestamp > self.last_timestamp:
                self.last_timestamp = resource.timestamp
        self.log_event(Resource(resource=resource, change=change))
        # 3. sanity check (only against values the source actually supplied)
        length = os.stat(file).st_size
        if resource.length is not None and resource.length != length:
            self.logger.info("Downloaded size for %s of %d bytes does not match expected %d bytes" %
                             (resource.uri, length, resource.length))
        if self.checksum and resource.md5 is not None:
            file_md5 = compute_md5_for_file(file)
            if resource.md5 != file_md5:
                self.logger.info("MD5 mismatch for %s, got %s but expected %s" %
                                 (resource.uri, file_md5, resource.md5))
        return 1

    def delete_resource(self, resource, file, allow_deletion=False):
        """Delete copy of resource in file on local system.

        Will only actually do the deletion if allow_deletion is True and
        this is not a dry run. Regardless of whether the deletion occurs,
        self.last_timestamp will be updated if the resource.timestamp is
        later than the current value.

        A failed unlink is logged as a warning and is not fatal.

        Returns the number of files actually deleted (0 or 1).
        """
        num_deleted = 0
        uri = resource.uri
        if (resource.timestamp is not None and
                resource.timestamp > self.last_timestamp):
            self.last_timestamp = resource.timestamp
        if not allow_deletion:
            self.logger.info("nodelete: would delete %s (--delete to enable)" % uri)
            return num_deleted
        if self.dryrun:
            self.logger.info("dryrun: would delete %s -> %s" % (uri, file))
            return num_deleted
        try:
            os.unlink(file)
            num_deleted += 1
        except OSError as e:
            self.logger.warning("Failed to DELETE %s -> %s : %s" % (uri, file, str(e)))
        # The deletion event is logged whether or not the unlink succeeded
        self.logger.info("deleted: %s -> %s" % (uri, file))
        self.log_event(Resource(resource=resource, change="deleted"))
        return num_deleted

    def parse_document(self):
        """Parse any ResourceSync document and show information.

        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied. Raises ClientFatalError if
        the document cannot be read.
        """
        s = Sitemap()
        self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
        try:
            doc = s.parse_xml(urllib.urlopen(self.sitemap))
        except IOError as e:
            raise ClientFatalError("Cannot read document (%s)" % str(e))
        num_entries = len(doc.resources)
        capability = '(unknown capability)'
        if 'capability' in doc.md:
            capability = doc.md['capability']
        print("Parsed %s document with %d entries" % (capability, num_entries))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                print("Showing first %d entries sorted by URI%s..." % (to_show, override_str))
            n = 0
            for resource in doc:
                print('[%d] %s' % (n, str(resource)))
                n += 1
                if n >= to_show:
                    break

    def explore(self):
        """Explore capabilities of a server interactively.

        Will use sitemap URI taken either from explicit self.sitemap_name
        or derived from the mappings supplied. Loops over explore_uri()
        until the user enters 'q'.
        """
        uri = None
        if self.sitemap_name is not None:
            uri = self.sitemap
            print("Taking location from --sitemap option")
            acceptable_capabilities = None  # ie. any
        elif len(self.mappings) > 0:
            pu = urlparse.urlparse(self.mappings[0].src_uri)
            uri = urlparse.urlunparse(
                [pu[0], pu[1], '/.well-known/resourcesync', '', '', ''])
            print("Will look for discovery information based on mappings")
            acceptable_capabilities = ['capabilitylist', 'capabilitylistindex']
        else:
            raise ClientFatalError("Neither explicit sitemap nor mapping specified")
        inp = None
        while inp != 'q':
            print("")
            (uri, acceptable_capabilities, inp) = self.explore_uri(
                uri, acceptable_capabilities)

    def explore_uri(self, uri, caps):
        """Interactive exploration of document at uri.

        Will flag warnings if the document is not of a type listed in caps
        (caps of None means any type is acceptable).

        Returns a tuple (next_uri, next_caps, inp) where inp is the raw
        user input; on quit this is ('', '', 'q'). Raises ClientFatalError
        if the document cannot be read.
        """
        s = Sitemap()
        print("Reading %s" % (uri))
        try:
            doc = s.parse_xml(urllib.urlopen(uri))
        except IOError as e:
            raise ClientFatalError("Cannot read %s (%s)" % (uri, str(e)))
        num_entries = len(doc.resources)
        capability = '(unknown capability)'
        if 'capability' in doc.md:
            capability = doc.md['capability']
        if s.parsed_index:
            capability += 'index'
        print("Parsed %s document with %d entries:" % (capability, num_entries))
        if caps is not None and capability not in caps:
            print("WARNING - expected a %s document" % (','.join(caps)))
        to_show = num_entries
        if num_entries > 21:
            to_show = 20
        # What entries are allowed?
        # FIXME - not complete
        entry_caps = []  # empty means no expectation for this document type
        if capability == 'capabilitylistindex':
            entry_caps = ['capabilitylist']
        elif capability == 'capabilitylist':
            entry_caps = ['resourcelist', 'changelist', 'resourcedump',
                          'changedump', 'changelistindex']
        elif capability == 'changelistindex':
            entry_caps = ['changelist']
        n = 0
        options = {}
        for r in doc.resources:
            if n >= to_show:
                print("(not showing remaining %d entries)" % (num_entries - n))
                break
            n += 1
            options[str(n)] = r
            print("[%d] %s" % (n, r.uri))
            if r.capability is not None:
                warning = ''
                if entry_caps and r.capability not in entry_caps:
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print(" %s%s" % (r.capability, warning))
            elif len(entry_caps) == 1:
                # Only one possibility, so record it on the entry
                r.capability = entry_caps[0]
                print(" capability not specified, should be %s" % (r.capability))
        while True:
            inp = raw_input("Follow [number or q(uit)]?")
            if inp in options:
                break
            if inp == 'q':
                return ('', '', inp)
        chosen = options[inp]
        if chosen.capability is not None:
            next_caps = [chosen.capability]
        else:
            # Entry type unknown: accept whatever this document may link to,
            # or anything at all if there is no expectation
            next_caps = entry_caps or None
        return (chosen.uri, next_caps, inp)

    def write_resource_list(self, outfile=None, links=None, dump=None):
        """Write a resource list sitemap for files on local disk.

        The list is built from disk via self.resource_list. It is printed
        if outfile is None, otherwise written using outfile as basename.
        A dump is also written if dump is not None.
        """
        rl = self.resource_list
        rl.ln = links
        kwargs = {'pretty_xml': True,
                  'allow_multifile': self.allow_multifile,
                  'mapper': self.mapper}
        if self.max_sitemap_entries is not None:
            kwargs['max_sitemap_entries'] = self.max_sitemap_entries
        if outfile is None:
            print(rl.as_xml(**kwargs))
        else:
            rl.write(basename=outfile, **kwargs)
        self.write_dump_if_requested(rl, dump)

    def write_change_list(self, outfile=None, ref_sitemap=None, newref_sitemap=None,
                          empty=None, links=None, dump=None):
        """Write a change list, to outfile or STDOUT.

        Unless empty is set, changes are calculated by comparing the
        reference resource list read from ref_sitemap with either the
        resource list read from newref_sitemap or, if that is None, the
        resource list built from files on disk.
        """
        cl = ChangeList(ln=links)
        if not empty:
            # 1. Get and parse reference sitemap
            old_rl = self.read_reference_resource_list(ref_sitemap)
            # 2. Depending on whether a newref_sitemap was specified, either
            # read that or build resource_list from files on disk
            if newref_sitemap is None:
                new_rl = self.resource_list
            else:
                new_rl = self.read_reference_resource_list(newref_sitemap,
                                                           name='new reference')
            # 3. Calculate change list
            (same, updated, deleted, created) = old_rl.compare(new_rl)
            cl.add_changed_resources(updated, change='updated')
            cl.add_changed_resources(deleted, change='deleted')
            cl.add_changed_resources(created, change='created')
        # 4. Write out change list
        kwargs = {'pretty_xml': True, 'mapper': self.mapper}
        if self.max_sitemap_entries is not None:
            kwargs['max_sitemap_entries'] = self.max_sitemap_entries
        if outfile is None:
            print(cl.as_xml(**kwargs))
        else:
            cl.write(basename=outfile, **kwargs)
        self.write_dump_if_requested(cl, dump)

    def write_capability_list(self, capabilities=None, outfile=None, links=None):
        """Write a Capability List to outfile or STDOUT.

        capabilities, if given, maps capability name to URI.
        """
        capl = CapabilityList(ln=links)
        if capabilities is not None:
            for name in capabilities:
                capl.add_capability(name=name, uri=capabilities[name])
        kwargs = {'pretty_xml': True}
        if outfile is None:
            print(capl.as_xml(**kwargs))
        else:
            capl.write(basename=outfile, **kwargs)

    def write_capability_list_index(self, capability_lists=None, outfile=None, links=None):
        """Write a Capability List Index to outfile or STDOUT.

        capability_lists, if given, is an iterable of capability list URIs.
        """
        capli = CapabilityListIndex(ln=links)
        if capability_lists is not None:
            for uri in capability_lists:
                capli.add_capability_list(uri)
        kwargs = {'pretty_xml': True}
        if outfile is None:
            print(capli.as_xml(**kwargs))
        else:
            capli.write(basename=outfile, **kwargs)

    def write_dump_if_requested(self, resource_list, dump):
        """Write a dump of resource_list to dump, doing nothing if dump is None."""
        if dump is None:
            return
        self.logger.info("Writing dump to %s..." % (dump))
        d = Dump(format=self.dump_format)
        d.write(resource_list=resource_list, dumpfile=dump)

    def read_reference_resource_list(self, ref_sitemap, name='reference'):
        """Read reference resource list and return the ResourceList object.

        The name parameter is used only in output messages to say what
        type of resource list is being read.
        """
        rl = ResourceList()
        self.logger.info("Reading reference %s resource list from %s ..." %
                         (name, ref_sitemap))
        rl.mapper = self.mapper
        rl.read(uri=ref_sitemap, index_only=(not self.allow_multifile))
        num_entries = len(rl.resources)
        self.logger.info("Read %s resource list with %d entries in %d sitemaps" %
                         (name, num_entries, rl.num_files))
        if self.verbose:
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ''
            if num_entries > to_show:
                print("Showing first %d entries sorted by URI%s..." % (to_show, override_str))
            n = 0
            for r in rl.resources:
                print(r)
                n += 1
                if n >= to_show:
                    break
        return rl

    def log_status(self, in_sync=True, incremental=False, audit=False,
                   same=None, created=0, updated=0, deleted=0):
        """Write log message regarding status in standard form.

        Split off so that messages from baseline/audit/incremental are
        written in a consistent form. The message is logged at warning
        level; the same count is omitted when same is None.
        """
        if incremental:
            status = "NO CHANGES" if in_sync else "CHANGES"
        else:
            status = "IN SYNC" if in_sync else ("NOT IN SYNC" if audit else "SYNCED")
        if audit:
            words = {'created': 'to create',
                     'updated': 'to update',
                     'deleted': 'to delete'}
        else:
            words = {'created': 'created',
                     'updated': 'updated',
                     'deleted': 'deleted'}
        same = "" if same is None else ("same=%d, " % same)
        self.logger.warning("Status: %11s (%s%s=%d, %s=%d, %s=%d)" %
                            (status, same,
                             words['created'], created,
                             words['updated'], updated,
                             words['deleted'], deleted))
def test_dst_to_src(self):
    """Check that destination paths map back to the matching source URIs."""
    mapper = Mapper('http://e.org/p', '/tmp/q')
    # (destination path, expected source URI), checked in this order
    expectations = (
        ('/tmp/q', 'http://e.org/p'),
        ('/tmp/q/bb', 'http://e.org/p/bb'),
        ('/tmp/q/bb/cc', 'http://e.org/p/bb/cc'),
    )
    for dst_path, src_uri in expectations:
        self.assertEqual(mapper.dst_to_src(dst_path), src_uri)