def build_resource_list(self, paths=None, set_path=False):
    """Build a ResourceList by scanning files on local disk.

    paths - optional comma separated string of paths to scan. It is split
        on ',' and handed to the builder; when None the builder is called
        with paths=None, which is documented as falling back to the paths
        in the current mappings.
    set_path - copied onto the builder's set_path attribute; documented as
        making each resource carry its path information, as needed when
        the list is the basis for creating a dump.

    Returns the ResourceList, with allow_multifile, pretty_xml and mapper
    copied from this object (and max_sitemap_entries when it is not None).

    Raises ClientFatalError if self.mapper holds no mappings.
    """
    # Guard: nothing can be scanned without at least one mapping
    if len(self.mapper) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    # A comma separated string becomes a list, None is passed through
    path_list = None if paths is None else paths.split(',')

    # Scan the disk
    builder = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
    builder.set_path = set_path
    builder.add_exclude_files(self.exclude_patterns)
    resource_list = builder.from_disk(paths=path_list)

    # Carry client settings over to the new list
    resource_list.allow_multifile = self.allow_multifile
    resource_list.pretty_xml = self.pretty_xml
    resource_list.mapper = self.mapper
    if self.max_sitemap_entries is not None:
        resource_list.max_sitemap_entries = self.max_sitemap_entries
    return resource_list
def sync_audit(map, counter):
    """Run resync audit.

    Compares the resource list found by the client for the given mappings
    with the resource list built from local disk, passes the record ids of
    created, updated and deleted resources to update_counter() and returns
    the number of resources in each category.

    map - mappings handed to Client.set_mappings()
    counter - passed through unchanged to update_counter()

    Returns a dict with integer counts under the keys 'same', 'updated',
    'deleted' and 'created'.
    """
    def record_ids(resources):
        # The record id is the last path segment of the URI. Indexing with
        # [-1] (rather than [1]) means a URI without any '/' yields the
        # whole URI instead of raising IndexError.
        return [resource.uri.rsplit('/', 1)[-1] for resource in resources]

    client = Client()
    # ignore fail to continue running, log later
    client.ignore_failures = True
    client.set_mappings(map)
    # init_logging(verbose=True)
    src_resource_list = client.find_resource_list()
    rlb = ResourceListBuilder(mapper=client.mapper)
    dst_resource_list = rlb.from_disk()
    # Compare these resource lists respecting any comparison options
    (same, updated, deleted, created) = dst_resource_list.compare(src_resource_list)
    result = dict(created=record_ids(created),
                  updated=record_ids(updated),
                  deleted=record_ids(deleted))
    update_counter(counter, result)
    return dict(same=len(same),
                updated=len(updated),
                deleted=len(deleted),
                created=len(created))
def test2_pretty_output(self):
    """Check the pretty-printed XML of a resource list built from dir1."""
    builder = ResourceListBuilder()
    builder.mapper = Mapper(["http://example.org/t", "resync/test/testdata/dir1"])
    resource_list = builder.from_disk()
    # Clear the modified timestamp so the output is predictable
    resource_list.md["modified"] = None
    expected_xml = (
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n'
        '<rs:md capability="resourcelist" />\n'
        '<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:md length="20" /></url>\n'
        '<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:md length="45" /></url>\n'
        '</urlset>'
    )
    actual_xml = resource_list.as_xml(pretty_xml=True)
    self.assertEqual(actual_xml, expected_xml)
def test05_from_disk_paths(self):
    """Check that from_disk() restricts the scan to the paths given.

    Covers an empty path list, a single directory, two directories under a
    mapper rooted one level higher, and a path naming a single file.
    """
    rlb = ResourceListBuilder()
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    # no path, should get no resources
    rl = rlb.from_disk(paths=[])
    self.assertEqual(len(rl), 0)
    # full path, 2 resources
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
    self.assertEqual(len(rl), 2)
    # new object with mapper covering larger space of disk
    rlb = ResourceListBuilder(set_path=True)
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata'])
    # same path with 2 resources
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
    self.assertEqual(len(rl), 2)
    # two paths, 3 resources in total
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1',
                              'resync/test/testdata/dir2'])
    self.assertEqual(len(rl), 3)
    # path that is just a single file
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1/file_a'])
    self.assertEqual(len(rl), 1)
    rli = iter(rl)
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    r = next(rli)
    self.assertTrue(r is not None)
    self.assertEqual(r.uri, 'http://example.org/t/dir1/file_a')
    self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, 20)
    self.assertEqual(r.path, 'resync/test/testdata/dir1/file_a')
def test04_data(self):
    """Check file_a's metadata when both path and md5 are requested."""
    file_a_uri = 'http://example.org/t/file_a'
    builder = ResourceListBuilder(set_path=True, set_md5=True)
    builder.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # Look the resource up by URI rather than relying on iteration order
    resource = resource_list.resources.get(file_a_uri)
    self.assertTrue(resource is not None)
    self.assertEqual(resource.uri, 'http://example.org/t/file_a')
    self.assertEqual(resource.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(resource.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
    self.assertEqual(resource.path, 'resync/test/testdata/dir1/file_a')
def test4_data(self):
    """Check file_a's metadata with do_md5 on and from_disk(set_path=True)."""
    file_a_uri = "http://example.org/t/file_a"
    builder = ResourceListBuilder(do_md5=True)
    builder.mapper = Mapper(["http://example.org/t", "resync/test/testdata/dir1"])
    resource_list = builder.from_disk(set_path=True)
    self.assertEqual(len(resource_list), 2)
    # Look the resource up by URI rather than relying on iteration order
    file_a = resource_list.resources.get(file_a_uri)
    self.assertTrue(file_a is not None)
    self.assertEqual(file_a.uri, "http://example.org/t/file_a")
    self.assertEqual(file_a.lastmod, "2012-07-25T17:13:46Z")
    self.assertEqual(file_a.md5, "a/Jv1mYBtSjS4LR+qoft/Q==")
    self.assertEqual(file_a.path, "resync/test/testdata/dir1/file_a")
def test04_data(self):
    """Check file_a's metadata when both path and md5 are requested."""
    file_a_uri = 'http://example.org/t/file_a'
    builder = ResourceListBuilder(set_path=True, set_md5=True)
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # Look the resource up by URI rather than relying on iteration order
    resource = resource_list.resources.get(file_a_uri)
    self.assertTrue(resource is not None)
    self.assertEqual(resource.uri, 'http://example.org/t/file_a')
    self.assertEqual(resource.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(resource.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
    self.assertEqual(resource.path, 'tests/testdata/dir1/file_a')
def test04_data(self):
    """Check file_a's metadata when path and an md5 hash are requested."""
    file_a_uri = 'http://example.org/t/file_a'
    builder = ResourceListBuilder(set_path=True, set_hashes=['md5'])
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # Look the resource up by URI rather than relying on iteration order
    resource = resource_list.resources.get(file_a_uri)
    self.assertTrue(resource is not None)
    self.assertEqual(resource.uri, 'http://example.org/t/file_a')
    self.assertEqual(resource.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(resource.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
    self.assertEqual(resource.path, 'tests/testdata/dir1/file_a')
def resource_list(self):
    """Return the resource_list for the files on disk under the current mappings.

    The list is built by a ResourceListBuilder that uses self.mapper, has
    do_md5 set from self.checksum and excludes files matching
    self.exclude_patterns.

    Raises ClientFatalError if self.mappings is empty.
    """
    # Guard: nothing can be scanned without at least one mapping
    if len(self.mappings) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    # Scan the disk
    builder = ResourceListBuilder(do_md5=self.checksum, mapper=self.mapper)
    builder.add_exclude_files(self.exclude_patterns)
    scanned = builder.from_disk()
    return scanned
def test06_odd_file_names(self):
    """Verify we can read unicode file names properly."""
    builder = ResourceListBuilder()
    builder.mapper = Mapper(['x:', 'tests/testdata/odd_file_names'])
    resource_list = builder.from_disk(paths=['tests/testdata/odd_file_names'])
    # Collect the URIs that were found
    uris = [resource.uri for resource in resource_list]
    # (uri, should it be present?) pairs, checked in this order
    expectations = (
        ('x:/not_odd.txt', True),
        ('x:/with&ersand.txt', True),
        ('x:/with spaces.txt', True),
        # File names for accented chars are represented with combining
        # chars, so the precomposed forms must not show up
        (u'x:/Pi\u006e\u0303a_Colada.txt', True),
        (u'x:/Pi\u00f1a_Colada.txt', False),
        (u'x:/A_\u0041\u0303_tilde.txt', True),
        (u'x:/A_\u00c3_tilde.txt', False),
        # Snowman is single char
        # NOTE(review): unlike the other URIs this one has no '/' after
        # 'x:' -- confirm that is intended
        (u'x:snowman_\u2603.txt', False),
    )
    for uri, should_be_present in expectations:
        if should_be_present:
            self.assertTrue(uri in uris)
        else:
            self.assertFalse(uri in uris)
def test03_set_hashes(self):
    """Check that set_hashes=['md5'] fills in md5 for both files in dir1."""
    builder = ResourceListBuilder(set_hashes=['md5'])
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # (uri, lastmod, md5, length) in the order the list yields them
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z',
         '6bf26fd66601b528d2e0b47eaa87edfd', 20),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z',
         '452e54bdae1626ac5d6e7be81b39de21', 45),
    )
    resources = iter(resource_list)
    for uri, lastmod, md5, length in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        self.assertEqual(resource.md5, md5)
        self.assertEqual(resource.length, length)
        # Path is not requested here, so it stays unset
        self.assertEqual(resource.path, None)
def test03_set_md5(self):
    """Check that set_md5=True fills in md5 for both files in dir1."""
    rlb = ResourceListBuilder(set_md5=True)
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    rl = rlb.from_disk()
    self.assertEqual(len(rl), 2)
    rli = iter(rl)
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    r = next(rli)
    self.assertEqual(r.uri, 'http://example.org/t/file_a')
    self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(r.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
    self.assertEqual(r.length, 20)
    self.assertEqual(r.path, None)
    r = next(rli)
    self.assertEqual(r.uri, 'http://example.org/t/file_b')
    self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
    self.assertEqual(r.md5, 'RS5Uva4WJqxdbnvoGzneIQ==')
    self.assertEqual(r.length, 45)
    self.assertEqual(r.path, None)
def test02_no_length(self):
    """Check that set_length=False leaves length unset for both files in dir1."""
    rlb = ResourceListBuilder(set_length=False)
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    rl = rlb.from_disk()
    self.assertEqual(len(rl), 2)
    rli = iter(rl)
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    r = next(rli)
    self.assertEqual(r.uri, 'http://example.org/t/file_a')
    self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, None)
    self.assertEqual(r.path, None)
    r = next(rli)
    self.assertEqual(r.uri, 'http://example.org/t/file_b')
    self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, None)
    self.assertEqual(r.path, None)
def test03_set_md5(self):
    """Check that set_md5=True fills in md5 for both files in dir1."""
    builder = ResourceListBuilder(set_md5=True)
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # (uri, lastmod, md5, length) in the order the list yields them
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z',
         'a/Jv1mYBtSjS4LR+qoft/Q==', 20),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z',
         'RS5Uva4WJqxdbnvoGzneIQ==', 45),
    )
    resources = iter(resource_list)
    for uri, lastmod, md5, length in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        self.assertEqual(resource.md5, md5)
        self.assertEqual(resource.length, length)
        # Path is not requested here, so it stays unset
        self.assertEqual(resource.path, None)
def test02_no_length(self):
    """Check that set_length=False leaves length unset for both files in dir1."""
    builder = ResourceListBuilder(set_length=False)
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # (uri, lastmod) in the order the list yields them
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z'),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z'),
    )
    resources = iter(resource_list)
    for uri, lastmod in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        # Neither md5, length nor path is requested, so all stay unset
        self.assertEqual(resource.md5, None)
        self.assertEqual(resource.length, None)
        self.assertEqual(resource.path, None)
def test3_with_md5(self):
    """Check that the XML output carries md5 hashes when do_md5 is set.

    The serialized resource list is searched for the <loc>, <lastmod> and
    <rs:md> elements of file_a and file_b, with any lastmod value accepted.
    """
    rlb = ResourceListBuilder(do_md5=True)
    rlb.mapper = Mapper(["http://example.org/t", "resync/test/testdata/dir1"])
    rl = rlb.from_disk()
    xml = rl.as_xml()
    # Raw strings keep the regex escapes (\w, \:, \-, \+) intact without
    # relying on Python passing through invalid string escape sequences,
    # which is deprecated. The pattern text itself is unchanged.
    # The '+' in file_a's md5 must be escaped to match literally.
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:md hash="md5:a/Jv1mYBtSjS4LR\+qoft/Q==" length="20" />',
            xml,
        ),
    )
    self.assertNotEqual(
        None,
        re.search(
            r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:md hash="md5:RS5Uva4WJqxdbnvoGzneIQ==" length="45" />',
            xml,
        ),
    )
def test01_simple_scan(self):
    """Check a default scan of dir1: two resources plus list timestamps."""
    builder = ResourceListBuilder()
    builder.mapper = Mapper(['http://example.org/t', 'tests/testdata/dir1'])
    resource_list = builder.from_disk()
    self.assertEqual(len(resource_list), 2)
    # (uri, lastmod, length) in the order the list yields them
    expected = (
        ('http://example.org/t/file_a', '2012-07-25T17:13:46Z', 20),
        ('http://example.org/t/file_b', '2001-09-09T01:46:40Z', 45),
    )
    resources = iter(resource_list)
    for uri, lastmod, length in expected:
        resource = next(resources)
        self.assertEqual(resource.uri, uri)
        self.assertEqual(resource.lastmod, lastmod)
        # md5 and path are not requested by default, so both stay unset
        self.assertEqual(resource.md5, None)
        self.assertEqual(resource.length, length)
        self.assertEqual(resource.path, None)
    # Make sure at and completed were set
    self.assertTrue(resource_list.md_at is not None)
    self.assertTrue(resource_list.md_completed is not None)
def test01_simple_scan(self):
    """Check a default scan of dir1: two resources plus list timestamps."""
    rlb = ResourceListBuilder()
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    rl = rlb.from_disk()
    self.assertEqual(len(rl), 2)
    rli = iter(rl)
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    r = next(rli)
    self.assertEqual(r.uri, 'http://example.org/t/file_a')
    self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, 20)
    self.assertEqual(r.path, None)
    r = next(rli)
    self.assertEqual(r.uri, 'http://example.org/t/file_b')
    self.assertEqual(r.lastmod, '2001-09-09T01:46:40Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, 45)
    self.assertEqual(r.path, None)
    # Make sure at and completed were set
    self.assertTrue(rl.md_at is not None)
    self.assertTrue(rl.md_completed is not None)
def build_resource_list(self, paths=None, set_path=False):
    """Scan local disk and return the matching ResourceList.

    paths - optional comma separated string of paths to scan. It is split
        on ',' before being given to the builder; None is passed on as is,
        which is documented as falling back to the paths in the current
        mappings.
    set_path - assigned to the builder's set_path attribute; documented as
        making each resource carry its path information, as needed when
        the list is the basis for creating a dump.

    The returned ResourceList has allow_multifile, pretty_xml and mapper
    copied from this object, and max_sitemap_entries as well when that is
    not None.

    Raises ClientFatalError if self.mapper holds no mappings.
    """
    # Step 0: refuse to run without mappings, then normalise paths
    if len(self.mapper) < 1:
        raise ClientFatalError("No source to destination mapping specified")
    scan_paths = paths
    if scan_paths is not None:
        scan_paths = scan_paths.split(',')

    # Step 1: build the list from disk
    scanner = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
    scanner.set_path = set_path
    scanner.add_exclude_files(self.exclude_patterns)
    found = scanner.from_disk(paths=scan_paths)

    # Step 2: apply defaults and overrides from this client
    found.allow_multifile = self.allow_multifile
    found.pretty_xml = self.pretty_xml
    found.mapper = self.mapper
    if self.max_sitemap_entries is not None:
        found.max_sitemap_entries = self.max_sitemap_entries
    return found
def test05_from_disk_paths(self):
    """Check that from_disk() restricts the scan to the paths given.

    Covers an empty path list, a single directory, two directories under a
    mapper rooted one level higher, and a path naming a single file.
    """
    rlb = ResourceListBuilder()
    rlb.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    # no path, should get no resources
    rl = rlb.from_disk(paths=[])
    self.assertEqual(len(rl), 0)
    # full path, 2 resources
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
    self.assertEqual(len(rl), 2)
    # new object with mapper covering larger space of disk
    rlb = ResourceListBuilder(set_path=True)
    rlb.mapper = Mapper(['http://example.org/t', 'resync/test/testdata'])
    # same path with 2 resources
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1'])
    self.assertEqual(len(rl), 2)
    # two paths, 3 resources in total
    rl = rlb.from_disk(
        paths=['resync/test/testdata/dir1', 'resync/test/testdata/dir2'])
    self.assertEqual(len(rl), 3)
    # path that is just a single file
    rl = rlb.from_disk(paths=['resync/test/testdata/dir1/file_a'])
    self.assertEqual(len(rl), 1)
    rli = iter(rl)
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    r = next(rli)
    self.assertTrue(r is not None)
    self.assertEqual(r.uri, 'http://example.org/t/dir1/file_a')
    self.assertEqual(r.lastmod, '2012-07-25T17:13:46Z')
    self.assertEqual(r.md5, None)
    self.assertEqual(r.length, 20)
    self.assertEqual(r.path, 'resync/test/testdata/dir1/file_a')
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Run a baseline synchronization, or only the audit that precedes it.

    Both live in one routine because the audit is a prerequisite for a
    baseline sync. The source resource list is read from self.sitemap and
    compared with a resource list built from local disk; the result is
    reported through self.log_status(). With audit_only set, or when
    nothing differs, the method stops there. Otherwise created and updated
    resources go through self.update_resource() and deleted ones through
    self.delete_resource(), and for a baseline sync the last timestamp
    seen is recorded as client state.

    allow_deletion - passed on to self.delete_resource() and used to word
        the planned-actions log message
    audit_only - report differences only; also skips the unsafe-mapping
        check

    Raises ClientFatalError when there are no mappings, when the mappings
    are unsafe (sync only), when the source resource list cannot be read
    or is empty, or when the sitemap lacks authority over a listed URI
    (unless self.noauth is set).
    """
    if audit_only:
        action = 'audit'
    else:
        action = 'baseline sync'
    self.logger.debug("Starting " + action)

    # --- 0. Sanity checks
    if len(self.mapper) < 1:
        raise ClientFatalError("No source to destination mapping specified")
    if not audit_only and self.mapper.unsafe():
        raise ClientFatalError(
            "Source to destination mappings unsafe: %s" % str(self.mapper))

    # --- 1. Get inventories from both src and dst
    # 1.a source resource list; any failure here becomes a fatal error
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        src_resource_list = ResourceList(
            allow_multifile=self.allow_multifile, mapper=self.mapper)
        src_resource_list.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as err:
        raise ClientFatalError(
            "Can't read source resource list from %s (%s)" %
            (self.sitemap, str(err)))
    self.logger.info("Read source resource list, %d resources listed" %
                     (len(src_resource_list)))
    if len(src_resource_list) == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_resource_list.has_md5():
        # Checksums cannot be compared if the source does not supply them
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source resource list"
        )
    # 1.b destination resource list mapped back to source URIs
    builder = ResourceListBuilder(set_md5=self.checksum, mapper=self.mapper)
    dst_resource_list = builder.from_disk()

    # --- 2. Compare these resource lists respecting any comparison options
    same, updated, deleted, created = dst_resource_list.compare(
        src_resource_list)

    # --- 3. Report status and planned actions
    nothing_to_do = (len(updated) + len(deleted) + len(created) == 0)
    self.log_status(in_sync=nothing_to_do,
                    audit=True,
                    same=len(same),
                    created=len(created),
                    updated=len(updated),
                    deleted=len(deleted))
    if audit_only or nothing_to_do:
        self.logger.debug("Completed " + action)
        return

    # --- 4. Check that sitemap has authority over URIs listed
    if not self.noauth:
        authority = UrlAuthority(self.sitemap, strict=self.strictauth)
        for resource in src_resource_list:
            if authority.has_authority_over(resource.uri):
                continue
            raise ClientFatalError(
                "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                % (self.sitemap, resource.uri))

    # --- 5. Grab files to do sync
    if allow_deletion:
        delete_msg = ", and delete %d resources" % len(deleted)
    else:
        delete_msg = ''
    self.logger.warning("Will GET %d resources%s" %
                        (len(created) + len(updated), delete_msg))
    self.last_timestamp = 0
    num_created = 0
    num_updated = 0
    num_deleted = 0
    for resource in created:
        dst_path = self.mapper.src_to_dst(resource.uri)
        self.logger.info("created: %s -> %s" % (resource.uri, dst_path))
        num_created += self.update_resource(resource, dst_path, 'created')
    for resource in updated:
        dst_path = self.mapper.src_to_dst(resource.uri)
        self.logger.info("updated: %s -> %s" % (resource.uri, dst_path))
        num_updated += self.update_resource(resource, dst_path, 'updated')
    for resource in deleted:
        dst_path = self.mapper.src_to_dst(resource.uri)
        num_deleted += self.delete_resource(resource, dst_path,
                                            allow_deletion)

    # --- 6. Store last timestamp to allow incremental sync
    if not audit_only and self.last_timestamp > 0:
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" %
                         (datetime_to_str(self.last_timestamp)))

    # --- 7. Done
    self.log_status(in_sync=(len(updated) + len(deleted) + len(created) == 0),
                    same=len(same),
                    created=num_created,
                    updated=num_updated,
                    deleted=num_deleted,
                    to_delete=len(deleted))
    self.logger.debug("Completed %s" % (action))
def baseline_or_audit(self, allow_deletion=False, audit_only=False):
    """Run a baseline synchronization, or only the audit that precedes it.

    Both live in one routine because the audit is a prerequisite for a
    baseline sync. The source resource_list is read from self.sitemap and
    compared with a resource_list built from local disk; the result is
    reported through self.log_status(). With audit_only set, or when
    nothing differs, the method stops there. Otherwise created and updated
    resources go through self.update_resource() and deleted ones through
    self.delete_resource(), and for a baseline sync the last timestamp
    seen is recorded as client state.

    allow_deletion - passed on to self.delete_resource() and used to word
        the planned-actions log message
    audit_only - report differences only

    Raises ClientFatalError when there are no mappings, when the source
    resource_list cannot be read or is empty, or when the sitemap lacks
    authority over a listed URI (unless self.noauth is set).
    """
    if audit_only:
        action = 'audit'
    else:
        action = 'baseline sync'
    self.logger.debug("Starting " + action)

    # --- 0. Sanity checks
    if len(self.mappings) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    # --- 1. Get inventories from both src and dst
    # 1.a source resource_list; any failure here becomes a fatal error
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        src_resource_list = ResourceList(
            allow_multifile=self.allow_multifile, mapper=self.mapper)
        src_resource_list.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as err:
        raise ClientFatalError(
            "Can't read source resource_list from %s (%s)" %
            (self.sitemap, str(err)))
    self.logger.info("Read source resource_list, %d resources listed" %
                     (len(src_resource_list)))
    if len(src_resource_list) == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_resource_list.has_md5():
        # Checksums cannot be compared if the source does not supply them
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source resource_list")
    # 1.b destination resource_list mapped back to source URIs
    builder = ResourceListBuilder(mapper=self.mapper)
    builder.do_md5 = self.checksum
    dst_resource_list = builder.from_disk()

    # --- 2. Compare these resource_lists respecting any comparison options
    same, updated, deleted, created = dst_resource_list.compare(
        src_resource_list)

    # --- 3. Report status and planned actions
    nothing_to_do = (len(updated) + len(deleted) + len(created) == 0)
    self.log_status(in_sync=nothing_to_do,
                    audit=True,
                    same=len(same),
                    created=len(created),
                    updated=len(updated),
                    deleted=len(deleted))
    if audit_only or nothing_to_do:
        self.logger.debug("Completed " + action)
        return

    # --- 4. Check that sitemap has authority over URIs listed
    authority = UrlAuthority(self.sitemap)
    for resource in src_resource_list:
        # With noauth set a missing authority is silently tolerated
        if not authority.has_authority_over(resource.uri) and not self.noauth:
            raise ClientFatalError(
                "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                % (self.sitemap, resource.uri))

    # --- 5. Grab files to do sync
    if allow_deletion:
        delete_msg = ", and delete %d resources" % len(deleted)
    else:
        delete_msg = ''
    self.logger.warning("Will GET %d resources%s" %
                        (len(created) + len(updated), delete_msg))
    self.last_timestamp = 0
    num_created = 0
    num_updated = 0
    num_deleted = 0
    for resource in created:
        dst_path = self.mapper.src_to_dst(resource.uri)
        self.logger.info("created: %s -> %s" % (resource.uri, dst_path))
        num_created += self.update_resource(resource, dst_path, 'created')
    for resource in updated:
        dst_path = self.mapper.src_to_dst(resource.uri)
        self.logger.info("updated: %s -> %s" % (resource.uri, dst_path))
        num_updated += self.update_resource(resource, dst_path, 'updated')
    for resource in deleted:
        dst_path = self.mapper.src_to_dst(resource.uri)
        num_deleted += self.delete_resource(resource, dst_path,
                                            allow_deletion)

    # --- 6. Store last timestamp to allow incremental sync
    if not audit_only and self.last_timestamp > 0:
        ClientState().set_state(self.sitemap, self.last_timestamp)
        self.logger.info("Written last timestamp %s for incremental sync" %
                         (datetime_to_str(self.last_timestamp)))

    # --- 7. Done
    self.log_status(in_sync=(len(updated) + len(deleted) + len(created) == 0),
                    same=len(same),
                    created=num_created,
                    updated=num_updated,
                    deleted=num_deleted)
    self.logger.debug("Completed %s" % (action))
def base_line(self, unzipdir):
    """
    Synchronize the unzipped contents of a resource dump with the local resources.

    NOTE: this method is not fully implemented. It parses the manifest,
    looks up the destination, builds a resource list from disk and compares
    it with the manifest, then raises NotImplementedError. The statements
    placed after that raise inside the try block are never executed.

    :param unzipdir: the directory of the unzipped packed contents; the
        manifest is read from "manifest.xml" inside this directory.
    :return: nothing is explicitly returned
    :raises NotImplementedError: once the comparison with the manifest has
        been made; it is not an AssertionError, so the first handler below
        does not catch it (presumably it propagates to the caller -- confirm
        against SitemapParseError's base class).
    """
    manifest_file_name = os.path.join(unzipdir, "manifest.xml")
    try:
        sitemap = Sitemap()
        manifest_doc = sitemap.parse_xml(fh=manifest_file_name)
        # the manifest_doc is a resync.resource_container.ResourceContainer
        capability = manifest_doc.capability
        # A wrong capability is reported through the AssertionError handler
        # below. NOTE(review): assert statements are stripped under
        # `python -O`, which would disable this check.
        assert capability == CAPA_RESOURCEDUMP_MANIFEST, "Capability is not %s but %s" % (
            CAPA_RESOURCEDUMP_MANIFEST, capability)
        self.status = Status.parsed
        self.__inform_sitemap_received__(capability, manifest_file_name)

        # Find the local destination for this pack and map it to its base URI
        config = Config()
        netloc = config.boolean_prop(Config.key_use_netloc, False)
        base_uri, destination = DestinationMap().find_destination(
            self.pack_uri, netloc=netloc)
        assert destination is not None, "Found no destination folder in DestinationMap"
        mapper = Mapper((base_uri, destination))
        rlb = ResourceListBuilder(mapper=mapper)
        dst_resource_list = rlb.from_disk()
        # Compares on uri
        same, updated, deleted, created = dst_resource_list.compare(
            manifest_doc)

        # Deliberate stop: everything below up to the except clauses is
        # unreachable work in progress.
        raise NotImplementedError("This class is not fully implemented.")

        print(len(same), len(updated), len(deleted), len(created))
        print("same")
        for resource in same:
            print(resource)
        print("updated")
        for resource in updated:
            print(resource)
        print("deleted")
        for resource in deleted:
            print(resource)
        print("created")
        for resource in created:
            print(resource)
            base_uri, local_path = DestinationMap().find_local_path(
                resource.uri)
            print(base_uri, local_path)

    except AssertionError as err:
        # Raised by the capability and destination checks above
        self.logger.debug("%s Error: %s" % (self.pack_uri, str(err)))
        self.status = Status.parse_error
        self.exceptions.append(err)
    except SitemapParseError as err:
        self.logger.debug("%s Unreadable source: %s" %
                          (self.source_uri, str(err)))
        self.status = Status.parse_error
        self.exceptions.append(err)

    # Runs after either handler above, so it replaces Status.parse_error
    # with the final processed status.
    self.status = Status.processed_with_exceptions if self.has_exceptions(
    ) else Status.processed