def write_static_inventory(self):
    """Write the inventory to the filesystem as a static sitemap.

    The sitemap is first generated in ``Source.TEMP_FILE_PATH``; the old
    sitemap files in ``Source.STATIC_FILE_PATH`` are then removed, the new
    ones moved into place and the temporary directory deleted. Finally
    observers are notified with an UPDATED change describing the sitemap.
    """
    started = time.time()
    tmp_dir = Source.TEMP_FILE_PATH
    static_dir = Source.STATIC_FILE_PATH

    # Build the new sitemap inside the temporary directory
    self.ensure_temp_dir(tmp_dir)
    inventory = self.generate()
    sitemap_path = tmp_dir + "/sitemap.xml"
    writer = Sitemap()
    writer.max_sitemap_entries = self.config['max_sitemap_entries']
    writer.mapper = Mapper([self.source.base_uri, tmp_dir])
    writer.write(inventory, sitemap_path)

    # Swap the freshly written files in for the old ones, then clean up
    self.rm_sitemap_files(static_dir)
    self.mv_sitemap_files(tmp_dir, static_dir)
    shutil.rmtree(tmp_dir)
    finished = time.time()

    # Log how long the write took and how many resources the source holds
    sitemap_size = self.compute_sitemap_size(static_dir)
    log_data = {
        'time': (finished - started),
        'no_resources': self.source.resource_count
    }
    self.logger.info("Wrote static sitemap inventory. %s" % log_data)

    # Tell observers that the sitemap itself has been updated
    sitemap_resource = ResourceChange(self.uri,
                                      size=sitemap_size,
                                      timestamp=started)
    sm_write_end = ResourceChange(resource=sitemap_resource,
                                  changetype="UPDATED")
    self.source.notify_observers(sm_write_end)
def test1_set_with_repeats(self):
    """Five changes are counted even though URIs 'a' and 'b' repeat."""
    changes = ChangeSet()
    for uri, stamp in (('a', 1), ('b', 1), ('c', 1), ('a', 2), ('b', 2)):
        changes.add(ResourceChange(uri, timestamp=stamp))
    self.assertEqual(len(changes), 5, "5 changes in changeset")
def test3_changeset(self):
    """Five changes with distinct URIs and timestamps give length 5."""
    changes = ChangeSet()
    # 'a'..'e' paired with timestamps 1..5
    for stamp, uri in enumerate('abcde', 1):
        changes.add(ResourceChange(uri, timestamp=stamp))
    self.assertEqual(len(changes), 5, "5 things in src")
def test2_with_repeats_again(self):
    """Adding a second change for an existing URI grows the ChangeSet."""
    first = ResourceChange(uri='a', size=1)
    second = ResourceChange(uri='b', size=2)
    changes = ChangeSet()
    for change in (first, second):
        changes.add(change)
    self.assertEqual(len(changes), 2)
    # A further ResourceChange reusing URI 'a' is still counted
    repeat = ResourceChange(uri='a', size=10)
    changes.add(repeat)
    self.assertEqual(len(changes), 3)
def test4_iter(self):
    """Iterating a ChangeSet yields the changes, first 'a' and last 'd'."""
    changes = ChangeSet()
    # 'a'..'d' paired with timestamps 1..4
    for stamp, uri in enumerate('abcd', 1):
        changes.add(ResourceChange(uri, timestamp=stamp))
    seen = [change for change in changes]
    self.assertEqual(len(seen), 4)
    self.assertEqual(seen[0].uri, 'a')
    self.assertEqual(seen[3].uri, 'd')
def test2_create_with_change(self):
    """A ResourceChange built from a Resource keeps its change attributes."""
    base = Resource('a', timestamp=1234)
    change = ResourceChange(resource=base, changeid=89, changetype='UP')
    for attr, expected in (('changeid', 89), ('changetype', 'UP')):
        self.assertEqual(getattr(change, attr), expected)
    # The wrapped Resource and the ResourceChange still compare equal
    self.assertEqual(base, change)
def _delete_resource(self,
                     identifier,
                     timestamp,
                     notify_observers=True,
                     oai=True):
    """Delete a given resource, notify observers.

    With ``oai`` true the resource key is looked up in (and removed from)
    ``self.oaimapping`` and the method first calls itself with
    ``oai=False`` to delete the matching GetRecord metadata URL. With
    ``oai`` false the key is that GetRecord URL built from
    ``self.client.endpoint`` and ``identifier``.
    """
    if not oai:
        key = (self.client.endpoint +
               "?verb=GetRecord&metadataPrefix=oai_dc&identifier=" +
               identifier)
    else:
        key = self.oaimapping[identifier]
        del self.oaimapping[identifier]
        # delete metadata resource url
        self._delete_resource(identifier,
                              timestamp,
                              notify_observers=notify_observers,
                              oai=False)
    removed = self.resource(key)
    del self._repository[key]
    removed.timestamp = timestamp
    if not notify_observers:
        return
    event = ResourceChange(resource=removed, changetype="DELETED")
    self.notify_observers(event)
    self.logger.debug("Event: %s" % repr(event))
def test_02_resource_created(self):
    """Serialize a CREATED ResourceChange, alone and in an Inventory, then parse it back."""
    created = ResourceChange('http://example.org/r/1',
                             1234,
                             9999,
                             'Q2hlY2sgSW50ZWdyaXR5IQ==',
                             changetype='CREATED')

    # Single resource as XML
    expected_url_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
    url_xml = Sitemap().resource_as_xml(created)
    self.assertEqual(url_xml, expected_url_xml)

    # The same resource wrapped in an inventory
    inventory = Inventory()
    inventory.add(created)
    inv_xml = Sitemap().resources_as_xml(inventory)
    expected_inv_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url></urlset>"
    self.assertEqual(inv_xml, expected_inv_xml)

    # Round trip: parse the inventory XML back into ResourceChange objects
    parser = Sitemap()
    parser.resource_class = ResourceChange
    parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(parsed), 1)
    first = next(iter(parsed))
    self.assertEqual(first.uri, 'http://example.org/r/1')
    self.assertEqual(first.timestamp, 1234)
    self.assertEqual(first.changetype, 'CREATED')
def update_resource(self, resource, file, changetype=None):
    """Update resource from uri to file on local system.

    Update means two things:

    1. GET the resource at ``resource.uri`` and store it at ``file``,
       creating the parent directory of ``file`` if necessary.
    2. If ``resource.timestamp`` is set, set the access and modification
       times of ``file`` to that timestamp, truncated to whole seconds.

    In dry-run mode (``self.dryrun``) the intended GET is only logged:
    nothing is downloaded, no directory is created and no event is logged.

    TODO: perhaps use, or at least warn if different from, the
    Last-Modified of the GET response; maybe also warn if that differs
    from (or is just earlier than) the lastmod expected from the inventory.

    Parameters:
        resource: object with ``uri`` and ``timestamp`` attributes.
        file: local destination path.
        changetype: change type recorded in the logged ResourceChange.

    Raises:
        ClientFatalError: if the GET fails with an IOError and
            ``self.ignore_failures`` is false. When it is true the failure
            is logged as a warning and the method returns without setting
            the file times or logging an event.
    """
    if self.dryrun:
        self.logger.info("dryrun: would GET %s --> %s" % (resource.uri, file))
        return
    # Create the destination directory only when a file is really going to
    # be written, so that a dry run leaves the filesystem untouched.
    distutils.dir_util.mkpath(os.path.dirname(file))
    try:
        urllib.urlretrieve(resource.uri, file)
    except IOError as e:
        msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
        if self.ignore_failures:
            self.logger.warning(msg)
            return
        raise ClientFatalError(msg)
    if resource.timestamp is not None:
        unixtime = int(resource.timestamp)  # no fractional seconds
        os.utime(file, (unixtime, unixtime))
    self.log_event(ResourceChange(resource=resource, changetype=changetype))
def test1_create(self):
    """A ResourceChange built from a Resource exposes its uri and timestamp."""
    base = Resource('a', timestamp=1234)
    change = ResourceChange(resource=base)
    for attr, expected in (('uri', 'a'), ('timestamp', 1234)):
        self.assertEqual(getattr(change, attr), expected)
    # The wrapped Resource and the ResourceChange compare equal
    self.assertEqual(base, change)
    # NOTE(review): the '|' below is unescaped, so the pattern is an
    # alternation and only the "[a " prefix is effectively checked at the
    # start of the string -- confirm the intended str() format before
    # tightening this assertion.
    rendered = str(change)
    self.assertTrue(re.match(r"\[a | 1969", rendered))
def test_01_resource_str(self):
    """Serialize a ResourceChange that was given no changetype."""
    plain = ResourceChange('http://example.org/r/1', 1234, 9999,
                           'Q2hlY2sgSW50ZWdyaXR5IQ==')
    expected_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><lastmod>1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
    actual_xml = Sitemap().resource_as_xml(plain)
    self.assertEqual(actual_xml, expected_xml)
def _update_resource(self, basename, identifier, timestamp, oai=True):
    """Update a resource, notify observers.

    Stores ``timestamp`` for ``basename`` in ``self._repository``, notifies
    observers with an UPDATED change and logs it. With ``oai`` true the
    method then calls itself once more (``oai=False``) for the matching
    GetRecord metadata URL built from ``self.client.endpoint``.
    """
    self._repository[basename] = {'timestamp': timestamp}
    updated = self.resource(basename)
    event = ResourceChange(resource=updated, changetype="UPDATED")
    self.notify_observers(event)
    self.logger.debug("Event: %s" % repr(event))
    if not oai:
        return
    # update metadata resource url
    metadata_url = (self.client.endpoint +
                    "?verb=GetRecord&metadataPrefix=oai_dc&identifier=" +
                    identifier)
    self._update_resource(metadata_url, identifier, timestamp, oai=False)
def _create_resource(self,
                     basename=None,
                     identifier=None,
                     timestamp=None,
                     notify_observers=True,
                     oai=True):
    """Create a new resource, add it to the source, notify observers.

    Parameters:
        basename: key under which the resource is stored in
            ``self._repository``.
        identifier: identifier used to build the GetRecord metadata URL
            and as the key in ``self.oaimapping``; used only when ``oai``
            is true.
        timestamp: timestamp stored for the resource. Defaults to the
            current time, taken when the method is called.
        notify_observers: when true, observers are notified with a CREATED
            change and the change is logged at debug level.
        oai: when true, the method also creates (via a nested call with
            ``oai=False``) the matching GetRecord metadata resource with
            the same timestamp and records ``identifier -> basename`` in
            ``self.oaimapping``.
    """
    if timestamp is None:
        # Resolve the default per call: a ``time.time()`` default in the
        # signature is evaluated only once, when the function is defined,
        # and would give every resource the same stale timestamp.
        timestamp = time.time()
    self._repository[basename] = {'timestamp': timestamp}
    change = ResourceChange(resource=self.resource(basename),
                            changetype="CREATED")
    if notify_observers:
        self.notify_observers(change)
        self.logger.debug("Event: %s" % repr(change))
    # add metadata resource url
    if oai:
        self._create_resource(
            basename=self.client.endpoint +
            "?verb=GetRecord&metadataPrefix=oai_dc&identifier=" + identifier,
            timestamp=timestamp,
            notify_observers=notify_observers,
            oai=False)
        self.oaimapping[identifier] = basename
def test_02_resource_deleted(self):
    """Serialize a DELETED ResourceChange and parse it back from an inventory."""
    deleted = ResourceChange('http://example.org/r/1',
                             1234,
                             9999,
                             'Q2hlY2sgSW50ZWdyaXR5IQ==',
                             changetype='DELETED')

    # Single resource as XML
    expected_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><expires>1970-01-01T00:20:34Z</expires><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
    self.assertEqual(Sitemap().resource_as_xml(deleted), expected_xml)

    # Wrap it in an inventory and serialize that
    inventory = Inventory()
    inventory.add(deleted)
    inv_xml = Sitemap().resources_as_xml(inventory)

    # Round trip: parse the inventory XML back into ResourceChange objects
    parser = Sitemap()
    parser.resource_class = ResourceChange
    parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(parsed), 1)
    first = next(iter(parsed))
    self.assertEqual(first.uri, 'http://example.org/r/1')
    self.assertEqual(first.timestamp, 1234)
    self.assertEqual(first.changetype, 'DELETED')
def sync_or_audit(self, allow_deletion=False, audit_only=False):
    """Compare the source sitemap with the local copy and optionally sync.

    Reads the source inventory from ``self.sitemap``, builds the
    destination inventory from disk, compares the two and logs the status.
    With ``audit_only`` true the method stops there. Otherwise it checks
    URL authority for every source resource, fetches updated and created
    resources, and deletes local files for deleted resources when
    ``allow_deletion`` is true (only logged when ``self.dryrun`` is set).

    Raises ClientFatalError when no mappings are configured, the source
    inventory cannot be read or is empty, or the sitemap lists a resource
    it has no authority over while ``self.noauth`` is false.
    """
    if audit_only:
        action = 'audit'
    else:
        action = 'sync'
    self.logger.debug("Starting " + action)

    # --- 0. Sanity checks
    if len(self.mappings) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    # --- 1. Get inventories from both src and dst
    # 1.a source inventory
    builder = InventoryBuilder(mapper=self.mapper)
    try:
        self.logger.info("Reading sitemap %s" % (self.sitemap))
        reader = Sitemap(allow_multifile=self.allow_multifile,
                         mapper=self.mapper)
        src_inventory = reader.read(uri=self.sitemap)
        self.logger.debug("Finished reading sitemap")
    except Exception as e:
        raise ClientFatalError("Can't read source inventory from %s (%s)" %
                               (self.sitemap, str(e)))
    listed = len(src_inventory)
    self.logger.info("Read source inventory, %d resources listed" % (listed))
    if listed == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_inventory.has_md5():
        # No md5 values in the source, so there is nothing to compare with
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source inventory"
        )
    # 1.b destination inventory mapped back to source URIs
    builder.do_md5 = self.checksum
    dst_inventory = builder.from_disk()

    # --- 2. Compare the inventories respecting any comparison options
    comparison = dst_inventory.compare(src_inventory)
    (same, updated, deleted, created) = comparison

    # --- 3. Report status and planned actions
    out_of_sync = (len(updated) > 0 or len(deleted) > 0 or len(created) > 0)
    if out_of_sync:
        status = "NOT IN SYNC"
    else:
        status = " IN SYNC "
    counts = (status, len(same), len(updated), len(deleted), len(created))
    self.logger.warning(
        "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" % counts)
    if audit_only:
        self.logger.debug("Completed " + action)
        return

    # --- 4. Check that sitemap has authority over URIs listed
    authority = UrlAuthority(self.sitemap)
    for resource in src_inventory:
        if authority.has_authority_over(resource.uri):
            continue
        if not self.noauth:
            raise ClientFatalError(
                "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                % (self.sitemap, resource.uri))
        self.logger.info(
            "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
            % (self.sitemap, resource.uri))

    # --- 5. Grab files to do sync
    for resource in updated:
        src_uri = resource.uri
        dst_path = self.mapper.src_to_dst(src_uri)
        self.logger.info("updated: %s -> %s" % (src_uri, dst_path))
        self.update_resource(resource, dst_path, 'UPDATED')
    for resource in created:
        src_uri = resource.uri
        dst_path = self.mapper.src_to_dst(src_uri)
        self.logger.info("created: %s -> %s" % (src_uri, dst_path))
        self.update_resource(resource, dst_path, 'CREATED')
    for resource in deleted:
        src_uri = resource.uri
        if not allow_deletion:
            self.logger.info(
                "nodelete: would delete %s (--delete to enable)" % src_uri)
            continue
        dst_path = self.mapper.src_to_dst(src_uri)
        if self.dryrun:
            self.logger.info("dryrun: would delete %s -> %s" %
                             (src_uri, dst_path))
            continue
        os.unlink(dst_path)
        self.logger.info("deleted: %s -> %s" % (src_uri, dst_path))
        self.log_event(
            ResourceChange(resource=resource, changetype="DELETED"))
    self.logger.debug("Completed " + action)
def incremental(self, allow_deletion=False, changeset_uri=None):
    """Apply the changes listed in a source changeset to the local copy.

    The changeset location is either derived from ``changeset_uri`` (via
    ``self.sitemap_changeset_uri``) or taken from the 'current' link in
    the capabilities of the sitemap at ``self.sitemap``. After an
    authority check each change is applied: UPDATED and CREATED resources
    are fetched, DELETED resources are removed locally when
    ``allow_deletion`` is true (only logged when ``self.dryrun`` is set).

    Raises ClientFatalError when no mappings are configured, the sitemap
    or changeset cannot be read, no changeset location is found, the
    changeset is empty, or it lists a resource outside its authority while
    ``self.noauth`` is false. Raises ClientError on an unknown change type.
    """
    self.logger.debug("Starting incremental sync")

    # --- 0. Sanity checks
    if len(self.mappings) < 1:
        raise ClientFatalError("No source to destination mapping specified")

    # --- 1. Get URI of changeset, from sitemap or explicit
    if changeset_uri:
        # Translate as necessary using maps
        changeset = self.sitemap_changeset_uri(changeset_uri)
    else:
        # Get sitemap
        try:
            self.logger.info("Reading sitemap %s" % (self.sitemap))
            index_reader = Sitemap(allow_multifile=self.allow_multifile,
                                   mapper=self.mapper)
            src_inventory = index_reader.read(uri=self.sitemap,
                                              index_only=True)
            self.logger.debug("Finished reading sitemap/sitemapindex")
        except Exception as e:
            raise ClientFatalError("Can't read source sitemap from %s (%s)" %
                                   (self.sitemap, str(e)))
        # Extract changeset location
        # FIXME - need to completely rework the way we handle/store capabilities
        links = self.extract_links(src_inventory.capabilities)
        if 'current' not in links:
            raise ClientFatalError(
                "Failed to extract changeset location from sitemap %s" %
                (self.sitemap))
        changeset = links['current']

    # --- 2. Read changeset from source
    # NOTE(review): this builder is not used again in this method
    builder = InventoryBuilder(mapper=self.mapper)
    try:
        self.logger.info("Reading changeset %s" % (changeset))
        changeset_reader = Sitemap(allow_multifile=self.allow_multifile,
                                   mapper=self.mapper)
        src_changeset = changeset_reader.read(uri=changeset, changeset=True)
        self.logger.debug("Finished reading changeset")
    except Exception as e:
        raise ClientFatalError("Can't read source changeset from %s (%s)" %
                               (changeset, str(e)))
    listed = len(src_changeset)
    self.logger.info("Read source changeset, %d resources listed" % (listed))
    if listed == 0:
        raise ClientFatalError("Aborting as there are no resources to sync")
    if self.checksum and not src_changeset.has_md5():
        self.checksum = False
        self.logger.info(
            "Not calculating checksums on destination as not present in source inventory"
        )

    # --- 3. Check that sitemap has authority over URIs listed
    # FIXME - What does authority mean for changeset? Here use both the
    # changeset URI and, if we used it, the sitemap URI
    changeset_authority = UrlAuthority(changeset)
    if not changeset_uri:
        sitemap_authority = UrlAuthority(self.sitemap)
    for resource in src_changeset:
        if changeset_authority.has_authority_over(resource.uri):
            continue
        # The sitemap's authority only counts when the changeset location
        # was discovered through the sitemap
        if (not changeset_uri and
                sitemap_authority.has_authority_over(resource.uri)):
            continue
        if not self.noauth:
            raise ClientFatalError(
                "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                % (changeset, resource.uri))
        self.logger.warning(
            "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
            % (changeset, resource.uri))

    # --- 4. Apply changes
    for resource in src_changeset:
        src_uri = resource.uri
        dst_path = self.mapper.src_to_dst(src_uri)
        kind = resource.changetype
        if kind == 'UPDATED':
            self.logger.info("updated: %s -> %s" % (src_uri, dst_path))
            self.update_resource(resource, dst_path, 'UPDATED')
        elif kind == 'CREATED':
            self.logger.info("created: %s -> %s" % (src_uri, dst_path))
            self.update_resource(resource, dst_path, 'CREATED')
        elif kind == 'DELETED':
            if not allow_deletion:
                self.logger.info(
                    "nodelete: would delete %s (--delete to enable)" %
                    src_uri)
                continue
            dst_path = self.mapper.src_to_dst(src_uri)
            if self.dryrun:
                self.logger.info("dryrun: would delete %s -> %s" %
                                 (src_uri, dst_path))
                continue
            os.unlink(dst_path)
            self.logger.info("deleted: %s -> %s" % (src_uri, dst_path))
            self.log_event(
                ResourceChange(resource=resource, changetype="DELETED"))
        else:
            raise ClientError("Unknown change type %s" %
                              (resource.changetype))
    self.logger.debug("Completed incremental stuff")