Example #1
0
 def write_static_inventory(self):
     """Generate the inventory and publish it as static sitemap files.

     The sitemap is written into ``Source.TEMP_FILE_PATH`` first.  The
     sitemap files already in ``Source.STATIC_FILE_PATH`` are then removed,
     the new ones are moved in, and the temp directory is deleted.
     Afterwards the elapsed time is logged and observers of ``self.source``
     are notified with an "UPDATED" change for ``self.uri``.
     """
     started = time.time()

     # Build the new sitemap inside the temp directory
     temp_dir = Source.TEMP_FILE_PATH
     self.ensure_temp_dir(temp_dir)
     inventory = self.generate()
     sitemap = Sitemap()
     sitemap.max_sitemap_entries = self.config['max_sitemap_entries']
     sitemap.mapper = Mapper([self.source.base_uri, temp_dir])
     sitemap.write(inventory, temp_dir + "/sitemap.xml")

     # Replace the old static sitemap files with the new ones, then drop
     # the temp directory
     static_dir = Source.STATIC_FILE_PATH
     self.rm_sitemap_files(static_dir)
     self.mv_sitemap_files(temp_dir, static_dir)
     shutil.rmtree(temp_dir)
     finished = time.time()

     # Log completion of the sitemap write and notify observers
     sitemap_size = self.compute_sitemap_size(static_dir)
     log_data = {
         'time': (finished - started),
         'no_resources': self.source.resource_count
     }
     self.logger.info("Wrote static sitemap inventory. %s" % log_data)
     written = ResourceChange(self.uri,
                              size=sitemap_size,
                              timestamp=started)
     sm_write_end = ResourceChange(resource=written, changetype="UPDATED")
     self.source.notify_observers(sm_write_end)
Example #2
0
 def test1_set_with_repeats(self):
     """A ChangeSet counts every added change, including repeated URIs."""
     changes = ChangeSet()
     # 'a' and 'b' are each added twice, with different timestamps
     for uri, stamp in (('a', 1), ('b', 1), ('c', 1), ('a', 2), ('b', 2)):
         changes.add(ResourceChange(uri, timestamp=stamp))
     self.assertEqual(len(changes), 5, "5 changes in changeset")
Example #3
0
 def test3_changeset(self):
     """Five changes with distinct URIs give a ChangeSet of length five."""
     changes = ChangeSet()
     # URIs 'a'..'e' get timestamps 1..5 respectively
     for stamp, uri in enumerate(('a', 'b', 'c', 'd', 'e'), 1):
         changes.add(ResourceChange(uri, timestamp=stamp))
     self.assertEqual(len(changes), 5, "5 things in src")
Example #4
0
 def test2_with_repeats_again(self):
     """Adding a change for an already-present URI still grows the set."""
     changes = ChangeSet()
     changes.add(ResourceChange(uri='a', size=1))
     changes.add(ResourceChange(uri='b', size=2))
     self.assertEqual(len(changes), 2)
     # A second ResourceChange for URI 'a' is accepted as a new entry
     changes.add(ResourceChange(uri='a', size=10))
     self.assertEqual(len(changes), 3)
Example #5
0
 def test4_iter(self):
     """Iterating a ChangeSet yields the changes, first 'a' and last 'd'."""
     changes = ChangeSet()
     for stamp, uri in enumerate(('a', 'b', 'c', 'd'), 1):
         changes.add(ResourceChange(uri, timestamp=stamp))
     # Drain the iterator so positions can be checked by index
     seen = list(changes)
     self.assertEqual(len(seen), 4)
     self.assertEqual(seen[0].uri, 'a')
     self.assertEqual(seen[3].uri, 'd')
Example #6
0
 def test2_create_with_change(self):
     """A ResourceChange built from a Resource keeps its change details."""
     base = Resource('a', timestamp=1234)
     change = ResourceChange(resource=base, changeid=89, changetype='UP')
     self.assertEqual(change.changeid, 89)
     self.assertEqual(change.changetype, 'UP')
     # The change still compares equal to the plain Resource it wraps
     self.assertEqual(base, change)
Example #7
0
    def _delete_resource(self,
                         identifier,
                         timestamp,
                         notify_observers=True,
                         oai=True):
        """Remove a resource from the repository and report the deletion.

        With ``oai`` true, ``identifier`` is looked up in (and removed from)
        ``self.oaimapping`` to obtain the resource's basename, and the
        matching GetRecord metadata URL is deleted through a recursive call
        with ``oai=False``.  With ``oai`` false, the basename is the
        GetRecord URL built from ``self.client.endpoint`` and ``identifier``.

        If ``notify_observers`` is true, a "DELETED" ResourceChange carrying
        ``timestamp`` is sent to the observers and logged at debug level.
        """
        if not oai:
            basename = self.client.endpoint + "?verb=GetRecord&metadataPrefix=oai_dc&identifier=" + identifier
        else:
            basename = self.oaimapping[identifier]
            del self.oaimapping[identifier]
            # Remove the associated metadata resource URL as well
            self._delete_resource(identifier,
                                  timestamp,
                                  notify_observers=notify_observers,
                                  oai=False)

        # Fetch the resource object before dropping its repository entry
        removed = self.resource(basename)
        del self._repository[basename]
        removed.timestamp = timestamp

        if not notify_observers:
            return
        change = ResourceChange(resource=removed, changetype="DELETED")
        self.notify_observers(change)
        self.logger.debug("Event: %s" % repr(change))
 def test_02_resource_created(self):
     """Serialise a CREATED ResourceChange to XML and parse it back."""
     expected_url_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
     expected_inventory_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>http://example.org/r/1</loc><lastmod rs:type=\"created\">1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url></urlset>"

     # A ResourceChange flagged as created
     created = ResourceChange('http://example.org/r/1',
                              1234,
                              9999,
                              'Q2hlY2sgSW50ZWdyaXR5IQ==',
                              changetype='CREATED')
     url_xml = Sitemap().resource_as_xml(created)
     self.assertEqual(url_xml, expected_url_xml)

     # Put it in an inventory and serialise the whole inventory
     inventory = Inventory()
     inventory.add(created)
     inv_xml = Sitemap().resources_as_xml(inventory)
     self.assertEqual(inv_xml, expected_inventory_xml)

     # Round trip: parse the inventory XML back into ResourceChange objects
     parser = Sitemap()
     parser.resource_class = ResourceChange
     parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
     self.assertEqual(len(parsed), 1)
     first = next(iter(parsed))
     self.assertEqual(first.uri, 'http://example.org/r/1')
     self.assertEqual(first.timestamp, 1234)
     self.assertEqual(first.changetype, 'CREATED')
Example #9
0
    def update_resource(self, resource, file, changetype=None):
        """Update resource from uri to file on local system

        Update means two things:
        1. GET the resource from resource.uri and store it in file
        2. set the access and modification times of file to
           resource.timestamp truncated to whole seconds (skipped when the
           timestamp is None)

        FIXME - should perhaps use (or at least warn if different from) the
        LastModified of the GET response instead, and maybe warn if that is
        different from (or just earlier than) the lastmod we expected from
        the inventory

        In dryrun mode the intended GET is logged and nothing else happens:
        no directory is created, nothing is fetched and no event is logged.

        If the GET fails with IOError then either a warning is logged and
        the method returns (self.ignore_failures set) or ClientFatalError is
        raised. On success a ResourceChange built from resource and
        changetype is passed to self.log_event.
        """
        if (self.dryrun):
            self.logger.info("dryrun: would GET %s --> %s" %
                             (resource.uri, file))
            return
        # Create the destination directory only for a real run, so that a
        # dry run leaves the filesystem untouched
        distutils.dir_util.mkpath(os.path.dirname(file))
        try:
            urllib.urlretrieve(resource.uri, file)
        except IOError as e:
            msg = "Failed to GET %s -- %s" % (resource.uri, str(e))
            if (self.ignore_failures):
                self.logger.warning(msg)
                return
            raise ClientFatalError(msg)
        if (resource.timestamp is not None):
            unixtime = int(resource.timestamp)  # no fractional seconds
            os.utime(file, (unixtime, unixtime))
        self.log_event(
            ResourceChange(resource=resource, changetype=changetype))
Example #10
0
 def test1_create(self):
     """A ResourceChange built from a Resource mirrors uri and timestamp."""
     base = Resource('a', timestamp=1234)
     change = ResourceChange(resource=base)
     self.assertEqual(change.uri, 'a')
     self.assertEqual(change.timestamp, 1234)
     # The change still compares equal to the plain Resource it wraps
     self.assertEqual(base, change)
     # NOTE(review): the unescaped "|" makes this pattern an alternation, so
     # it matches any string that merely starts with "[a " -- confirm
     # whether a literal "|" followed by the year was intended
     text = str(change)
     self.assertTrue(re.match(r"\[a | 1969", text))
 def test_01_resource_str(self):
     """A ResourceChange without a changetype serialises with plain lastmod."""
     expected_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><lastmod>1970-01-01T00:20:34Z</lastmod><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
     # ResourceChange carrying no change information
     unchanged = ResourceChange('http://example.org/r/1', 1234, 9999,
                                'Q2hlY2sgSW50ZWdyaXR5IQ==')
     self.assertEqual(Sitemap().resource_as_xml(unchanged), expected_xml)
Example #12
0
 def _update_resource(self, basename, identifier, timestamp, oai=True):
     """Record an update to a resource and notify observers.

     The repository entry for ``basename`` is replaced with the new
     ``timestamp`` and an "UPDATED" ResourceChange is sent to the observers
     and logged at debug level.  When ``oai`` is true the same is then done
     for the GetRecord metadata URL built from ``self.client.endpoint`` and
     ``identifier``, through a recursive call with ``oai=False``.
     """
     self._repository[basename] = {'timestamp': timestamp}
     updated = self.resource(basename)
     change = ResourceChange(resource=updated, changetype="UPDATED")
     self.notify_observers(change)
     self.logger.debug("Event: %s" % repr(change))
     if not oai:
         return
     # Also update the metadata resource URL
     metadata_url = (self.client.endpoint +
                     "?verb=GetRecord&metadataPrefix=oai_dc&identifier=" +
                     identifier)
     self._update_resource(metadata_url, identifier, timestamp, oai=False)
Example #13
0
 def _create_resource(self,
                      basename=None,
                      identifier=None,
                      timestamp=None,
                      notify_observers=True,
                      oai=True):
     """Create a new resource, add it to the source, notify observers.

     The repository entry for ``basename`` is set to ``timestamp``.  When
     ``timestamp`` is omitted (None) the current time is used, taken at
     call time.  If ``notify_observers`` is true, a "CREATED"
     ResourceChange is sent to the observers and logged at debug level.

     When ``oai`` is true the GetRecord metadata URL built from
     ``self.client.endpoint`` and ``identifier`` is created as well
     (recursive call with ``oai=False``), and ``identifier`` is then mapped
     to ``basename`` in ``self.oaimapping``.
     """
     # A default of time.time() in the signature would be evaluated only
     # once, when the function is defined, so resolve the default here
     if timestamp is None:
         timestamp = time.time()
     self._repository[basename] = {'timestamp': timestamp}
     change = ResourceChange(resource=self.resource(basename),
                             changetype="CREATED")
     if notify_observers:
         self.notify_observers(change)
         self.logger.debug("Event: %s" % repr(change))
     # add metadata resource url
     if oai:
         self._create_resource(
             basename=self.client.endpoint +
             "?verb=GetRecord&metadataPrefix=oai_dc&identifier=" +
             identifier,
             timestamp=timestamp,
             notify_observers=notify_observers,
             oai=False)
         self.oaimapping[identifier] = basename
 def test_02_resource_deleted(self):
     """Serialise a DELETED ResourceChange to XML and parse it back."""
     expected_url_xml = "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><expires>1970-01-01T00:20:34Z</expires><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"

     # A ResourceChange flagged as deleted
     deleted = ResourceChange('http://example.org/r/1',
                              1234,
                              9999,
                              'Q2hlY2sgSW50ZWdyaXR5IQ==',
                              changetype='DELETED')
     self.assertEqual(Sitemap().resource_as_xml(deleted), expected_url_xml)

     # Put it in an inventory and serialise the whole inventory
     inventory = Inventory()
     inventory.add(deleted)
     inv_xml = Sitemap().resources_as_xml(inventory)

     # Round trip: parse the inventory XML back into ResourceChange objects
     parser = Sitemap()
     parser.resource_class = ResourceChange
     parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
     self.assertEqual(len(parsed), 1)
     first = next(iter(parsed))
     self.assertEqual(first.uri, 'http://example.org/r/1')
     self.assertEqual(first.timestamp, 1234)
     self.assertEqual(first.changetype, 'DELETED')
Example #15
0
 def sync_or_audit(self, allow_deletion=False, audit_only=False):
     """Compare the source sitemap with the local copy and optionally sync.

     The source inventory is read from ``self.sitemap`` and the destination
     inventory is built from disk with an InventoryBuilder.  The two are
     compared and the result (same/updated/deleted/created counts) is
     logged.  With ``audit_only`` true the method returns at that point.

     Otherwise the URIs in the source inventory are checked against the
     authority of the sitemap URI, then updated and created resources are
     fetched with ``self.update_resource``.  Resources reported as deleted
     are unlinked locally only if ``allow_deletion`` is true and
     ``self.dryrun`` is false; otherwise the would-be deletion is logged.

     Side effect: ``self.checksum`` is set to False when it was requested
     but the source inventory has no md5 values.

     Raises ClientFatalError when no mappings are configured, when the
     source inventory cannot be read or is empty, or when a resource is
     outside the sitemap's authority and ``self.noauth`` is not set.
     """
     action = ('audit' if (audit_only) else 'sync')
     self.logger.debug("Starting " + action)
     ### 0. Sanity checks
     if (len(self.mappings) < 1):
         raise ClientFatalError(
             "No source to destination mapping specified")
     ### 1. Get inventories from both src and dst
     # 1.a source inventory
     ib = InventoryBuilder(mapper=self.mapper)
     try:
         self.logger.info("Reading sitemap %s" % (self.sitemap))
         src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                               mapper=self.mapper)
         src_inventory = src_sitemap.read(uri=self.sitemap)
         self.logger.debug("Finished reading sitemap")
     except Exception as e:
         # Any failure while reading is reported as a fatal client error
         raise ClientFatalError("Can't read source inventory from %s (%s)" %
                                (self.sitemap, str(e)))
     self.logger.info("Read source inventory, %d resources listed" %
                      (len(src_inventory)))
     if (len(src_inventory) == 0):
         raise ClientFatalError(
             "Aborting as there are no resources to sync")
     # Checksums can only be compared if the source supplies them
     if (self.checksum and not src_inventory.has_md5()):
         self.checksum = False
         self.logger.info(
             "Not calculating checksums on destination as not present in source inventory"
         )
     # 1.b destination inventory mapped back to source URIs
     ib.do_md5 = self.checksum
     dst_inventory = ib.from_disk()
     ### 2. Compare these inventories respecting any comparison options
     (same, updated, deleted,
      created) = dst_inventory.compare(src_inventory)
     ### 3. Report status and planned actions
     status = "  IN SYNC  "
     if (len(updated) > 0 or len(deleted) > 0 or len(created) > 0):
         status = "NOT IN SYNC"
     self.logger.warning("Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %\
           (status,len(same),len(updated),len(deleted),len(created)))
     # An audit stops after reporting; nothing is fetched or deleted
     if (audit_only):
         self.logger.debug("Completed " + action)
         return
     ### 4. Check that sitemap has authority over URIs listed
     uauth = UrlAuthority(self.sitemap)
     for resource in src_inventory:
         if (not uauth.has_authority_over(resource.uri)):
             # With noauth set the violation is only logged, otherwise it
             # aborts the sync before any resource is fetched
             if (self.noauth):
                 self.logger.info(
                     "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
                     % (self.sitemap, resource.uri))
             else:
                 raise ClientFatalError(
                     "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                     % (self.sitemap, resource.uri))
     ### 5. Grab files to do sync
     for resource in updated:
         uri = resource.uri
         file = self.mapper.src_to_dst(uri)
         self.logger.info("updated: %s -> %s" % (uri, file))
         self.update_resource(resource, file, 'UPDATED')
     for resource in created:
         uri = resource.uri
         file = self.mapper.src_to_dst(uri)
         self.logger.info("created: %s -> %s" % (uri, file))
         self.update_resource(resource, file, 'CREATED')
     for resource in deleted:
         uri = resource.uri
         # Deletion must be enabled explicitly by the caller
         if (allow_deletion):
             file = self.mapper.src_to_dst(uri)
             if (self.dryrun):
                 self.logger.info("dryrun: would delete %s -> %s" %
                                  (uri, file))
             else:
                 os.unlink(file)
                 self.logger.info("deleted: %s -> %s" % (uri, file))
                 self.log_event(
                     ResourceChange(resource=resource,
                                    changetype="DELETED"))
         else:
             self.logger.info(
                 "nodelete: would delete %s (--delete to enable)" % uri)
     self.logger.debug("Completed " + action)
Example #16
0
 def incremental(self, allow_deletion=False, changeset_uri=None):
     """Apply the changes listed in a source changeset to the local copy.

     The changeset location is either derived from ``changeset_uri`` (via
     ``self.sitemap_changeset_uri``) or taken from the 'current' link found
     in the capabilities of the sitemap at ``self.sitemap``.  The changeset
     is read, its URIs are checked for authority, and each entry is then
     applied according to its ``changetype``: 'UPDATED' and 'CREATED'
     resources are fetched with ``self.update_resource``; 'DELETED'
     resources are unlinked locally only if ``allow_deletion`` is true and
     ``self.dryrun`` is false, otherwise the would-be deletion is logged.

     Side effect: ``self.checksum`` is set to False when it was requested
     but the changeset has no md5 values.

     Raises ClientFatalError when no mappings are configured, when the
     sitemap or changeset cannot be read, when no 'current' link is found,
     when the changeset is empty, or when a resource fails the authority
     check and ``self.noauth`` is not set.  Raises ClientError for an
     unknown change type.
     """
     self.logger.debug("Starting incremental sync")
     ### 0. Sanity checks
     if (len(self.mappings) < 1):
         raise ClientFatalError(
             "No source to destination mapping specified")
     ### 1. Get URI of changeset, from sitemap or explicit
     if (changeset_uri):
         # Translate as necessary using maps
         changeset = self.sitemap_changeset_uri(changeset_uri)
     else:
         # Get sitemap
         try:
             self.logger.info("Reading sitemap %s" % (self.sitemap))
             src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                   mapper=self.mapper)
             src_inventory = src_sitemap.read(uri=self.sitemap,
                                              index_only=True)
             self.logger.debug("Finished reading sitemap/sitemapindex")
         except Exception as e:
             # Any failure while reading is reported as a fatal client error
             raise ClientFatalError(
                 "Can't read source sitemap from %s (%s)" %
                 (self.sitemap, str(e)))
         # Extract changeset location
         # FIXME - need to completely rework the way we handle/store capabilities
         links = self.extract_links(src_inventory.capabilities)
         if ('current' not in links):
             raise ClientFatalError(
                 "Failed to extract changeset location from sitemap %s" %
                 (self.sitemap))
         changeset = links['current']
     ### 2. Read changeset from source
     # NOTE(review): ib is not referenced again in this method -- confirm
     # whether constructing it here is still needed
     ib = InventoryBuilder(mapper=self.mapper)
     try:
         self.logger.info("Reading changeset %s" % (changeset))
         src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                               mapper=self.mapper)
         src_changeset = src_sitemap.read(uri=changeset, changeset=True)
         self.logger.debug("Finished reading changeset")
     except Exception as e:
         raise ClientFatalError("Can't read source changeset from %s (%s)" %
                                (changeset, str(e)))
     self.logger.info("Read source changeset, %d resources listed" %
                      (len(src_changeset)))
     if (len(src_changeset) == 0):
         raise ClientFatalError(
             "Aborting as there are no resources to sync")
     # Checksums can only be compared if the source supplies them
     if (self.checksum and not src_changeset.has_md5()):
         self.checksum = False
         self.logger.info(
             "Not calculating checksums on destination as not present in source inventory"
         )
     ### 3. Check that sitemap has authority over URIs listed
     # FIXME - What does authority mean for changeset? Here use both the
     # changeset URI and, if we used it, the sitemap URI
     uauth_cs = UrlAuthority(changeset)
     if (not changeset_uri):
         uauth_sm = UrlAuthority(self.sitemap)
     for resource in src_changeset:
         # uauth_sm exists only when no explicit changeset_uri was given;
         # the "changeset_uri or ..." short-circuit keeps it from being
         # touched in the other case
         if (not uauth_cs.has_authority_over(resource.uri)
                 and (changeset_uri
                      or not uauth_sm.has_authority_over(resource.uri))):
             if (self.noauth):
                 self.logger.warning(
                     "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                     % (changeset, resource.uri))
             else:
                 raise ClientFatalError(
                     "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                     % (changeset, resource.uri))
     ### 4. Apply changes
     for resource in src_changeset:
         uri = resource.uri
         file = self.mapper.src_to_dst(uri)
         if (resource.changetype == 'UPDATED'):
             self.logger.info("updated: %s -> %s" % (uri, file))
             self.update_resource(resource, file, 'UPDATED')
         elif (resource.changetype == 'CREATED'):
             self.logger.info("created: %s -> %s" % (uri, file))
             self.update_resource(resource, file, 'CREATED')
         elif (resource.changetype == 'DELETED'):
             # Deletion must be enabled explicitly by the caller
             if (allow_deletion):
                 file = self.mapper.src_to_dst(uri)
                 if (self.dryrun):
                     self.logger.info("dryrun: would delete %s -> %s" %
                                      (uri, file))
                 else:
                     os.unlink(file)
                     self.logger.info("deleted: %s -> %s" % (uri, file))
                     self.log_event(
                         ResourceChange(resource=resource,
                                        changetype="DELETED"))
             else:
                 self.logger.info(
                     "nodelete: would delete %s (--delete to enable)" % uri)
         else:
             raise ClientError("Unknown change type %s" %
                               (resource.changetype))
     self.logger.debug("Completed incremental stuff")