Beispiel #1
0
 def test1_simple_output(self):
     """Check the sitemap XML produced for an inventory built from dir1.

     The inventory is built from the testdata directory with
     'http://example.org/t' as the URI prefix and serialized with
     Sitemap.inventory_as_xml; the result must match the expected document
     exactly.
     """
     expected_xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1"><url><loc>http://example.org/t/file_a</loc><lastmod>2012-03-14T17:46:04</lastmod><rs:size>20</rs:size></url><url><loc>http://example.org/t/file_b</loc><lastmod>2012-03-14T17:46:25</lastmod><rs:size>45</rs:size></url></urlset>'
     builder = InventoryBuilder()
     inventory = builder.from_disk('resync/test/testdata/dir1',
                                   'http://example.org/t')
     actual_xml = Sitemap().inventory_as_xml(inventory)
     self.assertEqual(actual_xml, expected_xml)
 def test1_simple_output(self):
     """Check the sitemap XML produced for resources built via a Mapper.

     A verbose InventoryBuilder is given a Mapper from
     'http://example.org/t' to the dir1 testdata directory; the output of
     Sitemap.resources_as_xml must match the expected document exactly.
     """
     expected_xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/"><url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url><url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url></urlset>'
     builder = InventoryBuilder(verbose=True)
     builder.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     inventory = builder.from_disk()
     actual_xml = Sitemap().resources_as_xml(inventory)
     self.assertEqual(actual_xml, expected_xml)
Beispiel #3
0
    def inventory(self):
        """Return an inventory built from disk for all current mappings.

        Visits the keys of self.mappings in sorted order, treating each key
        as a base path and its value as the matching base URI. The same
        inventory object is threaded through every from_disk call and the
        final result is returned. Checksums are requested from the builder
        according to self.checksum.
        """
        builder = InventoryBuilder(do_md5=self.checksum)
        combined = Inventory()
        ordered_paths = sorted(self.mappings.keys())
        for base_path in ordered_paths:
            combined = builder.from_disk(base_path,
                                         self.mappings[base_path],
                                         inventory=combined)
        return combined
 def test4_data(self):
     """Check the per-resource data recorded for file_a with md5 enabled.

     Builds from the dir1 testdata via a Mapper and verifies the resource
     count plus the uri, lastmod, md5 and file attributes of file_a.
     """
     file_a_uri = 'http://example.org/t/file_a'
     builder = InventoryBuilder(do_md5=True)
     builder.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     inventory = builder.from_disk()
     self.assertEqual(len(inventory), 2)
     resource = inventory.resources.get(file_a_uri)
     self.assertTrue(resource is not None)
     self.assertEqual(resource.uri, file_a_uri)
     self.assertEqual(resource.lastmod, '2012-07-25T17:13:46Z')
     self.assertEqual(resource.md5, 'a/Jv1mYBtSjS4LR+qoft/Q==')
     self.assertEqual(resource.file, 'resync/test/testdata/dir1/file_a')
 def test4_data(self):
     """Check the per-resource data recorded for file_a with md5 enabled.

     Builds from the dir1 testdata via a Mapper and verifies the resource
     count plus the uri, lastmod, md5 (hex digest form) and file attributes
     of file_a.
     """
     file_a_uri = 'http://example.org/t/file_a'
     builder = InventoryBuilder(do_md5=True)
     builder.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     inventory = builder.from_disk()
     self.assertEqual(len(inventory), 2)
     resource = inventory.resources.get(file_a_uri)
     self.assertTrue(resource is not None)
     self.assertEqual(resource.uri, file_a_uri)
     self.assertEqual(resource.lastmod, '2012-03-14T17:46:04')
     self.assertEqual(resource.md5, '6bf26fd66601b528d2e0b47eaa87edfd')
     self.assertEqual(resource.file, 'resync/test/testdata/dir1/file_a')
Beispiel #6
0
    def inventory(self):
        """Return an inventory built from disk using the current mapper.

        The builder is configured with self.checksum, self.mapper and the
        exclude patterns in self.exclude_patterns.

        Raises ClientFatalError if self.mappings is empty.
        """
        # Refuse to run without at least one source-to-destination mapping
        if len(self.mappings) < 1:
            raise ClientFatalError(
                "No source to destination mapping specified")

        # Build the inventory from disk
        builder = InventoryBuilder(do_md5=self.checksum, mapper=self.mapper)
        builder.add_exclude_files(self.exclude_patterns)
        return builder.from_disk()
Beispiel #7
0
 def test3_with_md5(self):
     """Check that size and md5 appear in the sitemap XML for both files.

     Builds an inventory of the dir1 testdata with md5 calculation enabled,
     serializes it with Sitemap.inventory_as_xml and searches the output
     for the expected <url> fragment of file_a and file_b. The lastmod
     value is matched loosely because only size and checksum are under
     test here.
     """
     ib = InventoryBuilder(do_md5=True)
     i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
     s = Sitemap()
     xml = s.inventory_as_xml(i)
     # Raw strings keep \w, \: and \- as regex escapes rather than relying
     # on Python passing unknown string escapes through unchanged.
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',
             xml), 'size/checksum for file_a')
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',
             xml), 'size/checksum for file_b')
 def test3_with_md5(self):
     """Check that size and md5 fixity appear in the XML for both files.

     Builds the dir1 testdata resources through a Mapper with md5
     calculation enabled, serializes them with Sitemap.resources_as_xml
     and searches the output for the expected <url> fragment of file_a and
     file_b. The lastmod value is matched loosely (it must end in 'Z').
     """
     ib = InventoryBuilder(do_md5=True)
     ib.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     i = ib.from_disk()
     s = Sitemap()
     xml = s.resources_as_xml(i)
     # Raw strings keep \w, \:, \- and \+ as regex escapes rather than
     # relying on Python passing unknown string escapes through unchanged.
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>',
             xml))  # the + in the md5 value must be escaped in the regex
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>',
             xml))
Beispiel #9
0
    def sync_or_audit(self,
                      src_uri,
                      dst_path,
                      allow_deletion=False,
                      audit_only=False):
        """Compare the source inventory with dst_path and optionally sync.

        Gets the source inventory for src_uri, builds a destination
        inventory from dst_path mapped onto the URI prefix of src_uri,
        compares the two and prints a status line. With audit_only set the
        method stops there. Otherwise changed and added resources are passed
        to self.update_resource, and deleted resources are unlinked from
        disk only when allow_deletion is set.

        Raises ClientFatalError if the source inventory cannot be read
        (IOError) or lists no resources.
        """
        builder = InventoryBuilder()

        # --- Source inventory ---
        try:
            src_inventory = builder.get(src_uri)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory (%s)" % str(e))
        if self.verbose:
            print("Read src inventory from %s, %d resources listed" % (
                src_uri, len(src_inventory)))
        if len(src_inventory) == 0:
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if self.checksum and not src_inventory.has_md5():
            # No point computing local checksums with nothing to compare to
            self.checksum = False
            print("Not calculating checksums on destination as not present in source inventory")

        # --- Destination inventory, expressed in source URIs ---
        # The prefix is src_uri with its final path segment removed
        url_prefix = '/'.join(src_uri.split('/')[:-1])
        builder.do_md5 = self.checksum
        dst_inventory = builder.from_disk(dst_path, url_prefix)

        # --- Compare and report ---
        num_same, changed, deleted, added = dst_inventory.compare(
            src_inventory)
        out_of_sync = (len(changed) > 0 or len(deleted) > 0
                       or len(added) > 0)
        status = "NOT IN SYNC" if out_of_sync else "  IN SYNC  "
        print("Status: %s (same=%d, changed=%d, deleted=%d, added=%d)" %
              (status, num_same, len(changed), len(deleted), len(added)))

        if audit_only:
            return

        # --- Bring the destination in line with the source ---
        mapper = Mapper(url_prefix, dst_path)
        for uri in changed:
            dst_file = mapper.src_to_dst(uri)
            if self.verbose:
                print("changed: %s -> %s" % (uri, dst_file))
            self.update_resource(uri, dst_file,
                                 src_inventory.resources[uri].timestamp)
        for uri in added:
            dst_file = mapper.src_to_dst(uri)
            if self.verbose:
                print("added: %s -> %s" % (uri, dst_file))
            self.update_resource(uri, dst_file,
                                 src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if not allow_deletion:
                if self.verbose:
                    print("would delete %s (--delete to enable)" % uri)
                continue
            dst_file = mapper.src_to_dst(uri)
            if self.verbose:
                print("deleted: %s -> %s" % (uri, dst_file))
            os.unlink(dst_file)
Beispiel #10
0
 def sync_or_audit(self, allow_deletion=False, audit_only=False):
     """Audit the destination against the source sitemap, optionally syncing.

     Reads the source inventory from self.sitemap, builds the destination
     inventory from disk via self.mapper, compares them and logs a status
     line. With audit_only set the method returns after reporting.
     Otherwise it checks that the sitemap has authority over every listed
     URI, passes updated and created resources to self.update_resource, and
     handles deleted resources according to allow_deletion and self.dryrun.

     Raises ClientFatalError when no mappings are configured, when the
     source inventory cannot be read or is empty, or when a resource lies
     outside the sitemap's authority and self.noauth is not set.
     """
     action = 'audit' if audit_only else 'sync'
     self.logger.debug("Starting " + action)

     # Sanity check: at least one mapping is required
     if len(self.mappings) < 1:
         raise ClientFatalError(
             "No source to destination mapping specified")

     # Source inventory, read from the sitemap
     builder = InventoryBuilder(mapper=self.mapper)
     try:
         self.logger.info("Reading sitemap %s" % (self.sitemap))
         sitemap_reader = Sitemap(allow_multifile=self.allow_multifile,
                                  mapper=self.mapper)
         src_inventory = sitemap_reader.read(uri=self.sitemap)
         self.logger.debug("Finished reading sitemap")
     except Exception as e:
         raise ClientFatalError("Can't read source inventory from %s (%s)" %
                                (self.sitemap, str(e)))
     self.logger.info("Read source inventory, %d resources listed" %
                      (len(src_inventory)))
     if len(src_inventory) == 0:
         raise ClientFatalError(
             "Aborting as there are no resources to sync")
     if self.checksum and not src_inventory.has_md5():
         # No point computing local checksums with nothing to compare to
         self.checksum = False
         self.logger.info(
             "Not calculating checksums on destination as not present in source inventory"
         )

     # Destination inventory, mapped back to source URIs
     builder.do_md5 = self.checksum
     dst_inventory = builder.from_disk()

     # Compare the two inventories and report
     same, updated, deleted, created = dst_inventory.compare(src_inventory)
     out_of_sync = (len(updated) > 0 or len(deleted) > 0
                    or len(created) > 0)
     status = "NOT IN SYNC" if out_of_sync else "  IN SYNC  "
     self.logger.warning(
         "Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %
         (status, len(same), len(updated), len(deleted), len(created)))
     if audit_only:
         self.logger.debug("Completed " + action)
         return

     # The sitemap must have authority over every URI it lists
     authority = UrlAuthority(self.sitemap)
     for resource in src_inventory:
         if authority.has_authority_over(resource.uri):
             continue
         if not self.noauth:
             raise ClientFatalError(
                 "Aborting as sitemap (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                 % (self.sitemap, resource.uri))
         self.logger.info(
             "Sitemap (%s) mentions resource at a location it does not have authority over (%s)"
             % (self.sitemap, resource.uri))

     # Fetch and remove files to bring the destination in line
     for resource in updated:
         uri = resource.uri
         dst_file = self.mapper.src_to_dst(uri)
         self.logger.info("updated: %s -> %s" % (uri, dst_file))
         self.update_resource(resource, dst_file, 'UPDATED')
     for resource in created:
         uri = resource.uri
         dst_file = self.mapper.src_to_dst(uri)
         self.logger.info("created: %s -> %s" % (uri, dst_file))
         self.update_resource(resource, dst_file, 'CREATED')
     for resource in deleted:
         uri = resource.uri
         if not allow_deletion:
             self.logger.info(
                 "nodelete: would delete %s (--delete to enable)" % uri)
             continue
         dst_file = self.mapper.src_to_dst(uri)
         if self.dryrun:
             self.logger.info("dryrun: would delete %s -> %s" %
                              (uri, dst_file))
             continue
         os.unlink(dst_file)
         self.logger.info("deleted: %s -> %s" % (uri, dst_file))
         self.log_event(
             ResourceChange(resource=resource, changetype="DELETED"))
     self.logger.debug("Completed " + action)
Beispiel #11
0
 def incremental(self, allow_deletion=False, changeset_uri=None):
     """Apply a changeset from the source to the local destination.

     The changeset location is either changeset_uri (translated with
     self.sitemap_changeset_uri) or, when that is not given, the 'current'
     link extracted from the capabilities of the sitemap at self.sitemap.
     Every resource in the changeset is then handled by its changetype:
     UPDATED and CREATED go to self.update_resource, while DELETED removes
     the local file only when allow_deletion is set and self.dryrun is not.

     Raises ClientFatalError when no mappings are configured, when the
     sitemap or changeset cannot be read, when no changeset location is
     found, when the changeset is empty, or when a resource lies outside
     the authority of the changeset/sitemap and self.noauth is not set.
     Raises ClientError for an unrecognised changetype.
     """
     self.logger.debug("Starting incremental sync")
     ### 0. Sanity checks
     if (len(self.mappings) < 1):
         raise ClientFatalError(
             "No source to destination mapping specified")
     ### 1. Get URI of changeset, from sitemap or explicit
     if (changeset_uri):
         # Translate as necessary using maps
         changeset = self.sitemap_changeset_uri(changeset_uri)
     else:
         # Get sitemap
         try:
             self.logger.info("Reading sitemap %s" % (self.sitemap))
             src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                                   mapper=self.mapper)
             src_inventory = src_sitemap.read(uri=self.sitemap,
                                              index_only=True)
             self.logger.debug("Finished reading sitemap/sitemapindex")
         except Exception as e:
             raise ClientFatalError(
                 "Can't read source sitemap from %s (%s)" %
                 (self.sitemap, str(e)))
         # Extract changeset location
         # FIXME - need to completely rework the way we handle/store capabilities
         links = self.extract_links(src_inventory.capabilities)
         if ('current' not in links):
             raise ClientFatalError(
                 "Failed to extract changeset location from sitemap %s" %
                 (self.sitemap))
         changeset = links['current']
     ### 2. Read changeset from source
     try:
         self.logger.info("Reading changeset %s" % (changeset))
         src_sitemap = Sitemap(allow_multifile=self.allow_multifile,
                               mapper=self.mapper)
         src_changeset = src_sitemap.read(uri=changeset, changeset=True)
         self.logger.debug("Finished reading changeset")
     except Exception as e:
         raise ClientFatalError("Can't read source changeset from %s (%s)" %
                                (changeset, str(e)))
     self.logger.info("Read source changeset, %d resources listed" %
                      (len(src_changeset)))
     if (len(src_changeset) == 0):
         raise ClientFatalError(
             "Aborting as there are no resources to sync")
     if (self.checksum and not src_changeset.has_md5()):
         # No point computing local checksums with nothing to compare to
         self.checksum = False
         self.logger.info(
             "Not calculating checksums on destination as not present in source inventory"
         )
     ### 3. Check that sitemap has authority over URIs listed
     # FIXME - What does authority mean for changeset? Here use both the
     # changeset URI and, if we used it, the sitemap URI
     uauth_cs = UrlAuthority(changeset)
     if (not changeset_uri):
         uauth_sm = UrlAuthority(self.sitemap)
     for resource in src_changeset:
         # uauth_sm only exists when no explicit changeset_uri was given;
         # the short-circuit on changeset_uri keeps it from being touched
         # otherwise.
         if (not uauth_cs.has_authority_over(resource.uri)
                 and (changeset_uri
                      or not uauth_sm.has_authority_over(resource.uri))):
             if (self.noauth):
                 self.logger.warning(
                     "Changeset (%s) mentions resource at a location it does not have authority over (%s)"
                     % (changeset, resource.uri))
             else:
                 raise ClientFatalError(
                     "Aborting as changeset (%s) mentions resource at a location it does not have authority over (%s), override with --noauth"
                     % (changeset, resource.uri))
     ### 4. Apply changes
     for resource in src_changeset:
         uri = resource.uri
         dst_file = self.mapper.src_to_dst(uri)
         if (resource.changetype == 'UPDATED'):
             self.logger.info("updated: %s -> %s" % (uri, dst_file))
             self.update_resource(resource, dst_file, 'UPDATED')
         elif (resource.changetype == 'CREATED'):
             self.logger.info("created: %s -> %s" % (uri, dst_file))
             self.update_resource(resource, dst_file, 'CREATED')
         elif (resource.changetype == 'DELETED'):
             if (allow_deletion):
                 if (self.dryrun):
                     self.logger.info("dryrun: would delete %s -> %s" %
                                      (uri, dst_file))
                 else:
                     os.unlink(dst_file)
                     self.logger.info("deleted: %s -> %s" % (uri, dst_file))
                     self.log_event(
                         ResourceChange(resource=resource,
                                        changetype="DELETED"))
             else:
                 self.logger.info(
                     "nodelete: would delete %s (--delete to enable)" % uri)
         else:
             raise ClientError("Unknown change type %s" %
                               (resource.changetype))
     self.logger.debug("Completed incremental stuff")
Beispiel #12
0
    def sync_or_audit(self, allow_deletion=False, audit_only=False):
        """Audit the destination against the source sitemap, optionally syncing.

        Gets the source inventory for self.sitemap, builds the destination
        inventory from disk via self.mapper, compares them and prints a
        status line. With audit_only set the method returns after reporting.
        Otherwise updated and created resources are passed to
        self.update_resource, and deleted resources are handled according
        to allow_deletion and self.dryrun.

        Raises ClientFatalError when no mappings are configured, or when
        the source inventory cannot be read (IOError) or is empty.
        """
        # Sanity check: at least one mapping is required
        if len(self.mappings) < 1:
            raise ClientFatalError(
                "No source to destination mapping specified")

        # --- Source inventory ---
        builder = InventoryBuilder(verbose=self.verbose, mapper=self.mapper)
        try:
            if self.verbose:
                print("Reading sitemap %s ..." % (self.sitemap))
            src_inventory = builder.get(self.sitemap)
        except IOError as e:
            raise ClientFatalError("Can't read source inventory from %s (%s)" %
                                   (self.sitemap, str(e)))
        if self.verbose:
            print("Read source inventory, %d resources listed" % (
                len(src_inventory)))
        if len(src_inventory) == 0:
            raise ClientFatalError(
                "Aborting as there are no resources to sync")
        if self.checksum and not src_inventory.has_md5():
            # No point computing local checksums with nothing to compare to
            self.checksum = False
            print("Not calculating checksums on destination as not present in source inventory")

        # --- Destination inventory, mapped back to source URIs ---
        builder.do_md5 = self.checksum
        dst_inventory = builder.from_disk()

        # --- Compare and report ---
        num_same, updated, deleted, created = dst_inventory.compare(
            src_inventory)
        out_of_sync = (len(updated) > 0 or len(deleted) > 0
                       or len(created) > 0)
        status = "NOT IN SYNC" if out_of_sync else "  IN SYNC  "
        print("Status: %s (same=%d, updated=%d, deleted=%d, created=%d)" %
              (status, num_same, len(updated), len(deleted), len(created)))

        if audit_only:
            return

        # --- Bring the destination in line with the source ---
        for uri in updated:
            dst_file = self.mapper.src_to_dst(uri)
            if self.verbose:
                print("updated: %s -> %s" % (uri, dst_file))
            self.update_resource(uri, dst_file,
                                 src_inventory.resources[uri].timestamp)
        # NOTE(review): unlike the updated loop, nothing is printed here for
        # created resources - confirm whether update_resource reports them.
        for uri in created:
            dst_file = self.mapper.src_to_dst(uri)
            self.update_resource(uri, dst_file,
                                 src_inventory.resources[uri].timestamp)
        for uri in deleted:
            if not allow_deletion:
                if self.verbose:
                    print("nodelete: would delete %s (--delete to enable)" % uri)
                continue
            dst_file = self.mapper.src_to_dst(uri)
            if self.dryrun:
                print("dryrun: would delete %s -> %s" % (uri, dst_file))
                continue
            os.unlink(dst_file)
            if self.verbose:
                print("deleted: %s -> %s" % (uri, dst_file))