def test04_dump_multi_file_max_size(self):
    """Dump is split into two zip files when max_size forces a break."""
    resource_list = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        resource_list.add(Resource('http://ex.org/%s' % (letter),
                                   path='tests/testdata/a_to_z/%s' % (letter)))
    self.assertEqual(len(resource_list), 12)
    dump = Dump(resource_list)
    basename = os.path.join(self.tmpdir, 'test0f_')
    dump.max_size = 2000
    num_written = dump.write(basename)
    self.assertEqual(num_written, 2, 'expect to write 2 dump files')
    self.assertTrue(os.path.isfile(basename + '00000.zip'))
    self.assertTrue(os.path.isfile(basename + '00001.zip'))
    # Inspect first zip in detail: manifest plus the first six resources
    first_zip = basename + '00000.zip'
    zf = zipfile.ZipFile(first_zip, 'r')
    self.assertEqual(zf.namelist(),
                     ['manifest.xml', 'a', 'b', 'c', 'd', 'e', 'f'])
    # self.assertEqual( zo.getinfo('manifest.xml').file_size, 470 )
    self.assertEqual(zf.getinfo('a').file_size, 9)
    self.assertEqual(zf.getinfo('b').file_size, 1116)
    self.assertEqual(zf.getinfo('c').file_size, 32)
    self.assertEqual(zf.getinfo('d').file_size, 13)
    self.assertEqual(zf.getinfo('e').file_size, 20)
    self.assertEqual(zf.getinfo('f').file_size, 1625)
    zf.close()
    os.unlink(first_zip)
    # Second zip should hold the remaining six resources
    second_zip = basename + '00001.zip'
    zf = zipfile.ZipFile(second_zip, 'r')
    self.assertEqual(zf.namelist(),
                     ['manifest.xml', 'g', 'h', 'i', 'j', 'k', 'l'])
    zf.close()
    os.unlink(second_zip)
def test20_as_xml(self):
    """Serialized list carries the capability and each resource entry."""
    resource_list = ResourceList()
    resource_list.add(Resource('a', timestamp=1))
    resource_list.add(Resource('b', timestamp=2))
    xml = resource_list.as_xml()
    self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml),
                    'XML has capability')
    self.assertTrue(
        re.search(r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml),
        'XML has resource a')
def test00_dump_creation(self):
    """check_files() totals the on-disk sizes of the listed files."""
    resource_list = ResourceList()
    resource_list.add(Resource("http://ex.org/a", length=1,
                               path="resync/test/testdata/a"))
    resource_list.add(Resource("http://ex.org/b", length=2,
                               path="resync/test/testdata/b"))
    dump = Dump()
    dump.check_files(resource_list=resource_list)
    self.assertEqual(dump.total_size, 28)
def test11_bad_size(self):
    """Length mismatch passes with check_length=False, raises otherwise."""
    resource_list = ResourceList()
    resource_list.add(Resource('http://ex.org/a', length=9999,
                               path='tests/testdata/a'))
    dump = Dump(resource_list)
    self.assertTrue(dump.check_files(check_length=False))
    self.assertRaises(DumpError, dump.check_files)
def test5_add_changed_resources(self):
    """add_changed_resources() stamps the change type and merges dupes.

    Fix: replaced the Python-2-only `i.next()` iterator calls with the
    builtin `next(i)` so the test runs under Python 3.
    """
    added = ResourceList()
    added.add(Resource('a', timestamp=1))
    added.add(Resource('d', timestamp=4))
    self.assertEqual(len(added), 2, "2 things in added resource_list")
    changes = ChangeList()
    changes.add_changed_resources(added, change='created')
    self.assertEqual(len(changes), 2, "2 things added")
    i = iter(changes)
    first = next(i)  # was i.next(), Python 2 only
    self.assertEqual(first.uri, 'a', "changes[0].uri=a")
    self.assertEqual(first.timestamp, 1, "changes[0].timestamp=1")
    self.assertEqual(first.change, 'created')  # , "changes[0].change=created")
    second = next(i)  # was i.next(), Python 2 only
    self.assertEqual(second.timestamp, 4, "changes[1].timestamp=4")
    self.assertEqual(second.change, 'created', "changes[1].change=created")
    # Now add some with updated (one same, one diff)
    updated = ResourceList()
    updated.add(Resource('a', timestamp=5))
    updated.add(Resource('b', timestamp=6))
    self.assertEqual(len(updated), 2, "2 things in updated resource_list")
    changes.add_changed_resources(updated, change='updated')
    self.assertEqual(len(changes), 4, "4 = 2 old + 2 things updated")
    # Make new resource_list from the changes which should not have dupes
    dst = ResourceList()
    dst.add(changes, replace=True)
    self.assertEqual(len(dst), 3, "3 unique resources")
    self.assertEqual(dst.resources['a'].timestamp, 5)  # 5 was later than the 1
    self.assertEqual(dst.resources['a'].change, 'updated')
    self.assertEqual(dst.resources['b'].timestamp, 6)
    self.assertEqual(dst.resources['b'].change, 'updated')
    self.assertEqual(dst.resources['d'].timestamp, 4)
    self.assertEqual(dst.resources['d'].change, 'created')
def test04_dump_multi_file_max_size(self):
    """Dump splits into two zips when max_size forces a break.

    Fix: replaced the Python-2-only `xrange` with `range` so the test
    runs under Python 3 (for 12 letters the difference is immaterial).
    """
    rl = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        uri = 'http://ex.org/%s' % (letter)
        fname = 'resync/test/testdata/a_to_z/%s' % (letter)
        rl.add(Resource(uri, path=fname))
    self.assertEqual(len(rl), 12)
    d2 = Dump(rl)
    tmpbase = os.path.join(self.tmpdir, 'test0f_')
    d2.max_size = 2000
    n = d2.write(tmpbase)
    self.assertEqual(n, 2, 'expect to write 2 dump files')
    self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
    # Look at the first file in detail
    zipf = tmpbase + '00000.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(),
                     ['manifest.xml', 'a', 'b', 'c', 'd', 'e', 'f'])
    # self.assertEqual( zo.getinfo('manifest.xml').file_size, 470 )
    self.assertEqual(zo.getinfo('a').file_size, 9)
    self.assertEqual(zo.getinfo('b').file_size, 1116)
    self.assertEqual(zo.getinfo('c').file_size, 32)
    self.assertEqual(zo.getinfo('d').file_size, 13)
    self.assertEqual(zo.getinfo('e').file_size, 20)
    self.assertEqual(zo.getinfo('f').file_size, 1625)
    zo.close()
    os.unlink(zipf)
    # Check second file has expected contents
    zipf = tmpbase + '00001.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(),
                     ['manifest.xml', 'g', 'h', 'i', 'j', 'k', 'l'])
    zo.close()
    os.unlink(zipf)
def test20_as_xml(self):
    """Serialized list carries the capability and each resource entry.

    Fix: removed the Python-2 `print xml` debug statement, which is a
    syntax error under Python 3 and only produced noise in test output.
    """
    rl = ResourceList()
    rl.add(Resource('a', timestamp=1))
    rl.add(Resource('b', timestamp=2))
    xml = rl.as_xml()
    self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml),
                    'XML has capability')
    self.assertTrue(
        re.search(r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml),
        'XML has resource a')
def test_build_ex_01(self):
    """Simple Resource List document """
    resource_list = ResourceList()
    resource_list.md_at = '2013-01-03T09:00:00Z'
    for uri in ('http://example.com/res1', 'http://example.com/res2'):
        resource_list.add(Resource(uri))
    expected = self._open_ex('resourcesync_ex_1').read()
    self._assert_xml_equal(resource_list.as_xml(), expected)
def test_09_print_from_iter(self):
    """resources_as_xml() accepts a plain iterator over resources."""
    first = Resource(uri='a', lastmod='2001-01-01', length=1234)
    second = Resource(uri='b', lastmod='2002-02-02', length=56789)
    resource_list = ResourceList()
    resource_list.add(first)
    resource_list.add(second)
    iterator = iter(resource_list)
    expected = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url></urlset>"
    self.assertEqual(Sitemap().resources_as_xml(iterator), expected)
def test_08_print_non_ascii_uri(self):
    """Verify that valid Unicode uri values give good XML out."""
    resource_list = ResourceList(md={'capability': 'resourcelist',
                                     'modified': None})
    resource_list.add(Resource(uri=u'a_\u00c3_b'))
    resource_list.add(Resource(uri=u'c_\u1234_d'))
    xml = Sitemap().resources_as_xml(resource_list)
    # Characters must survive serialization, whether escaped or literal
    self.assertTrue(re.search(u'<loc>a_.*_b</loc>', xml))
    self.assertTrue(re.search(u'<loc>a_\u00c3_b</loc>', xml))
    self.assertTrue(re.search(u'<loc>c_\u1234_d</loc>', xml))
def test10_no_path(self):
    """check_files() raises when a resource has no local path."""
    resource_list = ResourceList()
    resource_list.add(Resource('http://ex.org/a', length=7,
                               path='resync/test/testdata/a'))
    resource_list.add(Resource('http://ex.org/b', length=21))
    dump = Dump(resource_list)
    self.assertRaises(DumpError, dump.check_files)
def test20_as_xml(self):
    """Serialized list has capability, modified timestamp, and resources.

    Fix: removed the Python-2 `print xml` debug statement, which is a
    syntax error under Python 3 and only produced noise in test output.
    """
    rl = ResourceList()
    rl.add(Resource('a', timestamp=1))
    rl.add(Resource('b', timestamp=2))
    xml = rl.as_xml()
    self.assertTrue(re.search(r'<rs:md .*capability="resourcelist"', xml),
                    'XML has capability')
    self.assertTrue(
        re.search(r'<rs:md .*modified="\d\d\d\d\-\d\d\-\d\dT\d\d:\d\d:\d\dZ"', xml),
        'XML has modified to seconds precision (and not more)')
    self.assertTrue(
        re.search(r'<url><loc>a</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml),
        'XML has resource a')
def test07_has_md5(self):
    """has_md5() becomes true once any resource carries an md5 hash."""
    res_a = Resource(uri='a')
    res_b = Resource(uri='b')
    resource_list = ResourceList()
    self.assertFalse(resource_list.has_md5())
    resource_list.add(res_a)
    resource_list.add(res_b)
    self.assertFalse(resource_list.has_md5())
    res_a.md5 = "aabbcc"
    self.assertTrue(resource_list.has_md5())
def test_08_print(self):
    """Three resources serialize with metadata block and per-url details."""
    resource_list = ResourceList(md={'capability': 'resourcelist',
                                     'modified': None})
    resource_list.add(Resource(uri='a', lastmod='2001-01-01', length=1234))
    resource_list.add(Resource(uri='b', lastmod='2002-02-02', length=56789))
    resource_list.add(Resource(uri='c', lastmod='2003-03-03', length=0))
    expected = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><rs:md capability=\"resourcelist\" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url><url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length=\"0\" /></url></urlset>"
    self.assertEqual(Sitemap().resources_as_xml(resource_list), expected)
def test_09_print_from_iter(self):
    """resources_as_xml() works when handed an iterator, not a list."""
    resource_list = ResourceList()
    resource_list.add(Resource(uri='a', lastmod='2001-01-01', length=1234))
    resource_list.add(Resource(uri='b', lastmod='2002-02-02', length=56789))
    expected = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url></urlset>"
    self.assertEqual(Sitemap().resources_as_xml(iter(resource_list)),
                     expected)
def test_11_write_multifile(self):
    """Write a large resource list as a sitemapindex plus components.

    Fix: replaced the deprecated `assertEquals` alias (removed in
    Python 3.12) with `assertEqual` throughout.
    """
    tempdir = tempfile.mkdtemp(prefix='test_resource_list_multifile')
    rl = ResourceList()
    rl.mapper = Mapper(['http://localhost/=%s/' % (tempdir)])
    rl.add(Resource(uri='http://localhost/a'))
    rl.add(Resource(uri='http://localhost/b'))
    rl.add(Resource(uri='http://localhost/c'))
    rl.add(Resource(uri='http://localhost/d'))
    rl.max_sitemap_entries = 2
    # first try writing without multifile allowed
    rl.allow_multifile = False
    self.assertRaises(ListBaseIndexError, rl.write,
                      basename=os.path.join(tempdir, 'sitemap.xml'))
    # second actually do it
    rl.allow_multifile = True
    rl.write(basename=os.path.join(tempdir, 'sitemap.xml'))
    # check the two component sitemaps
    rl1 = ResourceList()
    rl1.read(os.path.join(tempdir, 'sitemap00000.xml'))
    self.assertEqual(len(rl1), 2)
    self.assertEqual(rl1.capability, 'resourcelist')
    self.assertFalse(rl1.sitemapindex)
    i = iter(rl1)
    self.assertEqual(next(i).uri, 'http://localhost/a')
    self.assertEqual(next(i).uri, 'http://localhost/b')
    rl2 = ResourceList()
    rl2.read(os.path.join(tempdir, 'sitemap00001.xml'))
    self.assertEqual(len(rl2), 2)
    i = iter(rl2)
    self.assertEqual(next(i).uri, 'http://localhost/c')
    self.assertEqual(next(i).uri, 'http://localhost/d')
    # check the sitemapindex (read just as index)
    rli = ResourceList()
    rli.read(os.path.join(tempdir, 'sitemap.xml'), index_only=True)
    self.assertEqual(len(rli), 2)
    i = iter(rli)
    self.assertEqual(rli.capability, 'resourcelist')
    self.assertTrue(rli.sitemapindex)
    self.assertEqual(next(i).uri, 'http://localhost/sitemap00000.xml')
    self.assertEqual(next(i).uri, 'http://localhost/sitemap00001.xml')
    # check the sitemapindex and components
    rli = ResourceList(mapper=rl.mapper)
    rli.read(os.path.join(tempdir, 'sitemap.xml'))
    self.assertEqual(len(rli), 4)
    self.assertEqual(rli.capability, 'resourcelist')
    self.assertFalse(rli.sitemapindex)
    i = iter(rli)
    self.assertEqual(next(i).uri, 'http://localhost/a')
    self.assertEqual(next(i).uri, 'http://localhost/b')
    self.assertEqual(next(i).uri, 'http://localhost/c')
    self.assertEqual(next(i).uri, 'http://localhost/d')
    # cleanup tempdir
    shutil.rmtree(tempdir)
def test07_hashes(self):
    """hashes() reports the set of hash types present on any resource."""
    res_a = Resource(uri='a')
    res_b = Resource(uri='b')
    resource_list = ResourceList()
    self.assertEqual(resource_list.hashes(), set())
    resource_list.add(res_a)
    resource_list.add(res_b)
    self.assertEqual(resource_list.hashes(), set())
    res_a.md5 = "aabbcc"
    self.assertEqual(resource_list.hashes(), set(['md5']))
    res_b.sha1 = "ddeeff"
    self.assertEqual(resource_list.hashes(), set(['md5', 'sha-1']))
def test08_iter(self):
    """Iteration yields the resources in canonical (URI) order."""
    resource_list = ResourceList()
    for uri, ts in (('a', 1), ('b', 2), ('c', 3), ('d', 4)):
        resource_list.add(Resource(uri, timestamp=ts))
    collected = [resource for resource in resource_list]
    self.assertEqual(len(collected), 4)
    self.assertEqual(collected[0].uri, 'a')
    self.assertEqual(collected[3].uri, 'd')
def test08_iter(self):
    """Iterating the list visits all four resources, first 'a', last 'd'."""
    rlist = ResourceList()
    rlist.add(Resource('a', timestamp=1))
    rlist.add(Resource('b', timestamp=2))
    rlist.add(Resource('c', timestamp=3))
    rlist.add(Resource('d', timestamp=4))
    seen = list(rlist)
    self.assertEqual(len(seen), 4)
    self.assertEqual(seen[0].uri, 'a')
    self.assertEqual(seen[3].uri, 'd')
def test06_add_iterable(self):
    """add() accepts an iterable; dupes raise unless replace=True."""
    res_a = Resource(uri='a', length=1)
    res_b = Resource(uri='b', length=2)
    resource_list = ResourceList()
    resource_list.add([res_a, res_b])
    self.assertRaises(ResourceListDupeError, resource_list.add, res_a)
    self.assertRaises(ResourceListDupeError, resource_list.add, res_b)
    # allow dupes
    replacement = Resource(uri='a', length=10)
    resource_list.add([replacement], replace=True)
    self.assertEqual(len(resource_list), 2)
    self.assertEqual(resource_list.resources['a'].length, 10)
def test06_add_iterable(self):
    """Adding a list of resources works; duplicate URIs need replace=True."""
    first = Resource(uri='a', length=1)
    second = Resource(uri='b', length=2)
    rlist = ResourceList()
    rlist.add([first, second])
    self.assertRaises(ResourceListDupeError, rlist.add, first)
    self.assertRaises(ResourceListDupeError, rlist.add, second)
    # allow dupes
    updated_first = Resource(uri='a', length=10)
    rlist.add([updated_first], replace=True)
    self.assertEqual(len(rlist), 2)
    self.assertEqual(rlist.resources['a'].length, 10)
def test_11_write_multifile(self):
    """Write a large resource list as a sitemapindex plus components.

    Fix: replaced the deprecated `assertEquals` alias (removed in
    Python 3.12) with `assertEqual` throughout.
    """
    tempdir = tempfile.mkdtemp(prefix='test_resource_list_multifile_dir')
    rl = ResourceList()
    rl.mapper = Mapper(['http://localhost/=%s/' % (tempdir)])
    rl.add(Resource(uri='http://localhost/a'))
    rl.add(Resource(uri='http://localhost/b'))
    rl.add(Resource(uri='http://localhost/c'))
    rl.add(Resource(uri='http://localhost/d'))
    rl.max_sitemap_entries = 2
    # first try writing without multifile allowed
    rl.allow_multifile = False
    self.assertRaises(ListBaseIndexError, rl.write,
                      basename=os.path.join(tempdir, 'sitemap.xml'))
    # second actually do it
    rl.allow_multifile = True
    rl.write(basename=os.path.join(tempdir, 'sitemap.xml'))
    # check the two component sitemaps
    rl1 = ResourceList()
    rl1.read(os.path.join(tempdir, 'sitemap00000.xml'))
    self.assertEqual(len(rl1), 2)
    self.assertEqual(rl1.capability, 'resourcelist')
    self.assertFalse(rl1.sitemapindex)
    i = iter(rl1)
    self.assertEqual(next(i).uri, 'http://localhost/a')
    self.assertEqual(next(i).uri, 'http://localhost/b')
    rl2 = ResourceList()
    rl2.read(os.path.join(tempdir, 'sitemap00001.xml'))
    self.assertEqual(len(rl2), 2)
    i = iter(rl2)
    self.assertEqual(next(i).uri, 'http://localhost/c')
    self.assertEqual(next(i).uri, 'http://localhost/d')
    # check the sitemapindex (read just as index)
    rli = ResourceList()
    rli.read(os.path.join(tempdir, 'sitemap.xml'), index_only=True)
    self.assertEqual(len(rli), 2)
    i = iter(rli)
    self.assertEqual(rli.capability, 'resourcelist')
    self.assertTrue(rli.sitemapindex)
    self.assertEqual(next(i).uri, 'http://localhost/sitemap00000.xml')
    self.assertEqual(next(i).uri, 'http://localhost/sitemap00001.xml')
    # check the sitemapindex and components
    rli = ResourceList(mapper=rl.mapper)
    rli.read(os.path.join(tempdir, 'sitemap.xml'))
    self.assertEqual(len(rli), 4)
    self.assertEqual(rli.capability, 'resourcelist')
    self.assertFalse(rli.sitemapindex)
    i = iter(rli)
    self.assertEqual(next(i).uri, 'http://localhost/a')
    self.assertEqual(next(i).uri, 'http://localhost/b')
    self.assertEqual(next(i).uri, 'http://localhost/c')
    self.assertEqual(next(i).uri, 'http://localhost/d')
    # cleanup tempdir
    shutil.rmtree(tempdir)
def test_07_print(self):
    """Full serialization of three resources with metadata block."""
    rlist = ResourceList(md={'capability': 'resourcelist', 'modified': None})
    rlist.add(Resource(uri='a', lastmod='2001-01-01', length=1234))
    rlist.add(Resource(uri='b', lastmod='2002-02-02', length=56789))
    rlist.add(Resource(uri='c', lastmod='2003-03-03', length=0))
    expected = "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><rs:md capability=\"resourcelist\" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url><url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length=\"0\" /></url></urlset>"
    self.assertEqual(Sitemap().resources_as_xml(rlist), expected)
def test33_write(self):
    """write() emits a urlset for a list, a sitemapindex for an index."""
    # Plain ResourceList -> urlset document
    resource_list = ResourceList()
    for name in ('a', 'b', 'c'):
        resource_list.add(Resource(uri='http://example.com/test/' + name,
                                   timestamp=1))
    list_path = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    resource_list.write(basename=list_path)
    with open(list_path, 'r') as fh:
        sitemap = Sitemap()
        sitemap.parse_xml(fh=fh)
    self.assertFalse(sitemap.parsed_index)
    # ResourceList flagged as index -> sitemapindex document
    index_list = ResourceList()
    for num in ('00000', '00001', '00002'):
        index_list.add(Resource(
            uri='http://example.com/test/resourcelist%s.xml' % num,
            timestamp=1))
    index_list.sitemapindex = True
    index_path = os.path.join(self.tmpdir,
                              'test33_write_resourcelist-index.xml')
    index_list.write(basename=index_path)
    with open(index_path, 'r') as fh:
        sitemap = Sitemap()
        sitemap.parse_xml(fh=fh)
    self.assertTrue(sitemap.parsed_index)
def test_build_ex_02(self):
    """Slightly more complex Resource List document """
    resource_list = ResourceList()
    resource_list.md_at = '2013-01-03T09:00:00Z'
    resource_list.add(Resource(uri='http://example.com/res1',
                               lastmod='2013-01-02T13:00:00Z',
                               md5='1584abdf8ebdc9802ac0c6a7402c03b6'))
    mirrored = Resource(uri='http://example.com/res2',
                        lastmod='2013-01-02T14:00:00Z',
                        md5='1e0d5cb8ef6ba40c99b14c0237be735e')
    # Second resource advertises a mirror copy via a duplicate link
    mirrored.link_set(rel="duplicate",
                      href="http://mirror.example.com/res2")
    resource_list.add(mirrored)
    expected = self._open_ex('resourcesync_ex_2').read()
    self._assert_xml_equal(resource_list.as_xml(), expected)
def test_build_ex_08(self):
    """Simple Resource List Index document

    This is not something that would usually be created directly
    but instead would be created as part of the process of
    writing a large Resource List in multiple files. However,
    it is possible to create manually.
    """
    index = ResourceList()
    index.sitemapindex = True
    index.md_at = '2013-01-03T09:00:00Z'
    for part in ('http://example.com/resourcelist-part1.xml',
                 'http://example.com/resourcelist-part2.xml'):
        index.add(Resource(uri=part))
    expected = self._open_ex('resourcesync_ex_8').read()
    self._assert_xml_equal(index.as_xml(), expected)
def test02_changed(self):
    """compare() reports both resources changed when timestamps differ."""
    src = ResourceList()
    src.add(Resource('a', timestamp=1))
    src.add(Resource('b', timestamp=2))
    dst = ResourceList()
    dst.add(Resource('a', timestamp=3))
    dst.add(Resource('b', timestamp=4))
    same, changed, deleted, added = dst.compare(src)
    self.assertEqual(len(same), 0, "0 things unchanged")
    self.assertEqual(len(changed), 2, "2 things changed")
    changed_iter = iter(changed)
    self.assertEqual(next(changed_iter).uri, 'a', "first was a")
    self.assertEqual(next(changed_iter).uri, 'b', "second was b")
    self.assertEqual(len(deleted), 0, "nothing deleted")
    self.assertEqual(len(added), 0, "nothing added")
def test02_changed(self):
    """Same URIs with different timestamps are classified as changed."""
    source = ResourceList()
    source.add(Resource('a', timestamp=1))
    source.add(Resource('b', timestamp=2))
    destination = ResourceList()
    destination.add(Resource('a', timestamp=3))
    destination.add(Resource('b', timestamp=4))
    (same, changed, deleted, added) = destination.compare(source)
    self.assertEqual(len(same), 0, "0 things unchanged")
    self.assertEqual(len(changed), 2, "2 things changed")
    it = iter(changed)
    self.assertEqual(next(it).uri, 'a', "first was a")
    self.assertEqual(next(it).uri, 'b', "second was b")
    self.assertEqual(len(deleted), 0, "nothing deleted")
    self.assertEqual(len(added), 0, "nothing added")
def test_build_ex_16(self):
    """Resource List with up/index links and typed resources."""
    resource_list = ResourceList()
    resource_list.up = 'http://example.com/dataset1/capabilitylist.xml'
    resource_list.index = 'http://example.com/dataset1/resourcelist-index.xml'
    resource_list.md_at = "2013-01-03T09:00:00Z"
    resource_list.add(Resource(
        uri='http://example.com/res3',
        lastmod='2013-01-02T13:00:00Z',
        md5='1584abdf8ebdc9802ac0c6a7402c8753',
        length=4385,
        mime_type="application/pdf"))
    resource_list.add(Resource(
        uri='http://example.com/res4',
        lastmod='2013-01-02T14:00:00Z',
        md5='4556abdf8ebdc9802ac0c6a7402c9881',
        length=883,
        mime_type="image/png"))
    expected = self._open_ex('resourcesync_ex_16').read()
    self._assert_xml_equal(resource_list.as_xml(), expected)
def test03_dump_multi_file_max_size(self):
    """Dump split into three zips when max_files caps each at 4 entries."""
    resource_list = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        resource_list.add(Resource('http://ex.org/%s' % (letter),
                                   path='tests/testdata/a_to_z/%s' % (letter)))
    self.assertEqual(len(resource_list), 12)
    # Use a large size limit but a small per-zip file count limit
    dump = Dump(resource_list)
    basename = os.path.join(self.tmpdir, 'test03_')
    dump.max_files = 4
    num_written = dump.write(basename)
    self.assertEqual(num_written, 3, 'expect to write 3 dump files')
    for suffix in ('00000.zip', '00001.zip', '00002.zip'):
        self.assertTrue(os.path.isfile(basename + suffix))
    # Inspect first zip in detail: manifest plus the first four resources
    first_zip = basename + '00000.zip'
    zf = zipfile.ZipFile(first_zip, 'r')
    self.assertEqual(zf.namelist(), ['manifest.xml', 'a', 'b', 'c', 'd'])
    # self.assertEqual( zo.getinfo('manifest.xml').file_size, 470 )
    self.assertEqual(zf.getinfo('a').file_size, 9)
    self.assertEqual(zf.getinfo('b').file_size, 1116)
    self.assertEqual(zf.getinfo('c').file_size, 32)
    self.assertEqual(zf.getinfo('d').file_size, 13)
    zf.close()
    os.unlink(first_zip)
    # Second and third zips hold the remaining resources in order
    zip_path = basename + '00001.zip'
    zf = zipfile.ZipFile(zip_path, 'r')
    self.assertEqual(zf.namelist(), ['manifest.xml', 'e', 'f', 'g', 'h'])
    zf.close()
    os.unlink(zip_path)
    zip_path = basename + '00002.zip'
    zf = zipfile.ZipFile(zip_path, 'r')
    self.assertEqual(zf.namelist(), ['manifest.xml', 'i', 'j', 'k', 'l'])
    zf.close()
    os.unlink(zip_path)
def test03_dump_multi_file_max_size(self):
    """Dump splits into three zips when max_files caps each at 4 entries.

    Fix: replaced the Python-2-only `xrange` with `range` so the test
    runs under Python 3.
    """
    rl = ResourceList()
    for letter in map(chr, range(ord('a'), ord('l') + 1)):
        uri = 'http://ex.org/%s' % (letter)
        fname = 'resync/test/testdata/a_to_z/%s' % (letter)
        rl.add(Resource(uri, path=fname))
    self.assertEqual(len(rl), 12)
    # Large size limit but small number of files limit
    d2 = Dump(rl)
    tmpbase = os.path.join(self.tmpdir, 'test03_')
    d2.max_files = 4
    n = d2.write(tmpbase)
    self.assertEqual(n, 3, 'expect to write 3 dump files')
    self.assertTrue(os.path.isfile(tmpbase + '00000.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00001.zip'))
    self.assertTrue(os.path.isfile(tmpbase + '00002.zip'))
    # Look at the first file in detail
    zipf = tmpbase + '00000.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'a', 'b', 'c', 'd'])
    # self.assertEqual( zo.getinfo('manifest.xml').file_size, 470 )
    self.assertEqual(zo.getinfo('a').file_size, 9)
    self.assertEqual(zo.getinfo('b').file_size, 1116)
    self.assertEqual(zo.getinfo('c').file_size, 32)
    self.assertEqual(zo.getinfo('d').file_size, 13)
    zo.close()
    os.unlink(zipf)
    # Check second and third files have expected contents
    zipf = tmpbase + '00001.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'e', 'f', 'g', 'h'])
    zo.close()
    os.unlink(zipf)
    zipf = tmpbase + '00002.zip'
    zo = zipfile.ZipFile(zipf, 'r')
    self.assertEqual(zo.namelist(), ['manifest.xml', 'i', 'j', 'k', 'l'])
    zo.close()
    os.unlink(zipf)
def test_build_ex_14(self):
    """Resource List with 2 entries and some metadata"""
    resource_list = ResourceList()
    resource_list.up = 'http://example.com/dataset1/capabilitylist.xml'
    resource_list.md_at = "2013-01-03T09:00:00Z"
    resource_list.md_completed = "2013-01-03T09:01:00Z"
    resource_list.add(Resource(
        uri='http://example.com/res1',
        lastmod='2013-01-02T13:00:00Z',
        md5='1584abdf8ebdc9802ac0c6a7402c03b6',
        length=8876,
        mime_type="text/html"))
    resource_list.add(Resource(
        uri='http://example.com/res2',
        lastmod='2013-01-02T14:00:00Z',
        md5='1e0d5cb8ef6ba40c99b14c0237be735e',
        sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
        length=14599,
        mime_type="application/pdf"))
    expected = self._open_ex('resourcesync_ex_14').read()
    self._assert_xml_equal(resource_list.as_xml(), expected)
def test06_add_changed_resources(self):
    """add_changed_resources() overrides the change type and merges dupes.

    Fix: corrected the typo "createdd" in two assertion messages (and
    one comment) to "created".
    """
    added = ResourceList()
    added.add(Resource('a', timestamp=1, change='created'))
    added.add(Resource('d', timestamp=4, change='created'))
    self.assertEqual(len(added), 2, "2 things in added resource_list")
    changes = ChangeList()
    changes.add_changed_resources(added, change='created')
    self.assertEqual(len(changes), 2, "2 things added")
    i = iter(changes)
    first = next(i)
    self.assertEqual(first.uri, 'a', "changes[0].uri=a")
    self.assertEqual(first.timestamp, 1, "changes[0].timestamp=1")
    self.assertEqual(first.change, 'created')  # , "changes[0].change=created")
    second = next(i)
    self.assertEqual(second.timestamp, 4, "changes[1].timestamp=4")
    self.assertEqual(second.change, 'created', "changes[1].change=created")
    # Now add some with updated (one same, one diff)
    updated = ResourceList()
    updated.add(Resource('a', timestamp=5, change='created'))
    updated.add(Resource('b', timestamp=6, change='created'))
    self.assertEqual(len(updated), 2, "2 things in updated resource_list")
    changes.add_changed_resources(updated, change='updated')
    self.assertEqual(len(changes), 4, "4 = 2 old + 2 things updated")
    # Make new resource_list from the changes which should not have dupes
    dst = ResourceList()
    dst.add(changes, replace=True)
    self.assertEqual(len(dst), 3, "3 unique resources")
    self.assertEqual(dst.resources['a'].timestamp, 5)  # 5 was later than the 1
    self.assertEqual(dst.resources['a'].change, 'updated')
    self.assertEqual(dst.resources['b'].timestamp, 6)
    self.assertEqual(dst.resources['b'].change, 'updated')
    self.assertEqual(dst.resources['d'].timestamp, 4)
    self.assertEqual(dst.resources['d'].change, 'created')
def test33_write(self):
    """Sitemap parser distinguishes urlset from sitemapindex output."""
    # A plain ResourceList writes a urlset document
    rlist = ResourceList()
    rlist.add(Resource(uri='http://example.com/test/a', timestamp=1))
    rlist.add(Resource(uri='http://example.com/test/b', timestamp=1))
    rlist.add(Resource(uri='http://example.com/test/c', timestamp=1))
    list_file = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    rlist.write(basename=list_file)
    with open(list_file, 'r') as fh:
        parser = Sitemap()
        parser.parse_xml(fh=fh)
    self.assertFalse(parser.parsed_index)
    # With sitemapindex=True the same call writes a sitemapindex
    rindex = ResourceList()
    rindex.add(Resource(uri='http://example.com/test/resourcelist00000.xml',
                        timestamp=1))
    rindex.add(Resource(uri='http://example.com/test/resourcelist00001.xml',
                        timestamp=1))
    rindex.add(Resource(uri='http://example.com/test/resourcelist00002.xml',
                        timestamp=1))
    rindex.sitemapindex = True
    index_file = os.path.join(self.tmpdir,
                              'test33_write_resourcelist-index.xml')
    rindex.write(basename=index_file)
    with open(index_file, 'r') as fh:
        parser = Sitemap()
        parser.parse_xml(fh=fh)
    self.assertTrue(parser.parsed_index)
def test_build_ex_15(self):
    """Resource List Index with metadata"""
    # Ordered container because the example's order is non-canonical
    index = ResourceList(resources_class=ResourceListOrdered)
    index.sitemapindex = True
    index.up = 'http://example.com/dataset1/capabilitylist.xml'
    index.md_at = "2013-01-03T09:00:00Z"
    index.md_completed = "2013-01-03T09:10:00Z"
    index.add(Resource(uri='http://example.com/resourcelist1.xml',
                       md_at='2013-01-03T09:00:00Z'))
    index.add(Resource(uri='http://example.com/resourcelist2.xml',
                       md_at='2013-01-03T09:03:00Z'))
    index.add(Resource(uri='http://example.com/resourcelist3.xml',
                       md_at='2013-01-03T09:07:00Z'))
    expected = self._open_ex('resourcesync_ex_15').read()
    self._assert_xml_equal(index.as_xml(), expected)
def test03_deleted(self):
    """compare() reports extra destination resources as deleted.

    Fix: replaced the Python-2-only `i.next()` iterator calls with the
    builtin `next(i)` so the test runs under Python 3.
    """
    src = ResourceList()
    src.add(Resource('a', timestamp=1))
    src.add(Resource('b', timestamp=2))
    dst = ResourceList()
    dst.add(Resource('a', timestamp=1))
    dst.add(Resource('b', timestamp=2))
    dst.add(Resource('c', timestamp=3))
    dst.add(Resource('d', timestamp=4))
    (same, changed, deleted, added) = dst.compare(src)
    self.assertEqual(len(same), 2, "2 things unchanged")
    self.assertEqual(len(changed), 0, "nothing changed")
    self.assertEqual(len(deleted), 2, "c and d deleted")
    i = iter(deleted)
    self.assertEqual(next(i).uri, 'c', "first was c")  # was i.next()
    self.assertEqual(next(i).uri, 'd', "second was d")  # was i.next()
    self.assertEqual(len(added), 0, "nothing added")
from resync.resource_list import ResourceList
from resync.resource import Resource
from resync.sitemap import Sitemap

# Build a two-resource list and show its pretty-printed XML serialization.
# Fix: converted the Python-2 print statement to the print() function.
rl = ResourceList()
rl.add(Resource('http://example.com/res1', lastmod='2013-01-01'))
rl.add(Resource('http://example.com/res2', lastmod='2013-01-02'))
print(rl.as_xml(pretty_xml=True))
rl = ResourceList()
timestamps = []
for filename in listdir(args.resource_dir):
    # Only rdfpatch-<timestamp> files contribute to the resource list
    if filename[:len("rdfpatch-")] != "rdfpatch-":
        continue
    _, raw_ts = filename.split("-")
    # Expand compact timestamp (YYYYMMDDhhmmss) to W3C datetime form
    ts = (raw_ts[:4] + "-" + raw_ts[4:6] + "-" + raw_ts[6:8] + "T"
          + raw_ts[8:10] + ":" + raw_ts[10:12] + ":" + raw_ts[12:14] + "Z")
    timestamps.append(ts)
    rl.add(Resource(args.resource_url + filename, lastmod=ts))
# Print to file at args.resource_dir + "/resource-list.xml"
# Fix: Python-2 print statement -> print(); `with` guarantees the file
# is closed even if the write raises.
with open(args.resource_dir + "/resource-list.xml", "w") as resource_list_file:
    resource_list_file.write(rl.as_xml())
print("Wrote resource list to: " + args.resource_dir + "/resource-list.xml")
timestamps.sort()
caps = CapabilityList()
caps.add_capability(rl, args.resource_url + "resource-list.xml")
if len(timestamps) > 0:
    # Earliest patch timestamp marks the start of coverage
    caps.md['from'] = timestamps[0]
# Print to file at args.resource_dir + "/capability-list.xml"
from resync.resource_list import ResourceList
from resync.resource import Resource
from resync.sitemap import Sitemap

# Build a two-resource list and show its pretty-printed XML serialization.
# Fix: converted the Python-2 print statement to the print() function.
rl = ResourceList()
rl.add(Resource('http://example.com/res1', lastmod='2013-01-01'))
rl.add(Resource('http://example.com/res2', lastmod='2013-01-02'))
print(rl.as_xml(pretty_xml=True))
# Normalize the base URL and validate the source directory up front.
if args.resource_url[-1] != '/':
    args.resource_url += '/'
if not isdir(args.resource_dir):
    raise IOError(args.resource_dir + " is not a directory")
rl = ResourceList()
timestamps = []
for filename in listdir(args.resource_dir):
    # Only rdfpatch-<timestamp> files contribute to the resource list
    if filename[:len("rdfpatch-")] != "rdfpatch-":
        continue
    _, raw_ts = filename.split("-")
    # Expand compact timestamp (YYYYMMDDhhmmss) to W3C datetime form
    ts = (raw_ts[:4] + "-" + raw_ts[4:6] + "-" + raw_ts[6:8] + "T"
          + raw_ts[8:10] + ":" + raw_ts[10:12] + ":" + raw_ts[12:14] + "Z")
    timestamps.append(ts)
    rl.add(Resource(args.resource_url + filename, lastmod=ts))
# Print to file at args.resource_dir + "/resource-list.xml"
# Fix: Python-2 print statement -> print(); `with` guarantees the file
# is closed even if the write raises.
with open(args.resource_dir + "/resource-list.xml", "w") as resource_list_file:
    resource_list_file.write(rl.as_xml())
print("Wrote resource list to: " + args.resource_dir + "/resource-list.xml")
timestamps.sort()
caps = CapabilityList()
caps.add_capability(rl, args.resource_url + "resource-list.xml")
if len(timestamps) > 0:
    # Earliest patch timestamp marks the start of coverage
    caps.md['from'] = timestamps[0]
# Print to file at args.resource_dir + "/capability-list.xml"
def test11_bad_size(self):
    """Wrong declared length only raises when lengths are checked."""
    rlist = ResourceList()
    rlist.add(Resource('http://ex.org/a', length=9999,
                       path='resync/test/testdata/a'))
    dump = Dump(rlist)
    self.assertTrue(dump.check_files(check_length=False))
    self.assertRaises(DumpError, dump.check_files)
def test10_no_path(self):
    """A resource without a local path makes check_files() raise."""
    rlist = ResourceList()
    rlist.add(Resource('http://ex.org/a', length=7,
                       path='resync/test/testdata/a'))
    rlist.add(Resource('http://ex.org/b', length=21))
    dump = Dump(rlist)
    self.assertRaises(DumpError, dump.check_files)