def test_md5(self):
    from ambry.run import get_runconfig, RunConfig
    from ambry.cache import new_cache
    from ambry.util import md5_for_file
    from ambry.cache.filesystem import make_metadata

    rc = get_runconfig((os.path.join(self.bundle_dir, "test-run-config.yaml"),
                        RunConfig.USER_CONFIG))

    fn = self.make_test_file()

    md5 = md5_for_file(fn)

    cache = new_cache(rc.filesystem("fscache"))

    cache.put(fn, "foo1")
    abs_path = cache.path("foo1")
    self.assertEquals(md5, cache.md5("foo1"))

    cache = new_cache(rc.filesystem("compressioncache"))

    cache.put(fn, "foo2", metadata=make_metadata(fn))
    abs_path = cache.path("foo2")
    self.assertEquals(md5, cache.md5("foo2"))

    os.remove(fn)
def md5(self, rel_path):
    from ambry.util import md5_for_file

    abs_path = self.path(rel_path)

    if not abs_path:
        return None

    return md5_for_file(abs_path)
def fs_hash(self):
    from ambry.util import md5_for_file

    if not self.exists():
        return None

    fn_path = self.file_name

    with self._fs.open(fn_path, mode='rb') as f:
        return md5_for_file(f)
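# The two snippets above pass md5_for_file either a path (md5) or an
# already-open binary stream (fs_hash). A minimal sketch of such a helper,
# assuming chunked hashing; the block size and the path/stream dispatch are
# assumptions, not the actual ambry.util implementation.
import hashlib

def md5_for_file(f, block_size=2 ** 20):
    md5 = hashlib.md5()

    # Open the file ourselves when given a path; fs_hash() above passes an
    # already-open binary stream instead. (Python 2 basestring check.)
    opened = isinstance(f, basestring)
    flo = open(f, 'rb') if opened else f

    try:
        # Hash in fixed-size chunks so large bundle files are never read
        # into memory all at once.
        while True:
            chunk = flo.read(block_size)
            if not chunk:
                break
            md5.update(chunk)
    finally:
        if opened:
            flo.close()

    return md5.hexdigest()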
def test_caches(self):
    '''Basic test of put(), get() and has() for all cache types'''
    from functools import partial
    from ambry.run import get_runconfig, RunConfig
    from ambry.filesystem import Filesystem
    from ambry.cache import new_cache
    from ambry.util import md5_for_file
    from ambry.bundle import DbBundle

    #self.start_server()  # For the rest-cache

    #fn = '/tmp/1mbfile'
    #with open(fn, 'wb') as f:
    #    f.write('.'*(1024))

    fn = self.bundle.database.path

    # Opening the file might run the database updates in
    # database.sqlite._on_connect_update_schema, which can affect the md5.
    b = DbBundle(fn)

    md5 = md5_for_file(fn)

    print "MD5 {} = {}".format(fn, md5)

    rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                        RunConfig.USER_ACCOUNTS))

    for i, fsname in enumerate(['fscache', 'limitedcache', 'compressioncache',
                                'cached-s3', 'cached-compressed-s3']):

        config = rc.filesystem(fsname)
        cache = new_cache(config)
        print '---', fsname, cache

        identity = self.bundle.identity
        relpath = identity.cache_key

        r = cache.put(fn, relpath, identity.to_meta(md5=md5))
        r = cache.get(relpath)

        if not r.startswith('http'):
            self.assertTrue(os.path.exists(r), 'Not a url: {}: {}'.format(r, str(cache)))

        self.assertTrue(cache.has(relpath, md5=md5))

        cache.remove(relpath, propagate=True)

        self.assertFalse(os.path.exists(r), str(cache))
        self.assertFalse(cache.has(relpath))

    cache = new_cache(rc.filesystem('s3cache-noupstream'))
    r = cache.put(fn, 'a')
def test_caches(self):
    '''Basic test of put(), get() and has() for all cache types'''
    from ambry.run import get_runconfig, RunConfig
    from ambry.cache import new_cache
    from ambry.util import md5_for_file
    from ambry.bundle import DbBundle

    self.start_server()  # For the rest-cache

    #fn = '/tmp/1mbfile'
    #with open(fn, 'wb') as f:
    #    f.write('.'*(1024))

    fn = self.bundle.database.path

    # Opening the file might run the database updates in
    # database.sqlite._on_connect_update_schema, which can affect the md5.
    b = DbBundle(fn)

    md5 = md5_for_file(fn)

    print "MD5 {} = {}".format(fn, md5)

    rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                        RunConfig.USER_CONFIG))

    for i, fsname in enumerate(['fscache', 'limitedcache', 'compressioncache',
                                'cached-s3', 'cached-compressed-s3']):

        config = rc.filesystem(fsname)
        cache = new_cache(config)
        print '---', fsname, cache

        identity = self.bundle.identity
        relpath = identity.cache_key

        r = cache.put(fn, relpath, identity.to_meta(md5=md5))
        r = cache.get(relpath)

        if not r.startswith('http'):
            self.assertTrue(os.path.exists(r), str(cache))

        self.assertTrue(cache.has(relpath, md5=md5))

        cache.remove(relpath, propagate=True)

        self.assertFalse(os.path.exists(r), str(cache))
        self.assertFalse(cache.has(relpath))

    cache = new_cache(rc.filesystem('s3cache-noupstream'))
    r = cache.put(fn, 'a')
def has(self, rel_path, md5=None, propagate=True):
    from . import md5_for_file

    abs_path = os.path.join(self.cache_dir, rel_path)

    if os.path.exists(abs_path) and (not md5 or md5 == md5_for_file(abs_path)):
        return abs_path

    if self.upstream and propagate:
        return self.upstream.has(rel_path, md5=md5, propagate=propagate)

    return False
def has(self, rel_path, md5=None, use_upstream=True):
    from ..util import md5_for_file

    abs_path = os.path.join(self.cache_dir, rel_path)

    if os.path.exists(abs_path) and (not md5 or md5 == md5_for_file(abs_path)):
        return abs_path

    if self.upstream and use_upstream:
        return self.upstream.has(rel_path, md5=md5, use_upstream=use_upstream)

    return False
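# A hedged usage sketch of the md5-guarded lookup above. The cache config
# name, relative key, and local file path are hypothetical, not from the
# source; rc is a run config as in the tests above.
from ambry.cache import new_cache
from ambry.util import md5_for_file

cache = new_cache(rc.filesystem('fscache'))
expected_md5 = md5_for_file('/tmp/example-bundle.db')

# has() returns the local absolute path on a hit (verifying the md5 when one
# is given) and falls back to the upstream cache when one is configured.
if cache.has('example/bundle.db', md5=expected_md5):
    abs_path = cache.get('example/bundle.db')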
def _send(self, package, extract_data, file_):
    import os
    import mimetypes

    _, ext = os.path.splitext(file_)
    mimetypes.init()
    content_type = mimetypes.types_map.get(ext, None)  # @UndefinedVariable

    # content_type is None for unknown extensions and may lack a '/'.
    try:
        _, format = content_type.split('/')
    except (AttributeError, ValueError):
        format = None

    name = extract_data.get('name', os.path.basename(file_))

    #
    # If the filestore exists, write to S3 first, then upload the URL
    if self.filestore:
        from ambry.util import md5_for_file

        urlf = self.filestore.public_url_f(public=True)
        path = self.bundle.identity.path + '/' + name

        # Don't upload if S3 has a file with the same key and md5
        md5 = md5_for_file(file_)

        if not self.filestore.has(path, md5=md5):
            self.filestore.put(file_, path, metadata={'public': True, 'md5': md5})

        r = self.remote.add_url_resource(
            package, urlf(path), name,
            description=extract_data['description'],
            content_type=content_type,
            format=format,
            hash=md5,
            rel_path=path)
    else:
        r = self.remote.add_file_resource(
            package, file_,
            name=name,
            description=extract_data['description'],
            content_type=content_type,
            format=format)

    return r
def test_compression(self):
    from ambry.run import get_runconfig, RunConfig
    from ambry.cache import new_cache
    from ambry.util import temp_file_name, md5_for_file, copy_file_or_flo

    rc = get_runconfig((os.path.join(self.bundle_dir, "test-run-config.yaml"),
                        RunConfig.USER_CONFIG))

    comp_cache = new_cache(rc.filesystem("compressioncache"))

    test_file_name = "test_file"

    fn = temp_file_name()
    print "orig file ", fn
    with open(fn, "wb") as f:
        for i in range(1000):
            f.write("{:03d}:".format(i))

    cf = comp_cache.put(fn, test_file_name)

    with open(cf) as stream:
        from ambry.util.sgzip import GzipFile
        stream = GzipFile(stream)

        uncomp_cache = new_cache(rc.filesystem("fscache"))
        uncomp_stream = uncomp_cache.put_stream("decomp")

        copy_file_or_flo(stream, uncomp_stream)

        uncomp_stream.close()

    dcf = uncomp_cache.get("decomp")

    self.assertEquals(md5_for_file(fn), md5_for_file(dcf))

    os.remove(fn)
def _record_file(self, url, out_file, process=File.PROCESS.DOWNLOADED):
    import os
    from ambry.util import md5_for_file
    from time import time
    from sqlalchemy.orm.exc import NoResultFound

    # Create a file record of the download
    try:
        fr = (self.bundle.database.session.query(File)
              .filter(File.path == url)
              .filter(File.ref == str(self.bundle.identity.vid))
              .filter(File.type_ == File.TYPE.DOWNLOAD).one())
    except NoResultFound:
        self.bundle.database.session.rollback()
        fr = None

    content_hash = md5_for_file(out_file)
    stat = os.stat(out_file)

    if fr:
        if content_hash != fr.hash:
            fr.modified = int(stat.st_mtime)
            fr.hash = content_hash
            fr.process = File.PROCESS.MODIFIED
        else:
            fr.process = File.PROCESS.UNMODIFIED
    else:
        fr = File(path=url,
                  ref=str(self.bundle.identity.vid),
                  type=File.TYPE.DOWNLOAD,
                  modified=int(stat.st_mtime),
                  size=stat.st_size,
                  hash=content_hash,
                  process=process,
                  source_url=url,
                  data=dict())

    fr.data['last_download'] = time()

    self.bundle.database.session.merge(fr)
    self.bundle.database.session.commit()
def make_metadata(file_name):
    from ambry.util import md5_for_file

    return dict(md5=md5_for_file(file_name))
def md5(self):
    from ambry.util import md5_for_file

    return md5_for_file(self.path)
def test_load(self):
    from ambry.run import get_runconfig, RunConfig
    from ambry.library import new_library
    from ambry.client.rest import RemoteLibrary
    from ambry.cache import new_cache
    from ambry.util import md5_for_file
    from ambry.identity import Identity

    config = self.start_server()
    l = new_library(config)
    rl = RemoteLibrary(self.server_url)

    #
    # Check that the library can list datasets that are inserted externally
    #

    l.put_bundle(self.bundle)

    s = set([i.fqname for i in rl.list().values()])

    self.assertIn('source-dataset-subset-variation-0.0.1~diEGPXmDC8001', s)

    dsident = rl.dataset('diEGPXmDC8001')

    s = set([i.fqname for i in dsident.partitions.values()])

    self.assertEquals(4, len(s))
    self.assertIn('source-dataset-subset-variation-tthree-0.0.1~piEGPXmDC8003001', s)
    self.assertIn('source-dataset-subset-variation-geot1-geo-0.0.1~piEGPXmDC8001001', s)
    self.assertIn('source-dataset-subset-variation-geot2-geo-0.0.1~piEGPXmDC8002001', s)

    #
    # Upload the dataset to S3, clear the library, then load it back in
    #

    rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                        RunConfig.USER_ACCOUNTS))

    cache = new_cache(rc.filesystem('cached-compressed-s3'))

    fn = self.bundle.database.path
    identity = self.bundle.identity
    relpath = identity.cache_key

    r = cache.put(fn, relpath, identity.to_meta(file=fn))

    self.assertTrue(bool(cache.has(relpath)))

    # Clear the library.
    l.purge()

    self.assertNotIn('source-dataset-subset-variation-0.0.1~diEGPXmDC8001',
                     set([i.fqname for i in rl.list().values()]))

    # Load from S3, directly into the local library
    identity.add_md5(md5_for_file(fn))

    l.load(identity.cache_key, identity.md5)

    self.assertIn('source-dataset-subset-variation-0.0.1~diEGPXmDC8001',
                  set([i.fqname for i in rl.list().values()]))

    # Do it one more time, using the remote library
    l.purge()

    self.assertNotIn('source-dataset-subset-variation-0.0.1~diEGPXmDC8001',
                     set([i.fqname for i in rl.list().values()]))

    rl.load_dataset(identity)

    self.assertIn('source-dataset-subset-variation-0.0.1~diEGPXmDC8001',
                  set([i.fqname for i in rl.list().values()]))

    # Check that we can get the record from the library
    self.assertEquals(identity.vid, rl.resolve(identity.vid).vid)
    self.assertEquals(identity.vid, rl.resolve(identity.vname).vid)
    self.assertEquals(identity.vid, rl.resolve(identity.cache_key).vid)
    self.assertEquals(identity.vid, rl.resolve(identity.sname).vid)