Ejemplo n.º 1
0
    def test_md5(self):
        """Check that cache.md5() agrees with md5_for_file() on the source.

        The same test file is put into the "fscache" and "compressioncache"
        filesystems declared in test-run-config.yaml, and the digest reported
        by each cache is compared with the digest of the original file.
        """
        from ambry.run import get_runconfig
        from ambry.cache import new_cache
        from ambry.util import md5_for_file
        from ambry.cache.filesystem import make_metadata

        rc = get_runconfig((os.path.join(self.bundle_dir, "test-run-config.yaml"), RunConfig.USER_CONFIG))

        fn = self.make_test_file()

        # Clean up in a finally block so that a failing assertion does not
        # leave the test file behind.
        try:
            md5 = md5_for_file(fn)

            cache = new_cache(rc.filesystem("fscache"))
            cache.put(fn, "foo1")
            # path() is still exercised as before; its result is not needed.
            cache.path("foo1")
            self.assertEqual(md5, cache.md5("foo1"))

            cache = new_cache(rc.filesystem("compressioncache"))
            cache.put(fn, "foo2", metadata=make_metadata(fn))
            cache.path("foo2")
            self.assertEqual(md5, cache.md5("foo2"))
        finally:
            os.remove(fn)
Ejemplo n.º 2
0
    def md5(self, rel_path):
        """Return the md5 of the cached file stored under rel_path.

        The location is resolved with self.path(); when that yields a falsy
        value, None is returned instead of a digest.
        """
        from ambry.util import md5_for_file

        located = self.path(rel_path)

        return md5_for_file(located) if located else None
Ejemplo n.º 3
0
    def fs_hash(self):
        """Return the md5 of this object's file, or None if it does not exist.

        The file named by self.file_name is opened in binary mode through
        self._fs and the open stream is handed to md5_for_file().
        """
        from ambry.util import md5_for_file

        if self.exists():
            with self._fs.open(self.file_name, mode='rb') as stream:
                return md5_for_file(stream)

        return None
Ejemplo n.º 4
0
    def fs_hash(self):
        """Hash the backing file with md5_for_file().

        Returns None when self.exists() is falsy; otherwise the file at
        self.file_name is opened ('rb') via self._fs and the digest of that
        stream is returned.
        """
        from ambry.util import md5_for_file

        if self.exists():
            target = self.file_name
            with self._fs.open(target, mode='rb') as handle:
                return md5_for_file(handle)

        return None
Ejemplo n.º 5
0
    def test_caches(self):
        """Basic test of put(), get(), has() and remove() for all cache types.

        For every filesystem name listed below, the bundle database file is
        put into the cache, fetched back, checked with has() against its md5,
        removed, and then checked to be gone.
        """
        from ambry.run import get_runconfig, RunConfig
        from ambry.cache import new_cache
        from ambry.util import md5_for_file
        from ambry.bundle import DbBundle

        fn = self.bundle.database.path

        # Opening the file might run the database updates in
        # database.sqlite._on_connect_update_schema, which can affect the md5,
        # so the bundle is opened before the file is hashed. The instance
        # itself is not used afterwards.
        b = DbBundle(fn)

        md5 = md5_for_file(fn)

        print("MD5 {}  = {}".format(fn, md5))

        rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                            RunConfig.USER_ACCOUNTS))

        for fsname in ('fscache', 'limitedcache', 'compressioncache',
                       'cached-s3', 'cached-compressed-s3'):

            config = rc.filesystem(fsname)
            cache = new_cache(config)
            print('--- {} {}'.format(fsname, cache))
            identity = self.bundle.identity

            relpath = identity.cache_key

            # The return value of put() is not needed; get() supplies the
            # location that is checked below.
            cache.put(fn, relpath, identity.to_meta(md5=md5))

            r = cache.get(relpath)

            # Results starting with 'http' are not checked on the local disk.
            if not r.startswith('http'):
                self.assertTrue(os.path.exists(r), 'Not a url: {}: {}'.format(r, str(cache)))

            self.assertTrue(cache.has(relpath, md5=md5))

            cache.remove(relpath, propagate=True)

            self.assertFalse(os.path.exists(r), str(cache))
            self.assertFalse(cache.has(relpath))

        # Finally, make sure a put() into the 's3cache-noupstream' filesystem
        # completes without raising.
        cache = new_cache(rc.filesystem('s3cache-noupstream'))
        cache.put(fn, 'a')
Ejemplo n.º 6
0
    def test_caches(self):
        """Basic test of put(), get(), has() and remove() for all cache types.

        For every filesystem name listed below, the bundle database file is
        put into the cache, fetched back, checked with has() against its md5,
        removed, and then checked to be gone.
        """
        from ambry.run import get_runconfig
        from ambry.cache import new_cache
        from ambry.util import md5_for_file
        from ambry.bundle import DbBundle

        self.start_server()  # For the rest-cache

        fn = self.bundle.database.path

        # Opening the file might run the database updates in
        # database.sqlite._on_connect_update_schema, which can affect the md5,
        # so the bundle is opened before the file is hashed. The instance
        # itself is not used afterwards.
        b = DbBundle(fn)

        md5 = md5_for_file(fn)

        print("MD5 {}  = {}".format(fn, md5))

        rc = get_runconfig(
            (os.path.join(self.bundle_dir,
                          'test-run-config.yaml'), RunConfig.USER_CONFIG))

        for fsname in ('fscache', 'limitedcache', 'compressioncache',
                       'cached-s3', 'cached-compressed-s3'):

            config = rc.filesystem(fsname)
            cache = new_cache(config)
            print('--- {} {}'.format(fsname, cache))
            identity = self.bundle.identity

            relpath = identity.cache_key

            # The return value of put() is not needed; get() supplies the
            # location that is checked below.
            cache.put(fn, relpath, identity.to_meta(md5=md5))
            r = cache.get(relpath)

            # Results starting with 'http' are not checked on the local disk.
            if not r.startswith('http'):
                self.assertTrue(os.path.exists(r), str(cache))

            self.assertTrue(cache.has(relpath, md5=md5))

            cache.remove(relpath, propagate=True)

            self.assertFalse(os.path.exists(r), str(cache))
            self.assertFalse(cache.has(relpath))

        # Finally, make sure a put() into the 's3cache-noupstream' filesystem
        # completes without raising.
        cache = new_cache(rc.filesystem('s3cache-noupstream'))
        cache.put(fn, 'a')
Ejemplo n.º 7
0
    def has(self, rel_path, md5=None, propagate=True):
        """Report whether rel_path is available in this cache or upstream.

        Returns the absolute path under self.cache_dir when the file exists
        there and either no md5 was given or the file's md5 matches. Otherwise,
        when there is an upstream and propagate is true, the upstream's has()
        result is returned. In every other case the result is False.
        """
        from . import md5_for_file

        local_path = os.path.join(self.cache_dir, rel_path)

        if os.path.exists(local_path):
            # An md5 is only verified when the caller supplied one.
            if not md5 or md5 == md5_for_file(local_path):
                return local_path

        if not (self.upstream and propagate):
            return False

        return self.upstream.has(rel_path, md5=md5, propagate=propagate)
Ejemplo n.º 8
0
    def has(self, rel_path, md5=None, use_upstream=True):
        """Report whether rel_path is available in this cache or upstream.

        Returns the absolute path under self.cache_dir when the file exists
        there and either no md5 was given or the file's md5 matches. Otherwise,
        when there is an upstream and use_upstream is true, the upstream's
        has() result is returned. In every other case the result is False.
        """
        from ..util import md5_for_file

        candidate = os.path.join(self.cache_dir, rel_path)

        if os.path.exists(candidate):
            # An md5 is only verified when the caller supplied one.
            if not md5 or md5 == md5_for_file(candidate):
                return candidate

        if not (self.upstream and use_upstream):
            return False

        return self.upstream.has(rel_path, md5=md5, use_upstream=use_upstream)
Ejemplo n.º 9
0
    def test_caches(self):
        """Basic test of put(), get(), has() and remove() for all cache types.

        For every filesystem name listed below, the bundle database file is
        put into the cache, fetched back, checked with has() against its md5,
        removed, and then checked to be gone.
        """
        from ambry.run import get_runconfig
        from ambry.cache import new_cache
        from ambry.util import md5_for_file
        from ambry.bundle import DbBundle

        self.start_server()  # For the rest-cache

        fn = self.bundle.database.path

        # Opening the file might run the database updates in
        # database.sqlite._on_connect_update_schema, which can affect the md5,
        # so the bundle is opened before the file is hashed. The instance
        # itself is not used afterwards.
        b = DbBundle(fn)

        md5 = md5_for_file(fn)

        print("MD5 {}  = {}".format(fn, md5))

        rc = get_runconfig((os.path.join(self.bundle_dir, "test-run-config.yaml"), RunConfig.USER_CONFIG))

        for fsname in ("fscache", "limitedcache", "compressioncache", "cached-s3", "cached-compressed-s3"):

            config = rc.filesystem(fsname)
            cache = new_cache(config)
            print("--- {} {}".format(fsname, cache))
            identity = self.bundle.identity

            relpath = identity.cache_key

            # The return value of put() is not needed; get() supplies the
            # location that is checked below.
            cache.put(fn, relpath, identity.to_meta(md5=md5))
            r = cache.get(relpath)

            # Results starting with "http" are not checked on the local disk.
            if not r.startswith("http"):
                self.assertTrue(os.path.exists(r), str(cache))

            self.assertTrue(cache.has(relpath, md5=md5))

            cache.remove(relpath, propagate=True)

            self.assertFalse(os.path.exists(r), str(cache))
            self.assertFalse(cache.has(relpath))

        # Finally, make sure a put() into the "s3cache-noupstream" filesystem
        # completes without raising.
        cache = new_cache(rc.filesystem("s3cache-noupstream"))
        cache.put(fn, "a")
Ejemplo n.º 10
0
    def _send(self, package, extract_data, file_):
        """Register file_ as a resource of package on the remote.

        When self.filestore is set, the file is first put into the filestore
        (skipped if the filestore already has the same path and md5) and the
        remote receives a URL resource pointing at it. Otherwise the file is
        handed to the remote directly as a file resource.

        :param package: passed through unchanged to the remote.
        :param extract_data: mapping; 'name' is optional and defaults to the
            base name of file_, 'description' is required (KeyError if absent).
        :param file_: path of the file to send.
        :return: whatever the remote's add_url_resource() or
            add_file_resource() call returns.
        """
        import os
        import mimetypes

        _, ext = os.path.splitext(file_)
        mimetypes.init()
        content_type = mimetypes.types_map.get(ext, None)  # @UndefinedVariable

        # Unknown extensions give content_type None (AttributeError on split);
        # a value without exactly one '/' fails to unpack (ValueError). Only
        # these two are swallowed, so unrelated errors are no longer hidden.
        try:
            _, resource_format = content_type.split('/')
        except (AttributeError, ValueError):
            resource_format = None

        name = extract_data.get('name', os.path.basename(file_))

        #
        # If the filestore exists, write to S3 first, then upload the URL
        if self.filestore:
            from ambry.util import md5_for_file
            urlf = self.filestore.public_url_f(public=True)
            path = self.bundle.identity.path + '/' + name

            # Don't upload if S3 has the file of the same key and md5
            md5 = md5_for_file(file_)
            if not self.filestore.has(path, md5=md5):
                self.filestore.put(
                    file_,
                    path,
                    metadata={
                        'public': True,
                        'md5': md5})

            r = self.remote.add_url_resource(
                package,
                urlf(path),
                name,
                description=extract_data['description'],
                content_type=content_type,
                format=resource_format,
                hash=md5,
                rel_path=path)
        else:
            r = self.remote.add_file_resource(
                package,
                file_,
                name=name,
                description=extract_data['description'],
                content_type=content_type,
                format=resource_format)

        return r
Ejemplo n.º 11
0
    def _send(self, package, extract_data, file_):
        """Register file_ as a resource of package on the remote.

        When self.filestore is set, the file is first put into the filestore
        (skipped if the filestore already has the same path and md5) and the
        remote receives a URL resource pointing at it. Otherwise the file is
        handed to the remote directly as a file resource.

        :param package: passed through unchanged to the remote.
        :param extract_data: mapping; 'name' is optional and defaults to the
            base name of file_, 'description' is required (KeyError if absent).
        :param file_: path of the file to send.
        :return: whatever the remote's add_url_resource() or
            add_file_resource() call returns.
        """
        import os
        import mimetypes

        _, ext = os.path.splitext(file_)
        mimetypes.init()
        content_type = mimetypes.types_map.get(ext, None)  # @UndefinedVariable

        # Unknown extensions give content_type None (AttributeError on split);
        # a value without exactly one '/' fails to unpack (ValueError). Only
        # these two are swallowed, so unrelated errors are no longer hidden.
        try:
            _, resource_format = content_type.split('/')
        except (AttributeError, ValueError):
            resource_format = None

        name = extract_data.get('name', os.path.basename(file_))

        #
        # If the filestore exists, write to S3 first, then upload the URL
        if self.filestore:
            from ambry.util import md5_for_file
            urlf = self.filestore.public_url_f(public=True)
            path = self.bundle.identity.path + '/' + name

            # Don't upload if S3 has the file of the same key and md5
            md5 = md5_for_file(file_)
            if not self.filestore.has(path, md5=md5):
                self.filestore.put(file_,
                                   path,
                                   metadata={
                                       'public': True,
                                       'md5': md5
                                   })

            r = self.remote.add_url_resource(
                package,
                urlf(path),
                name,
                description=extract_data['description'],
                content_type=content_type,
                format=resource_format,
                hash=md5,
                rel_path=path)
        else:
            r = self.remote.add_file_resource(
                package,
                file_,
                name=name,
                description=extract_data['description'],
                content_type=content_type,
                format=resource_format)

        return r
Ejemplo n.º 12
0
    def test_compression(self):
        """Round-trip a file through "compressioncache" and compare md5s.

        A temporary file is put into the "compressioncache" filesystem. The
        file returned by that put() is read back through GzipFile and copied
        into the "fscache" filesystem; the md5 of that copy must equal the md5
        of the original temporary file.
        """
        from ambry.run import get_runconfig
        from ambry.cache import new_cache
        from ambry.util import temp_file_name, md5_for_file, copy_file_or_flo
        from ambry.util.sgzip import GzipFile

        rc = get_runconfig((os.path.join(self.bundle_dir, "test-run-config.yaml"), RunConfig.USER_CONFIG))

        comp_cache = new_cache(rc.filesystem("compressioncache"))

        test_file_name = "test_file"

        fn = temp_file_name()
        print("orig file  {}".format(fn))
        with open(fn, "wb") as f:
            for i in range(1000):
                f.write("{:03d}:".format(i))

        # Everything after the file exists is guarded so the temporary file is
        # removed even if a step or the assertion fails.
        try:
            cf = comp_cache.put(fn, test_file_name)

            # The file is consumed through GzipFile, so read it as bytes
            # rather than in text mode.
            with open(cf, "rb") as raw:
                uncomp_cache = new_cache(rc.filesystem("fscache"))
                uncomp_stream = uncomp_cache.put_stream("decomp")
                try:
                    copy_file_or_flo(GzipFile(raw), uncomp_stream)
                finally:
                    # Close even when the copy fails so the stream is not leaked.
                    uncomp_stream.close()

            dcf = uncomp_cache.get("decomp")

            self.assertEqual(md5_for_file(fn), md5_for_file(dcf))
        finally:
            os.remove(fn)
Ejemplo n.º 13
0
    def _record_file(self, url, out_file, process=File.PROCESS.DOWNLOADED):
        """Create or update the File record that tracks a download.

        Looks up the DOWNLOAD-type File row for this url and this bundle's
        vid. If one exists, its process is set to MODIFIED (and hash/modified
        are refreshed) when the content md5 changed, or to UNMODIFIED when it
        did not. If none exists, a new File is built from the file's stat
        data. In both cases data['last_download'] is set to the current time
        and the record is merged and committed.

        :param url: source URL; used as both the record path and source_url.
        :param out_file: path of the downloaded file on disk.
        :param process: process value for a newly created record only; an
            existing record gets MODIFIED or UNMODIFIED instead.
        """
        import os
        from time import time

        from sqlalchemy.orm.exc import NoResultFound
        from ambry.util import md5_for_file

        # Look for an existing record of this download
        try:
            fr = (self.bundle.database.session.query(File)
                  .filter(File.path == url)
                  .filter(File.ref == str(self.bundle.identity.vid))
                  .filter(File.type_ == File.TYPE.DOWNLOAD)
                  .one())
        except NoResultFound:
            self.bundle.database.session.rollback()
            fr = None

        content_hash = md5_for_file(out_file)

        # Named file_stat so that it does not shadow the os.stat function.
        file_stat = os.stat(out_file)

        if fr:
            if content_hash != fr.hash:
                # Stored as an int, matching how new records are created below.
                fr.modified = int(file_stat.st_mtime)
                fr.hash = content_hash
                fr.process = File.PROCESS.MODIFIED
            else:
                fr.process = File.PROCESS.UNMODIFIED
        else:
            # NOTE(review): the query filters on File.type_ but the constructor
            # is given type= ; confirm that File accepts this keyword.
            fr = File(path=url,
                      ref=str(self.bundle.identity.vid),
                      type=File.TYPE.DOWNLOAD,
                      modified=int(file_stat.st_mtime),
                      size=file_stat.st_size,
                      hash=content_hash,
                      process=process,
                      source_url=url,
                      data=dict())

        fr.data['last_download'] = time()
        self.bundle.database.session.merge(fr)
        self.bundle.database.session.commit()
Ejemplo n.º 14
0
def make_metadata(file_name):
    """Build a metadata dict holding the md5 of file_name under key 'md5'."""
    from ambry.util import md5_for_file

    digest = md5_for_file(file_name)
    return {'md5': digest}
Ejemplo n.º 15
0
Archivo: hdf.py Proyecto: kball/ambry
    def md5(self):
        """Return md5_for_file() of the file at self.path."""
        from ambry.util import md5_for_file

        digest = md5_for_file(self.path)
        return digest
Ejemplo n.º 16
0
 def md5(self):
     """Return md5_for_file() of the file at self.path."""
     from ambry.util import md5_for_file

     target = self.path
     return md5_for_file(target)
Ejemplo n.º 17
0
    def test_load(self):
        """Load a bundle into the library from a cache and via the remote.

        Steps: put the bundle into the local library and check it is listed by
        the remote library; upload the bundle database to the
        'cached-compressed-s3' filesystem; purge and reload with l.load();
        purge and reload with rl.load_dataset(); finally check that the remote
        resolves the identity by vid, vname, cache_key and sname.
        """
        from ambry.run import get_runconfig, RunConfig
        from ambry.client.rest import RemoteLibrary
        from ambry.cache import new_cache
        from ambry.util import md5_for_file

        dataset_fqname = 'source-dataset-subset-variation-0.0.1~diEGPXmDC8001'

        config = self.start_server()
        l = new_library(config)

        rl = RemoteLibrary(self.server_url)

        def remote_fqnames():
            # fqnames of everything the remote library currently lists.
            return set(i.fqname for i in rl.list().values())

        #
        # Check that the library can list datasets that are inserted externally
        #

        l.put_bundle(self.bundle)

        self.assertIn(dataset_fqname, remote_fqnames())

        dsident = rl.dataset('diEGPXmDC8001')

        s = set(i.fqname for i in dsident.partitions.values())

        self.assertEqual(4, len(s))

        self.assertIn('source-dataset-subset-variation-tthree-0.0.1~piEGPXmDC8003001', s)
        self.assertIn('source-dataset-subset-variation-geot1-geo-0.0.1~piEGPXmDC8001001', s)
        self.assertIn('source-dataset-subset-variation-geot2-geo-0.0.1~piEGPXmDC8002001', s)

        #
        # Upload the dataset to S3, clear the library, then load it back in
        #

        rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'), RunConfig.USER_ACCOUNTS))
        cache = new_cache(rc.filesystem('cached-compressed-s3'))

        fn = self.bundle.database.path
        identity = self.bundle.identity
        relpath = identity.cache_key

        cache.put(fn, relpath, identity.to_meta(file=fn))

        self.assertTrue(bool(cache.has(relpath)))

        # clear the library.

        l.purge()
        # The original iterated rl.list() directly here, unlike every other
        # check in this test, which iterates .values().
        self.assertNotIn(dataset_fqname, remote_fqnames())

        # Load from S3, directly in to the local library

        identity.add_md5(md5_for_file(fn))

        l.load(identity.cache_key, identity.md5)

        self.assertIn(dataset_fqname, remote_fqnames())

        # Do it one more time, using the remote library

        l.purge()
        self.assertNotIn(dataset_fqname, remote_fqnames())

        rl.load_dataset(identity)

        self.assertIn(dataset_fqname, remote_fqnames())

        # Check that we can get the record from the library

        self.assertEqual(identity.vid, rl.resolve(identity.vid).vid)
        self.assertEqual(identity.vid, rl.resolve(identity.vname).vid)
        self.assertEqual(identity.vid, rl.resolve(identity.cache_key).vid)
        self.assertEqual(identity.vid, rl.resolve(identity.sname).vid)