Esempio n. 1
0
    def test_make_filename(self):
        """download.py: Test directory structure creation from make_filename

        Checks three things about ``dl.make_filename``:

        * with a target directory of ``self.test_dir`` the result equals
          that directory joined with the requested name;
        * with a nested directory that does not exist yet, and makedirs
          left at its default, EnvironmentError is raised and the
          directory is not created;
        * with ``makedirs=True`` the nested directory exists afterwards.
        """
        # Simple case: the fixture directory is used as the target.
        simple_result = dl.make_filename(name='full_path.html',
                                         dir=self.test_dir)
        simple_expected = os.path.join(self.test_dir, 'full_path.html')
        self.assertEqual(simple_result, simple_expected)

        nested_dir = os.path.join(self.test_dir, 'alpha', 'beta', 'gamma')

        # Negative case: without makedirs the call must fail and must not
        # leave the nested directory behind.
        self.assertRaises(EnvironmentError, dl.make_filename,
                          name='foobar.html', dir=nested_dir)
        self.assertFalse(os.path.isdir(nested_dir))

        # Positive case: with makedirs switched on the nested directory
        # is present after the call.
        dl.make_filename(name='foobar.html', dir=nested_dir, makedirs=True)
        self.assertTrue(os.path.isdir(nested_dir))
Esempio n. 2
0
    def download(cls,name, store=False,silent=True, retries=0):
        """Download a reference genome of the given name, and return a GRCGenome

        Fetches the named reference assembly from the web, and creates a new
        GRCGenome object to handle it.

        If store is False (default), the data will be kept in a temporary file,
        and will be destroyed as soon as the object is released. If True,
        the entire assembly will be saved in the current
        directory - ValueError will be raised if this file seems to already
        exist. The resulting file will have the '.assembly' suffix, and may be
        either a .zip, a .tar, a .tar.gz, or a .tar.bz2 file. See
        ``tigerlily.utility.archive.Archive`` for more information.
        If store is a string, it will be assumed to be a path to a
        directory (trailing slash optional) in which the archive should
        be stored, again under the name ``<name>.assembly``. (Again,
        ValueError will be raised if the file already exists.) If necessary,
        any intermediate directories will be created.

        If silent is False, status messages will be printed using print() to
        keep the user informed of the progress. This is usually very important
        in command line applications as the reference archives are about 900 MB
        in size and may take minutes or hours to download depending on the
        internet connection.

        Because of the large size of these files, it is highly recommended that
        the store option be set. Please do not use Tiger Lily to abuse the
        UCSC Genome Browser group's generosity in hosting these large files to
        the general public.

        >>> refgen = GRCGenome.download('test1')
        >>> refgen2 = GRCGenome.download('test1',store=True)
        >>> import os
        >>> os.path.isfile('test1.assembly')
        True
        >>> os.unlink('test1.assembly')

        Only supported reference genome assemblies are allowed, otherwise
        ValueError will be raised.

        >>> GRCGenome.download('invalid')
        Traceback (most recent call last):
            ...
        ValueError: Unknown or unsupported reference genome specified

        When an MD5 checksum is recorded for the assembly, the downloaded
        file is hashed and compared against it. On a mismatch the download
        is repeated up to ``retries`` additional times (so ``retries + 1``
        attempts in total; negative values are treated as 0). If every
        attempt fails the check, EnvironmentError is raised. Assemblies
        with no recorded checksum are loaded without verification.
        """

        if name not in SUPPORTED_ASSEMBLIES:
            raise ValueError('Unknown or unsupported reference genome'
                             ' specified')

        # Each entry is indexed as (download URL, expected MD5 or None).
        entry = SUPPORTED_ASSEMBLIES[name]
        source_url = entry[0]
        expected_md5 = entry[1]

        # Pick the destination once. The assembly name is deliberately left
        # untouched so that it stays valid for every attempt.
        use_temp = not store
        filename = None
        if store is True:
            filename = make_filename(name='{}.assembly'.format(name))
        elif store:
            filename = make_filename(name='{}.assembly'.format(name),
                                     dir=store, makedirs=True)

        attempts = max(retries, 0) + 1
        for _ in range(attempts):
            if use_temp:
                # A fresh temporary file per attempt; rebinding ``temp``
                # releases the previous one, which removes it from disk.
                temp = tempfile.NamedTemporaryFile()
                filename = temp.name

            client = ConsoleDownloader()
            client.retrieve(source_url, filename=filename, silent=silent)

            if expected_md5 is None:
                return GRCGenome.load_archive(Archive(filepath=filename))

            # Hash in chunks: the archives are far too large to read into
            # memory in one piece.
            digest = hashlib.md5()
            with open(filename, 'rb') as infile:
                for chunk in iter(lambda: infile.read(1024 * 1024), b''):
                    digest.update(chunk)

            if digest.hexdigest() == expected_md5:
                return GRCGenome.load_archive(Archive(filepath=filename))

            if not use_temp:
                # Discard the corrupt download so the next attempt (or the
                # caller) starts from a clean slate. Temporary files are
                # cleaned up by tempfile itself.
                os.remove(filename)

        raise EnvironmentError('MD5sum failed {} tries, download aborted'
                               .format(attempts))