Example #1
    def test_download_tarfile(self):
        # this is done after the small file sorting happens,
        # so pick UUIDs that would be grouped together
        files_to_dl = ['small_no_friends']

        index_client = GDCIndexClient(base_url)
        index_client._get_metadata(files_to_dl)

        client = GDCHTTPDownloadClient(uri=base_url,
                                       index_client=index_client,
                                       **client_kwargs)

        # it will remove redundant uuids
        tarfile_name, errors = client._download_tarfile(files_to_dl)

        assert tarfile_name is not None
        assert os.path.exists(tarfile_name)
        assert tarfile.is_tarfile(tarfile_name)

        with tarfile.open(tarfile_name, 'r') as t:
            for member in t.getmembers():
                m = t.extractfile(member)
                contents = m.read()
                assert contents == uuids[m.name]['contents']

        # clean up only after every member has been checked
        os.remove(tarfile_name)
Example #2
    def test_rel_mock_get_metadata(self):
        index = GDCIndexClient(uri=base_url)
        index._get_metadata(['small_rel'])

        assert index.get_access('small_rel') == uuids['small_rel']['access']
        assert index.get_filesize('small_rel') == uuids['small_rel']['file_size']
        assert index.get_md5sum('small_rel') == uuids['small_rel']['md5sum']
        assert index.get_related_files('small_rel') == uuids['small_rel']['related_files']
        assert index.get_annotations('small_rel') == []
Example #3
    def test_md5_members(self):

        files_to_tar = ['small', 'small_ann', 'small_rel', 'small_no_friends']

        tarfile_name = make_tarfile(files_to_tar)

        index_client = GDCIndexClient(base_url)
        index_client._get_metadata(files_to_tar)

        client = GDCHTTPDownloadClient(uri=base_url,
                                       index_client=index_client,
                                       **client_kwargs)

        client._untar_file(tarfile_name)
        errors = client._md5_members(files_to_tar)

        assert errors == []

        for f in files_to_tar:
            assert os.path.exists(f)
            os.remove(f)
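
As a side note, _md5_members is only checked here through its error list. A minimal, self-contained sketch of the kind of verification it performs might look like the snippet below; verify_md5 and the expected_md5sums mapping are hypothetical names used for illustration, not part of gdc-client.

import hashlib
import os

def verify_md5(file_names, expected_md5sums):
    """Return the names whose on-disk md5 does not match the expected value."""
    errors = []
    for name in file_names:
        if not os.path.exists(name):
            errors.append(name)
            continue
        md5 = hashlib.md5()
        with open(name, 'rb') as f:
            # hash in chunks so large members never have to fit in memory
            for chunk in iter(lambda: f.read(4096), b''):
                md5.update(chunk)
        if md5.hexdigest() != expected_md5sums[name]:
            errors.append(name)
    return errors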
Example #4
    def test_fix_url(self):
        index_client = GDCIndexClient(base_url)
        client = GDCHTTPDownloadClient(uri=base_url,
                                       index_client=index_client,
                                       **client_kwargs)

        assert client.fix_url('api.gdc.cancer.gov') == \
                'https://api.gdc.cancer.gov/'
        assert client.fix_url('http://api.gdc.cancer.gov/') == \
                'http://api.gdc.cancer.gov/'
        assert client.fix_url('api.gdc.cancer.gov/') == \
                'https://api.gdc.cancer.gov/'
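
The asserts above pin down the expected behavior of fix_url: add an https:// scheme when none is given and guarantee a trailing slash, while leaving an explicit http:// URL alone. A minimal stand-alone sketch of that behavior (illustrative only, not the actual gdc-client implementation) could be:

def fix_url_sketch(url):
    """Normalize a bare host into a usable base URL (illustrative only)."""
    if not url.endswith('/'):
        url += '/'
    if not (url.startswith('http://') or url.startswith('https://')):
        url = 'https://{0}'.format(url)
    return url

assert fix_url_sketch('api.gdc.cancer.gov') == 'https://api.gdc.cancer.gov/'
assert fix_url_sketch('http://api.gdc.cancer.gov/') == 'http://api.gdc.cancer.gov/'
assert fix_url_sketch('api.gdc.cancer.gov/') == 'https://api.gdc.cancer.gov/'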
Example #5
    def test_big_full_separate_small_files(self):
        index = GDCIndexClient(uri=base_url)
        bigs, smalls = index.separate_small_files(
                ['big'],
                HTTP_CHUNK_SIZE,
                related_files=True,
                annotations=True)

        assert index.get_access('big') == uuids['big']['access']
        assert index.get_filesize('big') == uuids['big']['file_size']
        assert index.get_md5sum('big') == uuids['big']['md5sum']
        assert index.get_related_files('big') == uuids['big']['related_files']
        assert index.get_annotations('big') == uuids['big']['annotations']

        assert bigs == ['big']
        assert smalls == []
Example #6
    def test_untar_file(self):

        files_to_tar = ['small', 'small_ann', 'small_rel', 'small_no_friends']

        tarfile_name = make_tarfile(files_to_tar)
        index_client = GDCIndexClient(base_url)

        client = GDCHTTPDownloadClient(uri=base_url,
                                       index_client=index_client,
                                       **client_kwargs)

        client._untar_file(tarfile_name)

        for f in files_to_tar:
            assert os.path.exists(f)
            os.remove(f)
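
For reference, extracting an uncompressed tarfile into the working directory is a thin wrapper around the standard library; a sketch of what such a helper typically does (assumed behavior, not the gdc-client source):

import tarfile

def untar_file_sketch(tarfile_name, destination='.'):
    """Extract every member of a tarfile into destination (illustrative only)."""
    with tarfile.open(tarfile_name, 'r') as tar:
        tar.extractall(path=destination)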
Example #7
    def test_small_invalid_separate_small_files(self):
        """ If no metadata can be found about a file, attempt a
        download using the big file method.
        """

        invalid = 'invalid uuid'

        index = GDCIndexClient(uri=base_url)
        bigs, smalls = index.separate_small_files(
                [invalid],
                HTTP_CHUNK_SIZE,
                related_files=True,
                annotations=True)

        assert index.get_access(invalid) is None
        assert index.get_filesize(invalid) is None
        assert index.get_md5sum(invalid) is None
        assert index.get_related_files(invalid) == []
        assert index.get_annotations(invalid) == []

        assert bigs == [invalid]
        assert smalls == []
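
The fallback described in the docstring depends on the metadata accessors degrading gracefully for unknown UUIDs: scalar lookups return None and list lookups return []. A sketch of that accessor pattern (hypothetical class and attribute names, not the gdc-client internals):

class MetadataCacheSketch(object):
    """Illustrative accessors: unknown UUIDs fall back to None or []."""

    def __init__(self):
        # would normally be populated by a metadata query
        self._access = {}
        self._file_sizes = {}
        self._related_files = {}

    def get_access(self, uuid):
        return self._access.get(uuid)             # None when no metadata exists

    def get_filesize(self, uuid):
        return self._file_sizes.get(uuid)         # None when no metadata exists

    def get_related_files(self, uuid):
        return self._related_files.get(uuid, [])  # [] when no metadata exists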
Example #8
    def test_small_full_separate_small_files(self):
        """ Currently if a file has related or annotation files
        the dtt processes it as if it were a big file so that
        it goes through the old method of downloading,
        regardless of size.

        NOTE: This will probably change in the future.
        """

        index = GDCIndexClient(uri=base_url)
        bigs, smalls = index.separate_small_files(
                ['small'],
                HTTP_CHUNK_SIZE,
                related_files=True,
                annotations=True)

        assert index.get_access('small') == uuids['small']['access']
        assert index.get_filesize('small') == uuids['small']['file_size']
        assert index.get_md5sum('small') == uuids['small']['md5sum']
        assert index.get_related_files('small') == uuids['small']['related_files']
        assert index.get_annotations('small') == uuids['small']['annotations']

        assert bigs == ['small']
        assert smalls == []
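
Taken together with the other separate_small_files tests, the docstring above suggests a simple partition rule: files with no metadata, files that are too large for the grouped download, and files that have related or annotation files are sent down the regular (big file) path; everything else is eligible for grouping. A rough sketch of that rule (separate_small_files_sketch is a hypothetical helper; the real method also splits the small files into chunk-sized groups, which is omitted here):

def separate_small_files_sketch(index, ids, chunk_size,
                                related_files=True, annotations=True):
    bigs, smalls = [], []
    for uuid in ids:
        size = index.get_filesize(uuid)
        has_friends = (related_files and index.get_related_files(uuid)) or \
                      (annotations and index.get_annotations(uuid))
        # no metadata, large files, and files with related/annotation files
        # all go through the regular download path
        if size is None or size > chunk_size or has_friends:
            bigs.append(uuid)
        else:
            smalls.append(uuid)
    return bigs, smalls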
Example #9
    def test_no_metadata_get_filesize(self):
        index = GDCIndexClient(uri=base_url)

        results = index.get_filesize(uuids['small'])
        assert results is None
Example #10
    def test_no_metadata_get_md5sum(self):
        index = GDCIndexClient(uri=base_url)

        results = index.get_md5sum(uuids['small'])
        assert results is None
Example #11
    def test_no_metadata_get_annotations(self):
        index = GDCIndexClient(uri=base_url)

        results = index.get_annotations(uuids['small'])
        assert results == []
Example #12
    def test_no_metadata_get_related_files(self):
        index = GDCIndexClient(uri=base_url)

        results = index.get_related_files(uuids['small'])
        assert results == []
Example #13
    def test_big_and_small_full_separate_small_files(self):
        index = GDCIndexClient(uri=base_url)
        bigs, smalls = index.separate_small_files(
                ['big', 'small'],
                HTTP_CHUNK_SIZE,
                related_files=True,
                annotations=True)

        assert index.get_access('big') == uuids['big']['access']
        assert index.get_filesize('big') == uuids['big']['file_size']
        assert index.get_md5sum('big') == uuids['big']['md5sum']
        assert index.get_related_files('big') == uuids['big']['related_files']
        assert index.get_annotations('big') == uuids['big']['annotations']

        assert index.get_access('small') == uuids['small']['access']
        assert index.get_filesize('small') == uuids['small']['file_size']
        assert index.get_md5sum('small') == uuids['small']['md5sum']
        assert index.get_related_files('small') == uuids['small']['related_files']
        assert index.get_annotations('small') == uuids['small']['annotations']

        # if a uuid has related files or annotations then they
        # are downloaded as big files
        assert bigs == ['big', 'small']
        assert smalls == []
Example #14
def download(parser, args):
    """ Downloads data from the GDC.

        Combine the smaller files (~KB range) into a grouped download.
        The API now supports combining UUIDs into one uncompressed tarfile
        using the ?tarfile URL parameter. Combining many smaller files into one
        download decreases the number of open connections we have to make.
    """

    successful_count = 0
    unsuccessful_count = 0
    big_errors = []
    small_errors = []
    total_download_count = 0
    validate_args(parser, args)

    # use a set to drop duplicate ids
    ids = set(args.file_ids)
    for i in args.manifest:
        if not i.get('id'):
            log.error('Invalid manifest')
            break
        ids.add(i['id'])

    index_client = GDCIndexClient(args.server)
    client = get_client(args, index_client)

    # separate the smaller files from the larger files
    bigs, smalls = index_client.separate_small_files(ids, args.http_chunk_size,
                                                     client.related_files,
                                                     client.annotations)

    # the big files will be normal downloads
    # the small files will be joined together and tarfiled
    if smalls:
        log.debug('Downloading smaller files...')

        # download small file grouped in an uncompressed tarfile
        small_errors, count = client.download_small_groups(smalls)
        successful_count += count

        i = 0
        while i < args.retry_amount and small_errors:
            time.sleep(args.wait_time)
            log.debug('Retrying failed grouped downloads')
            small_errors, count = client.download_small_groups(small_errors)
            successful_count += count
            i += 1

    # client.download_files is located in parcel which calls
    # self.parallel_download, which goes back to gdc-client's parallel_download
    if bigs:
        log.debug('Downloading big files...')

        # create URLs to send to parcel for download
        bigs = [urlparse.urljoin(client.data_uri, b) for b in bigs]
        downloaded_files, big_error_dict = client.download_files(bigs)
        not_downloaded_url = ''
        big_errors_count = 0

        if args.retry_amount > 0:
            for url, reason in big_error_dict.iteritems():
                # only retry the download if it wasn't a controlled access error
                if '403' not in reason:
                    not_downloaded_url = retry_download(
                        client, url, args.retry_amount, args.no_auto_retry,
                        args.wait_time)
                else:
                    big_errors.append(url)
                    not_downloaded_url = ''

                if not_downloaded_url:
                    # the retry also failed; record the url once
                    big_errors.append(not_downloaded_url)

        if big_errors:
            log.debug('Big files not downloaded: {0}'.format(', '.join(
                [b.split('/')[-1] for b in big_errors])))

        successful_count += len(bigs) - len(big_errors)

    unsuccessful_count = len(ids) - successful_count

    log.info('{0}: {1}'.format(colored('Successfully downloaded', 'green'),
                               successful_count))

    if unsuccessful_count > 0:
        log.info('{0}: {1}'.format(colored('Failed downloads', 'red'),
                                   unsuccessful_count))

    return small_errors or big_errors
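
The docstring at the top of download mentions the ?tarfile URL parameter that lets the API return several small files as one uncompressed tarfile. As a rough sketch of what such a grouped request could look like with requests (the endpoint path, payload shape, and X-Auth-Token header are assumptions based on that docstring, not verified against the API):

import requests

def download_small_group_sketch(base_url, uuids, token=None,
                                output_path='grouped_download.tar'):
    """Request several small files as one uncompressed tarfile (illustrative)."""
    headers = {'X-Auth-Token': token} if token else {}
    response = requests.post(
        '{0}/data?tarfile'.format(base_url.rstrip('/')),
        json={'ids': uuids},
        headers=headers,
        stream=True,
    )
    response.raise_for_status()
    with open(output_path, 'wb') as f:
        # stream the tar to disk so the whole archive is never held in memory
        for chunk in response.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)
    return output_path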