def test_download_mirror_cache():
    """Test that ``download_file`` consults the mirror's cache entry
    before re-downloading the primary URL.

    Regression test for https://github.com/astropy/astropy/issues/6982.
    """
    # NOTE(review): the original body used ``os``, ``conf`` and
    # ``_get_download_cache_locs`` without importing them here; they are
    # presumably module-level imports, but importing them locally keeps the
    # test self-contained alongside the other function-local imports.
    import os
    import pathlib
    import shelve
    from astropy.utils.data import (
        _find_pkg_data_path, _get_download_cache_locs, conf, download_file,
        get_cached_urls)

    main_url = pathlib.Path(
        _find_pkg_data_path(os.path.join('data', 'dataurl'))).as_uri() + '/'
    mirror_url = pathlib.Path(
        _find_pkg_data_path(os.path.join(
            'data', 'dataurl_mirror'))).as_uri() + '/'  # noqa
    main_file = main_url + 'index.html'
    mirror_file = mirror_url + 'index.html'

    # Temporarily change data.conf.
    # This also tests https://github.com/astropy/astropy/pull/8163 because
    # urlopen() on a local dir URI also gives URLError.
    with conf.set_temp('dataurl', main_url):
        with conf.set_temp('dataurl_mirror', mirror_url):
            # "Download" files by rerouting URLs to local URIs.
            download_file(main_file, cache=True)
            download_file(mirror_file, cache=True)

            # Now test that download_file looks in mirror's cache before
            # download.
            # https://github.com/astropy/astropy/issues/6982
            dldir, urlmapfn = _get_download_cache_locs()
            with shelve.open(urlmapfn) as url2hash:
                del url2hash[main_file]

            # Comparing hash makes sure they download the same file
            # but does not guarantee they were downloaded from the same URL.
            assert (download_file(main_file, cache=True) ==
                    download_file(mirror_file, cache=True))

            # This has to be called after the last download to obtain
            # an accurate view of cached URLs.
            # This is to ensure that main_file was not re-downloaded
            # unnecessarily.
            # This test also tests for "assert TESTURL in get_cached_urls()".
            c_urls = get_cached_urls()
            assert (mirror_file in c_urls) and (main_file not in c_urls)
def test_download_mirror_cache():
    """Check that ``download_file`` falls back to the mirror's cached copy
    instead of re-downloading the primary URL (astropy#6982).
    """
    import pathlib
    import shelve
    from astropy.utils.data import _find_pkg_data_path, download_file, get_cached_urls

    def _pkg_dir_uri(*parts):
        # Expose a package-data directory as a local file:// URI.
        return pathlib.Path(
            _find_pkg_data_path(os.path.join(*parts))).as_uri() + '/'

    main_url = _pkg_dir_uri('data', 'dataurl')
    mirror_url = _pkg_dir_uri('data', 'dataurl_mirror')  # noqa
    main_file = main_url + 'index.html'
    mirror_file = mirror_url + 'index.html'

    # Temporarily change data.conf.
    # This also exercises https://github.com/astropy/astropy/pull/8163,
    # since urlopen() on a local dir URI also raises URLError.
    with conf.set_temp('dataurl', main_url), \
            conf.set_temp('dataurl_mirror', mirror_url):
        # "Download" both files via their rerouted local URIs.
        download_file(main_file, cache=True)
        download_file(mirror_file, cache=True)

        # Remove the main file's cache entry so the next request must
        # either re-download or consult the mirror's cache first.
        # https://github.com/astropy/astropy/issues/6982
        dldir, urlmapfn = _get_download_cache_locs()
        with shelve.open(urlmapfn) as url2hash:
            del url2hash[main_file]

        # Equal hashes prove both calls yield the same file, though not
        # that they came from the same URL.
        assert (download_file(main_file, cache=True) ==
                download_file(mirror_file, cache=True))

        # Query the cache only after the final download for an accurate
        # view: the mirror entry must be cached, and main_file must not
        # have been re-downloaded unnecessarily.
        # This also covers "assert TESTURL in get_cached_urls()".
        c_urls = get_cached_urls()
        assert mirror_file in c_urls
        assert main_file not in c_urls
def from_registry(cls, registry_url, timeout=60, **kwargs):
    """
    Create a database of VO services from VO registry URL.

    This is described in detail in :ref:`vo-sec-validator-build-db`,
    except for the ``validate_xxx`` keys that are added by the
    validator itself.

    Parameters
    ----------
    registry_url : str
        URL of VO registry that returns a VO Table.
        For example, see
        ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``.
        Pedantic is automatically set to `False` for parsing.

    timeout : number
        Temporarily set ``astropy.utils.data.conf.remote_timeout``
        to this value to avoid time out error while reading the
        entire registry.

    kwargs : dict
        Keywords accepted by
        :func:`~astropy.utils.data.get_readable_fileobj`.

    Returns
    -------
    db : `VOSDatabase`
        Database from given registry.

    Raises
    ------
    VOSError
        Invalid VO registry.

    """
    # Fetch the registry and parse it as a single VO table, with the
    # remote timeout temporarily raised for the (potentially large) read.
    with data_conf.set_temp('remote_timeout', timeout):
        with get_readable_fileobj(registry_url, **kwargs) as fd:
            tab_all = parse_single_table(fd, pedantic=False)

    # The registry must supply these columns.
    cat_fields = tab_all.array.dtype.names
    for required in ('res_title', 'access_url'):
        if required not in cat_fields:  # pragma: no cover
            raise VOSError('"{0}" is missing from registry.'.format(required))

    title_counter = defaultdict(int)
    title_fmt = '{0} {1}'
    db = cls.create_empty()

    # Every registry row becomes one catalog entry.
    for row in tab_all.array.data:
        cur_cat = {}
        cur_key = ''

        # Build the catalog completely before deciding whether to keep
        # it, so little changes if duplicate URLs become allowed later.
        for field in cat_fields:
            if field == 'res_title':
                # Primary key is title plus a running count, because the
                # registry can repeat a title across different URLs.
                cur_title = row['res_title']
                title_counter[cur_title] += 1  # Starts with 1
                if isinstance(cur_title, bytes):  # pragma: py3
                    cur_key = title_fmt.format(cur_title.decode('utf-8'),
                                               title_counter[cur_title])
                else:  # pragma: py2
                    cur_key = title_fmt.format(cur_title,
                                               title_counter[cur_title])
                cur_cat['title'] = row[field]
            elif field == 'access_url':
                # Access URL is unescaped and normalized to text.
                s = unescape_all(row['access_url'])
                if isinstance(s, six.binary_type):
                    s = s.decode('utf-8')
                cur_cat['url'] = s
            else:
                cur_cat[field] = row[field]

        # New field tracking how many duplicates were folded into this one.
        cur_cat['duplicatesIgnored'] = 0

        # Keep the entry unless its access URL already exists in the
        # database; otherwise drop it and bump the survivor's counter.
        dup_keys = db._url_keys[cur_cat['url']]
        if dup_keys:
            db._catalogs[dup_keys[0]]['duplicatesIgnored'] += 1
            warnings.warn(
                '{0} is thrown out because it has same access URL as '
                '{1}.'.format(cur_key, dup_keys[0]), AstropyUserWarning)
        else:
            db.add_catalog(
                cur_key, VOSCatalog(cur_cat), allow_duplicate_url=False)

    return db
def from_registry(cls, registry_url, timeout=60, **kwargs):
    """
    Create a database of VO services from VO registry URL.

    This is described in detail in :ref:`vo-sec-validator-build-db`,
    except for the ``validate_xxx`` keys that are added by the
    validator itself.

    Parameters
    ----------
    registry_url : str
        URL of VO registry that returns a VO Table.
        For example, see
        ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``.
        Pedantic is automatically set to `False` for parsing.

    timeout : number
        Temporarily set ``astropy.utils.data.conf.remote_timeout``
        to this value to avoid time out error while reading the
        entire registry.

    kwargs : dict
        Keywords accepted by
        :func:`~astropy.utils.data.get_readable_fileobj`.

    Returns
    -------
    db : `VOSDatabase`
        Database from given registry.

    Raises
    ------
    VOSError
        Invalid VO registry.

    """
    # Download registry as VO table
    with data_conf.set_temp('remote_timeout', timeout):
        with get_readable_fileobj(registry_url, **kwargs) as fd:
            tab_all = parse_single_table(fd, pedantic=False)

    # Registry must have these fields
    compulsory_fields = ['res_title', 'access_url']
    cat_fields = tab_all.array.dtype.names
    for field in compulsory_fields:
        if field not in cat_fields:  # pragma: no cover
            raise VOSError('"{0}" is missing from registry.'.format(field))

    # Counts occurrences of each title so duplicate titles get distinct keys.
    title_counter = defaultdict(int)
    title_fmt = '{0} {1}'
    db = cls.create_empty()

    # Each row in the table becomes a catalog
    for arr in tab_all.array.data:
        cur_cat = {}
        cur_key = ''

        # Process each field and build the catalog.
        # Catalog is completely built before being thrown out
        # because codes need less changes should we decide to
        # allow duplicate URLs in the future.
        for field in cat_fields:
            # For primary key, a number needs to be appended to the title
            # because registry can have multiple entries with the same
            # title but different URLs.
            if field == 'res_title':
                cur_title = arr['res_title']
                title_counter[cur_title] += 1  # Starts with 1
                if isinstance(cur_title, bytes):  # pragma: py3
                    cur_key = title_fmt.format(cur_title.decode('utf-8'),
                                               title_counter[cur_title])
                else:  # pragma: py2
                    cur_key = title_fmt.format(cur_title,
                                               title_counter[cur_title])

            # Special handling of title and access URL,
            # otherwise no change.
            if field == 'access_url':
                # unescape_all presumably reverses XML/HTML escaping in the
                # URL; normalized to text before use as a dict key below.
                s = unescape_all(arr['access_url'])
                if isinstance(s, six.binary_type):
                    s = s.decode('utf-8')
                cur_cat['url'] = s
            elif field == 'res_title':
                cur_cat['title'] = arr[field]
            else:
                cur_cat[field] = arr[field]

        # New field to track duplicate access URLs.
        cur_cat['duplicatesIgnored'] = 0

        # Add catalog to database, unless duplicate access URL exists.
        # In that case, the entry is thrown out and the associated
        # counter is updated.
        dup_keys = db._url_keys[cur_cat['url']]
        if len(dup_keys) < 1:
            db.add_catalog(cur_key, VOSCatalog(cur_cat),
                           allow_duplicate_url=False)
        else:
            db._catalogs[dup_keys[0]]['duplicatesIgnored'] += 1
            warnings.warn(
                '{0} is thrown out because it has same access URL as '
                '{1}.'.format(cur_key, dup_keys[0]), AstropyUserWarning)

    return db