def test_unescape_all(): # str url_in = 'http://casu.ast.cam.ac.uk/ag/iphas-dsa%2FSubmitCone?' \ 'DSACAT=IDR&DSATAB=Emitters&' url_out = 'http://casu.ast.cam.ac.uk/ag/iphas-dsa/SubmitCone?' \ 'DSACAT=IDR&DSATAB=Emitters&' assert unescaper.unescape_all(url_in) == url_out # bytes url_in = b'http://casu.ast.cam.ac.uk/ag/iphas-dsa%2FSubmitCone?' \ b'DSACAT=IDR&DSATAB=Emitters&' url_out = b'http://casu.ast.cam.ac.uk/ag/iphas-dsa/SubmitCone?' \ b'DSACAT=IDR&DSATAB=Emitters&' assert unescaper.unescape_all(url_in) == url_out
def from_registry(cls, registry_url, timeout=60, **kwargs): """ Create a database of VO services from VO registry URL. This is described in detail in :ref:`vo-sec-validator-build-db`, except for the ``validate_xxx`` keys that are added by the validator itself. Parameters ---------- registry_url : str URL of VO registry that returns a VO Table. For example, see ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``. Pedantic is automatically set to `False` for parsing. timeout : number Temporarily set ``astropy.utils.data.conf.remote_timeout`` to this value to avoid time out error while reading the entire registry. kwargs : dict Keywords accepted by :func:`~astropy.utils.data.get_readable_fileobj`. Returns ------- db : `VOSDatabase` Database from given registry. Raises ------ VOSError Invalid VO registry. """ # Download registry as VO table with data_conf.set_temp('remote_timeout', timeout): with get_readable_fileobj(registry_url, **kwargs) as fd: tab_all = parse_single_table(fd, pedantic=False) # Registry must have these fields compulsory_fields = ['res_title', 'access_url'] cat_fields = tab_all.array.dtype.names for field in compulsory_fields: if field not in cat_fields: # pragma: no cover raise VOSError('"{0}" is missing from registry.'.format(field)) title_counter = defaultdict(int) title_fmt = '{0} {1}' db = cls.create_empty() # Each row in the table becomes a catalog for arr in tab_all.array.data: cur_cat = {} cur_key = '' # Process each field and build the catalog. # Catalog is completely built before being thrown out # because codes need less changes should we decide to # allow duplicate URLs in the future. for field in cat_fields: # For primary key, a number needs to be appended to the title # because registry can have multiple entries with the same # title but different URLs. if field == 'res_title': cur_title = arr['res_title'] title_counter[cur_title] += 1 # Starts with 1 if isinstance(cur_title, bytes): # pragma: py3 cur_key = title_fmt.format(cur_title.decode('utf-8'), title_counter[cur_title]) else: # pragma: py2 cur_key = title_fmt.format(cur_title, title_counter[cur_title]) # Special handling of title and access URL, # otherwise no change. if field == 'access_url': s = unescape_all(arr['access_url']) if isinstance(s, six.binary_type): s = s.decode('utf-8') cur_cat['url'] = s elif field == 'res_title': cur_cat['title'] = arr[field] else: cur_cat[field] = arr[field] # New field to track duplicate access URLs. cur_cat['duplicatesIgnored'] = 0 # Add catalog to database, unless duplicate access URL exists. # In that case, the entry is thrown out and the associated # counter is updated. dup_keys = db._url_keys[cur_cat['url']] if len(dup_keys) < 1: db.add_catalog( cur_key, VOSCatalog(cur_cat), allow_duplicate_url=False) else: db._catalogs[dup_keys[0]]['duplicatesIgnored'] += 1 warnings.warn( '{0} is thrown out because it has same access URL as ' '{1}.'.format(cur_key, dup_keys[0]), AstropyUserWarning) return db
def from_registry(cls, registry_url, timeout=60, **kwargs): """ Create a database of VO services from VO registry URL. This is described in detail in :ref:`vo-sec-validator-build-db`, except for the ``validate_xxx`` keys that are added by the validator itself. Parameters ---------- registry_url : str URL of VO registry that returns a VO Table. For example, see ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``. Pedantic is automatically set to `False` for parsing. timeout : number Temporarily set ``astropy.utils.data.conf.remote_timeout`` to this value to avoid time out error while reading the entire registry. kwargs : dict Keywords accepted by :func:`~astropy.utils.data.get_readable_fileobj`. Returns ------- db : `VOSDatabase` Database from given registry. Raises ------ VOSError Invalid VO registry. """ # Download registry as VO table with data_conf.set_temp('remote_timeout', timeout): with get_readable_fileobj(registry_url, **kwargs) as fd: tab_all = parse_single_table(fd, pedantic=False) # Registry must have these fields compulsory_fields = ['res_title', 'access_url'] cat_fields = tab_all.array.dtype.names for field in compulsory_fields: if field not in cat_fields: # pragma: no cover raise VOSError('"{0}" is missing from registry.'.format(field)) title_counter = defaultdict(int) title_fmt = '{0} {1}' db = cls.create_empty() # Each row in the table becomes a catalog for arr in tab_all.array.data: cur_cat = {} cur_key = '' # Process each field and build the catalog. # Catalog is completely built before being thrown out # because codes need less changes should we decide to # allow duplicate URLs in the future. for field in cat_fields: # For primary key, a number needs to be appended to the title # because registry can have multiple entries with the same # title but different URLs. if field == 'res_title': cur_title = arr['res_title'] title_counter[cur_title] += 1 # Starts with 1 if isinstance(cur_title, bytes): # pragma: py3 cur_key = title_fmt.format(cur_title.decode('utf-8'), title_counter[cur_title]) else: # pragma: py2 cur_key = title_fmt.format(cur_title, title_counter[cur_title]) # Special handling of title and access URL, # otherwise no change. if field == 'access_url': s = unescape_all(arr['access_url']) if isinstance(s, six.binary_type): s = s.decode('utf-8') cur_cat['url'] = s elif field == 'res_title': cur_cat['title'] = arr[field] else: cur_cat[field] = arr[field] # New field to track duplicate access URLs. cur_cat['duplicatesIgnored'] = 0 # Add catalog to database, unless duplicate access URL exists. # In that case, the entry is thrown out and the associated # counter is updated. dup_keys = db._url_keys[cur_cat['url']] if len(dup_keys) < 1: db.add_catalog(cur_key, VOSCatalog(cur_cat), allow_duplicate_url=False) else: db._catalogs[dup_keys[0]]['duplicatesIgnored'] += 1 warnings.warn( '{0} is thrown out because it has same access URL as ' '{1}.'.format(cur_key, dup_keys[0]), AstropyUserWarning) return db
def try_query(url, retries=3, timeout=60, get_params=None, post_data=None, files=None): """ A wrapper to the astroquery _request() function allowing for retries """ from requests.exceptions import (Timeout, ReadTimeout) from urllib3.exceptions import ReadTimeoutError from astroquery.query import BaseQuery from IPython.core.debugger import Tracer ## For XML entities like "&" that may or may not be handled correctly server-side from astropy.utils.xml.unescaper import unescape_all url = unescape_all(url) """ ## This is necessary for some services, e.g., skyserver.sdss.org has an image service listed with access_url ## http://skyserver.sdss.org/vo/DR2SIAP/SIAP.asmx/getSiapInfo?&FORMAT=image/fits&BANDPASS=ugriz& try: ## Python 2 import HTMLParser url=HTMLParser.HTMLParser().unescape(url) except: ## Python 3 import html url=html.unescape(html) """ bq = BaseQuery() retry = retries ## By default, do a get with no parameters and assume they are in the URL if get_params is None and post_data is None: get_params = {} #Tracer()() while retry: try: if post_data is not None: response = bq._request('POST', url, data=post_data, cache=False, timeout=timeout, files=files) else: response = bq._request('GET', url, params=get_params, cache=False, timeout=timeout) retry = retries - 1 except (Timeout, ReadTimeout, ReadTimeoutError, ConnectionError) as e: retry = retry - 1 if retry == 0: print("ERROR: Got another timeout; quitting.") #Tracer()() #raise e return response else: print("WARNING: Got a timeout; trying again.") except: raise else: return response