Example #1
0
def test_unescape_all():
    """Check that ``unescape_all`` resolves ``%2F`` for str and bytes URLs.

    The same URL is exercised twice: once as text and once as its
    ASCII-encoded bytes equivalent.  In both cases the escaped ``%2F``
    is expected to come back as ``/`` while the rest of the URL
    (including the ``&`` separators) is left untouched.
    """
    escaped = ('http://casu.ast.cam.ac.uk/ag/iphas-dsa%2FSubmitCone?'
               'DSACAT=IDR&DSATAB=Emitters&')
    expected = ('http://casu.ast.cam.ac.uk/ag/iphas-dsa/SubmitCone?'
                'DSACAT=IDR&DSATAB=Emitters&')

    # Text input.
    assert unescaper.unescape_all(escaped) == expected

    # Bytes input: same URL, ASCII-encoded.
    escaped_bytes = escaped.encode('ascii')
    expected_bytes = expected.encode('ascii')
    assert unescaper.unescape_all(escaped_bytes) == expected_bytes
Example #2
0
    def from_registry(cls, registry_url, timeout=60, **kwargs):
        """
        Build a database of VO services from a VO registry URL.

        The registry is downloaded and parsed as a single VO table, and
        every row is turned into one catalog.  The catalog key is the
        row's title followed by a running number, because the registry
        can contain several entries with the same title but different
        URLs.  A row whose (unescaped) access URL is already present in
        the database is not added; the ``duplicatesIgnored`` counter of
        the existing entry is incremented instead and a warning is issued.

        The database layout is described in
        :ref:`vo-sec-validator-build-db`, except for the ``validate_xxx``
        keys that are added by the validator itself.

        Parameters
        ----------
        registry_url : str
            URL of VO registry that returns a VO Table.
            For example, see
            ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``.
            The table is parsed with pedantic set to `False`.

        timeout : number
            Value temporarily assigned to
            ``astropy.utils.data.conf.remote_timeout`` while the registry
            is read, to avoid a time out error on the full download.

        kwargs : dict
            Keywords accepted by
            :func:`~astropy.utils.data.get_readable_fileobj`.

        Returns
        -------
        db : `VOSDatabase`
            Database from given registry.

        Raises
        ------
        VOSError
            Invalid VO registry (a compulsory field is missing).

        """
        # Fetch the registry as a VO table; the timeout override only
        # lasts for the duration of the download and parse.
        with data_conf.set_temp('remote_timeout', timeout), \
                get_readable_fileobj(registry_url, **kwargs) as fd:
            tab_all = parse_single_table(fd, pedantic=False)

        # The registry is unusable without a title and an access URL.
        cat_fields = tab_all.array.dtype.names
        for required in ('res_title', 'access_url'):
            if required not in cat_fields:  # pragma: no cover
                raise VOSError(
                    '"{0}" is missing from registry.'.format(required))

        seen_titles = defaultdict(int)
        db = cls.create_empty()

        # One catalog per table row.
        for row in tab_all.array.data:
            catalog = {}
            key = ''

            # The catalog is fully assembled even if it ends up being
            # discarded as a duplicate, so that little has to change
            # should duplicate URLs be allowed in the future.
            for name in cat_fields:
                if name == 'access_url':
                    # Stored under 'url', unescaped and always as text.
                    url = unescape_all(row['access_url'])
                    catalog['url'] = (url.decode('utf-8')
                                      if isinstance(url, six.binary_type)
                                      else url)
                elif name == 'res_title':
                    # Primary key is "<title> <n>", n counting from 1 per
                    # distinct title.
                    title = row['res_title']
                    seen_titles[title] += 1
                    label = (title.decode('utf-8')  # pragma: py3
                             if isinstance(title, bytes)
                             else title)  # pragma: py2
                    key = '{0} {1}'.format(label, seen_titles[title])
                    catalog['title'] = row[name]
                else:
                    # Every other field is copied verbatim.
                    catalog[name] = row[name]

            # Extra field counting later rows dropped for sharing this URL.
            catalog['duplicatesIgnored'] = 0

            same_url_keys = db._url_keys[catalog['url']]
            if len(same_url_keys) > 0:
                # URL already in the database: drop this row and record
                # the fact on the entry that is kept.
                kept_key = same_url_keys[0]
                db._catalogs[kept_key]['duplicatesIgnored'] += 1
                warnings.warn(
                    '{0} is thrown out because it has same access URL as '
                    '{1}.'.format(key, kept_key), AstropyUserWarning)
            else:
                db.add_catalog(
                    key, VOSCatalog(catalog), allow_duplicate_url=False)

        return db
Example #3
0
    def from_registry(cls, registry_url, timeout=60, **kwargs):
        """
        Create a VO service database out of a VO registry.

        Each row of the registry table becomes a catalog whose key is
        its title plus a per-title sequence number (titles may repeat
        with different URLs).  Rows that repeat an access URL already in
        the database are skipped with a warning, and the entry that was
        kept has its ``duplicatesIgnored`` count bumped.

        See :ref:`vo-sec-validator-build-db` for the database layout;
        the ``validate_xxx`` keys mentioned there are added by the
        validator itself, not here.

        Parameters
        ----------
        registry_url : str
            URL of VO registry that returns a VO Table.
            For example, see
            ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``.
            Parsing is always done with pedantic set to `False`.

        timeout : number
            ``astropy.utils.data.conf.remote_timeout`` is temporarily
            set to this value so that reading the entire registry does
            not time out.

        kwargs : dict
            Keywords accepted by
            :func:`~astropy.utils.data.get_readable_fileobj`.

        Returns
        -------
        db : `VOSDatabase`
            Database from given registry.

        Raises
        ------
        VOSError
            Invalid VO registry.

        """
        # Read the whole registry into a VO table under the given timeout.
        with data_conf.set_temp('remote_timeout', timeout):
            with get_readable_fileobj(registry_url, **kwargs) as fd:
                tab_all = parse_single_table(fd, pedantic=False)

        # Refuse registries lacking either compulsory column.
        cat_fields = tab_all.array.dtype.names
        for needed in ['res_title', 'access_url']:
            if needed not in cat_fields:  # pragma: no cover
                raise VOSError('"{0}" is missing from registry.'.format(needed))

        key_template = '{0} {1}'
        occurrences = defaultdict(int)
        db = cls.create_empty()

        for record in tab_all.array.data:
            entry = {}
            entry_key = ''

            # Build the complete catalog first, even though it may be
            # thrown away below; this keeps the code easy to adapt if
            # duplicate URLs are ever allowed.
            for col in cat_fields:
                if col == 'res_title':
                    # Number the title (starting at 1) to get a unique key.
                    raw_title = record['res_title']
                    occurrences[raw_title] += 1
                    count = occurrences[raw_title]

                    if isinstance(raw_title, bytes):  # pragma: py3
                        text_title = raw_title.decode('utf-8')
                    else:  # pragma: py2
                        text_title = raw_title
                    entry_key = key_template.format(text_title, count)

                    entry['title'] = record[col]
                    continue

                if col == 'access_url':
                    # Access URL is unescaped and normalized to text.
                    access = unescape_all(record['access_url'])
                    if isinstance(access, six.binary_type):
                        access = access.decode('utf-8')
                    entry['url'] = access
                    continue

                # Remaining columns go in unchanged.
                entry[col] = record[col]

            # Counter for later rows ignored because they share this URL.
            entry['duplicatesIgnored'] = 0

            existing = db._url_keys[entry['url']]
            if len(existing) == 0:
                # First time this URL is seen: keep the catalog.
                db.add_catalog(entry_key,
                               VOSCatalog(entry),
                               allow_duplicate_url=False)
                continue

            # Duplicate URL: discard the row, note it on the kept entry.
            db._catalogs[existing[0]]['duplicatesIgnored'] += 1
            warnings.warn(
                '{0} is thrown out because it has same access URL as '
                '{1}.'.format(entry_key, existing[0]), AstropyUserWarning)

        return db
Example #4
0
def try_query(url,
              retries=3,
              timeout=60,
              get_params=None,
              post_data=None,
              files=None):
    """Wrap astroquery's ``BaseQuery._request`` with retries on timeout.

    The URL is first passed through ``unescape_all``.  The request is
    sent as a POST when ``post_data`` is given and as a GET otherwise,
    always with ``cache=False``.  A timeout (or connection error) causes
    another attempt until ``retries`` attempts have been made; any other
    exception propagates to the caller immediately.

    Parameters
    ----------
    url : str
        URL to query.

    retries : int
        Maximum number of attempts.

    timeout : number
        Timeout passed to each individual request.

    get_params : dict or None
        Query parameters for a GET request.  If both this and
        ``post_data`` are `None`, an empty dict is used and the
        parameters are assumed to be in ``url`` already.  Not used when
        ``post_data`` is given.

    post_data : object or None
        If not `None`, a POST is issued with this as ``data``.

    files : object or None
        Passed as ``files`` to the POST request; not used for GET.

    Returns
    -------
    response : object or None
        Whatever ``BaseQuery._request`` returned for the first attempt
        that did not time out, or `None` if every attempt timed out (or
        ``retries`` is 0).

    """
    from requests.exceptions import (Timeout, ReadTimeout)
    from urllib3.exceptions import ReadTimeoutError
    from astroquery.query import BaseQuery

    # XML entities like "&amp;" may or may not be handled correctly
    # server-side, so resolve them before sending.  This is necessary for
    # some services, e.g., skyserver.sdss.org has an image service listed
    # with access_url
    #   http://skyserver.sdss.org/vo/DR2SIAP/SIAP.asmx/getSiapInfo?&amp;FORMAT=image/fits&amp;BANDPASS=ugriz&amp;
    from astropy.utils.xml.unescaper import unescape_all
    url = unescape_all(url)

    bq = BaseQuery()

    # By default, do a GET with no parameters and assume they are in the URL.
    if get_params is None and post_data is None:
        get_params = {}

    # Bound before the loop so that running out of retries returns None
    # instead of raising UnboundLocalError on a never-assigned name.
    response = None
    retry = retries

    while retry:
        try:
            if post_data is not None:
                response = bq._request('POST',
                                       url,
                                       data=post_data,
                                       cache=False,
                                       timeout=timeout,
                                       files=files)
            else:
                response = bq._request('GET',
                                       url,
                                       params=get_params,
                                       cache=False,
                                       timeout=timeout)
        # NOTE(review): ``ConnectionError`` is not imported in this function,
        # so (unless the module imports it elsewhere) it is the Python 3
        # builtin, not ``requests.exceptions.ConnectionError`` -- confirm
        # which one was intended.
        except (Timeout, ReadTimeout, ReadTimeoutError, ConnectionError):
            retry = retry - 1
            if retry == 0:
                print("ERROR: Got another timeout; quitting.")
                return response
            print("WARNING: Got a timeout; trying again.")
        else:
            return response

    # Only reached when no attempt was made at all (``retries`` is 0).
    return response