def test_download_mirror_cache():
    """Check that ``download_file`` reuses the mirror's cached copy.

    Regression test for https://github.com/astropy/astropy/issues/6982.
    The main and mirror data URLs are rerouted to local package
    directories expressed as file URIs.
    """
    import pathlib
    import shelve
    from astropy.utils.data import _find_pkg_data_path, download_file, get_cached_urls

    # Resolve the two package data directories, then express them as URIs
    # with a trailing slash so file names can simply be appended.
    main_dir = _find_pkg_data_path(os.path.join('data', 'dataurl'))
    mirror_dir = _find_pkg_data_path(os.path.join('data', 'dataurl_mirror'))
    main_url = pathlib.Path(main_dir).as_uri() + '/'
    mirror_url = pathlib.Path(mirror_dir).as_uri() + '/'

    main_file = main_url + 'index.html'
    mirror_file = mirror_url + 'index.html'

    # Override data.conf for the duration of the test only.
    # This also exercises https://github.com/astropy/astropy/pull/8163,
    # since urlopen() on a local directory URI raises URLError as well.
    with conf.set_temp('dataurl', main_url), \
            conf.set_temp('dataurl_mirror', mirror_url):

        # Prime the cache with both files ("downloads" are local reads).
        download_file(main_file, cache=True)
        download_file(mirror_file, cache=True)

        # Drop the main URL from the URL-to-hash map so that only the
        # mirror entry is left in the cache.
        _, urlmap_path = _get_download_cache_locs()
        with shelve.open(urlmap_path) as url_map:
            del url_map[main_file]

        # Equal return values show that both requests resolve to the same
        # cached file; this alone does not prove which URL served it.
        from_main = download_file(main_file, cache=True)
        from_mirror = download_file(mirror_file, cache=True)
        assert from_main == from_mirror

        # Inspect the cache only after the final download so the view is
        # accurate: the mirror must be listed and the main URL must not
        # have been fetched again.
        # This also covers "assert TESTURL in get_cached_urls()".
        cached = get_cached_urls()
        assert mirror_file in cached
        assert main_file not in cached
Beispiel #2
0
def test_download_mirror_cache():
    """Ensure a file cached under the mirror URL is not downloaded again.

    See https://github.com/astropy/astropy/issues/6982. Local package
    data directories stand in for the main and mirror data servers.
    """
    import pathlib
    import shelve
    from astropy.utils.data import _find_pkg_data_path, download_file, get_cached_urls

    main_path = pathlib.Path(
        _find_pkg_data_path(os.path.join('data', 'dataurl')))
    main_url = main_path.as_uri() + '/'
    mirror_path = pathlib.Path(
        _find_pkg_data_path(os.path.join('data', 'dataurl_mirror')))
    mirror_url = mirror_path.as_uri() + '/'

    main_file = main_url + 'index.html'
    mirror_file = mirror_url + 'index.html'

    # Temporarily point data.conf at the local URIs.
    # urlopen() on a local directory URI also raises URLError, so this
    # doubles as a test for https://github.com/astropy/astropy/pull/8163.
    with conf.set_temp('dataurl', main_url):
        with conf.set_temp('dataurl_mirror', mirror_url):

            # Populate the cache: main first, then mirror.
            for url in (main_file, mirror_file):
                download_file(url, cache=True)

            # Forget the main URL so only the mirror entry remains cached.
            cache_locs = _get_download_cache_locs()
            dldir, urlmapfn = cache_locs
            with shelve.open(urlmapfn) as url2hash:
                del url2hash[main_file]

            # Identical results mean the same file was obtained, though
            # not necessarily from the same URL.
            main_result = download_file(main_file, cache=True)
            mirror_result = download_file(mirror_file, cache=True)
            assert main_result == mirror_result

            # The cached URL list must be read after the last download.
            # The main URL being absent shows it was not re-downloaded
            # unnecessarily. This also tests for
            # "assert TESTURL in get_cached_urls()".
            c_urls = get_cached_urls()
            mirror_cached = mirror_file in c_urls
            main_cached = main_file in c_urls
            assert mirror_cached and not main_cached
Beispiel #3
0
    def from_registry(cls, registry_url, timeout=60, **kwargs):
        """
        Build a database of VO services from a VO registry URL.

        The procedure is described in :ref:`vo-sec-validator-build-db`;
        the ``validate_xxx`` keys mentioned there are not set here, they
        are added by the validator itself.

        Parameters
        ----------
        registry_url : str
            URL of a VO registry that returns a VO Table, e.g.,
            ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``.
            The table is always parsed with pedantic set to `False`.

        timeout : number
            Value temporarily assigned to
            ``astropy.utils.data.conf.remote_timeout`` while the registry
            is being read, to avoid time out errors on large registries.

        kwargs : dict
            Keywords passed on to
            :func:`~astropy.utils.data.get_readable_fileobj`.

        Returns
        -------
        db : `VOSDatabase`
            Database built from the given registry.

        Raises
        ------
        VOSError
            Registry lacks a compulsory field.

        """
        # Fetch the registry and parse it as a single VO table.
        with data_conf.set_temp('remote_timeout', timeout):
            with get_readable_fileobj(registry_url, **kwargs) as fd:
                tab_all = parse_single_table(fd, pedantic=False)

        field_names = tab_all.array.dtype.names

        # Refuse registries that lack the fields needed to build an entry.
        for name in ('res_title', 'access_url'):
            if name not in field_names:  # pragma: no cover
                raise VOSError('"{0}" is missing from registry.'.format(name))

        seen_titles = defaultdict(int)
        db = cls.create_empty()

        # One table row yields one candidate catalog.
        for row in tab_all.array.data:
            catalog = {}
            key = ''

            # The catalog is fully assembled even if it ends up discarded;
            # that keeps the code easy to adapt should duplicate URLs be
            # allowed in the future.
            for name in field_names:
                if name == 'access_url':
                    # Stored under 'url', unescaped and always as text.
                    url = unescape_all(row['access_url'])
                    if isinstance(url, six.binary_type):
                        url = url.decode('utf-8')
                    catalog['url'] = url
                elif name == 'res_title':
                    # The primary key is the title plus a running number,
                    # because a registry may list the same title several
                    # times with different URLs.
                    title = row['res_title']
                    seen_titles[title] += 1  # Starts with 1
                    if isinstance(title, bytes):  # pragma: py3
                        label = title.decode('utf-8')
                    else:  # pragma: py2
                        label = title
                    key = '{0} {1}'.format(label, seen_titles[title])
                    catalog['title'] = title
                else:
                    # Every other field is copied unchanged.
                    catalog[name] = row[name]

            # Extra field counting how many duplicates were dropped.
            catalog['duplicatesIgnored'] = 0

            # An entry whose access URL is already present is discarded
            # and counted against the entry that was kept.
            existing = db._url_keys[catalog['url']]
            if len(existing) >= 1:
                kept_key = existing[0]
                db._catalogs[kept_key]['duplicatesIgnored'] += 1
                message = ('{0} is thrown out because it has same access '
                           'URL as {1}.'.format(key, kept_key))
                warnings.warn(message, AstropyUserWarning)
                continue

            db.add_catalog(
                key, VOSCatalog(catalog), allow_duplicate_url=False)

        return db
Beispiel #4
0
    def from_registry(cls, registry_url, timeout=60, **kwargs):
        """
        Create a VO services database from the table a VO registry returns.

        See :ref:`vo-sec-validator-build-db` for details. The
        ``validate_xxx`` keys described there are added later by the
        validator and are not produced by this method.

        Parameters
        ----------
        registry_url : str
            URL of VO registry that returns a VO Table. For an example,
            see ``astroquery.vo_conesearch.validator.conf.cs_mstr_list``.
            Parsing is done with pedantic set to `False`.

        timeout : number
            ``astropy.utils.data.conf.remote_timeout`` is temporarily set
            to this value so that reading the whole registry does not
            time out.

        kwargs : dict
            Keywords accepted by
            :func:`~astropy.utils.data.get_readable_fileobj`.

        Returns
        -------
        db : `VOSDatabase`
            Database from given registry.

        Raises
        ------
        VOSError
            A compulsory field is missing from the registry.

        """
        # Download the registry and read it as a VO table.
        with data_conf.set_temp('remote_timeout', timeout):
            with get_readable_fileobj(registry_url, **kwargs) as fd:
                tab_all = parse_single_table(fd, pedantic=False)

        cat_fields = tab_all.array.dtype.names

        # Both of these fields must be present; report the first absent one.
        missing = [required for required in ['res_title', 'access_url']
                   if required not in cat_fields]
        if missing:  # pragma: no cover
            raise VOSError(
                '"{0}" is missing from registry.'.format(missing[0]))

        key_template = '{0} {1}'
        title_counter = defaultdict(int)
        db = cls.create_empty()

        # Every row of the table is turned into a catalog.
        for record in tab_all.array.data:
            entry = {}
            entry_key = ''

            # Build the complete catalog before deciding whether to keep
            # it, so fewer changes are needed if duplicate URLs are ever
            # allowed.
            for field in cat_fields:
                if field == 'res_title':
                    # Titles can repeat with different URLs, so the key
                    # gets a per-title sequence number appended.
                    raw_title = record[field]
                    title_counter[raw_title] += 1  # Starts with 1
                    shown_title = raw_title
                    if isinstance(shown_title, bytes):  # pragma: py3
                        shown_title = shown_title.decode('utf-8')
                    entry_key = key_template.format(
                        shown_title, title_counter[raw_title])
                    entry['title'] = raw_title
                    continue

                if field == 'access_url':
                    # The access URL is unescaped and decoded to text.
                    access = unescape_all(record[field])
                    if isinstance(access, six.binary_type):
                        access = access.decode('utf-8')
                    entry['url'] = access
                    continue

                # Remaining fields pass through as they are.
                entry[field] = record[field]

            # Counter for duplicate access URLs that get ignored.
            entry['duplicatesIgnored'] = 0

            # Keep the entry only when its access URL is new; otherwise
            # bump the counter on the entry already stored and warn.
            dup_keys = db._url_keys[entry['url']]
            if len(dup_keys) == 0:
                new_catalog = VOSCatalog(entry)
                db.add_catalog(entry_key, new_catalog,
                               allow_duplicate_url=False)
            else:
                first_key = dup_keys[0]
                db._catalogs[first_key]['duplicatesIgnored'] += 1
                warnings.warn(
                    '{0} is thrown out because it has same access URL as '
                    '{1}.'.format(entry_key, first_key),
                    AstropyUserWarning)

        return db