Example #1
def urlretrieve(url, filename, reporthook, sha256sum):
    try:
        _urlretrieve(url, filename, reporthook)
        if not validate_sha256(filename, sha256sum):
            raise DownloadError("Corrupted download, the sha256 doesn't match")
    except BaseException:
        # remove the partial or corrupt file before re-raising
        os.unlink(filename)
        raise
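The validate_sha256 helper is not shown in this snippet; a minimal sketch using hashlib (an assumption, not necessarily the original project's implementation) could look like this:

import hashlib

def validate_sha256(filename, sha256sum):
    """Return True if the file's SHA-256 hex digest matches sha256sum."""
    # hypothetical helper; hash in chunks so large downloads fit in memory
    sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            sha256.update(chunk)
    return sha256.hexdigest() == sha256sum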
Example #2
def _image_from_url(url):
    """a method for loading images from urls by first saving them locally"""
    filename = os.path.basename(url)
    if not os.path.exists(filename):
        if "://" not in url:
            url = "http://" + url
        _urlretrieve(url, filename)
    image, filename = _image_from_file(filename)
    return image, filename
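The _image_from_file helper is not shown. Since the sound loader in Example #3 uses pygame, a plausible sketch (an assumption, not the project's actual code) is:

import pygame

def _image_from_file(filename):
    # hypothetical helper: load an image that already exists on disk
    image = pygame.image.load(filename)
    return image, filename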
Example #3
def load_sound(url_or_filename):
    '''Reads a sound file from a given filename or url'''
    # return the cached Sound if this source was loaded before
    if url_or_filename in _known_sounds:
        return _known_sounds[url_or_filename]
    if not os.path.exists(url_or_filename):
        filename = os.path.basename(url_or_filename)
        if not os.path.exists(filename):
            _urlretrieve(url_or_filename, filename)
        url_or_filename = filename
    sound = pygame.mixer.Sound(url_or_filename)
    _known_sounds[url_or_filename] = sound
    return sound
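Playing a sound loaded this way requires an initialized mixer; a minimal, hypothetical usage (URL for illustration only):

import pygame

pygame.mixer.init()
sound = load_sound("https://example.com/ping.wav")  # hypothetical URL
sound.play()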
Example #4
def urlretrieve(url, filename=None):
    '''Download a file to disk.'''
    if filename is None:
        filename = os.path.basename(url)
        debug('DOWNLOAD', '%s from %s', filename, os.path.dirname(url))
    else:
        debug('DOWNLOAD', '%s as %s', url, filename)

    if _urlretrieve is None:
        data = urllib2.urlopen(url).read()
        # write in binary mode so non-text downloads aren't corrupted
        with open(filename, 'wb') as f:
            f.write(data)
    else:
        _urlretrieve(url, filename)
Example #5
def open_pdb(pdbid, pdb_url=None):
    """Make a local copy of an online pdb file and return a file handle."""
    if pdb_url is None:
        pdb_url = default_pdb_url
    url = pdb_url % pdbid
    fn, header = _urlretrieve(url)
    return open(fn)
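default_pdb_url is a %-style format string with a single %s slot for the PDB id. For illustration only (the real value depends on the surrounding module), it might be defined as:

default_pdb_url = "https://files.rcsb.org/download/%s.pdb"  # hypothetical value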
Example #6
def load_dataset(
    name: Optional[str] = None,
    cache: bool = True,
    cache_dir: str = _default_cache_dir,
    github_url: str = "https://github.com/pangeo-data/climpred-data",
    branch: str = "master",
    extension: Optional[str] = None,
    proxy_dict: Optional[Dict[str, str]] = None,
    **kws,
) -> xr.Dataset:
    """Load example data or a mask from an online repository.

    Args:
        name: Name of the netcdf file containing the
              dataset, without the ``.nc`` extension. If ``None``, this function
              prints out the available datasets to import.
        cache: If ``True``, cache data locally for use on later calls.
        cache_dir: The directory in which to search for and cache the data.
        github_url: Github repository where the data is stored.
        branch: The git branch to download from.
        extension: Subfolder within the repository where the data is stored.
        proxy_dict: Dictionary with keys as either "http" or "https" and values as the
            proxy server. This is useful if you are on a work computer behind a
            firewall and need to use a proxy out to download data.
        kws: Keywords passed to :py:meth:`~xarray.open_dataset`.

    Returns:
        The desired :py:class:`xarray.Dataset`

    Examples:
        >>> from climpred.tutorial import load_dataset
        >>> proxy_dict = {"http": "127.0.0.1"}
        >>> ds = load_dataset("FOSI-SST", cache=False, proxy_dict=proxy_dict)
    """
    if name is None:
        return _get_datasets()

    if proxy_dict is not None:
        _initialize_proxy(proxy_dict)

    # https://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    # Allows for generalized file extensions.
    name, ext = _os.path.splitext(name)
    if not ext.endswith(".nc"):
        ext += ".nc"

    # use aliases
    if name in FILE_ALIAS_DICT.keys():
        name = FILE_ALIAS_DICT[name]
    longdir = _os.path.expanduser(cache_dir)
    fullname = name + ext
    localfile = _os.sep.join((longdir, fullname))
    md5name = name + ".md5"
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        if extension is not None:
            url = "/".join((github_url, "raw", branch, extension, fullname))
            _urlretrieve(url, localfile)
            url = "/".join((github_url, "raw", branch, extension, md5name))
            _urlretrieve(url, md5file)
        else:
            url = "/".join((github_url, "raw", branch, fullname))
            _urlretrieve(url, localfile)
            url = "/".join((github_url, "raw", branch, md5name))
            _urlretrieve(url, md5file)

        localmd5 = _file_md5_checksum(localfile)
        with open(md5file, "r") as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            Try downloading the file again. There was a confliction between
            your local .md5 file compared to the one in the remote repository,
            so the local copy has been removed to resolve the issue.
            """
            raise IOError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)
    return ds
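The _file_md5_checksum helper is assumed by this example; a minimal hashlib-based sketch consistent with how it is used above (hypothetical, not the project's code):

import hashlib

def _file_md5_checksum(fname):
    # hypothetical helper: hex MD5 digest of a file's contents
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()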
Example #7
def urlretrieve(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True, miniters=1,
                             desc=url.split('/')[-1]) as t:
        _urlretrieve(url, filename=output_path, reporthook=t.update_to)
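DownloadProgressBar is not defined in this snippet; it follows tqdm's documented reporthook recipe, so a sketch under that assumption is:

from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        # b: blocks transferred so far, bsize: block size, tsize: total size
        if tsize is not None:
            self.total = tsize
        # advance the bar by the delta between downloaded bytes and position
        self.update(b * bsize - self.n)

urllib.request.urlretrieve calls the reporthook with exactly those three positional arguments, which is why update_to fits here.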
Example #8
def open_dataset(name,
                 cache=True,
                 cache_dir=_default_cache_dir,
                 github_url='https://github.com/bradyrx/climpred',
                 branch='master',
                 extension='sample_data/prediction',
                 **kws):
    """Load example data or a mask from an online repository.

    This is a function from `xarray.tutorial` to load an online dataset
    with minimal package imports. I am copying it here because it looks like
    it will soon be deprecated. Also, I've added the ability to point to
    data files that are not in the main folder of the repo (i.e., they are
    in subfolders).

    Note that this requires an md5 file to be loaded. Check the github
    repo bradyrx/climdata for a python script that generates the .md5
    checksum files for the .nc files.

    Args:
        name: (str) Name of the netcdf file containing the dataset, without
              the .nc extension.
        cache_dir: (str, optional) The directory in which to search
                   for and cache the data.
        cache: (bool, optional) If true, cache data locally for use on later
               calls.
        github_url: (str, optional) Github repository where the data is stored.
        branch: (str, optional) The git branch to download from.
        extension: (str, optional) Subfolder within the repository where the
                   data is stored.
        kws: (dict, optional) Keywords passed to xarray.open_dataset

    Returns:
        The desired xarray dataset.
    """
    if name.endswith('.nc'):
        name = name[:-3]
    # use aliases
    if name in file_alias_dict.keys():
        name = file_alias_dict[name]
    longdir = _os.path.expanduser(cache_dir)
    fullname = name + '.nc'
    localfile = _os.sep.join((longdir, fullname))
    md5name = name + '.md5'
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        if extension is not None:
            url = '/'.join((github_url, 'raw', branch, extension, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, extension, md5name))
            _urlretrieve(url, md5file)
        else:
            url = '/'.join((github_url, 'raw', branch, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, md5name))
            _urlretrieve(url, md5file)

        localmd5 = _file_md5_checksum(localfile)
        with open(md5file, 'r') as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            MD5 checksum does not match; try downloading the dataset again.
            """
            raise IOError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)

    return ds
Example #9
def load_dataset(
    name=None,
    cache=True,
    cache_dir=_default_cache_dir,
    github_url='https://github.com/bradyrx/climpred-data',
    branch='master',
    extension=None,
    proxy_dict=None,
    **kws,
):
    """Load example data or a mask from an online repository.

    Args:
        name: (str, default None) Name of the netcdf file containing the
              dataset, without the .nc extension. If None, this function
              prints out the available datasets to import.
        cache_dir: (str, optional) The directory in which to search
                   for and cache the data.
        cache: (bool, optional) If True, cache data locally for use on later
               calls.
        github_url: (str, optional) Github repository where the data is stored.
        branch: (str, optional) The git branch to download from.
        extension: (str, optional) Subfolder within the repository where the
                   data is stored.
        proxy_dict: (dict, optional) Dictionary with keys as either 'http' or
                    'https' and values as the proxy server. This is useful
                    if you are on a work computer behind a firewall and need
                    to use a proxy out to download data.
        kws: (dict, optional) Keywords passed to xarray.open_dataset

    Returns:
        The desired xarray dataset.

    Examples:
        >>> from climpred.tutorial import load_dataset
        >>> proxy_dict = {'http': '127.0.0.1'}
        >>> ds = load_dataset('FOSI-SST', cache=False, proxy_dict=proxy_dict)
    """
    if name is None:
        return _get_datasets()

    if proxy_dict is not None:
        _initialize_proxy(proxy_dict)

    # https://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    # Allows for generalized file extensions.
    name, ext = _os.path.splitext(name)
    if not ext.endswith('.nc'):
        ext += '.nc'

    # use aliases
    if name in FILE_ALIAS_DICT.keys():
        name = FILE_ALIAS_DICT[name]
    longdir = _os.path.expanduser(cache_dir)
    fullname = name + ext
    localfile = _os.sep.join((longdir, fullname))
    md5name = name + '.md5'
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        if extension is not None:
            url = '/'.join((github_url, 'raw', branch, extension, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, extension, md5name))
            _urlretrieve(url, md5file)
        else:
            url = '/'.join((github_url, 'raw', branch, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, md5name))
            _urlretrieve(url, md5file)

        localmd5 = _file_md5_checksum(localfile)
        with open(md5file, 'r') as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            Try downloading the file again. There was a confliction between
            your local .md5 file compared to the one in the remote repository,
            so the local copy has been removed to resolve the issue.
            """
            raise IOError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)
    return ds
Example #10
def main():
    """Extract a SCOP domain's ATOM and HETATOM records from a PDB file."""
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "hp:o:i:",
            ["help", "usage", "pdb=", "output=", "input="])
    except getopt.GetoptError:
        # show help information and exit:
        usage()
        sys.exit(2)

    input = None
    in_handle = None
    output = None
    pdb_url = None
    cla_url = None
    raf_url = None

    for o, a in opts:
        if o in ("-h", "--help", "--usage"):
            usage()
            sys.exit()
        elif o in ("-o", "--output"):
            output = a
        elif o in ("-i", "--input"):
            input = a
        elif o in ("-p", "--pdb"):
            pdb_url = a

    if len(args) < 2:
        sys.stderr.write(
            "Not enough arguments. Try --help for more details.\n")
        sys.exit(2)

    raf_url = args[0]
    cla_url = args[1]

    (raf_filename, headers) = _urlretrieve(raf_url)
    seqMapIndex = Raf.SeqMapIndex(raf_filename)

    (cla_filename, headers) = _urlretrieve(cla_url)
    claIndex = Cla.Index(cla_filename)

    if input is None:
        sids = args[2:]
    elif input == "-":
        sids = sys.stdin
    else:
        in_handle = open(input)
        sids = in_handle

    try:
        for sid in sids:
            if not sid or sid[0:1] == "#":
                continue
            id = sid[0:7]
            pdbid = id[1:5]
            s = pdbid[0:1]
            if s == "0" or s == "s":
                sys.stderr.write("No coordinates for domain %s\n" % id)
                continue

            if output is None:
                filename = id + ".ent"
                out_handle = open(filename, "w+")
            elif output == "-":
                out_handle = sys.stdout
            else:
                out_handle = open(output, "w+")

            try:
                try:
                    claRec = claIndex[id]
                    residues = claRec.residues
                    seqMap = seqMapIndex.getSeqMap(residues)
                    pdbid = residues.pdbid

                    f = open_pdb(pdbid, pdb_url)
                    try:
                        seqMap.getAtoms(f, out_handle)
                    finally:
                        f.close()
                except (IOError, KeyError, RuntimeError) as e:
                    sys.stderr.write("I cannot do SCOP domain %s : %s\n" %
                                     (id, e))
            finally:
                out_handle.close()
    finally:
        if in_handle is not None:
            in_handle.close()
Example #11
    def get_seqres_file(self, savefile="pdb_seqres.txt"):
        """Retrieve and save a (big) file containing all the sequences of PDB entries."""
        if self._verbose:
            print("Retrieving sequence file (takes over 110 MB).")
        url = self.pdb_server + "/pub/pdb/derived_data/pdb_seqres.txt"
        _urlretrieve(url, savefile)
Example #12
    def retrieve_pdb_file(self,
                          pdb_code,
                          obsolete=False,
                          pdir=None,
                          file_format=None,
                          overwrite=False):
        """Fetch PDB structure file from PDB server, and store it locally.

        The PDB structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        NOTE. The default download format has changed from PDB to PDBx/mmCif

        :param pdb_code: 4-symbols structure Id from PDB (e.g. 3J92).
        :type pdb_code: string

        :param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure}

        :type file_format: string

        :param overwrite: if set to True, existing structure files will be overwritten. Default: False
        :type overwrite: bool

        :param obsolete:
            Has a meaning only for obsolete structures. If True, download the
            obsolete structure to the 'obsolete' folder; otherwise the
            download won't be performed. This option doesn't work for the
            mmtf format, as obsolete structures aren't stored in mmtf. It
            also has no effect when the pdir parameter is specified.
            Note: make sure the structure you request really is obsolete;
            trying to download a non-obsolete structure into the obsolete
            folder will not work, and you will get a "structure doesn't
            exist" error.
            Default: False

        :type obsolete: bool

        :param pdir: put the file in this directory (default: create a PDB-style directory tree)
        :type pdir: string

        :return: filename
        :rtype: string
        """
        # Deprecation warning
        file_format = self._print_default_format_warning(file_format)

        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive = {
            "pdb": "pdb%s.ent.gz",
            "mmCif": "%s.cif.gz",
            "xml": "%s.xml.gz",
            "mmtf": "%s",
            "bundle": "%s-pdb-bundle.tar.gz",
        }
        if file_format not in archive:
            raise ValueError(
                "Specified file_format %s doesn't exist or is not supported. "
                "Maybe a typo. Please use one of the following: mmCif, pdb, "
                "xml, mmtf, bundle" % file_format)
        archive_fn = archive[file_format] % code

        if file_format in ("pdb", "mmCif", "xml"):
            pdb_dir = "divided" if not obsolete else "obsolete"
            file_type = ("pdb" if file_format == "pdb" else
                         "mmCIF" if file_format == "mmCif" else "XML")
            url = self.pdb_server + "/pub/pdb/data/structures/%s/%s/%s/%s" % (
                pdb_dir,
                file_type,
                code[1:3],
                archive_fn,
            )
        elif file_format == "bundle":
            url = self.pdb_server + "/pub/pdb/compatible/pdb_bundle/%s/%s/%s" % (
                code[1:3],
                code,
                archive_fn,
            )
        else:
            url = "http://mmtf.rcsb.org/v1.0/full/%s" % code

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)
        filename = os.path.join(path, archive_fn)
        final = {
            "pdb": "pdb%s.ent",
            "mmCif": "%s.cif",
            "xml": "%s.xml",
            "mmtf": "%s.mmtf",
            "bundle": "%s-pdb-bundle.tar",
        }
        final_file = os.path.join(path, final[file_format] % code)

        # Skip download if the file already exists
        if not overwrite:
            if os.path.exists(final_file):
                if self._verbose:
                    print("Structure exists: '%s' " % final_file)
                return final_file

        # Retrieve the file
        if self._verbose:
            print("Downloading PDB structure '%s'..." % pdb_code)
        try:
            _urlcleanup()
            _urlretrieve(url, filename)
        except IOError:
            print("Desired structure doesn't exist")
        else:
            with gzip.open(filename, "rb") as gz:
                with open(final_file, "wb") as out:
                    out.writelines(gz)
            os.remove(filename)
        return final_file
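A typical call, assuming this method lives on Biopython's PDBList class (as the pdb_server and flat_tree attributes suggest):

from Bio.PDB import PDBList

pdbl = PDBList()
# download 3J92 as PDBx/mmCIF into the current directory
filename = pdbl.retrieve_pdb_file("3j92", pdir=".", file_format="mmCif")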