def load_all_models_in_directory(dir_name, limit_extensions=True,
                                 recursive=False):
    """
    Collect every PDB-type model file found in a directory.

    :param dir_name: directory to scan; an assertion fails if it is not an
      existing directory
    :param limit_extensions: when true, a file is skipped before any_file()
      is called unless guess_file_type() reports "pdb" for its path
    :param recursive: when true, sub-directories are scanned as well and
      their models are added to the same flat list
    :returns: list of (full_path, file_object) tuples, one per file whose
      any_file() result has file_type "pdb"
    """
    from iotbx.file_reader import any_file, guess_file_type
    assert os.path.isdir(dir_name)
    models = []
    for entry in os.listdir(dir_name):
        path = os.path.join(dir_name, entry)
        if recursive and os.path.isdir(path):
            models += load_all_models_in_directory(
                dir_name=path,
                limit_extensions=limit_extensions,
                recursive=True)
            continue
        if not os.path.isfile(path):
            # Neither a regular file nor a directory we descend into.
            continue
        if limit_extensions and guess_file_type(path) != "pdb":
            continue
        candidate = any_file(path, raise_sorry_if_not_expected_format=True)
        if candidate.file_type == "pdb":
            models.append((path, candidate.file_object))
    return models
def load_all_models_in_directory(dir_name, limit_extensions=True,
                                 recursive=False):
    """
    Load all models in the specified directory.

    Each directory entry is handled in one of two ways: a sub-directory is
    descended into (only when ``recursive`` is true), and a regular file is
    read with any_file() and kept if it turns out to be of type "pdb".  With
    ``limit_extensions`` true, files for which guess_file_type() does not
    report "pdb" are never passed to any_file().

    :returns: a list of (file name, file object) pairs, where the file name
      is the entry joined onto ``dir_name``
    """
    from iotbx.file_reader import any_file, guess_file_type
    assert os.path.isdir(dir_name)
    found = []
    for name in os.listdir(dir_name):
        location = os.path.join(dir_name, name)
        descend = os.path.isdir(location) and recursive
        if descend:
            nested = load_all_models_in_directory(
                dir_name=location,
                limit_extensions=limit_extensions,
                recursive=True)
            found.extend(nested)
        elif os.path.isfile(location):
            wanted = (not limit_extensions) or \
                (guess_file_type(location) == "pdb")
            if wanted:
                reader = any_file(location,
                                  raise_sorry_if_not_expected_format=True)
                if reader.file_type == "pdb":
                    found.append((location, reader.file_object))
    return found
def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None,
          force_download=False, local_cache=None):
    """
    Locate and open a data file for the specified PDB ID and format, either in
    a local mirror or online.

    Unless ``force_download`` is true, local copies are preferred: first the
    ``local_cache`` directory (models only), then the directory tree named by
    the PDB_MIRROR_PDB, PDB_MIRROR_MMCIF or PDB_MIRROR_STRUCTURE_FACTORS
    environment variable.  Only when no local file is found is the entry
    downloaded from the selected mirror.

    :param id: 4-character PDB ID (e.g. '1hbb'); checked with
      validate_pdb_id() and lower-cased before use
    :param data_type: type of content to download: pdb, xray, or fasta
      ("seq" is handled exactly like "fasta")
    :param format: format of data: cif, pdb, or xml
    :param mirror: remote site to use: rcsb, pdbe or pdbj
    :param log: file-like object that receives progress messages; defaults
      to null_out()
    :param force_download: when true, skip the local cache and local mirrors
    :param local_cache: directory searched for an already downloaded model;
      Auto selects the current working directory, None disables the search
    :returns: a filehandle-like object (with read() method)
    :raises RuntimeError: if the server answers the download with HTTP 404
    :raises Sorry: if no PDBj URL exists for the data_type/format combination
    """
    assert data_type in ["pdb", "xray", "fasta", "seq"]
    assert format in ["cif", "pdb", "xml"]
    assert mirror in ["rcsb", "pdbe", "pdbj"]
    validate_pdb_id(id)
    if log is None:
        log = null_out()
    id = id.lower()

    if not force_download:
        if (local_cache is not None) and (data_type == "pdb"):
            from iotbx.file_reader import guess_file_type
            if local_cache is Auto:
                local_cache = os.getcwd()
            for file_name in os.listdir(local_cache):
                if len(file_name) <= 4:
                    continue
                # Strip an optional leading "pdb" so that both "1abc..." and
                # "pdb1abc..." file names are matched against the ID.
                file_id = re.sub("^pdb", "", file_name)[0:4]
                if (file_id.lower() == id) and \
                        (guess_file_type(file_name) == "pdb"):
                    file_name = os.path.join(local_cache, file_name)
                    print("Reading from cache directory:", file=log)
                    print(" " + file_name, file=log)
                    return smart_open.for_reading(file_name)

        # Try a local mirror for PDB and X-ray data files, if one is
        # configured.  The requested format decides which mirror is used, so
        # that asking for mmCIF never hands back a PDB-format file.
        mirror_files = []
        if data_type == "pdb":
            if format == "pdb":
                mirror_files.append(("PDB_MIRROR_PDB", "pdb%s.ent.gz" % id))
            elif format == "cif":
                mirror_files.append(("PDB_MIRROR_MMCIF", "%s.cif.gz" % id))
        elif data_type == "xray":
            mirror_files.append(
                ("PDB_MIRROR_STRUCTURE_FACTORS", "r%ssf.ent.gz" % id))
        for env_name, base_name in mirror_files:
            mirror_root = os.environ.get(env_name)
            if mirror_root is None:
                continue
            # Mirror files live in a sub-directory named after the two
            # middle characters of the ID.
            file_name = os.path.join(mirror_root, id[1:3], base_name)
            if os.path.isfile(file_name):
                print("Reading from local mirror:", file=log)
                print(" " + file_name, file=log)
                return smart_open.for_reading(file_name)

    # No mirror found (or out of date), default to download.
    url = None
    compressed = False
    if mirror == "rcsb":
        url_base = "https://files.rcsb.org/download/"
        pdb_ext = ".pdb"
        sf_prefix = ""
        sf_ext = "-sf.cif"
    elif mirror == "pdbe":
        url_base = "https://www.ebi.ac.uk/pdbe-srv/view/files/"
        pdb_ext = ".ent"
        sf_prefix = "r"
        sf_ext = "sf.ent"
    elif mirror == "pdbj":
        # PDBj URLs do not follow the base + id + extension pattern, so the
        # complete URL is built here.
        url_base = "ftp://ftp.pdbj.org/pub/pdb/data/structures/divided/"
        if data_type == "pdb":
            compressed = True
            if format == "pdb":
                url = url_base + "pdb/%s/pdb%s.ent.gz" % (id[1:3], id)
            elif format == "cif":
                url = url_base + "mmCIF/%s/%s.cif.gz" % (id[1:3], id)
        elif data_type == "xray":
            compressed = True
            url = url_base + "structure_factors/%s/r%ssf.ent.gz" % (
                id[1:3], id)
        elif data_type in ["fasta", "seq"]:
            url = "https://pdbj.org/rest/downloadPDBfile?format=fasta&id=%s" \
                % id
        if (url is None) and (data_type != "fasta"):
            raise Sorry(
                "Can't determine PDBj download URL for this data/format " +
                "combination.")

    if data_type in ["fasta", "seq"]:
        content = "sequence"
        if url is None:
            # TODO PDBe equivalent doesn't exist?
            url = "https://www.rcsb.org/fasta/entry/%s" % id
    elif data_type == "xray":
        content = "structure factors"
        if url is None:
            url = url_base + sf_prefix + id + sf_ext
    else:
        content = "model"
        if url is None:
            if format == "pdb":
                url = url_base + id + pdb_ext
            else:
                url = url_base + id + "." + format

    try:
        data = libtbx.utils.urlopen(url)
    except urllib2.HTTPError as e:
        if e.getcode() == 404:
            raise RuntimeError(
                "Couldn't download %s for %s." % (content, id))
        raise

    if compressed:
        import gzip
        from io import BytesIO
        # The response object cannot be handed to GzipFile directly, so the
        # payload is read into an in-memory binary buffer first.
        return gzip.GzipFile(fileobj=BytesIO(data.read()))
    return data
def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None,
          force_download=False, local_cache=None):
    """
    Locate and open a data file for the specified PDB ID and format, either in
    a local mirror or online.

    Unless ``force_download`` is true, local copies are preferred: first the
    ``local_cache`` directory (models only), then the directory tree named by
    the PDB_MIRROR_PDB, PDB_MIRROR_MMCIF or PDB_MIRROR_STRUCTURE_FACTORS
    environment variable.  Only when no local file is found is the entry
    downloaded from the selected mirror.

    :param id: 4-character PDB ID (e.g. '1hbb'); checked with
      validate_pdb_id() and lower-cased before use
    :param data_type: type of content to download: pdb, xray, or fasta
      ("seq" is handled exactly like "fasta")
    :param format: format of data: cif, pdb, or xml (or cif_or_pdb).  With
      cif_or_pdb the local PDB mirror is tried before the local mmCIF
      mirror, and downloads request mmCIF
    :param mirror: remote site to use, either rcsb, pdbe, pdbj or pdb-redo
    :param log: file-like object that receives progress messages; defaults
      to null_out()
    :param force_download: when true, skip the local cache and local mirrors
    :param local_cache: directory searched for an already downloaded model;
      Auto selects the current working directory, None disables the search
    :returns: a filehandle-like object (with read() method)
    :raises RuntimeError: if the server answers the download with HTTP 404
    :raises Sorry: if no PDBj URL exists for the data_type/format
      combination, or gzip is unavailable for a compressed download
    """
    assert data_type in ["pdb", "xray", "fasta", "seq"]
    assert format in ["cif", "pdb", "xml", "cif_or_pdb"]
    assert mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]
    validate_pdb_id(id)
    if log is None:
        log = null_out()
    id = id.lower()

    if not force_download:
        if (local_cache is not None) and (data_type == "pdb"):
            from iotbx.file_reader import guess_file_type
            if local_cache is Auto:
                local_cache = os.getcwd()
            for file_name in os.listdir(local_cache):
                if len(file_name) <= 4:
                    continue
                # Strip an optional leading "pdb" so that both "1abc..." and
                # "pdb1abc..." file names are matched against the ID.
                file_id = re.sub("^pdb", "", file_name)[0:4]
                if (file_id.lower() == id) and \
                        (guess_file_type(file_name) == "pdb"):
                    file_name = os.path.join(local_cache, file_name)
                    print("Reading from cache directory:", file=log)
                    print(" " + file_name, file=log)
                    return smart_open.for_reading(file_name)

        # Try a local mirror for PDB and X-ray data files first, if one is
        # configured.  Candidates are listed in lookup order.
        mirror_files = []
        if data_type == "pdb":
            if format in ["pdb", "cif_or_pdb"]:
                mirror_files.append(("PDB_MIRROR_PDB", "pdb%s.ent.gz" % id))
            if format in ["cif", "cif_or_pdb"]:
                mirror_files.append(("PDB_MIRROR_MMCIF", "%s.cif.gz" % id))
        elif data_type == "xray":
            mirror_files.append(
                ("PDB_MIRROR_STRUCTURE_FACTORS", "r%ssf.ent.gz" % id))
        for env_name, base_name in mirror_files:
            mirror_root = os.environ.get(env_name)
            if mirror_root is None:
                continue
            # Mirror files live in a sub-directory named after the two
            # middle characters of the ID.
            file_name = os.path.join(mirror_root, id[1:3], base_name)
            if os.path.isfile(file_name):
                print("Reading from local mirror:", file=log)
                print(" " + file_name, file=log)
                return smart_open.for_reading(file_name)

    # No mirror found (or out of date), default to HTTP download.
    url = None
    compressed = False
    if mirror == "rcsb":
        url_base = 'https://files.rcsb.org/download/'
        pdb_ext = ".pdb"
        sf_prefix = ""
        sf_ext = "-sf.cif"
    elif mirror == "pdbe":
        url_base = "https://www.ebi.ac.uk/pdbe-srv/view/files/"
        pdb_ext = ".ent"
        sf_prefix = "r"
        sf_ext = "sf.ent"
    elif mirror == "pdbj":
        # PDBj URLs do not follow the base + id + extension pattern, so the
        # complete URL is built here.
        url_base = "ftp://ftp.pdbj.org/pub/pdb/data/structures/divided/"
        if data_type == "pdb":
            compressed = True
            if format == "pdb":
                url = url_base + "pdb/%s/pdb%s.ent.gz" % (id[1:3], id)
            elif format in ["cif", "cif_or_pdb"]:
                url = url_base + "mmCIF/%s/%s.cif.gz" % (id[1:3], id)
        elif data_type == "xray":
            compressed = True
            url = url_base + "structure_factors/%s/r%ssf.ent.gz" % (
                id[1:3], id)
        elif data_type in ["fasta", "seq"]:
            url = "https://pdbj.org/rest/downloadPDBfile?format=fasta&id=%s" \
                % id
        if (url is None) and (data_type != "fasta"):
            raise Sorry(
                "Can't determine PDBj download URL for this data/format " +
                "combination.")
    elif mirror == "pdb-redo":
        url_base = "https://pdb-redo.eu/db/"
        pdb_ext = "_final.pdb"
        cif_ext = "_final.cif"
        sf_prefix = ""
        sf_ext = "_final.mtz"
        if data_type == 'pdb':
            if format == 'pdb':
                url = url_base + "{id}/{id}{format}".format(
                    id=id, format=pdb_ext)
            elif format in ['cif', 'cif_or_pdb']:
                url = url_base + "{id}/{id}{format}".format(
                    id=id, format=cif_ext)
        elif data_type == 'xray':
            url = url_base + "{id}/{id}{format}".format(id=id, format=sf_ext)

    # Fill in the URL for the mirrors that use the generic pattern, and pick
    # the wording used if the server reports the entry as missing.
    if data_type in ["fasta", "seq"]:
        content = "sequence"
        if url is None:
            # TODO PDBe equivalent doesn't exist?
            url = "https://www.rcsb.org/fasta/entry/%s" % id
    elif data_type == "xray":
        content = "structure factors"
        if url is None:
            url = url_base + sf_prefix + id + sf_ext
    else:
        content = "model"
        if url is None:
            if format == "pdb":
                url = url_base + id + pdb_ext
            elif format == "cif_or_pdb":
                url = url_base + id + "." + "cif"
            else:
                url = url_base + id + "." + format

    try:
        data = libtbx.utils.urlopen(url)
    except HTTPError as e:
        if e.getcode() == 404:
            raise RuntimeError(
                "Couldn't download %s for %s." % (content, id))
        raise

    if compressed:
        try:
            import gzip
        except ImportError:
            raise Sorry(
                "gzip module not available - please use an uncompressed " +
                "source of PDB data.")
        from io import BytesIO
        # The response object cannot be handed to GzipFile directly, so the
        # payload is read into memory first.  It is binary data, hence
        # BytesIO rather than a text buffer.
        return gzip.GzipFile(fileobj=BytesIO(data.read()))
    return data
def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None,
          force_download=False, local_cache=None):
    """
    Locate and open a data file for the specified PDB ID and format, either in
    a local mirror or online.

    Unless ``force_download`` is true, local copies are preferred: first the
    ``local_cache`` directory (models only), then the directory tree named by
    the PDB_MIRROR_PDB, PDB_MIRROR_MMCIF or PDB_MIRROR_STRUCTURE_FACTORS
    environment variable.

    :param id: 4-character PDB ID (e.g. '1hbb'); checked with
      validate_pdb_id() and lower-cased before use
    :param data_type: type of content to download: pdb, xray, or fasta
      ("seq" is handled exactly like "fasta")
    :param format: format of data: cif, pdb, or xml
    :param mirror: remote site to use, either rcsb, pdbe, pdbj or pdb-redo
    :param log: file-like object that receives progress messages; defaults
      to null_out()
    :param force_download: when true, skip the local cache and local mirrors
    :param local_cache: directory searched for an already downloaded model;
      Auto selects the current working directory, None disables the search
    :returns: a filehandle-like object (with read() method)
    :raises RuntimeError: if the server answers a sequence download with
      HTTP 404
    :raises Sorry: if no PDBj URL exists for the data_type/format combination
    """
    assert data_type in ["pdb", "xray", "fasta", "seq"]
    assert format in ["cif", "pdb", "xml"]
    assert mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]
    validate_pdb_id(id)
    if log is None:
        log = null_out()
    id = id.lower()

    if not force_download:
        if (local_cache is not None) and (data_type == "pdb"):
            from iotbx.file_reader import guess_file_type
            if local_cache is Auto:
                local_cache = os.getcwd()
            for file_name in os.listdir(local_cache):
                if len(file_name) <= 4:
                    continue
                # Strip an optional leading "pdb" so that both "1abc..." and
                # "pdb1abc..." file names are matched against the ID.
                file_id = re.sub("^pdb", "", file_name)[0:4]
                if (file_id.lower() == id) and \
                        (guess_file_type(file_name) == "pdb"):
                    file_name = os.path.join(local_cache, file_name)
                    print("Reading from cache directory:", file=log)
                    print(" " + file_name, file=log)
                    return smart_open.for_reading(file_name)

        # Try a local mirror for PDB and X-ray data files first, if one is
        # configured.
        mirror_files = []
        if data_type == "pdb":
            if format == "pdb":
                mirror_files.append(("PDB_MIRROR_PDB", "pdb%s.ent.gz" % id))
            elif format == "cif":
                mirror_files.append(("PDB_MIRROR_MMCIF", "%s.cif.gz" % id))
        elif data_type == "xray":
            mirror_files.append(
                ("PDB_MIRROR_STRUCTURE_FACTORS", "r%ssf.ent.gz" % id))
        for env_name, base_name in mirror_files:
            mirror_root = os.environ.get(env_name)
            if mirror_root is None:
                continue
            # Mirror files live in a sub-directory named after the two
            # middle characters of the ID.
            file_name = os.path.join(mirror_root, id[1:3], base_name)
            if os.path.isfile(file_name):
                print("Reading from local mirror:", file=log)
                print(" " + file_name, file=log)
                return smart_open.for_reading(file_name)

    # No mirror found (or out of date), default to HTTP download.
    url = None
    compressed = False
    if mirror == "rcsb":
        url_base = 'https://files.rcsb.org/download/'
        pdb_ext = ".pdb"
        sf_prefix = ""
        sf_ext = "-sf.cif"
    elif mirror == "pdbe":
        url_base = "https://www.ebi.ac.uk/pdbe-srv/view/files/"
        pdb_ext = ".ent"
        sf_prefix = "r"
        sf_ext = "sf.ent"
    elif mirror == "pdbj":
        # PDBj URLs do not follow the base + id + extension pattern, so the
        # complete URL is built here.
        url_base = "ftp://ftp.pdbj.org/pub/pdb/data/structures/divided/"
        if data_type == "pdb":
            compressed = True
            if format == "pdb":
                url = url_base + "pdb/%s/pdb%s.ent.gz" % (id[1:3], id)
            elif format == "cif":
                url = url_base + "mmCIF/%s/%s.cif.gz" % (id[1:3], id)
        elif data_type == "xray":
            compressed = True
            url = url_base + "structure_factors/%s/r%ssf.ent.gz" % (
                id[1:3], id)
        elif data_type in ["fasta", "seq"]:
            url = "https://pdbj.org/rest/downloadPDBfile?format=fasta&id=%s" \
                % id
        if (url is None) and (data_type != "fasta"):
            raise Sorry(
                "Can't determine PDBj download URL for this data/format " +
                "combination.")
    elif mirror == "pdb-redo":
        url_base = "https://pdb-redo.eu/db/"
        pdb_ext = "_final.pdb"
        cif_ext = "_final.cif"
        sf_prefix = ""
        sf_ext = "_final.mtz"
        if data_type == 'pdb':
            if format == 'pdb':
                url = url_base + "{id}/{id}{format}".format(
                    id=id, format=pdb_ext)
            elif format == 'cif':
                url = url_base + "{id}/{id}{format}".format(
                    id=id, format=cif_ext)
        elif data_type == 'xray':
            url = url_base + "{id}/{id}{format}".format(id=id, format=sf_ext)

    if data_type in ["fasta", "seq"]:
        if url is None:
            # TODO PDBe equivalent doesn't exist?
            url = "https://www.rcsb.org/fasta/entry/%s" % id
        try:
            data = libtbx.utils.urlopen(url)
        except urllib2.HTTPError as e:
            if e.getcode() == 404:
                raise RuntimeError("Couldn't download sequence for %s." % id)
            raise
        # Sequence downloads are never marked as compressed above, so the
        # response can be handed back as it is.
        return data
    # NOTE(review): the visible part of this function ends here, so model and
    # structure-factor downloads are not handled in this view - confirm
    # against the rest of the file before relying on those data types.