def execDSSP(pdb, outputname=None, outputdir=None, stderr=True):
    """Execute DSSP for given *pdb*.  *pdb* can be a PDB identifier or a PDB
    file path.  If *pdb* is a compressed file, it will be decompressed using
    Python :mod:`gzip` library.  When no *outputname* is given, output name
    will be :file:`pdb.dssp`.  :file:`.dssp` extension will be appended
    automatically to *outputname*.  If :file:`outputdir` is given, DSSP
    output and uncompressed PDB file will be written into this folder.
    Upon successful execution of :command:`dssp pdb > out` command, output
    filename is returned.  On Linux platforms, when *stderr* is false,
    standard error messages are suppressed, i.e.
    ``dssp pdb > outputname 2> /dev/null``.

    For more information on DSSP see http://swift.cmbi.ru.nl/gv/dssp/.
    If you benefited from DSSP, please consider citing [WK83]_.

    .. [WK83] Kabsch W, Sander C. Dictionary of protein secondary structure:
       pattern recognition of hydrogen-bonded and geometrical features.
       *Biopolymers* **1983** 22:2577-2637."""

    import subprocess

    # Prefer the modern 'mkdssp' executable name, fall back to 'dssp'.
    dssp = which('mkdssp')
    if dssp is None:
        dssp = which('dssp')
    if dssp is None:
        raise EnvironmentError('command not found: dssp executable is not '
                               'found in one of system paths')

    assert outputname is None or isinstance(outputname, str), \
        'outputname must be a string'
    assert outputdir is None or isinstance(outputdir, str), \
        'outputdir must be a string'

    # Treat *pdb* as an identifier when it is not an existing file.
    if not os.path.isfile(pdb):
        pdb = fetchPDB(pdb, compressed=False)
    if pdb is None:
        raise ValueError('pdb is not a valid PDB identifier or filename')

    # DSSP cannot read gzipped input; decompress next to the output folder.
    if os.path.splitext(pdb)[1] == '.gz':
        if outputdir is None:
            pdb = gunzip(pdb, os.path.splitext(pdb)[0])
        else:
            pdb = gunzip(pdb, os.path.join(outputdir,
                         os.path.split(os.path.splitext(pdb)[0])[1]))

    if outputdir is None:
        outputdir = '.'
    if outputname is None:
        out = os.path.join(outputdir,
                           os.path.splitext(os.path.split(pdb)[1])[0] +
                           '.dssp')
    else:
        out = os.path.join(outputdir, outputname + '.dssp')

    # BUGFIX: run DSSP through subprocess with an argument list instead of
    # os.system with an unquoted formatted shell string, so that executable
    # and file paths containing spaces or shell metacharacters work.  The
    # stdout redirection to *out* mirrors the original `dssp pdb > out`.
    with open(out, 'wb') as out_file:
        if not stderr and PLATFORM != 'Windows':
            # Suppress stderr, matching `2> /dev/null` from the docstring.
            with open(os.devnull, 'wb') as devnull:
                status = subprocess.call([dssp, pdb], stdout=out_file,
                                         stderr=devnull)
        else:
            status = subprocess.call([dssp, pdb], stdout=out_file)

    # Return the output path only on success; implicitly None otherwise,
    # matching the original contract.
    if status == 0:
        return out
def execSTRIDE(pdb, outputname=None, outputdir=None):
    """Execute STRIDE program for given *pdb*.  *pdb* can be an identifier
    or a PDB file path.  If *pdb* is a compressed file, it will be
    decompressed using Python :mod:`gzip` library.  When no *outputname* is
    given, output name will be :file:`pdb.stride`.  :file:`.stride`
    extension will be appended automatically to *outputname*.  If
    :file:`outputdir` is given, STRIDE output and uncompressed PDB file
    will be written into this folder.  Upon successful execution of
    :command:`stride pdb > out` command, output filename is returned.

    For more information on STRIDE see http://webclu.bio.wzw.tum.de/stride/.
    If you benefited from STRIDE, please consider citing [DF95]_.

    .. [DF95] Frishman D, Argos P. Knowledge-Based Protein Secondary
       Structure Assignment. *Proteins* **1995** 23:566-579."""

    import subprocess

    stride = which('stride')
    if stride is None:
        raise EnvironmentError('command not found: stride executable is not '
                               'found in one of system paths')

    assert outputname is None or isinstance(outputname, str), \
        'outputname must be a string'
    assert outputdir is None or isinstance(outputdir, str), \
        'outputdir must be a string'

    # Treat *pdb* as an identifier when it is not an existing file.
    if not os.path.isfile(pdb):
        pdb = fetchPDB(pdb, compressed=False)
    if pdb is None:
        raise ValueError('pdb is not a valid PDB identifier or filename')

    # STRIDE cannot read gzipped input; decompress next to the output folder.
    if os.path.splitext(pdb)[1] == '.gz':
        if outputdir is None:
            pdb = gunzip(pdb, os.path.splitext(pdb)[0])
        else:
            pdb = gunzip(pdb, os.path.join(outputdir,
                         os.path.split(os.path.splitext(pdb)[0])[1]))

    if outputdir is None:
        outputdir = '.'
    if outputname is None:
        out = os.path.join(outputdir,
                           os.path.splitext(os.path.split(pdb)[1])[0] +
                           '.stride')
    else:
        out = os.path.join(outputdir, outputname + '.stride')

    # BUGFIX: run STRIDE through subprocess with an argument list instead of
    # os.system with an unquoted formatted shell string, so that executable
    # and file paths containing spaces or shell metacharacters work.  The
    # stdout redirection to *out* mirrors the original `stride pdb > out`.
    with open(out, 'wb') as out_file:
        status = subprocess.call([stride, pdb], stdout=out_file)

    # Return the output path only on success; implicitly None otherwise,
    # matching the original contract.
    if status == 0:
        return out
def execSTRIDE(pdb, outputname=None, outputdir=None):
    """Execute STRIDE program for given *pdb*.  *pdb* can be an identifier
    or a PDB file path.  If *pdb* is a compressed file, it will be
    decompressed using Python :mod:`gzip` library.  When no *outputname* is
    given, output name will be :file:`pdb.stride`.  :file:`.stride`
    extension will be appended automatically to *outputname*.  If
    :file:`outputdir` is given, STRIDE output and uncompressed PDB file
    will be written into this folder.  Upon successful execution of
    :command:`stride pdb > out` command, output filename is returned.

    For more information on STRIDE see http://webclu.bio.wzw.tum.de/stride/.
    If you benefited from STRIDE, please consider citing [DF95]_.

    .. [DF95] Frishman D, Argos P. Knowledge-Based Protein Secondary
       Structure Assignment. *Proteins* **1995** 23:566-579."""

    stride = which('stride')
    if stride is None:
        raise EnvironmentError('command not found: stride executable is not '
                               'found in one of system paths')

    assert outputname is None or isinstance(outputname, str), \
        'outputname must be a string'
    assert outputdir is None or isinstance(outputdir, str), \
        'outputdir must be a string'

    # Resolve an identifier to a local file when needed.
    if not os.path.isfile(pdb):
        pdb = fetchPDB(pdb, compressed=False)
    if pdb is None:
        raise ValueError('pdb is not a valid PDB identifier or filename')

    # Decompress gzipped input before handing it to STRIDE.
    root, ext = os.path.splitext(pdb)
    if ext == '.gz':
        if outputdir is None:
            pdb = gunzip(pdb, root)
        else:
            pdb = gunzip(pdb,
                         os.path.join(outputdir, os.path.split(root)[1]))

    # Build the output path: <outputdir>/<basename>.stride
    target_dir = '.' if outputdir is None else outputdir
    if outputname is None:
        basename = os.path.splitext(os.path.split(pdb)[1])[0]
    else:
        basename = outputname
    out = os.path.join(target_dir, basename + '.stride')

    status = os.system('{0} {1} > {2}'.format(stride, pdb, out))
    if status == 0:
        return out
def fetchPDBfromMirror(*pdb, **kwargs):
    """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified
    *pdb* identifier(s).  If a *folder* is specified, files will be copied
    into this folder.  If *compressed* is **False**, files will be
    decompressed.  *format* argument can be used to get
    `PDBML <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_
    files: ``format='cif'`` will fetch an mmCIF file, and ``format='xml'``
    will fetch a PDBML file.  If PDBML header file is desired,
    ``noatom=True`` argument will do the job."""

    mirror = pathPDBMirror()
    if mirror is None:
        raise IOError('no mirror path is set')
    # pathPDBMirror may return either a path or a (path, format) pair;
    # the unpacking attempt distinguishes the two.
    try:
        mirror, mirror_format = mirror
    except ValueError:
        mirror_format = None

    format = str(kwargs.pop('format', 'pdb')).lower()

    # check=False lets callers pass pre-validated identifiers.
    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    # Map the requested format onto the wwPDB "divided" mirror layout:
    # subdirectory, remote filename suffix/prefix, and local extension.
    if format == 'pdb':
        ftp_divided = 'data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if bool(kwargs.pop('noatom', False)):
            ftp_divided = 'data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    else:
        if format:
            raise ValueError('{0} is not a recognized format'
                             .format(repr(format)))
        else:
            raise ValueError('please specify a valid format')

    if mirror_format:
        # Single-format mirrors keep files flat at the mirror root.
        if mirror_format.lower() != format:
            raise IOError('mirror contains only ' + mirror_format + ' files')
        ftp_divided = ''
    else:
        # Convert the URL-style path into an OS-specific one.
        ftp_divided = join(*ftp_divided.split('/'))
    folder = kwargs.get('folder')
    compressed = kwargs.get('compressed', True)
    filenames = []
    append = filenames.append
    success = 0
    failure = 0
    for pdb in identifiers:
        if pdb is None:
            # Keep positional correspondence with the input identifiers.
            append(None)
            continue
        # Divided layout buckets entries by the middle two characters
        # of the identifier (e.g. 1p38 -> p3/pdb1p38.ent.gz).
        fn = join(mirror, ftp_divided, pdb[1:3],
                  ftp_prefix + pdb + ftp_pdbext)
        if isfile(fn):
            # Only copy/decompress when the caller asked for a folder
            # or an uncompressed file; otherwise return the mirror path.
            if folder or not compressed:
                if compressed:
                    fn = copyFile(fn, join(folder or '.',
                                           pdb + extension + '.gz'))
                else:
                    fn = gunzip(fn, join(folder or '.', pdb + extension))
            append(normpath(fn))
            success += 1
        else:
            append(None)
            failure += 1

    if len(identifiers) == 1:
        # Single-identifier calls return a single path (or None).
        fn = filenames[0]
        if kwargs.get('report', True):
            if success:
                LOGGER.debug('PDB file is found in the local mirror ({0}).'
                             .format(sympath(fn)))
        return fn
    else:
        if kwargs.get('report', True):
            LOGGER.debug('PDB files found in the local mirror ({0} found, '
                         '{1} missed).'.format(success, failure))
        return filenames
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of AtomGroups containing sections of chains that
    correspond to a particular PFAM domain family. These are defined by
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided.  This query is also used for label refinement of the
        Pfam domain MSA.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
    :type end: int
    """
    # BUGFIX: the default for *data* was a mutable ``[]``, which accumulated
    # entries across calls; ``None`` now means "no output list requested".

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        # Select the Pfam accession whose first reported domain start
        # (or end) is numerically closest to the requested residue.
        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(
                    int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(
                abs(start_diff) == min(abs(start_diff)))[0][0]]
        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(
                abs(end_diff) == min(abs(end_diff)))[0][0]]
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    # Download the Pfam-to-PDB mapping table (pdbmap) from the EBI FTP site.
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    # Column layout of the tab-separated pdbmap file.
    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except:
            # BUGFIX: the key was previously misspelled as 'PBD_ID', which
            # raised a KeyError inside this exception handler.
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PDB_ID']))
            continue

        # Search the UniProt cross-references for this PDB entry to find
        # the UniProt residue range covered by the chain of interest.
        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            # BUGFIX: use a distinct name for the per-entry header so the
            # boolean *header* flag is not shadowed inside the loop.
            entry_header = headers[i]
            chain_accessions = [
                dbref.accession
                for dbref in entry_header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(
                        np.array(chain_accessions) ==
                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError(
                        'There is no accession for a chain in the Header')
            except:
                LOGGER.warn('Could not map domains in {0}'.format(
                    data_dict['PDB_ID'] + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = entry_header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(
                data_dict['chain'])).getResnums()[0]
            # Offset between the structure's first residue number and the
            # DBREF start; used to convert UniProt numbering to resindices.
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(
                ag.getResnums() == right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    # Drop entries that could not be mapped, from the back so earlier
    # indices stay valid.
    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    elif data is not None:
        LOGGER.warn('data should be a list in order to get output')

    return results
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Return a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
         ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
         or ``'rp75'`` where rp stands for representative proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``,
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or
        ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""

    # The server echoes the canonical accession for the given ID/accession
    # on the first line of this page; use it for the later URLs.
    url = 'http://pfam.sanger.ac.uk/family/acc?id=' + acc
    handle = openURL(url)
    orig_acc = acc
    acc = handle.readline().strip()
    if PY3K:
        acc = acc.decode()
    # url_flag is True when the URL serves gzipped data.
    url_flag = False

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'
                         .format(repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi or'
                         ' metagenomics')
    if alignment == 'ncbi' or alignment == 'metagenomics':
        # These alignments are only available as gzipped Stockholm files.
        url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
               alignment + '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        if not kwargs:
            # No formatting options requested: take the gzipped default.
            url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
                   alignment + '/gzipped')
            url_flag = True
            extension = '.sth'
        else:
            # Validate each formatting option against the supported values
            # before building the format-specific download URL.
            align_format = kwargs.get('format', 'selex').lower()

            if align_format not in FORMAT_OPTIONS['format']:
                raise ValueError('alignment format must be of type selex'
                                 ' stockholm or fasta. MSF not supported')

            if align_format == SELEX:
                align_format, extension = 'pfam', '.slx'
            elif align_format == FASTA:
                extension = '.fasta'
            else:
                extension = '.sth'

            gaps = str(kwargs.get('gaps', 'dashes')).lower()
            if gaps not in FORMAT_OPTIONS['gaps']:
                raise ValueError('gaps must be of type mixed, dots, dashes, '
                                 'or None')

            inserts = kwargs.get('inserts', 'upper').lower()
            if(inserts not in FORMAT_OPTIONS['inserts']):
                raise ValueError('inserts must be of type lower or upper')

            order = kwargs.get('order', 'tree').lower()
            if order not in FORMAT_OPTIONS['order']:
                raise ValueError('order must be of type tree or alphabetical')

            url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
                   alignment + '/format?format=' + align_format +
                   '&alnType=' + alignment + '&order=' + order[0] +
                   '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')

    response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    outname = kwargs.get('outname', None)
    if not outname:
        outname = orig_acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)

    # Four cases: (compressed?, server data gzipped?) decide whether the
    # payload is written verbatim, recompressed, or decompressed.
    if compressed:
        filepath = filepath + '.gz'
        if url_flag:
            # Already gzipped on the wire: write the bytes verbatim.
            f_out = open(filepath, 'wb')
        else:
            # Plain payload: openFile compresses based on the .gz suffix.
            f_out = openFile(filepath, 'wb')
        f_out.write(response.read())
        f_out.close()
    else:
        if url_flag:
            # Gzipped payload but an uncompressed file was requested.
            gunzip(response.read(), filepath)
        else:
            with open(filepath, 'wb') as f_out:
                f_out.write(response.read())

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'
                .format(orig_acc, filepath))

    return filepath
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*."""

    # check=False lets callers pass pre-validated identifiers.
    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))

    extension = '.pdb'
    local_folder = pathPDBFolder()
    # Two closures drive the per-identifier handling:
    #   getPath(pdb)        -> where the downloaded bytes are written
    #   second(filename, p) -> post-processing (copy and/or decompress)
    if local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            # Divided layout buckets entries by the identifier's middle
            # two characters (e.g. 1p38 -> p3/pdb1p38.pdb.gz).
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            # NOTE(review): this decompresses getPath(pdb) onto itself,
            # which presumes the downloaded payload at that (.pdb-named)
            # path is gzipped — verify against the wwPDB HTTP endpoint.
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    success = 0
    failure = 0
    filenames = []
    for pdb in identifiers:
        if pdb is None:
            # Keep positional correspondence with the input identifiers.
            filenames.append(None)
            continue
        try:
            handle = openURL(getURL(pdb))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err)))
            failure += 1
            filenames.append(None)
        else:
            data = handle.read()
            if len(data):
                filename = getPath(pdb)

                with open(filename, 'w+b') as pdbfile:
                    pdbfile.write(data)

                filename = normpath(relpath(second(filename, pdb)))
                LOGGER.debug('{0} downloaded ({1})'
                             .format(pdb, sympath(filename)))
                success += 1
                filenames.append(filename)
            else:
                # An empty response body is treated as a failed download.
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(pdb))
                failure += 1
                filenames.append(None)

    LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
def testBuffer(self):
    """Decompressing an in-memory gzip buffer should reproduce the
    original bytes."""
    # BUGFIX: use a context manager so the file handle is closed
    # deterministically (the original relied on garbage collection).
    with open(self.gzfn, 'rb') as gzfile:
        buff = gzfile.read()
    text = gunzip(buff)
    self.assertEqual(text, self.bytes)
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).

    Files will be sought in user specified *folder* or current working
    director, and then in local PDB folder and mirror, if they are
    available.  If *copy* is set **True**, files will be copied into
    *folder*.  If *compressed* is **False**, all files will be decompressed.
    See :func:`pathPDBFolder` and :func:`pathPDBMirror` for managing local
    resources, :func:`.fetchPDBviaFTP` and :func:`.fetchPDBviaFTP` for
    downloading files from PDB servers."""

    # Accept a single list argument as the list of identifiers.
    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    # Non-PDB formats are delegated entirely to the FTP fetcher.
    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    # filenames holds one slot per identifier; not_found collects
    # (index, identifier) pairs still to be resolved by later stages.
    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug('PDB file is found in working directory ({0}).'
                             .format(sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    # When decompressed files are requested, also look for compressed
    # copies in *folder* and decompress them in place.
    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))
        if not not_found:
            return filenames[0] if len(identifiers) == 1 else filenames

    # Next stage: the user's local PDB folder (flat or divided layout).
    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')
            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        # FIXME(review): 'pdb.gz' looks like a typo for
                        # '.pdb.gz' — this produces names like '1p38pdb.gz'.
                        fn = copyFile(fn, join(folder, pdb + 'pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

        if not not_found:
            if len(identifiers) == 1:
                fn = filenames[0]
                # Abbreviate long paths for the log message.
                items = fn.split(pathsep)
                if len(items) > 5:
                    fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
                else:
                    fndisp = relpath(fn)
                LOGGER.debug('PDB file is found in the local folder ({0}).'
                             .format(fndisp))
                return fn
            else:
                return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    # Next stage: local mirror of the wwPDB archive.
    downloads = [pdb for i, pdb in not_found]
    fns = None
    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        # No mirror configured; fall through to network download.
        pass
    else:
        if len(downloads) == 1:
            fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    # Final stage: download from the wwPDB servers; *tp* forces the
    # transport protocol, otherwise FTP is tried first with HTTP fallback.
    if fns:
        downloads = [pdb for i, pdb in not_found]
    fns = None
    tp = kwargs.pop('tp', None)
    if tp is not None:
        tp = tp.lower()

    if tp == 'http':
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP failed '
                        '({0}).'.format(str(err)))
    elif tp == 'ftp':
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via FTP failed '
                        '({0}).'.format(str(err)))
    else:
        tryHTTP = False
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            tryHTTP = True
        # Retry over HTTP anything FTP did not deliver.
        if fns is None or isinstance(fns, list) and None in fns:
            tryHTTP = True
        elif isinstance(fns, list):
            downloads = [not_found[i][1] for i in range(len(fns))
                         if fns[i] is None]
            if len(downloads) > 0:
                tryHTTP = True
        if tryHTTP:
            LOGGER.info('Downloading PDB files via FTP failed, '
                        'trying HTTP.')
            try:
                fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
            except Exception as err:
                LOGGER.warn('Downloading PDB files via HTTP also failed '
                            '({0}).'.format(str(err)))

    if len(downloads) == 1:
        fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
def fetchPfamMSA(acc, alignment="full", compressed=False, **kwargs):
    """Return a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
        ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
        or ``'rp75'`` where rp stands for representative proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``,
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or
        ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""

    # Resolve the canonical accession: the server echoes it on the first
    # line of this page for a given ID or accession.
    orig_acc = acc
    handle = openURL("http://pfam.sanger.ac.uk/family/acc?id=" + acc)
    acc = handle.readline().strip()
    if PY3K:
        acc = acc.decode()

    if not re.search("(?<=PF)[0-9]{5}$", acc):
        raise ValueError("{0} is not a valid Pfam ID or Accession Code".format(repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError("alignment must be one of full, seed, ncbi or"
                         " metagenomics")

    # gzipped_url is True when the chosen URL serves gzip-compressed data:
    # either an alignment type that is only available gzipped, or the
    # default download when no formatting options were supplied.
    gzipped_url = alignment in ("ncbi", "metagenomics") or not kwargs
    if gzipped_url:
        url = ("http://pfam.sanger.ac.uk/family/" + acc + "/alignment/"
               + alignment + "/gzipped")
        extension = ".sth"
    else:
        # Validate each formatting option before building the URL.
        align_format = kwargs.get("format", "selex").lower()
        if align_format not in FORMAT_OPTIONS["format"]:
            raise ValueError("alignment format must be of type selex"
                             " stockholm or fasta. MSF not supported")

        if align_format == SELEX:
            align_format, extension = "pfam", ".slx"
        elif align_format == FASTA:
            extension = ".fasta"
        else:
            extension = ".sth"

        gaps = str(kwargs.get("gaps", "dashes")).lower()
        if gaps not in FORMAT_OPTIONS["gaps"]:
            raise ValueError("gaps must be of type mixed, dots, dashes, "
                             "or None")

        inserts = kwargs.get("inserts", "upper").lower()
        if inserts not in FORMAT_OPTIONS["inserts"]:
            raise ValueError("inserts must be of type lower or upper")

        order = kwargs.get("order", "tree").lower()
        if order not in FORMAT_OPTIONS["order"]:
            raise ValueError("order must be of type tree or alphabetical")

        url = ("http://pfam.sanger.ac.uk/family/" + acc + "/alignment/"
               + alignment + "/format?format=" + align_format
               + "&alnType=" + alignment + "&order=" + order[0]
               + "&case=" + inserts[0] + "&gaps=" + gaps + "&download=1")

    response = openURL(url, timeout=int(kwargs.get("timeout", 60)))

    outname = kwargs.get("outname", None) or orig_acc
    folder = str(kwargs.get("folder", "."))
    filepath = join(makePath(folder), outname + "_" + alignment + extension)

    if compressed:
        filepath = filepath + ".gz"
        # Gzipped payloads are written verbatim; plain payloads go through
        # openFile, which compresses based on the .gz suffix.
        opener = open if gzipped_url else openFile
        f_out = opener(filepath, "wb")
        f_out.write(response.read())
        f_out.close()
    elif gzipped_url:
        # Gzipped payload but an uncompressed file was requested.
        gunzip(response.read(), filepath)
    else:
        with open(filepath, "wb") as f_out:
            f_out.write(response.read())

    filepath = relpath(filepath)
    LOGGER.info("Pfam MSA for {0} is written as {1}.".format(orig_acc, filepath))

    return filepath
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to
    a particular database collection into a dictionary
    for ease of querying.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is goa_ and the database name in lower case
        and .gaf.gz
    :type filename: str
    """
    import Bio.UniProt.GOA as GOA

    if not isinstance(database, str):
        raise TypeError('database should be a string')
    database = database.upper()

    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())
    # If the file doesn't already exist, download it
    gaf = os.path.join(data_folder, filename)
    if not (os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))
        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        ftp.login()
        try:
            ftp.cwd('pub/databases/GO/goa')
            ftp.cwd(database)
            ftp.retrbinary('RETR {}.gz'.format(filename), data_stream.write)
        except Exception:
            raise ValueError('Cannot find the requested GO association file')
        finally:
            # BUGFIX: always log out of the FTP server, even when the
            # transfer fails (previously the connection was leaked).
            try:
                ftp.quit()
            except Exception:
                ftp.close()

        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        # BUGFIX: write to *gaf* (inside *data_folder*) rather than to bare
        # *filename* in the current working directory — the existence check
        # above, and the read below, both use *gaf*.
        with open(gaf, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    with open(gaf, 'rt') as gaf_fp:
        funcs = defaultdict(list)  # Initialise the dictionary of functions
        # Iterate on each function using Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            # Group annotation entries by their database object identifier
            # (renamed from 'id' to avoid shadowing the builtin).
            obj_id = entry.pop('DB_Object_ID')
            funcs[obj_id].append(entry)

    return funcs
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to
    a particular database collection into a dictionary
    for ease of querying.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is goa_ and the database name in lower case
        and .gaf.gz
    :type filename: str
    """
    import Bio.UniProt.GOA as GOA

    if not isinstance(database, str):
        raise TypeError('database should be a string')
    database = database.upper()

    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())
    # If the file doesn't already exist, download it
    gaf = os.path.join(data_folder, filename)
    if not(os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))
        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        ftp.login()
        try:
            ftp.cwd('pub/databases/GO/goa')
            ftp.cwd(database)
            # NOTE(review): '.gz' is appended to *filename*, which for
            # UNIPROT already ends in '.gaf.gz' — presumably yielding
            # '...gaf.gz.gz' on the server; verify against the EBI FTP.
            ftp.retrbinary('RETR {}.gz'.format(filename), data_stream.write)
        except:
            # NOTE(review): a failed transfer raises before ftp.quit()
            # below, leaving the FTP connection open.
            raise ValueError('Cannot find the requested GO association file')

        # Logout from FTP server
        ftp.quit()
        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        # NOTE(review): this writes/reads bare *filename* in the current
        # working directory while the existence check above uses *gaf*
        # inside *data_folder*; the two only agree when data_folder is cwd.
        with open(filename, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    with open(filename, 'rt') as gaf_fp:
        funcs = defaultdict(list)  # Initialise the dictionary of functions
        # Iterate on each function using Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            # Group annotation entries by their database object identifier.
            id = entry.pop('DB_Object_ID')
            funcs[id].append(entry)

    return funcs
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of AtomGroups containing sections of chains that
    correspond to a particular PFAM domain family. These are defined by
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be provided.
        This query is also used for label refinement of the Pfam domain MSA.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
    :type end: int
    """
    if data is None:
        # Fresh scratch list per call; the previous mutable default
        # (data=[]) was shared across calls and accumulated entries.
        data = []

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        # Resolve the query to a Pfam accession, picking the domain whose
        # alignment start (or end) is closest to the requested residue.
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for key in pfam_matches:
                start_diff.append(
                    int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.argmin(abs(start_diff))]
        elif isinstance(end, Integral):
            end_diff = []
            for key in pfam_matches:
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.argmin(abs(end_diff))]
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    # Fetch the Pfam->PDB mapping table from the EBI FTP server.
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            # Fixed KeyError typo: this message previously read
            # data_dict['PBD_ID'] and itself raised before warning.
            LOGGER.warn('No Uniprot record found for {0}'
                        .format(data_dict['PDB_ID']))
            continue

        # Find the UniProt dbReference entry matching this PDB chain to
        # obtain the residue range covered by the structure.
        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except (KeyError, TypeError):
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            # NOTE: use a local name here instead of clobbering the *header*
            # flag (the original reassigned `header = headers[i]`, making the
            # later `if header:` cleanup check unconditionally truthy).
            hd = headers[i]
            chain_accessions = [dbref.accession
                                for dbref in hd[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) ==
                                          data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain in '
                                     'the Header')
            except Exception:
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID'] + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = hd[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(data_dict['chain'])
                                   ).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(ag.getResnums() ==
                                           right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            # Map the Pfam (UniProt numbering) range onto residue indices.
            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    # Drop entries for which no domain mapping could be established.
    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return results
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).

    Files will be sought in user specified *folder* or current working
    directory, and then in local PDB folder and mirror, if they are
    available.  If *copy* is set **True**, files will be copied into
    *folder*.  If *compressed* is **False**, all files will be decompressed.
    See :func:`pathPDBFolder` and :func:`pathPDBMirror` for managing local
    resources, :func:`.fetchPDBviaFTP` and :func:`.fetchPDBviaHTTP` for
    downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    # Non-PDB formats are handled directly by the FTP fetcher.
    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug('PDB file is found in working directory ({0}).'
                             .format(sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    # Decompressed files were requested: retry the folder scan against
    # compressed files and gunzip any hits in place.
    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))
        if not not_found:
            return filenames[0] if len(identifiers) == 1 else filenames

    # Next, look in the user's local PDB folder, if one is configured.
    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')
            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        # Fixed filename: was pdb + 'pdb.gz' (missing dot),
                        # which produced names like '1abcpdb.gz'.
                        fn = copyFile(fn, join(folder, pdb + '.pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

        if not not_found:
            if len(identifiers) == 1:
                fn = filenames[0]
                if kwargs.get('report', True):
                    # Abbreviate very long paths for the log message.
                    items = fn.split(pathsep)
                    if len(items) > 5:
                        fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
                    else:
                        fndisp = relpath(fn)
                    LOGGER.debug('PDB file is found in the local folder '
                                 '({0}).'.format(fndisp))
                return fn
            else:
                return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    # Try the local mirror before going to the network.
    downloads = [pdb for i, pdb in not_found]
    fns = None

    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        # No mirror is configured; fall through to the download path.
        pass
    else:
        if len(downloads) == 1:
            fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]
    fns = None

    # Download the remaining identifiers: FTP first, HTTP as fallback.
    try:
        fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
    except Exception as err:
        LOGGER.warn('Downloading PDB files via FTP failed ({0}), '
                    'trying HTTP.'.format(str(err)))
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP also failed '
                        '({0}).'.format(str(err)))

    if len(downloads) == 1:
        fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
def testFile(self):
    """Gunzip the fixture file and verify the decompressed text matches."""
    fn = gunzip(self.gzfn)
    # Use a context manager so the file handle is closed; the original
    # open(fn).read() leaked the handle.
    with open(fn) as inp:
        text = inp.read()
    self.assertEqual(text, self.text)
def fetchPDBfromMirror(*pdb, **kwargs):
    """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified
    *pdb* identifier(s).  If a *folder* is specified, files will be copied
    into this folder.  If *compressed* is **False**, files will decompressed.
    *format* argument can be used to get `PDBML <http://pdbml.pdb.org/>`_ and
    `mmCIF <http://mmcif.pdb.org/>`_ files: ``format='cif'`` will fetch an
    mmCIF file, and ``format='xml'`` will fetch a PDBML file.  If PDBML header
    file is desired, ``noatom=True`` argument will do the job."""

    mirror = pathPDBMirror()
    if mirror is None:
        raise IOError('no mirror path is set')

    # The mirror setting may be a bare path or a (path, format) pair.
    try:
        mirror, mirror_format = mirror
    except ValueError:
        mirror_format = None

    format = str(kwargs.pop('format', 'pdb')).lower()

    # check=False lets internal callers skip identifier validation.
    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    # Per-format wwPDB directory layout, archive extension, filename prefix
    # and local (decompressed) extension.
    if format == 'pdb':
        ftp_divided = 'data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if bool(kwargs.pop('noatom', False)):
            ftp_divided = 'data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    else:
        if format:
            raise ValueError('{0} is not a recognized format'.format(
                             repr(format)))
        else:
            raise ValueError('please specify a valid format')

    if mirror_format:
        # A single-format mirror stores files flat (no divided tree).
        if mirror_format.lower() != format:
            raise IOError('mirror contains only ' + mirror_format + ' files')
        ftp_divided = ''
    else:
        # Convert the FTP-style path to the local OS path convention.
        ftp_divided = join(*ftp_divided.split('/'))
    folder = kwargs.get('folder')
    compressed = kwargs.get('compressed', True)
    filenames = []
    append = filenames.append
    success = 0
    failure = 0
    for pdb in identifiers:
        if pdb is None:
            # Invalid identifier from checkIdentifiers; keep positions aligned.
            append(None)
            continue
        # Mirror layout: <mirror>/<divided>/<middle two chars>/<prefix><id><ext>
        fn = join(mirror, ftp_divided, pdb[1:3],
                  ftp_prefix + pdb + ftp_pdbext)
        if isfile(fn):
            # Copy (still compressed) or gunzip into *folder* when requested;
            # otherwise return the path inside the mirror itself.
            if folder or not compressed:
                if compressed:
                    fn = copyFile(fn, join(folder or '.',
                                           pdb + extension + '.gz'))
                else:
                    fn = gunzip(fn, join(folder or '.', pdb + extension))
            append(normpath(fn))
            success += 1
        else:
            append(None)
            failure += 1

    if len(identifiers) == 1:
        # Single-identifier call: return the path (or None on a miss).
        fn = filenames[0]
        if success:
            LOGGER.debug('PDB file is found in the local mirror ({0}).'.format(
                         sympath(fn)))
        return fn
    else:
        LOGGER.debug('PDB files found in the local mirror ({0} found, '
                     '{1} missed).'.format(success, failure))
        return filenames
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified
    *pdb* identifier(s) and return path(s).

    Downloaded files will be stored in local PDB folder, if one is set using
    :meth:`.pathPDBFolder`, and copied into *folder*, if specified by the
    user.  If no destination folder is specified, files will be saved in the
    current working directory.  If *compressed* is **False**, decompressed
    files will be copied into *folder*.  *format* keyword argument can be
    used to retrieve `PDBML <http://pdbml.pdb.org/>`_, `mmCIF
    <http://mmcif.pdb.org/>`_ and `EMD
    <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will
    fetch an EMD file, and ``format='xml'`` will fetch a PDBML file.  If
    PDBML header file is desired, ``noatom=True`` argument will do the job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))

    # Per-format wwPDB directory layout, archive extension, filename prefix
    # and local (decompressed) extension.
    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format in ('emd', 'map'):
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not valid format')

    local_folder = pathPDBFolder()

    # getPath(pdb) -> where the download is written;
    # second(filename, pdb) -> post-processing (copy and/or gunzip) that
    # yields the path returned to the caller.
    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder,
                                       pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            # Decompress in place: the gzipped payload at getPath(pdb) is
            # replaced by its decompressed content.
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                # EMD entries live under EMD-<id>/map rather than the
                # two-character divided tree.
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                # Distinguish "exists but download failed" (permissions)
                # from "does not exist on the server".
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(pdb, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    # writelines batches the retrieved blocks; the original
                    # abused a list comprehension for this side effect.
                    with open(filename, 'w+b') as pdbfile:
                        pdbfile.writelines(data)

                    filename = normpath(relpath(second(filename, pdb)))
                    LOGGER.debug('{0} downloaded ({1})'
                                 .format(pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn('{0} download failed, reason unknown.'
                                .format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
def fetchPDB(pdb, folder='.', compressed=True, copy=False, **kwargs):
    """Retrieve PDB, PDBML, or mmCIF file(s) for specified *pdb* identifier(s).

    *pdb* may be a string or a list. The function will return a filename or a
    list of filenames depending on input (see :ref:`fetchpdb` for examples).

    If *compressed* is ``False``, all files will be decompressed.  If *copy*
    is ``True``, all files from local PDB mirror will copied to the user
    specified *folder*.  *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_
    files:  ``format="cif"`` will fetch an mmCIF file (e.g.
    :file:`1XXX.cif.gz`), similarly ``format="xml"`` will fetch a PDBML file.
    If PDBML header file is desired, ``format="xml", noatom=True`` will do
    the job (e.g. :file:`1XXX-noatom.xml.gz`)

    The order of file search operations are as follows:  First, files are
    sought in *folder*.  Second, local PDB mirror will be sought, if one is
    set by the user (see :func:`setPDBMirrorPath`).  Then, local PDB folder
    will be sought, if one is set by the user (see :func:`setPDBLocalFolder`).
    Finally, if files are not found locally, they will be downloaded one of
    wwPDB FTP servers (use :func:`setWWPDBFTPServer` to specify one close to
    you)."""

    # NOTE(review): this is a legacy (Python-2-era) variant of fetchPDB;
    # see kwargs.iterkeys().next() below, which does not exist on Python 3.

    if isinstance(pdb, str):
        identifiers = [pdb]
    elif isinstance(pdb, list):
        identifiers = pdb
    else:
        raise TypeError('pdb may be a string or a list of strings')

    assert isinstance(folder, str), 'folder must be a string'
    assert isinstance(compressed, bool), 'compressed must be a boolean'
    assert isinstance(copy, bool), 'copy must be a boolean'

    format = kwargs.pop('format', 'pdb')
    assert isinstance(format, str), 'format must be a string'
    format = format.lower()
    assert format in _PDB_FORMATS, '{0:s} is not valid format'.format(
        repr(format))
    noatom = kwargs.pop('noatom', False)
    assert isinstance(noatom, bool), 'noatom must be a boolean'
    if kwargs:
        # NOTE(review): iterkeys()/.next() is Python 2 only — this error
        # path raises AttributeError on Python 3; also the message is
        # missing a space between 'this' and 'function'.
        raise TypeError('{0:s} is not a valid keyword argument for this'
                        'function'.format(repr(kwargs.iterkeys().next())))
    if folder != '.':
        folder = makePath(folder)
    if not os.access(folder, os.W_OK):
        raise IOError('permission to write in {0:s} is denied, please '
                      'specify another folder'.format(folder))

    filenames = []
    exists = 0
    success = 0
    failure = 0
    download = False

    # Per-format wwPDB directory layout, archive extension, local file
    # extensions to scan for, and archive filename prefix.
    if format == 'pdb':
        divided = 'data/structures/divided/pdb'
        pdbext = '.ent.gz'
        extensions = ['.ent', '.pdb']  # '.pdb' should be the last item
        prefix = 'pdb'
    elif format == 'xml':
        if noatom:
            divided = 'data/structures/divided/XML-noatom'
            pdbext = '-noatom.xml.gz'
            extensions = ['-noatom.xml']
        else:
            divided = 'data/structures/divided/XML'
            pdbext = '.xml.gz'
            extensions = ['.xml']
        prefix = ''
    else:
        divided = 'data/structures/divided/mmCIF'
        pdbext = '.cif.gz'
        extensions = ['.cif']  # '.pdb' should be the last item
        prefix = ''

    # Map lowercase basenames of matching files in *folder* to their paths,
    # considering both lower- and upper-case extensions.
    pdbfnmap = {}
    for extension in extensions:
        for pdbfn in glob(os.path.join(folder, '*' + extension + '*')):
            if os.path.splitext(pdbfn)[1] in _PDB_EXTENSIONS:
                pdbfnmap[os.path.split(pdbfn)[1].split('.')[0].lower()] = pdbfn
        for pdbfn in glob(os.path.join(folder, '*' + extension.upper() + '*')):
            if os.path.splitext(pdbfn)[1] in _PDB_EXTENSIONS:
                pdbfnmap[os.path.split(pdbfn)[1].split('.')[0].lower()] = pdbfn
    # NOTE(review): after this loop, the name `extension` retains the last
    # item of `extensions` and is reused below when composing copied
    # filenames — presumably intentional for single-extension formats, but
    # fragile for format == 'pdb'; verify before reusing this code.

    for i, pdbid in enumerate(identifiers):
        # Check validity of identifiers
        if not isinstance(pdbid, str):
            LOGGER.debug('{0:s} is not a valid identifier.'.format(pdbid))
            filenames.append(None)
            failure += 1
            continue
        pdbid = pdbid.strip().lower()
        if not (len(pdbid) == 4 and pdbid.isalnum()):
            LOGGER.debug('{0:s} is not a valid identifier.'.format(pdbid))
            filenames.append(None)
            failure += 1
            continue
        # Check if file exists in working directory
        identifiers[i] = pdbid
        if noatom:
            fn = pdbfnmap.get(pdbid + '-noatom', None)
        else:
            fn = pdbfnmap.get(pdbid, None) or pdbfnmap.get('pdb'+pdbid, None)
        if fn:
            fn = relpath(fn)
            if not compressed:
                temp, ext = os.path.splitext(fn)
                if ext == '.gz':
                    fn = gunzip(fn, temp)
            filenames.append(fn)
            LOGGER.debug('{0:s} ({1:s}) is found in the working directory.'
                         .format(pdbid, fn))
            exists += 1
            continue
        # Check the PDB mirror
        mirror_path = getPDBMirrorPath()
        if mirror_path is not None and os.path.isdir(mirror_path):
            fn = os.path.join(mirror_path, divided, pdbid[1:3],
                              prefix + pdbid + pdbext)
            if os.path.isfile(fn):
                if copy or not compressed:
                    if compressed:
                        filename = os.path.join(folder,
                                                pdbid + extension + '.gz')
                        shutil.copy(fn, filename)
                    else:
                        filename = os.path.join(folder, pdbid + extension)
                        gunzip(fn, filename)
                    filenames.append(filename)
                    LOGGER.debug('{0:s} copied from local mirror ({1:s})'
                                 .format(pdbid, filename))
                    success += 1
                else:
                    filenames.append(fn)
                    LOGGER.debug('{0:s} ({1:s}...{2:s}) is found in the local '
                                 'mirror.'.format(pdbid,
                                 fn[:fn[1:].index(os.path.sep)+2], fn[-15:]))
                    exists += 1
                continue
        # Check the PDB mirror
        local_folder = getPDBLocalFolder()
        if format and local_folder:
            local_folder, is_divided = local_folder
            if is_divided:
                fn = os.path.join(local_folder, pdbid[1:3],
                                  'pdb' + pdbid + '.pdb.gz')
            else:
                fn = os.path.join(local_folder, pdbid + '.pdb.gz')
            if os.path.isfile(fn):
                if copy or not compressed:
                    if compressed:
                        filename = os.path.join(folder,
                                                pdbid + extension + '.gz')
                        shutil.copy(fn, filename)
                    else:
                        filename = os.path.join(folder, pdbid + extension)
                        gunzip(fn, filename)
                    filenames.append(filename)
                    LOGGER.debug('{0:s} copied from local PDB folder ({1:s})'
                                 .format(pdbid, filename))
                    success += 1
                else:
                    filenames.append(fn)
                    LOGGER.debug('{0:s} ({1:s}...{2:s}) is found in the PDB '
                                 'local folder.'.format(pdbid,
                                 fn[:fn[1:].index(os.path.sep)+2], fn[-15:]))
                    exists += 1
                continue

        # Not found anywhere locally: mark the slot with the identifier so
        # the download loop below can recognize it.
        filenames.append(pdbid)
        download = True

    if download:
        from ftplib import FTP
        ftp_name, ftp_host, ftp_path = getWWPDBFTPServer()
        LOGGER.debug('Connecting wwPDB FTP server {0:s}.'.format(ftp_name))
        # NOTE(review): local_folder/is_divided here are leftovers from the
        # last loop iteration above — confirm this is the intended source.
        if format == 'pdb' and not copy and local_folder:
            folder = local_folder
            compressed = True
            if is_divided:
                getfn = lambda folder, pdbid, ext: \
                    os.path.join(makePath(os.path.join(local_folder,
                                 pdbid[1:3])), 'pdb' + pdbid + ext)
            else:
                getfn = lambda folder, pdbid, ext: os.path.join(folder,
                                                                pdbid + ext)
        else:
            getfn = lambda folder, pdbid, ext: os.path.join(folder,
                                                            pdbid + ext)
        try:
            ftp = FTP(ftp_host)
        except Exception as error:
            raise type(error)('FTP connection problem, potential reason: '
                              'no internet connectivity')
        else:
            #ftp_path = os.path.join(ftp_path, divided)
            ftp.login('')
            for i, pdbid in enumerate(identifiers):
                # Only slots still holding the bare identifier need download.
                if pdbid != filenames[i]:
                    continue
                filename = getfn(folder, pdbid, extension)
                if compressed:
                    filename += '.gz'
                pdbfile = open(filename, 'w+b')
                fn = prefix + pdbid + pdbext
                try:
                    ftp.cwd(ftp_path)
                    ftp.cwd(divided)
                    ftp.cwd(pdbid[1:3])
                    ftp.retrbinary('RETR ' + fn, pdbfile.write)
                except Exception as error:
                    pdbfile.close()
                    # Remove the empty/partial file left by the failure.
                    os.remove(filename)
                    if fn in ftp.nlst():
                        LOGGER.debug('{0:s} download failed ({1:s}). It '
                                     'is possible that you don\'t have '
                                     'rights to download .gz files in the '
                                     'current network.'.format(pdbid,
                                     str(error)))
                    else:
                        LOGGER.debug('{0:s} download failed. {1:s} does not '
                                     'exist on {2:s}.'
                                     .format(fn, pdbid, ftp_host))
                    failure += 1
                    filenames[i] = None
                else:
                    pdbfile.close()
                    if not compressed:
                        gunzip(filename)
                    filename = relpath(filename)
                    LOGGER.debug('{0:s} downloaded ({1:s})'
                                 .format(pdbid, filename))
                    success += 1
                    filenames[i] = filename
            ftp.quit()
    if len(identifiers) == 1:
        return filenames[0]
    else:
        LOGGER.info('PDB download completed ({2:d} found, '
                    '{0:d} downloaded, {1:d} failed).'
                    .format(success, failure, exists))
        return filenames