def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    # Check if we have the data directory already
    if not os.path.isfile(data_folder):
        # Emulate mkdir -p (no error if folder exists)
        try:
            os.mkdir(data_folder)
        except OSError as e:
            if e.errno != 17:  # 17 == EEXIST
                raise e
    else:
        raise Exception('Data path (' + data_folder + ') exists as a file. '
                        'Please rename, remove or change the desired '
                        'location of the data path.')

    go_obo = data_folder + '/go-basic.obo'

    # Download the file if it is not there already
    if not os.path.isfile(go_obo):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                with open(go_obo, 'w+b') as obofile:
                    obofile.write(data)

                LOGGER.debug('{0} downloaded ({1})'
                             .format(go_obo_url, sympath(go_obo)))
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(go_obo_url))

    return obo_parser.GODag(go_obo)
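
# Illustrative usage sketch for parseOBO (not part of the public API).  It
# assumes network access on the first call and that goatools is installed;
# the term attributes used below (``name``, ``namespace``, ``get_all_parents``)
# follow the goatools GODag/GOTerm documentation rather than this module.
def _parseOBO_example():
    go_dag = parseOBO()                 # downloads go-basic.obo on first use
    term = go_dag['GO:0008219']         # look a term up by its GO identifier
    print(term.name, term.namespace)    # e.g. 'cell death' 'biological_process'
    print(len(term.get_all_parents()))  # number of ancestor terms in the DAG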
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).
    Files will be sought in the user specified *folder* or the current working
    directory, and then in the local PDB folder and mirror, if they are
    available. If *copy* is set **True**, files will be copied into *folder*.
    If *compressed* is **False**, all files will be decompressed.  See
    :func:`pathPDBFolder` and :func:`pathPDBMirror` for managing local
    resources, :func:`.fetchPDBviaFTP` and :func:`.fetchPDBviaHTTP` for
    downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug('PDB file is found in working directory ({0}).'
                             .format(sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))

        if not not_found:
            return filenames[0] if len(identifiers) == 1 else filenames

    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')

            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        fn = copyFile(fn, join(folder, pdb + 'pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

        if not not_found:
            if len(identifiers) == 1:
                fn = filenames[0]
                items = fn.split(pathsep)
                if len(items) > 5:
                    fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
                else:
                    fndisp = relpath(fn)
                LOGGER.debug('PDB file is found in the local folder ({0}).'
                             .format(fndisp))
                return fn
            else:
                return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    downloads = [pdb for i, pdb in not_found]
    fns = None
    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        pass
    else:
        if len(downloads) == 1:
            fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]

    fns = None
    tp = kwargs.pop('tp', None)
    if tp is not None:
        tp = tp.lower()

    if tp == 'http':
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP failed '
                        '({0}).'.format(str(err)))
    elif tp == 'ftp':
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via FTP failed '
                        '({0}).'.format(str(err)))
    else:
        tryHTTP = False
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            tryHTTP = True

        if fns is None or isinstance(fns, list) and None in fns:
            tryHTTP = True
        elif isinstance(fns, list):
            downloads = [not_found[i][1] for i in range(len(fns))
                         if fns[i] is None]
            if len(downloads) > 0:
                tryHTTP = True

        if tryHTTP:
            LOGGER.info('Downloading PDB files via FTP failed, '
                        'trying HTTP.')
            try:
                fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
            except Exception as err:
                LOGGER.warn('Downloading PDB files via HTTP also failed '
                            '({0}).'.format(str(err)))

    if len(downloads) == 1:
        fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
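
# Illustrative usage sketch for fetchPDB (not part of the module).  Network
# access, or a configured local folder/mirror, is assumed; the 'tp' keyword
# corresponds to the transfer-protocol fallback logic above.
def _fetchPDB_example():
    fn = fetchPDB('1ubi')                               # single identifier -> single path
    fns = fetchPDB(['1ubi', '2k39'], compressed=False)  # list -> list of decompressed paths
    via_http = fetchPDB('1p38', tp='http')              # skip FTP and download via HTTP
    return fn, fns, via_http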
def fetchPDBfromMirror(*pdb, **kwargs):
    """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified
    *pdb* identifier(s).  If a *folder* is specified, files will be copied
    into this folder.  If *compressed* is **False**, files will be
    decompressed.  *format* argument can be used to get
    `PDBML <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_
    files: ``format='cif'`` will fetch an mmCIF file, and ``format='xml'``
    will fetch a PDBML file.  If a PDBML header file is desired,
    ``noatom=True`` argument will do the job."""

    mirror = pathPDBMirror()
    if mirror is None:
        raise IOError('no mirror path is set')
    try:
        mirror, mirror_format = mirror
    except ValueError:
        mirror_format = None

    format = str(kwargs.pop('format', 'pdb')).lower()

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    if format == 'pdb':
        ftp_divided = 'data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if bool(kwargs.pop('noatom', False)):
            ftp_divided = 'data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    else:
        if format:
            raise ValueError('{0} is not a recognized format'
                             .format(repr(format)))
        else:
            raise ValueError('please specify a valid format')

    if mirror_format:
        if mirror_format.lower() != format:
            raise IOError('mirror contains only ' + mirror_format + ' files')
        ftp_divided = ''
    else:
        ftp_divided = join(*ftp_divided.split('/'))

    folder = kwargs.get('folder')
    compressed = kwargs.get('compressed', True)

    filenames = []
    append = filenames.append
    success = 0
    failure = 0
    for pdb in identifiers:
        if pdb is None:
            append(None)
            continue
        fn = join(mirror, ftp_divided, pdb[1:3],
                  ftp_prefix + pdb + ftp_pdbext)
        if isfile(fn):
            if folder or not compressed:
                if compressed:
                    fn = copyFile(fn, join(folder or '.',
                                           pdb + extension + '.gz'))
                else:
                    fn = gunzip(fn, join(folder or '.', pdb + extension))
            append(normpath(fn))
            success += 1
        else:
            append(None)
            failure += 1

    if len(identifiers) == 1:
        fn = filenames[0]
        if success:
            LOGGER.debug('PDB file is found in the local mirror ({0}).'
                         .format(sympath(fn)))
        return fn
    else:
        LOGGER.debug('PDB files found in the local mirror ({0} found, '
                     '{1} missed).'.format(success, failure))
        return filenames
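
# Illustrative sketch for fetchPDBfromMirror (not part of the module).  The
# mirror location below is a placeholder, and pathPDBMirror is assumed to
# accept the mirror path as its argument; the mirror must follow the wwPDB
# divided directory layout referenced by ftp_divided above.
def _fetchPDBfromMirror_example():
    pathPDBMirror('/data/wwpdb')         # hypothetical local mirror location
    pdb_fn = fetchPDBfromMirror('1ubi')  # compressed .pdb straight from the mirror
    cif_fn = fetchPDBfromMirror('1ubi', format='cif',
                                folder='.', compressed=False)
    return pdb_fn, cif_fn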
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))

    extension = '.pdb'
    local_folder = pathPDBFolder()
    if local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    success = 0
    failure = 0
    filenames = []
    for pdb in identifiers:
        if pdb is None:
            filenames.append(None)
            continue
        try:
            handle = openURL(getURL(pdb))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err)))
            failure += 1
            filenames.append(None)
        else:
            data = handle.read()
            if len(data):
                filename = getPath(pdb)

                with open(filename, 'w+b') as pdbfile:
                    pdbfile.write(data)

                filename = normpath(relpath(second(filename, pdb)))
                LOGGER.debug('{0} downloaded ({1})'
                             .format(pdb, sympath(filename)))
                success += 1
                filenames.append(filename)
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(pdb))
                failure += 1
                filenames.append(None)

    LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
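
# Illustrative sketch for fetchPDBviaHTTP (not part of the module).  It
# assumes the wwPDB HTTP server is reachable and that wwPDBServer accepts
# the same keys used by WWPDB_HTTP_URL above (e.g. 'us').
def _fetchPDBviaHTTP_example():
    wwPDBServer('us')                      # pick the US (RCSB) server, assumed key
    fn = fetchPDBviaHTTP('1ubi', folder='.', compressed=False)
    fns = fetchPDBviaHTTP('1ubi', '2k39')  # several identifiers -> list of paths
    return fn, fns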
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified
    *pdb* identifier(s) and return path(s).  Downloaded files will be stored
    in local PDB folder, if one is set using :meth:`.pathPDBFolder`, and
    copied into *folder*, if specified by the user.  If no destination folder
    is specified, files will be saved in the current working directory.  If
    *compressed* is **False**, decompressed files will be copied into
    *folder*.  *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_ and
    `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will
    fetch an EMD file, and ``format='xml'`` will fetch a PDBML file.  If a
    PDBML header file is desired, ``noatom=True`` argument will do the job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))

    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not a valid format')

    local_folder = pathPDBFolder()

    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(pdb, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        write = pdbfile.write
                        for block in data:
                            write(block)

                    filename = normpath(relpath(second(filename, pdb)))
                    LOGGER.debug('{0} downloaded ({1})'
                                 .format(pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn('{0} download failed, reason unknown.'
                                .format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
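
# Illustrative sketch for fetchPDBviaFTP (not part of the module).  FTP
# access to the wwPDB archive is assumed; some networks block .gz transfers,
# which is the failure mode warned about above.  The EMD accession used here
# ('1778', i.e. EMD-1778) is only an example identifier.
def _fetchPDBviaFTP_example():
    xml_fn = fetchPDBviaFTP('1ubi', format='xml', noatom=True)   # PDBML header only
    cif_fn = fetchPDBviaFTP('1ubi', format='cif', compressed=False)
    map_fn = fetchPDBviaFTP('1778', format='emd')                # EM density map
    return xml_fn, cif_fn, map_fn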