def getSeqres(pdbid, chids):
    """Return the header sequences of the selected chains of a PDB entry.

    Args:
        pdbid: value handed to ``prody.parsePDBHeader`` to locate the entry.
        chids: container of chain identifiers; a polymer is kept when its
            ``chid`` is ``in`` this container.

    Returns:
        list: the ``sequence`` attribute of every matching polymer, in the
        order the polymers appear in the header.
    """
    import prody

    header_polymers = prody.parsePDBHeader(pdbid, 'polymers')
    return [entry.sequence for entry in header_polymers
            if entry.chid in chids]
def iterate_over_pairs(pdbid, ref_chid):
    """Yield ``(ref_chid, other_chid)`` for every other polymer chain.

    The polymers are read from the PDB header of *pdbid*; the polymer whose
    chain id equals *ref_chid* is skipped, every other one produces a pair.

    Args:
        pdbid: value handed to ``prody.parsePDBHeader``.
        ref_chid: chain id used as the first element of every pair.

    Yields:
        tuple: ``(ref_chid, chid)`` for each polymer with a different chain
        id, in header order.
    """
    polymers = prody.parsePDBHeader(pdbid)['polymers']
    for polymer in polymers:
        if polymer.chid != ref_chid:
            yield (ref_chid, polymer.chid)
def split_receptor(bucket, table_idx, param, datum):
    """Write one ligand-free receptor PDB per chemical listed in the header.

    For every entry of the header's ``chemicals`` list, all atoms except
    that residue and water are selected, written below
    ``<data_dir>/<table_idx>_<output_folder>/<receptor>/`` and a record is
    inserted into table *table_idx* through ``db.insert``.  A failure while
    handling one ligand is recorded as a failure row and does not stop the
    remaining ligands.

    Args:
        bucket: forwarded unchanged as ``bucket=`` to ``db.insert``.
        table_idx: table id; also used as prefix of the output folder name.
        param: dict providing ``'output_folder'`` and
            ``'input_download_folder'``.
        datum: PDB name, or a tuple/list whose first item is the PDB name.

    Raises:
        Exception: carrying the original message, when anything outside the
            per-ligand handling fails (e.g. the input file cannot be parsed).
    """
    # TODO(maksym): rename param -> params and datum -> pdb_name.
    try:
        # isinstance() instead of comparing type names, so tuple/list
        # subclasses are unwrapped as well.
        if isinstance(datum, (tuple, list)):
            datum = datum[0]
        receptor = datum

        output_folder = '{}_{}'.format(table_idx, param['output_folder'])
        input_pdb_dir = os.path.join(data_dir, param['input_download_folder'])
        input_pdb_path = os.path.join(input_pdb_dir, receptor + '.pdb')

        parsed_pdb = prody.parsePDB(input_pdb_path)
        parsed_header = prody.parsePDBHeader(input_pdb_path)

        output_rec_dir = os.path.join(data_dir, output_folder, receptor)
        _makedir(output_rec_dir)

        ligands = [[chem.chain, str(chem.resnum), chem.resname]
                   for chem in parsed_header['chemicals']]

        for chain, resnum, resname in ligands:
            try:
                # Everything but the ligand residue, then drop water.
                rec = parsed_pdb.select('not (chain {} resnum {})'.format(
                    chain, resnum))
                rec = rec.select('not water')
                heavy_atom = rec.select('not hydrogen').numAtoms()
                rec_name = '_'.join(
                    [receptor, chain, resnum, resname, 'receptor']) + '.pdb'
                prody.writePDB(os.path.join(output_rec_dir, rec_name), rec)
                record = [
                    receptor, chain, resnum, resname, heavy_atom,
                    parsed_header['experiment'], parsed_header['resolution'],
                    1, 'success'
                ]
                db.insert(table_idx, [record], bucket=bucket)
            except Exception as e:
                # Failure row: numeric columns zeroed, last column carries
                # the error message.
                records = [[
                    receptor, chain, resnum, resname, 0, 0, 0, 0,
                    str(e)
                ]]
                print(records)
                db.insert(table_idx, records, bucket=bucket)
    # TODO(maksym): this reporting is probably better done through logging.
    except Exception as e:
        print(e)
        raise Exception(str(e))
def split(receptor, pdb_outpath, init='split_init'):
    """Split a PDB file into one ligand/receptor file pair per chemical.

    Every entry of the header's ``chemicals`` list is treated as a ligand.
    The ligand residue is written to
    ``<data_dir>/<lig_folder>/<receptor>/`` and the rest of the structure
    to ``<data_dir>/<rec_folder>/<receptor>/``.  Chemicals for which the
    ligand selection is empty are skipped.

    Args:
        receptor: PDB id; used in the sub-directory and file names.
        pdb_outpath: path of the input PDB file relative to
            ``init.data_dir``.
        init: name of an object in this module exposing ``data_dir``,
            ``rec_folder`` and ``lig_folder``.

    Returns:
        list: one ``[receptor, resname, rec_path, lig_path]`` entry per
        written pair; both paths are relative to ``init.data_dir``.
    """
    # NOTE(review): eval() turns the *name* of the init object into the
    # object itself; never pass externally supplied strings here.
    init = eval(init)
    rec_dir = os.path.join(init.data_dir, init.rec_folder)
    lig_dir = os.path.join(init.data_dir, init.lig_folder)
    pdb_path = os.path.join(init.data_dir, pdb_outpath)

    parsed_pdb = prody.parsePDB(pdb_path)
    parsed_header = prody.parsePDBHeader(pdb_path)

    ligands = [[chem.chain, str(chem.resnum), chem.resname]
               for chem in parsed_header['chemicals']]

    splited = []
    for chain, resnum, resname in ligands:
        lig = parsed_pdb.select('chain {} resnum {}'.format(chain, resnum))
        rec = parsed_pdb.select('not (chain {} resnum {})'.format(
            chain, resnum))
        if lig is None:
            continue

        # The residue-index / heavy-atom statistics computed here before
        # were never used (their only consumer is long commented out) and
        # relied on the Python 2-only ``iterator.next()``; they are gone.

        lig_name = '_'.join([receptor, chain, resnum, resname, 'ligand'
                             ]) + '.pdb'
        lig_out_dir = os.path.join(lig_dir, receptor)
        if not os.path.exists(lig_out_dir):
            os.makedirs(lig_out_dir)
        prody.writePDB(os.path.join(lig_out_dir, lig_name), lig)

        rec_name = '_'.join([receptor, chain, resnum, resname, 'receptor'
                             ]) + '.pdb'
        rec_out_dir = os.path.join(rec_dir, receptor)
        if not os.path.exists(rec_out_dir):
            os.makedirs(rec_out_dir)
        prody.writePDB(os.path.join(rec_out_dir, rec_name), rec)

        splited.append([
            receptor,
            str(resname),
            os.path.join(init.rec_folder, receptor, rec_name),
            os.path.join(init.lig_folder, receptor, lig_name)
        ])
    return splited
def split_pdb(split_pdb, cuttoff_dist=None, discard_h=True,
              init="split_pdb_init"):
    """Write a ligand file and a receptor file for every chemical of a PDB.

    Iterates over the chemicals listed in the PDB header.  For each one the
    ligand residue and the remaining non-water atoms are selected and saved
    as a pair into ``<db_path>/<split_dir>/<pair_name>/``.

    :param split_pdb: string (path of the file to split, relative to
        ``init.db_path``)
    :param cuttoff_dist: float; accepted for interface compatibility but not
        used by the current implementation (no cropping is performed)
    :param discard_h: bool, drop hydrogens from both selections when True
    :param init: string (name of the init object in this module providing
        ``db_path`` and ``split_dir``)
    :return: list of ``[lig_outpath, rec_outpath]`` pairs, relative to
        ``init.db_path``
    """
    # NOTE(review): eval() resolves the init object by name; never pass
    # externally supplied strings here.
    init = eval(init)
    pdb_path = os.path.join(init.db_path, split_pdb)

    # parse header and coordinates from the PDB file
    pr_pdb = pr.parsePDB(pdb_path)
    pr_header = pr.parsePDBHeader(pdb_path)
    pdb_id = pr_header["identifier"]

    # retrieve the chemicals (chain, residue number, name) from the header
    ligs = [[chem.chain, str(chem.resnum), chem.resname]
            for chem in pr_header['chemicals']]

    out_filenames = []
    for chain, resnum, resname in ligs:
        if discard_h:
            lig = pr_pdb.select('noh chain {} and resnum {}'.format(
                chain, resnum))
            rec = pr_pdb.select(
                'noh and not water not (chain {} resnum {})'.format(
                    chain, resnum))
        else:
            lig = pr_pdb.select('chain {} resnum {}'.format(chain, resnum))
            rec = pr_pdb.select('not water not (chain {} resnum {})'.format(
                chain, resnum))

        # write the ligand file and the receptor file
        pair_name = '_'.join([pdb_id, chain, resnum, resname])
        lig_name = pair_name + "_ligand.pdb"
        rec_name = pair_name + "_receptor.pdb"

        # Guarded so that re-running on an already split entry does not
        # fail with "File exists".
        pair_dir = os.path.join(init.db_path, init.split_dir, pair_name)
        if not os.path.exists(pair_dir):
            os.makedirs(pair_dir)

        lig_outpath = os.path.join(init.split_dir, pair_name, lig_name)
        rec_outpath = os.path.join(init.split_dir, pair_name, rec_name)
        pr.writePDB(os.path.join(init.db_path, lig_outpath), lig)
        pr.writePDB(os.path.join(init.db_path, rec_outpath), rec)
        out_filenames.append([lig_outpath, rec_outpath])
    return out_filenames
def download(bucket, table_idx, param, input_data):
    """Download a PDB file from RCSB and record the outcome.

    The file is stored as ``<data_dir>/<table_idx>_<output_folder>/<id>.pdb``
    and is only fetched when it does not exist yet.  Afterwards its header
    is parsed and a success row is inserted; any exception results in a
    failure row instead of being propagated.

    Args:
        bucket: forwarded unchanged as ``bucket=`` to ``db.insert``.
        table_idx: id of the download table; also prefixes the folder name.
        param: dict providing ``'output_folder'``.
        input_data: str holding the PDB id after its first character
            (TODO(maksym): rename to pdb_ids).

    Returns:
        None
    """
    try:
        # The first character of input_data is dropped.
        # NOTE(review): presumably a one-character prefix - confirm with
        # the caller.  TODO(maksym): rename to pdb_id.
        receptor = input_data[1:]
        output_folder_name = '{}_{}'.format(table_idx, param['output_folder'])
        dest_dir = os.path.join(data_dir, output_folder_name)
        _makedir(dest_dir)
        pdb_path = os.path.join(dest_dir, receptor + '.pdb')
        if not os.path.exists(pdb_path):
            download_address = 'https://files.rcsb.org/download/{}.pdb'.format(
                receptor)
            # NOTE(review): shell command built from input; safe only while
            # the ids come from a trusted source.
            os.system('wget --no-check-certificate -P {} {}'.format(
                dest_dir, download_address))
        # Raises when the download failed, which routes to the failure row.
        header = prody.parsePDBHeader(pdb_path)
        record = [
            receptor, header['experiment'], header['resolution'], 1, 'success'
        ]
        db.insert(table_idx, [record], bucket=bucket)
    except Exception as e:
        # Exception causing non success: print() call form works on both
        # Python 2 and 3 (the former `print e` statement is a SyntaxError
        # on Python 3).
        print(e)
        # TODO(maksym): 'unk' marks a failed download.
        record = [input_data, 'unk', 0, 0, str(e)]
        db.insert(table_idx, [record], bucket=bucket)
def split_pdb(pdb_outpath,
              cutoff_dist=None,
              discard_h=True,
              init='split_pdb_init'):
    """Write a receptor file and a ligand file for every chemical of a PDB.

    For each chemical listed in the PDB header, the ligand residue and the
    remaining non-water atoms are selected and saved as a pair into
    ``<data_dir>/<split_folder>/<pair_name>/``.

    Args:
        pdb_outpath: path of the input PDB, relative to ``init.data_dir``.
        cutoff_dist: accepted but not used by the current implementation.
        discard_h: when True, hydrogens are excluded from both selections.
        init: name of an object in this module providing ``data_dir`` and
            ``split_folder``.

    Returns:
        list: ``[rec_outpath, lig_outpath]`` pairs, relative to
        ``init.data_dir``.
    """
    # NOTE(review): eval() resolves the init object by name; never pass
    # externally supplied strings here.
    init = eval(init)
    pdb_path = os.path.join(init.data_dir, pdb_outpath)

    pr_pdb = pr.parsePDB(pdb_path)
    pr_header = pr.parsePDBHeader(pdb_path)
    pdb_id = pr_header['identifier']

    ligs = [[chem.chain, str(chem.resnum), chem.resname]
            for chem in pr_header['chemicals']]

    out_filenames = []
    for chain, resnum, resname in ligs:
        if discard_h:
            lig = pr_pdb.select('noh chain {} and resnum {}'.format(
                chain, resnum))
            rec = pr_pdb.select(
                'noh and not water not (chain {} resnum {})'.format(
                    chain, resnum))
        else:
            # Fixed: the keyword and the chain id were fused ('chain{}'),
            # producing e.g. 'chainA resnum 5' instead of a chain selection.
            lig = pr_pdb.select('chain {} resnum {}'.format(chain, resnum))
            rec = pr_pdb.select('not water not (chain {} resnum {})'.format(
                chain, resnum))

        pair_name = '_'.join([pdb_id, chain, resnum, resname])
        lig_name = pair_name + '_ligand.pdb'
        rec_name = pair_name + '_receptor.pdb'

        split_dir = os.path.join(init.data_dir, init.split_folder, pair_name)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)

        lig_outpath = os.path.join(init.split_folder, pair_name, lig_name)
        rec_outpath = os.path.join(init.split_folder, pair_name, rec_name)
        pr.writePDB(os.path.join(init.data_dir, lig_outpath), lig)
        pr.writePDB(os.path.join(init.data_dir, rec_outpath), rec)
        out_filenames.append([rec_outpath, lig_outpath])
    return out_filenames
def split(receptor, pdb_path, rec_dir, lig_dir):
    """Split a PDB file into one receptor/ligand file pair per chemical.

    Every entry of the header's ``chemicals`` list is treated as a ligand;
    chemicals whose ligand selection is empty are skipped.

    Args:
        receptor: str, 4 letter PDB id; used in directory and file names.
        pdb_path: str, path of the downloaded PDB file.
        rec_dir: str, output directory for receptor files.
        lig_dir: str, output directory for ligand files.

    Returns:
        list: one ``[receptor, resname, rec_path, lig_path]`` entry per
        written pair.  Every element is a string wrapped in literal double
        quotes.
    """
    parsed_pdb = prody.parsePDB(pdb_path)
    parsed_header = prody.parsePDBHeader(pdb_path)

    ligands = [[chem.chain, str(chem.resnum), chem.resname]
               for chem in parsed_header['chemicals']]

    splited = []
    for chain, resnum, resname in ligands:
        lig = parsed_pdb.select('chain {} resnum {}'.format(chain, resnum))
        rec = parsed_pdb.select('not (chain {} resnum {})'.format(chain, resnum))
        if lig is None:
            continue

        # The residue-index / heavy-atom statistics computed here before
        # were never used (their only consumer is long commented out) and
        # relied on the Python 2-only ``iterator.next()``; they are gone.

        lig_name = '_'.join([receptor, chain, resnum, resname, 'ligand']) + '.pdb'
        lig_out_dir = os.path.join(lig_dir, receptor)
        if not os.path.exists(lig_out_dir):
            os.makedirs(lig_out_dir)
        lig_path = os.path.join(lig_out_dir, lig_name)
        prody.writePDB(lig_path, lig)

        rec_name = '_'.join([receptor, chain, resnum, resname, 'receptor']) + '.pdb'
        rec_out_dir = os.path.join(rec_dir, receptor)
        if not os.path.exists(rec_out_dir):
            os.makedirs(rec_out_dir)
        rec_path = os.path.join(rec_out_dir, rec_name)
        prody.writePDB(rec_path, rec)

        splited.append(['"' + receptor + '"',
                        '"' + str(resname) + '"',
                        '"' + rec_path + '"',
                        '"' + lig_path + '"'])
    return splited
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""
    # XML namespace prefix handed to dictElement() for protein-page results.
    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        # Try the file as an MSA first; if that fails, fall back to reading
        # it as plain text with all whitespace removed.
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))
    if len(seq) >= MINSEQLEN:
        # Long query: treated as a raw sequence and submitted to the HMMER
        # hmmscan service; this branch builds and returns its own result.
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")
        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)
        # The URL the request ends up at is reused, with XML output requested.
        url = urllib2.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))
        xml = openURL(url, timeout=timeout).read()
        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError("failed to parse results XML, check URL: " + url)
        matches = {}
        # One entry per "hits" element, keyed by the accession without its
        # version suffix; positional values come from the hit's first child.
        for child in root[0]:
            if child.tag == "hits":
                accession = child.get("acc")
                pfam_id = accession.split(".")[0]
                matches[pfam_id] = {}
                matches[pfam_id]["accession"] = accession
                matches[pfam_id]["class"] = "Domain"
                matches[pfam_id]["id"] = child.get("name")
                matches[pfam_id]["locations"] = {}
                matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom")
                matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore")
                matches[pfam_id]["locations"]["end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["evalue"] = child.get("evalue")
                matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0"
                matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto")
                matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom")
                matches[pfam_id]["locations"]["significant"] = child[0].get("significant")
                matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom")
                matches[pfam_id]["type"] = "Pfam-A"
        return matches
    else:
        if len(seq) <= 5:
            # Query of at most five characters: handled as a PDB id (first
            # four characters) plus an optional chain id (the remainder).
            idcode = None
            from prody import parsePDBHeader

            try:
                polymers = parsePDBHeader(seq[:4], "polymers")
            except Exception as err:
                LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                # Take the first UniProt reference of the requested chain,
                # or of the first chain having one when no chain was given.
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != "UniProt":
                            continue
                        idcode = dbref.idcode
                        LOGGER.info(
                            "UniProt ID code {0} for {1} chain "
                            "{2} will be used.".format(idcode, seq[:4], poly.chid)
                        )
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                # No UniProt mapping: fall back to querying with the raw text.
                LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
                url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
            else:
                url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"
        else:
            # Anything else is used directly as the protein identifier.
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    # Retry until a non-empty response arrives or the "_pfam" timer exceeds
    # the timeout; request errors are ignored between attempts.
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    # The service reports failures inside the response body.
    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError("failed to parse results XML, check URL: " + url)

    # NOTE(review): the first branch looks unreachable, because queries of
    # at least MINSEQLEN characters already returned above - confirm.
    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError("failed to parse results XML, check URL: " + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results["matches"]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:
        try:
            # Only the first seven characters of the accession are kept.
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        # Attributes of the first element seen for an accession are stored;
        # the sub-elements of every element are appended as locations.
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    # From here on *query* is only a label for the log message.
    if len(seq) < MINSEQLEN:
        query = "Query " + repr(query)
    else:
        query = "Query sequence"

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    # XML namespace prefix handed to dictElement() for protein-page results.
    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        # Try the file as an MSA first; if that fails, fall back to reading
        # it as plain text with all whitespace removed.
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        # Long query: treated as a raw sequence and submitted to the HMMER
        # hmmscan service; this branch builds and returns its own result.
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan',
                                  enc_params)
        # The URL the request ends up at is reused, with XML output requested.
        url = ( urllib2.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))
        xml = openURL(url, timeout=timeout).read()
        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)
        matches = {}
        # One entry per 'hits' element, keyed by the accession without its
        # version suffix; positional values come from the hit's first child.
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id]={}
                matches[pfam_id]['accession']=accession
                matches[pfam_id]['class']='Domain'
                matches[pfam_id]['id']=child.get('name')
                matches[pfam_id]['locations']={}
                matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
                matches[pfam_id]['locations']['end']=child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue']=child.get('evalue')
                matches[pfam_id]['locations']['evidence']='hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant']=child[0].get('significant')
                matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
                matches[pfam_id]['type']='Pfam-A'
        return matches
    else:
        if len(seq) <= 5:
            # Query of at most five characters: handled as a PDB id (first
            # four characters) plus an optional chain id (the remainder).
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                # Take the first UniProt reference of the requested chain,
                # or of the first chain having one when no chain was given.
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                # No UniProt mapping: fall back to querying with the raw text.
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' +
                       idcode + '?output=xml')
        else:
            # Anything else is used directly as the protein identifier.
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # Retry until a non-empty response arrives or the '_pfam' timer exceeds
    # the timeout; request errors are ignored between attempts.
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    # The service reports failures inside the response body.
    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    # NOTE(review): the first branch looks unreachable, because queries of
    # at least MINSEQLEN characters already returned above - confirm.
    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            # Only the first seven characters of the accession are kept.
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        # Attributes of the first element seen for an accession are stored;
        # the sub-elements of every element are appended as locations.
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    # From here on *query* is only a label for the log message.
    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        # Try the file as an MSA first; if that fails, fall back to reading
        # it as plain text with all whitespace removed.
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        # Long query: treated as a raw sequence, submitted to the EBI HMMER
        # hmmscan service and answered from its TSV download; this branch
        # returns its own result.
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)
        results_url = urllib2.urlopen(request).geturl()

        # Results are requested as TSV (an earlier 'output=xml' request was
        # disabled); the results URL is rewritten to its download variant.
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params
        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except:
            # Any failure to fetch the download is reported as "no matches".
            raise ValueError('No matching Pfam domains were found.')

        matches = {}
        # (A previous XML-based parser that filled the same keys from 'hits'
        # elements was disabled in favour of the TSV parsing below.)

        if PY3K:
            tsv = tsv.decode()

        # First line holds the column names.  The last element of the split
        # is skipped.  NOTE(review): presumably the empty string after a
        # trailing newline - confirm against a real response.
        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        # One entry per row, keyed by the accession without version suffix;
        # a later row with the same accession replaces the earlier one.
        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child[
                'Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        # NOTE(review): `prefix` is not assigned anywhere in this function;
        # presumably a module-level base URL - verify.
        if len(seq) <= 5:
            # Query of at most five characters: handled as a PDB id (first
            # four characters) plus an optional chain id (the remainder).
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                # Take the first UniProt reference of the requested chain,
                # or of the first chain having one when no chain was given;
                # its accession is remembered for the fallback further down.
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        accession = dbref.accession
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'.format(
                                        idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                # No UniProt mapping: fall back to querying with the raw text.
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'
        else:
            # Anything else is used directly as the protein identifier.
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # Poll until the response is something other than the 'PEND' / 'RUN'
    # status strings, or the '_pfam' timer exceeds the timeout; request
    # errors are ignored between attempts.
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    # The service reports failures inside the response body.
    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        # Fallback 1: retry with the UniProt accession taken from the PDB
        # header.  `accession` is only bound when a UniProt reference was
        # found above; otherwise the resulting error lands in the bare
        # except below.
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            # Fallback 2: parse the structure itself and search with its
            # sequence instead.
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    # NOTE(review): the first branch looks unreachable, because queries of
    # at least MINSEQLEN characters already returned above - confirm.
    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        # XML namespace key for dictElement(), built from the base URL.
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            # Only the first seven characters of the accession are kept.
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        # Attributes of the first element seen for an accession are stored;
        # the sub-elements of every element are appended as locations.
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    # From here on *query* is only a label for the log message.
    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    :raises ValueError: for an invalid sequence or option, or when the
        service response cannot be parsed
    :raises IOError: when no response was obtained within *timeout*

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    # XML namespace prefix handed to dictElement().
    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        # Try the file as an MSA first; if that fails, fall back to reading
        # it as plain text with all whitespace removed.
        try:
            seq = next(MSAFile(query))
        except Exception:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        # Long query: submit the raw sequence; the response names the URL
        # under which the results will become available.
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if not (ga == 1 or ga == 0):
                raise ValueError('ga must be either 0 or 1')

            # An explicit e-value cutoff takes precedence over the
            # gathering threshold switch.
            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        # Skipping Pfam-A forces the Pfam-B search on.
        if skip_a == 1:
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        # The next two statements were corrupted in the file (a run of '*'
        # had replaced the end of the first and the start of the second),
        # which made the whole function a syntax error; restored here.
        urlextension = urlextension + '&skipAs=' + str(skip_a)

        url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            # Query of at most five characters: handled as a PDB id (first
            # four characters) plus an optional chain id (the remainder).
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                # Take the first UniProt reference of the requested chain,
                # or of the first chain having one when no chain was given.
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                # No UniProt mapping: fall back to querying with the raw text.
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' +
                       idcode + '?output=xml')
        else:
            # Anything else is used directly as the protein identifier.
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # NOTE(review): with the early exit below disabled, this loop keeps
    # re-fetching until the '_pfam' timer exceeds the timeout and the last
    # response is used.  Left unchanged because the break was commented out
    # on purpose by an earlier author - confirm before re-enabling.
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        #else:
        #    if xml:
        #        break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    # The service reports failures inside the response body.
    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    # Sequence-search results and protein-page results nest the matches
    # differently.
    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            # Only the first seven characters of the accession are kept.
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        # Attributes of the first element seen for an accession are stored;
        # the sub-elements of every element are appended as locations.
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    # From here on *query* is only a label for the log message.
    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def blast(pdb_path):
    """Run ``blastp`` on the binding-site sequence of each ligand in a PDB file.

    For every chemical listed in the PDB header, residues of the remaining
    structure that lie around the ligand atoms are collected with a growing
    cutoff (starting at 4 and incremented by 1) until the chain ``'A'``
    sequence of the collected residues has at least 100 letters, or until
    the collected residues already cover everything outside the ligand.
    The sequence is written to ``sequence.fasta`` in a scratch directory and
    searched with ``blastp`` against ``BLASTDB``; the XML report is parsed
    and one row per hit is returned.

    :param pdb_path: path of the PDB file to process
    :return: list of rows ``[receptor, hit_id, identity, align_len,
        query_len, midline, hseq, sequence]`` (all but the last three
        converted with ``str``), where *receptor* is the file name of
        *pdb_path* without its extension
    """
    import shutil

    def _first_text(node, tag):
        # Text of the first descendant element of *node* named *tag*.
        return node.getElementsByTagName(tag)[0].childNodes[0].data

    cdir = os.getcwd()
    # Resolve an existing file before leaving the current directory so that
    # relative paths keep working once we are inside the scratch directory.
    if os.path.isfile(pdb_path):
        pdb_path = os.path.abspath(pdb_path)

    blast_result = []
    tdir = tempfile.mkdtemp()
    try:
        os.chdir(tdir)
        receptor = os.path.basename(os.path.splitext(pdb_path)[0])
        pdb_header = prody.parsePDBHeader(pdb_path)
        pdb_atoms = prody.parsePDB(pdb_path)

        ligands = [(chem.chain, str(chem.resnum))
                   for chem in pdb_header['chemicals']]

        for chain, resnum in ligands:
            rec = pdb_atoms.select(
                'not (chain {} resnum {})'.format(chain, resnum))
            ligand = pdb_atoms.select(
                'chain {} resnum {}'.format(chain, resnum))
            lig_coords = ligand.getCoords()
            print('lig_size', len(lig_coords))

            rec_size = rec.numAtoms()
            pocket = None  # running union of everything selected so far
            sequence = ''
            radius = 4
            while len(sequence) < 100:
                for center in lig_coords:
                    around_atoms = rec.select(
                        'same residue as within {} of center'.format(radius),
                        center=center)
                    if around_atoms is None:
                        continue
                    if pocket is None:
                        pocket = around_atoms
                    else:
                        pocket = pocket | around_atoms
                if pocket is not None:
                    # NOTE(review): chain 'A' is hard-coded here, as in the
                    # original; structures without a chain 'A' will fail.
                    sequence = pocket.getHierView()['A'].getSequence()
                    print('sequence', radius, len(sequence), sequence)
                    if pocket.numAtoms() >= rec_size:
                        # Nothing left to add: a larger radius cannot make
                        # the sequence longer, so stop instead of looping
                        # forever on small structures.
                        break
                radius += 1

            with open('sequence.fasta', 'w') as fout:
                fout.write(">receptor\n" + sequence + '\n')

            cmd = ('blastp -db {} -query sequence.fasta -outfmt 5 '
                   '-out result').format(BLASTDB)
            proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            # communicate() drains the pipe while waiting; wait() alone can
            # block forever if the child fills the pipe buffer.
            proc.communicate()

            dtree = xml.dom.minidom.parse("result")
            for hit in dtree.documentElement.getElementsByTagName("Hit"):
                hit_id = _first_text(hit, 'Hit_id')
                hsps = hit.getElementsByTagName('Hit_hsps')[0]
                identity = _first_text(hsps, 'Hsp_identity')
                align_len = _first_text(hsps, 'Hsp_align-len')
                hseq = _first_text(hsps, 'Hsp_hseq')
                midline = _first_text(hsps, 'Hsp_midline')
                blast_result.append([
                    receptor, hit_id,
                    str(identity),
                    str(align_len),
                    str(len(sequence)), midline, hseq, sequence
                ])
    finally:
        # Always give the caller its working directory back and drop the
        # scratch files, even when parsing or blastp fails.
        os.chdir(cdir)
        shutil.rmtree(tdir, ignore_errors=True)
    return blast_result
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    :returns: dictionary keyed by Pfam accession.  For sequence queries each
        value holds ``'accession'``, ``'class'``, ``'id'``, ``'type'`` and a
        ``'locations'`` dictionary; for identifier queries each value holds
        the attributes of the match element plus a ``'locations'`` list.
        **None** is returned when the server reports a system error.

    :raises ValueError: when the query is not a valid sequence or the
        results cannot be parsed
    :raises IOError: when no results are retrieved before the timeout"""

    # urllib was reorganised in Python 3; resolve the three helpers locally
    # so the sequence search works on either major version.
    try:
        from urllib.parse import urlencode
        from urllib.request import Request, urlopen
    except ImportError:  # Python 2
        from urllib import urlencode
        from urllib2 import Request, urlopen

    # The cElementTree alias was removed in Python 3.9.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # Not an alignment file: treat the content as a raw sequence.
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        # Sequence query: submit to hmmscan and read the XML result page.
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        # POST data must be bytes on Python 3.
        enc_params = urlencode(parameters).encode('utf-8')
        request = Request('http://hmmer.janelia.org/search/hmmscan',
                          enc_params)

        url = urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        content = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(content)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        matches = {}
        for child in root[0]:
            if child.tag != 'hits':
                continue
            accession = child.get('acc')
            pfam_id = accession.split('.')[0]
            domain = child[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child.get('name'),
                'locations': {
                    'ali_end': domain.get('alisqto'),
                    'ali_start': domain.get('alisqfrom'),
                    'bitscore': domain.get('bitscore'),
                    'end': domain.get('alisqto'),
                    'evalue': child.get('evalue'),
                    'evidence': 'hmmer v3.0',
                    'hmm_end': domain.get('alihmmto'),
                    'hmm_start': domain.get('alihmmfrom'),
                    'significant': domain.get('significant'),
                    'start': domain.get('alisqfrom'),
                },
                'type': 'Pfam-A',
            }
        return matches

    # Identifier query (UniProt ID or PDB identifier).
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'
                        .format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
        else:
            url = ('http://pfam.xfam.org/protein/' +
                   idcode + '?output=xml')
    else:
        url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    content = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            content = openURL(url, timeout=timeout).read()
        except Exception:
            # Deliberate best effort: keep retrying until the timeout.
            pass
        else:
            if content:
                break

    if not content:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if content.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(content)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    # Sequence queries returned above, so only the identifier layout of the
    # results document has to be handled here.
    results = dictElement(root[0], prefix)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = 'Query ' + repr(query)

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def split_pdb(uid, pdb_file, init="split_pdb_init"):
    """Split a PDB file into (ligand, binding site) file pairs.

    Iterates through every chemical listed in the header of the PDB file
    (frequently there are a few).  For each ligand, selects the residues of
    the rest of the structure (water excluded) that have any atom within
    ``init.cutoff_dist`` of any ligand atom, and writes two files under
    ``init.db_root/init.split_dir/<pair_name>/``: ``<pair_name>_ligand.pdb``
    and ``<pair_name>_bindsite.pdb``, where ``pair_name`` is
    ``uid_chain_resnum_resname``.

    Ligands with fewer than ``init.min_lig_atoms`` atoms, ligands that are
    absent from the coordinate set, and ligands with no residue within the
    cutoff are skipped without writing anything.

    Example::

        split_pdb('105M', 'download/105M.pdb')

    :param uid: string used as the first component of every pair name
    :param pdb_file: string (path of the file to split, relative to
        ``init.db_root``)
    :param init: string (name of the configuration object in this module;
        the attributes ``db_root``, ``split_dir``, ``cutoff_dist``,
        ``min_lig_atoms``, ``min_rec_atoms`` and ``discard_h`` are read
        from it)
    :return: nested list, one entry per written pair:
        ``[pair_name, lig_file, bindsite_file, lig_num_atoms,
        bindsite_num_atoms]`` with file paths relative to ``init.db_root``
    :raises AssertionError: when the structure has too few atoms or the
        binding site and the ligand share a residue
    """
    # NOTE(review): eval() resolves the configuration object by name; never
    # pass a string that comes from an untrusted source.
    init = eval(init)
    pdb_path = os.path.join(init.db_root, pdb_file)

    # parse PDB file
    pr_pdb = pr.parsePDB(pdb_path)
    assert pr_pdb.numAtoms() > (init.min_lig_atoms+init.min_rec_atoms), \
        "not enough atoms in this pdb" + str(pr_pdb.numAtoms())

    # retrieve the chemicals from the header of the PDB file
    pr_header = pr.parsePDBHeader(pdb_path)
    ligs = [(chem.chain, str(chem.resnum), chem.resname)
            for chem in pr_header['chemicals']]

    out_filenames = []
    for lig_chain, lig_resnum, lig_resname in ligs:
        if init.discard_h:
            lig = pr_pdb.select('noh chain {} and resnum {}'.format(
                lig_chain, lig_resnum))
            rec = pr_pdb.select(
                'noh and not water not (chain {} resnum {})'.format(
                    lig_chain, lig_resnum))
        else:
            lig = pr_pdb.select('chain {} resnum {}'.format(
                lig_chain, lig_resnum))
            rec = pr_pdb.select('not water not (chain {} resnum {})'.format(
                lig_chain, lig_resnum))

        # select() returns None when nothing matches: a ligand listed in the
        # header but missing from the coordinates has zero atoms.
        if lig is None or rec is None:
            continue

        # skip without writing anything if the ligand is too small
        if lig.numAtoms() < init.min_lig_atoms:
            continue

        # collect the unique (segment, chain, residue number) triples of the
        # binding site
        site_residues = set()
        for atom_coord in lig.getCoords():
            around_atoms = rec.select(
                'same residue as within {} of center'.format(init.cutoff_dist),
                center=atom_coord)
            if around_atoms is None:
                # no receptor atom near this particular ligand atom
                continue
            site_residues.update(zip(around_atoms.getSegindices(),
                                     around_atoms.getChids(),
                                     around_atoms.getResnums()))
        if not site_residues:
            # nothing within the cutoff: there is no binding site to save
            continue

        # proofcheck that ligand and receptor residues do not overlap
        lig_residues = set(zip(lig.getSegindices(),
                               lig.getChids(),
                               lig.getResnums()))
        assert not (site_residues & lig_residues), \
            "broken selection: binding site and ligand overlap"

        # select the receptor atoms to save; the order of the clauses does
        # not matter, sorting only keeps the selection string deterministic
        prody_cmd = " or ".join(
            "(segindex {} chid {} resnum {})".format(seg, chid, resnum)
            for seg, chid, resnum in sorted(site_residues))
        binding_site = rec.select(prody_cmd)

        # write the ligand file and the binding site file
        pair_name = '_'.join(
            [uid, str(lig_chain), str(lig_resnum), str(lig_resname)])
        lig_name = pair_name + "_ligand.pdb"
        bindsite_name = pair_name + "_bindsite.pdb"
        os.makedirs(os.path.join(init.db_root, init.split_dir, pair_name))
        lig_outpath = os.path.join(init.split_dir, pair_name, lig_name)
        bindsite_outpath = os.path.join(init.split_dir, pair_name,
                                        bindsite_name)
        pr.writePDB(os.path.join(init.db_root, lig_outpath), lig)
        pr.writePDB(os.path.join(init.db_root, bindsite_outpath), binding_site)
        out_filenames.append([
            pair_name, lig_outpath, bindsite_outpath,
            lig.numAtoms(),
            binding_site.numAtoms()
        ])
    return out_filenames
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    :returns: dictionary keyed by Pfam accession.  For sequence queries each
        value holds ``'accession'``, ``'class'``, ``'id'``, ``'type'`` and a
        ``'locations'`` dictionary built from the hmmscan TSV download; for
        identifier queries each value holds the attributes of the match
        element plus a ``'locations'`` list.  **None** is returned when the
        server reports a system error.

    :raises ValueError: when the query is not a valid sequence, no valid
        UniProt accession can be found, or the results cannot be parsed
    :raises IOError: when no results are retrieved before the timeout"""

    # The cElementTree alias was removed in Python 3.9.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # Not an alignment file: treat the content as a raw sequence.
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        # Sequence query: submit to hmmscan and download the hits as TSV.
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = (results_url.replace('results', 'download') +
                            '?' + enc_res_params)

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        tsv = urllib2.urlopen(result_request).read()
        # read() yields bytes on Python 3; the table is parsed as text.
        if not isinstance(tsv, str):
            tsv = tsv.decode('utf-8')

        matches = {}
        # Blank lines are ignored so the last hit is kept whether or not the
        # response ends with a newline.
        rows = [line for line in tsv.splitlines() if line.strip()]
        if not rows:
            return matches

        keys = rows[0].split('\t')
        for line in rows[1:]:
            child = dict(zip(keys, line.split('\t')))
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            # A later row for the same family replaces the earlier one.
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child['Family id'],
                'locations': {
                    'ali_end': child['Ali. End'],
                    'ali_start': child['Ali. Start'],
                    'bitscore': child['Bit Score'],
                    'end': child['Env. End'],
                    'cond_evalue': child['Cond. E-value'],
                    'ind_evalue': child['Ind. E-value'],
                    'evidence': 'hmmer v3.0',
                    'hmm_end': child['Model End'],
                    'hmm_start': child['Model Start'],
                    'start': child['Env. Start'],
                },
                'type': 'Pfam-A',
            }
        return matches

    # Identifier query (UniProt ID or PDB identifier).
    # Stays None unless a UniProt reference is found in a PDB header; the
    # fallback below then goes straight to the sequence-based search.
    accession = None
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'
                        .format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = prefix + 'protein/' + seq + '?output=xml'
        else:
            url = prefix + 'protein/' + idcode + '?output=xml'
    else:
        url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    content = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            content = openURL(url, timeout=timeout).read()
        except Exception:
            # Deliberate best effort: keep retrying until the timeout.
            pass
        else:
            # The payload is bytes (see the b'...' searches below), so the
            # job-status markers are compared as bytes too.
            if content not in (b'PEND', b'RUN'):
                break

    if not content:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if content.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    elif content.find(b'No valid UniProt accession or ID') > 0:
        try:
            # Retry with the accession taken from the PDB header, if any.
            url = prefix + 'protein/' + accession + '?output=xml'
            content = openURL(url, timeout=timeout).read()
        except Exception:
            try:
                # Last resort: search with the sequence of the structure.
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq, **kwargs)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(content)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    # Sequence queries returned above, so only the identifier layout of the
    # results document has to be handled here.
    key = '{' + prefix + '}'
    results = dictElement(root[0], key)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = 'Query ' + repr(query)

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def split_ligand(bucket, table_idx, param, input_data):
    '''
    Split ligands from a PDB file.

    Parses the PDB header, takes the ligand records under the key
    'chemicals', writes every ligand to its own PDB file named
    ``<receptor>_<chain>_<resnum>_<resname>_ligand.pdb`` and inserts one
    record per ligand into the database table.

    A successful record is
    ``[receptor, chain, resnum, resname, resid, heavy_atom,
    max_size_on_axis, 1, 'success']``; when processing a ligand fails the
    record is ``[receptor, chain, resnum, resname, 0, 0, 0, 0, message]``.

    Args:
        bucket: passed through to ``db.insert`` as ``bucket``
        table_idx: id for split ligand table
        param: dict
            {
                'output_folder':'...',
                'input_download_folder':'...',
            }
        input_data: [str], (str) or str
                    ['1a2b'] ,('3eml') ,'3eln'

    Returns:
        None

    Raises:
        Exception: any error outside the per-ligand handling is printed and
            re-raised as a plain ``Exception`` carrying the same message
    '''
    try:
        # a one-element list / tuple is unwrapped to the PDB name itself
        if isinstance(input_data, (tuple, list)):
            input_data = input_data[0]
        receptor = input_data

        output_folder = '{}_{}'.format(table_idx, param['output_folder'])
        input_download_folder = param['input_download_folder']

        pdb_dir = os.path.join(data_dir, input_download_folder)
        pdb_path = os.path.join(pdb_dir, receptor + '.pdb')

        parsed_pdb = prody.parsePDB(pdb_path)
        parsed_header = prody.parsePDBHeader(pdb_path)

        output_lig_dir = os.path.join(data_dir, output_folder, receptor)
        _makedir(output_lig_dir)

        ligands = [(chem.chain, str(chem.resnum), chem.resname)
                   for chem in parsed_header['chemicals']]

        for chain, resnum, resname in ligands:
            try:
                lig = parsed_pdb.select('chain {} resnum {}'.format(
                    chain, resnum))
                # built-in next() works on Python 2 and 3, whereas the
                # iterator's .next() method exists on Python 2 only
                first_residue = next(lig.getHierView().iterResidues())
                resid = str(first_residue.getResindex())

                heavy_lig = lig.select('not hydrogen')
                heavy_atom = heavy_lig.numAtoms()
                heavy_coord = heavy_lig.getCoords()
                # largest pairwise distance between heavy atoms
                # NOTE(review): with a single heavy atom there are no pairs,
                # so max() raises and the ligand is recorded as a failure;
                # confirm whether that is intended.
                max_size_on_axis = max(
                    scipy.spatial.distance.pdist(heavy_coord).tolist())

                lig_name = '_'.join(
                    [receptor, chain, resnum, resname, 'ligand']) + '.pdb'
                prody.writePDB(os.path.join(output_lig_dir, lig_name), lig)

                record = [
                    receptor, chain, resnum, resname, resid, heavy_atom,
                    max_size_on_axis, 1, 'success'
                ]
                db.insert(table_idx, [record], bucket=bucket)
            except Exception as e:
                # keep going with the other ligands; store the failure text
                record = [receptor, chain, resnum, resname, 0, 0, 0, 0, str(e)]
                db.insert(table_idx, [record], bucket=bucket)

    except Exception as e:
        print(e)
        raise Exception(str(e))