def _processPDB(self):
    """Collect the relevant atoms of the first PDB model into a dataframe.

    Iterates over the chains of ``self._pdb[0]`` and records one row per atom
    for every non-water residue that belongs to a peptide chain (a chain for
    which ``CaPPBuilder`` can build at least one polypeptide) or to a chain
    that has been tagged as phosphate chain (a chain containing a PTR, TPO or
    SEP residue).  Phospho-residues without an atom named ``P`` are dropped.

    Sets ``self._df`` (atom table), ``self._peptideChains`` (chain id ->
    sequence string of the first polypeptide) and ``self._phosphateChains``
    (set of chain ids).

    Raises:
        Exception: if no atom rows were collected at all.
    """
    self._logger.info("Processing PDB")
    phospho_resnames = ('PTR', 'TPO', 'SEP')
    ppb = CaPPBuilder()
    d = []
    peptide_chains = {}
    phosphate_chains = set()

    # Loop over all chains
    for chain_idx, chain in enumerate(self._pdb[0]):
        isPeptideChain = False
        isPhosphateChain = False

        # try to create peptide sequence
        pp = ppb.build_peptides(chain)
        if pp:
            # tag chain as peptide chain
            isPeptideChain = True
            # str() works on every Biopython release; Seq.tostring() was
            # removed from current ones.
            peptide_chains[chain.get_id()] = str(pp[0].get_sequence())

        # loop over residues in chain
        for residue in chain:
            resn = residue.get_resname()
            isPhosphoResidue = resn in phospho_resnames
            if isPhosphoResidue:
                # Chain contains a phospho-residue; tag as phosphateChain.
                # NOTE(review): residues that precede the first
                # phospho-residue of a non-peptide chain have already been
                # skipped at this point.
                isPhosphateChain = True
                phosphate_chains.add(chain.get_id())

            # process atoms only if residue is not water and is part of a
            # peptide or phospho chain
            if residue.get_id()[0] == 'W' or not (isPeptideChain or isPhosphateChain):
                continue

            resi = residue.get_id()[1]
            inscode = residue.get_id()[2].strip()
            hasPhosphate = False
            # Remember where this residue's rows start so that they can be
            # removed again without relying on len(residue) (a slice of
            # d[:-0] would have emptied the whole list).
            first_row = len(d)
            for atom in residue:
                vdw = -1  # sentinel: no van der Waals radius available
                if isPeptideChain:
                    elem = atom.element
                    if elem:
                        # normalise e.g. 'FE' -> 'Fe' for the periodic table lookup
                        elem = elem if len(elem) == 1 else elem[0] + elem[1].lower()
                        vdw = self._periodicTable.GetRvdw(
                            self._periodicTable.GetAtomicNumber(elem))
                coords = atom.get_coord()
                sn = atom.get_serial_number()
                # append to dataframe
                d.append((chain.get_id().strip(), chain_idx, resn, resi,
                          inscode, sn, atom.get_name(), isPeptideChain,
                          isPhosphateChain, coords[0], coords[1], coords[2],
                          vdw))
                if isPhosphoResidue and atom.get_name().strip() == 'P' and len(coords) == 3:
                    hasPhosphate = True

            if isPhosphoResidue and not hasPhosphate:
                # del residue because no annotated phosphate
                del d[first_row:]

    if not d:
        raise Exception('No amino acids found.')

    # save list to dataframe
    # 'S<n>' is the canonical spelling of the fixed-width bytes dtype that the
    # deprecated 'a<n>' alias referred to.
    # NOTE(review): chain_idx is an integer stored in a 1-byte string field,
    # so indices >= 10 presumably get truncated -- confirm before widening.
    data = np.zeros((len(d), ),
                    dtype=[('chain', 'S1'), ('chain_idx', 'S1'),
                           ('resn', 'S3'), ('resi', 'i4'),
                           ('inscode', 'S1'), ('sn', 'i4'), ('an', 'S4'),
                           ('peptideChain', 'b'), ('phosphateChain', 'b'),
                           ('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
                           ('vdw', 'f4')])
    data[:] = d
    idf = pd.DataFrame(data)
    idf[['peptideChain', 'phosphateChain']] = \
        idf[['peptideChain', 'phosphateChain']].astype('bool')

    self._df = idf
    self._peptideChains = peptide_chains
    self._phosphateChains = phosphate_chains
def __init__(self, model, radius=12.0, offset=0):
    """Count, for every residue, the CA atoms near its own CA atom.

    Polypeptides are built from ``model`` with ``CaPPBuilder``.  For each
    amino-acid residue that has a CA atom, the CA atoms of all other
    amino-acid residues closer than ``radius`` are counted.  The count is
    stored under the key ``(chain_id, res_id)`` in the property map and in
    ``residue.xtra['EXP_CN']``.

    :param model: the model that contains the residues
    :type model: L{Model}

    :param radius: radius of the sphere (centred at the CA atom)
    :type radius: float

    :param offset: number of flanking residues (within the same
        polypeptide) that are ignored when counting neighbours
    :type offset: int
    """
    assert(offset >= 0)
    peptides = CaPPBuilder().build_peptides(model)
    count_by_key = {}
    residue_counts = []
    ordered_keys = []
    for peptide in peptides:
        for pos, residue in enumerate(peptide):
            if not (is_aa(residue) and residue.has_id('CA')):
                continue
            centre = residue['CA']
            neighbours = 0
            for other_peptide in peptides:
                for other_pos, other in enumerate(other_peptide):
                    # flanking residues of the same polypeptide are ignored
                    if other_peptide is peptide and abs(pos - other_pos) <= offset:
                        continue
                    if not (is_aa(other) and other.has_id('CA')):
                        continue
                    # Atom - Atom yields the distance between both atoms
                    if (other['CA'] - centre) < radius:
                        neighbours += 1
            key = (residue.get_parent().get_id(), residue.get_id())
            # Fill the 3 data structures
            count_by_key[key] = neighbours
            residue_counts.append((residue, neighbours))
            ordered_keys.append(key)
            # Add to xtra
            residue.xtra['EXP_CN'] = neighbours
    AbstractPropertyMap.__init__(self, count_by_key, ordered_keys, residue_counts)
def __init__(self, model, radius=12.0, offset=0):
    """
    Compute the coordination number of each residue: the number of CA
    atoms found within C{radius} of the residue's own CA atom.

    Each result is stored under the key C{(chain_id, res_id)} and in
    C{residue.xtra['EXP_CN']}.

    @param model: the model that contains the residues
    @type model: L{Model}

    @param radius: radius of the sphere (centred at the CA atom)
    @type radius: float

    @param offset: number of flanking residues that are ignored in the
        calculation of the number of neighbors
    @type offset: int
    """
    assert (offset >= 0)
    builder = CaPPBuilder()
    polypeptides = builder.build_peptides(model)
    exposure_map = {}
    exposure_list = []
    exposure_keys = []
    for current_pp in polypeptides:
        for idx, res in enumerate(current_pp):
            if not is_aa(res) or not res.has_id('CA'):
                continue
            own_ca = res['CA']
            n_close = 0
            for candidate_pp in polypeptides:
                same_pp = current_pp is candidate_pp
                for cand_idx, cand in enumerate(candidate_pp):
                    if same_pp and abs(idx - cand_idx) <= offset:
                        # sequence neighbours inside the offset window
                        continue
                    if not is_aa(cand) or not cand.has_id('CA'):
                        continue
                    distance = cand['CA'] - own_ca
                    if distance < radius:
                        n_close += 1
            residue_id = res.get_id()
            chain_id = res.get_parent().get_id()
            # Fill the 3 data structures
            exposure_map[(chain_id, residue_id)] = n_close
            exposure_list.append((res, n_close))
            exposure_keys.append((chain_id, residue_id))
            # Add to xtra
            res.xtra['EXP_CN'] = n_close
    AbstractPropertyMap.__init__(self, exposure_map, exposure_keys, exposure_list)
def __init__(self, model, radius, offset=0, hse_up_key='HSE_U', hse_down_key='HSE_D', angle_key=None, check_chain_breaks=False, check_knots=False, receptor=None, signprot=None):
    """
    Compute half-sphere exposure (HSE) for every residue and, on top of
    that, collect atom clashes, optional chain-break residue numbers and
    optional knot remodelling ranges.

    Results are kept on the instance: C{self.clash_pairs},
    C{self.chain_breaks} and (only when C{check_knots} is truthy)
    C{self.remodel_resis}. HSE values are also written to each residue's
    C{xtra} dictionary.

    @param model: model
    @type model: L{Model}

    @param radius: HSE radius
    @type radius: float

    @param offset: number of flanking residues that are ignored in the
        calculation of the number of neighbors
    @type offset: int

    @param hse_up_key: key used to store HSEup in the entity.xtra attribute
    @type hse_up_key: string

    @param hse_down_key: key used to store HSEdown in the entity.xtra attribute
    @type hse_down_key: string

    @param angle_key: key used to store the angle between CA-CB and CA-pCB
        in the entity.xtra attribute
    @type angle_key: string

    @param check_chain_breaks: when True, residue numbers of amino acids
        that are in the model but in none of the built polypeptides are
        collected in C{self.chain_breaks}
    @type check_chain_breaks: bool

    @param check_knots: when truthy, the residue pairs returned by
        C{PossibleKnots(receptor, signprot).get_resnums()} are checked
        against the upper half-sphere neighbours
    @type check_knots: bool

    @param receptor: passed through to C{PossibleKnots}
    @param signprot: passed through to C{PossibleKnots}
    """
    assert(offset>=0)
    # For PyMOL visualization
    self.ca_cb_list=[]
    ppb=CaPPBuilder()
    ppl=ppb.build_peptides(model)
    hse_map={}
    hse_list=[]
    hse_keys=[]
    ### GP
    # Anything whose id is not 0 is replaced by its first child (model 0).
    # The polypeptides above were already built from the original argument.
    if model.get_id()!=0:
        model = model[0]
    residues_in_pdb,residues_with_proper_CA=[],[]
    if check_chain_breaks==True:
        # Residue numbers of every amino acid present in the model
        for chain in model:
            for res in chain:
                if is_aa(res):
                    residues_in_pdb.append(res.get_id()[1])
    self.clash_pairs = []
    self.chain_breaks = []
    if check_knots:
        possible_knots = PossibleKnots(receptor, signprot)
        knot_resis = possible_knots.get_resnums()
        self.remodel_resis = {}
    for pp1 in ppl:
        for i in range(0, len(pp1)):
            # Every residue that made it into a polypeptide counts as "proper"
            residues_with_proper_CA.append(pp1[i].get_id()[1])
            if i==0:
                r1=None
            else:
                r1=pp1[i-1]
            r2=pp1[i]
            if i==len(pp1)-1:
                r3=None
            else:
                r3=pp1[i+1]
            # This method is provided by the subclasses to calculate HSE
            result=self._get_cb(r1, r2, r3)
            if result is None:
                # Missing atoms, or i==0, or i==len(pp1)-1
                continue
            pcb, angle=result
            hse_u=0
            hse_d=0
            ca2=r2['CA'].get_vector()
            residue_up=[]   ### GP
            residue_down=[] ### GP
            for pp2 in ppl:
                for j in range(0, len(pp2)):
                    # The try/except is used as control flow: when the residue
                    # numbering around r2 is NOT consecutive the offset filter
                    # below is bypassed; when it is consecutive the deliberate
                    # `raise Exception` routes into the except branch, which
                    # applies the offset filter.
                    # NOTE(review): the bare except also catches the
                    # AttributeError raised when r1 or r3 is None, in which
                    # case the offset filter is applied as well.
                    try:
                        if r2.get_id()[1]-1!=r1.get_id()[1] or r2.get_id()[1]+1!=r3.get_id()[1]:
                            pass
                        else:
                            raise Exception
                    except:
                        if pp1 is pp2 and abs(i-j)<=offset:
                            # neighboring residues in the chain are ignored
                            continue
                    ro=pp2[j]
                    if not is_aa(ro) or not ro.has_id('CA'):
                        continue
                    cao=ro['CA'].get_vector()
                    d=(cao-ca2)
                    if d.norm()<radius:
                        if d.angle(pcb)<(math.pi/2):
                            hse_u+=1
                            ### GP
                            # Keep the residues found in the upper half sphere
                            residue_up.append(ro)
                            ### end of GP code
                        else:
                            hse_d+=1
                            ### GP
                            # Keep the residues found in the lower half sphere
                            residue_down.append(ro)
                            ### end of GP code
            res_id=r2.get_id()
            chain_id=r2.get_parent().get_id()
            # Fill the 3 data structures
            # NOTE(review): no AbstractPropertyMap.__init__ call is visible in
            # this method, so these three containers stay local -- confirm
            # whether that is intended.
            hse_map[(chain_id, res_id)]=(hse_u, hse_d, angle)
            hse_list.append((r2, (residue_up, residue_down, hse_u, hse_d, angle))) ### GP residue_up and residue_down added to hse_list
            hse_keys.append((chain_id, res_id))
            # Add to xtra
            r2.xtra[hse_up_key]=hse_u
            r2.xtra[hse_down_key]=hse_d
            if angle_key:
                r2.xtra[angle_key]=angle

            ### GP checking for knots
            # Each knot entry is indexed as knot[0] = (chain id, residue number)
            # and knot[1] = (chain id, residue numbers).
            if check_knots:
                for knot in knot_resis:
                    if knot[0][1]==pp1[i].get_id()[1] and knot[0][0]==pp1[i].get_parent().get_id():
                        print(pp1[i].get_parent().get_id(),pp1[i])
                        for r in residue_up:
                            if r.get_parent().get_id()==knot[1][0] and r.get_id()[1] in knot[1][1]:
                                print('close: ', r.get_parent().get_id(),r)
                                # Remember first and last residue number of the
                                # range, once per chain
                                resi_range = [knot[1][1][0], knot[1][1][-1]]
                                if knot[1][0] not in self.remodel_resis:
                                    self.remodel_resis[knot[1][0]] = [resi_range]
                                else:
                                    if resi_range not in self.remodel_resis[knot[1][0]]:
                                        self.remodel_resis[knot[1][0]].append(resi_range)

            ### GP checking for atom clashes
            # include_prev / include_next become True when the neighbouring
            # residue in the polypeptide is not numbered consecutively.
            # NOTE(review): for i == 0, pp1[i-1] is the LAST residue of the
            # polypeptide (negative index); for the last residue pp1[i+1]
            # raises IndexError, which leaves include_next False.
            include_prev, include_next = False, False
            try:
                if pp1[i].get_id()[1]-1!=pp1[i-1].get_id()[1]:
                    include_prev = True
            except:
                include_prev = False
            try:
                if pp1[i].get_id()[1]+1!=pp1[i+1].get_id()[1]:
                    include_next = True
            except:
                include_next = False
            for atom in pp1[i]:
                ref_vector = atom.get_vector()
                for other_res in residue_up:
                    # Control flow via exceptions again: direct, consecutively
                    # numbered neighbours are skipped with `continue`; every
                    # other residue (and any IndexError from pp1[i+1]) ends up
                    # in the except branch, which performs the clash test.
                    try:
                        if other_res==pp1[i-1] and include_prev==False:
                            continue
                        elif len(pp1)>=i+1 and other_res==pp1[i+1] and include_next==False:
                            continue
                        else:
                            raise Exception
                    except:
                        for other_atom in other_res:
                            other_vector = other_atom.get_vector()
                            d = other_vector-ref_vector
                            # Atoms closer than 2 (coordinate units) count as a clash
                            if d.norm()<2:
                                # NOTE(review): appending '0' and converting
                                # back to float appears to leave the numeric
                                # value unchanged; a B-factor whose str() has
                                # no '.' would raise IndexError here.
                                if len(str(pp1[i]['CA'].get_bfactor()).split('.')[1])==1:
                                    clash_res1 = float(str(pp1[i]['CA'].get_bfactor())+'0')
                                else:
                                    clash_res1 = pp1[i]['CA'].get_bfactor()
                                if len(str(other_res['CA'].get_bfactor()).split('.')[1])==1:
                                    clash_res2 = float(str(other_res['CA'].get_bfactor())+'0')
                                else:
                                    clash_res2 = other_res['CA'].get_bfactor()
                                self.clash_pairs.append([(clash_res1, pp1[i].get_id()[1]), (clash_res2, other_res.get_id()[1])])
    if check_chain_breaks==True:
        # Amino acids that never appeared in a polypeptide are reported as breaks
        for r in residues_in_pdb:
            if r not in residues_with_proper_CA:
                self.chain_breaks.append(r)
def __init__(self, model, radius, offset, hse_up_key, hse_down_key,
             angle_key=None):
    """
    Compute the half-sphere exposure of every residue in C{model}.

    For each residue the subclass hook C{_get_cb} supplies a direction
    vector and an angle. CA atoms of other amino-acid residues within
    C{radius} are then counted separately for the half sphere the vector
    points into (HSE-up) and the opposite one (HSE-down).

    @param model: model
    @type model: L{Model}

    @param radius: HSE radius
    @type radius: float

    @param offset: number of flanking residues that are ignored in the
        calculation of the number of neighbors
    @type offset: int

    @param hse_up_key: key used to store HSEup in the entity.xtra attribute
    @type hse_up_key: string

    @param hse_down_key: key used to store HSEdown in the entity.xtra
        attribute
    @type hse_down_key: string

    @param angle_key: key used to store the angle between CA-CB and CA-pCB
        in the entity.xtra attribute
    @type angle_key: string
    """
    assert (offset >= 0)
    # For PyMOL visualization
    self.ca_cb_list = []
    peptides = CaPPBuilder().build_peptides(model)
    exposure_by_key = {}
    exposure_entries = []
    exposure_keys = []
    half_pi = pi / 2
    for peptide in peptides:
        last = len(peptide) - 1
        for pos, centre in enumerate(peptide):
            before = peptide[pos - 1] if pos != 0 else None
            after = peptide[pos + 1] if pos != last else None
            # This method is provided by the subclasses to calculate HSE
            cb_result = self._get_cb(before, centre, after)
            if cb_result is None:
                # Missing atoms, or first/last residue of the polypeptide
                continue
            pcb, angle = cb_result
            n_up = 0
            n_down = 0
            centre_ca = centre['CA'].get_vector()
            for other_peptide in peptides:
                for other_pos, other in enumerate(other_peptide):
                    if peptide is other_peptide and abs(pos - other_pos) <= offset:
                        # neighboring residues in the chain are ignored
                        continue
                    if not (is_aa(other) and other.has_id('CA')):
                        continue
                    delta = other['CA'].get_vector() - centre_ca
                    if delta.norm() < radius:
                        if delta.angle(pcb) < half_pi:
                            n_up += 1
                        else:
                            n_down += 1
            key = (centre.get_parent().get_id(), centre.get_id())
            # Fill the 3 data structures
            exposure_by_key[key] = (n_up, n_down, angle)
            exposure_entries.append((centre, (n_up, n_down, angle)))
            exposure_keys.append(key)
            # Add to xtra
            centre.xtra[hse_up_key] = n_up
            centre.xtra[hse_down_key] = n_down
            if angle_key:
                centre.xtra[angle_key] = angle
    AbstractPropertyMap.__init__(self, exposure_by_key, exposure_keys,
                                 exposure_entries)
def main(argv=None):  # IGNORE:C0111
    """Build the residue/ligand distance table for a local PDB mirror.

    Steps, in order:

    1. Parse the command line and validate that the PDB directory and the
       output directories exist (exits with status 1 otherwise).
    2. Load the list of already processed PDB codes (``--procesados``).
    3. Build (or reload) the list of PDB codes containing a ligand residue
       (``--pdbs_with_drug``).
    4. Load the hmmscan domain table (``--domains``), caching a reduced
       copy next to it with a ``2`` suffix.
    5. For every not yet processed PDB with a ligand, append the atom
       distances below ``--dist`` to ``--distances``.

    :param argv: extra arguments; when given they are appended to
        ``sys.argv`` before parsing.
    """
    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument("-db", "--database_name", default='pdb')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument("--procesados",
                        default='/tmp/pdbs_dist_procesados.txt')
    parser.add_argument("--domains",
                        default='/data/databases/pdb/processed/dns_pdbs.tlb')
    parser.add_argument(
        "--seqs", default='/data/databases/pdb/processed/pdb_seq_res.fasta')
    parser.add_argument("--pdbs", default='/data/databases/pdb/')
    parser.add_argument(
        "--distances",
        default='/data/databases/pdb/processed/distances.tbl',
        help="Final output: table with atom distances between residues and ligands. Only for distances less than 'dist' parameter")
    # type=float: without it a value given on the command line stays a
    # string and cannot be compared with the computed distances.
    parser.add_argument("--dist", default=5, type=float)
    parser.add_argument(
        "--pdbs_with_drug",
        default='/data/databases/pdb/processed/pdbs_with_drug.txt',
        help="Output: list of PDB codes with an associated ligand")

    args = parser.parse_args()

    if not os.path.exists(args.pdbs):
        sys.stderr.write(
            "%s not found. Specify where is pdbs/divided directory" %
            (args.pdbs))
        sys.exit(1)
    PDB_PATH = args.pdbs
    CONTACT_DIST = args.dist
    pdbs_with_drug_path = args.pdbs_with_drug
    if not os.path.exists(os.path.dirname(args.pdbs_with_drug)):
        sys.stderr.write("can't create %s. Set pdbs_with_drug correctly" %
                         (pdbs_with_drug_path))
        sys.exit(1)

    if not os.path.exists(os.path.dirname(args.distances)):
        sys.stderr.write("can't create %s. Set distances correctly" %
                         (args.distances))
        sys.exit(1)

    pdbs_procesados_path = args.procesados
    print(
        "In %s the processed pdbs are kept, if the file is deleted, the process starts from scratch "
        % pdbs_procesados_path)
    print("Outputs: '%s' and '%s' " % (pdbs_with_drug_path, args.distances))

    # A set: membership tests are O(1) and juan() can add to it.
    pdbs_procesados = set()
    if os.path.exists(pdbs_procesados_path):
        with open(pdbs_procesados_path) as handle:
            pdbs_procesados = {x.strip() for x in handle.readlines()}

    pdbs_iterator = PDBsIterator(pdb_dir=args.pdbs)

    DNsPDBs = args.domains

    if not os.path.exists(DNsPDBs):
        seqs_from_pdb = args.seqs
        if not os.path.exists(seqs_from_pdb):
            sys.stderr.write(
                "%s does not exists and %s not found. Specify where it is." %
                (DNsPDBs, seqs_from_pdb))
            sys.exit(1)
        sys.stderr.write(
            "%s not found. You can create it with the following command: \n" %
            DNsPDBs)
        sys.stderr.write(
            "hmmscan --cut_tc  --domtblout dns_pdbs.tlb --acc -o pdb_seq_res.hmm Pfam-A.hmm seqs_from_pdb.fasta"
        )
        sys.exit(1)

    # Residue names grouped by compound class; sets because they are only
    # used for membership tests inside the per-residue loops.
    ligand_types = ("DRUG", "COFACTOR", "METAL", "SUGAR", "NUCLEOTIDE",
                    "LIPID")
    drugcompounds = {x for x, y in compound_type.items() if y in ligand_types}
    aminoacidcompounds = {
        x
        for x, y in compound_type.items() if y in ("MODIFIED", "RESIDUE")
    }

    _log.info("proceced pdbs: %i" % len(pdbs_procesados))

    p = PDBParser(PERMISSIVE=1, QUIET=1)

    pdbs_with_drug = []
    if os.path.exists(pdbs_with_drug_path):
        _log.info("pdbs with drugs already loaded")
        with open(pdbs_with_drug_path) as handle:
            for x in handle.readlines():
                pdbs_with_drug.append(x.strip())
    else:
        with open(pdbs_with_drug_path, "a") as handle:
            _log.info("pdbs with drugs will be loaded")
            pdbs = list(pdbs_iterator)
            for pdb, file_path in tqdm(pdbs):
                try:
                    if pdb not in pdbs_with_drug:
                        structure = p.get_structure(pdb, file_path)
                        for res in structure.get_residues():
                            if res.resname in drugcompounds:
                                pdbs_with_drug.append(pdb)
                                handle.write(pdb + "\n")
                                handle.flush()
                                break
                except Exception as ex:
                    # Best effort: one unreadable structure must not stop
                    # the scan of the remaining ones.
                    print(str(ex))

    if not os.path.exists(DNsPDBs + "2"):
        cols = [
            "target_name", "accession", "tlen", "query_name", "accession2",
            "qlen", "E-value", "score1", "bias1", "#", "of", "c-Evalue",
            "i-Evalue", "score2", "bias2", "from1", "to1", "from2", "to2",
            "from3", "to3", "acc"
        ]
        _log.info("correcting hmmer-pdb output")

        regexp = re.compile(" +")
        items = []
        with open(DNsPDBs) as handle:
            for x in tqdm(handle.readlines()):
                if not x.startswith("#"):
                    line = regexp.split(x)
                    items.append(line[0:len(cols)])

        df_hmm = pd.DataFrame.from_records(items, columns=cols)
        df_hmm = df_hmm[["accession", "query_name", "from3", "to3"]]
        df_hmm.to_csv(DNsPDBs + "2")
    else:
        df_hmm = pd.read_csv(DNsPDBs + "2")

    # query_name is split on "_" into pdb code, chain, start and end residue.
    # List comprehensions (not map) so the columns hold values on Python 3
    # as well.
    query_names = list(df_hmm["query_name"])
    df_hmm["pdb"] = [x.split("_")[0].lower().strip() for x in query_names]
    df_hmm["chain"] = [x.split("_")[1].upper().strip() for x in query_names]
    df_hmm["start_res"] = [
        x.split("_")[2].upper().strip() for x in query_names
    ]
    df_hmm["end_res"] = [x.split("_")[3].upper().strip() for x in query_names]
    print(len(df_hmm))

    lock = Lock()

    def centeroid(arr):
        """Return the mean (x, y, z) of the atoms in *arr*."""
        length = len(arr)
        sum_x = np.sum([x.coord[0] for x in arr])
        sum_y = np.sum([x.coord[1] for x in arr])
        sum_z = np.sum([x.coord[2] for x in arr])
        return sum_x / length, sum_y / length, sum_z / length

    def residues_near_drug(drug_centroid, aa_residues):
        """Return the residues having an atom closer than 10 to the centroid.

        A residue is abandoned as soon as one of its atoms is farther than
        20 away.
        """
        residues_near = []
        for r in aa_residues:
            for a in list(r):
                dist = a - Struct(coord=drug_centroid)
                if dist > 20:
                    break
                if dist < 10:
                    residues_near.append(r)
                    break
        return residues_near

    def juan(pdb_raw):
        """Process one PDB code and always record it as processed."""
        try:
            pepe(pdb_raw)
        except Exception:
            traceback.print_exc()
        finally:
            with lock:
                pdbs_procesados.add(pdb_raw)
                with open(pdbs_procesados_path, "a") as handle:
                    handle.write(pdb_raw + "\n")

    def pepe(pdb):
        """Append every residue-atom/ligand-atom contact of *pdb* to the
        distances file."""
        p = PDBParser(PERMISSIVE=1, QUIET=1)
        path_dir = PDB_PATH + "/" + pdb[1:3].lower() + "/"
        path = path_dir + "pdb" + pdb.lower() + ".ent"
        model = list(p.get_structure('X', path))[0]

        for chain_obj in list(model):
            chain = chain_obj.id

            # residue number -> domain name, for the domains of this chain
            hmm_residues = {}

            pdb_seq = list(model[chain].get_residues())
            if pdb_seq:
                hmms = df_hmm[(df_hmm["pdb"] == pdb)
                              & (df_hmm["chain"] == chain) &
                              (df_hmm["start_res"] == str(pdb_seq[0].id[1]))]
                for j, hmm in hmms.iterrows():
                    try:
                        # from3/to3 are 1-based positions in the sequence
                        hmm_start = int(hmm["from3"]) - 1
                        hmm_end = int(hmm["to3"]) - 1
                        hmm_chain_name = "_".join(
                            map(str, [
                                hmm["accession"].split(".")[0], hmm["chain"],
                                pdb_seq[hmm_start].id[1],
                                pdb_seq[hmm_end].id[1]
                            ]))
                        hmm_residues.update({
                            res.id[1]: hmm_chain_name
                            for res in pdb_seq[hmm_start:hmm_end]
                        })
                    except IndexError:
                        print(pdb, hmm["accession"], hmm["chain"], hmm_start,
                              hmm_end, pdb_seq)

            aa_residues = []
            drug_molecules = []
            for res_obj in chain_obj.get_residues():
                if res_obj.resname in drugcompounds:
                    drug_molecules.append(res_obj)
                elif res_obj.resname in aminoacidcompounds:
                    aa_residues.append(res_obj)

            for res_drug_obj in drug_molecules:
                drug_centroid = centeroid(list(res_drug_obj))
                near_residues = residues_near_drug(drug_centroid, aa_residues)
                for drug_atom in list(res_drug_obj):
                    for near_residue in near_residues:
                        for residue_atom in list(near_residue):
                            distance = (residue_atom - drug_atom)
                            if distance > 20:
                                break
                            if distance < CONTACT_DIST:
                                with open(args.distances, "a") as handle:
                                    hmm_name = hmm_residues.get(
                                        near_residue.id[1], "NoDn")
                                    fields = [
                                        pdb, chain, hmm_name,
                                        near_residue.id[1],
                                        near_residue.resname,
                                        residue_atom.serial_number,
                                        res_drug_obj.id[1],
                                        res_drug_obj.resname,
                                        drug_atom.serial_number, distance
                                    ]
                                    handle.write(
                                        "\t".join(map(str, fields)) + "\n")

    _log.info("processing distances file")
    for x in tqdm(set(pdbs_with_drug)):
        if x not in pdbs_procesados:
            juan(x)

    print("Finished!!!")
def pepe(pdb):
    """Write all residue/ligand atom contacts of one PDB entry.

    The structure file is looked up below ``PDB_PATH`` using the second and
    third character of the code as sub-directory.  For every chain the
    matching rows of ``df_hmm`` give a residue-number -> domain-name map;
    every atom pair (amino-acid atom, ligand atom) closer than
    ``CONTACT_DIST`` is appended as a tab separated line to
    ``args.distances``.
    """
    builder = CaPPBuilder()
    parser = PDBParser(PERMISSIVE=1, QUIET=1)
    entry_file = "/".join(
        [PDB_PATH, pdb[1:3].lower(), "pdb" + pdb.lower() + ".ent"])
    first_model = list(parser.get_structure('X', entry_file))[0]

    for chain_entity in list(first_model):
        chain = chain_entity.id
        domain_of_residue = {}

        chain_residues = list(first_model[chain].get_residues())
        if chain_residues:
            domain_contacts = {}
            domain_of_residue = {}
            first_resnum = str(chain_residues[0].id[1])
            selection = ((df_hmm["pdb"] == pdb)
                         & (df_hmm["chain"] == chain)
                         & (df_hmm["start_res"] == first_resnum))
            for _, hit in df_hmm[selection].iterrows():
                try:
                    # from3/to3 are 1-based positions in the sequence
                    hmm_start = int(hit["from3"]) - 1
                    hmm_end = int(hit["to3"]) - 1
                    name_parts = [
                        hit["accession"].split(".")[0],
                        hit["chain"],
                        chain_residues[hmm_start].id[1],
                        chain_residues[hmm_end].id[1],
                    ]
                    domain_name = "_".join(str(part) for part in name_parts)
                    domain_contacts[domain_name] = []
                    for member in chain_residues[hmm_start:hmm_end]:
                        domain_of_residue[member.id[1]] = domain_name
                except IndexError:
                    print(pdb, hit["accession"], hit["chain"], hmm_start,
                          hmm_end, chain_residues)

        # Split the chain's residues into ligands and amino acids
        amino_acids = []
        ligands = []
        for candidate in chain_entity.get_residues():
            if candidate.resname in drugcompounds:
                ligands.append(candidate)
            elif candidate.resname in aminoacidcompounds:
                amino_acids.append(candidate)

        for ligand in ligands:
            centre = centeroid(list(ligand))
            close_residues = residues_near_drug(centre, amino_acids)
            for ligand_atom in list(ligand):
                for close_residue in close_residues:
                    resnum = close_residue.id[1]
                    for protein_atom in list(close_residue):
                        distance = (protein_atom - ligand_atom)
                        if distance > 20:
                            # too far away: give up on this residue
                            break
                        if not distance < CONTACT_DIST:
                            continue
                        with open(args.distances, "a") as out:
                            row = [
                                pdb, chain,
                                domain_of_residue.get(resnum, "NoDn"),
                                resnum, close_residue.resname,
                                protein_atom.serial_number, ligand.id[1],
                                ligand.resname, ligand_atom.serial_number,
                                distance
                            ]
                            out.write("\t".join(map(str, row)) + "\n")
def __init__(self, model, radius, offset=0, hse_up_key='HSE_U',
             hse_down_key='HSE_D', angle_key=None):
    """
    Compute half-sphere exposure (HSE) for every residue and collect atom
    clashes with residues of the upper half sphere.

    HSE values are written to each residue's C{xtra} dictionary. Clashes
    (two atoms closer than 2) are collected in C{self.clash_pairs} as
    C{[(ca_bfactor, resnum), (other_ca_bfactor, other_resnum)]}; the two
    direct sequence neighbours of a residue are never reported.

    @param model: model
    @type model: L{Model}

    @param radius: HSE radius
    @type radius: float

    @param offset: number of flanking residues that are ignored in the
        calculation of the number of neighbors
    @type offset: int

    @param hse_up_key: key used to store HSEup in the entity.xtra attribute
    @type hse_up_key: string

    @param hse_down_key: key used to store HSEdown in the entity.xtra
        attribute
    @type hse_down_key: string

    @param angle_key: key used to store the angle between CA-CB and CA-pCB
        in the entity.xtra attribute
    @type angle_key: string
    """
    assert (offset >= 0)
    # For PyMOL visualization
    self.ca_cb_list = []
    ppb = CaPPBuilder()
    ppl = ppb.build_peptides(model)
    # NOTE(review): these three containers are filled but not handed to
    # AbstractPropertyMap.__init__ in this variant -- confirm intent.
    hse_map = {}
    hse_list = []
    hse_keys = []
    ### GP
    self.clash_pairs = []
    for pp1 in ppl:
        last_index = len(pp1) - 1
        for i in range(0, len(pp1)):
            # Explicit neighbours: None at the polypeptide ends instead of
            # a wrapped negative index or an IndexError.
            r1 = pp1[i - 1] if i > 0 else None
            r2 = pp1[i]
            r3 = pp1[i + 1] if i < last_index else None
            # This method is provided by the subclasses to calculate HSE
            result = self._get_cb(r1, r2, r3)
            if result is None:
                # Missing atoms, or i==0, or i==len(pp1)-1
                continue
            pcb, angle = result
            hse_u = 0
            hse_d = 0
            ca2 = r2['CA'].get_vector()
            residue_up = []  ### GP
            residue_down = []  ### GP
            for pp2 in ppl:
                for j in range(0, len(pp2)):
                    if pp1 is pp2 and abs(i - j) <= offset:
                        # neighboring residues in the chain are ignored
                        continue
                    ro = pp2[j]
                    if not is_aa(ro) or not ro.has_id('CA'):
                        continue
                    cao = ro['CA'].get_vector()
                    d = (cao - ca2)
                    if d.norm() < radius:
                        if d.angle(pcb) < (math.pi / 2):
                            hse_u += 1
                            ### GP
                            # Keep the residues found in the upper half sphere
                            residue_up.append(ro)
                        else:
                            hse_d += 1
                            ### GP
                            # Keep the residues found in the lower half sphere
                            residue_down.append(ro)
            res_id = r2.get_id()
            chain_id = r2.get_parent().get_id()
            # Fill the 3 data structures
            hse_map[(chain_id, res_id)] = (hse_u, hse_d, angle)
            ### GP residue_up and residue_down added to hse_list
            hse_list.append(
                (r2, (residue_up, residue_down, hse_u, hse_d, angle)))
            hse_keys.append((chain_id, res_id))
            # Add to xtra
            r2.xtra[hse_up_key] = hse_u
            r2.xtra[hse_down_key] = hse_d
            if angle_key:
                r2.xtra[angle_key] = angle

            ### GP checking for atom clashes
            # Previously a bare try/except swallowed the IndexError raised
            # by pp1[i + 1] on the last residue, which silently disabled the
            # clash check for it; r1/r3 make the end cases explicit.
            for atom in r2:
                ref_vector = atom.get_vector()
                for other_res in residue_up:
                    is_prev = r1 is not None and not (other_res != r1)
                    is_next = r3 is not None and not (other_res != r3)
                    if is_prev or is_next:
                        # bonded sequence neighbours are always close
                        continue
                    for other_atom in other_res:
                        other_vector = other_atom.get_vector()
                        d = other_vector - ref_vector
                        if d.norm() < 2:
                            self.clash_pairs.append([
                                (r2['CA'].get_bfactor(), r2.get_id()[1]),
                                (other_res['CA'].get_bfactor(),
                                 other_res.get_id()[1])
                            ])
def __init__(self, model, radius, offset, hse_up_key, hse_down_key, angle_key=None):
    """
    Calculate the half-sphere exposure (HSE) of the residues in C{model}.

    The subclass method C{_get_cb} returns, per residue, the vector that
    splits the sphere around the CA atom and an angle. Neighbouring CA
    atoms within C{radius} are counted as "up" when they lie less than
    90 degrees from that vector and as "down" otherwise.

    @param model: model
    @type model: L{Model}

    @param radius: HSE radius
    @type radius: float

    @param offset: number of flanking residues that are ignored in the
        calculation of the number of neighbors
    @type offset: int

    @param hse_up_key: key used to store HSEup in the entity.xtra attribute
    @type hse_up_key: string

    @param hse_down_key: key used to store HSEdown in the entity.xtra attribute
    @type hse_down_key: string

    @param angle_key: key used to store the angle between CA-CB and CA-pCB
        in the entity.xtra attribute
    @type angle_key: string
    """
    assert(offset>=0)
    # For PyMOL visualization
    self.ca_cb_list=[]
    builder=CaPPBuilder()
    polypeptides=builder.build_peptides(model)
    hse_by_key={}
    hse_records=[]
    hse_order=[]
    for own_pp in polypeptides:
        size=len(own_pp)
        for idx in range(size):
            previous=None if idx==0 else own_pp[idx-1]
            current=own_pp[idx]
            following=None if idx==size-1 else own_pp[idx+1]
            # This method is provided by the subclasses to calculate HSE
            outcome=self._get_cb(previous, current, following)
            if outcome is None:
                # Missing atoms, or first/last residue of the polypeptide
                continue
            pcb, angle=outcome
            counts=[0, 0]   # [up, down]
            origin=current['CA'].get_vector()
            for any_pp in polypeptides:
                for other_idx in range(len(any_pp)):
                    if own_pp is any_pp and abs(idx-other_idx)<=offset:
                        # neighboring residues in the chain are ignored
                        continue
                    candidate=any_pp[other_idx]
                    if not is_aa(candidate) or not candidate.has_id('CA'):
                        continue
                    offset_vec=candidate['CA'].get_vector()-origin
                    if offset_vec.norm()<radius:
                        side=0 if offset_vec.angle(pcb)<(pi/2) else 1
                        counts[side]+=1
            up, down=counts
            residue_id=current.get_id()
            chain_id=current.get_parent().get_id()
            # Fill the 3 data structures
            hse_by_key[(chain_id, residue_id)]=(up, down, angle)
            hse_records.append((current, (up, down, angle)))
            hse_order.append((chain_id, residue_id))
            # Add to xtra
            current.xtra[hse_up_key]=up
            current.xtra[hse_down_key]=down
            if angle_key:
                current.xtra[angle_key]=angle
    AbstractPropertyMap.__init__(self, hse_by_key, hse_order, hse_records)
def build_peptides(self, structure):
    """Build the polypeptides of *structure*, with a CA-based fallback.

    The builder stored in ``self.ppb`` is tried first (``aa_only=False``).
    When it yields no polypeptide at all, a fresh ``CaPPBuilder`` is used
    instead and its result is returned.
    """
    peptides = self.ppb.build_peptides(structure, aa_only=False)
    if len(peptides) != 0:
        return peptides
    # case of failure: retry with a CA-CA distance based builder
    return CaPPBuilder().build_peptides(structure, aa_only=False)
class ContactMapper(FeaturesComputer):
    '''
    Extends FeaturesComputer class. Extracts res and chainIds for training and predicting and
    computes contact maps for training for a given complex.
    '''

    def __init__(self, rFname, lFname, computedFeatsRootDir=None, boundAvailable=True,
                 res2res_dist=6.0, isForPrediction=False, statusManager=None):
        '''
        @param rFname: str. path to receptor pdb file
        @param lFname: str. path to ligand pdb file
        @param computedFeatsRootDir: str. path where features will be stored
        @param boundAvailable: bool. True if bound structures are available. False otherwise.
                               Bound structures must be located at the same path that unbound
                               structures and need to be named as in the following example:
                                 1A2K_l_u.pdb  1A2K_r_b.pdb
        @param res2res_dist: float. max distance between any heavy atoms of 2 amino acids to be
                             considered as interacting (Angstroms)
        @param isForPrediction: bool. False to compute contacts between amino acids, True
                                otherwise. Positive contacts will be tagged as 1, negative as -1.
                                If True, all amino acid pairs will have np.nan as tag
        @param statusManager: class that implements .setStatus(msg) to communicate.
                              Accepted for interface compatibility; not used by this constructor.
        @raise FeatureComputerException: if the ligand prefix contains '<' or the receptor prefix
                                         contains '>' (only checked when the prefixes differ)
        '''
        FeaturesComputer.__init__(self, rFname, lFname, computedFeatsRootDir)
        # Prefix = file base name up to the first "." and then up to the first "_"
        self.prefixR = os.path.split(rFname)[1].split(".")[0].split("_")[0]
        self.prefixL = os.path.split(lFname)[1].split(".")[0].split("_")[0]
        if self.prefixR == self.prefixL:
            self.prefix = self.prefixR
        else:
            # The combined prefix is prefixL + "<->" + prefixR. Forbidding '<' on the ligand
            # side and '>' on the receptor side is enough to keep that separator unambiguous.
            if "<" in self.prefixL:
                raise FeatureComputerException(
                    "Error. Ligand pdbFile name %s must not contain '<' or '>' character" % lFname)
            if ">" in self.prefixR:
                raise FeatureComputerException(
                    "Error. Receptor pdbFile name %s must not contain '<' or'>' character" % rFname)
            self.prefixR = self.getExtendedPrefix(rFname)
            self.prefixL = self.getExtendedPrefix(lFname)
            self.prefix = self.prefixL + "<->" + self.prefixR

        self.isForPrediction = isForPrediction
        self.res2res_dist = res2res_dist
        self.boundAvailable = boundAvailable
        self.outPath = myMakeDir(self.computedFeatsRootDir, "common/contactMaps")
        self.outName = os.path.join(self.outPath, self.prefix + ".cMap.tab")
        self.parser = PDBParser(QUIET=True)
        # NOTE: PPBuilder(radius=200) was previously tried here "to not worry for broken chains".
        self.ppb = CaPPBuilder()
        self.computeFun = self.contactMapOneComplex

    def mapBoundToUnbound(self, structureUnbound, structureBound, skipBoundChainsIds=None):
        '''
        Obtains correspondence between unbound structure and bound structure when available.
        Returns a dictionary that maps bound_residue --> equivalent unbound_residue

        @param structureUnbound: Bio.PDB.Structure. Structure in unbound state
        @param structureBound: Bio.PDB.Structure or None. Structure in bound state. If None,
                               every unbound residue is mapped to itself
        @param skipBoundChainsIds: Set of Chars or None. Chain ids (of the bound structure) that
                                   will be skipped for calculations. None means skip nothing.
        @return bound2UnboundMapDict: Dict {Bio.PDB.Residue (from bound structure):
                                            Bio.PDB.Residue (from unbound structure)}
        '''
        if skipBoundChainsIds is None:
            # Avoid a shared mutable default argument
            skipBoundChainsIds = set()
        bound2UnboundMapDict = {}
        pp_list_unbound = self.ppb.build_peptides(structureUnbound, aa_only=False)
        if structureBound is None:
            # if there is no bound structure, use just unbound: identity mapping
            boundToUnboundMap = lambda x: x
            pp_list_bound = pp_list_unbound
        else:
            pp_list_bound = self.ppb.build_peptides(structureBound, aa_only=False)
            # res_bound->res_unbound mapper object
            mapper = BoundUnboundMapper(pp_list_unbound, pp_list_bound)
            mapper.build_correspondence()
            # For a given bound residue will return its unbound equivalent
            boundToUnboundMap = mapper.mapBoundToUnbound

        for pp in pp_list_bound:
            for resBound in pp:
                chainBound = resBound.get_full_id()[2]  # str chainId
                if chainBound in skipBoundChainsIds:
                    continue
                resUnbound = boundToUnboundMap(resBound)
                # There may be no equivalent unbound residue for a given bound residue
                if resUnbound is not None:
                    bound2UnboundMapDict[resBound] = resUnbound
        return bound2UnboundMapDict

    def fixHomooligomers(self, structureL, structureR, positiveContacts, chainsInContactL,
                         chainsInContactR):
        '''
        For each interacting pair of residues (resL_1, resR_2), it will add to positiveContacts
        (resL_1', resR_2) and/or (resL_1, resR_2') where resL_1' is an equivalent residue in
        homooligomers of ligand (and likewise resR_2' for receptor).

        @param structureL: Bio.PDB.Structure. Structure of ligand
        @param structureR: Bio.PDB.Structure. Structure of receptor
        @param positiveContacts: collection of (ligandResId, receptorResId), both being full_ids
                                 of Bio.PDB.Residue
        @param chainsInContactL: ligand chains in contact, as passed by the caller (replaced by
                                 the value returned by HomoOligomerFinder.update_interactions)
        @param chainsInContactR: receptor chains in contact, as passed by the caller (replaced by
                                 the value returned by HomoOligomerFinder.update_interactions)
        @return positiveContacts, chainsInContactL, chainsInContactR. Updated with equivalent
                residues interactions added
        '''
        pp_list_l = self.ppb.build_peptides(structureL, aa_only=False)
        equivalentLmapper = HomoOligomerFinder(pp_list_l, positiveContacts, chainType="l")
        positiveContacts, chainsInContactL = equivalentLmapper.update_interactions()

        pp_list_r = self.ppb.build_peptides(structureR, aa_only=False)
        equivalentRmapper = HomoOligomerFinder(pp_list_r, positiveContacts, chainType="r")
        positiveContacts, chainsInContactR = equivalentRmapper.update_interactions()
        return positiveContacts, chainsInContactL, chainsInContactR

    def getPairsOfResiduesInContact(self, structureL, structureR):
        '''
        Computes which amino acids of ligand are in contact with which amino acids of receptor

        @param structureL: Bio.PDB.Structure. Structure of ligand (bound state if available)
        @param structureR: Bio.PDB.Structure. Structure of receptor (bound state if available).
        @return positiveContacts: Set {(full id of a residue of structureL,
                                        full id of a residue of structureR)}
        @return chainsNotContactL: Set {str(chainId structureL)} chains with no contact
        @return chainsNotContactR: Set {str(chainId structureR)} chains with no contact
        @raise NoValidPDBFile: if a structure has no model (empty child_list)
        '''
        # Atoms whose name starts with "H" are discarded (hydrogens)
        try:
            atomListL = [atom for atom in structureL.child_list[0].get_atoms()
                         if not atom.name.startswith("H")]
        except IndexError:
            raise NoValidPDBFile("Problems parsing pdbFile 1")
        try:
            atomListR = [atom for atom in structureR.child_list[0].get_atoms()
                         if not atom.name.startswith("H")]
        except IndexError:
            raise NoValidPDBFile("Problems parsing pdbFile 2")

        # One search over both partners; pairs are sorted out by structure id below
        searcher = NeighborSearch(atomListL + atomListR)
        allNeigs = searcher.search_all(self.res2res_dist, level="R")

        lStructId = structureL.get_id()
        rStructId = structureR.get_id()
        positiveContacts = set()
        chainsInContactL = set()
        chainsInContactR = set()
        for res1, res2 in allNeigs:
            fullResId1 = res1.get_full_id()
            fullResId2 = res2.get_full_id()
            pdbId1 = fullResId1[0]
            pdbId2 = fullResId2[0]
            # Always store as (ligand, receptor); intra-partner pairs match neither branch
            if pdbId1 == lStructId and pdbId2 == rStructId:
                positiveContacts.add((fullResId1, fullResId2))
                chainsInContactL.add(fullResId1[2])
                chainsInContactR.add(fullResId2[2])
            elif pdbId1 == rStructId and pdbId2 == lStructId:
                positiveContacts.add((fullResId2, fullResId1))
                chainsInContactL.add(fullResId2[2])
                chainsInContactR.add(fullResId1[2])

        if CONSIDER_HOMOOLIG_AS_POS:
            positiveContacts, chainsInContactL, chainsInContactR = self.fixHomooligomers(
                structureL, structureR, positiveContacts, chainsInContactL, chainsInContactR)

        allChainsL = set([elem.get_id() for elem in structureL[0].get_list()])
        allChainsR = set([elem.get_id() for elem in structureR[0].get_list()])
        chainsNotContactL = allChainsL.difference(chainsInContactL)
        chainsNotContactR = allChainsR.difference(chainsInContactR)
        return positiveContacts, chainsNotContactL, chainsNotContactR

    def _getResidueRows(self, resDict):
        '''
        Prepares, for one partner, the per-residue fields written to the contact map.

        Residues are visited sorted by the full id of the bound residue. A residue is skipped if
        three_to_one raises KeyError for its bound or unbound residue name, or if both names
        translate to different one-letter codes.

        @param resDict: Dict {bound residue: unbound residue} as returned by mapBoundToUnbound
        @return list of tuples (boundFullId, chainId, strResId, letter) where chainId and
                strResId come from the unbound residue and a " " chainId is replaced by "*"
        '''
        rows = []
        for resBound in sorted(resDict, key=lambda x: x.get_full_id()):
            resUnbound = resDict[resBound]
            try:
                letter = three_to_one(resUnbound.resname)
                if letter != three_to_one(resBound.resname):
                    continue
            except KeyError:
                continue
            _, _, chainId, resId = resUnbound.get_full_id()
            if chainId == " ":
                chainId = "*"
            rows.append((resBound.get_full_id(), chainId, self.makeStrResId(resId), letter))
        return rows

    def contactMapOneComplex(self):
        '''
        Computes the contact map of a complex. Initial input for complex codification. Contact
        map is a file written at self.computedFeatsRootDir/common/contactMaps/ with name
        prefix.cMap.tab where prefix is either the common prefix of ligand and receptor pdb
        files or, when they differ, ligandPrefix + "<->" + receptorPrefix (see __init__).

        Each line of the file is
            chainIdL structResIdL resNameL chainIdR structResIdR resNameR categ
        with categ 1 (contact), -1 (no contact) or nan (isForPrediction).

        @return 0 if the output file already exists; None otherwise
        @raise BadNumberOfResidues: if the number of mapped residues of a partner is not strictly
                                    between self.minNumResiduesPartner and
                                    self.maxNumResiduesPartner
        '''
        outName = self.outName
        print(outName)
        if os.path.isfile(outName):
            print('Already computed contact map')
            return 0

        lStructId = self.prefixL + "_l_u.pdb"
        rStructId = self.prefixR + "_r_u.pdb"
        structureL_u = self.parser.get_structure(lStructId, self.lFname)
        structureR_u = self.parser.get_structure(rStructId, self.rFname)

        if not self.boundAvailable or self.isForPrediction:
            structureL_b = None
            structureR_b = None
        else:
            try:
                lStructId_b = self.prefix + "_l_b.pdb"
                rStructId_b = self.prefix + "_r_b.pdb"
                lFname_b = os.path.join(os.path.split(self.lFname)[0], lStructId_b)
                rFname_b = os.path.join(os.path.split(self.rFname)[0], rStructId_b)
                structureL_b = self.parser.get_structure(lStructId_b, lFname_b)
                structureR_b = self.parser.get_structure(rStructId_b, rFname_b)
            except IOError:
                # in this case there are just unbound pdbs available
                structureL_b = None
                structureR_b = None

        if self.isForPrediction:
            positiveContacts = None
            chainsNotContactR = set()
            chainsNotContactL = set()
        elif structureL_b is None or structureR_b is None:
            # No bound structures: compute contacts in unbound structures
            positiveContacts, chainsNotContactL, chainsNotContactR = \
                self.getPairsOfResiduesInContact(structureL_u, structureR_u)
        else:
            # Compute contacts in bound structures
            positiveContacts, chainsNotContactL, chainsNotContactR = \
                self.getPairsOfResiduesInContact(structureL_b, structureR_b)

        if not JUST_INTERACTING_CHAINS:
            # Keep every chain, not only those involved in the interface
            chainsNotContactR = set()
            chainsNotContactL = set()

        rResDict = self.mapBoundToUnbound(structureR_u, structureR_b,
                                          skipBoundChainsIds=chainsNotContactR)
        lResDict = self.mapBoundToUnbound(structureL_u, structureL_b,
                                          skipBoundChainsIds=chainsNotContactL)
        nResiduesL = len(lResDict)
        nResiduesR = len(rResDict)
        if not (self.minNumResiduesPartner < nResiduesL < self.maxNumResiduesPartner):
            raise BadNumberOfResidues(nResiduesL, "1")
        if not (self.minNumResiduesPartner < nResiduesR < self.maxNumResiduesPartner):
            raise BadNumberOfResidues(nResiduesR, "2")

        outFile = open(outName, "w")
        try:
            # "with" guarantees the handle is closed before the partial file is removed below
            with outFile:
                outFile.write(
                    "chainIdL structResIdL resNameL chainIdR structResIdR resNameR categ\n")
                # Per-residue fields are independent of the partner residue: compute them once
                ligandRows = self._getResidueRows(lResDict)
                receptorRows = self._getResidueRows(rResDict)
                for fullIdL, chainIdL, resIdL, letraL in ligandRows:
                    for fullIdR, chainIdR, resIdR, letraR in receptorRows:
                        if self.isForPrediction:
                            categ = np.nan
                        elif (fullIdL, fullIdR) in positiveContacts:
                            categ = 1
                        else:
                            categ = -1
                        outFile.write("%s %s %s %s %s %s %s\n" %
                                      (chainIdL, resIdL, letraL, chainIdR, resIdR, letraR, categ))
        except (KeyboardInterrupt, Exception):
            # Do not leave a truncated contact map behind
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
            raise

    def makeStrResId(self, resId):
        '''
        Builds a string residue id from a Bio.PDB residue id tuple, ignoring its first element.

        @param resId: sequence; elements from index 1 on are stringified and concatenated
                      (presumably (hetflag, resseq, icode) -- TODO confirm against callers)
        @return str. Concatenation of str(elem) for resId[1:], stripped of surrounding whitespace
        '''
        return "".join(str(elem) for elem in resId[1:]).strip()