def ligandfilter(pdb):
    """
    Remove water and other ligands from pdb, then keep one model and one chain.

    Every residue rejected by ``PDB.is_aa`` is detached in place. Chains and
    models that end up empty are detached as well. Of what remains, only the
    first model and the first chain of that model are kept.

    :param pdb: PDB.Structure.Structure, modified in place
    :return: None
    :raises IndexError: if no model is left after filtering
    """
    # Remove non amino acid residues.
    # Detaching mutates child_list, so always iterate over a copy.
    for model in pdb.child_list[:]:
        for chain in model.child_list[:]:
            for res in chain.child_list[:]:
                if not PDB.is_aa(res):
                    chain.detach_child(res.id)
            if len(chain) == 0:
                # detach_child expects the child's id, not the child itself
                model.detach_child(chain.id)
        if len(model) == 0:
            pdb.detach_child(model.id)

    # If the pdb still has more than one model, it's probably an NMR
    # structure: simply keep the first model.
    for model in pdb.child_list[1:]:
        pdb.detach_child(model.id)

    # Likewise keep only the first chain of that model.
    first_model = pdb.child_list[0]
    for chain in first_model.child_list[1:]:
        first_model.detach_child(chain.id)

    # There is only one model left
    assert len(pdb) == 1
    # This model has only one chain
    assert len(first_model) == 1
def pdb2cd(name):
    """Return [alpha, beta, coil] fractions for the structure in ``name + ".pdb"``.

    The file is read twice: once through ``dssp_dict_from_pdb_file`` and once
    through ``PDBParser``. Each DSSP entry is mapped to 0 ('H'), 1 ('E') or
    2 ('-'), and the three fractions of those codes are returned.

    :param name: path of the PDB file without the ".pdb" extension
    :return: list ``[alpha, beta, coil]``
    """
    f = name + ".pdb"
    # presumably (dict, key list) as returned by Bio.PDB.DSSP -- TODO confirm
    dssp_tuple = dssp_dict_from_pdb_file(f)
    dssp_dict = dssp_tuple[0]
    p = PDBParser(QUIET=True).get_structure("file", f)
    # Initiates and fills array ("cc") with chains.
    cc = [chain.get_id() for model in p for chain in model]
    # Determines length of sequence, initiates an array ("ss") of same length.
    howLong = ss_out = 0
    for c in cc:
        howLong += len([_ for _ in p[0][c].get_residues() if PDB.is_aa(_)])
    # The DSSP key count always wins when the two counts disagree.
    if not howLong == len(dssp_tuple[1]):
        howLong = len(dssp_tuple[1])
    # ss starts as 1..howLong and is overwritten in place with 0/1/2 codes below.
    ss = np.arange(1, howLong + 1)
    # Fills the array ("ss") with secondary structures.
    for i in ss:
        # NOTE(review): index i - 3 starts at -2, so the keys are visited in
        # rotated order; the final fractions do not depend on that order only if
        # every code is one of H/E/- -- confirm whether i - 1 was intended.
        ss_lib = dssp_dict[dssp_tuple[1][
            i - 3]]  # ss_lib = dssp_dict[(dssp_tuple[1][0][0], (' ', i-1, ' '))]
        dict_ss = ss_lib[1]
        # NOTE(review): any code other than 'H', 'E' or '-' leaves ss_out at the
        # value of the previously visited entry.
        if dict_ss == 'H':
            ss_out = 0
        if dict_ss == 'E':
            ss_out = 1
        if dict_ss == '-':  # else:# dict_ss == '-':
            ss_out = 2
        ss[i - 1] = ss_out
    # Returns the fractional composition of alpha helix, beta sheet or random coil.
    alpha = (ss == 0).sum() / ss.__len__()
    beta = (ss == 1).sum() / ss.__len__()
    coil = (ss == 2).sum() / ss.__len__()
    abc = [alpha, beta, coil]
    return abc
def assign_sensitivity(structure, md_df, chain, pdb_path, go):
    """
    Attach per-GO sensitivities from ``md_df`` to the residues of one chain.

    Changed: lookup the sensitivities directly in the df, no dict.

    :param structure: structure whose first model (index 0) contains ``chain``
    :param md_df: DataFrame with an 'AA' column and one column per 'GO:...' term
    :param chain: id of the chain to annotate
    :param pdb_path: unused, kept for interface compatibility
    :param go: unused, kept for interface compatibility
    :return: the same ``structure``; aligned residues gain a ``sensitivity`` dict
    """
    residues = structure[0][chain]

    # one-letter sequence of the amino acids along the protein chain
    seq_pdb = [
        three2single[res.get_resname()] for res in residues if pdb.is_aa(res)
    ]

    # align the MD sequence (first row skipped) against the PDB sequence
    seq_md = ''.join(md_df['AA'][1:])
    aligned_md, aligned_pdb, _identity = water(seq_md, seq_pdb)

    go_columns = [c for c in md_df.columns if c.startswith('GO:')]
    # NOTE(review): `residues` also yields non-amino-acid residues and does not
    # skip alignment gaps, so it can drift against the aligned sequences --
    # confirm that pairing residues by position is intended.
    for pos, (aa_md, aa_pdb, res) in enumerate(
            zip(aligned_md, aligned_pdb, residues)):
        if aa_md == '-' or aa_pdb == '-':
            continue
        res.sensitivity = {col: md_df.loc[pos, col] for col in go_columns}
    return structure
def parse_structure(self):
    """Record residue numbers and contact atoms of the amino acids in self.structure.

    A residue number that is already in ``self.residues`` is skipped, so
    mutated residues sharing a number are not counted twice.
    """
    amino_acids = (r for r in self.structure.get_residues() if PDB.is_aa(r))
    for residue in amino_acids:
        number = residue.id[1]
        if number in self.residues:
            continue  # don't double-count mutated residues
        self.residues.append(number)
        self.atoms.extend(atoms_method(self.contact_defn, residue))
def parse_structure(self):
    """Record residue numbers and one-letter codes of the standard amino acids.

    Only the 20 standard residues are considered. A residue number that is
    already in ``self.residues`` is skipped, so mutated residues (ex. 1ORC)
    are not counted twice.
    """
    for residue in self.structure.get_residues():
        if not PDB.is_aa(residue, standard=True):
            continue
        number = residue.id[1]
        if number in self.residues:
            continue
        self.residues.append(number)
        three_letter = Residue.Residue.get_resname(residue)
        self.d_sequence[number] = Polypeptide.three_to_one(three_letter)
def extract_residues(model):
    """Return the standard amino acid residues of a PDB model as one flat list."""
    return [
        residue
        for chain in model
        for residue in chain
        if PDB.is_aa(residue, standard=True)
    ]
def get_AAlist(aa_name, prot):
    """Return every residue of ``prot`` whose residue name equals ``aa_name``.

    ``aa_name`` must pass ``PDB.is_aa`` (checked with an assert).
    """
    # Make sure aa_name is an amino acid (works for string or residue object)
    assert PDB.is_aa(aa_name)
    return [res for res in prot.get_residues() if res.get_resname() == aa_name]
def load_model(modelfile, debug, complex_state=None):
    '''Loads the model file and generates a fasta with dashes for any skipped residue #'s

    Takes a filename and returns a dict with
        filename: model filename
        fasta: fastaseq (of chain, with - for skipped res #)
        chain: chainID (only uses first chain if multiple)
        icodes: sequence of icodes (' ' for none)
        resnums: list of residue numbers
        complex_state: the given value, or model_information(modelfile) if None

    Raises ValueError if the first model contains no chain.'''
    debug_head = "DEBUG: IO: load_model: "
    if debug:
        print(debug_head + "Loading model {}".format(modelfile))
    structure = parser.get_structure("Model", modelfile)
    if complex_state is None:
        complex_state = model_information(modelfile)
    if debug:
        print(debug_head +
              "model has complex_state: {}".format(complex_state))

    # I'm assuming that all chains in a model are of the target protein
    # I can't find any models that are hetero even if they come from a heteroolig template
    # So, for now I need to stick with this assumption because I can't see any way to determine
    # which chain is the target protein in any cases of a heterooligomer model
    # which is why I think that never happens
    for chain in structure[0]:
        # For now, only the first chain in the model is taken
        break
    else:
        raise ValueError("model {} contains no chain".format(modelfile))

    resnum = 0
    fasta_parts = []
    icode_parts = []
    resnums = list()
    for residue in chain:
        # Skip hetatoms
        if not PDB.is_aa(residue):
            continue
        _hetflag, number, icode = residue.get_id()
        # Fill in gaps with -
        gap = number - resnum - 1
        if gap > 0:
            fasta_parts.append('-' * gap)
        resnum = number
        fasta_parts.append(AA[residue.get_resname()])
        icode_parts.append(icode)
        resnums.append(resnum)

    if debug:
        print(debug_head + "{} has resnums {}".format(modelfile, resnums))
    return {
        'filename': modelfile,
        'fasta': ''.join(fasta_parts),
        'chain': chain.get_id(),
        'icodes': ''.join(icode_parts),
        'resnums': resnums,
        'complex_state': complex_state
    }
def find_contact(fname):
    """Return the CA-CA contacts between standard residues of a PDB file.

    Two residues are in contact when their CA atoms are at most 7 apart and
    their residue numbers differ by more than 2. Each unordered pair is
    reported once, in the order it is first met.

    :param fname: path of the PDB file; the structure id is ``fname[0:-4]``
    :return: list of ``(resnum_1, chain_1, resnum_2, chain_2)`` tuples, or
        None when ``fname`` is not an existing file
    """
    if not os.path.isfile(fname):
        return None
    print(fname)
    parser = PDBParser(PERMISSIVE=1)  # parser for PDB file
    model = parser.get_structure(fname[0:-4], fname)[0]

    # (residue number, chain id) of the 20 standard amino acids only
    keys = [(res.get_id()[1], res.get_parent().get_id())
            for res in model.get_residues()
            if PDB.is_aa(res, standard=True)]

    # Look every CA atom up once, by residue number within its chain.
    ca_atoms = []
    for resnum, chain_id in keys:
        atom = model[chain_id][resnum]['CA']
        if atom.is_disordered():
            atom.set_altloc(' ')
        ca_atoms.append(atom)

    contacts = []
    seen = set()  # O(1) duplicate check instead of scanning `contacts`
    count = len(keys)
    for i in range(count):
        # The test is symmetric, so pairs with j < i were handled already.
        for j in range(i + 1, count):
            # NOTE(review): residue numbers are compared across chains too.
            if abs(keys[i][0] - keys[j][0]) <= 2:
                continue
            if ca_atoms[i] - ca_atoms[j] > 7:
                continue
            contact = keys[i] + keys[j]
            if contact in seen or keys[j] + keys[i] in seen:
                continue
            seen.add(contact)
            contacts.append(contact)
    return contacts
def get_sidechain(res):
    '''Return the atoms of a residue that are not N, C, O, OXT or hydrogen.

    Note that CA is not in the exclusion list, so it is part of the result.
    The result is an empty list when every atom is excluded.'''
    assert PDB.is_aa(res)  # make sure residue is actually an amino acid
    excluded_ids = ("N", "C", "O", "OXT")
    return [
        atom for atom in res.get_atoms()
        if not (atom.get_id() in excluded_ids or atom.element == "H")
    ]
def _get_residues_from_structure(self, pdb_structure):
    """
    _get_residues_from_structure: Given a pdb_structure object, return the number
        of amino acid residues together with the list of their ids
    """
    res_ids = [
        residue.get_id()
        for residue in pdb_structure.get_residues()
        if PDB.is_aa(residue)
    ]
    return (len(res_ids), res_ids)
def __get_residues__(self, structure):
    """
    Collect the amino acid residues of a structure.

    parameters:
    ----------------
    structure: opened PDB structure object providing ``get_residues()``

    Returns:
    ---------------------------------------
    np.array built from the residues accepted by ``PDB.is_aa``
    """
    amino_acids = [res for res in structure.get_residues() if PDB.is_aa(res)]
    return np.array(amino_acids)
def _file_to_data(self, file_path):
    """Do the PDB conversion.

    Parses ``file_path`` and returns ``(data, pp_no)``: ``data`` holds the file
    name, chain/residue/atom counts and the concatenated polypeptide sequence
    with its md5; ``pp_no`` is the number of polypeptides that were built.
    """
    structure = PDB.PDBParser(PERMISSIVE=1).get_structure("test", file_path)
    builder = PPBuilder()

    # Chains are counted over every model; residues and atoms are counted on
    # the last model that was iterated.
    last_model = structure[0]
    chain_no = 0
    for last_model in structure:
        chain_no += len(last_model)

    res_no = 0
    atom_no = 0
    for residue in last_model.get_residues():
        if PDB.is_aa(residue):
            res_no += 1
        atom_no += sum(1 for _ in residue.get_atoms())

    pieces = [str(pp.get_sequence()) for pp in builder.build_peptides(structure)]
    pp_no = len(pieces)
    seq = ''.join(pieces)

    file_name = os.path.basename(file_path)
    data = {
        'name': file_name,
        'num_chains': chain_no,
        'num_residues': res_no,
        'num_atoms': atom_no,
        'protein': {
            'id': file_name,
            'sequence': seq,
            'md5': hashlib.md5(seq.encode()).hexdigest()
        },
    }
    return data, pp_no
def find_contact(fname):
    """Return the CA-CA contacts between amino acid residues of a PDB file.

    Two residues are in contact when their CA atoms are at most 7 apart and
    their residue numbers differ by more than 2. Each unordered pair is
    reported once, in the order it is first met.

    :param fname: path of the PDB file; the structure id is ``fname[0:-4]``
    :return: list of ``(resnum_1, chain_1, resnum_2, chain_2)`` tuples with the
        residue numbers as strings, or None when ``fname`` is not an existing
        file
    """
    if not os.path.isfile(fname):
        return None
    print(fname)
    parser = PDBParser(PERMISSIVE=1)  # parser for PDB file
    model = parser.get_structure(fname[0:-4], fname)[0]

    # (residue number, chain id) of every amino acid residue
    keys = [(res.get_id()[1], res.get_parent().get_id())
            for res in model.get_residues()
            if PDB.is_aa(res)]

    # Look every CA atom up once, by residue number within its chain.
    ca_atoms = [model[chain_id][resnum]['CA'] for resnum, chain_id in keys]

    contacts = []
    seen = set()  # O(1) duplicate check instead of scanning `contacts`
    count = len(keys)
    for i in range(count):
        # The test is symmetric, so pairs with j < i were handled already.
        for j in range(i + 1, count):
            # NOTE(review): residue numbers are compared across chains too.
            if abs(keys[i][0] - keys[j][0]) <= 2:
                continue
            if ca_atoms[i] - ca_atoms[j] > 7:
                continue
            contact = (str(keys[i][0]), keys[i][1],
                       str(keys[j][0]), keys[j][1])
            contact_rev = contact[2:] + contact[:2]
            if contact in seen or contact_rev in seen:
                continue
            seen.add(contact)
            contacts.append(contact)
    return contacts
def _dis_pad_missing(aa_list):
    """Insert None for skipped residue numbers; return None if numbering decreases."""
    expected = aa_list[0].get_id()[1]
    padded = []
    for res in aa_list:
        number = res.get_id()[1]
        if number < expected:
            return None
        padded.extend([None] * (number - expected))
        padded.append(res)
        expected = number + 1
    return padded


def _dis_property_rows(prop_dict, residues, width):
    """Look up each residue in a (chain id, residue id) keyed map; [None]*width if absent."""
    rows = []
    for res in residues:
        value = None
        if res is not None:
            value = prop_dict.get((res.get_parent().get_id(), res.get_id()))
        rows.append(value if value else [None] * width)
    return rows


def _dis_named_atoms(residues, atom_name):
    """Return the atom called atom_name for each residue, None where unavailable."""
    atoms = []
    for res in residues:
        try:
            atoms.append(res[atom_name])
        except (KeyError, TypeError):  # atom missing, or residue is None
            atoms.append(None)
    return atoms


def _dis_pair_angles(ca_list, cb_list, n_list, c_list):
    """For each residue pair (j, k) return [CB_j-CA_j-CA_k, N_j-CA_j-CA_k, C_j-CA_j-CA_k]."""
    calc_angle = PDB.vectors.calc_angle
    angle = []
    for j, ca_j in enumerate(ca_list):
        if ca_j is None:
            angle.append([[None, None, None] for _ in ca_list])
            continue
        ca1 = ca_j.get_vector()
        cb_atom, n_atom, c_atom = cb_list[j], n_list[j], c_list[j]
        if cb_atom is not None:
            cb = cb_atom.get_vector()
        elif c_atom is not None and n_atom is not None:
            # no CB atom: derive a position from N, C and CA via calha1
            cb = PDB.vectors.Vector(
                calha1(n_atom.get_vector().get_array(),
                       c_atom.get_vector().get_array(),
                       ca1.get_array()))
        else:
            cb = None
        refs = (cb,
                None if n_atom is None else n_atom.get_vector(),
                None if c_atom is None else c_atom.get_vector())
        row = []
        for ca_k in ca_list:
            if ca_k is None:
                row.append([None, None, None])
                continue
            ca2 = ca_k.get_vector()
            row.append([None if ref is None else calc_angle(ref, ca1, ca2)
                        for ref in refs])
        angle.append(row)
    return angle


def get_dis(name):
    """Compute per-residue neighbourhood features for the first model of a PDB file.

    Residues are taken in file order and padded with None wherever a residue
    number is skipped. For every position the 16 entries that follow the first
    one in the sorted CA-distance row are selected (the first is presumably the
    residue itself -- TODO confirm).

    :param name: path of the PDB file
    :return: tuple ``(seq_list, num_cs, mask, ca_dist_cs, angle_cs, dps, hse_a,
        hse_b)``. A tuple of eight None is returned when the file cannot be
        parsed, contains no amino acid, has decreasing residue numbers, or when
        ResidueDepth / HSExposure computation fails.
    """
    failure = (None,) * 8
    p = PDBParser(PERMISSIVE=1)
    try:
        s = p.get_structure("X", name)[0]
    except Exception:
        return failure

    aa_list = [r for r in PDB.Selection.unfold_entities(s, 'R') if PDB.is_aa(r)]
    if not aa_list:
        return failure
    aa_list_full = _dis_pad_missing(aa_list)
    if aa_list_full is None:
        return failure

    try:
        depth = PDB.ResidueDepth(s)
        hse_ca = PDB.HSExposureCA(s)
        hse_cb = PDB.HSExposureCB(s)
    except Exception:
        return failure
    dps = np.array(_dis_property_rows(depth.property_dict, aa_list_full, 2))
    hse_a = np.array(_dis_property_rows(hse_ca.property_dict, aa_list_full, 3))
    hse_b = np.array(_dis_property_rows(hse_cb.property_dict, aa_list_full, 3))

    # one-letter sequence, 'X' for gaps and for names missing from t_dic
    seq_list = ''
    for res in aa_list_full:
        resname = None if res is None else res.get_resname()
        seq_list += t_dic[resname] if resname in t_dic else 'X'

    ca_list = _dis_named_atoms(aa_list_full, 'CA')
    cb_list = _dis_named_atoms(aa_list_full, 'CB')
    n_list = _dis_named_atoms(aa_list_full, 'N')
    c_list = _dis_named_atoms(aa_list_full, 'C')

    # six angles per pair: three seen from j followed by three seen from k
    angle = _dis_pair_angles(ca_list, cb_list, n_list, c_list)
    ca_num = len(ca_list)
    angle_d = np.array([[angle[j][k] + angle[k][j] for k in range(ca_num)]
                        for j in range(ca_num)])

    # pairwise CA distances; pairs lacking a CA get the sentinel 100
    flat = [None if (a is None or b is None) else a - b
            for a in ca_list for b in ca_list]
    ca_dist = np.array(flat).reshape(ca_num, ca_num)
    ca_dist[ca_dist == None] = 100  # noqa: E711 -- elementwise comparison

    mask = [0 if atom is None else 1 for atom in ca_list]

    ca_dist_cs = []
    angle_cs = []
    num_cs = []
    for j, row in enumerate(ca_dist):
        nearest = row.argsort()[1:17]
        ca_dist_cs.append(row[nearest])
        angle_cs.append(angle_d[j][nearest])
        num_cs.append(nearest)
    return seq_list, num_cs, mask, ca_dist_cs, angle_cs, dps, hse_a, hse_b
def _cal_pad_missing(aa_list):
    """Insert None for skipped residue numbers; return None if numbering decreases."""
    expected = aa_list[0].get_id()[1]
    padded = []
    for res in aa_list:
        number = res.get_id()[1]
        if number < expected:
            return None
        padded.extend([None] * (number - expected))
        padded.append(res)
        expected = number + 1
    return padded


def _cal_property_rows(prop_dict, residues, width):
    """Look up each residue in a (chain id, residue id) keyed map; [None]*width if absent."""
    rows = []
    for res in residues:
        value = None
        if res is not None:
            value = prop_dict.get((res.get_parent().get_id(), res.get_id()))
        rows.append(value if value else [None] * width)
    return rows


def _cal_named_atoms(residues, atom_name):
    """Return the atom called atom_name for each residue, None where unavailable."""
    atoms = []
    for res in residues:
        try:
            atoms.append(res[atom_name])
        except (KeyError, TypeError):  # atom missing, or residue is None
            atoms.append(None)
    return atoms


def _cal_pair_angles(ca_list, cb_list, n_list, c_list):
    """For each residue pair (j, k) return [CB_j-CA_j-CA_k, N_j-CA_j-CA_k, C_j-CA_j-CA_k]."""
    calc_angle = PDB.vectors.calc_angle
    angle = []
    for j, ca_j in enumerate(ca_list):
        if ca_j is None:
            angle.append([[None, None, None] for _ in ca_list])
            continue
        ca1 = ca_j.get_vector()
        cb_atom, n_atom, c_atom = cb_list[j], n_list[j], c_list[j]
        if cb_atom is not None:
            cb = cb_atom.get_vector()
        elif c_atom is not None and n_atom is not None:
            # no CB atom: derive a position from N, C and CA via calha1
            cb = PDB.vectors.Vector(
                calha1(n_atom.get_vector().get_array(),
                       c_atom.get_vector().get_array(),
                       ca1.get_array()))
        else:
            cb = None
        refs = (cb,
                None if n_atom is None else n_atom.get_vector(),
                None if c_atom is None else c_atom.get_vector())
        row = []
        for ca_k in ca_list:
            if ca_k is None:
                row.append([None, None, None])
                continue
            ca2 = ca_k.get_vector()
            row.append([None if ref is None else calc_angle(ref, ca1, ca2)
                        for ref in refs])
        angle.append(row)
    return angle


def cal(i):
    """Compute neighbourhood features for entry ``i`` and save them as .npy.

    Reads the module-level ``pdbid``, ``pdbchain`` and parser ``p``. The chain
    ``pdbchain[i]`` of model 0 in 'pdb_/pdb<id>.ent' is processed and a dict
    with keys 'dis', 'angle', 'mask', 'ids', 'seq', 'dps', 'hsea', 'hseb' is
    written to 'pdb_other_cb/<id><chain>_all.npy'.

    :param i: index into ``pdbid`` / ``pdbchain``
    :return: 0 when the structure cannot be read, has no amino acid, has
        decreasing residue numbers, or when ResidueDepth / HSExposure fails;
        None after a successful save
    """
    print(pdbid[i], pdbchain[i])
    pdb_name = 'pdb_/pdb' + pdbid[i].lower() + '.ent'  # pdb name
    try:
        s = p.get_structure("1", pdb_name)  # read pdb structure
        s = s[0][pdbchain[i]]  # choose chain
        res_list = PDB.Selection.unfold_entities(s, 'R')  # read residues
    except Exception:
        return 0

    aa_list = [a for a in res_list if PDB.is_aa(a)]  # amino acids only
    if not aa_list:
        return 0
    aa_list_full = _cal_pad_missing(aa_list)
    if aa_list_full is None:
        return 0

    try:
        depth = PDB.ResidueDepth(s)  # residue depth (distance to the surface)
        hse_ca = PDB.HSExposureCA(s)
        hse_cb = PDB.HSExposureCB(s)
    except Exception:
        return 0
    dps = np.array(_cal_property_rows(depth.property_dict, aa_list_full, 2))
    hse_a = np.array(_cal_property_rows(hse_ca.property_dict, aa_list_full, 3))
    hse_b = np.array(_cal_property_rows(hse_cb.property_dict, aa_list_full, 3))

    # one-letter sequence, 'X' for gaps and for names missing from t_dic
    seq_list = ''
    for res in aa_list_full:
        resname = None if res is None else res.get_resname()
        seq_list += t_dic[resname] if resname in t_dic else 'X'

    ca_list = _cal_named_atoms(aa_list_full, 'CA')
    cb_list = _cal_named_atoms(aa_list_full, 'CB')
    n_list = _cal_named_atoms(aa_list_full, 'N')
    c_list = _cal_named_atoms(aa_list_full, 'C')

    # three angles give the relative position of two residues ...
    angle = _cal_pair_angles(ca_list, cb_list, n_list, c_list)
    ca_num = len(ca_list)
    # ... six angles once both directions are concatenated
    angle_d = np.array([[angle[j][k] + angle[k][j] for k in range(ca_num)]
                        for j in range(ca_num)])

    # CA distances; distances that cannot be computed are set to 100
    flat = [None if (a is None or b is None) else a - b
            for a in ca_list for b in ca_list]
    ca_dist = np.array(flat).reshape(ca_num, ca_num)
    ca_dist[ca_dist == None] = 100  # noqa: E711 -- elementwise comparison

    # whether a CA atom is present
    mask = [0 if atom is None else 1 for atom in ca_list]

    ca_dist_cs = []
    angle_cs = []
    num_cs = []
    for j, row in enumerate(ca_dist):
        nearest = row.argsort()[1:17]
        ca_dist_cs.append(row[nearest])
        angle_cs.append(angle_d[j][nearest])
        num_cs.append(nearest)

    dic_r = {
        'dis': ca_dist_cs,
        'angle': angle_cs,
        'mask': mask,
        'ids': num_cs,
        'seq': seq_list,
        'dps': dps,
        'hsea': hse_a,
        'hseb': hse_b,
    }
    out_name = 'pdb_other_cb/' + pdbid[i].lower() + pdbchain[i] + '_all.npy'
    np.save(out_name, dic_r)
line=line.split('\t') if line[3]=="Chl": chl_list.append(line[0]) for microen in chl_list: # try: dirname=microen.split('_')[0] dirname=dirname.split('.')[1] structure = parser.get_structure('pdb','F:\microfolds_8_2018/new_all/'+dirname+'/'+microen+'.pdb') model = structure[0] res_no = 0 non_resi = 0 #in_file=open('/home/hraanan/MicrofoldsPDBs/ChlorophyllNewCenter/'+dirname+'/'+filename,'r') for model in structure: for residue in model.get_residues(): if PDB.is_aa(residue): res_no += 1 elif residue.resname in chl: non_resi += 1 # print ("Residues2: %i" % (res_no)) # print ("Other2: %i" % (non_resi)) ratio=res_no/non_resi out_file.write(microen+'\t'+str(res_no)+'\t'+str(non_resi)+'\t'+str(ratio)+'\n') out_file.close() print('end')
def main():
    """Extract per-chain sequences from a PDB file and write them as fasta.

    For every requested chain the sequence built from the resolved coordinates
    is compared with the matching SEQRES record. The SEQRES sequence is used
    unless the user picks the coordinate sequence interactively. When there is
    no SEQRES record for a chain, the coordinate sequence is used. With
    ``--resseq`` the residue sequence numbers of each chain are written too,
    one header line and one comma-separated line per chain.
    """
    # parse command line arguments
    parser = ArgumentParser()
    parser.add_argument('-p', '--pdb', dest='pdb', help='input PDB file')
    parser.add_argument('-c', '--chains', dest='chains',
                        help='chains to extract sequence for')
    parser.add_argument('-o', '--output', dest='output',
                        help='output fasta file')
    parser.add_argument('-r', '--resseq', dest='resseq',
                        help='residue seq number in the PDB file')
    parser.add_argument('-i', '--interactive', dest='interactive',
                        action='store_true',
                        help='select sequenceinteractively')
    args = parser.parse_args()
    print("input file: " + args.pdb)
    print("output file: " + args.output)
    print("chains: " + args.chains)

    # extract sequences from SEQRES records
    pdb_id = basename(args.pdb).split('.')[0]
    with open(args.pdb, 'rt') as f:
        seqres_sequences = list(SeqIO.parse(f, 'pdb-seqres'))

    # extract sequences from residues with resolved coordinates
    structure = PDB.PDBParser().get_structure(id=pdb_id, file=args.pdb)
    model = structure[0]
    peptide_builder = PDB.Polypeptide.PPBuilder()

    sequences = []
    resseq_ids = []
    for c in args.chains:
        residues = [r for r in model[c].get_residues() if PDB.is_aa(r)]
        resseq_ids.append([r.get_id()[1] for r in residues])
        peptides = peptide_builder.build_peptides(model[c])
        coord_sequence = peptides[0].get_sequence()
        print('Sequence for chain ' + c + ' extracted from coordinates:')
        print(coord_sequence, '\n')

        # Default to the coordinate sequence so that `sequence` is always bound
        # to this chain, even when no SEQRES record matches it.
        sequence = coord_sequence
        if len(seqres_sequences) == 0:
            # if the SEQRES records are missing from the PDB file, use sequence
            # from coordinates
            print('SEQRES records in the given PDB file are missing! Using sequence '
                  'extracted from coordinates.')
        else:
            matched = False
            for record in seqres_sequences:
                if c != record.id:
                    continue
                matched = True
                print('Sequence for chain ' + c + ' in SEQRES:')
                print(record.seq, '\n')

                # pairwise alignment between the two sequences
                print('Here is an alignment of the two sequences:')
                alignment = pairwise2.align.globalms(coord_sequence,
                                                     record.seq,
                                                     1, -0.5, -10, 0)
                print(pairwise2.format_alignment(*alignment[0]))

                sequence = record.seq
                if args.interactive:
                    # ask for which sequence to choose
                    s = input(
                        'Which sequence are you interested? 1 for sequence from coordinates '
                        '2 for sequence from SEQRES: ')
                    if int(s) == 1:
                        sequence = coord_sequence
            if not matched:
                print('No SEQRES record found for chain ' + c +
                      '; using sequence extracted from coordinates.')

        # append sequence for the current chain
        sequences.append(
            SeqRecord.SeqRecord(seq=sequence,
                                id=pdb_id.upper() + ':' + c,
                                description=''))

    # write sequences to a fasta file
    with open(args.output, 'wt') as f:
        SeqIO.write(sequences, f, 'fasta')

    # write resseq ids, one header line and one id line per chain
    if args.resseq is not None:
        with open(args.resseq, 'wt') as f:
            for i, resseq in enumerate(resseq_ids):
                f.write('> ' + args.chains[i] + '\n')
                f.write(','.join(str(j) for j in resseq) + '\n')
io.set_structure(s) # Remove disordered atoms, io.save(folder_name + file_name + "_ordered2.pdb", select=NotDisordered()) s = parser.get_structure("my_pdb", folder_name + file_name + "_ordered.pdb") io = PDBIO() io.set_structure(s) #Remove heteroatoms io.save(folder_name + file_name + "_ordered1.pdb", NonHetSelect()) s = parser.get_structure("my_pdb", folder_name + file_name + "_ordered.pdb") model = s[0] chain = model atoms = [a for a in chain.get_atoms() if pdb.is_aa(a.parent)] # Renumber residues to be sequential parents = [] counter = 0 for a in chain.get_atoms(): if pdb.is_aa(a.parent): parents.append(a.parent) counter = counter + 1 xyzs = [(a.coord) for a in atoms] xyzarr = np.array(xyzs) f = open(folder_name + file_name + '_ordered.pdb', 'w') id_counter = 0 # Write to PDB file for i in range(0, len(atoms)):
def align_for_modeller(option):
    """Align the query against the template structure and build one MODELLER model.

    Works entirely through files in the current directory (model.fasta,
    query.fasta, alignment.fasta, salign.ali, salign1.ali) and through names
    that are not defined in this function: ``structure``, ``ppb``,
    ``file_name``, ``env``, ``id_1`` and ``log``.

    :param option: forwarded as ``aa_only`` to ``ppb.build_peptides``
    :return: None
    """
    # After this loop last_chain / tnum_res describe the LAST chain iterated.
    for chain in structure.get_chains():
        last_chain = chain.id
        tnum_res = (len([_ for _ in chain.get_residues() if PDB.is_aa(_)]))
    # model.fasta is opened in append mode, so an existing file is extended.
    for pp in ppb.build_peptides(structure[0][str(last_chain)], aa_only=option):
        with open("model.fasta", "a") as f:
            sequence = pp.get_sequence()
            f.write(str(sequence))
    mdel = structure[0]
    chain = mdel[last_chain]
    res_list = PDB.Selection.unfold_entities(chain, "R")
    # NOTE(review): both values are the same on every iteration; the loop only
    # matters in that nothing is assigned when res_list is empty.
    for residue in res_list:
        nid_1aa = (res_list[0].get_id())[1]
        nid_lastaa = (res_list[tnum_res - 1].get_id())[1]
    # Modify files so they have only 1 identifier + seq (id_1 or query)#
    with open('model.fasta', 'r') as o:
        data = o.read()
    with open('model.fasta', 'w') as mo:
        mo.write(">" + file_name + "\n" + data)
    with open("query.fasta") as o2:
        line2 = o2.readlines()
    line2[0] = ">query\n"
    with open("query.fasta", "w") as m1:
        m1.writelines(line2)
    # Combine original sequence and first hit into a fasta input file for the alignment#
    filenames = ["query.fasta", "model.fasta"]
    with open('alignment.fasta', 'w+') as aligninput:
        for files in filenames:
            with open(files) as infile:
                aligninput.write(infile.read())
            aligninput.write("\n")
    # Profile-profile alignment using salign from modeller#
    log.none()
    aln = alignment(env, file='alignment.fasta', alignment_format='FASTA')
    aln.salign(rr_file='${LIB}/blosum62.sim.mat',
               gap_penalties_1d=(-500, 0),
               output='',
               align_block=15,
               align_what='PROFILE',
               alignment_type='PAIRWISE',
               comparison_type='PSSM',
               similarity_flag=True,
               substitution=True,
               smooth_prof_weight=10.0)
    aln.write(file='salign.ali', alignment_format='PIR')
    print("Alignment of template and query for modelling successfull.")
    # Fix formatting of the .ali file to specify the 1st and last aminoacid and the structure of the model protein#
    shutil.copyfile("salign.ali", "salign1.ali")
    with open("salign1.ali", "r") as file:
        filedata = file.read()
    # Rewrites the template's "sequence" header line into a "structureX" line
    # carrying first residue number, chain id and last residue number.
    replacement = filedata.replace(">P1;" + str(file_name) + "\nsequence:: : : : :::-1.00:-1.00", ">P1;" + str(file_name) + "\nstructureX" + ":" + str(file_name) + ":" + str(nid_1aa) + ":" + str(last_chain) + ":" + str(nid_lastaa) + ": ::::")
    with open("salign1.ali", "w+") as f:
        f.write(replacement)
    # Make a single model of the query sequence using: salign1.ali and model.pdb#
    a = automodel(env, alnfile="salign1.ali", knowns=id_1.casefold(), sequence="query")
    a.starting_model = 1
    a.ending_model = 1
    a.make()
    # Check how good the alignment is#
    pir_alignment = AlignIO.read("salign1.ali", "pir")
    total_length = len(pir_alignment[0])
    gaps_1 = 0
    gaps_2 = 0
    for aas in pir_alignment[0].seq:
        if aas == "-":
            gaps_1 += 1
    for aas in pir_alignment[1].seq:
        if aas == "-":
            gaps_2 += 1
    # Warn (and pause 3 s) when either aligned sequence is more than half gaps.
    if gaps_1 / total_length > 0.5 or gaps_2 / total_length > 0.5:
        print(
            "\nYour model protein covers less than half of the query. The created model could be inaccurate, "
            + "please check the result before continuing with anything else.\n")
        time.sleep(3)
def atoms(self):
    """Append the atoms of every standard amino acid residue to self.atom_list."""
    for residue in self.structure.get_residues():
        if not PDB.is_aa(residue, standard=True):
            continue
        self.atom_list.extend(residue.get_atoms())
break t=s.split()[0][:4] t2=s.split()[0][4:] pdbid.append(t) pdbchain.append(t2) for i in range(len(pdbid)): print(pdbid[i],pdbchain[i]) pdb_name='pdb_/pdb'+pdbid[i].lower()+'.ent' s = p.get_structure("1",pdb_name) s = s[0][pdbchain[i]] res_list = PDB.Selection.unfold_entities(s, 'R') aa_list = [] for a in res_list: if PDB.is_aa(a): aa_list.append(a) error=0 t=aa_list[0].get_id()[1] aa_list_full=[] for a in aa_list: while 1: if a.get_id()[1]<t: error=1 break if a.get_id()[1]==t: aa_list_full.append(a) t+=1 break else:
def is_no_aa_chain(chain):
    """
    Return True when none of the residues in the chain is an amino acid.
    """
    return not any(PDB.is_aa(residue) for residue in chain)
def get_aa_list(res_list):
    """Return the residues of res_list that PDB.is_aa accepts, in order."""
    amino_acids = []
    for residue in res_list:
        if PDB.is_aa(residue):
            amino_acids.append(residue)
    return amino_acids
def getPolygonalChain(PDBfilename,
                      outNumpy_b=0,
                      optionOut_b=False,
                      outputFile=''):
    '''Derives the polygonal chain representation of the protein from its PDB file.

    Only model 0 is read. Within each chain, amino acid residues with a blank
    hetero-flag contribute their CA position. Consecutive positions form the
    line segments. A chain whose id is blank is renamed to '>' in place.

    PDBfilename: path of the PDB file (also used as structure id)
    outNumpy_b: when equal to 1 the CA positions are stored as numpy arrays,
        otherwise as the vectors returned by get_vector()
    optionOut_b: when true the segments are also written to outputFile
    outputFile: path of the text file written when optionOut_b is true

    Output: (CaChain, polyCaChain), two dicts keyed by chain id holding the CA
    positions and the [previous CA, CA] segments ordered along the chain.'''
    CaChain = {}
    polyCaChain = {}
    structure = parser.get_structure(PDBfilename, PDBfilename)
    # we trust that the top100 files all have crystallograhic content for
    # modelId = 0:
    model = structure[0]
    as_numpy = outNumpy_b == 1

    # first loop though the chains; for each residue we store the CA info:
    cntCA = 0
    vPrev = None
    for chain in model:
        print("Reading chain: %s" % chain)
        if chain.id.isspace():
            chain.id = '>'
            print("Chain id is blank and gets subst with: %s" % chain.id)
        for residue in chain:
            # residue.id[0] is the hetero-flag; if not blank the residue will
            # contain hetero-atoms (HETATM in pdb format)
            if not PDB.is_aa(residue) or residue.id[0] != ' ':
                continue
            # we only add chain as key if there is an aa in the chain:
            if chain.id not in CaChain:
                print("Recording a new chain id in the structure: %s" %
                      chain.id)
                CaChain[chain.id] = []
                polyCaChain[chain.id] = []
                cntCA = 0
            vCA = residue['CA'].get_vector()
            if as_numpy:
                vCA = np.array(vCA)
            CaChain[chain.id].append(vCA)
            if cntCA > 0:
                polyCaChain[chain.id].append([vPrev, vCA])
            vPrev = vCA
            cntCA += 1

    if optionOut_b:
        with open(outputFile, 'w') as of:
            for k in polyCaChain:
                # NOTE(review): the header has no trailing newline, so the
                # first segment follows it on the same line.
                of.write('file: ' + PDBfilename + ';' + 'Chain: ' + k)
                for start, end in polyCaChain[k]:
                    x0, y0, z0 = start
                    x1, y1, z1 = end
                    of.write(';'.join(
                        str(coord)
                        for coord in (x0, y0, z0, x1, y1, z1)) + '\n')

    return CaChain, polyCaChain
def one_entry(entry, radius, out_path, sen_path, plot_path, prefix):
    """Add ligand distances to one sensitivity table and run the comparison.

    For the PDB/chain named in ``entry``, the minimal atom-atom distance of
    every amino acid residue to any of the listed ligands is computed and
    aligned to the 'AA' sequence of the sensitivity table. It is stored there
    as column 'd_ligand' and the table is written to ``out_path``. Then
    ``separate_values`` and ``plot_and_compare`` are run and their results are
    stored in ``entry``. The PDB directory is read from the module-level
    ``pdb_path``.

    :param entry: row with 'PDB', 'Chain', 'GO', 'Name', 'Ligand', 'LigID' and
        'Organism'; gains 'p', 'stat', 'nc' and 'nd'
    :param radius: forwarded to ``separate_values``
    :param out_path: directory for the extended sensitivity table
    :param sen_path: directory of the input sensitivity tables
    :param plot_path: forwarded to ``plot_and_compare``
    :param prefix: forwarded to ``separate_values``
    :return: ``(head, entry, zero_distance_below_threshold)``; on failure
        ``head`` is one of '!LigID', '!Senstivity file', '!PDB file', '!GO' and
        the third element is None
    """
    print(entry)
    pdbid = entry.loc['PDB']
    chain = entry.loc['Chain']
    go = entry.loc['GO']
    name = entry.loc['Name']
    ligand = entry.loc['Ligand']
    try:
        lig_ids = entry.loc['LigID'].split(',')
    except AttributeError:
        print('LigID not given for {}_{}'.format(pdbid, chain))
        return '!LigID', entry, None

    # call parser:
    parser = pdb.PDBParser()

    # get data
    sensitivity_file = os.path.join(
        sen_path, 'masked_{}_{}_1.txt'.format(pdbid, chain))
    print('Reading {}'.format(sensitivity_file))
    try:
        md_df = pd.read_csv(sensitivity_file, sep='\t', index_col=0)
    except FileNotFoundError:
        print('File not found: {}'.format(sensitivity_file))
        return '!Senstivity file', entry, None

    pdb_file = os.path.join(pdb_path, '{}.pdb'.format(pdbid))
    print('Using PDB-File {}'.format(pdb_file))
    try:
        struc = parser.get_structure(id='{}_{}'.format(pdbid, chain),
                                     file=pdb_file)
    except FileNotFoundError:
        print('File not found: {}'.format(pdb_file))
        return '!PDB file', entry, None

    # select ligands: try a blank hetero-flag first, then 'H_<name>'
    ligs = []
    for lig_id in lig_ids:
        lig_chain, lig_name, lig_resid = lig_id.split('/')
        try:
            lig = struc[0][lig_chain][' ', int(lig_resid), ' ']
        except KeyError:
            lig_name = 'H_{}'.format(lig_name)
            lig = struc[0][lig_chain][lig_name, int(lig_resid), ' ']
        print(lig)
        ligs.append(lig)

    # calculate each residue's distance to the ligand (min instead of mean)
    seq = ''.join(md_df['AA'][1:])
    distances = []
    seq_matched = ''
    for res in struc[0][chain].get_residues():  # move along the protein chain
        if not pdb.is_aa(res):
            continue
        try:
            aa = three2single[res.get_resname()]
        except KeyError:
            continue
        seq_matched += aa
        distances.append(
            min(lig_at - res_at
                for lig in ligs
                for lig_at in lig.get_atoms()
                for res_at in res.get_atoms()))

    alignment_obj = pairwise2.align.globalxx(seq, seq_matched,
                                             one_alignment_only=True)[0]
    aligned_md, aligned_pdb = alignment_obj[:2]

    # One value per MD residue: NaN where the PDB sequence has a gap.
    aligned_distances = []
    current_pos = 0
    for aa_md, aa_pdb in zip(aligned_md, aligned_pdb):
        if aa_pdb == '-':
            if aa_md != '-':
                aligned_distances.append(float('nan'))
            continue
        if aa_md != '-':
            aligned_distances.append(distances[current_pos])
        # Every PDB residue consumes one distance, also opposite an MD gap;
        # otherwise all later distances would be shifted by one.
        current_pos += 1

    md_df['d_ligand'] = [0.0] + aligned_distances
    md_df.to_csv(os.path.join(out_path,
                              'masked_{}_{}_1.txt'.format(pdbid, chain)),
                 sep='\t')
    # this is saved now

    # rest of the calculation
    try:
        c, d, zero_distance_idxes = separate_values(md_df, go, radius, prefix)
    except IndexError:  # (KeyError, IndexError):
        print('GO not in data for {}: {} not in {}'.format(
            go, md_df.columns, pdbid))
        return '!GO', entry, None

    stat, p, zero_distance_below_threshold = plot_and_compare(
        c, d, zero_distance_idxes, plot_path, pdbid, chain, go, name, ligand)
    print(
        '{} {} - {:.1e}, {:.2f} \t-> {}, zero distance values below percentile: {}'
        .format(pdbid, chain, p, stat, ligand, zero_distance_below_threshold))

    entry.loc['p'] = p
    entry.loc['stat'] = stat
    entry.loc['nc'] = len(c)
    entry.loc['nd'] = len(d)

    org = entry['Organism']
    name = entry['Name']
    lig = entry['Ligand']
    head = '{} {} - {}\n{} {}, {}, p = {:.3f}, t = {:.2f}\n\n'.format(
        org, name, lig, pdbid, chain, go, p, stat)
    return head, entry, zero_distance_below_threshold
warnings.simplefilter('ignore', BiopythonWarning) parser = PDBParser(PERMISSIVE=1) structure_id = pdbid filename = pdbid + ".pdb" structure = parser.get_structure(structure_id, filename) ''' model=structure[0] chain=model["A"] print(len(model.get_list())) ''' print(structure.get_full_id()) for chain in structure.get_chains(): i = 0 for temp in chain.get_residues(): if PDB.is_aa(temp): i += 1 print(chain, " length : ", i) #length of each chain ''' print("\nfor the chain : ",end='') print(chain) residues=chain.get_residues() #print all residues of a chain for res in residues: print("residue name : ",end='') print(res.get_resname(),end='') if PDB.is_aa(res): print(" amino acid") else: print(" not amino acid") print("atoms and their coordinates in the residue : ") atoms=res.get_atom()
def parse_pdb_length(name):
    """Return the number of amino acid residues in the first chain of a PDB file.

    The file is looked up under the module-level ``organism`` directory.
    Residues missing from the file are not counted.
    """
    pdb_file = "../../../0-identify_structure/2-get_pdb_chain/{0}/{1}.pdb".format(
        organism, name)
    structure = PDBParser().get_structure(name, pdb_file)
    first_chain = list(structure.get_chains())[0]  # only 1 chain present
    return sum(1 for residue in first_chain.get_residues()
               if PDB.is_aa(residue))