def pdb_neighbors(pdb_f, pdb_id):
    """Collect residues with N/O atoms close to phosphorylated residues.

    Every PTR, SEP and TPO residue of the structure parsed from ``pdb_f`` is
    used as a centre.  Atoms within ``BOND_CUTOFF`` of its phosphate oxygens
    (and of the side-chain oxygen listed for that residue type) are looked
    up, reduced to those whose name contains ``N`` or ``O``, and mapped to
    their residues.

    Returns a list of ``(pdb_id, centre_label, neighbour_labels)`` tuples,
    one per centre that has at least one neighbouring residue.  Labels have
    the form ``RESNAME_resseq_chain``.
    """
    # Atom names used as search origins, per phospho-residue type.
    query_names = {
        'PTR': ['O1P', 'O2P', 'O3P', 'OH'],
        'SEP': ['O1P', 'O2P', 'O3P', 'OG'],
        'TPO': ['O1P', 'O2P', 'O3P', 'OG1'],
    }

    def label(residue):
        return (residue.get_resname() + '_' + str(residue.get_id()[1]) +
                '_' + residue.get_parent().get_id())

    structure = PDBParser().get_structure(pdb_id, pdb_f)
    ns = NeighborSearch(Selection.unfold_entities(structure, 'A'))
    centres = [
        residue for residue in structure.get_residues()
        if residue.get_resname() in query_names
    ]

    neighbors = []
    for res in centres:
        wanted = query_names[res.get_resname()]
        origins = [a for a in res.child_list if a.get_name() in wanted]

        # Union of all atoms found around any origin atom.
        found = list({
            hit
            for origin in origins
            for hit in ns.search(origin.get_coord(), BOND_CUTOFF)
        })
        polar = [
            a for a in found
            if 'N' in a.get_name() or 'O' in a.get_name()
        ]
        nearby = [
            r for r in set(Selection.unfold_entities(polar, 'R'))
            if r != res
        ]
        if nearby:
            neighbors.append((pdb_id, label(res), [label(r) for r in nearby]))
    return neighbors
class Protein(object):
    """Residue numbers and contact atoms of one PDB file.

    The file is looked up as ``<pdbs_dir>/<organism>/<pdb>.pdb`` where
    ``organism`` is a module-level name.  ``residues`` holds the residue
    sequence numbers in order of first appearance; ``atoms`` holds whatever
    ``atoms_method(contact_defn, residue)`` yields for each of them.
    """

    def __init__(self, pdb, contact_defn):
        self.pdb = pdb
        self.contact_defn = contact_defn
        pdbs_dir = '../../../0-identify_structure/2-get_pdb_chain'
        pdb_path = "{0}/{1}/{2}.pdb".format(pdbs_dir, organism, pdb)
        self.structure = PDBParser().get_structure('X', pdb_path)
        self.residues = []  # only consider actual residues
        self.atoms = []
        self.parse_structure()

    def parse_structure(self):
        """Fill ``residues`` and ``atoms`` from the parsed structure."""
        for residue in self.structure.get_residues():
            # Standard and non-standard amino acids are both accepted here.
            if not PDB.is_aa(residue):
                continue
            number = residue.id[1]
            # A residue number seen before is skipped, so mutated residues
            # are not counted twice.
            if number in self.residues:
                continue
            self.residues.append(number)
            self.atoms.extend(atoms_method(self.contact_defn, residue))

    def get_residues(self):
        """Return the list of residue numbers."""
        return self.residues

    def get_max_residue(self):
        """Return the largest residue number plus one."""
        return max(self.residues) + 1
class Protein(object):
    """Obtain protein sequence and structure information from a PDB file.

    Attributes:
        structure: the parsed Biopython structure
        residues (list of int): PDB residue numbers, in order of first
            appearance
        d_sequence (dict of {int: str}): connects PDB residue number with
            its amino-acid type (one-letter code)
    """

    def __init__(self, pdb_path):
        self.structure = PDBParser().get_structure("", pdb_path)
        self.residues = []
        self.d_sequence = {}
        self.parse_structure()

    def parse_structure(self):
        """Record number and one-letter code of each standard residue."""
        for residue in self.structure.get_residues():
            # only consider standard 20 residues
            if not PDB.is_aa(residue, standard=True):
                continue
            number = residue.id[1]
            # dont doublecount mutated residues (ex. 1ORC)
            if number in self.residues:
                continue
            self.residues.append(number)
            three_letter = Residue.Residue.get_resname(residue)
            self.d_sequence[number] = Polypeptide.three_to_one(three_letter)

    def get_residues_sequence(self):
        """Return ``(residues, d_sequence)``."""
        return self.residues, self.d_sequence
def extract_seq_from_pdb(pedxxxx, ensemble, confomer, chain):
    """Return the one-letter sequence of a conformer PDB file.

    The file name is built as ``<pedxxxx>_<ensemble>-<confomer>_<chain>.pdb``.
    Only residues whose hetero flag is blank (``residue.id[0] == " "``) are
    included; their three-letter names are concatenated and converted with
    ``seq1``.

    The third parameter keeps its historical spelling ``confomer`` so that
    keyword callers are not broken.  The body previously read the differently
    spelled name ``conformer``, which is not a parameter of this function.
    """
    pdb = "%s_%s-%s_%s.pdb" % (pedxxxx, ensemble, confomer, chain)
    structure = PDBParser().get_structure("pdb", pdb)
    three_letter = ''.join(
        residue.get_resname()
        for residue in structure.get_residues()
        if residue.id[0] == " "
    )
    return seq1(three_letter)
class IterationTests(unittest.TestCase):
    """Exercise the structure-level iterators on ``PDB/a_structure.pdb``."""

    def setUp(self):
        parser = PDBParser(PERMISSIVE=True)
        self.struc = parser.get_structure('X', "PDB/a_structure.pdb")

    def test_get_chains(self):
        """Yields chains from different models separately."""
        chain_ids = []
        for chain in self.struc.get_chains():
            chain_ids.append(chain.id)
        self.assertEqual(chain_ids, ['A', 'A', 'B', ' '])

    def test_get_residues(self):
        """Yields all residues from all models."""
        residue_ids = []
        for resi in self.struc.get_residues():
            residue_ids.append(resi.id)
        self.assertEqual(len(residue_ids), 167)

    def test_get_atoms(self):
        """Yields all atoms from the structure, excluding duplicates and ALTLOCs which are not parsed."""
        atom_labels = []
        for atom in self.struc.get_atoms():
            atom_labels.append("%12s" % str((atom.id, atom.altloc)))
        self.assertEqual(len(atom_labels), 756)
def get_residues_df(pdb_id, data_path):
    """Build a per-residue table for one PDB file.

    The file read is ``data_path + pdb_id + '.pdb'``.  Each amino-acid
    residue is passed to ``process_residue``; a ``str`` result causes the
    residue to be skipped, otherwise the result is indexed by the column
    names below.

    Returns a DataFrame with columns ``residue_number``, ``amino_acid``,
    ``chain``, ``CA_coords``, ``CB_coords`` and ``SCcenter_coords``.
    """
    pdb_file = data_path + pdb_id + '.pdb'
    structure = PDBParser().get_structure(pdb_id, pdb_file)

    cols = [
        'residue_number', 'amino_acid', 'chain', 'CA_coords', 'CB_coords',
        'SCcenter_coords'
    ]
    rows = []
    for residue in structure.get_residues():
        if not is_aa(residue):
            continue
        record = process_residue(residue)
        # isinstance also recognises str subclasses, unlike `type(...) == str`.
        if isinstance(record, str):
            continue
        rows.append([record[col] for col in cols])
    return pd.DataFrame(rows, columns=cols)
def test_maxent_from_contacts():
    """Rebuild 1ptq from its own contacts and compare with the reference.

    The reconstruction is superimposed on the reference structure and the
    RMSD is required to be below 0.15.
    """
    code = '1ptq'
    fileName = testFilePath + code + '.pdb'
    refStructure = PDBParser().get_structure(code, fileName)

    # NOTE remove all unneeded residues to make sure, atom naming is consistent
    hetero_residues = [
        res for res in refStructure.get_residues() if res.get_id()[0] != ' '
    ]
    for res in hetero_residues:
        refStructure[0]['A'].detach_child(res.get_id())

    contacts = get_contacts(refStructure[0], cutOff=5., minSeqDist=0)

    with open(fileName, 'r') as handle:
        sequences = [rec.seq for rec in SeqIO.parse(handle, "pdb-seqres")]

    # Residue ids of the regular (blank hetero flag) residues, per chain.
    residue_ids = []
    for chain in refStructure[0]:
        residue_ids.append(
            [res.get_id() for res in chain if res.get_id()[0] == ' '])

    ds = Distructure('test', sequences, residue_ids)
    ds.generate_primary_contacts()
    ds.set_tertiary_contacts(contacts)
    ds.run()

    sup = Superimposer()
    sup.set_structures(refStructure, ds)
    RMSD = sup.rms

    # TODO deterministic initialize
    print("this test sometimes fails depending on initialization.")
    print("test with deterministic initialization is coming.")
    assert RMSD < 0.15
def test_RNA():
    """Rebuild the RNA structure 2gis from its contacts; RMSD must be < 6."""
    code = "2gis"
    fileName = testFilePath + code + '.pdb'
    refStructure = PDBParser().get_structure(code, fileName)

    # Drop every residue whose hetero flag is not blank.
    hetero_residues = [
        res for res in refStructure.get_residues() if res.get_id()[0] != ' '
    ]
    for res in hetero_residues:
        refStructure[0]['A'].detach_child(res.get_id())

    contacts = get_contacts(refStructure[0], cutOff=6., minSeqDist=0)

    # One sequence per chain, built from the stripped residue names.
    sequences = [
        Seq(
            ''.join([
                res.get_resname().strip()
                for res in chain
                if res.get_id()[0] == ' '
            ]),
            unambiguous_rna,
        )
        for chain in refStructure[0]
    ]

    ds = Distructure('test', sequences)
    ds.generate_primary_contacts()
    ds.set_tertiary_contacts(contacts)
    ds.run()

    sup = Superimposer()
    sup.set_structures(refStructure, ds)
    RMSD = sup.rms

    # TODO deterministic initialize
    print("this test sometimes fails depending on initialization.")
    print("test with deterministic initialization is coming.")
    assert RMSD < 6.
def get_residues_df(pdb_id, path):
    """Build a per-residue table for one PDB file.

    The file read is ``path + pdb_id + '.pdb'``.  Each amino-acid residue is
    passed to ``process_residue``; a ``str`` result causes the residue to be
    skipped, otherwise the result is indexed by the column names below.
    Non-amino-acid residues are ignored.

    Returns a DataFrame with columns ``residue_number``, ``amino_acid``,
    ``chain``, ``CA_coords``, ``CB_coords`` and ``SCcenter_coords``.
    """
    pdb_file = path + pdb_id + '.pdb'
    structure = PDBParser().get_structure(pdb_id, pdb_file)

    cols = [
        'residue_number', 'amino_acid', 'chain', 'CA_coords', 'CB_coords',
        'SCcenter_coords'
    ]
    rows = []
    for residue in structure.get_residues():
        if not is_aa(residue):
            continue
        record = process_residue(residue)
        # isinstance also recognises str subclasses, unlike `type(...) == str`.
        if isinstance(record, str):
            continue
        rows.append([record[col] for col in cols])
    return pd.DataFrame(rows, columns=cols)
class MDTensor():
    """Stack of adjacency matrices, one per file of a folder.

    On construction every file in ``folder`` (sorted by name) is turned into
    a network with the given ``cutoff`` and its adjacency matrix; the
    matrices are stacked along a new last axis into ``self.tensor``.
    ``self.id2name`` maps the node labels of the first network to labels of
    the form ``<one-letter code><node[1:]>:<node[0]>``.
    """

    def __init__(self, folder, cutoff=5):
        self.folder = folder
        self.cutoff = cutoff
        self.three2one = three2one
        self.one2three = one2three
        self.id2name = None
        self.create_tensor()

    def create_id2name(self, pdb):
        """Build ``id2name`` from the network and first model of ``pdb``."""
        network = bg.Pmolecule(pdb).network(cutoff=self.cutoff)
        self.structure = PDBParser().get_structure('X', pdb)[0]
        one_letter = [
            self.three2one[res.resname]
            for res in self.structure.get_residues()
        ]
        node_ids = network.nodes
        # Residues and nodes are paired positionally.
        self.id2name = {
            node: aa + node[1:] + ':' + node[0]
            for aa, node in zip(one_letter, node_ids)
        }

    def create_tensor(self):
        """Compute one adjacency matrix per file and stack them."""
        adjacency = [
            self.create_adj(jn(self.folder, name))
            for name in tqdm(sorted(listdir(self.folder)))
        ]
        self.tensor = np.stack(adjacency, axis=-1)

    def create_adj(self, pdb):
        """Return the adjacency matrix of the network built from ``pdb``."""
        if not self.id2name:
            self.create_id2name(pdb)
        network = bg.Pmolecule(pdb).network(cutoff=self.cutoff)
        return nx.to_numpy_array(network)

    def save_tensor(self, path):
        """Save the tensor with ``np.save``."""
        np.save(path, self.tensor)
class Family(object):
    '''A class that compiles information about a protein structure and
    related sequences. This information is meant to be sufficient to filter
    the residues and calculate an ez-beta moment from those that remain.

    Attributes:
        stru_name: Name of the structure
        stru_path: Path of the PDB format structure file
        stru: the structure, as a Biopython entity
        msa: a dictionary mapping sequence identifiers to rows in a
            multiple sequence alignment
        template_seq: the row of the MSA containing the sequence of the
            structure
        res_to_pos: a dictionary mapping residues from structures to their
            column number in the MSA (assuming the first column is
            numbered 0)
        dssp: a Biopython DSSP object with a DSSP for the structure
        calc: an Ez-beta calculator'''

    def __init__(self, stru_name, stru_path, msa_path, template_name,
                 param_path):
        '''Requires a name for the structure (your choice), a path to a PDB
        format structure file, a path to a multiple sequence alignment
        containing a row with exactly the same sequence as the structure,
        the sequence identifier of this row, and a path to a CSV file of
        Ez-beta parameters (see zenergy.Calculator for how to make these
        files)'''
        self.stru_name = stru_name
        self.stru_path = stru_path

        # Parser warnings are silenced for this one call only.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.stru = PDBParser().get_structure(stru_name, stru_path)

        # When Daniel created the aligned structures, he removed heteroatoms
        # (though the procedure he used seems to have removed anything
        # without the residue identifier of one of the 20 standard amino
        # acids, leading to main chain selenomethionines being removed from
        # the 1FEP structure). However, he also added a sort of box of
        # water atoms (perhaps as a visual aid, so you can tell how the
        # coordinate system is defined?)
        # Therefore, remove all waters
        waters = [i for i in self.stru.get_residues()
                  if i.get_resname() == 'HOH']
        for chain in self.stru.get_chains():
            for water in waters:
                try:
                    chain.detach_child(water.get_id())
                except KeyError:
                    # Maybe it's not in this chain
                    pass

        # The alignment is read completely here, so the file can be closed
        # as soon as the call returns.
        with open(msa_path) as msa_file:
            msa = Bio.AlignIO.read(msa_file, 'clustal')
        self.msa = dict((seq.id, seq) for seq in msa)
        self.template_seq = self.msa[template_name]
        self.res_to_pos = map_res_to_pos(self.stru.get_residues(),
                                         self.template_seq)
        self.dssp = DSSP_win.DSSP(self.stru.child_dict[0], stru_path)

        # NOTE(review): this handle is intentionally left as it was. Whether
        # zenergy.Calculator consumes the reader inside its constructor is
        # not visible from here; if it does, wrap this in `with` as well.
        params = csv.reader(open(param_path, 'rb'))
        self.calc = zenergy.Calculator(params)
def create_fragment(self, fragment_file_name):
    """Store one interacting-fragment PDB file in the database.

    The file name (without ``.pdb``) is split on ``_``.  Tokens 0 and 1 form
    the generic number, token 2 is the residue name, token 3 the protein
    entry name, token 4 the PDB code, and tokens 5 (and 6, if present) form
    the interaction feature.

    Steps: look up the structure, get/create the interaction type, create a
    rotamer for the amino-acid residue and a fragment from the remaining
    residues, get/create the structure-ligand interaction and finally the
    residue-fragment interaction.  Failures are logged on ``self.logger``;
    nothing is raised for database errors and nothing is returned.

    File names without a feature token used to crash with an unbound-local
    error inside an ``except`` handler; they are now skipped with a warning.
    """
    fragment_name = fragment_file_name.strip().replace('.pdb', '')
    tokens = fragment_name.split('_')

    if len(tokens) == 7:
        feature = '_'.join([tokens[5], tokens[6]])
    elif len(tokens) == 6:
        feature = tokens[5]
    else:
        # Without a feature no interaction type can be resolved, so there is
        # nothing useful to store for this file.
        self.logger.warning(
            'Cannot determine the interaction type from the file name {}. '
            'Skipping the fragment'.format(fragment_name))
        return
    pdb_code = tokens[4]

    # Checking if the crystal is in the database
    try:
        s = Structure.objects.get(pdb_code__index=pdb_code)
    except Structure.DoesNotExist:
        self.logger.warning(
            'Cannot find the structure {} in the database. Skipping the fragment {}'
            .format(pdb_code, fragment_name))
        return

    # ResidueFragmentInteractionType
    i = None
    try:
        i, created = ResidueFragmentInteractionType.objects.get_or_create(
            slug=feature, name=self.interactions[feature])
    except Exception:
        self.logger.info(
            "Failed to find or create feature {}...".format(feature))

    # Rotamer and Fragment
    rot = None
    f = None
    try:
        fragment_struct = PDBParser(PERMISSIVE=True).get_structure(
            'frag', os.sep.join([self.fragments_dir, fragment_file_name]))[0]
        fragment_pdb_data = ''
        r = None
        for residue in fragment_struct.get_residues():
            hetfield, resseq, icode = residue.get_id()
            if hetfield == ' ':  # Amino acid
                try:
                    r = Residue.objects.get(
                        sequence_number=int(resseq),
                        amino_acid=polypeptide.three_to_one(residue.resname),
                        protein_conformation=s.protein_conformation)
                    d, created = PdbData.objects.get_or_create(
                        pdb=extract_pdb_data(residue))
                    rot, created = Rotamer.objects.get_or_create(
                        residue=r, structure=s, pdbdata=d)
                except Exception as msg:
                    self.logger.error(
                        'Failed to add rotamer {}:{}{}\n'.format(
                            pdb_code, resseq, msg))
                    return
            else:
                fragment_pdb_data += extract_pdb_data(residue)
        try:
            fd, created = PdbData.objects.get_or_create(
                pdb=fragment_pdb_data)
            # Taking the first ligand from the list, since existing
            # fragments do not contain the ligand info
            f, created = Fragment.objects.get_or_create(
                residue=r, ligand=s.ligands.all()[0], structure=s,
                pdbdata=fd)
        except Exception as msg:
            self.logger.error('Failed to add fragment {}\n{}'.format(
                fragment_file_name, msg))
    except Exception as msg:
        self.logger.error('Failed to add fragment {} to the db\n{}'.format(
            fragment_file_name, msg))

    # StructureLigandInteraction
    sli = None
    try:
        lr, created = LigandRole.objects.get_or_create(name='unknown',
                                                       slug='unknown')
        sli, created = StructureLigandInteraction.objects.get_or_create(
            structure=s, ligand=s.ligands.all()[0], ligand_role=lr)
    except Exception as msg:
        self.logger.error("Failed to add fragment {} to the db\n{}".format(
            fragment_file_name, msg))

    # Any earlier step that failed leaves its result as None.  Report which
    # ones are missing instead of relying on an unbound-local error being
    # swallowed by the handler below.
    missing = [
        label for label, value in (
            ('interaction type', i),
            ('rotamer', rot),
            ('fragment', f),
            ('structure-ligand interaction', sli),
        ) if value is None
    ]
    if missing:
        self.logger.error("Failed to add fragment {} to the db\n{}".format(
            fragment_file_name, 'missing ' + ', '.join(missing)))
        return

    try:
        rfi, created = ResidueFragmentInteraction.objects.get_or_create(
            structure_ligand_pair=sli, rotamer=rot, fragment=f,
            interaction_type=i)
        self.logger.info(
            "Successfully added interacting fragment {}".format(
                fragment_file_name))
    except Exception as msg:
        self.logger.error("Failed to add fragment {} to the db\n{}".format(
            fragment_file_name, msg))
def pdb_neighbors(pdb_f, pdb_id):
    """Collect standard residues that may donate to phosphorylated residues.

    Every PTR, SEP and TPO residue of the structure parsed from ``pdb_f`` is
    used as a centre.  Around its phosphate oxygens (and the side-chain
    oxygen listed for that residue type) two searches are run: all atoms
    within ``BOND_CUTOFF``, and the listed side-chain nitrogens within
    ``POSITIVE_BOND_CUTOFF``.  The union is then filtered (see
    ``keep_atom``) and mapped to residues.  If any remaining atom belongs to
    a non-standard residue, the centre is dropped entirely.

    Returns a list of ``(pdb_id, centre_label, neighbour_labels)`` tuples
    with labels of the form ``RESNAME_resseq_chain``.
    """
    query_names = {
        'PTR': ['O1P', 'O2P', 'O3P', 'OH'],
        'SEP': ['O1P', 'O2P', 'O3P', 'OG'],
        'TPO': ['O1P', 'O2P', 'O3P', 'OG1'],
    }
    positive_names = ['NE2', 'ND1', 'NZ', 'NE', 'NH2', 'NH1']
    STAND_RES = [
        'VAL', 'ILE', 'LEU', 'GLU', 'GLN', 'ASP', 'ASN', 'HIS', 'TRP', 'PHE',
        'TYR', 'ARG', 'LYS', 'SER', 'THR', 'MET', 'ALA', 'GLY', 'PRO', 'CYS'
    ]

    def label(residue):
        return (residue.get_resname() + '_' + str(residue.get_id()[1]) +
                '_' + residue.get_parent().get_id())

    def keep_atom(atom, centre):
        """Apply the per-atom filters to one candidate."""
        owner = atom.get_parent()
        name = atom.get_name()
        resname = owner.get_resname()
        if owner == centre:
            return False  # filter self
        if not ('N' in name or 'O' in name):
            return False  # only consider those containing N or O
        if resname == 'HOH':
            return False  # ignore water
        if name == 'O':
            return False  # main-chain O is not a donor
        # The amide O of Asn / Gln is not a donor either.  The analogous
        # filters for Asp/Glu oxygens, same-chain main-chain N and same-chain
        # residues are disabled in the original and stay disabled.
        if name == 'OD1' and resname == 'ASN':
            return False
        if name == 'OE1' and resname == 'GLN':
            return False
        return True

    structure = PDBParser().get_structure(pdb_id, pdb_f)
    ns = NeighborSearch(Selection.unfold_entities(structure, 'A'))
    centres = [
        residue for residue in structure.get_residues()
        if residue.get_resname() in query_names
    ]

    neighbors = []
    for res in centres:
        wanted = query_names[res.get_resname()]
        central_atoms = [a for a in res.child_list if a.get_name() in wanted]

        close = [
            hit
            for origin in central_atoms
            for hit in ns.search(origin.get_coord(), BOND_CUTOFF)
        ]
        positive = [
            hit
            for origin in central_atoms
            for hit in ns.search(origin.get_coord(), POSITIVE_BOND_CUTOFF)
            if hit.get_name() in positive_names
        ]
        candidates = [
            atom for atom in list(set(close + positive))
            if keep_atom(atom, res)
        ]

        # filter non-standard residues: one of them discards all candidates
        if any(atom.get_parent().get_resname() not in STAND_RES
               for atom in candidates):
            candidates = []

        nearby = [
            r for r in set(Selection.unfold_entities(candidates, 'R'))
            if r != res
        ]
        if nearby:
            neighbors.append((pdb_id, label(res), [label(r) for r in nearby]))
    return neighbors
class Blueprint:
    """Blueprint of a protein, grouped into secondary-structure segments.

    A blueprint row is a list ``[index, aa, ss, action]``.  Consecutive rows
    whose ``ss`` starts with the same letter (``H``, ``E`` or ``L``) form a
    segment named ``<letter><count>``, e.g. ``H3`` for the third helix.

    Attributes set by the constructor (when the matching input is given):
        structure: Biopython structure parsed from ``pdbfile``, or the
            ``structure`` argument as passed.
        segments: list of segments in blueprint order.
        segment_dict: segment id -> segment.
        bp_data: all blueprint rows, in order.
        sspairs: tuples parsed from an ``SSPAIR`` line, only present when
            the blueprint file contains one.
    """

    def __init__(self,
                 blueprint_file=None,
                 pdbfile=None,
                 structure=None,
                 segments=None,
                 data=None):
        """Build the blueprint from segments, a blueprint file or row data.

        ``data`` takes precedence over ``blueprint_file`` when both are
        given.  If a structure is available its residues are handed to the
        segments in blueprint order.
        """
        if pdbfile:
            self.structure = PDBParser().get_structure(pdbfile, pdbfile)
        else:
            self.structure = structure

        if segments:
            self.segments = segments
            self.bp_data = []
            self.segment_dict = {}
            for seg in segments:
                self.bp_data += seg.bp_data
                self.segment_dict[seg.id] = seg

        if blueprint_file and not data:
            # Read the residue rows of the blueprint file.  FOLDINFO,
            # HSSTRIPLET, HSSTRIAD and HHPAIR registers and '#' comments are
            # recognised but not retained.
            row_pattern = re.compile(r'^\s*(\d+)\s+(\w+)\s+(\w+)\s+(.+)')
            sspair_pattern = re.compile(r"(\d+)-(\d+).(\w).([-]?\d+)")
            skipped_prefixes = ('FOLDINFO', 'HSSTRIPLET', 'HSSTRIAD', 'HHPAIR')
            data = []
            with open(blueprint_file) as handle:
                for line in handle:
                    if not line.strip():
                        # A blank line carries no row and would otherwise
                        # fail the field extraction below.
                        continue
                    if line.startswith('SSPAIR'):
                        self.sspairs = sspair_pattern.findall(line)
                    elif line.startswith(skipped_prefixes) or line[0] == '#':
                        continue
                    else:
                        fields = row_pattern.split(line)
                        data.append(
                            [int(fields[1]), fields[2], fields[3], fields[4]])

        if blueprint_file or data:
            # Group the rows by secondary-structure letter and initialise
            # the segments, taking residues from the structure if available.
            self.segments = []
            self.bp_data = []
            self.segment_dict = {}
            res_index = 0
            segment_count = {'L': 1, 'H': 1, 'E': 1}
            residues = list(
                self.structure.get_residues()) if self.structure else None
            for sstype, group in groupby(data, key=lambda row: row[2][0]):
                resdata = list(group)
                self.bp_data += resdata
                seg_id = sstype + str(segment_count[sstype])
                segment_count[sstype] += 1
                if self.structure:
                    # Indexing (not slicing) keeps the IndexError when the
                    # structure has fewer residues than the blueprint.
                    segment_residues = [
                        residues[res_index + offset]
                        for offset in range(len(resdata))
                    ]
                    res_index += len(resdata)
                    seg = Segment(seg_id, sstype, resdata, segment_residues)
                else:
                    seg = Segment(seg_id, sstype, resdata)
                self.segments.append(seg)
                self.segment_dict[seg_id] = seg
            # Fold info and HSS triplets are not stored on the blueprint;
            # dump_blueprint takes them as header lines instead.

    def topology(self):
        """Return the segment ids joined by '-', e.g. ``L1-H1-L2``."""
        return '-'.join([s.id for s in self.segments])

    def topology_lengths(self):
        """Return two strings describing type and length of each segment.

        The first has the form ``L1-H2-`` (note the trailing dash), the
        second ``L[1-1]H[2-2]``.
        """
        ss_ids = re.findall(r"[HEL]\d+", self.topology())
        topol2 = ''
        topol3 = ''
        for ss in ss_ids:
            n = len(self.segment_dict[ss].bp_data)
            topol2 += '%s%s-' % (ss[0], n)
            topol3 += '%s[%s-%s]' % (ss[0], n, n)
        return topol2, topol3

    def ss_tag(self):
        """Return ``"<helices>H<strands>E"`` counted over the segments."""
        helices = sum(1 for s in self.segments if s.sstype == 'H')
        strands = sum(1 for s in self.segments if s.sstype == 'E')
        return "%dH%dE" % (helices, strands)

    def freeze_all(self):
        """Set the action of every row to '.'."""
        for res in self.bp_data:
            res[3] = '.'

    def remodel_all(self):
        """Set the action of every row to 'R'."""
        for res in self.bp_data:
            res[3] = 'R'

    def remodel_segment(self,
                        index=None,
                        id=None,
                        index_to_zero=False,
                        loop_edge=True):
        """Mark one segment, chosen by list index or by id, for remodelling.

        With ``index_to_zero`` the row index is also set to 0.  With
        ``loop_edge`` every inner loop segment whose first/last row is 'R'
        also marks the adjacent row of the neighbouring segment as 'R'.
        """
        # `is not None` so that index 0 (the first segment) is selectable.
        if index is not None:
            rows = list(self.segments[index].bp_data)
        elif id:
            rows = list(self.segment_dict[id].bp_data)
        else:
            rows = []

        for res in rows:
            if index_to_zero:
                res[0] = 0
            res[3] = 'R'

        if loop_edge:
            for i in range(1, len(self.segments) - 1):
                seg = self.segments[i]
                if seg.sstype != 'L':
                    continue
                if seg.bp_data[0][3] == 'R':
                    self.segments[i - 1].bp_data[-1][3] = 'R'
                if seg.bp_data[-1][3] == 'R':
                    self.segments[i + 1].bp_data[0][3] = 'R'

    def residue_segment(self, pos):
        """Return the id of the segment holding row index ``pos``, or ''."""
        for seg_id, seg in self.segment_dict.items():
            for res in seg.bp_data:
                if res[0] == pos:
                    return seg_id
        return ''

    def segment_lengths(self):
        """Return ``<type><length>`` of every segment joined by '-'."""
        return '-'.join(
            [s.sstype + str(len(s.bp_data)) for s in self.segments])

    def reindex_blueprint(self, start=1, rebuild_index_to_zero=False):
        """Renumber the rows consecutively from ``start``.

        With ``rebuild_index_to_zero`` rows marked 'R' get index 0 and do
        not consume a number.
        """
        indexer = start
        for row in self.bp_data:
            if rebuild_index_to_zero and row[3] == 'R':
                row[0] = 0
            else:
                row[0] = indexer
                indexer += 1

    def segment_list(self):
        """Return the segment ids as a list, parsed from the topology."""
        return re.findall(r'([HEL]\d+)-?', self.topology())

    def dump_blueprint(self, filename, header_lines=()):
        '''Write the blueprint to ``filename``.

        header lines are for setting foldinfo, hsstriplet or any other
        register on the top of the blueprint.  Each one is stripped before
        writing so that a line that already ends in a newline is not
        followed by an empty line.'''
        with open(filename, 'w') as out:
            for line in header_lines:
                out.write(line.strip() + '\n')
            for r in self.bp_data:
                out.write("%d %s %s %s\n" % tuple(r))

    def dump_pdb(self, filename):
        """Save the structure to ``filename`` in PDB format."""
        io = PDBIO()
        io.set_structure(self.structure)
        io.save(filename)

    def swapp_segments(self, index1, index2):
        '''This function swaps the segments, reindexes the blueprint and PDB
        file and sets for remodelling the segments directly connected to the
        swapped segments. The rest of the structure is set frozen.'''
        # Freeze everything, then mark the segments adjacent to the swapped
        # ones for remodelling; their rows get index 0.
        self.freeze_all()
        for neighbour in (index1 - 1, index1 + 1, index2 - 1, index2 + 1):
            self.remodel_segment(neighbour, index_to_zero=True)

        # swap the segments
        self.segments[index1], self.segments[index2] = (
            self.segments[index2], self.segments[index1])

        # Renumber the blueprint rows and the residues.  Rows with index 0
        # keep it and their residues are collected for removal.
        indexer = 1
        residues_to_detach = set()
        for segment in self.segments:
            for i in range(len(segment.bp_data)):
                if segment.bp_data[i][0] == 0:
                    residues_to_detach.add(segment.residues[i])
                    continue
                segment.bp_data[i][0] = indexer
                res_id = segment.residues[i].id
                segment.residues[i].id = (res_id[0], indexer, res_id[2])
                indexer += 1

        # Detach the residues directly connected to the swap; this is done
        # to avoid clashes during the remodelling.
        for res in residues_to_detach:
            res.get_parent().detach_child(res.id)

        # sort the residues in the structure according to the new indexing
        for chain in self.structure.get_chains():
            chain.child_list = sorted(chain.child_list, key=lambda r: r.id[1])

        # Now that the elements have been reindexed, self.bp_data and
        # self.residues must be updated.
        self.bp_data = [row for s in self.segments for row in s.bp_data]
        self.residues = [res for s in self.segments for res in s.residues]
# coding: utf-8
# Copy per-residue pLDDT values into the B-factor column of the ranked,
# relaxed and unrelaxed model files of the first five entries of the
# ranking.  The untouched files are copied to ./backup before being
# overwritten.
import json
import os
import pickle
import shutil

from Bio.PDB import PDBIO, PDBParser

with open('ranking_debug.json') as f:
    ranking = json.load(f)
order = ranking['order']

# Fails if ./backup already exists.
os.makedirs('backup')

for i in range(5):
    model_name = order[i]

    # The result pickles are read from the working directory as-is.
    with open(f'result_{model_name}.pkl', 'rb') as f:
        result = pickle.load(f)
    plddt = result['plddt']

    model_files = [
        f'ranked_{i}.pdb',
        f'relaxed_{model_name}.pdb',
        f'unrelaxed_{model_name}.pdb',
    ]
    for model_fn in model_files:
        structure = PDBParser().get_structure('model', model_fn)
        residue_count = len(list(structure.get_residues()))
        assert residue_count == len(plddt)

        # Every atom of a residue receives that residue's pLDDT.
        for res_i, res in enumerate(structure.get_residues()):
            score = plddt[res_i]
            for atom in res.get_atoms():
                atom.set_bfactor(score)

        # now save model
        io = PDBIO()
        io.set_structure(structure)
        backup_fn = f'backup/{model_fn}'
        shutil.copy(model_fn, backup_fn)
        io.save(model_fn)
def create_fragment(self, fragment_file_name):
    """Store one interacting-fragment PDB file in the database.

    The file name (without ``.pdb``) is split on ``_``.  Tokens 0 and 1 form
    the generic number, token 2 is the residue name, token 3 the protein
    entry name, token 4 the PDB code, and tokens 5 (and 6, if present) form
    the interaction feature.

    Steps: look up the structure, get/create the interaction type, create a
    rotamer for the amino-acid residue and a fragment from the remaining
    residues, get/create the structure-ligand interaction and finally the
    residue-fragment interaction.  Failures are logged on ``self.logger``;
    nothing is raised for database errors and nothing is returned.

    File names without a feature token used to crash with an unbound-local
    error inside an ``except`` handler; they are now skipped with a warning.
    """
    fragment_name = fragment_file_name.strip().replace('.pdb', '')
    tokens = fragment_name.split('_')

    if len(tokens) == 7:
        feature = '_'.join([tokens[5], tokens[6]])
    elif len(tokens) == 6:
        feature = tokens[5]
    else:
        # Without a feature no interaction type can be resolved, so there is
        # nothing useful to store for this file.
        self.logger.warning(
            'Cannot determine the interaction type from the file name {}. '
            'Skipping the fragment'.format(fragment_name))
        return
    pdb_code = tokens[4]

    # Checking if the crystal is in the database
    try:
        s = Structure.objects.get(pdb_code__index=pdb_code)
    except Structure.DoesNotExist:
        self.logger.warning(
            'Cannot find the structure {} in the database. Skipping the fragment {}'
            .format(pdb_code, fragment_name))
        return

    # ResidueFragmentInteractionType
    i = None
    try:
        i, created = ResidueFragmentInteractionType.objects.get_or_create(
            slug=feature, name=self.interactions[feature])
    except Exception:
        self.logger.info(
            "Failed to find or create feature {}...".format(feature))

    # Rotamer and Fragment
    rot = None
    f = None
    try:
        fragment_struct = PDBParser(PERMISSIVE=True).get_structure(
            'frag', os.sep.join([self.fragments_dir, fragment_file_name]))[0]
        fragment_pdb_data = ''
        r = None
        for residue in fragment_struct.get_residues():
            hetfield, resseq, icode = residue.get_id()
            if hetfield == ' ':  # Amino acid
                try:
                    r = Residue.objects.get(
                        sequence_number=int(resseq),
                        amino_acid=polypeptide.three_to_one(residue.resname),
                        protein_conformation=s.protein_conformation)
                    d, created = PdbData.objects.get_or_create(
                        pdb=extract_pdb_data(residue))
                    rot, created = Rotamer.objects.get_or_create(
                        residue=r, structure=s, pdbdata=d)
                except Exception as msg:
                    self.logger.error(
                        'Failed to add rotamer {}:{}{}\n'.format(
                            pdb_code, resseq, msg))
                    return
            else:
                fragment_pdb_data += extract_pdb_data(residue)
        try:
            fd, created = PdbData.objects.get_or_create(
                pdb=fragment_pdb_data)
            # Taking the first ligand from the list, since existing
            # fragments do not contain the ligand info
            f, created = Fragment.objects.get_or_create(
                residue=r, ligand=s.ligands.all()[0], structure=s,
                pdbdata=fd)
        except Exception as msg:
            self.logger.error('Failed to add fragment {}\n{}'.format(
                fragment_file_name, msg))
    except Exception as msg:
        self.logger.error('Failed to add fragment {} to the db\n{}'.format(
            fragment_file_name, msg))

    # StructureLigandInteraction
    sli = None
    try:
        lr, created = LigandRole.objects.get_or_create(name='unknown',
                                                       slug='unknown')
        sli, created = StructureLigandInteraction.objects.get_or_create(
            structure=s, ligand=s.ligands.all()[0], ligand_role=lr)
    except Exception as msg:
        self.logger.error("Failed to add fragment {} to the db\n{}".format(
            fragment_file_name, msg))

    # Any earlier step that failed leaves its result as None.  Report which
    # ones are missing instead of relying on an unbound-local error being
    # swallowed by the handler below.
    missing = [
        label for label, value in (
            ('interaction type', i),
            ('rotamer', rot),
            ('fragment', f),
            ('structure-ligand interaction', sli),
        ) if value is None
    ]
    if missing:
        self.logger.error("Failed to add fragment {} to the db\n{}".format(
            fragment_file_name, 'missing ' + ', '.join(missing)))
        return

    try:
        rfi, created = ResidueFragmentInteraction.objects.get_or_create(
            structure_ligand_pair=sli, rotamer=rot, fragment=f,
            interaction_type=i)
        self.logger.info(
            "Successfully added interacting fragment {}".format(
                fragment_file_name))
    except Exception as msg:
        self.logger.error("Failed to add fragment {} to the db\n{}".format(
            fragment_file_name, msg))
def assemble_multiscale_visualization(topology_fn, rmf_fn, pdb_dir,
                                      outprefix=None, chimerax=True,
                                      xl_fn=None):
    """
    Render multiscale versions of rigid bodies from PDB files + flexible
    beads from RMF files w/o mapped crosslinks.

    Args:
        topology_fn (str): Topology file in pipe-separated-value (PSV)
        format as required in integrative modeling using IMP. For details
        on how to write a topology file, see:
        https://integrativemodeling.org/2.13.0/doc/ref/classIMP_1_1pmi_1_1topology_1_1TopologyReader.html

        rmf_fn (str): Name of the RMF file.

        pdb_dir (str): Directory containing all the PDB files for the rigid
        bodies used in modeling.

        outprefix (str, optional): Prefix for output files. Defaults to
        None, in which case "centroid_model" is used.

        chimerax (bool, optional): If true, a Chimerax script will be
        written (extension ".cxc"). Defaults to True. If false, the
        function returns after writing the composite PDB file.

        xl_fn (str, optional): A file containing a XL dataset. Defaults to
        None. If this dataset is supplied, then it will be mapped on to the
        overall structure with satisfied XLs drawn in blue and violated XLs
        drawn in red. A XL dataset should be supplied in a
        comma-separated-value (CSV) format containing at least the
        following fields
        protein1, residue1, protein2, residue2, sat
        where the last field <sat> is a boolean 1 or 0 depending on whether
        the particular XL is satisfied (in the ensemble sense) as a result
        of the integrative modeling exercise.

    Returns:
        None. Output is written to ``<outprefix>.pdb``, ``<outprefix>.cxc``
        and, when crosslinks are given, ``<outprefix>_XLs.pb``.
    """
    # -------------------------------------------
    # read the RMF file and extract all particles
    # -------------------------------------------
    rmf_fh = RMF.open_rmf_file_read_only(rmf_fn)
    rmf_model = IMP.Model()
    hier = IMP.rmf.create_hierarchies(rmf_fh, rmf_model)[0]
    IMP.rmf.load_frame(rmf_fh, 0)

    # (molecule name, particle name) -> coordinates
    rmf_ps = {}
    for p in IMP.core.get_leaves(hier):
        molname = p.get_parent().get_parent().get_parent().get_name().strip()
        name = p.get_name().strip()
        rmf_ps[(molname, name)] = IMP.core.XYZ(p).get_coordinates()

    # --------------------------------------------------------------
    # map pdb residues to rmf particles for each rigid body pdb file
    # --------------------------------------------------------------
    t = TopologyReader(topology_fn, pdb_dir=pdb_dir)
    components = t.get_components()

    map_pdb2rmf = {}
    rigid_body_residues = {}
    chain_ids = {}  # these are matched to the chimerax rmf plugin
    chain_id_count = 0
    for c in components:
        # ignore unstructured residues
        if c.pdb_file == "BEADS":
            continue
        mol = c.molname
        pdb_prefix = os.path.basename(c.pdb_file).split(".pdb")[0]
        chain_id = c.chain
        offset = c.pdb_offset
        first_res = c.residue_range[0] + offset
        stop_res = c.residue_range[1] + 1 + offset

        if mol not in chain_ids:
            chain_ids[mol] = string.ascii_uppercase[chain_id_count]
            chain_id_count += 1

        if pdb_prefix not in map_pdb2rmf:
            # First component that uses this file: parse it once and index
            # its residues by (chain id, residue number).
            map_pdb2rmf[pdb_prefix] = {}
            this_model = PDBParser().get_structure("x", c.pdb_file)[0]
            rigid_body_residues[pdb_prefix] = {
                (r.full_id[2], r.id[1]): r for r in this_model.get_residues()
            }

        for resnum in range(first_res, stop_res):
            key = (chain_id, resnum)
            if key in rigid_body_residues[pdb_prefix]:
                map_pdb2rmf[pdb_prefix][key] = (mol, resnum)

    # --------------------------------
    # align all pdb files with the rmf
    # --------------------------------
    print("\nAligning all rigid body structures...")
    align = SVDSuperimposer()
    for pdb_prefix, mapper in map_pdb2rmf.items():
        residues = rigid_body_residues[pdb_prefix]
        pdb_coords = []
        pdb_atoms = []
        rmf_coords = []
        for (chain, pdb_res), (mol, rmf_res) in mapper.items():
            r = residues[(chain, pdb_res)]
            pdb_coords.append(r["CA"].coord)
            pdb_atoms.extend(r.get_atoms())
            rmf_coords.append(rmf_ps[(mol, str(rmf_res))])

        # Fit the CA atoms onto the matching RMF particles, then move all
        # atoms of the mapped residues with the resulting transformation.
        align.set(np.array(rmf_coords), np.array(pdb_coords))
        align.run()
        rotmat, vec = align.get_rotran()
        for a in pdb_atoms:
            a.transform(rotmat, vec)

    # --------------------------
    # assemble the composite pdb
    # --------------------------
    mols = set(c.molname for c in components)
    print("\nChain IDs by molecule:")
    for k, v in chain_ids.items():
        print("molecule %s, chain ID %s" % (k, v))

    reslists = {mol: [] for mol in mols}
    for pdb_prefix, mapper in map_pdb2rmf.items():
        residues = rigid_body_residues[pdb_prefix]
        for (chain, pdb_res), (mol, rmf_res) in mapper.items():
            r = residues[(chain, pdb_res)]
            # The copy is numbered with the RMF residue number.
            new_residue = Residue.Residue(id=(r.id[0], rmf_res, r.id[2]),
                                          resname=r.resname,
                                          segid=r.segid)
            for a in r.get_atoms():
                new_residue.add(a)
            reslists[mol].append(new_residue)

    composite_model = Model.Model(0)
    for mol, chain_id in chain_ids.items():
        this_chain = Chain.Chain(chain_id)
        for r in sorted(reslists[mol], key=lambda res: res.id[1]):
            this_chain.add(r)
        composite_model.add(this_chain)

    # save the composite pdb to file
    io = PDBIO()
    io.set_structure(composite_model)
    if outprefix is None:
        outprefix = "centroid_model"
    io.save(outprefix + ".pdb")

    # -------------------------------------------------------------------
    # chimerax rendering (hide most of the rmf except unstructured beads)
    # -------------------------------------------------------------------
    if not chimerax:
        # Return instead of exit(): terminating the interpreter from inside
        # a function would also kill any caller that has more work to do.
        return

    print("\nWriting UCSF Chimerax script...")
    s = ""
    s += "open %s\n" % (outprefix + ".pdb")
    s += "open %s\n" % rmf_fn
    s += "hide\n"
    s += "show cartoon\n"
    s += "color #%d %s\n" % (CHIMERAX_PDB_MODEL_NUM, STRUCT_COLOR)
    s += "color #%d %s\n" % (CHIMERAX_RMF_MODEL_NUM, UNSTRUCT_COLOR)
    s += "hide #%d\n" % CHIMERAX_RMF_MODEL_NUM

    # A set, because it is probed once per RMF particle below.
    struct_residues = set()
    for mapper in map_pdb2rmf.values():
        struct_residues.update(mapper.values())

    unstruct_atomspec = {}
    for molname, particle_name in rmf_ps:
        rmf_chain_id = chain_ids[molname]
        if "bead" in particle_name:
            # Bead names start with "<first>-<last>_".
            bead_first, bead_last = particle_name.split("_")[0].split("-")
            bead_first = int(bead_first)
            bead_last = int(bead_last)
            this_atomspec = "#%d/%s:%d-%d" % \
                (CHIMERAX_RMF_MODEL_NUM, rmf_chain_id, bead_first, bead_last)
            for resnum in range(bead_first, bead_last + 1):
                unstruct_atomspec[(molname, resnum)] = this_atomspec
        else:
            resnum = int(particle_name)
            if (molname, resnum) not in struct_residues:
                this_atomspec = "#%d/%s:%d" % \
                    (CHIMERAX_RMF_MODEL_NUM, rmf_chain_id, resnum)
                unstruct_atomspec[(molname, resnum)] = this_atomspec

    # Sorted so that the script text is the same on every run.
    s += "show %s\n" % (" ".join(sorted(set(unstruct_atomspec.values()))))

    # ----------------------------------------------------------
    # if crosslink data is supplied, write out a pseudobond file
    # ----------------------------------------------------------
    if xl_fn is not None:
        # parse XL data
        df = pd.read_csv(os.path.abspath(xl_fn))
        xls = list(zip(df["protein1"], df["residue1"],
                       df["protein2"], df["residue2"], df["sat"]))

        # get lists of struct atomspecs
        atomspec = {}
        for (mol, particle_name) in rmf_ps:
            if "bead" in particle_name:
                continue
            resid = int(particle_name)
            if (mol, resid) in unstruct_atomspec:
                continue
            atomspec[(mol, resid)] = "#%d/%s:%d@CA" % \
                (CHIMERAX_PDB_MODEL_NUM, chain_ids[mol], resid)

        # now add in all the unstruct atomspecs
        atomspec.update(unstruct_atomspec)

        # write pseudobond script
        s_pb = ""
        s_pb += "; radius = %2.2f\n" % XL_RADIUS
        s_pb += "; dashes = 0\n"
        for p1, r1, p2, r2, sat in xls:
            atomspec_1 = atomspec[(p1, r1)]
            atomspec_2 = atomspec[(p2, r2)]
            # Both ends resolve to the same spec: nothing to draw.
            if atomspec_1 == atomspec_2:
                continue
            color = SAT_XL_COLOR if sat else VIOL_XL_COLOR
            s_pb += "%s %s %s\n" % (atomspec_1, atomspec_2, color)
        s_pb += "\n"

        pb_fn = outprefix + "_XLs.pb"
        with open(pb_fn, "w") as pb_file:
            pb_file.write(s_pb)
        s += "open %s\n" % pb_fn

    s += "preset 'overall look' publication\n"
    chimerax_out_fn = outprefix + ".cxc"
    with open(chimerax_out_fn, "w") as cxc_file:
        cxc_file.write(s)
class AANetwork:
    """Amino-acid contact network built from a PDB file with biographs.

    After :meth:`create`, every node is relabelled as
    ``<residue code><old_label[1:]>:<old_label[0]>``; the other methods rely
    on the last character of a node label identifying the chain.
    """

    def __init__(self, cutoff=5):
        """Store the contact cutoff and the residue-name lookup tables.

        Args:
            cutoff: value forwarded to ``Pmolecule.network(cutoff=...)``.
        """
        self.three2one = three2one
        self.one2three = one2three
        self.cutoff = cutoff
        # Filled in by create() / create_average(); None means "not built yet".
        self.net = None
        self.structure = None

    def _require_net(self, message):
        """Raise ``AssertionError(message)`` unless a non-empty network exists.

        Kept as AssertionError so callers see the same exception type as the
        former ``assert`` statements, but this check also runs under ``-O``.
        """
        if not getattr(self, 'net', None):
            raise AssertionError(message)

    def _signed_matrices(self, positive_color):
        """Tag edges with a +1/-1 ``sign`` and return (signs, weights) arrays.

        Edges whose ``color`` attribute equals ``positive_color`` get +1,
        every other coloured edge gets -1. The ``sign`` attribute is written
        back onto the graph.
        """
        colors = nx.get_edge_attributes(self.net, 'color')
        sign = {edge: 2 * (colors[edge] == positive_color) - 1
                for edge in colors}
        nx.set_edge_attributes(self.net, values=sign, name='sign')
        signs = nx.to_numpy_array(self.net, weight='sign')
        weights = nx.to_numpy_array(self.net)
        return signs, weights

    def create(self, pdb):
        """Creates the amino acid network using biographs.

        Args:
            pdb: path of the PDB file to load.

        Returns:
            The relabelled network, also stored in ``self.net``.
        """
        mol = bg.Pmolecule(pdb)
        self.net = mol.network(cutoff=self.cutoff, weight=True)
        self.structure = PDBParser().get_structure('X', pdb)[0]

        # One-letter code when known, otherwise the raw residue name.
        residues = [
            self.three2one[residue.resname]
            if residue.resname in self.three2one else residue.resname
            for residue in self.structure.get_residues()
        ]
        old_labels = list(self.net.nodes)
        # NOTE(review): the zip assumes the network's node order matches the
        # residue order of the structure and silently truncates when the two
        # lengths differ -- confirm against biographs' node ordering.
        labels = [code + old[1:] + ':' + old[0]
                  for code, old in zip(residues, old_labels)]
        self.net = nx.relabel_nodes(self.net, dict(zip(old_labels, labels)))
        return self.net

    def save(self, output):
        """Saves the network as a pickle at ``output``.

        Raises:
            AssertionError: if no (non-empty) network is loaded.
        """
        self._require_net('No network loaded')
        writer = getattr(nx, 'write_gpickle', None)
        if writer is not None:
            writer(self.net, output)
        else:
            # networkx >= 3.0 removed write_gpickle; plain pickle replaces it.
            import pickle
            with open(output, 'wb') as handle:
                pickle.dump(self.net, handle, pickle.HIGHEST_PROTOCOL)

    def create_average(self, folder):
        """Creates the average network over every file in ``folder``.

        Edge weights are summed file by file (for memory reasons) and then
        divided by the number of files, so an edge missing from a file
        contributes zero for that file.

        Args:
            folder: directory whose entries are each passed to :meth:`create`.

        Returns:
            The composed network with averaged ``weight`` attributes, also
            stored in ``self.net``.

        Raises:
            ValueError: if ``folder`` contains no entries.
        """
        filenames = listdir(folder)
        if not filenames:
            raise ValueError('No structure files found in %r' % (folder,))

        net = None
        totals = {}
        for filename in tqdm(filenames):
            frame = self.create(join(folder, filename))
            for u, v, data in frame.edges(data=True):
                totals[(u, v)] = totals.get((u, v), 0) + data['weight']
            net = frame if net is None else nx.compose(net, frame)

        frame_count = len(filenames)
        averaged = {edge: total / frame_count
                    for edge, total in totals.items()}
        nx.set_edge_attributes(net, name='weight', values=averaged)
        self.net = net
        return self.net

    def get_interface(self, output=False):
        """Reduce the network in place to its inter-chain edges.

        Edges whose two node labels end with the same character (same chain)
        are removed, then isolated nodes are dropped.

        Args:
            output: if truthy, path of an Excel file receiving per-node
                degree, summed weight and their ratio.

        Raises:
            AssertionError: if no (non-empty) network is loaded.
        """
        self._require_net('No network defined')
        same_chain = [(u, v) for u, v in self.net.edges() if u[-1] == v[-1]]
        self.net.remove_edges_from(same_chain)
        self.net.remove_nodes_from(list(nx.isolates(self.net)))
        if output:
            # keepdims keeps (n, 1) columns so they concatenate into (n, 3).
            weights = nx.to_numpy_array(self.net).sum(axis=1, keepdims=True)
            degrees = nx.to_numpy_array(self.net, weight=None).sum(
                axis=1, keepdims=True)
            nw = np.divide(weights, degrees)
            df = pd.DataFrame(
                data=np.concatenate([degrees, weights, nw], axis=-1),
                index=list(self.net.nodes()),
                columns=['Degree', 'Weight', 'NW'])
            df.to_excel(output)

    def threshold_loop(self, output, pdb, increment=1):
        """Prune edges at rising weight thresholds, saving each stage.

        Starting at threshold 0, edges with weight below the threshold and
        the resulting isolated nodes are removed from ``self.net``. While
        edges remain, the network is pickled to ``<root>_<threshold><ext>``
        and drawn to ``<root>_<threshold>.png``, where ``root``/``ext`` come
        from splitting ``output``. The loop ends once no edge is left.

        Args:
            output: template path for the saved networks (e.g. ``net.p``).
            pdb: PDB file used by :meth:`get_pos` to lay out the drawing.
            increment: amount added to the threshold after each stage.

        Raises:
            ValueError: if ``increment`` is not positive (the loop could
                otherwise never terminate).
        """
        if increment <= 0:
            raise ValueError(
                'increment must be positive, got %r' % (increment,))
        from os.path import splitext
        root, ext = splitext(output)

        threshold = 0
        while True:
            weak_edges = [(u, v)
                          for u, v, data in self.net.edges(data=True)
                          if data['weight'] < threshold]
            self.net.remove_edges_from(weak_edges)
            self.net.remove_nodes_from(list(nx.isolates(self.net)))
            if len(self.net.edges()) == 0:
                break
            stem = root + '_' + str(threshold)
            self.save(stem + ext)
            pos = self.get_pos(pdb)
            nx.draw(self.net, pos=pos, font_weight='bold',
                    labels={node: node for node in self.net.nodes()},
                    node_size=100, node_shape='o', font_size=15,
                    node_color='lightgrey')
            plt.savefig(stem + '.png')
            plt.close()
            threshold += increment

    def get_pos(self, pdb):
        """Return 2D drawing positions keyed by node label.

        Only CA atoms of residues present in ``three2one`` are used. The x
        value is the atom's second coordinate and the y value its third
        coordinate, halved for chain ``'A'`` to separate the chains.
        """
        structure = PDBParser().get_structure('X', pdb)[0]
        pos = {}
        for atom in structure.get_atoms():
            if atom.id != 'CA':
                continue
            residue = atom.parent
            if residue.resname not in three2one:
                continue
            chain_id = residue.parent.id
            # The scaling factors are arbitrary layout values.
            y = atom.coord[2] * (0.5 if chain_id == 'A' else 1.0)
            x = atom.coord[1]
            label = (three2one[residue.resname] + str(residue.id[1])
                     + ':' + chain_id)
            pos[label] = (x, y)
        return pos

    def shortest_pathways(self, scale_method, cut_method, cut_number):
        """Rescale edge weights in place (``'log'``: -log(w), ``'inv'``: 1/w).

        Any other ``scale_method`` leaves the weights untouched.

        TODO: not finished -- ``cut_method`` and ``cut_number`` are unused.

        Raises:
            AssertionError: if no (non-empty) network is loaded.
        """
        self._require_net('network non existing')
        weights = nx.get_edge_attributes(self.net, 'weight')
        if scale_method == 'log':
            new_weights = {edge: -log(w) for edge, w in weights.items()}
        elif scale_method == 'inv':
            new_weights = {edge: 1 / w for edge, w in weights.items()}
        else:
            new_weights = {}
        # Keywords avoid the positional-order change between networkx 1.x
        # (name, values) and 2.x+ (values, name).
        nx.set_edge_attributes(self.net, values=new_weights, name='weight')

    def node_weigths_line(self, method, output):
        """Plot a per-node score as a colour strip and save it to ``output``.

        Args:
            method: one of ``'sum'``, ``'n1'``, ``'n2'``, ``'degs'``,
                ``'deg1'``, ``'deg2'``; selects how each node's row of the
                sign / weight matrices is reduced to one score.
            output: path handed to ``plt.savefig``.

        Raises:
            AssertionError: if no (non-empty) network is loaded.
            ValueError: if ``method`` is not one of the known names.
        """
        self._require_net('network non existing')
        reducers = {
            'sum': lambda s, w: np.sum(s * w, axis=1, keepdims=True),
            'n1': lambda s, w: np.sum(w, axis=1, keepdims=True),
            'n2': lambda s, w: np.sqrt(
                np.sum(np.square(w), axis=1, keepdims=True)),
            'degs': lambda s, w: np.sum(s, axis=1, keepdims=True),
            'deg1': lambda s, w: np.sum(np.abs(s), axis=1, keepdims=True),
            'deg2': lambda s, w: np.sqrt(
                np.sum(np.square(s), axis=1, keepdims=True)),
        }
        if method not in reducers:
            raise ValueError('unknown method %r; expected one of %s'
                             % (method, ', '.join(sorted(reducers))))

        # NOTE(review): 'g' is the positive colour here while sum() uses 'r'
        # -- confirm which convention is intended.
        signs, weights = self._signed_matrices('g')
        mat = reducers[method](signs, weights)  # shape (n_nodes, 1)

        plt.figure()
        nodes = list(self.net.nodes)
        # Up to ten nodes with the largest absolute score, largest first.
        top = np.argsort(np.abs(mat[:, 0]))[::-1][:10]
        for rank, node_index in enumerate(top):
            plt.text(node_index, -15 * rank - 10, nodes[node_index])
        # Repeat the column 50 times so the strip is thick enough to see.
        strip = np.tile(mat, (1, 50))
        plt.imshow(strip.transpose(), aspect='equal', cmap='jet')
        plt.tick_params(
            axis='both',        # changes apply to both axes
            which='both',       # both major and minor ticks are affected
            bottom=False,       # ticks along the bottom edge are off
            top=False,          # ticks along the top edge are off
            labelbottom=False)  # labels along the bottom edge are off
        plt.colorbar()
        plt.savefig(output)
        plt.close()

    def sum(self):
        """Return the total of sign * weight over the adjacency matrix.

        Edges coloured ``'r'`` count as +1, other coloured edges as -1.

        Raises:
            AssertionError: if no (non-empty) network is loaded.
        """
        self._require_net('network non existing')
        signs, weights = self._signed_matrices('r')
        return np.sum(signs * weights)