def end_to_end_distance(structure):
    """Append the end-to-end C-alpha distance of a PDB file to ``end_to_end_df``.

    The distance is the Euclidean norm between the ``CA`` coordinates of the
    first and the last standard residue (hetero flag ``" "``) met while
    iterating over every chain of the parsed file.  The terminal residues are
    taken as objects rather than re-located by residue number, so a hetero
    residue or a residue of another chain that merely shares the number can
    no longer be picked up, and a single-residue structure yields 0.0 instead
    of failing.

    Args:
        structure: Path of the PDB file.  Metadata is cut out of this string
            by position: characters 0-6 are the entry id, character 8 the
            ensemble, the fifth character from the end the chain id, and the
            conformer is the second ``-``-separated piece of the second
            ``_``-separated field.
            NOTE(review): the naming scheme is inferred from the slicing
            only - confirm against whatever produces these files.

    Side effects:
        Appends the row ``[entry_id, ensemble, conformer, chain_id,
        structure, dist]`` to the module-level ``end_to_end_df``.

    Raises:
        IndexError: if the name lacks the expected fields or the file holds
            no standard residue.
        KeyError: if a terminal residue has no ``CA`` atom.
    """
    entry_id = structure[0:7]
    chain_id = structure[-5]
    ensemble = structure[8]
    conformer = structure.split("_")[1].split("-")[1]

    pdb = PDBParser().get_structure(structure, structure)

    # Keep only standard residues; hetero residues and waters carry a
    # non-blank first id field.
    standard_residues = [
        residue
        for chain in pdb.get_chains()
        for residue in chain.get_residues()
        if residue.id[0] == " "
    ]
    first_residue = standard_residues[0]
    last_residue = standard_residues[-1]

    dist = numpy.linalg.norm(
        first_residue['CA'].get_coord() - last_residue['CA'].get_coord())

    end_to_end_df.loc[len(end_to_end_df)] = [
        entry_id, ensemble, conformer, chain_id, structure, dist
    ]
def chain_splitter(structure, chains):
    """Write every chain of a PDB file to its own file and delete the input.

    A chain with a non-blank id is saved as ``<structure minus its last four
    characters>_<chain id>.pdb`` and its id is appended to ``chains``.  A
    chain whose id is a single blank is handed to ``chain_correction`` and
    recorded as ``"A"``.

    Args:
        structure: Path of the PDB file to split.
        chains: List that receives the chain ids; it is modified in place.

    Returns:
        The same ``chains`` list.
    """
    writer = PDBIO()
    parsed = PDBParser().get_structure(structure, structure)
    stem = structure[:-4]

    for chain in parsed.get_chains():
        if chain.get_id() == " ":
            chain_correction(structure)
            chains.append("A")
            continue
        writer.set_structure(chain)
        writer.save(stem + "_" + chain.get_id() + ".pdb")
        chains.append(chain.get_id())

    # NOTE(review): the input file is removed once, after all chains were
    # handled - confirm this matches the intended placement.
    os.remove(structure)
    return chains
def PDBParser(input, input_format=None):
    """Parse PDB-format text and return the contacts of its first chain.

    Args:
        input: Contents of a PDB file as a string.
        input_format: Not used by this function.
            NOTE(review): presumably kept so all parsers share one
            signature - confirm against the caller.

    Returns:
        list: ``"PDB"`` followed by the contacts returned by
        ``get_chain_contacts``, sorted in descending order of their third
        element.

    Raises:
        InvalidFormat: if parsing fails for any reason or no contacts are
            found.
    """
    try:
        structure = BioPDBParser().get_structure('pdb', io.StringIO(input))
        chain = list(structure.get_chains())[0]
        remove_atoms(chain)
        contacts = get_chain_contacts(chain)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a format error.
        raise InvalidFormat('Unable to parse contacts')

    if not contacts:
        raise InvalidFormat('Unable to parse contacts')

    return ["PDB"] + sorted(contacts, key=itemgetter(2), reverse=True)
def download_and_get_chains():
    """Download PDB entries and save each requested chain to its own file.

    The entries come from ``read_rostdb_entries()``, which is iterated as a
    mapping of PDB id to the chain ids wanted from that entry.  Every entry
    is fetched into the current directory and parsed once; each requested
    chain that exists is written to ``<structure id>_<chain id>.pdb``.

    Failures never abort the run.  Every ``(pdb id, chain id)`` pair whose
    download, parsing or saving raised is collected and printed at the end.
    A requested chain that is simply absent from the entry is skipped
    silently.
    """
    from Bio.PDB import PDBParser, PDBIO, PDBList

    failed = []
    pdbs_dict = read_rostdb_entries()
    io = PDBIO()
    pdbl = PDBList()

    for pdb_e, chains in pdbs_dict.items():
        if not chains:
            # Nothing requested from this entry: do not download it.
            continue
        try:
            # NOTE(review): the '.ent' file name below relies on the
            # downloader writing PDB-format files - confirm the default
            # file format of the installed Biopython version.
            pdbl.retrieve_pdb_file(pdb_e, pdir='./')
            pdb = PDBParser().get_structure(
                pdb_e, 'pdb' + pdb_e.lower() + '.ent')
        except Exception:
            # The entry itself is unusable, so every requested chain failed.
            failed.extend((pdb_e, chain_e) for chain_e in chains)
            continue

        for chain_e in chains:
            try:
                for chain in pdb.get_chains():
                    if chain.get_id() == chain_e:
                        io.set_structure(chain)
                        io.save(pdb.get_id() + '_' + chain.get_id() + '.pdb')
            except Exception:
                failed.append((pdb_e, chain_e))

    print("failures:", failed)
class IterationTests(unittest.TestCase):
    """Checks the chain, residue and atom iterators of a parsed structure."""

    def setUp(self):
        parser = PDBParser(PERMISSIVE=True)
        self.struc = parser.get_structure('X', "PDB/a_structure.pdb")

    def test_get_chains(self):
        """Yields chains from different models separately."""
        chain_ids = []
        for chain in self.struc.get_chains():
            chain_ids.append(chain.id)
        expected_ids = ['A', 'A', 'B', ' ']
        self.assertEqual(chain_ids, expected_ids)

    def test_get_residues(self):
        """Yields all residues from all models."""
        residue_ids = []
        for resi in self.struc.get_residues():
            residue_ids.append(resi.id)
        self.assertEqual(len(residue_ids), 167)

    def test_get_atoms(self):
        """Yields all atoms from the structure, excluding duplicates and ALTLOCs which are not parsed."""
        atom_labels = []
        for atom in self.struc.get_atoms():
            label = "%12s" % str((atom.id, atom.altloc))
            atom_labels.append(label)
        self.assertEqual(len(atom_labels), 756)
def download_and_get_chains():
    """Download PDB entries and save each requested chain to its own file.

    The entries come from ``read_rostdb_entries()``, which is iterated as a
    mapping of PDB id to the chain ids wanted from that entry.  Every entry
    is fetched into the current directory and parsed once; each requested
    chain that exists is written to ``<structure id>_<chain id>.pdb``.

    Failures never abort the run.  Every ``(pdb id, chain id)`` pair whose
    download, parsing or saving raised is collected and printed at the end.
    A requested chain that is simply absent from the entry is skipped
    silently.
    """
    from Bio.PDB import PDBParser, PDBIO, PDBList

    failed = []
    pdbs_dict = read_rostdb_entries()
    io = PDBIO()
    pdbl = PDBList()

    for pdb_e, chains in pdbs_dict.items():
        if not chains:
            # Nothing requested from this entry: do not download it.
            continue
        try:
            # NOTE(review): the '.ent' file name below relies on the
            # downloader writing PDB-format files - confirm the default
            # file format of the installed Biopython version.
            pdbl.retrieve_pdb_file(pdb_e, pdir='./')
            pdb = PDBParser().get_structure(
                pdb_e, 'pdb' + pdb_e.lower() + '.ent')
        except Exception:
            # The entry itself is unusable, so every requested chain failed.
            failed.extend((pdb_e, chain_e) for chain_e in chains)
            continue

        for chain_e in chains:
            try:
                for chain in pdb.get_chains():
                    if chain.get_id() == chain_e:
                        io.set_structure(chain)
                        io.save(pdb.get_id() + '_' + chain.get_id() + '.pdb')
            except Exception:
                failed.append((pdb_e, chain_e))

    print("failures:", failed)
def get_output(domain, virtualcb=False):
    """
    Load the PDB file of a domain and return one coordinate row per residue
    for the residue range stored in the module-level ``domains`` mapping,
    together with the matching slice of the data returned by
    ``secondary_torsions`` (per the original documentation: DSSP secondary
    structure and torsion angles).

    One atom represents each residue: C-beta for every residue except
    glycine, which uses C-alpha or, when ``virtualcb`` is set, the position
    returned by ``virtual_cbeta``.

    Input:
        domain   : Full domain name (eg. 16pkA01). Characters 0-3 select the
                   PDB file, character 4 is the chain id.
        virtualcb: (Boolean) Create virtual C beta atom on Glycine. By
                   default is False, which means that atomic coordinates
                   for Glycine are C-alpha

    Output:
        coords_list  : 2D object array, one row per residue
                       legend: [residue number, residue name,
                                one-letter code, X, Y, Z]
        sec_torsions : slice of the ``secondary_torsions`` result aligned
                       to coords_list
                       legend (per the original documentation):
                       [Secondary structure, Phi, Psi]

        (None, None) is returned when the PDB file cannot be parsed, a
        required atom is missing inside the domain, the domain boundaries
        are not found or not contiguous, ``secondary_torsions`` yields
        nothing, or the sequences cannot be aligned.

        NOTE(review): when the DSSP sequence is shorter than the PDB one
        the function returns ``(len(seq), len(coords_list))`` - two ints,
        not ``(None, None)``. Kept as is because callers may rely on it;
        confirm whether this is intentional.
    """
    domain_start, domain_end = domains[domain][0], domains[domain][1]
    domain_id = domain[:4]
    chain_id = domain[4]

    # Get PDB structure
    try:
        structure = PDBParser().get_structure(
            '', f'../../data/pdbfiles/{domain_id}.pdb')
    except (IndexError, ValueError):
        return None, None

    # The character '0' is ambiguous: sometimes it means "no chain"
    # (chain == ' '), sometimes it is a valid chain id. Treat it as a real
    # chain only if the structure actually contains one with that id.
    if chain_id == '0':
        chain_ids = [ch.get_id() for ch in structure.get_chains()]
        if '0' not in chain_ids:
            chain_id = ' '

    chain = structure[0][chain_id]

    coords_list = []
    for residue in chain.get_residues():
        residue_name = residue.get_resname()
        if residue_name not in protein_letters_3to1:
            # The first residue that is not a known amino acid ends the scan.
            break
        residue_oneletter = protein_letters_3to1[residue_name]
        residue_number = residue.child_list[0].get_full_id()[3][1]
        is_glycine = residue_oneletter == 'G'

        # Glycine -> C-alpha / virtual C-beta. Otherwise C-beta.
        # A missing atom is tolerated only before the domain start, because
        # those rows are sliced away below; any placeholder will do there.
        if is_glycine:
            try:
                atom = virtual_cbeta(residue) if virtualcb else residue['CA']
            except KeyError:
                if residue_number >= domain_start:
                    print('Missing C-alpha atom')
                    return None, None
                atom = [0, 0, 0] if virtualcb else residue.child_list[0]
        else:
            try:
                atom = residue['CB']
            except KeyError:
                if residue_number >= domain_start:
                    print('Missing C-beta atom')
                    return None, None
                atom = residue.child_list[0]

        # With virtualcb the glycine "atom" is unpacked directly as three
        # coordinates instead of being queried like an atom object.
        if is_glycine and virtualcb:
            x, y, z = atom
        else:
            x, y, z = atom.get_coord()

        coords_list.append(
            [residue_number, residue_name, residue_oneletter, x, y, z])

        if residue_number == domain_end:
            # Stop only after appending, so the end residue is included.
            break

    coords_list = np.array(coords_list, dtype='O')

    # The domain boundaries may be absent from the collected rows (this also
    # covers the case where no row was collected at all).
    try:
        start = np.where(coords_list[:, 0] == domain_start)[0][0]
        end = np.where(coords_list[:, 0] == domain_end)[0][0]
    except IndexError:
        print('domain_start or domain_end index not found in pdb file')
        return None, None

    # The number of rows between the boundaries must match the residue
    # numbering, otherwise residues are missing from the PDB file.
    if (end - start) != (domain_end - domain_start):
        print(
            f'Domain {domain} has missing data. PDB indices:{start, end}, CATH indices: {domain_start, domain_end}'
        )
        return None, None

    coords_list = coords_list[start:(end + 1)]

    # Secondary structure and torsion angles
    sec_torsions, seq = secondary_torsions(domain)
    if sec_torsions is None:
        return None, None

    if len(seq) < len(coords_list):
        print('DSSP output smaller than PDB')
        return len(seq), len(coords_list)

    pdb_sequence = ''.join(coords_list[:, 2])
    dssp_start, dssp_end = align(pdb_sequence, seq)
    if dssp_start is None:
        print('DSSP Sequence != PDB sequence')
        print(
            f'PDB Sequence:\n{pdb_sequence}\nDSSP sequence:\n{seq}'
        )
        return None, None

    return coords_list, sec_torsions[dssp_start:dssp_end]
# p = Pool(20)
# NOTE(review): the pool creation above is commented out, so `p` must
# already exist when this part of the script runs.

# Download in batches of 20 so the progress bar advances per batch; the
# final call handles the remainder that does not fill a whole batch.
parts = len(list_result_n) // 20
for batch in tqdm(range(parts)):
    offset = batch * 20
    p.map(download, list_result_n[offset:offset + 20])
p.map(download, list_result_n[parts * 20:])

# Group the wanted chain ids by structure; each record in `result` is read
# as "<structure>;<chain>".
result_chains = {}
for key in result:
    for record in result[key]:
        fields = record.split(";")
        struc = fields[0]
        chain = fields[1].strip()
        result_chains.setdefault(struc, []).append(chain)
print(result_chains)

# Write each wanted chain of every file in pdb_m/ to pdb_chain/. The
# structure name is the second "_"-separated piece of the file name.
for file_name in tqdm(os.listdir("pdb_m")):
    io = PDBIO()
    pdb = PDBParser().get_structure(file_name, "pdb_m/%s" % file_name)
    for chain in pdb.get_chains():
        if chain.get_id() not in result_chains[file_name.split("_")[1]]:
            continue
        io.set_structure(chain)
        io.save("pdb_chain/" + pdb.get_id() + "_" + chain.get_id() + ".pdb")
def parse_pdb_length(name):
    """Return the number of amino-acid residues in the first chain of a PDB file.

    The file is ``<name>.pdb`` inside the directory of the module-level
    ``organism``.  Only residues present in the file are counted, so
    residues missing from the structure are not included.
    """
    pdb_path = "../../../0-identify_structure/2-get_pdb_chain/{0}/{1}.pdb".format(organism, name)
    pdb = PDBParser().get_structure(name, pdb_path)

    chains = list(pdb.get_chains())
    first_chain = chains[0]  # only 1 chain present

    return sum(
        1 for residue in first_chain.get_residues() if PDB.is_aa(residue))
row for row in rows if abs(float(row["x"])) <= 5 and abs(float(row["y"])) <= 5 ] try: min_relaxed_energy = float(rows[0]["Relaxed energy (kcal/mol)"]) min_unrelaxed_energy = float( rows[0]["Unrelaxed energy (kcal/mol)"]) except IndexError: min_unrelaxed_energy = np.nan min_relaxed_energy = np.nan # Count the number of atoms and residues in the antigen. ag_file = os.path.join(standards.ExperimentsDirectory, experiment, "structures", "antigen_relaxed.pdb") structure = PDBParser().get_structure("antigen", ag_file) n_chains, n_atoms, n_residues = 0, 0, 0 for chain in structure.get_chains(): n_chains += 1 n_residues += len(list(chain.get_residues())) n_atoms += len(list(chain.get_atoms())) results[-1].update({ "Experiment": experiment, "Chains": n_chains, "Residues": n_residues, "Atoms": n_atoms, "Positions": positions, "Selected Designs": selected, "Min Unrelaxed Energy": min_unrelaxed_energy, "Min Relaxed Energy": min_relaxed_energy }) # Output results as a CSV file.
class Blueprint:
    """Blueprint of a structure: one row per residue, grouped into segments.

    Every row of ``bp_data`` is a four-item list
    ``[index, residue field, secondary-structure field, action]``.  The first
    character of the secondary-structure field (``H``, ``E`` or ``L``)
    decides which segment a row belongs to.  The action is ``'.'`` for a
    frozen residue and ``'R'`` for a residue to remodel.  An index of ``0``
    marks a residue that is rebuilt rather than taken from the structure.

    Attributes:
        structure: Parsed structure, or None when none was supplied.
        segments: Segments in blueprint order.
        bp_data: All rows of all segments, in order. The rows are shared
            with the segments, so editing a row edits it everywhere.
        segment_dict: Segment id (e.g. ``'H3'``, the third helix) -> segment.
        sspairs: Tuples parsed from an ``SSPAIR`` line. Only set when the
            blueprint file contains such a line.
    """

    def __init__(self,
                 blueprint_file=None,
                 pdbfile=None,
                 structure=None,
                 segments=None,
                 data=None):
        """Build a blueprint from segments, from a file or from parsed rows.

        Args:
            blueprint_file: Path of a blueprint file. Read only when
                ``data`` is not given.
            pdbfile: Path of a PDB file to parse; takes precedence over
                ``structure``.
            structure: Already parsed structure.
            segments: Ready-made segments to adopt as they are.
            data: Already parsed rows, ``[index, residue, ss, action]``.

        Raises:
            IndexError: if a line of the blueprint file is neither a known
                header nor a residue row, or if the structure has fewer
                residues than the blueprint has rows.
            KeyError: if a row's secondary-structure type is not H, E or L.
        """
        if pdbfile:
            self.structure = PDBParser().get_structure(pdbfile, pdbfile)
        else:
            self.structure = structure

        if segments:
            self.segments = segments
            self.bp_data = []
            self.segment_dict = {}
            for seg in segments:
                self.bp_data += seg.bp_data
                self.segment_dict[seg.id] = seg

        if blueprint_file and not data:
            data = self._read_blueprint_file(blueprint_file)

        if blueprint_file or data:
            # Group consecutive rows by secondary-structure type and create
            # one segment per group. When a structure is available its
            # residues are handed to the segments in order.
            self.segments = []
            self.bp_data = []
            self.segment_dict = {}
            res_index = 0
            segment_count = {'L': 1, 'H': 1, 'E': 1}
            residues = list(
                self.structure.get_residues()) if self.structure else None
            for sstype, rows in groupby(data, key=lambda row: row[2][0]):
                resdata = list(rows)
                self.bp_data += resdata
                seg_id = sstype + str(segment_count[sstype])
                segment_count[sstype] += 1
                if self.structure:
                    # Indexing (not slicing) so that a structure shorter
                    # than the blueprint still fails loudly.
                    segment_residues = [
                        residues[res_index + offset]
                        for offset in range(len(resdata))
                    ]
                    res_index += len(resdata)
                    seg = Segment(seg_id, sstype, resdata, segment_residues)
                else:
                    seg = Segment(seg_id, sstype, resdata)
                self.segments.append(seg)
                self.segment_dict[seg_id] = seg
            # TODO: use segment_dict to fill in the FOLDINFO and HSSTRIPLET
            # registers. Until then dump_blueprint takes them as header
            # lines.

    def _read_blueprint_file(self, blueprint_file):
        """Parse a blueprint file into a list of residue rows.

        FOLDINFO, HSSTRIPLET, HSSTRIAD and HHPAIR lines, comment lines and
        blank lines are skipped. An SSPAIR line is stored in
        ``self.sspairs``. Every other line must be a residue row.
        """
        register = re.compile(r'^\s*(\d+)\s+(\w+)\s+(\w+)\s+(.+)')
        sspair = re.compile(r"(\d+)-(\d+).(\w).([-]?\d+)")
        skipped_prefixes = ('FOLDINFO', 'HSSTRIPLET', 'HSSTRIAD', 'HHPAIR',
                            '#')
        rows = []
        with open(blueprint_file) as handle:
            for line in handle:
                if not line.strip() or line.startswith(skipped_prefixes):
                    continue
                if line.startswith('SSPAIR'):
                    self.sspairs = sspair.findall(line)
                    continue
                fields = register.split(line)
                rows.append(
                    [int(fields[1]), fields[2], fields[3], fields[4]])
        return rows

    def topology(self):
        """Return the segment ids joined with '-', e.g. ``'H1-L1-E1'``."""
        return '-'.join(s.id for s in self.segments)

    def topology_lengths(self):
        """Return two strings giving the length of each H/E/L segment.

        The first has the form ``'H2-L1-'`` (note the trailing dash), the
        second ``'H[2-2]L[1-1]'``.
        """
        ss_lst = re.findall(r"[HEL]\d+", self.topology())
        topol2 = ''
        topol3 = ''
        for ss in ss_lst:
            n = len(self.segment_dict[ss].bp_data)
            topol2 += '%s%s-' % (ss[0], n)
            topol3 += '%s[%s-%s]' % (ss[0], n, n)
        return topol2, topol3

    def ss_tag(self):
        """Return the helix and strand counts as ``'<n>H<m>E'``."""
        helices = sum(1 for s in self.segments if s.sstype == 'H')
        strands = sum(1 for s in self.segments if s.sstype == 'E')
        return "%dH%dE" % (helices, strands)

    def freeze_all(self):
        """Mark every residue as frozen (action ``'.'``)."""
        for res in self.bp_data:
            res[3] = '.'

    def remodel_all(self):
        """Mark every residue for remodelling (action ``'R'``)."""
        for res in self.bp_data:
            res[3] = 'R'

    def remodel_segment(self,
                        index=None,
                        id=None,
                        index_to_zero=False,
                        loop_edge=True):
        """Mark one segment for remodelling.

        Args:
            index: Position of the segment in ``self.segments``. Position 0
                is valid. Takes precedence over ``id``.
            id: Segment id, used when ``index`` is not given.
            index_to_zero: Also set the residue indices of the segment to 0.
            loop_edge: For every loop segment that is neither the first nor
                the last segment, extend a remodelled loop end onto the
                adjacent residue of the neighbouring segment.
        """
        if index is not None:
            targets = self.segments[index].bp_data
        elif id:
            targets = self.segment_dict[id].bp_data
        else:
            targets = []

        for res in targets:
            if index_to_zero:
                res[0] = 0
            res[3] = 'R'

        if loop_edge:
            for i in range(1, len(self.segments) - 1):
                seg = self.segments[i]
                if seg.sstype != 'L':
                    continue
                if seg.bp_data[0][3] == 'R':
                    self.segments[i - 1].bp_data[-1][3] = 'R'
                if seg.bp_data[-1][3] == 'R':
                    self.segments[i + 1].bp_data[0][3] = 'R'

    def residue_segment(self, pos):
        """Return the id of the segment holding residue index ``pos``.

        Returns an empty string when no segment contains that index.
        """
        for segment_id, seg in self.segment_dict.items():
            for res in seg.bp_data:
                if res[0] == pos:
                    return segment_id
        return ''

    def segment_lengths(self):
        """Return type and length of every segment, e.g. ``'H2-L1-E1'``."""
        return '-'.join(
            s.sstype + str(len(s.bp_data)) for s in self.segments)

    def reindex_blueprint(self, start=1, rebuild_index_to_zero=False):
        """Renumber the residue indices consecutively from ``start``.

        With ``rebuild_index_to_zero`` residues marked ``'R'`` get index 0
        and do not consume a number.
        """
        indexer = start
        for row in self.bp_data:
            if rebuild_index_to_zero and row[3] == 'R':
                row[0] = 0
            else:
                row[0] = indexer
                indexer += 1

    def segment_list(self):
        """Return the H/E/L segment ids found in the topology string."""
        return re.findall(r'([HEL]\d+)-?', self.topology())

    def dump_blueprint(self, filename, header_lines=None):
        '''Write the blueprint to ``filename``.

        header_lines are for setting foldinfo, hsstriplet or any other
        register on the top of the blueprint. Surrounding whitespace is
        stripped from each header line so that a line that already ends in
        a newline is not followed by an empty line.'''
        with open(filename, 'w') as out:
            for line in header_lines or ():
                out.write(line.strip() + '\n')
            for row in self.bp_data:
                out.write("%d %s %s %s\n" % tuple(row))

    def dump_pdb(self, filename):
        """Write ``self.structure`` to ``filename`` in PDB format."""
        io = PDBIO()
        io.set_structure(self.structure)
        io.save(filename)

    def swapp_segments(self, index1, index2):
        '''Swap two segments and prepare the blueprint for remodelling.

        The whole structure is frozen first. The segments directly connected
        to the swapped ones are then marked for remodelling with index 0,
        and their residues are detached from the structure to avoid clashes
        during the remodelling. Finally the blueprint rows and the remaining
        residues are renumbered from 1.

        NOTE(review): the neighbours are addressed as index - 1 and
        index + 1 without range checks, so swapping the first or the last
        segment wraps around or raises IndexError.'''
        self.freeze_all()
        for neighbour in (index1 - 1, index1 + 1, index2 - 1, index2 + 1):
            self.remodel_segment(neighbour, index_to_zero=True)

        self.segments[index1], self.segments[index2] = (
            self.segments[index2], self.segments[index1])

        # Renumber the blueprint rows and the residues in the new order.
        indexer = 1
        residues_to_detach = set()
        for segment in self.segments:
            for row, residue in zip(segment.bp_data, segment.residues):
                if row[0] == 0:
                    residues_to_detach.add(residue)
                    continue
                row[0] = indexer
                old_id = residue.id
                residue.id = (old_id[0], indexer, old_id[2])
                indexer += 1

        # Remove the residues that will be rebuilt.
        for res in residues_to_detach:
            res.get_parent().detach_child(res.id)

        # Sort the residues of every chain according to the new numbering.
        for chain in self.structure.get_chains():
            chain.child_list = sorted(chain.child_list, key=lambda r: r.id[1])

        # The flat views must follow the new segment order.
        self.bp_data = [
            row for seg in self.segments for row in seg.bp_data
        ]
        self.residues = [
            res for seg in self.segments for res in seg.residues
        ]
class Family(object):
    '''A class that compiles information about a protein structure and
    related sequences. This information is meant to be sufficient to
    filter the residues and calculate an ez-beta moment from those that
    remain.

    Attributes:
    stru_name: Name of the structure
    stru_path: Path of the PDB format structure file
    stru: the structure, as a Biopython entity, with all waters removed
    msa: a dictionary mapping sequence identifiers to rows in a multiple
        sequence alignment
    template_seq: the row of the MSA containing the sequence of the
        structure
    res_to_pos: a dictionary mapping residues from structures to their
        column number in the MSA (assuming the first column is numbered 0)
    dssp: a Biopython DSSP object with a DSSP for the structure
    calc: an Ez-beta calculator'''

    def __init__(self, stru_name, stru_path, msa_path, template_name,
                 param_path):
        '''Requires a name for the structure (your choice), a path to a
        PDB format structure file, a path to a multiple sequence alignment
        (Clustal format) containing a row with exactly the same sequence as
        the structure, the sequence identifier of this row, and a path to a
        CSV file of Ez-beta parameters (see zenergy.Calculator for how to
        make these files)

        Raises KeyError if template_name is not a sequence identifier in
        the alignment.'''
        self.stru_name = stru_name
        self.stru_path = stru_path
        # Parser warnings are silenced on purpose.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.stru = PDBParser().get_structure(stru_name, stru_path)

        # When Daniel created the aligned structures, he removed heteroatoms
        # (though the procedure he used seems to have removed anything
        # without the residue identifier of one of the 20 standard amino
        # acids, leading to main chain selenomethionines being removed from
        # the 1FEP structure). However, he also added a sort of box of
        # water atoms (perhaps as a visual aid, so you can tell how the
        # coordinate system is defined?)
        # Therefore, remove all waters.
        # Each water is detached from its own chain only. Detaching by id
        # from every chain would also remove a non-water residue of another
        # chain that happens to share the id.
        waters = [res for res in self.stru.get_residues()
                  if res.get_resname() == 'HOH']
        for water in waters:
            water.get_parent().detach_child(water.get_id())

        # The alignment is parsed completely inside the block, so the file
        # can be closed right away.
        with open(msa_path) as msa_file:
            msa = Bio.AlignIO.read(msa_file, 'clustal')
        self.msa = dict((seq.id, seq) for seq in msa)
        self.template_seq = self.msa[template_name]
        self.res_to_pos = map_res_to_pos(self.stru.get_residues(),
                                         self.template_seq)
        self.dssp = DSSP_win.DSSP(self.stru.child_dict[0], stru_path)

        # NOTE(review): binary mode with csv.reader is the Python 2 idiom;
        # Python 3 needs text mode with newline=''. The handle is left open
        # because it is not visible here whether zenergy.Calculator reads
        # the rows during construction or later.
        params = csv.reader(open(param_path, 'rb'))
        self.calc = zenergy.Calculator(params)
'-mute basic -mute core -ignore_zero_occupancy false -rebuild_disulf false -detect_disulf false' ) for pdb_file in glob.glob('../Data/structures/*.pdb'): print(pdb_file) if '.rosetta' in pdb_file: continue pdb_file_clean = pdb_file.replace('.pdb', '.rosetta.pdb') initial_pose = pose_from_pdb(pdb_file) initial_pose.dump_pdb(pdb_file_clean) io = PDBIO() pdb = PDBParser().get_structure( pdb_file.split('/')[-1].split('.')[0], pdb_file_clean) chains = list(pdb.get_chains()) assert len(chains) == 1 residues = list(chains[0].get_residues()) Dice.extract(pdb, chains[0].get_id(), 1, len(residues) + 1, pdb_file_clean) #io.set_structure(chains[0]) #io.save(pdb_file_clean) #io = PDBIO() ##Set up ScoreFunction #sf = get_fa_scorefxn() ##Set up MoveMap. #mm = MoveMap() #mm.set_bb(True) #mm.set_chi(True)