def _superimpose_structures(structure, superimpose_reference):
    """
    Superimpose a structure onto a reference structure.

    The fit is computed from the C-alpha atoms of the first polypeptide of
    each structure and is then applied to every atom of C{structure}.

    @param structure: Structure to be superimposed (the moving one).
    @type structure: Structure

    @param superimpose_reference: Structure used as the fixed reference.
    @type superimpose_reference: Structure

    @rtype: Structure
    @return: The superimposed structure (the same object passed in).
    """
    builder = PPBuilder()

    # Only the first polypeptide of each structure takes part in the fit.
    reference_pp = builder.build_peptides(superimpose_reference)[0]
    mobile_pp = builder.build_peptides(structure)[0]

    # CA atoms only.
    reference_ca = reference_pp.get_ca_list()
    mobile_ca = mobile_pp.get_ca_list()

    # The transformation is applied to all atoms, not just the fitted ones.
    every_atom = Selection.unfold_entities(structure, "A")

    fitter = Superimposer()
    fitter.set_atoms(reference_ca, mobile_ca)
    fitter.apply(every_atom)
    return structure
def get_contact_map(self, chain_id):
    '''
        Input:
            self: Use Biopython.PDB structure which has been stored in an object variable
            chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
        Return:
            Return a complete contact map (see description in exercise sheet)
            for a given chain in a Biopython.PDB structure as numpy array.
            The values in the matrix describe the c-alpha distance between all residues
            in a chain of a Biopython.PDB structure.
            Only integer values of the distance have to be given: distances are
            truncated towards zero by the final integer cast, not rounded.
    '''
    length = len(self.get_sequence(chain_id))
    ppb = PPBuilder()
    contact_map = np.zeros((length, length), dtype=np.float32)
    chain = self.structure[0][chain_id]
    # NOTE(review): row/column indices restart at 0 for every polypeptide
    # returned by build_peptides, so a chain split into several fragments
    # overwrites the same cells -- confirm this is intended.
    for pp in ppb.build_peptides(chain, aa_only=True):
        for i, residue_1 in enumerate(pp):
            ca_1 = residue_1['CA']
            for j, residue_2 in enumerate(pp):
                # Subtracting two Bio.PDB atoms yields their distance.
                contact_map[i, j] = ca_1 - residue_2['CA']
    # The `np.int` alias was removed in NumPy 1.24; it was the builtin int.
    return contact_map.astype(int)
def __init__(self, prefix, computedFeatsRootDir, areForTrainAndTest=True,
             boundAvailable=True, res2res_dist=6.0, statusManager=None):
    '''
      :param prefix: str. An id for the complex
      :param computedFeatsRootDir: str. path where features will be stored
      :param areForTrainAndTest: boolean. True if ligand and receptor are posed in
                                 interacting coordinates and thus, we know the label.
                                 False if they are for prediction and thus, we cannot
                                 assign labels.
      :param boundAvailable: boolean. True if there is a bound and unbound pdb for
                             each complex. False otherwise
      :param res2res_dist: float. max distance between any heavy atoms of 2 amino
                           acids to be considered as interacting (Angstrom)
      :param statusManager: class that implements .setStatus(msg) to communicate
    '''
    ToolManager.__init__(self, computedFeatsRootDir, statusManager=statusManager)

    self.prefix = prefix
    self.boundAvailable = boundAvailable
    self.areForTrainAndTest = areForTrainAndTest
    self.res2res_dist = res2res_dist

    # A bound structure only makes sense when labels are known, so the
    # combination boundAvailable=True / areForTrainAndTest=False is rejected.
    assert boundAvailable != True or areForTrainAndTest != False, \
        "Error parameters in CMap: boundAvailable==True and areForTrainAndTest==False"

    # radius set to 200 to not worry about broken chains
    self.ppb = PPBuilder(radius=200)

    # Output directories, all nested under <computedFeatsRootDir>/common.
    commonDir = myMakeDir(computedFeatsRootDir, "common")
    self.outPath = commonDir
    self.outPathCM = myMakeDir(commonDir, "contactMaps")
    self.outPathResDict = myMakeDir(commonDir, "includedResidues")
    self.outPathNeigs = myMakeDir(commonDir, "voroNeigs")
def _map(self, model):
    """Map (PRIVATE).

    Builds the polypeptides of ``model``, cuts each one into fragments of
    length ``self.flength``, classifies the fragments against
    ``self.reflist`` and associates every residue that is at least
    ``self.edge`` positions away from both polypeptide ends with its
    classified fragment.

    :param model: the model that will be mapped
    :type model: L{Model}
    :return: dictionary mapping residues to their classified fragment
    :raises PDBException: re-raised unchanged unless its message is
        exactly 'CHAINBREAK', in which case the polypeptide is skipped
    """
    ppb = PPBuilder()
    fd = {}
    for pp in ppb.build_peptides(model):
        try:
            # make fragments
            flist = _make_fragment_list(pp, self.flength)
            # classify fragments
            mflist = _map_fragment_list(flist, self.reflist)
            n_res = len(pp)
            for i, res in enumerate(pp):
                # start and end residues have no centred fragment
                if i < self.edge or i >= n_res - self.edge:
                    continue
                fd[res] = mflist[i - self.edge]
        except PDBException as why:
            # Compare the message text: an exception instance never
            # equals a str, so the original test could not succeed.
            if str(why) == 'CHAINBREAK':
                # Funny polypeptide - skip
                continue
            raise
    return fd
def match_pdb_residue_num_to_seq(model, ref=None):
    """Match PDB residue numbering (as given in PDB file) to a reference
    sequence (can be pdb sequence) numbered by index.

    Reference sequence is 1-indexed (and is indexed as such in output).

    Args:
        model: A biostructmap Model object.
        ref (dict): A dictionary containing reference protein sequences for
            each chain in the protein structure. Defaults to the protein
            sequences given in PDB file.

    Returns:
        dict: A dictionary mapping reference sequence index (key) to
        residue numbering as given in the PDB file (value). For example,
        we might have a key of ('A', 17) for the 17th residue in the
        reference sequence for chain 'A', with a value of
        ('A', (' ', 273, ' ')) that represents the Bio.PDB identifier for
        the corresponding residue.
    """
    builder = PPBuilder()
    peptides = builder.build_peptides(model.parent().structure)
    reference = model.parent().sequences if ref is None else ref

    mapping = {}
    for peptide in peptides:
        fragment_seq = peptide.get_sequence()
        # Presume that peptide belongs to a single chain
        chain_id = peptide[0].get_full_id()[2]
        _, ref_to_pdb = align_protein_sequences(fragment_seq, reference[chain_id])
        # Alignment positions are 1-based; the peptide is indexed from 0.
        mapping.update(
            ((chain_id, ref_pos), peptide[pdb_pos - 1].get_full_id()[2:4])
            for ref_pos, pdb_pos in ref_to_pdb.items()
        )
    return mapping
def deleteChain():
    """Interactively delete whole chains from a PDB file and save the result.

    Reads the module-level ``pdb_name``, asks how many chains to remove and
    which ones, detaches them from every model, and writes either
    ``<sequence>_bound.pdb`` (lower-cased sequence of what remains) or
    ``<name>_without<last removed chain>.pdb``.

    Raises ValueError if the number of chains typed is not an integer.
    """
    parser = PDBParser()
    nameStruct = pdb_name.partition('.')[0]
    structure = parser.get_structure(nameStruct, pdb_name)

    # Python 2's input() evaluates whatever the user types; parse it as a
    # plain integer instead.
    nb_chain = int(raw_input('How many chain do you want to delete : '))

    # Pre-set so the output name can still be built when nb_chain is 0.
    rm_chain = ''
    for _ in range(nb_chain):
        rm_chain = raw_input('What chain you want to delete : ')
        for model in structure:
            # Look the chain up by id rather than detaching while iterating
            # over the model's children.
            if model.has_id(rm_chain):
                model.detach_child(rm_chain)

    pept = raw_input('Do you want to get a pdb with the sequence in its name : ')
    if pept == 'y':
        ppb = PPBuilder()
        seq = ''.join(str(pp.get_sequence())
                      for pp in ppb.build_peptides(structure)).lower()
        out_name = seq + '_bound.pdb'
    else:
        out_name = nameStruct + '_without' + rm_chain + '.pdb'

    w = PDBIO()
    w.set_structure(structure)
    w.save(out_name)
def _get_proteins_by_structure(self, pdb_structure, model, file_path):
    """
    _get_proteins_by_structure: Given a pdb_structure, parse the essential protein data

    Returns a list with one dict per non-empty chain holding the file base
    name ('id'), the given model id, the chain id, the chain sequence (all
    polypeptide fragments concatenated) and the md5 hex digest of that
    sequence.
    """
    builder = PPBuilder()
    records = []

    for chain in pdb_structure.get_chains():
        # Falsy chains are skipped, as in the truthiness test of the chain object.
        if not chain:
            continue
        chain_seq = ''.join(
            str(fragment.get_sequence())
            for fragment in builder.build_peptides(chain)
        )
        records.append({
            'id': os.path.basename(file_path),
            'model_id': model,
            'chain_id': chain.get_id(),
            'sequence': chain_seq,
            'md5': hashlib.md5(chain_seq.encode()).hexdigest()
        })

    return records
def read_structure_seqs(self, strucm):
    """ Extracts sequences from structure

    For every chain of every model in ``strucm.st`` one SeqRecord per
    polypeptide fragment is stored in ``self.data[chain_id]['pdb'][model_id]``.
    ``self.data[chain_id]['pdb']['wrong_order']`` records whether any
    fragment of that chain had a first residue number not lower than its
    last one.
    """
    builder = PPBuilder()
    # PDB extracted sequences
    for mod in strucm.st:
        for chn in mod.get_chains():
            ch_id = chn.id
            records = []
            wrong_order = False
            for frag in builder.build_peptides(chn):
                first = frag[0].get_id()[1]
                last = frag[-1].get_id()[1]
                label = '{}:{}-{}'.format(ch_id, first, last)
                record = SeqRecord(
                    frag.get_sequence(),
                    'pdbsq_' + label,
                    'pdbsq_' + label,
                    'PDB sequence chain ' + label
                )
                if first < last:
                    location = FeatureLocation(first, last)
                else:
                    print("Warning: unusual residue numbering at chain ", ch_id)
                    print(
                        "Warning: chain reconstruction may not be available"
                    )
                    # Store the limits swapped so start is the lower number.
                    location = FeatureLocation(last, first)
                    wrong_order = True
                record.features.append(SeqFeature(location))
                records.append(record)
            if ch_id not in self.data:
                self.add_empty_chain(ch_id)
            pdb_entry = self.data[ch_id]['pdb']
            pdb_entry[mod.id] = records
            pdb_entry['wrong_order'] = wrong_order
def _map(self, model):
    """
    Build the polypeptides of the model, cut each one into fragments of
    length C{self.flength}, classify the fragments against C{self.reflist}
    and associate every residue that is at least C{self.edge} positions
    away from both polypeptide ends with its classified fragment.

    @param model: the model that will be mapped
    @type model: L{Model}

    @return: dictionary mapping residues to their classified fragment
    @rtype: dict
    """
    ppb = PPBuilder()
    fd = {}
    for pp in ppb.build_peptides(model):
        try:
            # make fragments
            flist = _make_fragment_list(pp, self.flength)
            # classify fragments
            mflist = _map_fragment_list(flist, self.reflist)
            n_res = len(pp)
            for i, res in enumerate(pp):
                # start and end residues have no centred fragment
                if i < self.edge or i >= n_res - self.edge:
                    continue
                fd[res] = mflist[i - self.edge]
        # "except X as name" is accepted by Python 2.6+ and Python 3; the
        # comma form is a syntax error on Python 3.
        except PDBException as why:
            # Compare the message text: an exception instance never
            # equals a str, so the original test could not succeed.
            if str(why) == 'CHAINBREAK':
                # Funny polypeptide - skip
                continue
            raise
    # The mapping was built but never handed back to the caller.
    return fd
def main():
    """Write the amino-acid sequence of model 0 of a PDB file as FASTA.

    Command line options:
      -p / --pdb        path to the input PDB file
      -f / --pdb_fasta  path of the FASTA file to write

    The sequence of every chain (non-standard residues included) is
    concatenated, echoed to stdout and written to the output file.
    """
    opt_parser = optparse.OptionParser()
    opt_parser.add_option("-p", "--pdb", dest="pdb",
                          help="path to PDB file", metavar="STRING")
    opt_parser.add_option("-f", "--pdb_fasta", dest="pdb_fasta",
                          help="path to PDB fasta file (out)", metavar="STRING")
    (options, args) = opt_parser.parse_args()

    # Without this check a missing option surfaces as an obscure TypeError.
    if not options.pdb or not options.pdb_fasta:
        opt_parser.error("both --pdb and --pdb_fasta are required")

    pdb_fasta = options.pdb_fasta
    pdb_file = options.pdb
    pdb_name = os.path.basename(pdb_file).split(".")[0]

    pdb_parser = BP.PDBParser()
    ppb = PPBuilder(radius=1000)  # retrieve all amino acids

    structure = pdb_parser.get_structure(pdb_name, pdb_file)
    model = structure[0]
    pdbseq = "".join(
        str(pp.get_sequence())
        for chain in model
        for pp in ppb.build_peptides(chain, aa_only=False)
    )

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("> %s %i" % (pdb_name, len(pdbseq)))
    print(pdbseq)
    with open(pdb_fasta, "w") as o:
        o.write(">%s %i\n%s\n" % (pdb_name, len(pdbseq), pdbseq))
def fetch_protein(pdb_id: str) -> Tuple[List[str], np.ndarray]:
    """Download a PDB entry and return the sequence and backbone angles
    of its first polypeptide.

    The file is fetched from files.rcsb.org into the current working
    directory, parsed, and removed again even if parsing fails.

    :param pdb_id: identifier used to build the download URL and file name
    :return: ``(aa_sequence, phi_psi_angles)`` where ``aa_sequence`` is the
        list of one-letter residue codes and ``phi_psi_angles`` is an array
        with the phi values in row 0 and the psi values in row 1, in
        degrees; undefined angles are reported as 180
    :raises requests.HTTPError: if the download returns an error status
    :raises IndexError: if no polypeptide can be built from the file
    """
    # retrieve pdb file from Protein Data Bank
    pdb_file = f"{pdb_id}.pdb"
    pdb_file_path = os.path.join(os.getcwd(), pdb_file)
    response = requests.get(f"https://files.rcsb.org/download/{pdb_file}")
    # Fail here instead of parsing an HTML error page as a structure.
    response.raise_for_status()

    try:
        with open(pdb_file_path, "w") as f:
            f.write(response.text)

        # parse pdb file
        structure = PDBParser().get_structure(pdb_id, pdb_file_path)
        peptide = PPBuilder().build_peptides(structure)[0]

        # extract amino acid sequence and phi/psi angles
        aa_sequence = list(peptide.get_sequence())
        phi_psi = peptide.get_phi_psi_list()
    finally:
        # remove pdb file (portable, and also done on failure)
        if os.path.exists(pdb_file_path):
            os.remove(pdb_file_path)

    def _to_degrees(angle):
        # Test for None explicitly: an angle of exactly 0.0 is valid.
        return 180 if angle is None else np.rad2deg(angle)

    phi_psi_angles = np.array(
        [(_to_degrees(phi), _to_degrees(psi)) for phi, psi in phi_psi]).T

    return aa_sequence, phi_psi_angles
def get_chain_position(input_file, global_index):
    """Translate a 0-based index over all polypeptides of a structure into
    a chain id and a 1-based position inside the polypeptide that holds it.

    Polypeptides are taken in the order returned by PPBuilder (with
    aa_only=False). Returns (None, -1) when the index does not fall inside
    any polypeptide.
    """
    structure_id = os.path.splitext(input_file)[0].replace('./', '')
    structure = PDBParser().get_structure(structure_id, input_file)
    peptides = PPBuilder().build_peptides(structure, aa_only=False)
    wrapped = [EPolyPeptide(pp) for pp in peptides]

    if global_index >= sum(len(pp) for pp in wrapped):
        return None, -1

    # Work with a 1-based target from here on.
    target = int(global_index) + 1
    if target <= 0:
        return None, -1

    covered = 0  # residues contained in the polypeptides already passed
    for pp in wrapped:
        upper = covered + len(pp)
        if target <= upper:
            return pp.chain_id, target - covered
        covered = upper
    return None, -1
def getBoundResList(fname_bound, fname_unbound, listOfDictsChainToResId):
    """Translate unbound-structure residue ids into bound-structure ids.

    :param fname_bound: path of the bound pdb file
    :param fname_unbound: path of the unbound pdb file
    :param listOfDictsChainToResId: list of dicts {unbound chain id: residue ids};
                                    the chain id "*" is looked up as " "
    :return: list of dicts {bound chain id: [bound residue ids]}, one per input
             dict; residues for which the mapper returns None are dropped
    """
    parser = PDBParser(QUIET=True)
    structureUnbound = parser.get_structure(fname_unbound, fname_unbound)
    structureBound = parser.get_structure(fname_bound, fname_bound)

    builder = PPBuilder()
    peptidesUnbound = builder.build_peptides(structureUnbound, aa_only=False)
    peptidesBound = builder.build_peptides(structureBound, aa_only=False)

    mapper = BoundUnboundMapper(peptidesUnbound, peptidesBound)
    mapper.build_correspondence()

    translated = []
    for unboundChains in listOfDictsChainToResId:
        boundChains = {}
        for chainId_u in unboundChains:
            lookupChain = " " if chainId_u == "*" else chainId_u
            for resId_u in sorted(unboundChains[chainId_u]):
                match = mapper.mapUnboundToBoundUsingId(lookupChain, resId_u)
                if match is None:
                    continue
                chainId_b, resId_b = match
                boundChains.setdefault(chainId_b, []).append(resId_b)
        translated.append(boundChains)
    return translated
def doChainAlignments(pdbID, structure, consensusFile, chainSequencesDir,
                      verbose):
    """Align the sequence of every chain against a consensus with `needle`.

    For each chain the concatenated polypeptide sequence is written to
    <chainSequencesDir>/<pdbID>_<chain>.fasta and the external `needle`
    program is run on it. The process exits if a chain yields no sequence.
    """
    print('Pairwise alignment of chain sequences with consensus...')
    builder = PPBuilder()
    for chain in list(structure.get_chains()):
        sequence = "".join(
            str(pp.get_sequence()) for pp in builder.build_peptides(chain))
        sequenceID = pdbID + '_' + chain.get_id()
        sequenceOutput = os.path.join(chainSequencesDir, sequenceID + '.fasta')

        if not sequence:
            print("ERROR: Unable to get chain sequence from PDB file.")
            exit("Selected PDB-file does not contain protein structure.")

        with open(sequenceOutput, 'w') as fasta:
            fasta.write('>' + sequenceID + '\n' + sequence)

        if verbose:
            print('Chain ' + chain.get_id())
            print(sequence)

        needleCommand = [
            'needle',
            '-asequence', sequenceOutput,
            '-bsequence', consensusFile,
            '-noendweight',
            '-endopen', '10.0',
            '-endextend', '0.5',
            '-brief',
            '-aformat', 'srspair',
            '-auto',
            '-aname_outfile', sequenceID,
            '-adirectory_outfile', chainSequencesDir
        ]
        subprocess.check_call(needleCommand)
    print('OK')
def main():
    """Write the amino-acid sequence of model 0 of a PDB file as FASTA.

    Command line options:
      -p / --pdb        path to the input PDB file
      -f / --pdb_fasta  path of the FASTA file to write

    The sequence of every chain (non-standard residues included) is
    concatenated, echoed to stdout and written to the output file.
    """
    opt_parser = optparse.OptionParser()
    opt_parser.add_option("-p",
                          "--pdb",
                          dest="pdb",
                          help="path to PDB file",
                          metavar="STRING")
    opt_parser.add_option("-f",
                          "--pdb_fasta",
                          dest="pdb_fasta",
                          help="path to PDB fasta file (out)",
                          metavar="STRING")
    (options, args) = opt_parser.parse_args()

    # Without this check a missing option surfaces as an obscure TypeError.
    if not options.pdb or not options.pdb_fasta:
        opt_parser.error("both --pdb and --pdb_fasta are required")

    pdb_fasta = options.pdb_fasta
    pdb_file = options.pdb
    pdb_name = os.path.basename(pdb_file).split(".")[0]

    pdb_parser = BP.PDBParser()
    ppb = PPBuilder(radius=1000)  # retrieve all amino acids

    structure = pdb_parser.get_structure(pdb_name, pdb_file)
    fragments = []
    for chain in structure[0]:
        for pp in ppb.build_peptides(chain, aa_only=False):
            fragments.append(str(pp.get_sequence()))
    pdbseq = "".join(fragments)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("> %s %i" % (pdb_name, len(pdbseq)))
    print(pdbseq)
    with open(pdb_fasta, "w") as o:
        o.write(">%s %i\n%s\n" % (pdb_name, len(pdbseq), pdbseq))
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor, conn: sqlite3.Connection, dir_name: str):
    """
    Download PDB entries, summarise each one and store it in the database.

    For every id a row is inserted into the ``Structures`` table and the id
    is put on ``q``; ``None`` is put on ``q`` when the whole list is done.
    Progress is mirrored in ``status_tmp.txt``, which is removed at the end.

    :param filelist: PDB ids to process; repeated ids are processed once
    :param q: queue receiving each successfully stored id, then ``None``
    :param lock: lock held around every database access
    :param cursor: cursor used for the INSERT
    :param conn: connection committed after each successful INSERT
    :param dir_name: directory under which one sub-directory per id is created
    """
    status_path = 'status_tmp.txt'
    with open(status_path, 'w') as f:
        f.write('')

    # Ids already stored. Kept in memory: the previous test compared the id
    # against file lines that still carried their trailing newline, so it
    # never matched (and leaked a file handle per iteration).
    processed = set()
    pdbl = PDBList()

    for file in filelist:
        if file in processed:
            continue

        entry_dir = os.path.join(dir_name, file)
        entry_path = os.path.join(entry_dir, 'pdb{:s}.ent'.format(file))
        pdbl.retrieve_pdb_file(file, pdir=entry_dir, file_format='pdb')
        if not os.path.exists(entry_path):
            print("File with ID PDB: {:s} not found!".format(file))
            continue

        parser = PDBParser()
        # Use the real id for the structure, not the literal '{:s}'.
        structure = parser.get_structure(file, entry_path)
        header = parser.header
        name = header.get('name', '')
        head = header.get('head', '')
        method = header.get('structure_method', '')
        res = header.get('resolution', '')
        # Entries without a numeric resolution are stored as NULL.
        resolution = round(res, 2) if isinstance(res, (int, float)) else None

        ncomp = 0
        nchain = 0
        eclist = []
        for values in header['compound'].values():
            ncomp += 1
            nchain += len(values['chain'].split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)

        nres = 0
        mmass = 0
        ppb = PPBuilder()
        for pp in ppb.build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            mmass += int(ProteinAnalysis(str(seq)).molecular_weight())

        with lock:
            try:
                # Placeholders instead of string formatting: header text may
                # contain quotes and must never be interpreted as SQL.
                cursor.execute(
                    """INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, RESOLUTION, NCOMP, NCHAIN, NRES, MMASS, EC)
                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (file, name, head, method, resolution, ncomp, nchain,
                     nres, mmass, ec))
            except sqlite3.DatabaseError as err:
                print("Error: ", err)
                continue
            print("Download Done for ID PDB: {:s}".format(file))
            conn.commit()
            q.put(file)

        processed.add(file)
        with open(status_path, 'at') as f:
            f.write(file + '\n')

    os.remove(status_path)
    q.put(None)
def generate_seq_file(score_file, save_file):
    """Write the mutated sequence of every mutation listed in a score file.

    The first column of ./dataFile/<score_file> (tab separated) holds names
    of the form <pdb>_<chain>_<wt 3-letter><position><mutant 3-letter>.
    For each distinct name the chain sequence is rebuilt from the PDB file
    (downloaded into ./dataFile/PDB_dl when missing), the residue at the
    1-based position is replaced, and "<name>\t<sequence>" is written to
    save_file.
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')

    # Group the distinct mutations by PDB id, keeping first-seen order.
    mut_dict = dict()
    seen_names = set()
    for chain in sf.iloc[:, 0]:
        if chain in seen_names:
            continue
        seen_names.add(chain)
        info = chain.split('_')
        pdb_id = info[0]
        mutation_code = info[2]
        record = {
            'chain_id': info[1],
            'wt_aa': mutation_code[0:3],
            'mu_aa': mutation_code[-3:],
            'mu_pos': int(''.join(ch for ch in mutation_code if ch.isdigit())),
            'name': chain,
        }
        mut_dict.setdefault(pdb_id, []).append(record)
    del seen_names

    parser = PDBParser()
    seq_builder = PPBuilder()
    pdb_dl_handle = PDBList()
    PDB_DIR = './dataFile/PDB_dl'

    mut_collect = dict()
    for pdb_id in mut_dict:
        pdb_file = '{}/pdb{}.ent'.format(PDB_DIR, pdb_id.lower())
        # check if pdb file exists
        if not os.path.exists(pdb_file):
            pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_id, file_format='pdb', overwrite=False, pdir=PDB_DIR)
        model = parser.get_structure(pdb_id, pdb_file)[0]
        for mutation in mut_dict[pdb_id]:
            protein_chain = model[mutation['chain_id']]
            fragments = [str(pp.get_sequence()) for pp in seq_builder.build_peptides(protein_chain)]
            sequence = "".join(fragments).replace('\n', '').replace(' ', '')
            site = mutation['mu_pos'] - 1
            assert sequence[site] == three_to_one(mutation['wt_aa']), 'Wt amino acid failed to match'
            residues = list(sequence)
            residues[site] = three_to_one(mutation['mu_aa'])
            mut_collect[mutation['name']] = ''.join(residues)

    with open(save_file, 'w') as output_hl:
        output_hl.writelines(k + '\t' + v + '\n' for k, v in mut_collect.items())
def real_seq():
    """Return the concatenated polypeptide sequences read from
    '<protein.protein_id>.pdb', where `protein` is a name defined outside
    this function."""
    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure(protein.protein_id,
                                         protein.protein_id + '.pdb')
    full_sequence = ''
    for fragment in PPBuilder().build_peptides(structure):
        full_sequence += fragment.get_sequence()
    return full_sequence
def polypeptide(pdbfile):
    """Parse `pdbfile` and return its only polypeptide.

    The unpacking raises ValueError unless PPBuilder finds exactly one
    polypeptide in the structure.
    """
    structure = PDBParser().get_structure('test', pdbfile)
    [only_peptide] = PPBuilder().build_peptides(structure)
    return only_peptide
def get_phi_psi_data(pdb_id, chain=None):
    '''Gets phi and psi angle data.

    Returns one phi/psi list per polypeptide, over every model of the
    structure; when `chain` is given only chains with that id are used.
    '''
    builder = PPBuilder()
    angle_lists = []
    for model in get_structure(pdb_id):
        for chn in model:
            if chain is not None and chain != chn.get_id():
                continue
            for polypep in builder.build_peptides(chn):
                angle_lists.append(polypep.get_phi_psi_list())
    return angle_lists
def parse_structure(path):
    """
    Parses a PDB formatted structure using Biopython's PDB Parser.

    Keeps only the selected alternate location of disordered atoms, removes
    hetero and water residues, rejects non-standard amino acids and reports
    (on stderr) when the number of polypeptide fragments differs from the
    number of chains.

    Returns a tuple (structure, number of chains, number of residues); the
    residue count is taken before hetero/water residues are removed.
    """
    print('[+] Reading structure file: {0}'.format(path))
    file_name = os.path.basename(path)
    # Everything before the last dot; empty when the name has no dot.
    struct_name = '.'.join(file_name.split('.')[:-1])

    try:
        s = P.get_structure(struct_name, path)
    except Exception as e:
        print("[!] Structure '{0}' could not be parsed".format(struct_name),
              file=sys.stderr)
        raise Exception(e)

    # Double occupancy check: replace each disordered atom by its selected
    # child, flagged as an ordinary atom.
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        owner = atom.parent
        chosen = atom.selected_child
        chosen.altloc = ' '
        chosen.disordered_flag = 0
        owner.detach_child(atom.id)
        owner.add(chosen)

    # Remove HETATMs and solvent
    all_residues = list(s.get_residues())
    n_res = len(all_residues)
    for res in all_residues:
        hetero_flag = res.id[0][0]
        if hetero_flag in ('W', 'H'):
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError(
                'Unsupported non-standard amino acid found: {0}'.format(
                    res.resname))

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        gap_line = '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'
        for i_pp, pp in enumerate(peptides):
            print(gap_line.format(i_pp, pp[0], pp[-1]), file=sys.stderr)

    return (s, n_chains, n_res)
def get_pdb_amino_acid_sequences(pdb_path):
    """Return the sequence of every polypeptide in a PDB file as a list of
    strings; the structure id is the path without its last four characters."""
    pdb_parser = Bio.PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_path[:-4], pdb_path)
    return [
        str(fragment.get_sequence())
        for fragment in PPBuilder().build_peptides(structure)
    ]
def find_pdb_limits(pdb_path):
    """Return (start, end, seq) for the first polypeptide of a PDB file:
    the residue numbers of its first and last residue and its sequence."""
    structure = PDBParser().get_structure('', pdb_path)
    # takes the first (and only) polypeptide
    first_pp = PPBuilder().build_peptides(structure)[0]
    first_res, last_res = first_pp[0], first_pp[-1]
    return (first_res.get_id()[1], last_res.get_id()[1],
            first_pp.get_sequence())
def _model_file_to_data(self, file_path, params):
    """
    _model_file_to_data: Do the PDB conversion--parse the model pdb file for creating a pdb data object

    Returns a tuple (data, pp_no, params):
      data   -- dict describing the structure, or {} when parsing fails, when
                any later step fails, or when the feature match yields no
                'sequence_identities'
      pp_no  -- number of polypeptides found (0 when parsing failed)
      params -- the params returned by self._match_features, or the input
                params if that step was not reached

    Ordinary errors are logged and reported through the empty data dict, as
    callers already expect; they are not raised.
    """
    logging.info(
        f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
    )

    parser = PDB.PDBParser(PERMISSIVE=1)
    pp_no = 0
    data = {}

    try:
        structure = parser.get_structure("test", file_path)
    except Exception as e:
        # Exceptions have no `.message` attribute on Python 3.
        logging.info(f'PDBParser errored with message: {e}')
        return data, pp_no, params

    # An explicit handler replaces the former `return` inside `finally`,
    # which also discarded KeyboardInterrupt/SystemExit and hid every error
    # without a trace.
    try:
        pp_no = len(PPBuilder().build_peptides(structure))

        (compound, source) = self._get_compound_source(structure)
        (num_models, model_ids) = self._get_models_from_structure(structure)
        (num_chains, chain_ids) = self._get_chains_from_structure(structure)
        (num_residues, residue_ids) = self._get_residues_from_structure(structure)
        (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)

        model = structure[0]
        protein_data = self._get_proteins_by_structure(
            structure, model.get_id(), file_path)
        (protein_data, params) = self._match_features(params, protein_data)

        pdb_info = params.get('pdb_info', None)
        if pdb_info and pdb_info.get('sequence_identities', None):
            data = {
                'name': structure.header.get('name', ''),
                'num_chains': num_chains,
                'num_residues': num_residues,
                'num_atoms': num_atoms,
                'compound': compound,
                'source': source,
                'proteins': protein_data
            }
        else:
            logging.info(
                f'Parsing pdb file {file_path} failed to match KBase genome/features!'
            )
            data = {}
    except Exception:
        logging.exception(f'Extracting data from pdb file {file_path} failed')
        data = {}

    return data, pp_no, params
def get_pdb_torsion_angles(pdb_path, chain_index):
    """Return (phi list, psi list) for one polypeptide of a PDB file.

    `chain_index` selects an entry of the list returned by
    PPBuilder.build_peptides, i.e. it indexes polypeptide fragments.
    """
    pdb_parser = Bio.PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_path[:-4], pdb_path)
    peptides = PPBuilder().build_peptides(structure)
    angle_pairs = peptides[chain_index].get_phi_psi_list()
    phi_values = [pair[0] for pair in angle_pairs]
    psi_values = [pair[1] for pair in angle_pairs]
    return phi_values, psi_values
def get_primary_sequence(input_file):
    """Return the concatenated sequence of all polypeptides in a PDB file
    (built with aa_only=False)."""
    structure_id = os.path.splitext(input_file)[0].replace('./', '')
    structure = PDBParser().get_structure(structure_id, input_file)
    sequence = ""
    for peptide in PPBuilder().build_peptides(structure, aa_only=False):
        sequence += peptide.get_sequence()
    return sequence
def get_aa_encoded(protein_file):
    """Return the integer encoding of the polypeptide sequences of a PDB file.

    :param protein_file: path to the PDB file; the structure id is the path
        without its last four characters
    :return: result of int_encoding applied to the list of polypeptide
        sequences (strings) with AA_CODES
    """
    # Use the argument: the body previously read an undefined `pdb_path`.
    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
        protein_file[:-4], protein_file)
    ppb = PPBuilder()
    pdb_aas = [str(pp.get_sequence()) for pp in ppb.build_peptides(structure)]
    return int_encoding(pdb_aas, AA_CODES)
def obtian_seq_wo_seq_file(score_file):
    """Build the sequence of every chain named in a score file.

    The first column of ./dataFile/<score_file> (tab separated) starts with
    a 6-character chain name whose first 4 characters are the PDB id and
    whose last character is the chain id. Missing PDB files are downloaded
    into ./dataFile/PDB_dl.

    Returns a dict {chain name: sequence}.
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')

    # pdb id -> set of chain names that belong to it
    pdb = dict()
    for chain in sf.iloc[:, 0]:
        pdb.setdefault(chain[0:4], set()).add(chain[0:6])

    # create the link to the PDB database and retrieve all the files,
    # storing them locally under ./dataFile/PDB_dl/
    PDB_DIR = './dataFile/PDB_dl'
    if not os.path.exists(PDB_DIR):
        os.mkdir(PDB_DIR)

    def _local_path(pdb_id):
        return '{}/pdb{}.ent'.format(PDB_DIR, pdb_id.lower())

    # download all of the pdb files that are not present yet
    pdb_dl_handle = PDBList()
    for item in pdb:
        if os.path.exists(_local_path(item)):
            continue
        pdb_dl_handle.retrieve_pdb_file(pdb_code=item,
                                        file_format='pdb',
                                        overwrite=False,
                                        pdir=PDB_DIR)

    # for each pdb, we will construct the sequence
    seq_dict = dict()
    parser = PDBParser()
    seq_builder = PPBuilder()
    for pdb_id, chain_names in pdb.items():
        model = parser.get_structure(pdb_id, _local_path(pdb_id))[0]
        for chain in chain_names:
            # the last letter is the chain id
            protein_chain = model[chain[-1]]
            fragments = [
                str(pp.get_sequence())
                for pp in seq_builder.build_peptides(protein_chain)
            ]
            # clean the bad chars
            seq_dict[chain] = "".join(fragments).replace('\n', '').replace(' ', '')
    return seq_dict
def get_secondary_structure_details(self, name, pdb_file, aa_only=False):
    """Run DSSP on model 0 of a PDB file and collect per-residue data.

    Returns (name, seq, ss, sasa, structure): the concatenated polypeptide
    sequence, the string built from field 2 of every DSSP entry, the list
    of `residues[field 1] * field 3` for every DSSP entry, and the parsed
    structure. `residues` is a lookup defined outside this method.
    """
    structure = PDBParser().get_structure(name, pdb_file)
    dssp = DSSP(structure[0], pdb_file, acc_array="Wilke")

    ss = "".join(entry[2] for entry in dssp)
    sasa = []
    for entry in dssp:
        sasa.append(residues[entry[1]] * entry[3])

    seq = ""
    for peptide in PPBuilder().build_peptides(structure, aa_only=aa_only):
        seq += peptide.get_sequence()
    return name, seq, ss, sasa, structure
def get_sequence(self, chain_id):
    '''
        Input:
            self: Use Biopython.PDB structure which has been stored in an object variable
            chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
        Return:
            Return the amino acid sequence (single-letter alphabet!) of a given chain (chain_id)
            in a Biopython.PDB structure as a string. The sequences of all
            polypeptide fragments of the chain are concatenated; an empty
            string is returned when no polypeptide can be built.
    '''
    ppb = PPBuilder()
    # A Structure is indexed by model first; the chain lives in model 0.
    chain = self.structure[0][chain_id]
    return ''.join(str(pp.get_sequence()) for pp in ppb.build_peptides(chain))
def test_add_residue():
    """Grow a peptide one residue at a time, rebinding the structure to the
    value returned by add_residue, and check the resulting sequence and
    coordinates."""
    expected_sequence = "ACDEFGHIKLMNPQRSTVWY"

    structure = PeptideBuilder.initialize_res(expected_sequence[0])
    for aa in expected_sequence[1:]:
        structure = PeptideBuilder.add_residue(structure, aa)

    # extract peptide from structure and compare to expected
    peptides = PPBuilder().build_peptides(structure)
    first_peptide = next(iter(peptides))
    assert first_peptide.get_sequence() == "ACDEFGHIKLMNPQRSTVWY"

    # now compare to saved reference structure
    assert compare_to_reference(structure, "extended.pdb")
def __init__(self, path):
    '''
        Initialize every PDB_Parser with a path to a structure-file in CIF format.
        An example file is included in the repository (7ahl.cif).
        Tip: Store the parsed structure in an object variable instead of parsing it
        again & again ...

        Sets self.ppb (a PPBuilder), self.structure (the parsed structure) and
        self.chains (chain id -> chain; a later chain with the same id
        replaces an earlier one).
    '''
    # parser object for reading in structure in CIF format
    cif_parser: MMCIFParser = MMCIFParser()

    self.ppb = PPBuilder()
    self.structure = cif_parser.get_structure('structure', path)
    self.chains: Dict[str, Entity] = {
        chain.id: chain for chain in self.structure.get_chains()
    }
def test_add_residue():
    """
    Build a peptide containing all 20 amino acids
    """
    expected_sequence = "ACDEFGHIKLMNPQRSTVWY"

    structure = PeptideBuilder.initialize_res(expected_sequence[0])
    # The return value of add_residue is not used here: the test relies on
    # the structure object itself being extended.
    for aa in expected_sequence[1:]:
        PeptideBuilder.add_residue(structure, aa)

    # extract peptide from structure and compare to expected
    peptides = PPBuilder().build_peptides(structure)
    first_peptide = next(iter(peptides))
    assert first_peptide.get_sequence() == "ACDEFGHIKLMNPQRSTVWY"

    assert compare_to_reference(structure, "extended.pdb")
def write_seqs(filename):
    """Process a given structure file to extract sequences and save them into files.

    The file is parsed as PDB first and as mmCIF if that fails; when both
    fail an error is written to stderr and nothing is produced. Two FASTA
    files are written next to the input: <base>_chains.fasta (one record
    per chain of every model) and <base>_peptides.fasta (one record per
    polypeptide fragment).
    """
    struct_name = os.path.splitext(os.path.basename(filename))[0]
    is_CIF = False

    # `except Exception` instead of a bare except, so that Ctrl-C and
    # SystemExit are not mistaken for a parse failure.
    try:
        parser = PDBParser()
        structure = parser.get_structure(struct_name, filename)
    except Exception:
        try:
            parser = MMCIFParser()
            structure = parser.get_structure(struct_name, filename)
            is_CIF = True
        except Exception:
            sys.stderr.write("ERROR: File {} is not a proper/supported protein structure file.\n".format(filename))
            return

    if is_CIF:
        name, organism, resolution = get_info_from_cif_dict(filename)
    else:
        name, organism, resolution = get_info_from_header(structure)

    description = "| {} | {} | Resolution {:.2f} A".format(name, organism, resolution)

    ppb = PPBuilder()
    chain_seqrecord_list = []
    peptide_seqrecord_list = []

    for model in structure:
        for chain in model:
            base_id = "{}.{:d}_{}".format(struct_name, model.id, chain.id)
            chain_seq = Seq("")
            for (pp_id, pp) in enumerate(ppb.build_peptides(chain)):
                pp_seq = pp.get_sequence()
                chain_seq += pp_seq
                peptide_seqrecord_list.append(SeqRecord(pp_seq,
                    id = "{}.{:d}".format(base_id, pp_id),
                    description = description))
            chain_seqrecord_list.append(SeqRecord(chain_seq,
                id = base_id,
                description = description))

    base_output_name = os.path.splitext(filename)[0]
    SeqIO.write(chain_seqrecord_list, "{}_chains.fasta".format(base_output_name), "fasta")
    SeqIO.write(peptide_seqrecord_list, "{}_peptides.fasta".format(base_output_name), "fasta")
def pdb_extract_chain_seqs(struct):
    """
    Takes a Bio.PDB.Structure object, returns a dictionary of chains and
    sequences (eg {'A':'MTSSLGRF', 'B':'MSLQRGFIN'}
    (for structures with multiple models, like NMR structures, only the
    first model is considered).

    The sequences of all polypeptide fragments of a chain are concatenated.
    Chains from which no polypeptide can be built are left out.
    """
    chains = {}
    ppb = PPBuilder()
    for chain in struct[0]:  # takes model 1
        # Collect every fragment: assigning inside the fragment loop kept
        # only the last fragment of a chain with gaps.
        fragments = [str(pp.get_sequence())
                     for pp in ppb.build_peptides(chain, aa_only=False)]
        if fragments:
            chains[chain.id] = "".join(fragments)
    return chains
def getSequence():
    """Interactively print the upper-case sequence of one chain.

    Reads the module-level ``pdb_name``, asks for a chain id, detaches every
    other chain from every model and prints the concatenated sequence of
    what remains.
    """
    parser = PDBParser()
    nameStruct = pdb_name.partition('.')[0]
    structure = parser.get_structure(nameStruct, pdb_name)

    what_chain = raw_input('For what chain do you want the sequence : ')
    for model in structure:
        # Iterate over a snapshot: detaching from the model while looping
        # over it skips the chain that follows each removed one, which left
        # unwanted chains in the output.
        for chain in list(model):
            if chain.id != what_chain:
                model.detach_child(chain.id)

    ppb = PPBuilder()
    seq = ''.join(str(pp.get_sequence())
                  for pp in ppb.build_peptides(structure)).upper()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(seq)
def getPDBInfo(pdb, pdbpath):
    """Extract information from a renumbered PDB file using Biopython.

    The chain id is taken from the file-name stem of `pdb`: the second
    "_"-separated field when there is one, otherwise the fifth character.

    :param pdb: PDB name; the file read is <pdbpath>/<pdb>_ren.pdb
    :param pdbpath: directory that holds the renumbered PDB files
    :return: (structure, list of residues of the chain in model 0,
              sequence of that chain including non-standard residues)
    :raises IOError: if the renumbered PDB file does not exist
    """
    # Biopython parsers
    parser = BP.PDBParser()
    ppb = PPBuilder()
    pdbseq = ""

    # PDB descriptors
    name_fields = pdb.split("/")[-1].split(".")[0].split("_")
    if len(name_fields) > 1:
        chain = name_fields[1]
    else:
        # no "_" delimiters in pdb name
        chain = name_fields[0][4]

    pdb_file = os.path.join(pdbpath, pdb + "_ren.pdb")
    # Fail with a clear message; falling through used to end in an
    # UnboundLocalError at the return statement.
    if not os.path.exists(pdb_file):
        raise IOError("renumbered PDB file not found: %s" % pdb_file)

    structure = parser.get_structure(pdb, pdb_file)
    residue_list = BP.Selection.unfold_entities(structure[0][chain], 'R')
    # the build_peptides method has an option for treating non-standard amino acids
    for pp in ppb.build_peptides(structure[0][chain], aa_only=False):
        pdbseq += (pp.get_sequence())
    return structure, residue_list, pdbseq
def create_sequence_from_file(chain_pdb, missing_residues, quiet_parser=False):
    """
    Read a PDB file and creates a sequence and mismask to represent
    its content.

    One row is produced for every residue of every polypeptide except the
    first and the last one. A data row holds the phi/psi angles in columns
    1-2, the amino acid index in column 3, the secondary structure index in
    column 4 and the peptide bond conformation in column 5.

    @param chain_pdb: The PDB file to be read.
    @type chain_pdb: str

    @param missing_residues: A dictionary with the missing residues.
    @type missing_residues: dict

    @param quiet_parser: Disable PDBParser warnings.
    @type quiet_parser: bool

    @rtype: tuple(list, list)
    @return: A list with the data array and a list with the mismask array.

    @raise TorusDBNBuildPolypeptideException: no polypeptide could be built.
    @raise TorusDBNException: no data row could be produced.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost -- in particular the placement of the final
    # "if output_data" check after the polypeptide loop should be confirmed.
    sequences = []
    mismasks = []

    output_data = []
    output_mismask = []

    parser = PDBParser(QUIET=quiet_parser)
    structure = parser.get_structure("X", chain_pdb)

    # The result is not used directly; residue.xtra["SS_DSSP"] is read
    # further down.
    dssp = DSSP(model=structure[0], pdb_file=chain_pdb)

    # Loop over residues in peptides
    ppb = PPBuilder()
    pp_list = ppb.build_peptides(structure[0])
    chain_list = structure[0].get_list()

    if len(pp_list) == 0:
        raise TorusDBNBuildPolypeptideException(
            "Could not create a list of Polypeptide objects from the file %s."
            % (chain_pdb)
        )
    else:
        pp_chains, chain_missing_residues = _get_pp_with_chain_break(
            chain_pdb, pp_list, chain_list, missing_residues)

        for pp_index, pp in enumerate(pp_chains):
            phi_psi_list = pp.get_phi_psi_list()
            missing_residues = chain_missing_residues[pp_index]

            # range() instead of the Python 2-only xrange().
            for i in range(1, len(phi_psi_list) - 1):
                seq = [0] * 6
                mism = [eMISMASK.MOCAPY_HIDDEN] + 4 * [eMISMASK.MOCAPY_OBSERVED]

                # Amino acid
                res = pp[i]
                res_name = res.get_resname()
                res_index = res.get_id()[1]

                aa_index = three_to_index(res_name)

                if res_index in missing_residues:
                    seq[3] = aa_index
                    mism[1] = eMISMASK.MOCAPY_MISSING  # angles unknown
                    mism[3] = eMISMASK.MOCAPY_MISSING  # ss unknown
                    mism[4] = eMISMASK.MOCAPY_MISSING  # cis unknown
                else:
                    seq[3] = aa_index

                    # Secondary Structure
                    try:
                        ss = res.xtra["SS_DSSP"]
                        ss_index = dssp_to_index(ss)
                        seq[4] = ss_index
                    except Exception:
                        # Narrowed from a bare except so that Ctrl-C and
                        # SystemExit are not swallowed here.
                        mism[3] = eMISMASK.MOCAPY_MISSING  # ss unknown

                    # Angles
                    if None in phi_psi_list[i]:
                        # Previous or next residue missing, therefore angles are
                        # Unknown
                        mism[1] = eMISMASK.MOCAPY_MISSING
                    else:
                        seq[1:3] = phi_psi_list[i]

                    # Cis/Trans information
                    if (res_index - 1) in missing_residues:
                        mism[4] = eMISMASK.MOCAPY_MISSING  # cis unknown
                    else:
                        try:
                            seq[5] = _get_peptide_bond_conformation(res, pp[i-1])
                        except TorusDBNException:
                            mism[4] = eMISMASK.MOCAPY_MISSING  # cis unknown

                output_data.append(seq)
                output_mismask.append(mism)

        if output_data and output_mismask:
            sequences.append(numpy.array(output_data))
            mismasks.append(numpy.array(output_mismask, dtype = numpy.uint))
        else:
            raise TorusDBNException(
                "Could not create training data from the file %s."
                % (chain_pdb)
            )

    return sequences, mismasks
def create_residue_lists(pdb, chain):
    """
    Build the position strings used for a Position Scan of one chain.

    The repaired structure
    ``../foldx_setup/repaired_pdbs/RepairPDB_<pdb>_<chain>.pdb`` is parsed
    and every residue of the requested chain is turned into the entry
    ``,<one letter code><chain><residue number>a``. The entries are
    concatenated into chunks, each chunk being terminated by ``;``. The
    first chunk ends at residue index 101 and every following chunk ends
    100 indices later (201, 301, ...); the last chunk ends with the last
    residue. There is no upper limit on the chain length.

    :param pdb: identifier used in the repaired PDB file name
    :param chain: chain identifier, used both in the file name and to pick
        the chain from the first model
    :return: list of chunk strings; an empty list if the chain (or the
        first model) cannot be found in the structure
    """
    print("%s %s" % (pdb, chain))

    pdb_path = ("../foldx_setup/repaired_pdbs/RepairPDB_" + pdb + "_"
                + chain + ".pdb")
    try:
        structure = PDB.PDBParser(QUIET=True).get_structure('X', pdb_path)
        ref_chain = structure[0][chain]
    except KeyError:
        # Without the chain there is nothing to scan; return the empty
        # result instead of continuing with undefined residue lists.
        print("Something is wrong with the mapped residues")
        return []

    ref_residues = [res.resname for res in ref_chain]
    ref_res_nums = [res.id[1] for res in ref_chain]

    seq_array = ddG_var_helper.convert_to_one_letter_code(ref_residues)

    res_lists = []
    chunk_entries = []
    last_index = len(seq_array) - 1
    for i, one_letter in enumerate(seq_array):
        chunk_entries.append(
            "," + one_letter + chain + str(ref_res_nums[i]) + "a")
        # Chunks close at indices 101, 201, 301, ... (so the first chunk
        # holds 102 residues and the later ones 100) or at the last residue.
        closes_chunk = i >= 101 and i % 100 == 1
        if closes_chunk or i == last_index:
            res_lists.append("".join(chunk_entries) + ";")
            chunk_entries = []

    return res_lists
# Parse the gzipped PDB entry to get the structural data. The handle is
# only needed while parsing, so it is closed right afterwards.
# NOTE(review): the id is sliced from `pdb_filename` while the file that is
# opened is `filename` - confirm both names refer to the same entry.
# NOTE(review): gzip.open() defaults to binary mode; confirm the parser in
# use accepts that handle on the targeted Python version.
pdb_parser = Bio.PDB.PDBParser()
pdb_id = pdb_filename[7:11]
with gzip.open(os.path.join(input_path, filename)) as pdb_file:
    structure = pdb_parser.get_structure(pdb_id, pdb_file)
print(structure)

# Extract sequence from coordinate information.
# Amino acid residues present in SEQRES but that don't have coordinate
# information are listed in REMARK 465.
ppb = PPBuilder()  # PPBuilder uses C--N distance to find polypeptides.
for pp in ppb.build_peptides(structure):
    sequence = pp.get_sequence()
    print("This is the polypeptide sequence of %s" % pdb_id)
    print(sequence)
    print("length of sequence:  %d" % len(sequence))

# OR extract sequence from SEQRES - ?
# When I aligned the sequence output of 1A27 here and the fasta sequence,
# I see 4 residues missing. These 4 residues are reported as
# REMARK 465 MISSING RESIDUES (THE FOLLOWING RESIDUES WERE NOT LOCATED
# IN THE EXPERIMENT).
# Does this mean unresolved or missing in protein construct?
def get_sequence(self):
    """
    Return the sequence of the first polypeptide of the selected chain.

    The chain ``self.chain`` of the first model of ``self.structure`` is
    handed to a PPBuilder; only the first polypeptide it builds is used.
    """
    builder = PPBuilder()
    selected_chain = self.structure[0][self.chain]
    fragments = builder.build_peptides(selected_chain)
    first_fragment = fragments[0]
    return first_fragment.get_sequence()
'''
Extract the protein sequence from a PDB chain.

-----------------------------------------------------------
(c) 2013 Allegra Via and Kristian Rother
    Licensed under the conditions of the Python License

    This code appears in section 21.4.2 of the book
    "Managing Biological Data with Python".
-----------------------------------------------------------
'''

from Bio import PDB
from Bio.PDB.Polypeptide import PPBuilder

# Parse the structure file and build its polypeptides.
parser = PDB.PDBParser()
structure = parser.get_structure("2DN1", "dn/pdb2dn1.ent")
ppb = PPBuilder()
peptides = ppb.build_peptides(structure)

# Print one sequence per polypeptide. The single-argument call form of
# print produces the same output on Python 2 and Python 3.
for pep in peptides:
    print(pep.get_sequence())
#####
# Extract chains with Chimera: open the structure, select chains A-H and
# write the selection of model #0 to xen_nucl.pdb.
nucl_xen = openModels.open('1kx5.pdb', type='PDB')
# nucl_yeast=openModels.open('1id3.pdb',type='PDB')
# h2a_xen=Seq(str(nucl[0].sequence('C')))
# h2a_yeast=Seq(str(nuclZ[0].sequence('C')))

rc('select :.A :.B :.C :.D :.E :.F :.G :.H')
rc('write selected format pdb #0 xen_nucl.pdb')
# rc('write selected format pdb #1 h2a_yeast_xray.pdb')

# Generate alignments.
# Biopython extracts the sequences from the pdb file: for every chain the
# sequence of its first polypeptide is stored under the chain id.
p = PDBParser(PERMISSIVE=1)
s = p.get_structure('1kx5', '1kx5.pdb')
ppb = PPBuilder()

seqs_xen = {}
for i in 'ABCDEFGH':
    seqs_xen[i] = ppb.build_peptides(s[0][i])[0].get_sequence()

# Here we manually input the truncated versions of the yeast sequences.
seqs_yeast = {
    # CSE4 - H3
    #SSKQQWVSSAIQSDSSGRSLSNVNRLAGDQQSINDRALSLLQRTRATKNLFPRREERRRYESSKSDLDIETDYEDQAGNLEIETENEEEAEMETEVPAPVRTHSYALDRYVRQKRREKQRKQSLKR
    #VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI
    #VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI
    'A': Seq('VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI'),
    'E': Seq('VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI'),
    # H4
    'B': Seq('KRHRKILRDNIQGITKPAIRRLARRGGVKRISGLIYEEVRAVLKSFLESVIRDSVTYTEHAKRKTVTSLDVVYALKRQGRTLYGFGG'),
    'F': Seq('KRHRKILRDNIQGITKPAIRRLARRGGVKRISGLIYEEVRAVLKSFLESVIRDSVTYTEHAKRKTVTSLDVVYALKRQGRTLYGFGG'),
}
def __init__(self):
    """Initialise the instance by calling PPBuilder's constructor with no extra arguments."""
    PPBuilder.__init__(self)
def parse_structure(path):
    """
    Read a structure file with Biopython's PDB/mmCIF parser and clean it.

    Only the first model is kept, disordered atoms are replaced by their
    selected alternative, HETATM/solvent residues and hydrogens are
    removed, and a report is written to stderr when the number of
    polypeptides differs from the number of chains (gaps).

    Returns the tuple (structure, number of chains, number of residues).
    Raises IOError for an unsupported file extension, ValueError for a
    non-standard amino acid and Exception when the file cannot be parsed.
    """
    print('[+] Reading structure file: {0}'.format(path))

    # Everything before the last dot names the structure, the rest is the
    # extension that selects the parser.
    fname = os.path.basename(path)
    sname, _, s_ext = fname.rpartition('.')

    if s_ext not in {'pdb', 'ent', 'cif'}:
        raise IOError('[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'.format(s_ext))

    # 'pdb' and 'ent' share the PDB parser; only 'cif' needs the mmCIF one.
    sparser = MMCIFParser() if s_ext == 'cif' else PDBParser(QUIET=1)

    try:
        s = sparser.get_structure(sname, path)
    except Exception as e:
        print('[!] Structure \'{0}\' could not be parsed'.format(sname), file=sys.stderr)
        raise Exception(e)

    # Keep first model only
    if len(s) > 1:
        print('[!] Structure contains more than one model. Only the first one will be kept')
        first_model_id = s[0].id
        surplus_ids = [model.id for model in s.child_list if model.id != first_model_id]
        for model_id in surplus_ids:
            s.detach_child(model_id)

    # Double occupancy check: swap each disordered atom for its selected
    # alternative, flagged as an ordinary atom.
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        owner = atom.parent
        chosen = atom.selected_child
        chosen.altloc = ' '
        chosen.disordered_flag = 0
        owner.detach_child(atom.id)
        owner.add(chosen)

    # Remove HETATMs and solvent; anything left must be a standard residue
    for res in list(s.get_residues()):
        hetero_flag = res.id[0][0]
        if hetero_flag == 'W' or hetero_flag == 'H':
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))
    n_res = sum(1 for _ in s.get_residues())

    # Remove Hydrogens
    for atom in list(s.get_atoms()):
        if atom.element == 'H':
            atom.parent.detach_child(atom.name)

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        fragment_line = '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'
        for i_pp, pp in enumerate(peptides):
            print(fragment_line.format(i_pp, pp[0], pp[-1]), file=sys.stderr)
        #raise Exception('Calculation cannot proceed')

    return (s, n_chains, n_res)
def validate_structure(s, selection=None, clean=True):
    """
    Prepare a parsed structure in place for the calculation.

    Only the first model is kept, chains that are not selected are
    detached, and disordered atoms are replaced by their selected
    alternative. A warning is logged on the 'Prodigy' logger when the
    number of polypeptides differs from the number of chains that are
    still part of the structure (gaps).

    :param s: structure to validate; it is modified in place
    :param selection: optional iterable of strings, each a comma-separated
        list of chain ids to keep; when empty or None all chains are kept
    :param clean: when true, HETATM/solvent residues and hydrogens are
        removed and any remaining non-standard amino acid is rejected
    :return: the same structure object ``s``
    :raises ValueError: if a selected chain is not present in the
        structure, or (with ``clean``) a non-standard amino acid is found
    """
    # setup logging
    logger = logging.getLogger('Prodigy')

    # Keep first model only
    if len(s) > 1:
        logger.warning('[!] Structure contains more than one model. Only the first one will be kept')
        model_one = s[0].id
        for m in s.child_list[:]:
            if m.id != model_one:
                s.detach_child(m.id)

    # process selected chains
    chains = list(s.get_chains())
    chain_ids = {c.id for c in chains}

    if selection:
        sel_chains = set()
        # Match selected chain with structure
        for sel in selection:
            for c in sel.split(','):
                if c not in chain_ids:
                    raise ValueError('Selected chain not present in provided structure: {0}'.format(c))
                sel_chains.add(c)

        # Remove unselected chains
        for c in chains:
            if c.id not in sel_chains:
                c.parent.detach_child(c.id)

    # Double occupancy check
    for atom in list(s.get_atoms()):
        if atom.is_disordered():
            residue = atom.parent
            sel_at = atom.selected_child
            sel_at.altloc = ' '
            sel_at.disordered_flag = 0
            residue.detach_child(atom.id)
            residue.add(sel_at)

    if clean:
        # Remove HETATMs and solvent
        for res in list(s.get_residues()):
            if res.id[0][0] == 'W' or res.id[0][0] == 'H':
                chain = res.parent
                chain.detach_child(res.id)
            elif not is_aa(res, standard=True):
                raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))

        # Remove Hydrogens
        for atom in list(s.get_atoms()):
            if atom.element == 'H':
                residue = atom.parent
                residue.detach_child(atom.name)

    # Detect gaps and compare with no. of chains. The chains are counted
    # again here because unselected chains were detached above; comparing
    # against the ids collected before the selection would report gaps for
    # every structure from which a chain was dropped.
    pep_builder = PPBuilder()
    peptides = pep_builder.build_peptides(s)
    n_peptides = len(peptides)
    remaining_chain_ids = {c.id for c in s.get_chains()}

    if n_peptides != len(remaining_chain_ids):
        lines = ['[!] Structure contains gaps:\n']
        for i_pp, pp in enumerate(peptides):
            lines.append('\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > '
                         '{2.parent.id} {2.resname}{2.id[1]}\n'.format(i_pp, pp[0], pp[-1]))
        logger.warning(''.join(lines))
        # raise Exception(message)

    return s
parser = PDB.PDBParser() structure = parser.get_structure(myPDBfile, myPDBfile[1:3].lower() + "/pdb" + myPDBfile.lower() + ".ent") if myAnswer == "No": io = PDBIO() io.set_structure(structure) io.save(myPDBfile + ".pdb") except: print ("you structure is not available in the PDB") parser = PDB.PDBParser() structure = PDB.PDBParser().get_structure(myPDBfile, myPDBfile + ".pdb") chains = [chain for chain in structure.get_chains()] print (chains) ppb = PPBuilder() for chain in chains: print (chain) print (chain.get_id()) for chainseq in ppb.build_peptides(chain): print (chainseq.get_sequence()) mySeq = chainseq.get_sequence() myFastaA = ">" + myPDBfile + "_" + chain.get_id() + "\n" + chainseq.get_sequence() print (myFastaA) myFastaAfile = open(myPDBfile + "_" + chain.get_id() + ".fasta", "w") myFastaAfile.write(">" + myPDBfile + "_" + chain.get_id() + "\n" + str(mySeq)) myFastaAfile.close() myFastafiles = os.listdir(".") myFastaIter = iter(myFastafiles)