Example #1
0
    def _superimpose_structures(structure, superimpose_reference):
        """Superimpose a structure onto a reference structure.

        The fit is computed from the C-alpha atoms of the first polypeptide
        found in each structure and is then applied to every atom of
        ``structure``.

        @param structure: Structure to be superimposed (the moving one).
        @type structure: Structure

        @param superimpose_reference: Structure used as the fixed reference.
        @type superimpose_reference: Structure

        @rtype: Structure
        @return: The ``structure`` argument, after the fit has been applied.
        """
        builder = PPBuilder()
        # Only the first polypeptide of each structure takes part in the fit.
        reference_peptide = builder.build_peptides(superimpose_reference)[0]
        mobile_peptide = builder.build_peptides(structure)[0]

        # CA atoms drive the fit; all atoms of the moving structure follow.
        reference_cas = reference_peptide.get_ca_list()
        mobile_cas = mobile_peptide.get_ca_list()
        every_atom = Selection.unfold_entities(structure, "A")

        superimposer = Superimposer()
        superimposer.set_atoms(reference_cas, mobile_cas)
        superimposer.apply(every_atom)
        return structure
Example #2
0
    def get_contact_map(self, chain_id):
        '''
            Build the C-alpha distance map of one chain.

            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Return a complete contact map (see description in exercise sheet)
                for a given chain in a Biopython.PDB structure as numpy array.
                The values in the matrix describe the c-alpha distance between all residues
                in a chain of a Biopython.PDB structure.
                The distances are cast to integers, i.e. the fractional part
                is dropped.
        '''

        length = len(self.get_sequence(chain_id))
        ppb = PPBuilder()
        contact_map = np.zeros((length, length), dtype=np.float32)

        # NOTE(review): i and j restart at 0 for every polypeptide, so if the
        # chain is split into several fragments the later fragments overwrite
        # the entries of the earlier ones -- confirm chains are gap-free.
        for pp in ppb.build_peptides(self.structure[0][chain_id],
                                     aa_only=True):
            for i, residue_1 in enumerate(pp):
                for j, residue_2 in enumerate(pp):
                    contact_map[i, j] = residue_1['CA'] - residue_2['CA']

        # np.int was only an alias of the builtin int and was removed in
        # NumPy 1.24; the cast truncates, it does not round.
        return contact_map.astype(int)
Example #3
0
    def __init__(self,
                 prefix,
                 computedFeatsRootDir,
                 areForTrainAndTest=True,
                 boundAvailable=True,
                 res2res_dist=6.0,
                 statusManager=None):
        '''
        Store the configuration and register the output paths.

        :param prefix: str. An id for the complex
        :param computedFeatsRootDir: str. path where features will be stored
        :param areForTrainAndTest: boolean. True if ligand and receptor are posed in interacting
                                  coordinates and thus, we know the label. False if they are for
                                  prediction and thus, we cannot assign labels.
        :param boundAvailable: boolean. True if there is a bound and unbound pdb for each complex.
                               False otherwise
        :param res2res_dist: float. max distance between any heavy atoms of 2 amino acids to be
                             considered as interacting (Angstrom)
        :param statusManager: class that implements .setStatus(msg) to communicate
        :raises AssertionError: if boundAvailable is True while areForTrainAndTest is False
        '''
        ToolManager.__init__(self,
                             computedFeatsRootDir,
                             statusManager=statusManager)
        self.prefix = prefix
        self.res2res_dist = res2res_dist
        self.boundAvailable = boundAvailable
        self.areForTrainAndTest = areForTrainAndTest

        # The single rejected combination: bound data without known labels.
        assert boundAvailable != True or areForTrainAndTest != False, (
            "Error parameters in CMap: boundAvailable==True and areForTrainAndTest==False"
        )

        # radius set to 200 to not worry about broken chains
        self.ppb = PPBuilder(radius=200)

        # Everything lives below "<root>/common".
        common_dir = myMakeDir(computedFeatsRootDir, "common")
        self.outPath = common_dir
        self.outPathCM = myMakeDir(common_dir, "contactMaps")
        self.outPathResDict = myMakeDir(common_dir, "includedResidues")
        self.outPathNeigs = myMakeDir(common_dir, "voroNeigs")
Example #4
0
    def _map(self, model):
        """Map (PRIVATE).

        Builds the polypeptides of the model, cuts each one into fragments
        of length ``self.flength``, classifies them against ``self.reflist``
        and assigns the classified fragment to every residue that is at
        least ``self.edge`` positions away from both ends of its polypeptide.

        :param model: the model that will be mapped
        :type model: L{Model}
        :return: dictionary mapping each covered residue to its fragment
        :raises PDBException: any PDBException whose message is not
            'CHAINBREAK'
        """
        ppb = PPBuilder()
        fd = {}
        for pp in ppb.build_peptides(model):
            try:
                # make fragments
                flist = _make_fragment_list(pp, self.flength)
                # classify fragments
                mflist = _map_fragment_list(flist, self.reflist)
                upper = len(pp) - self.edge
                for i, res in enumerate(pp):
                    # start and end residues have no fragment of their own
                    if self.edge <= i < upper:
                        fd[res] = mflist[i - self.edge]
            except PDBException as why:
                # An exception instance never compares equal to a str, so
                # the message text has to be compared explicitly.
                if str(why) != 'CHAINBREAK':
                    raise
                # Funny polypeptide - skip
        return fd
Example #5
0
def match_pdb_residue_num_to_seq(model, ref=None):
    """Map reference sequence positions to PDB residue identifiers.

    Every polypeptide built from the parent structure is aligned to the
    reference sequence of its chain; each aligned reference position is
    then linked to the identifier of the matching PDB residue.

    Reference sequence is 1-indexed (and is indexed as such in output).

    Args:
        model: A biostructmap Model object.
        ref (dict): A dictionary containing reference protein sequences for each
            chain in the protein structure. Defaults to the protein sequences
            given in PDB file.
    Returns:
        dict: A dictionary mapping reference sequence index (key) to
            residue numbering as given in the PDB file (value). For example,
            we might have a key of ('A', 17) for the 17th residue in the
            reference sequence for chain 'A', with a value of
            ('A', (' ', 273, ' ')) that represents the Bio.PDB identifier for
            the corresponding residue.
    """
    fragments = PPBuilder().build_peptides(model.parent().structure)
    reference = model.parent().sequences if ref is None else ref

    mapping = {}
    for fragment in fragments:
        # Presume that peptide belongs to a single chain
        chain_id = fragment[0].get_full_id()[2]
        _, ref_to_pdb = align_protein_sequences(fragment.get_sequence(),
                                                reference[chain_id])
        # Alignment positions are 1-based, the fragment is indexed from 0.
        mapping.update(
            ((chain_id, ref_index), fragment[pdb_index - 1].get_full_id()[2:4])
            for ref_index, pdb_index in ref_to_pdb.items())
    return mapping
Example #6
0
def deleteChain():
    """Interactively delete chains from a PDB file and save the result.

    Reads the file named by the module-level ``pdb_name``, asks how many
    chains to remove and which ones, detaches them from every model and
    writes the remaining structure. The output file is named either
    ``<lower-case sequence>_bound.pdb`` (when the user answers 'y') or
    ``<name>_without<last chain entered>.pdb``.

    Uses ``raw_input`` and is therefore written for Python 2.

    Raises:
        ValueError: if the number of chains typed by the user is not an
            integer.
    """
    parser = PDBParser()
    nameStruct = pdb_name.partition('.')[0]
    structure = parser.get_structure(nameStruct, pdb_name)

    # Python 2 input() evaluates whatever is typed as code; read text and
    # convert it explicitly instead.
    nb_chain = int(raw_input('How many chain do you want to delete : '))

    # Stays empty when no chain is requested so that the file name below is
    # always defined.
    rm_chain = ''
    for _ in range(nb_chain):
        rm_chain = raw_input('What chain you want to delete : ')
        for model in structure:
            # Look the chain up by id rather than detaching children while
            # iterating over them.
            if model.has_id(rm_chain):
                model.detach_child(rm_chain)

    pept = raw_input('Do you want to get a pdb with the sequence in its name : ')
    if pept == 'y':
        ppb = PPBuilder()
        seq = ''.join(str(pp.get_sequence())
                      for pp in ppb.build_peptides(structure)).lower()
        out_name = seq + '_bound.pdb'
    else:
        out_name = nameStruct + '_without' + rm_chain + '.pdb'

    w = PDBIO()
    w.set_structure(structure)
    w.save(out_name)
Example #7
0
    def _get_proteins_by_structure(self, pdb_structure, model, file_path):
        """
            _get_proteins_by_structure: collect one protein record per chain of a pdb_structure

            Each truthy chain yields a dict holding the base name of file_path ('id'),
            the given model ('model_id'), the chain id, the concatenated sequences of the
            chain's polypeptides and the MD5 hex digest of that sequence.
        """
        builder = PPBuilder()
        records = []

        for chain in pdb_structure.get_chains():
            if not chain:
                continue
            # A chain may be split into several polypeptides; glue them together.
            chain_seq = ''.join(
                str(peptide.get_sequence())
                for peptide in builder.build_peptides(chain))
            records.append({
                'id': os.path.basename(file_path),
                'model_id': model,
                'chain_id': chain.get_id(),
                'sequence': chain_seq,
                'md5': hashlib.md5(chain_seq.encode()).hexdigest()
            })

        return records
 def read_structure_seqs(self, strucm):
     """Extract the sequences found in the structure, per model and chain.

     For every polypeptide fragment of every chain a SeqRecord is built
     with one feature spanning the first and last residue numbers of the
     fragment. The records are stored in
     self.data[chain id]['pdb'][model id]; the flag
     self.data[chain id]['pdb']['wrong_order'] tells whether any fragment of
     the chain had a first residue number not smaller than its last one.
     """
     builder = PPBuilder()
     for model in strucm.st:
         for chain in model.get_chains():
             chain_id = chain.id
             records = []
             reversed_numbering = False
             for fragment in builder.build_peptides(chain):
                 first = fragment[0].get_id()[1]
                 last = fragment[-1].get_id()[1]
                 label = '{}:{}-{}'.format(chain_id, first, last)
                 record = SeqRecord(fragment.get_sequence(),
                                    'pdbsq_' + label,
                                    'pdbsq_' + label,
                                    'PDB sequence chain ' + label)
                 if first >= last:
                     print("Warning: unusual residue numbering at chain ",
                           chain_id)
                     print(
                         "Warning: chain reconstruction may not be available"
                     )
                     reversed_numbering = True
                 # The location always runs from the smaller to the larger number.
                 span = FeatureLocation(min(first, last), max(first, last))
                 record.features.append(SeqFeature(span))
                 records.append(record)
             if ch_id_missing(self.data, chain_id):
                 self.add_empty_chain(chain_id)
             self.data[chain_id]['pdb'][model.id] = records
             self.data[chain_id]['pdb']['wrong_order'] = reversed_numbering
Example #9
0
 def _map(self, model):
     """
     Map the residues of a model to classified fragments.

     Builds the polypeptides of the model, cuts each one into fragments of
     length ``self.flength``, classifies them against ``self.reflist`` and
     assigns the classified fragment to every residue that is at least
     ``self.edge`` positions away from both ends of its polypeptide.

     @param model: the model that will be mapped
     @type model: L{Model}

     @return: dictionary mapping each covered residue to its fragment
     @raise PDBException: any PDBException whose message is not 'CHAINBREAK'
     """
     ppb = PPBuilder()
     fd = {}
     for pp in ppb.build_peptides(model):
         try:
             # make fragments
             flist = _make_fragment_list(pp, self.flength)
             # classify fragments
             mflist = _map_fragment_list(flist, self.reflist)
             upper = len(pp) - self.edge
             for i, res in enumerate(pp):
                 # start and end residues have no fragment of their own
                 if self.edge <= i < upper:
                     fd[res] = mflist[i - self.edge]
         except PDBException as why:
             # "except X, name" only parses on Python 2; "as" works on
             # 2.6+ and 3. An exception instance never equals a str, so the
             # message text is compared explicitly.
             if str(why) != 'CHAINBREAK':
                 raise
             # Funny polypeptide - skip
     # The mapping was built but never handed back to the caller.
     return fd
def main():
    """Write the amino-acid sequence of a PDB file as a one-record FASTA.

    Command line options:
        -p / --pdb        path to the PDB file to read
        -f / --pdb_fasta  path of the FASTA file to write

    The sequence is the concatenation of all polypeptides of all chains of
    the first model. The header and sequence are also printed to stdout.
    """
    opt_parser = optparse.OptionParser()
    opt_parser.add_option("-p", "--pdb", dest="pdb",
                          help="path to PDB file", metavar="STRING")
    opt_parser.add_option("-f", "--pdb_fasta", dest="pdb_fasta",
                          help="path to PDB fasta file (out)", metavar="STRING")

    (options, args) = opt_parser.parse_args()
    pdb_fasta = options.pdb_fasta
    pdb_file = options.pdb
    # Without this check a missing option only surfaces later as an error
    # on None.
    if not pdb_file or not pdb_fasta:
        opt_parser.error("both --pdb and --pdb_fasta are required")

    pdb_name = os.path.basename(pdb_file).split(".")[0]

    # Separate name: the option parser must not be shadowed by the PDB parser.
    pdb_parser = BP.PDBParser()
    ppb = PPBuilder(radius=1000)  # retrieve all amino acids
    model = pdb_parser.get_structure(pdb_name, pdb_file)[0]
    pdbseq = "".join(
        str(pp.get_sequence())
        for chain in model
        for pp in ppb.build_peptides(chain, aa_only=False))

    # Single pre-formatted arguments print the same text on Python 2 and 3.
    print("> %s %i" % (pdb_name, len(pdbseq)))
    print(pdbseq)

    with open(pdb_fasta, "w") as o:
        o.write(">%s %i\n%s\n" % (pdb_name, len(pdbseq), pdbseq))
Example #11
0
def fetch_protein(pdb_id: str) -> Tuple[List[str], np.ndarray]:
    """Download a PDB entry and return its sequence and backbone angles.

    The entry is downloaded from files.rcsb.org into the current working
    directory, parsed, and the file is removed again, also when parsing
    fails. Only the first polypeptide of the structure is used.

    Args:
        pdb_id: Identifier of the entry; ``<pdb_id>.pdb`` is requested.

    Returns:
        A tuple ``(aa_sequence, phi_psi_angles)``: the sequence as a list of
        single characters, and an array holding the phi angles in its first
        row and the psi angles in its second row, in degrees. Angles that
        are not defined (``None``) are reported as 180.

    Raises:
        requests.HTTPError: if the download does not succeed.
        IndexError: if no polypeptide can be built from the file.
    """
    # retrieve pdb file from Protein Data Bank
    pdb_file = f"{pdb_id}.pdb"
    pdb_file_path = os.path.join(os.getcwd(), pdb_file)
    protein_url = f"https://files.rcsb.org/download/{pdb_file}"
    req = requests.get(protein_url)
    # Do not write an error page to disk and try to parse it as a structure.
    req.raise_for_status()
    with open(pdb_file_path, "w") as f:
        f.write(req.text)

    try:
        # parse pdb file
        structure = PDBParser().get_structure(pdb_id, pdb_file_path)
        peptide = PPBuilder().build_peptides(structure)[0]

        # extract amino acid sequence and phi/psi angles
        aa_sequence = list(peptide.get_sequence())
        # "is None" rather than truthiness: an angle of exactly 0.0 is a
        # valid value and must not be replaced by the placeholder.
        phi_psi_angles = np.array([
            (180 if phi is None else np.rad2deg(phi),
             180 if psi is None else np.rad2deg(psi))
            for phi, psi in peptide.get_phi_psi_list()
        ]).T
    finally:
        # remove pdb file (portable, and also runs when parsing fails)
        os.remove(pdb_file_path)

    return aa_sequence, phi_psi_angles
Example #12
0
def get_chain_position(input_file, global_index):
    """Translate a global residue index into a chain id and a position.

    The polypeptides built from ``input_file`` are laid end to end;
    ``global_index`` is a 0-based index into that concatenation.

    Returns a tuple ``(chain_id, position)`` where ``position`` is the
    1-based position inside the polypeptide that contains the index, or
    ``(None, -1)`` when the index falls outside the concatenation.
    """
    structure_name = os.path.splitext(input_file)[0].replace('./', '')
    structure = PDBParser().get_structure(structure_name, input_file)
    fragments = [
        EPolyPeptide(pp)
        for pp in PPBuilder().build_peptides(structure, aa_only=False)
    ]

    if global_index >= sum(len(fragment) for fragment in fragments):
        return None, -1

    # Work with a 1-based target from here on.
    target = int(global_index) + 1
    if target <= 0:
        return None, -1

    covered = 0
    for fragment in fragments:
        previous = covered
        covered += len(fragment)
        if target <= covered:
            return fragment.chain_id, target - previous
    return None, -1
Example #13
0
def getBoundResList(fname_bound, fname_unbound, listOfDictsChainToResId):
    """Translate unbound residue ids into the matching bound residue ids.

    Both PDB files are parsed, their polypeptides are handed to a
    BoundUnboundMapper, and every (chain, residue id) pair of every input
    dict is looked up through it. Pairs for which the mapper returns None
    are dropped. A chain id of "*" is looked up as " ".

    :param fname_bound: path of the bound PDB file
    :param fname_unbound: path of the unbound PDB file
    :param listOfDictsChainToResId: list of dicts {unbound chain id: residue ids}
    :return: list of dicts {bound chain id: [bound residue ids]}, one per
             input dict and in the same order
    """
    pdb_parser = PDBParser(QUIET=True)
    unbound = pdb_parser.get_structure(fname_unbound, fname_unbound)
    bound = pdb_parser.get_structure(fname_bound, fname_bound)

    builder = PPBuilder()
    unbound_peptides = builder.build_peptides(unbound, aa_only=False)
    bound_peptides = builder.build_peptides(bound, aa_only=False)
    mapper = BoundUnboundMapper(unbound_peptides, bound_peptides)
    mapper.build_correspondence()

    translated = []
    for chain_to_res in listOfDictsChainToResId:
        bound_by_chain = {}
        for chain_u, res_ids in chain_to_res.items():
            lookup_chain = " " if chain_u == "*" else chain_u
            for res_u in sorted(res_ids):
                hit = mapper.mapUnboundToBoundUsingId(lookup_chain, res_u)
                if hit is None:
                    continue
                chain_b, res_b = hit
                bound_by_chain.setdefault(chain_b, []).append(res_b)
        translated.append(bound_by_chain)
    return translated
Example #14
0
def deleteChain():
    """Interactively delete chains from a PDB file and save the result.

    Reads the file named by the module-level ``pdb_name``, asks how many
    chains to remove and which ones, detaches them from every model and
    writes the remaining structure. The output file is named either
    ``<lower-case sequence>_bound.pdb`` (when the user answers 'y') or
    ``<name>_without<last chain entered>.pdb``.

    Uses ``raw_input`` and is therefore written for Python 2.

    Raises:
        ValueError: if the number of chains typed by the user is not an
            integer.
    """
    parser = PDBParser()
    nameStruct = pdb_name.partition('.')[0]
    structure = parser.get_structure(nameStruct, pdb_name)

    # Python 2 input() evaluates whatever is typed as code; read text and
    # convert it explicitly instead.
    nb_chain = int(raw_input('How many chain do you want to delete : '))

    # Stays empty when no chain is requested so that the file name below is
    # always defined.
    rm_chain = ''
    for _ in range(nb_chain):
        rm_chain = raw_input('What chain you want to delete : ')
        for model in structure:
            # Look the chain up by id rather than detaching children while
            # iterating over them.
            if model.has_id(rm_chain):
                model.detach_child(rm_chain)

    pept = raw_input('Do you want to get a pdb with the sequence in its name : ')
    if pept == 'y':
        ppb = PPBuilder()
        seq = ''.join(str(pp.get_sequence())
                      for pp in ppb.build_peptides(structure)).lower()
        out_name = seq + '_bound.pdb'
    else:
        out_name = nameStruct + '_without' + rm_chain + '.pdb'

    w = PDBIO()
    w.set_structure(structure)
    w.save(out_name)
Example #15
0
def doChainAlignments(pdbID, structure, consensusFile, chainSequencesDir,
                      verbose):
    """Align the sequence of every chain with the consensus sequence.

    For each chain of ``structure`` the sequences of its polypeptides are
    concatenated and written to ``<chainSequencesDir>/<pdbID>_<chain>.fasta``;
    the external ``needle`` program is then run on that file and
    ``consensusFile``. The process exits as soon as a chain yields an empty
    sequence.

    :param pdbID: identifier used as prefix of the per-chain file names
    :param structure: object providing get_chains()
    :param consensusFile: path passed to needle as -bsequence
    :param chainSequencesDir: directory for the FASTA files and needle output
    :param verbose: when true, print each chain id and its sequence
    """
    print('Pairwise alignment of chain sequences with consensus...')
    all_chains = list(structure.get_chains())
    builder = PPBuilder()
    for chain in all_chains:
        chain_label = chain.get_id()
        sequence = ''.join(
            str(pp.get_sequence()) for pp in builder.build_peptides(chain))
        sequenceID = pdbID + '_' + chain_label
        fasta_path = os.path.join(chainSequencesDir, sequenceID + '.fasta')

        if not sequence:
            print("ERROR: Unable to get chain sequence from PDB file.")
            exit("Selected PDB-file does not contain protein structure.")

        with open(fasta_path, 'w') as handle:
            handle.write('>' + sequenceID + '\n' + sequence)

        if verbose:
            print('Chain ' + chain_label)
            print(sequence)

        needle_command = [
            'needle',
            '-asequence', fasta_path,
            '-bsequence', consensusFile,
            '-noendweight',
            '-endopen', '10.0',
            '-endextend', '0.5',
            '-brief',
            '-aformat', 'srspair',
            '-auto',
            '-aname_outfile', sequenceID,
            '-adirectory_outfile', chainSequencesDir,
        ]
        subprocess.check_call(needle_command)
    print('OK')
def main():
    """Write the amino-acid sequence of a PDB file as a one-record FASTA.

    Command line options:
        -p / --pdb        path to the PDB file to read
        -f / --pdb_fasta  path of the FASTA file to write

    The sequence is the concatenation of all polypeptides of all chains of
    the first model. The header and sequence are also printed to stdout.
    """
    opt_parser = optparse.OptionParser()
    opt_parser.add_option("-p",
                          "--pdb",
                          dest="pdb",
                          help="path to PDB file",
                          metavar="STRING")
    opt_parser.add_option("-f",
                          "--pdb_fasta",
                          dest="pdb_fasta",
                          help="path to PDB fasta file (out)",
                          metavar="STRING")

    (options, args) = opt_parser.parse_args()
    pdb_fasta = options.pdb_fasta
    pdb_file = options.pdb
    # Without this check a missing option only surfaces later as an error
    # on None.
    if not pdb_file or not pdb_fasta:
        opt_parser.error("both --pdb and --pdb_fasta are required")

    pdb_name = os.path.basename(pdb_file).split(".")[0]

    # Separate name: the option parser must not be shadowed by the PDB parser.
    pdb_parser = BP.PDBParser()
    ppb = PPBuilder(radius=1000)  # retrieve all amino acids
    model = pdb_parser.get_structure(pdb_name, pdb_file)[0]
    pdbseq = "".join(
        str(pp.get_sequence())
        for chain in model
        for pp in ppb.build_peptides(chain, aa_only=False))

    # Single pre-formatted arguments print the same text on Python 2 and 3.
    print("> %s %i" % (pdb_name, len(pdbseq)))
    print(pdbseq)

    with open(pdb_fasta, "w") as o:
        o.write(">%s %i\n%s\n" % (pdb_name, len(pdbseq), pdbseq))
Example #17
0
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor, conn: sqlite3.Connection, dir_name: str):
    """
    Download PDB entries, summarise them and store one row per entry.

    Every id of ``filelist`` is fetched with ``PDBList`` into
    ``<dir_name>/<id>``, parsed, and a row built from the header and the
    polypeptide sequences is inserted into the ``Structures`` table. Ids
    already listed in ``status_tmp.txt`` are skipped; that file is reset at
    the start and removed at the end.

    :param filelist: PDB ids to process
    :param q: queue that receives every stored id and a final ``None``
    :param lock: lock held while the row is inserted and committed
    :param cursor: cursor used for the INSERT
    :param conn: connection committed after each successful INSERT
    :param dir_name: directory below which each entry gets its own folder
    """
    status_path = 'status_tmp.txt'
    with open(status_path, 'w') as f:
        f.write('')

    # Placeholders instead of str.format: header text may contain quotes,
    # and a missing resolution cannot be rendered with "{:.2f}".
    insert_sql = (
        "INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, RESOLUTION, "
        "NCOMP, NCHAIN, NRES, MMASS, EC) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

    for pdb_id in filelist:
        # Lines of the status file end with a newline; strip them so that an
        # id can actually match, and close the file after reading.
        with open(status_path) as f:
            finished = {line.strip() for line in f}
        if pdb_id in finished:
            continue

        entry_dir = os.path.join(dir_name, pdb_id)
        entry_path = os.path.join(entry_dir, 'pdb{:s}.ent'.format(pdb_id))
        pdbl = PDBList()
        pdbl.retrieve_pdb_file(pdb_id, pdir=entry_dir, file_format='pdb')
        if not os.path.exists(entry_path):
            print("File with ID PDB: {:s} not found!".format(pdb_id))
            continue

        parser = PDBParser()
        # The structure id used to be the unformatted literal '{:s}'.
        structure = parser.get_structure(pdb_id, entry_path)
        name = parser.header.get('name', '')
        head = parser.header.get('head', '')
        method = parser.header.get('structure_method', '')
        # None is stored as NULL when the header carries no resolution.
        res = parser.header.get('resolution')

        ncomp = 0
        nchain = 0
        eclist = []
        for values in parser.header['compound'].values():
            ncomp += 1
            nchain += len(values['chain'].split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)

        nres = 0
        mmass = 0
        ppb = PPBuilder()
        for pp in ppb.build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            mmass += int(ProteinAnalysis(str(seq)).molecular_weight())

        try:
            with lock:
                try:
                    cursor.execute(insert_sql,
                                   (pdb_id, name, head, method, res, ncomp,
                                    nchain, nres, mmass, ec))
                except sqlite3.DatabaseError as err:
                    print("Error: ", err)
                else:
                    print("Download Done for ID PDB: {:s}".format(pdb_id))
                    conn.commit()
                    q.put(pdb_id)
        finally:
            # Recorded whether or not the insert succeeded, after the lock
            # has been released.
            with open(status_path, 'at') as f:
                f.write(pdb_id + '\n')

    os.remove(status_path)
    q.put(None)
Example #18
0
def generate_seq_file(score_file, save_file):
    """Write the mutated chain sequence for every mutation of a score file.

    The first column of ``./dataFile/<score_file>`` (tab separated) holds
    names of the form ``<pdb id>_<chain id>_<wt aa><position><mutant aa>``
    with three-letter amino-acid codes. For each distinct name the chain
    sequence is rebuilt from the PDB file (downloaded into
    ``./dataFile/PDB_dl`` when missing), the residue at the given 1-based
    position is replaced, and ``<name>\\t<mutated sequence>`` is written to
    ``save_file``.

    NOTE(review): the position is used as an index into the concatenated
    polypeptide sequences -- presumably this assumes numbering from 1
    without gaps; verify against the data.

    :raises AssertionError: if the residue found at the position is not the
        stated wild-type amino acid
    """
    score_path = './dataFile/' + score_file
    table = pd.read_csv(score_path, sep='\t')

    # Group the distinct mutations by PDB id, keeping first-seen order.
    mutations_by_pdb = {}
    seen_names = set()
    for name in table.iloc[:, 0]:
        fields = name.split('_')
        pdb_id, chain_id, change = fields[0], fields[1], fields[2]
        entry = {
            'chain_id': chain_id,
            'wt_aa': change[0:3],
            'mu_aa': change[-3:],
            'mu_pos': int(''.join(ch for ch in change if ch.isdigit())),
            'name': name,
        }
        if name in seen_names:
            continue
        seen_names.add(name)
        mutations_by_pdb.setdefault(pdb_id, []).append(entry)

    parser = PDBParser()
    seq_builder = PPBuilder()
    pdb_dl_handle = PDBList()
    PDB_DIR = './dataFile/PDB_dl'

    mutated_sequences = {}
    for pdb_id, mutations in mutations_by_pdb.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        # check if pdb file exists
        if not os.path.exists(pdb_file):
            pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_id, file_format='pdb', overwrite=False, pdir=PDB_DIR)
        model = parser.get_structure(pdb_id, pdb_file)[0]

        for mutation in mutations:
            chain = model[mutation['chain_id']]
            chain_seq = "".join(
                str(pp.get_sequence())
                for pp in seq_builder.build_peptides(chain))
            chain_seq = chain_seq.replace('\n', '').replace(' ', '')

            index = mutation['mu_pos'] - 1
            assert chain_seq[index] == three_to_one(mutation['wt_aa']), 'Wt amino acid failed to match'

            letters = list(chain_seq)
            letters[index] = three_to_one(mutation['mu_aa'])
            mutated_sequences[mutation['name']] = ''.join(letters)

    with open(save_file, 'w') as output_hl:
        for name, sequence in mutated_sequences.items():
            output_hl.write(name + '\t' + sequence + '\n')
 def real_seq():
     """Return the concatenated polypeptide sequences of ``protein``.

     The structure is read from ``<protein.protein_id>.pdb``; ``protein``
     comes from the enclosing scope. An empty string is returned when no
     polypeptide is found.
     """
     pdb_path = protein.protein_id + '.pdb'
     structure = PDBParser().get_structure(protein.protein_id, pdb_path)
     sequence = ''
     for fragment in PPBuilder().build_peptides(structure):
         sequence = sequence + fragment.get_sequence()
     return sequence
Example #20
0
def polypeptide(pdbfile):
    """Return the single polypeptide contained in a PDB file.

    Raises ValueError (from the unpacking) when the file does not yield
    exactly one polypeptide.
    """
    structure = PDBParser().get_structure('test', pdbfile)
    # Unpacking into a one-element target enforces "exactly one".
    [only_peptide] = PPBuilder().build_peptides(structure)
    return only_peptide
Example #21
0
def get_phi_psi_data(pdb_id, chain=None):
    '''Gets phi and psi angle data.

    Returns one phi/psi list per polypeptide, over all models of the
    structure of ``pdb_id``. When ``chain`` is given, only chains with that
    id are considered.
    '''
    builder = PPBuilder()
    angle_lists = []
    for model in get_structure(pdb_id):
        for chn in model:
            if chain is not None and chain != chn.get_id():
                continue
            for polypep in builder.build_peptides(chn):
                angle_lists.append(polypep.get_phi_psi_list())
    return angle_lists
Example #22
0
def parse_structure(path):
    """
    Parses a PDB formatted structure using Biopython's PDB Parser.

    After parsing, disordered atoms are replaced by their selected
    conformation, hetero residues and waters are removed, and any remaining
    residue that is not a standard amino acid is rejected. When the number
    of polypeptides differs from the number of chain ids, the fragments are
    reported on stderr (the calculation still proceeds).

    Returns a tuple (structure, number of chain ids, number of residues).

    NOTE(review): the residue count is taken before hetero residues and
    waters are removed, so it includes them -- confirm this is intended.
    """

    print('[+] Reading structure file: {0}'.format(path))
    base_name = os.path.basename(path)
    sname = '.'.join(base_name.split('.')[:-1])

    try:
        s = P.get_structure(sname, path)
    except Exception as e:
        print('[!] Structure \'{0}\' could not be parsed'.format(sname),
              file=sys.stderr)
        raise Exception(e)

    # Double occupancy check: keep only the selected alternate location.
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        residue = atom.parent
        sel_at = atom.selected_child
        sel_at.altloc = ' '
        sel_at.disordered_flag = 0
        residue.detach_child(atom.id)
        residue.add(sel_at)

    # Remove HETATMs and solvent
    def _is_hetero_or_water(res):
        return res.id[0][0] in ('W', 'H')

    res_list = list(s.get_residues())
    n_res = len(res_list)
    for res in res_list:
        if _is_hetero_or_water(res):
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError(
                'Unsupported non-standard amino acid found: {0}'.format(
                    res.resname))

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        fragment_line = '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'
        for i_pp, pp in enumerate(peptides):
            print(fragment_line.format(i_pp, pp[0], pp[-1]),
                  file=sys.stderr)
        #raise Exception('Calculation cannot proceed')

    return (s, n_chains, n_res)
Example #23
0
def get_pdb_amino_acid_sequences(pdb_path):
    """Return the sequences of all polypeptides of a PDB file as strings.

    The structure id is ``pdb_path`` without its last four characters.
    """
    pdb_parser = Bio.PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_path[:-4], pdb_path)
    return [
        str(peptide.get_sequence())
        for peptide in PPBuilder().build_peptides(structure)
    ]
Example #24
0
def find_pdb_limits(pdb_path):
    """Return (start, end, seq) for the first polypeptide of a PDB file.

    ``start`` and ``end`` are the residue numbers of the first and last
    residue of that polypeptide, ``seq`` is its sequence.
    """
    structure = PDBParser().get_structure('', pdb_path)
    # takes the first (and only) polypeptide
    first_peptide = PPBuilder().build_peptides(structure)[0]
    first_residue, last_residue = first_peptide[0], first_peptide[-1]
    return (first_residue.get_id()[1],
            last_residue.get_id()[1],
            first_peptide.get_sequence())
Example #25
0
    def _model_file_to_data(self, file_path, params):
        """
            _model_file_to_data:
                Do the PDB conversion--parse the model pdb file for creating a pdb data object

            Returns a tuple (data, pp_no, params):
                data:   dict describing the structure, or {} when parsing failed or when
                        params['pdb_info'] carries no 'sequence_identities' after feature matching
                pp_no:  number of polypeptides built from the structure (0 when parsing failed)
                params: the params as returned by self._match_features (the input params when
                        that step was not reached)
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.PDBParser(PERMISSIVE=1)
        pp_no = 0
        data = {}

        try:
            structure = parser.get_structure("test", file_path)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            # Python 3 exceptions have no `.message`; formatting the
            # exception itself keeps this log line from failing.
            logging.info(f'PDBParser errored with message: {e}')
            raise
        else:
            pp_no = len(PPBuilder().build_peptides(structure))

            # logging.info(f'Getting pdb structure data for {structure}!')
            (compound, source) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            model = structure[0]
            protein_data = self._get_proteins_by_structure(
                structure, model.get_id(), file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                data = {
                    'name': structure.header.get('name', ''),
                    'num_chains': num_chains,
                    'num_residues': num_residues,
                    'num_atoms': num_atoms,
                    'compound': compound,
                    'source': source,
                    'proteins': protein_data
                }
            else:
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
                data = {}
        finally:
            # NOTE(review): a `return` inside `finally` discards any
            # exception in flight, including the re-raise above, so callers
            # always receive the tuple. Kept unchanged so existing callers
            # see the same behaviour -- confirm this is intended.
            return data, pp_no, params
Example #26
0
def get_pdb_torsion_angles(pdb_path, chain_index):
    """Return the phi and psi angles of one polypeptide of a PDB file.

    ``chain_index`` selects the polypeptide by its position in the list
    built from the structure. The result is a pair of lists
    ``(phi values, psi values)``.
    """
    pdb_parser = Bio.PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_path[:-4], pdb_path)
    selected = PPBuilder().build_peptides(structure)[chain_index]

    phis, psis = [], []
    for angles in selected.get_phi_psi_list():
        phis.append(angles[0])
        psis.append(angles[1])
    return phis, psis
Example #27
0
def get_primary_sequence(input_file):
    """Return the concatenated sequences of all polypeptides of a PDB file.

    The structure id is the file name without extension and without any
    './'. An empty string is returned when no polypeptide is found.
    """
    structure_name = os.path.splitext(input_file)[0].replace('./', '')
    structure = PDBParser().get_structure(structure_name, input_file)
    sequence = ""
    for fragment in PPBuilder().build_peptides(structure, aa_only=False):
        sequence = sequence + fragment.get_sequence()
    return sequence
def get_aa_encoded(protein_file):
    """Return the integer encoding of the polypeptide sequences of a PDB file.

    The sequences of all polypeptides built from ``protein_file`` are
    collected as strings and passed to ``int_encoding`` together with
    ``AA_CODES``.

    :param protein_file: path of the PDB file; the structure id is this
        path without its last four characters
    :return: whatever ``int_encoding`` returns for the collected sequences
    """
    # The body used to read an undefined name `pdb_path` instead of the
    # function's own parameter.
    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
        protein_file[:-4], protein_file)
    ppb = PPBuilder()
    pdb_aas = [str(pp.get_sequence()) for pp in ppb.build_peptides(structure)]
    return int_encoding(pdb_aas, AA_CODES)
Example #29
0
def obtian_seq_wo_seq_file(score_file):
    """Build chain sequences for every chain named in a score file.

    The first column of the tab-separated file ``./dataFile/<score_file>``
    holds chain entries; the first four characters of an entry are taken as
    the PDB id, the first six as the chain name, and the last character of
    the chain name as the chain id. Missing PDB files are downloaded into
    ``./dataFile/PDB_dl``.

    Returns a dict mapping each chain name to the concatenation of its
    polypeptide sequences, with newlines and spaces stripped.
    """
    score_table = pd.read_csv('./dataFile/' + score_file, sep='\t')

    # Group the chain names by the PDB entry they belong to.
    pdb = dict()
    for entry in score_table.iloc[:, 0]:
        pdb.setdefault(entry[0:4], set()).add(entry[0:6])

    # Fetch every PDB file that is not yet available locally.
    PDB_DIR = './dataFile/PDB_dl'
    if not os.path.exists(PDB_DIR):
        os.mkdir(PDB_DIR)
    pdb_dl_handle = PDBList()
    for pdb_code in pdb:
        if os.path.exists(PDB_DIR + '/pdb' + pdb_code.lower() + '.ent'):
            continue
        pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_code,
                                        file_format='pdb',
                                        overwrite=False,
                                        pdir=PDB_DIR)

    # Build one sequence per requested chain from the first model.
    seq_dict = dict()
    parser = PDBParser()
    seq_builder = PPBuilder()
    for pdb_id, chain_names in pdb.items():
        local_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        first_model = parser.get_structure(pdb_id, local_file)[0]
        for chain_name in chain_names:
            # The last character of the chain name is the chain id.
            protein_chain = first_model[chain_name[-1]]
            fragments = [
                str(pp.get_sequence())
                for pp in seq_builder.build_peptides(protein_chain)
            ]
            # Strip newlines and spaces from the joined sequence.
            joined = "".join(fragments)
            seq_dict[chain_name] = joined.replace('\n', '').replace(' ', '')

    return seq_dict
 def get_secondary_structure_details(self, name, pdb_file, aa_only=False):
     """Collect sequence, secondary structure and scaled accessibility.

     Runs DSSP (``acc_array="Wilke"``) on the first model of ``pdb_file``.
     For every DSSP record, element 2 is appended to the secondary-structure
     string and element 3 is multiplied by ``residues[<element 1>]``.
     NOTE(review): ``residues`` is defined outside this method; presumably a
     per-residue scaling table -- confirm.

     Returns ``(name, seq, ss, sasa, structure)``.
     """
     structure = PDBParser().get_structure(name, pdb_file)
     dssp = DSSP(structure[0], pdb_file, acc_array="Wilke")

     ss_symbols = []
     sasa = []
     for record in dssp:
         ss_symbols.append(record[2])
         sasa.append(residues[record[1]] * record[3])
     ss = "".join(ss_symbols)

     seq = ""
     for polypeptide in PPBuilder().build_peptides(structure, aa_only=aa_only):
         seq += polypeptide.get_sequence()
     return name, seq, ss, sasa, structure
Example #31
0
 def get_sequence(self, chain_id):
     '''
         Input:
             self: holds the parsed Biopython.PDB structure in self.structure
             chain_id  : String (usually in ['A','B', 'C' ...]); used to index
                     self.structure directly
         Return:
             The sequence (single-letter alphabet) of the first polypeptide
             that PPBuilder builds for self.structure[chain_id].
     '''
     selected = self.structure[chain_id]
     polypeptides = PPBuilder().build_peptides(selected)
     first_polypeptide = polypeptides[0]
     return first_polypeptide.get_sequence()
def test_add_residue():
    """Grow a peptide one residue at a time, rebinding the returned structure."""
    structure = PeptideBuilder.initialize_res("A")
    for letter in "CDEFGHIKLMNPQRSTVWY":
        structure = PeptideBuilder.add_residue(structure, letter)

    # The first polypeptide rebuilt from the structure must spell all 20 residues.
    polypeptides = PPBuilder().build_peptides(structure)
    first_polypeptide = next(iter(polypeptides))
    assert first_polypeptide.get_sequence() == "ACDEFGHIKLMNPQRSTVWY"

    # The geometry must also match the stored reference structure.
    assert compare_to_reference(structure, "extended.pdb")
 def __init__(self, path):
     '''
         Parse the structure file at `path` (CIF format) once and keep the
         result on the instance so it does not have to be parsed again.

         Sets self.ppb (a PPBuilder), self.structure (the parsed structure)
         and self.chains (chain id -> chain entity).
     '''
     self.ppb = PPBuilder()
     cif_parser = MMCIFParser()
     self.structure = cif_parser.get_structure('structure', path)
     # Index every chain of the structure by its id.
     self.chains = {chain.id: chain for chain in self.structure.get_chains()}
def test_add_residue():
    """
    Build a peptide containing all 20 amino acids, extending the
    structure in place with each ``add_residue`` call.
    """
    structure = PeptideBuilder.initialize_res("A")
    for letter in "CDEFGHIKLMNPQRSTVWY":
        PeptideBuilder.add_residue(structure, letter)

    # The first polypeptide rebuilt from the structure must spell all 20 residues.
    polypeptides = PPBuilder().build_peptides(structure)
    first_polypeptide = next(iter(polypeptides))
    assert first_polypeptide.get_sequence() == "ACDEFGHIKLMNPQRSTVWY"

    # The geometry must also match the stored reference structure.
    assert compare_to_reference(structure, "extended.pdb")
Example #35
0
def write_seqs(filename):
    """Extract sequences from a structure file and save them as FASTA files.

    The file is parsed as PDB first and, if that fails, as mmCIF. Two files
    are written next to the input: ``<base>_chains.fasta`` with one record
    per chain (all its polypeptides concatenated) and
    ``<base>_peptides.fasta`` with one record per polypeptide.

    Args:
        filename: Path to the structure file.

    Returns:
        None. When neither parser accepts the file an error line is written
        to stderr and nothing is saved.
    """
    struct_name = os.path.splitext(os.path.basename(filename))[0]
    is_CIF = False
    try:
        structure = PDBParser().get_structure(struct_name, filename)
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not mistaken for a parse failure.
        try:
            structure = MMCIFParser().get_structure(struct_name, filename)
            is_CIF = True
        except Exception:
            sys.stderr.write("ERROR: File {} is not a proper/supported protein structure file.\n".format(filename))
            return

    # Metadata comes from a different source depending on the file format.
    if is_CIF:
        name, organism, resolution = get_info_from_cif_dict(filename)
    else:
        name, organism, resolution = get_info_from_header(structure)

    description = "| {} | {} | Resolution {:.2f} A".format(name, organism, resolution)

    ppb = PPBuilder()

    chain_seqrecord_list = []
    peptide_seqrecord_list = []
    for model in structure:
        for chain in model:
            base_id = "{}.{:d}_{}".format(struct_name, model.id, chain.id)
            chain_seq = Seq("")
            for (pp_id, pp) in enumerate(ppb.build_peptides(chain)):
                # Build the peptide sequence once and use it for both records.
                pp_seq = pp.get_sequence()
                chain_seq += pp_seq
                peptide_seqrecord_list.append(SeqRecord(pp_seq,
                    id = "{}.{:d}".format(base_id, pp_id),
                    description = description))

            chain_seqrecord_list.append(SeqRecord(chain_seq,
                id = base_id,
                description = description))

    base_output_name = os.path.splitext(filename)[0]

    SeqIO.write(chain_seqrecord_list, "{}_chains.fasta".format(base_output_name), "fasta")
    SeqIO.write(peptide_seqrecord_list, "{}_peptides.fasta".format(base_output_name), "fasta")
Example #36
0
def pdb_extract_chain_seqs(struct):
	"""
	Takes a Bio.PDB.Structure object, returns a dictionary
	of chains and sequences (eg {'A':'MTSSLGRF', 'B':'MSLQRGFIN'}
	(for structures with multiple models, like NMR structures,
	only the first model is considered).

	A chain that is split into several polypeptides (e.g. by a gap) is
	reported as the concatenation of all its fragments, in build order.
	Chains for which no polypeptide can be built are left out.
	"""

	chains = {}
	ppb=PPBuilder()
	for chain in struct[0]: # takes model 1
		fragments = [
			str(pp.get_sequence())
			for pp in ppb.build_peptides(chain, aa_only=False)
		]
		# Previously each fragment overwrote the entry, so only the last
		# polypeptide of a gapped chain survived.
		if fragments:
			chains[chain.id] = "".join(fragments)

	return chains
Example #37
0
def getSequence(): # Get the sequence of a specific chain
	"""Prompt for a chain id and print that chain's upper-cased sequence.

	Reads the module-level ``pdb_name`` as the PDB file path; the part
	before its first '.' is used as the structure id. Every chain whose id
	differs from the one entered is detached from the structure, in every
	model, before the polypeptides are built. Python 2 code (``raw_input``).
	"""
	parser = PDBParser()
	nameStruct=pdb_name.partition('.')[0]
	structure = parser.get_structure(nameStruct, pdb_name)

	what_chain=raw_input('For what chain do you want the sequence : ')

	for model in structure:
		# Iterate over a snapshot: detaching children while iterating the
		# model itself makes the iteration skip chains.
		for chain in list(model):
			if chain.id != what_chain:
				model.detach_child(chain.id)

	seq=''
	ppb=PPBuilder()
	for pp in ppb.build_peptides(structure):
		seq = seq + pp.get_sequence()
	seq=seq.upper()
	print(seq)
def getPDBInfo(pdb, pdbpath):
    """Extract structure, residue list and sequence of one chain with Biopython.

    The chain id is derived from ``pdb``: its last '/'-separated component,
    cut at the first '.', is split on '_'; with more than one field the
    second field is the chain, otherwise the character at index 4 is used.
    The structure is read from ``<pdbpath>/<pdb>_ren.pdb``.

    Args:
        pdb: PDB name encoding the chain, e.g. ``1abc_A`` or ``1abcA``.
        pdbpath: Directory holding the ``_ren.pdb`` file.

    Returns:
        ``(structure, residue_list, pdbseq)``: the parsed structure, the
        residues of the chain in the first model, and the concatenated
        sequence of the chain's polypeptides (built with ``aa_only=False``).

    Raises:
        IndexError: If ``pdb`` has no '_' field and fewer than five characters.
        IOError: If the ``_ren.pdb`` file does not exist. Previously this
            surfaced as an UnboundLocalError on ``structure``.
    """
    # PDB descriptors
    stem = pdb.split("/")[-1].split(".")[0]
    fields = stem.split("_")
    if len(fields) > 1:
        chain = fields[1]
    else:
        # not "_" delimiters in pdb name
        chain = stem[4]

    renumbered_path = os.path.join(pdbpath, pdb + "_ren.pdb")
    if not os.path.exists(renumbered_path):
        raise IOError("Renumbered PDB file not found: " + renumbered_path)

    # Biopython parsers
    parser = BP.PDBParser()
    ppb = PPBuilder()

    structure = parser.get_structure(pdb, renumbered_path)
    residue_list = BP.Selection.unfold_entities(structure[0][chain], 'R')

    # the build_peptides method has an option for treating non-standard amino acids
    pdbseq = ""
    for pp in ppb.build_peptides(structure[0][chain], aa_only=False):
        pdbseq += (pp.get_sequence())
    return structure, residue_list, pdbseq
Example #39
0
def create_sequence_from_file(chain_pdb, missing_residues, quiet_parser=False):
    """ Read a PDB file and create a sequence and mismask to represent
    its content.

    Each residue except the first and the last of every polypeptide yields
    one six-element data row (slots 1-2: phi/psi, slot 3: amino acid index,
    slot 4: secondary structure index, slot 5: peptide bond conformation;
    slot 0 is left at 0) and one five-element mismask row.

    @param chain_pdb: The PDB file to be read.
    @type chain_pdb: str

    @param missing_residues: A dictionary with the missing residues.
    @type missing_residues: dict

    @param quiet_parser: Disable PDBParser warnings.
    @type quiet_parser: bool

    @rtype: tuple(list, list)
    @return: Two one-element lists: the data rows and the mismask rows,
        each as a numpy array.

    @raise TorusDBNBuildPolypeptideException: no polypeptide could be built.
    @raise TorusDBNException: no data row could be produced.

    """
    sequences = []
    mismasks = []

    output_data = []
    output_mismask = []

    parser = PDBParser(QUIET=quiet_parser)
    structure = parser.get_structure("X", chain_pdb)

    # NOTE(review): the DSSP result is not used directly; presumably the call
    # is what fills res.xtra["SS_DSSP"] read further down -- confirm.
    dssp = DSSP(model=structure[0], pdb_file=chain_pdb)

    # Loop over residues in peptides
    ppb = PPBuilder()
    pp_list = ppb.build_peptides(structure[0])
    chain_list = structure[0].get_list()

    if len(pp_list) == 0:
        raise TorusDBNBuildPolypeptideException(
            "Could not create a list of Polypeptide objects from the file %s."
            % (chain_pdb)
        )
    else:
       pp_chains, chain_missing_residues = _get_pp_with_chain_break(
            chain_pdb, pp_list, chain_list, missing_residues)

    for pp_index, pp in enumerate(pp_chains):
        phi_psi_list = pp.get_phi_psi_list()
        # Rebinds the parameter: from here on missing_residues holds the
        # entry of chain_missing_residues for the current polypeptide.
        missing_residues = chain_missing_residues[pp_index]

        # The first and last residue of each polypeptide are skipped.
        for i in xrange(1, len(phi_psi_list)-1):
            seq = [0] * 6
            # Mismask slots used below: 1 = angles, 3 = secondary structure,
            # 4 = cis/trans; slot 0 is always hidden, slot 2 always observed.
            mism = [eMISMASK.MOCAPY_HIDDEN] + 4 * [eMISMASK.MOCAPY_OBSERVED]

            # Amino acid
            res = pp[i]
            res_name = res.get_resname()
            res_index = res.get_id()[1]

            aa_index = three_to_index(res_name)

            if res_index in missing_residues:
                # Only the amino acid identity is known for a missing residue.
                seq[3] = aa_index
                mism[1] = eMISMASK.MOCAPY_MISSING # angles unknown
                mism[3] = eMISMASK.MOCAPY_MISSING # ss unknown
                mism[4] = eMISMASK.MOCAPY_MISSING # cis unknown
            else:
                seq[3] = aa_index

                # Secondary Structure
                try:
                    ss = res.xtra["SS_DSSP"]
                    ss_index = dssp_to_index(ss)
                    seq[4] = ss_index
                except:
                    # Any failure here (bare except) marks the secondary
                    # structure as missing instead of aborting.
                    mism[3] = eMISMASK.MOCAPY_MISSING # ss unknown

                # Angles
                if None in phi_psi_list[i]:
                # Previous or next residue missing, therefore angles are
                # Unknown
                    mism[1] = eMISMASK.MOCAPY_MISSING
                else:
                    seq[1:3] = phi_psi_list[i]

                # Cis/Trans information: needs the preceding residue.
                if (res_index - 1) in missing_residues:
                    mism[4] = eMISMASK.MOCAPY_MISSING # cis unknown
                else:
                    try:
                        seq[5] = _get_peptide_bond_conformation(res, pp[i-1])
                    except TorusDBNException:
                        mism[4] = eMISMASK.MOCAPY_MISSING # cis unknown

            output_data.append(seq)
            output_mismask.append(mism)

    # All polypeptides of the file end up in a single array each.
    if output_data and output_mismask:
        sequences.append(numpy.array(output_data))
        mismasks.append(numpy.array(output_mismask, dtype = numpy.uint))
    else:
        raise TorusDBNException(
            "Could not create training data from the file %s."
            % (chain_pdb)
        )
    return sequences, mismasks
Example #40
0
def create_residue_lists(pdb, chain):
	"""Build the position-scan residue lists for one chain of a repaired PDB.

	Reads ``../foldx_setup/repaired_pdbs/RepairPDB_<pdb>_<chain>.pdb`` and
	walks the residues of ``chain`` in the first model. Every residue adds
	``,<one-letter code><chain><residue number>a`` to the current list; a
	list is closed with ``;`` at indices 101, 201, 301, ... and at the last
	residue. The first list therefore holds up to 102 residues and every
	following one up to 100.

	The original copy-pasted branches stopped at index 1301 and silently
	dropped any later residue; the same rule is now applied to chains of
	any length (output is unchanged for chains of up to 1302 residues).

	Args:
		pdb: PDB identifier used in the repaired file name.
		chain: Chain identifier, used in the file name and in each entry.

	Returns:
		List of residue-list strings. Empty when the chain (or the first
		model) cannot be found in the structure, in which case a message
		is printed.
	"""
	print("%s %s" % (pdb, chain))
	res_lists = []
	try:
		p=PDB.PDBParser(QUIET = True)
		s=p.get_structure('X', "../foldx_setup/repaired_pdbs/RepairPDB_" + pdb + "_" + chain + ".pdb")
		ref_chain = s[0][chain]
	except KeyError:
		print("Something is wrong with the mapped residues")
		# Without the chain there is nothing to scan; previously execution
		# fell through to a NameError on the undefined residue list.
		return res_lists

	ref_residues = []
	ref_res_nums = []
	for res in ref_chain:
		ref_residues.append( res.resname )
		ref_res_nums.append(res.id[1])
	seq_array = ddG_var_helper.convert_to_one_letter_code(ref_residues)

	aa_list = ""
	last_index = len(seq_array) - 1
	for i in range(len(seq_array)):
		aa_list = aa_list + "," + seq_array[i] + chain + str(ref_res_nums[i]) + "a"
		# Chunk boundaries sit at 101, 201, 301, ...
		closes_chunk = i >= 101 and i % 100 == 1
		if closes_chunk or i == last_index:
			res_lists.append(aa_list + ";")
			aa_list = ""
	return res_lists

# Script fragment (Python 2): open a gzip-compressed PDB file, parse it and
# print the sequence of every polypeptide found in the coordinates.
# NOTE(review): input_path, filename and pdb_filename are defined outside this
# fragment; filename and pdb_filename are different names -- confirm both exist.
pdb_file = gzip.open(os.path.join(input_path, filename))
# NOTE(review): the open file handle is passed to the PDBParser constructor as
# well as to get_structure below -- confirm the constructor argument is intended.
pdb_parser = Bio.PDB.PDBParser(pdb_file)

## to get structural data
# Characters 7-10 of the file name are used as the PDB id.
pdb_id = pdb_filename[7:11]
structure = pdb_parser.get_structure(pdb_id, pdb_file)

print structure


# Extract sequence from coordinate information
# Amino acid residues that are present in SEQRES but have no coordinate
# information are listed in REMARK 465
ppb = PPBuilder()        # PPBuilder uses C--N distance to find polypeptides.
for pp in ppb.build_peptides(structure):
    sequence = pp.get_sequence()
    print "This is the polypeptide sequence of %s" %pdb_id
    print sequence
    print "length of sequence: ", len(sequence)

# OR extract sequence from SEQRES - ?

# When I aligned the sequence output of 1A27 from this script with the FASTA
# sequence, 4 residues were missing. These 4 residues are reported under
# REMARK 465 MISSING RESIDUES (THE FOLLOWING RESIDUES WERE NOT LOCATED
# IN THE EXPERIMENT).
# Does this mean unresolved, or missing in the protein construct?


 def get_sequence(self):
     """Return the sequence of the first polypeptide of ``self.chain``.

     The chain is looked up in the first model of ``self.structure``.
     """
     chain_entity = self.structure[0][self.chain]
     first_polypeptide = PPBuilder().build_peptides(chain_entity)[0]
     return first_polypeptide.get_sequence()
Example #43
0
'''

Extract the protein sequence from a PDB chain.

-----------------------------------------------------------
(c) 2013 Allegra Via and Kristian Rother
    Licensed under the conditions of the Python License

    This code appears in section 21.4.2 of the book
    "Managing Biological Data with Python".
-----------------------------------------------------------
'''

from Bio import PDB
from Bio.PDB.Polypeptide import PPBuilder

# Parse the local PDB entry file for 2DN1.
parser = PDB.PDBParser()
structure = parser.get_structure("2DN1", "dn/pdb2dn1.ent")
# Build the polypeptides of the whole structure and print each sequence
# (Python 2 print statement).
ppb = PPBuilder()
peptides = ppb.build_peptides(structure)
for pep in peptides:
    print pep.get_sequence()
    
#####
# Extract chains with Chimera: open 1kx5.pdb, select chains A-H and write the
# selection of model #0 to xen_nucl.pdb.
nucl_xen=openModels.open('1kx5.pdb',type='PDB')
# nucl_yeast=openModels.open('1id3.pdb',type='PDB')
# h2a_xen=Seq(str(nucl[0].sequence('C')))
# h2a_yeast=Seq(str(nuclZ[0].sequence('C')))
rc('select :.A :.B :.C :.D :.E :.F :.G :.H')
rc('write selected format pdb #0 xen_nucl.pdb')
# rc('write selected format pdb #1 h2a_yeast_xray.pdb')

# Generate alignments
# Biopython extracts the sequences from the PDB file. Note that the original
# '1kx5.pdb' is parsed here, not the 'xen_nucl.pdb' written above.
p = PDBParser(PERMISSIVE=1)
s = p.get_structure('1kx5', '1kx5.pdb')
ppb=PPBuilder()
# Chain id -> sequence of the first polypeptide built for that chain
# (first model only).
seqs_xen=dict()
for i in ['A','B','C','D','E','F','G','H']:
	seqs_xen[i]=ppb.build_peptides(s[0][i])[0].get_sequence()

# Here we manually input the truncated versions of the yeast sequences,
# keyed by the same chain ids as seqs_xen.
seqs_yeast=dict()
#CSE4 - H3
#SSKQQWVSSAIQSDSSGRSLSNVNRLAGDQQSINDRALSLLQRTRATKNLFPRREERRRYESSKSDLDIETDYEDQAGNLEIETENEEEAEMETEVPAPVRTHSYALDRYVRQKRREKQRKQSLKR
#VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI
#VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI
# Chains A and E get the same sequence.
seqs_yeast['A']=Seq('VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI')
seqs_yeast['E']=Seq('VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI')
#H4
# Chains B and F get the same sequence.
seqs_yeast['B']=Seq('KRHRKILRDNIQGITKPAIRRLARRGGVKRISGLIYEEVRAVLKSFLESVIRDSVTYTEHAKRKTVTSLDVVYALKRQGRTLYGFGG')
seqs_yeast['F']=Seq('KRHRKILRDNIQGITKPAIRRLARRGGVKRISGLIYEEVRAVLKSFLESVIRDSVTYTEHAKRKTVTSLDVVYALKRQGRTLYGFGG')
 def __init__(self):
     """Initialise the instance by calling ``PPBuilder.__init__`` with no extra arguments."""
     PPBuilder.__init__(self)
Example #46
0
def parse_structure(path):
    """
    Read a structure file with Biopython's PDB/mmCIF parser and clean it.

    The parser is chosen from the file extension ('pdb'/'ent' or 'cif').
    Only the first model is kept, disordered atoms are replaced by their
    selected alternative, hetero/water residues and hydrogens are removed,
    and a gap report is printed to stderr when the number of polypeptides
    differs from the number of chains.

    Returns a tuple (structure, number of chains, number of residues), the
    residue count being taken after hetero/water removal.

    Raises IOError for an unsupported extension, Exception when parsing
    fails, and ValueError on a non-standard amino acid.
    """

    print('[+] Reading structure file: {0}'.format(path))
    fname = os.path.basename(path)
    name_parts = fname.split('.')
    sname = '.'.join(name_parts[:-1])
    s_ext = name_parts[-1]

    if s_ext not in ('pdb', 'ent', 'cif'):
        raise IOError('[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'.format(s_ext))

    # The extension was validated above: whatever is not mmCIF is PDB.
    if s_ext == 'cif':
        sparser = MMCIFParser()
    else:
        sparser = PDBParser(QUIET=1)

    try:
        s = sparser.get_structure(sname, path)
    except Exception as e:
        print('[!] Structure \'{0}\' could not be parsed'.format(sname), file=sys.stderr)
        raise Exception(e)

    # Keep first model only
    if len(s) > 1:
        print('[!] Structure contains more than one model. Only the first one will be kept')
        first_model_id = s[0].id
        for model in list(s.child_list):
            if model.id == first_model_id:
                continue
            s.detach_child(model.id)

    # Double occupancy check: swap each disordered atom for its selected child
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        owner = atom.parent
        kept_atom = atom.selected_child
        kept_atom.altloc = ' '
        kept_atom.disordered_flag = 0
        owner.detach_child(atom.id)
        owner.add(kept_atom)

    # Remove HETATMs and solvent; anything left must be a standard amino acid
    for res in list(s.get_residues()):
        if res.id[0][0] in ('W', 'H'):
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))
    n_res = len(list(s.get_residues()))

    # Remove Hydrogens
    for atom in list(s.get_atoms()):
        if atom.element == 'H':
            atom.parent.detach_child(atom.name)

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        for i_pp, pp in enumerate(peptides):
            print('\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'.format(i_pp, pp[0], pp[-1]), file=sys.stderr)
        #raise Exception('Calculation cannot proceed')

    return (s, n_chains, n_res)
Example #47
0
def validate_structure(s, selection=None, clean=True):
    """Clean a parsed structure in place and warn about chain gaps.

    Only the first model is kept. When ``selection`` is given, chains not
    named in it are detached. Disordered atoms are replaced by their
    selected alternative. With ``clean`` set, hetero/water residues and
    hydrogens are removed and any remaining non-standard amino acid is
    rejected. Finally the number of polypeptides is compared with the number
    of chains still present and a warning listing the fragments is logged
    on the 'Prodigy' logger when they differ.

    Args:
        s: Biopython structure to validate; modified in place.
        selection: Optional iterable of strings, each a comma-separated
            list of chain ids to keep.
        clean: Whether to strip hetero residues, solvent and hydrogens.

    Returns:
        The same structure object ``s``.

    Raises:
        ValueError: If a selected chain is not in the structure, or (with
            ``clean``) a non-standard amino acid is found.
    """
    # setup logging
    logger = logging.getLogger('Prodigy')

    # Keep first model only
    if len(s) > 1:
        logger.warning('[!] Structure contains more than one model. Only the first one will be kept')
        model_one = s[0].id
        for m in s.child_list[:]:
            if m.id != model_one:
                s.detach_child(m.id)

    # process selected chains
    chains = list(s.get_chains())
    chain_ids = set([c.id for c in chains])

    if selection:
        sel_chains = set()
        # Match selected chain with structure
        for sel in selection:
            for c in sel.split(','):
                sel_chains.add(c)
                if c not in chain_ids:
                    raise ValueError('Selected chain not present in provided structure: {0}'.format(c))

        # Remove unselected chains
        for c in chains:
            if c.id not in sel_chains:
                c.parent.detach_child(c.id)

    # Double occupancy check: swap each disordered atom for its selected child
    for atom in list(s.get_atoms()):
        if atom.is_disordered():
            residue = atom.parent
            sel_at = atom.selected_child
            sel_at.altloc = ' '
            sel_at.disordered_flag = 0
            residue.detach_child(atom.id)
            residue.add(sel_at)

    if clean:
        # Remove HETATMs and solvent
        for res in list(s.get_residues()):
            if res.id[0][0] == 'W' or res.id[0][0] == 'H':
                chain = res.parent
                chain.detach_child(res.id)
            elif not is_aa(res, standard=True):
                raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))

        # Remove Hydrogens
        for atom in list(s.get_atoms()):
            if atom.element == 'H':
                residue = atom.parent
                residue.detach_child(atom.name)

    # Detect gaps and compare with no. of chains
    pep_builder = PPBuilder()
    peptides = pep_builder.build_peptides(s)
    n_peptides = len(peptides)
    # Count the chains that are still attached: chain_ids was captured before
    # unselected chains were removed and would trigger spurious gap warnings.
    n_chains = len(set(c.id for c in s.get_chains()))

    if n_peptides != n_chains:
        message = '[!] Structure contains gaps:\n'
        for i_pp, pp in enumerate(peptides):
            message += '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > ' \
                       '{2.parent.id} {2.resname}{2.id[1]}\n'.format(i_pp, pp[0], pp[-1])
        logger.warning(message)
        # raise Exception(message)

    return s
Example #48
0
    parser = PDB.PDBParser()
    structure = parser.get_structure(myPDBfile, myPDBfile[1:3].lower() + "/pdb" + myPDBfile.lower() + ".ent")

    if myAnswer == "No":
        io = PDBIO()
        io.set_structure(structure)
        io.save(myPDBfile + ".pdb")

except:
    print ("you structure is not available in the PDB")
    parser = PDB.PDBParser()
    structure = PDB.PDBParser().get_structure(myPDBfile, myPDBfile + ".pdb")

chains = [chain for chain in structure.get_chains()]
print (chains)
ppb = PPBuilder()
for chain in chains:
    print (chain)
    print (chain.get_id())
    for chainseq in ppb.build_peptides(chain):
        print (chainseq.get_sequence())
        mySeq = chainseq.get_sequence()

        myFastaA = ">" + myPDBfile + "_" + chain.get_id() + "\n" + chainseq.get_sequence()
        print (myFastaA)
        myFastaAfile = open(myPDBfile + "_" + chain.get_id() + ".fasta", "w")
        myFastaAfile.write(">" + myPDBfile + "_" + chain.get_id() + "\n" + str(mySeq))
        myFastaAfile.close()

        myFastafiles = os.listdir(".")
        myFastaIter = iter(myFastafiles)