Ejemplo n.º 1
0
    def _superimpose_structures(structure, superimpose_reference):
        """Superimpose a structure onto a reference structure.

        The fit is computed from the C-alpha atoms of the first polypeptide
        found in each structure and is then applied to every atom of
        ``structure``.

        @param structure: Structure to be moved onto the reference.
        @type structure: Structure

        @param superimpose_reference: Structure used as the fixed reference.
        @type superimpose_reference: Structure

        @rtype: Structure
        @return: The ``structure`` argument itself, after the fitted
            transformation has been applied to its atoms.
        """
        builder = PPBuilder()
        superimposer = Superimposer()

        # Only the first polypeptide of each structure takes part in the fit.
        reference_peptide = builder.build_peptides(superimpose_reference)[0]
        mobile_peptide = builder.build_peptides(structure)[0]

        # Fit on C-alpha atoms only, then move all atoms of the structure.
        reference_ca = reference_peptide.get_ca_list()
        mobile_ca = mobile_peptide.get_ca_list()
        every_atom = Selection.unfold_entities(structure, "A")

        superimposer.set_atoms(reference_ca, mobile_ca)
        superimposer.apply(every_atom)

        return structure
Ejemplo n.º 2
0
def getBoundResList(fname_bound, fname_unbound, listOfDictsChainToResId):
    """Translate unbound-structure residue ids into bound-structure ids.

    Both PDB files are parsed, their polypeptides are handed to a
    ``BoundUnboundMapper`` and every (chain, residue) entry of the input
    dictionaries is passed through ``mapUnboundToBoundUsingId``. Entries
    for which the mapper returns ``None`` are dropped.

    :param fname_bound: path of the bound PDB file.
    :param fname_unbound: path of the unbound PDB file.
    :param listOfDictsChainToResId: list of dicts mapping an unbound chain
        id to a collection of residue ids; the chain id ``"*"`` is looked
        up as the blank chain id ``" "``.
    :return: list with one dict per input dict, mapping a bound chain id
        to the list of mapped bound residue ids.
    """
    pdb_parser = PDBParser(QUIET=True)
    unbound_structure = pdb_parser.get_structure(fname_unbound, fname_unbound)
    bound_structure = pdb_parser.get_structure(fname_bound, fname_bound)

    builder = PPBuilder()
    unbound_peptides = builder.build_peptides(unbound_structure, aa_only=False)
    bound_peptides = builder.build_peptides(bound_structure, aa_only=False)

    mapper = BoundUnboundMapper(unbound_peptides, bound_peptides)
    mapper.build_correspondence()

    mapped_dicts = []
    for chain_to_res in listOfDictsChainToResId:
        bound_residues = {}
        for unbound_chain in chain_to_res:
            lookup_chain = " " if unbound_chain == "*" else unbound_chain
            for unbound_res in sorted(chain_to_res[unbound_chain]):
                hit = mapper.mapUnboundToBoundUsingId(lookup_chain, unbound_res)
                if hit is None:
                    continue
                bound_chain, bound_res = hit
                bound_residues.setdefault(bound_chain, []).append(bound_res)
        mapped_dicts.append(bound_residues)
    return mapped_dicts
def main():
    """Extract the amino-acid sequence of a PDB file and save it as FASTA.

    Command-line options:
        -p / --pdb        path to the input PDB file
        -f / --pdb_fasta  path of the FASTA file to write

    The sequences of all polypeptides of every chain in the first model are
    concatenated, echoed to stdout and written to the FASTA file under a
    header of the form ``>name length``, where ``name`` is the PDB file's
    base name up to its first dot.
    """
    opt_parser = optparse.OptionParser()
    opt_parser.add_option("-p",
                          "--pdb",
                          dest="pdb",
                          help="path to PDB file",
                          metavar="STRING")
    opt_parser.add_option("-f",
                          "--pdb_fasta",
                          dest="pdb_fasta",
                          help="path to PDB fasta file (out)",
                          metavar="STRING")

    (options, args) = opt_parser.parse_args()
    if not options.pdb or not options.pdb_fasta:
        # Without this check a missing option only surfaced later as an
        # obscure error caused by a None path.
        opt_parser.error("both --pdb and --pdb_fasta are required")
    pdb_fasta = options.pdb_fasta
    pdb_file = options.pdb

    pdb_name = os.path.basename(pdb_file).split(".")[0]

    pdb_parser = BP.PDBParser()
    ppb = PPBuilder(radius=1000)  # retrieve all amino acids
    structure = pdb_parser.get_structure(pdb_name, pdb_file)
    model = structure[0]
    pdbseq = "".join(
        str(pp.get_sequence())
        for chain in model
        for pp in ppb.build_peptides(chain, aa_only=False)
    )

    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print("> %s %d" % (pdb_name, len(pdbseq)))
    print(pdbseq)

    with open(pdb_fasta, "w") as o:
        o.write(">%s %i\n%s\n" % (pdb_name, len(pdbseq), pdbseq))
def main():
    """Extract the amino-acid sequence of a PDB file and save it as FASTA.

    Command-line options:
        -p / --pdb        path to the input PDB file
        -f / --pdb_fasta  path of the FASTA file to write

    The sequences of all polypeptides of every chain in the first model are
    concatenated, echoed to stdout and written to the FASTA file under a
    header of the form ``>name length``, where ``name`` is the PDB file's
    base name up to its first dot.
    """
    opt_parser = optparse.OptionParser()
    opt_parser.add_option("-p", "--pdb", dest="pdb", help="path to PDB file", metavar="STRING")
    opt_parser.add_option("-f", "--pdb_fasta", dest="pdb_fasta", help="path to PDB fasta file (out)", metavar="STRING")

    (options, args) = opt_parser.parse_args()
    if not options.pdb or not options.pdb_fasta:
        # Without this check a missing option only surfaced later as an
        # obscure error caused by a None path.
        opt_parser.error("both --pdb and --pdb_fasta are required")
    pdb_fasta = options.pdb_fasta
    pdb_file = options.pdb

    pdb_name = os.path.basename(pdb_file).split(".")[0]

    pdb_parser = BP.PDBParser()
    ppb = PPBuilder(radius=1000)  # retrieve all amino acids
    structure = pdb_parser.get_structure(pdb_name, pdb_file)
    model = structure[0]
    pdbseq = "".join(
        str(pp.get_sequence())
        for chain in model
        for pp in ppb.build_peptides(chain, aa_only=False)
    )

    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print("> %s %d" % (pdb_name, len(pdbseq)))
    print(pdbseq)

    with open(pdb_fasta, "w") as o:
        o.write(">%s %i\n%s\n" % (pdb_name, len(pdbseq), pdbseq))
Ejemplo n.º 5
0
    def get_contact_map(self, chain_id):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Return a complete contact map for a given chain in a
                Biopython.PDB structure as numpy array.
                The values in the matrix are the c-alpha distances between all
                residues of the chain (first model), truncated to integers.
                The matrix is square with side len(self.get_sequence(chain_id)).
        '''

        length = len(self.get_sequence(chain_id))
        ppb = PPBuilder()
        contact_map = np.zeros((length, length), dtype=np.float32)

        # Number the residues consecutively across all polypeptide fragments.
        # Restarting the index for every fragment made later fragments
        # overwrite earlier ones and left cross-fragment distances at zero.
        # The list is clipped to the matrix size so a shorter sequence from
        # get_sequence() cannot cause an out-of-bounds write.
        residues = [
            residue
            for pp in ppb.build_peptides(self.structure[0][chain_id],
                                         aa_only=True)
            for residue in pp
        ][:length]

        for i, residue_1 in enumerate(residues):
            ca_1 = residue_1['CA']
            # The distance is symmetric and zero on the diagonal, so only
            # the upper triangle is computed and mirrored.
            for j in range(i + 1, len(residues)):
                distance = ca_1 - residues[j]['CA']
                contact_map[i, j] = distance
                contact_map[j, i] = distance

        # np.int was only an alias of the builtin int and has been removed
        # from NumPy; the cast truncates towards zero.
        return contact_map.astype(int)
Ejemplo n.º 6
0
def get_chain_position(input_file, global_index):
    """Locate a global residue index inside the polypeptides of a PDB file.

    The polypeptides built from the file (``aa_only=False``) are wrapped in
    ``EPolyPeptide`` and treated as one concatenated run of residues.

    :param input_file: path of the PDB file; its name without extension
        (and without any ``'./'``) is used as the structure id.
    :param global_index: 0-based index into the concatenated residues.
    :return: ``(chain_id, position)`` where ``position`` is 1-based within
        the polypeptide that contains the index, or ``(None, -1)`` when the
        index is negative or not smaller than the total length.
    """
    structure_id = os.path.splitext(input_file)[0].replace('./', '')
    structure = PDBParser().get_structure(structure_id, input_file)
    fragments = [
        EPolyPeptide(pp)
        for pp in PPBuilder().build_peptides(structure, aa_only=False)
    ]

    if global_index >= sum([len(fragment) for fragment in fragments]):
        return None, -1

    # Switch to a 1-based rank so it can be compared with running lengths.
    rank = int(global_index) + 1
    if rank <= 0:
        return None, -1

    covered = 0
    for fragment in fragments:
        before = covered
        covered += len(fragment)
        if rank <= covered:
            return fragment.chain_id, rank - before
    return None, -1
Ejemplo n.º 7
0
def deleteChain():
    """Interactively delete chains from the PDB file named by ``pdb_name``.

    The user is asked how many chains to remove and then for each chain id;
    matching chains are detached from every model. The remaining structure
    is saved either as ``<sequence>_bound.pdb`` (lower-cased sequence of
    the remaining polypeptides) when the user answers ``y``, or as
    ``<name>_without<chain ids>.pdb`` otherwise.

    ``pdb_name`` is not defined in this function and must be provided by
    the surrounding module.
    """
    # raw_input only exists on Python 2; input() is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    parser = PDBParser()
    nameStruct = pdb_name.partition('.')[0]
    structure = parser.get_structure(nameStruct, pdb_name)

    # Parse the count explicitly instead of evaluating the reply, which is
    # what Python 2's input() did.
    nb_chain = int(ask('How many chain do you want to delete : '))
    removed = []
    for _ in range(nb_chain):
        rm_chain = ask('What chain you want to delete : ')
        removed.append(rm_chain)
        for model in structure:
            # Look the chain up by id instead of detaching it while
            # iterating over the model's children.
            if model.has_id(rm_chain):
                model.detach_child(rm_chain)

    pept = ask('Do you want to get a pdb with the sequence in its name : ')
    if pept == 'y':
        ppb = PPBuilder()
        seq = ''.join(str(pp.get_sequence())
                      for pp in ppb.build_peptides(structure))
        out_name = seq.lower() + '_bound.pdb'
    else:
        # Name every deleted chain; this is also well defined when no chain
        # was deleted, where rm_chain used to be undefined.
        out_name = nameStruct + '_without' + ''.join(removed) + '.pdb'

    w = PDBIO()
    w.set_structure(structure)
    w.save(out_name)
Ejemplo n.º 8
0
def deleteChain():
    """Interactively delete chains from the PDB file named by ``pdb_name``.

    The user is asked how many chains to remove and then for each chain id;
    matching chains are detached from every model. The remaining structure
    is saved either as ``<sequence>_bound.pdb`` (lower-cased sequence of
    the remaining polypeptides) when the user answers ``y``, or as
    ``<name>_without<chain ids>.pdb`` otherwise.

    ``pdb_name`` is not defined in this function and must be provided by
    the surrounding module.
    """
    # raw_input only exists on Python 2; input() is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    parser = PDBParser()
    nameStruct = pdb_name.partition('.')[0]
    structure = parser.get_structure(nameStruct, pdb_name)

    # Parse the count explicitly instead of evaluating the reply, which is
    # what Python 2's input() did.
    nb_chain = int(ask('How many chain do you want to delete : '))
    removed = []
    for _ in range(nb_chain):
        rm_chain = ask('What chain you want to delete : ')
        removed.append(rm_chain)
        for model in structure:
            # Look the chain up by id instead of detaching it while
            # iterating over the model's children.
            if model.has_id(rm_chain):
                model.detach_child(rm_chain)

    pept = ask('Do you want to get a pdb with the sequence in its name : ')
    if pept == 'y':
        ppb = PPBuilder()
        seq = ''.join(str(pp.get_sequence())
                      for pp in ppb.build_peptides(structure))
        out_name = seq.lower() + '_bound.pdb'
    else:
        # Name every deleted chain; this is also well defined when no chain
        # was deleted, where rm_chain used to be undefined.
        out_name = nameStruct + '_without' + ''.join(removed) + '.pdb'

    w = PDBIO()
    w.set_structure(structure)
    w.save(out_name)
Ejemplo n.º 9
0
def doChainAlignments(pdbID, structure, consensusFile, chainSequencesDir,
                      verbose):
    """Align the sequence of every chain of a structure with a consensus.

    For each chain the polypeptide sequences are concatenated, written to
    ``<chainSequencesDir>/<pdbID>_<chain id>.fasta`` and aligned against
    ``consensusFile`` by running the external ``needle`` program. The
    process exits as soon as a chain yields an empty sequence.

    :param pdbID: identifier used as prefix of the per-chain sequence ids.
    :param structure: object providing ``get_chains()``.
    :param consensusFile: path passed to ``needle`` as ``-bsequence``.
    :param chainSequencesDir: directory for the FASTA and alignment files.
    :param verbose: when true, print each chain id and its sequence.
    """
    print('Pairwise alignment of chain sequences with consensus...')
    all_chains = list(structure.get_chains())
    builder = PPBuilder()
    for chain in all_chains:
        sequence = ''.join(
            str(pp.get_sequence()) for pp in builder.build_peptides(chain))
        chain_label = pdbID + '_' + chain.get_id()
        fasta_path = os.path.join(chainSequencesDir, chain_label + '.fasta')
        if not sequence:
            print("ERROR: Unable to get chain sequence from PDB file.")
            exit("Selected PDB-file does not contain protein structure.")
        with open(fasta_path, 'w') as handle:
            handle.write('>' + chain_label + '\n' + sequence)
        if verbose:
            print('Chain ' + chain.get_id())
            print(sequence)
        needle_command = [
            'needle',
            '-asequence', fasta_path,
            '-bsequence', consensusFile,
            '-noendweight',
            '-endopen', '10.0',
            '-endextend', '0.5',
            '-brief',
            '-aformat', 'srspair',
            '-auto',
            '-aname_outfile', pdbID + '_' + chain.get_id(),
            '-adirectory_outfile', chainSequencesDir,
        ]
        subprocess.check_call(needle_command)
    print('OK')
Ejemplo n.º 10
0
    def _get_proteins_by_structure(self, pdb_structure, model, file_path):
        """
            _get_proteins_by_structure: Collect one protein record per chain of a pdb_structure

            Chains that evaluate as false are skipped. Every other chain
            yields a dict with the keys 'id' (base name of file_path),
            'model_id' (the model argument), 'chain_id', 'sequence' (the
            chain's polypeptide sequences joined together) and 'md5' (hex
            digest of that sequence).
        """
        builder = PPBuilder()
        records = []

        # Parse for the chain_id and chain sequence
        for chain in pdb_structure.get_chains():
            if not chain:
                continue
            sequence = ''.join(
                str(peptide.get_sequence())
                for peptide in builder.build_peptides(chain))
            record = {
                'id': os.path.basename(file_path),
                'model_id': model,
                'chain_id': chain.get_id(),
                'sequence': sequence,
                'md5': hashlib.md5(sequence.encode()).hexdigest(),
            }
            records.append(record)

        return records
Ejemplo n.º 11
0
def match_pdb_residue_num_to_seq(model, ref=None):
    """Map reference sequence positions to PDB residue identifiers.

    Each polypeptide of the structure is aligned to the reference sequence
    of its chain, and every aligned reference position is linked to the
    identifier of the matching residue as given in the PDB file.

    Reference sequence is 1-indexed (and is indexed as such in output).

    Args:
        model: A biostructmap Model object.
        ref (dict): A dictionary containing reference protein sequences for
            each chain in the protein structure. Defaults to the protein
            sequences given in PDB file.
    Returns:
        dict: A dictionary mapping reference sequence index (key) to
            residue numbering as given in the PDB file (value). For example,
            we might have a key of ('A', 17) for the 17th residue in the
            reference sequence for chain 'A', with a value of
            ('A', (' ', 273, ' ')) that represents the Bio.PDB identifier for
            the corresponding residue.
    """
    builder = PPBuilder()
    fragments = builder.build_peptides(model.parent().structure)
    reference = model.parent().sequences if ref is None else ref

    mapping = {}
    for fragment in fragments:
        fragment_sequence = fragment.get_sequence()
        # Presume that the fragment belongs to a single chain
        chain_id = fragment[0].get_full_id()[2]
        _, ref_to_pdb = align_protein_sequences(fragment_sequence,
                                                reference[chain_id])
        for ref_index, pdb_index in ref_to_pdb.items():
            residue = fragment[pdb_index - 1]
            mapping[(chain_id, ref_index)] = residue.get_full_id()[2:4]
    return mapping
Ejemplo n.º 12
0
    def _map(self, model):
        """Map (PRIVATE).

        Assign a classified fragment to every residue that lies at least
        ``self.edge`` positions away from both ends of its polypeptide.
        Polypeptides for which fragment construction raises a
        ``PDBException`` with the message ``CHAINBREAK`` are skipped.

        :param model: the model that will be mapped
        :type model: L{Model}
        :return: dictionary mapping residue to mapped fragment
        """
        ppb = PPBuilder()
        fd = {}
        for pp in ppb.build_peptides(model):
            try:
                # make fragments
                flist = _make_fragment_list(pp, self.flength)
                # classify fragments
                mflist = _map_fragment_list(flist, self.reflist)
                last = len(pp) - self.edge
                for i, res in enumerate(pp):
                    # start and end residues have no fragment
                    if self.edge <= i < last:
                        fd[res] = mflist[i - self.edge]
            except PDBException as why:
                # An exception object never equals a str, so the message
                # has to be compared; otherwise every chain break was
                # re-raised instead of being skipped.
                if str(why) != 'CHAINBREAK':
                    raise
                # Funny polypeptide - skip
        return fd
 def read_structure_seqs(self, strucm):
     """ Extracts sequences from structure

     For every chain of every model in ``strucm.st`` one SeqRecord per
     polypeptide fragment is built and the list is stored in
     ``self.data[chain_id]['pdb'][model_id]``. A fragment whose first
     residue number is not lower than its last one triggers a warning and
     sets ``self.data[chain_id]['pdb']['wrong_order']`` to True.
     """
     # PDB extracted sequences
     for mod in strucm.st:
         builder = PPBuilder()
         for chn in mod.get_chains():
             ch_id = chn.id
             records = []
             reversed_numbering = False
             for frag in builder.build_peptides(chn):
                 first = frag[0].get_id()[1]
                 last = frag[-1].get_id()[1]
                 label = '{}:{}-{}'.format(ch_id, first, last)
                 record = SeqRecord(frag.get_sequence(), 'pdbsq_' + label,
                                    'pdbsq_' + label,
                                    'PDB sequence chain ' + label)
                 if first < last:
                     location = FeatureLocation(first, last)
                 else:
                     print("Warning: unusual residue numbering at chain ",
                           ch_id)
                     print(
                         "Warning: chain reconstruction may not be available"
                     )
                     # Store the location with its bounds swapped.
                     location = FeatureLocation(last, first)
                     reversed_numbering = True
                 record.features.append(SeqFeature(location))
                 records.append(record)
             if ch_id not in self.data:
                 self.add_empty_chain(ch_id)
             chain_entry = self.data[ch_id]['pdb']
             chain_entry[mod.id] = records
             chain_entry['wrong_order'] = reversed_numbering
Ejemplo n.º 14
0
 def _map(self, model):
     """
     Assign a classified fragment to every residue that lies at least
     ``self.edge`` positions away from both ends of its polypeptide.
     Polypeptides for which fragment construction raises a
     ``PDBException`` with the message ``CHAINBREAK`` are skipped.

     @param model: the model that will be mapped
     @type model: L{Model}

     @return: dictionary mapping residue to mapped fragment
     """
     ppb = PPBuilder()
     fd = {}
     for pp in ppb.build_peptides(model):
         try:
             # make fragments
             flist = _make_fragment_list(pp, self.flength)
             # classify fragments
             mflist = _map_fragment_list(flist, self.reflist)
             last = len(pp) - self.edge
             for i, res in enumerate(pp):
                 # start and end residues have no fragment
                 if self.edge <= i < last:
                     fd[res] = mflist[i - self.edge]
         except PDBException as why:
             # "except X as name" is valid on Python 2.6+ and Python 3.
             # An exception object never equals a str, so the message has
             # to be compared to recognise a chain break.
             if str(why) != 'CHAINBREAK':
                 raise
             # Funny polypeptide - skip
     # The mapping was built but never handed back to the caller.
     return fd
Ejemplo n.º 15
0
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor, conn: sqlite3.Connection, dir_name: str):
    """
    Download PDB entries, summarise them and store one row per entry.

    For every id the PDB file is fetched into ``<dir_name>/<id>/``, header
    fields and polypeptide statistics are collected and a row is inserted
    into the ``Structures`` table. Progress is logged to the temporary file
    ``status_tmp.txt``, which is removed at the end.

    :param filelist: PDB ids to process; an id is processed at most once
    :param q: queue receiving every id that was stored successfully,
        followed by a final ``None``
    :param lock: lock held while the database is written
    :param cursor: cursor used for the INSERT statement
    :param conn: connection committed after each successful INSERT
    :param dir_name: directory under which one sub-directory per id is used
    """
    status_path = 'status_tmp.txt'
    with open(status_path, 'w') as f:
        f.write('')

    # Placeholders let sqlite3 do the quoting, so header text containing
    # quotes cannot break or alter the statement.
    insert_sql = (
        "INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, RESOLUTION, "
        "NCOMP, NCHAIN, NRES, MMASS, EC) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )
    # Ids already handled in this run. The former check compared ids with
    # newline-terminated lines of the status file and so never matched.
    processed = set()

    for pdb_id in filelist:
        if pdb_id in processed:
            continue
        entry_dir = os.path.join(dir_name, pdb_id)
        entry_path = os.path.join(entry_dir, 'pdb{:s}.ent'.format(pdb_id))
        pdbl = PDBList()
        pdbl.retrieve_pdb_file(pdb_id, pdir=entry_dir, file_format='pdb')
        if not os.path.exists(entry_path):
            print("File with ID PDB: {:s} not found!".format(pdb_id))
            continue

        parser = PDBParser()
        # The structure id used to be the unformatted literal '{:s}'.
        structure = parser.get_structure(pdb_id, entry_path)
        header = parser.header
        name = header.get('name', '')
        head = header.get('head', '')
        method = header.get('structure_method', '')
        res = header.get('resolution', '')
        try:
            resolution = round(float(res), 2)
        except (TypeError, ValueError):
            # A missing resolution is stored as NULL instead of crashing.
            resolution = None

        ncomp = 0
        nchain = 0
        eclist = []
        for values in header['compound'].values():
            ncomp += 1
            chains = values.get('chain')
            if chains:
                nchain += len(chains.split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)

        nres = 0
        mmass = 0
        ppb = PPBuilder()
        for pp in ppb.build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            mmass += int(ProteinAnalysis(str(seq)).molecular_weight())

        row = (pdb_id, name, head, method, resolution,
               ncomp, nchain, nres, mmass, ec)
        try:
            with lock:
                try:
                    cursor.execute(insert_sql, row)
                except sqlite3.DatabaseError as err:
                    print("Error: ", err)
                else:
                    print("Download Done for ID PDB: {:s}".format(pdb_id))
                    conn.commit()
                    q.put(pdb_id)
        finally:
            # Log the id as handled whether or not the insert succeeded.
            processed.add(pdb_id)
            with open(status_path, 'at') as f:
                f.write(pdb_id + '\n')

    os.remove(status_path)
    q.put(None)
Ejemplo n.º 16
0
def generate_seq_file(score_file, save_file):
    """Write the mutated chain sequence for every mutation in a score file.

    The first column of the tab-separated file ``./dataFile/<score_file>``
    holds labels of the form ``<pdb id>_<chain id>_<mutation>``. The first
    three and last three characters of the mutation part are taken as the
    wild-type and mutant residue names, and its digits as the 1-based
    position in the chain sequence. Missing PDB files are downloaded into
    ``./dataFile/PDB_dl``.

    :param score_file: file name inside ``./dataFile/``.
    :param save_file: output path; one ``label<TAB>mutated sequence`` line
        is written per distinct label.
    :raises AssertionError: if the residue at the mutated position differs
        from the wild-type residue named in the label.
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')

    # Group the distinct labels by PDB id, keeping first-seen order.
    mut_dict = dict()
    seen = set()
    for label in sf.iloc[:, 0]:
        info = label.split('_')
        pdb_id = info[0]
        chain_id = info[1]
        change = info[2]
        mutation = {'chain_id': chain_id,
                    'wt_aa': change[0:3],
                    'mu_aa': change[-3:],
                    'mu_pos': int(''.join(ch for ch in change if ch.isdigit())),
                    'name': label}
        if label in seen:
            continue
        seen.add(label)
        mut_dict.setdefault(pdb_id, []).append(mutation)
    del seen

    parser = PDBParser()
    seq_builder = PPBuilder()
    pdb_dl_handle = PDBList()
    PDB_DIR = './dataFile/PDB_dl'
    mut_collect = dict()
    for pdb_id, mutations in mut_dict.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        # Download the PDB file only when no local copy exists.
        if not os.path.exists(pdb_file):
            pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_id, file_format='pdb', overwrite=False, pdir=PDB_DIR)
        model = parser.get_structure(pdb_id, pdb_file)[0]

        for mutation in mutations:
            protein_chain = model[mutation['chain_id']]
            sequence = "".join([str(pp.get_sequence())
                                for pp in seq_builder.build_peptides(protein_chain)])
            sequence = sequence.replace('\n', '').replace(' ', '')
            index = mutation['mu_pos'] - 1
            # Raised explicitly so the check also runs under "python -O",
            # which strips assert statements.
            if sequence[index] != three_to_one(mutation['wt_aa']):
                raise AssertionError('Wt amino acid failed to match')
            residues = list(sequence)
            residues[index] = three_to_one(mutation['mu_aa'])
            mut_collect[mutation['name']] = ''.join(residues)

    with open(save_file, 'w') as output_hl:
        for k, v in mut_collect.items():
            output_hl.write(k+'\t'+v+'\n')
Ejemplo n.º 17
0
def polypeptide(pdbfile):
    """Return the single polypeptide built from a PDB file.

    The file is parsed under the structure id ``'test'``. The result of
    ``build_peptides`` is unpacked into exactly one name, so any other
    number of polypeptides raises ``ValueError``.
    """
    structure = PDBParser().get_structure('test', pdbfile)
    peptides = PPBuilder().build_peptides(structure)
    (only_peptide,) = peptides
    return only_peptide
 def real_seq():
     """Concatenate the polypeptide sequences read from a protein's PDB file.

     The file ``<protein.protein_id>.pdb`` is parsed and the sequences of
     its polypeptides are appended to an initially empty string, which is
     returned. ``protein`` is not defined in this function; it has to come
     from the surrounding scope.
     """
     pdb_id = protein.protein_id
     structure = PDBParser().get_structure(pdb_id, pdb_id + '.pdb')
     builder = PPBuilder()
     joined = ''
     for fragment in builder.build_peptides(structure):
         joined += fragment.get_sequence()
     return joined
Ejemplo n.º 19
0
def get_phi_psi_data(pdb_id, chain=None):
    '''Gets phi and psi angle data.

    Returns a list with one phi/psi list per polypeptide, covering every
    model of the structure obtained from get_structure(pdb_id). When chain
    is given, only chains with that id are considered.
    '''
    builder = PPBuilder()
    angle_lists = []
    for model in get_structure(pdb_id):
        for chn in model:
            if chain is None or chain == chn.get_id():
                angle_lists.extend(
                    polypep.get_phi_psi_list()
                    for polypep in builder.build_peptides(chn))
    return angle_lists
Ejemplo n.º 20
0
def parse_structure(path):
    """
    Parse a PDB-formatted structure with the module-level parser ``P`` and
    clean it for the calculation.

    Disordered atoms are replaced by their selected alternative, residues
    flagged as hetero or water are removed, and any remaining residue that
    is not a standard amino acid raises ``ValueError``. If the number of
    polypeptides differs from the number of chains, the fragments are
    reported on stderr as gaps.

    Returns ``(structure, n_chains, n_res)``; ``n_res`` is the residue
    count taken before hetero and water residues are removed.
    """

    print('[+] Reading structure file: {0}'.format(path))
    fname = os.path.basename(path)
    sname = '.'.join(fname.split('.')[:-1])

    try:
        s = P.get_structure(sname, path)
    except Exception as e:
        print('[!] Structure \'{0}\' could not be parsed'.format(sname),
              file=sys.stderr)
        raise Exception(e)

    # Double occupancy check: keep only the selected alternative location
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        owner = atom.parent
        chosen = atom.selected_child
        chosen.altloc = ' '
        chosen.disordered_flag = 0
        owner.detach_child(atom.id)
        owner.add(chosen)

    def _is_hetero_or_water(residue):
        # First character of the hetero field: 'W' or 'H'.
        return residue.id[0][0] == 'W' or residue.id[0][0] == 'H'

    # Remove HETATMs and solvent
    all_residues = list(s.get_residues())
    n_res = len(all_residues)
    for residue in all_residues:
        if _is_hetero_or_water(residue):
            residue.parent.detach_child(residue.id)
        elif not is_aa(residue, standard=True):
            raise ValueError(
                'Unsupported non-standard amino acid found: {0}'.format(
                    residue.resname))

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        for i_pp, pp in enumerate(peptides):
            print(
                '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'
                .format(i_pp, pp[0], pp[-1]),
                file=sys.stderr)
        #raise Exception('Calculation cannot proceed')

    return (s, n_chains, n_res)
Ejemplo n.º 21
0
def get_pdb_amino_acid_sequences(pdb_path):
    """Return the sequence of every polypeptide in a PDB file as strings.

    The structure id is ``pdb_path`` without its last four characters.
    """
    pdb_parser = Bio.PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_path[:-4], pdb_path)
    builder = PPBuilder()
    return [
        str(peptide.get_sequence())
        for peptide in builder.build_peptides(structure)
    ]
Ejemplo n.º 22
0
    def _model_file_to_data(self, file_path, params):
        """
            _model_file_to_data:
                Do the PDB conversion--parse the model pdb file for creating a pdb data object

            Returns a tuple (data, pp_no, params):
                data   -- dict describing the structure, or {} when parsing
                          fails or params['pdb_info'] carries no
                          'sequence_identities'
                pp_no  -- number of polypeptides found in the structure
                params -- the params returned by self._match_features
                          (the input params if that call was not reached)
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.PDBParser(PERMISSIVE=1)
        pp_no = 0
        data = {}

        try:
            structure = parser.get_structure("test", file_path)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            # Python 3 exceptions have no ``.message``; formatting ``e``
            # itself keeps this log line from failing with AttributeError.
            logging.info(f'PDBParser errored with message: {e}')
            raise
        else:
            pp_no = len(PPBuilder().build_peptides(structure))

            # logging.info(f'Getting pdb structure data for {structure}!')
            (compound, source) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            model = structure[0]
            protein_data = self._get_proteins_by_structure(
                structure, model.get_id(), file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                data = {
                    'name': structure.header.get('name', ''),
                    'num_chains': num_chains,
                    'num_residues': num_residues,
                    'num_atoms': num_atoms,
                    'compound': compound,
                    'source': source,
                    'proteins': protein_data
                }
            else:
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
                data = {}
        finally:
            # NOTE(review): returning from ``finally`` discards any
            # exception in flight, including the re-raise above, so callers
            # always receive the tuple. Kept because callers may rely on
            # it; confirm whether errors should propagate instead.
            return data, pp_no, params
Ejemplo n.º 23
0
def get_pdb_torsion_angles(pdb_path, chain_index):
    """Return the phi and psi angles of one polypeptide of a PDB file.

    ``chain_index`` selects an entry of the polypeptide list returned by
    ``PPBuilder.build_peptides``. The result is a pair of lists
    ``(phis, psis)`` taken from that polypeptide's phi/psi list.
    """
    pdb_parser = Bio.PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_path[:-4], pdb_path)
    peptides = PPBuilder().build_peptides(structure)
    angle_pairs = peptides[chain_index].get_phi_psi_list()
    phis = [pair[0] for pair in angle_pairs]
    psis = [pair[1] for pair in angle_pairs]
    return phis, psis
def get_aa_encoded(protein_file):
    """Integer-encode the polypeptide sequences of a PDB file.

    The sequences of all polypeptides in ``protein_file`` are collected as
    strings and passed to ``int_encoding`` together with ``AA_CODES``; the
    value returned by that call is returned unchanged.

    :param protein_file: path of the PDB file; the path without its last
        four characters is used as the structure id.
    """
    # The body used to read an undefined ``pdb_path`` instead of the
    # ``protein_file`` argument.
    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
        protein_file[:-4], protein_file)
    ppb = PPBuilder()
    pdb_aas = [str(pp.get_sequence()) for pp in ppb.build_peptides(structure)]
    return int_encoding(pdb_aas, AA_CODES)
Ejemplo n.º 25
0
def get_primary_sequence(input_file):
    """Concatenate the sequences of all polypeptides of a PDB file.

    The structure id is the file path without its extension and without
    any ``'./'``. Polypeptides are built with ``aa_only=False`` and their
    sequences are appended to an initially empty string, which is returned.
    """
    structure_id = os.path.splitext(input_file)[0].replace('./', '')
    structure = PDBParser().get_structure(structure_id, input_file)
    fragments = PPBuilder().build_peptides(structure, aa_only=False)
    joined = ""
    for fragment in fragments:
        joined += fragment.get_sequence()
    return joined
Ejemplo n.º 26
0
def obtian_seq_wo_seq_file(score_file):
    """Build a chain-label to sequence dictionary from a score file.

    The first column of the tab-separated file ``./dataFile/<score_file>``
    holds labels whose first four characters are the PDB id and whose
    first six characters form the chain label; the last character of that
    label is used as the chain id. PDB files missing from
    ``./dataFile/PDB_dl`` are downloaded first.

    :param score_file: file name inside ``./dataFile/``.
    :return: dict mapping each chain label to the joined polypeptide
        sequences of that chain in the first model.
    """
    score_file = './dataFile/' + score_file
    table = pd.read_csv(score_file, sep='\t')

    # Group the chain labels by the PDB entry they belong to.
    chains_by_pdb = dict()
    for label in table.iloc[:, 0]:
        chains_by_pdb.setdefault(label[0:4], set()).add(label[0:6])

    # Local store for the downloaded PDB files.
    PDB_DIR = './dataFile/PDB_dl'
    if not os.path.exists(PDB_DIR):
        os.mkdir(PDB_DIR)

    # Fetch every entry that has no local copy yet.
    downloader = PDBList()
    for pdb_id in chains_by_pdb:
        if os.path.exists(PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'):
            continue
        downloader.retrieve_pdb_file(pdb_code=pdb_id,
                                     file_format='pdb',
                                     overwrite=False,
                                     pdir=PDB_DIR)

    # Rebuild the sequence of each requested chain.
    seq_dict = dict()
    parser = PDBParser()
    seq_builder = PPBuilder()
    for pdb_id, chain_labels in chains_by_pdb.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        model = parser.get_structure(pdb_id, pdb_file)[0]
        for label in chain_labels:
            # the last character of the label is the chain id
            protein_chain = model[label[-1]]
            pieces = [
                str(pp.get_sequence())
                for pp in seq_builder.build_peptides(protein_chain)
            ]
            # strip newlines and blanks from the joined sequence
            seq_dict[label] = "".join(pieces).replace('\n', '').replace(' ', '')

    return seq_dict
Ejemplo n.º 27
0
 def get_secondary_structure_details(self, name, pdb_file, aa_only=False):
     """Collect sequence, secondary structure and accessibility of a PDB file.

     DSSP is run on the first model with ``acc_array="Wilke"``. Field 2 of
     every DSSP entry is joined into the secondary-structure string, and
     field 3 is multiplied by the ``residues`` lookup value for field 1 to
     give the accessibility list. The sequence is the concatenation of all
     polypeptide sequences built with the given ``aa_only`` flag.

     Returns ``(name, seq, ss, sasa, structure)``.
     """
     structure = PDBParser().get_structure(name, pdb_file)
     dssp = DSSP(structure[0], pdb_file, acc_array="Wilke")
     ss = "".join([entry[2] for entry in dssp])
     sasa = []
     for entry in dssp:
         sasa.append(residues[entry[1]] * entry[3])
     fragments = PPBuilder().build_peptides(structure, aa_only=aa_only)
     seq = ""
     for fragment in fragments:
         seq += fragment.get_sequence()
     return name, seq, ss, sasa, structure
Ejemplo n.º 28
0
 def calculate_RMSD(self,
                    row,
                    source_position,
                    fragment_length,
                    aa_only=False):
     """Superimpose a target fragment on a source fragment and store the RMSD.

     The C-alpha atoms of ``fragment_length`` consecutive residues are
     taken from the source structure (starting at ``source_position``) and
     from the target structure of ``row.protein_id`` (starting at
     ``row.pos``). Positions count residues over all polypeptides of the
     structure. The target is superimposed on the source and the RMSD,
     rounded to four decimals, is written to ``row.rmsd``.

     ``row.rmsd`` is set to -1 when no source is configured, when no target
     file is found, or when the two fragments are empty or differ in length.
     """

     def ca_window(structure, start):
         # C-alpha atoms of the residue window beginning at ``start``. The
         # window is cut before the CA lookup so residues outside it need
         # no CA atom.
         peptides = PPBuilder().build_peptides(structure, aa_only=aa_only)
         window = [res for pp in peptides for res in pp]
         return [res['CA'] for res in window[start:start + fragment_length]]

     if self.args.source is None:
         setattr(row, "rmsd", -1)
         # This return was missing, so execution went on without a source.
         return
     target_position = row.pos
     source_structure = self.__get_structure__(self.args.source)
     fixed = ca_window(source_structure, source_position)

     target_file = self.get_target_file(row.protein_id)
     if target_file is None:
         setattr(row, "rmsd", -1)
         return
     target_structure = self.__get_structure__(target_file)
     moving = ca_window(target_structure, target_position)

     # find RMSD
     if not fixed or len(fixed) != len(moving):
         setattr(row, "rmsd", -1)
         return
     sup = Bio.PDB.Superimposer()
     sup.set_atoms(fixed, moving)
     sup.apply(target_structure[0].get_atoms())
     setattr(row, "rmsd", round(sup.rms, 4))
Ejemplo n.º 29
0
 def get_sequence( self, chain_id ):
     '''
         Input:
             self: Use Biopython.PDB structure which has been stored in an object variable
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             Return the amino acid sequence (single-letter alphabet!) of the
             first polypeptide built from self.structure[chain_id], as
             returned by that polypeptide's get_sequence().
     '''
     # NOTE(review): self.structure is indexed directly with chain_id,
     # which presumes it is keyed by chain id - confirm against the class.
     builder = PPBuilder()
     peptides = builder.build_peptides(self.structure[chain_id])
     first_peptide = peptides[0]
     return first_peptide.get_sequence()
Ejemplo n.º 30
0
def test_add_residue():
    """Grow a 20-residue peptide one residue at a time and verify it.

    The structure returned by each ``add_residue`` call is used for the
    next call. The result is checked twice: by the sequence of its first
    polypeptide and against the stored reference file "extended.pdb".
    """
    grown = PeptideBuilder.initialize_res("A")
    for letter in "CDEFGHIKLMNPQRSTVWY":
        grown = PeptideBuilder.add_residue(grown, letter)

    # The first polypeptide must spell out every residue that was added.
    peptides = PPBuilder().build_peptides(grown)
    first_peptide = next(iter(peptides))
    assert first_peptide.get_sequence() == "ACDEFGHIKLMNPQRSTVWY"

    # The structure must also match the saved reference structure.
    assert compare_to_reference(grown, "extended.pdb")
def test_add_residue():
    """
    Build a peptide containing all 20 amino acids.

    The return value of ``add_residue`` is discarded here, so the test
    relies on the structure being extended in place.
    """
    peptide_structure = PeptideBuilder.initialize_res("A")
    for letter in "CDEFGHIKLMNPQRSTVWY":
        PeptideBuilder.add_residue(peptide_structure, letter)

    # The first polypeptide must spell out every residue that was added.
    peptides = PPBuilder().build_peptides(peptide_structure)
    first_peptide = next(iter(peptides))
    assert first_peptide.get_sequence() == "ACDEFGHIKLMNPQRSTVWY"

    # The structure must also match the saved reference structure.
    assert compare_to_reference(peptide_structure, "extended.pdb")
Ejemplo n.º 32
0
def write_seqs(filename):
    """Extract the sequences of a structure file and save them as FASTA files.

    The file is first read with ``PDBParser``; if that raises, ``MMCIFParser``
    is tried. If both fail, an error message is written to stderr and the
    function returns without writing anything.

    Two files are written next to the input (input path without its
    extension plus a suffix):

    * ``<base>_chains.fasta``   - one record per chain of every model, made of
      the chain's polypeptides concatenated in the order PPBuilder returns them;
    * ``<base>_peptides.fasta`` - one record per polypeptide.

    :param filename: path of the structure file.
    :return: None.
    """
    struct_name = os.path.splitext(os.path.basename(filename))[0]
    is_CIF = False
    try:
        parser = PDBParser()
        structure = parser.get_structure(struct_name, filename)

    # `except Exception` instead of a bare `except` so that KeyboardInterrupt
    # and SystemExit are not mistaken for a parsing failure.
    except Exception:
        try:
            parser = MMCIFParser()
            structure = parser.get_structure(struct_name, filename)
            is_CIF = True

        except Exception:
            sys.stderr.write("ERROR: File {} is not a proper/supported protein structure file.\n".format(filename))
            return

    # The metadata source depends on which parser succeeded.
    if is_CIF:
        name, organism, resolution = get_info_from_cif_dict(filename)
    else:
        name, organism, resolution = get_info_from_header(structure)

    description = "| {} | {} | Resolution {:.2f} A".format(name, organism, resolution)

    ppb = PPBuilder()

    chain_seqrecord_list = []
    peptide_seqrecord_list = []
    for model in structure:
        for chain in model:
            base_id = "{}.{:d}_{}".format(struct_name, model.id, chain.id)
            chain_seq = Seq("")
            for pp_id, pp in enumerate(ppb.build_peptides(chain)):
                # Fetch the peptide sequence once and use it for both records.
                pp_seq = pp.get_sequence()
                chain_seq += pp_seq
                peptide_seqrecord_list.append(SeqRecord(
                    pp_seq,
                    id="{}.{:d}".format(base_id, pp_id),
                    description=description))

            chain_seqrecord_list.append(SeqRecord(
                chain_seq,
                id=base_id,
                description=description))

    base_output_name = os.path.splitext(filename)[0]

    SeqIO.write(chain_seqrecord_list, "{}_chains.fasta".format(base_output_name), "fasta")
    SeqIO.write(peptide_seqrecord_list, "{}_peptides.fasta".format(base_output_name), "fasta")
Ejemplo n.º 33
0
    def getChainSeq(chain: Bio.PDB.Structure.Structure) -> str:
        '''
        Return the sequence of a protein chain as a plain string.

        Only the first polypeptide that PPBuilder builds from `chain` is used.
        If building or reading that polypeptide raises, the placeholder "X" is
        returned instead, so the caller gets a token with ~0.0 sequence
        identity to every real sequence.

        ASSUMPTIONS:
        -------------------
        - The chain objects are disjoint (in sequence) from other chain objects.
        - A structure object is not passed (may error-out!)
        '''
        ppb = PPBuilder()

        try:
            seq = str(ppb.build_peptides(chain)[0].get_sequence())
        # `except Exception` rather than a bare `except`: the best-effort
        # fallback is kept, but KeyboardInterrupt/SystemExit still propagate.
        except Exception:
            seq = "X"  # e.g. the chain does not exist or yields no polypeptide
        return seq
Ejemplo n.º 34
0
def pdb_extract_chain_seqs(struct):
	"""
	Takes a Bio.PDB.Structure object, returns a dictionary
	of chains and sequences (eg {'A':'MTSSLGRF', 'B':'MSLQRGFIN'}
	(for structures with multiple models, like NMR structures,
	only the first model is considered).

	A chain that PPBuilder splits into several polypeptides is stored as
	the concatenation of all of them, in the order returned, with no gap
	marker in between. Chains that yield no polypeptide get no entry.
	"""

	chains = {}
	ppb = PPBuilder()
	for chain in struct[0]:  # takes model 1
		# Collect every fragment: assigning inside a per-peptide loop would
		# keep only the last fragment of the chain.
		fragments = [str(pp.get_sequence()) for pp in ppb.build_peptides(chain, aa_only=False)]
		if fragments:
			chains[chain.id] = "".join(fragments)

	return chains
Ejemplo n.º 35
0
def get_coords(pdb_path, chain_index):
    """Return the C-alpha coordinates of one polypeptide of a PDB file.

    :param pdb_path: path of the PDB file; the path without its last four
        characters is used as the structure id.
    :param chain_index: index into the list returned by
        ``PPBuilder.build_peptides`` for the whole structure. It selects a
        polypeptide, which is not necessarily a whole chain.
    :return: list with one numpy array per C-alpha atom, or an empty list
        when the C-alpha list cannot be obtained (a message is printed).
    :raises IndexError: if `chain_index` is out of range for the peptide list.
    """
    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
        pdb_path[:-4], pdb_path)
    peptides = PPBuilder().build_peptides(structure)
    peptide = peptides[chain_index]
    try:
        ca_list = peptide.get_ca_list()
    # `except Exception` rather than a bare `except`, so Ctrl-C is not
    # swallowed by the best-effort fallback.
    except Exception:
        print('get_coords chain.get_ca_list() exception', chain_index)
        return []
    return [np.asarray(ca.get_coord()) for ca in ca_list]
Ejemplo n.º 36
0
def get_sequence_position(structure, chain_id, start_position, end_position):
    """Translate a position on a chain into an offset over all polypeptides.

    The polypeptides of `structure` are scanned in the order PPBuilder
    returns them. The lengths of those that come before the first polypeptide
    of `chain_id` are summed; the result is that sum plus the offset of
    `start_position` from the start of that first matching polypeptide.

    :param structure: structure handed to ``PPBuilder.build_peptides``.
    :param chain_id: id of the chain the positions refer to.
    :param start_position: start position, converted with ``int()``.
    :param end_position: end position, converted with ``int()``; it is only
        validated, the returned value depends on `start_position` alone.
    :return: the offset described above, or -1 when no polypeptide belongs
        to `chain_id`.
    """
    builder = PPBuilder()
    peptides = builder.build_peptides(structure, aa_only=False)
    pps = [EPolyPeptide(pp) for pp in peptides]
    seq_leftover = 0
    start = None
    end = None
    for pp in pps:
        if not pp.chain_id == chain_id:
            seq_leftover += len(pp)
            continue
        start = int(start_position) - pp.start
        end = int(end_position) - pp.start
        break

    # Compare with None explicitly: an offset of 0 is a valid position and
    # must not be confused with "chain not found".
    if start is None or end is None:
        return -1
    return seq_leftover + start
Ejemplo n.º 37
0
def getSequence(): # Get the sequence of a specific chain
	"""Ask for a chain id and print the upper-cased sequence of that chain.

	The structure is read from the file named by the global `pdb_name`; the
	text before the first '.' of that name is used as the structure id.
	Every chain whose id differs from the answer is detached from its
	model, then the sequences of all remaining polypeptides are
	concatenated and printed.
	"""
	parser = PDBParser()
	nameStruct = pdb_name.partition('.')[0]
	structure = parser.get_structure(nameStruct, pdb_name)
	seq = ''

	what_chain = raw_input('For what chain do you want the sequence : ')

	for model in structure:
		# Iterate over a snapshot of the chains: detach_child() changes the
		# model's children, and removing items from a collection while
		# iterating over it makes the loop skip elements.
		for chain in list(model):
			if chain.id != what_chain:
				model.detach_child(chain.id)

	ppb = PPBuilder()
	for pp in ppb.build_peptides(structure):
		seq = seq + pp.get_sequence()
	seq = seq.upper()
	print(seq)
def getPDBInfo(pdb, pdbpath):
    """Load ``<pdb>_ren.pdb`` and return its structure, residues and sequence.

    The chain id is taken from the base name of `pdb` (text after the last
    "/" and before the first "."): the second "_"-separated field when there
    is one, otherwise the fifth character of the base name.

    :param pdb: PDB name; ``pdb + "_ren.pdb"`` is looked up inside `pdbpath`.
    :param pdbpath: directory that holds the ``_ren.pdb`` file.
    :return: tuple ``(structure, residue_list, pdbseq)`` where `residue_list`
        holds the residues of the selected chain in the first model and
        `pdbseq` is the concatenation of that chain's polypeptide sequences.
    :raises IOError: if the ``_ren.pdb`` file does not exist.
    """
    # PDB descriptors
    name_fields = pdb.split("/")[-1].split(".")[0].split("_")
    if len(name_fields) > 1:
        chain = name_fields[1]
    else:
        # no "_" delimiters in pdb name
        chain = name_fields[0][4]

    # Fail with a clear message instead of hitting an unbound `structure`
    # further down when the file is missing.
    renumbered_path = os.path.join(pdbpath, pdb + "_ren.pdb")
    if not os.path.exists(renumbered_path):
        raise IOError("PDB file not found: " + renumbered_path)

    # Biopython parsers
    parser = BP.PDBParser()
    ppb = PPBuilder()

    structure = parser.get_structure(pdb, renumbered_path)
    residue_list = BP.Selection.unfold_entities(structure[0][chain], 'R')

    # the build_peptides method has an option for treating non-standard amino acids
    pdbseq = ""
    for pp in ppb.build_peptides(structure[0][chain], aa_only=False):
        pdbseq += (pp.get_sequence())
    return structure, residue_list, pdbseq
Ejemplo n.º 39
0
def parse_structure(path):
    """
    Read a PDB/mmCIF file with Biopython and prepare it for the calculation.

    The parser is chosen from the file extension ('pdb'/'ent' or 'cif').
    The parsed structure is then modified in place: only the first model is
    kept, disordered atoms are replaced by their selected child, HETATM and
    solvent residues are detached and hydrogens are removed. Chain breaks
    are reported on stderr but do not stop the run.

    Returns the tuple (structure, number of distinct chain ids, number of
    residues left after the HETATM/solvent removal).

    Raises IOError for an unsupported extension, ValueError when a
    non-standard amino acid is found, and a plain Exception wrapping any
    error raised by the parser.
    """

    print('[+] Reading structure file: {0}'.format(path))
    file_name = os.path.basename(path)
    name_fields = file_name.split('.')
    struct_id = '.'.join(name_fields[:-1])
    extension = name_fields[-1]

    pdb_like = ('pdb', 'ent')
    if extension not in pdb_like and extension != 'cif':
        raise IOError('[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'.format(extension))

    # After the guard above, anything that is not PDB-like is 'cif'.
    if extension in pdb_like:
        reader = PDBParser(QUIET=1)
    else:
        reader = MMCIFParser()

    try:
        s = reader.get_structure(struct_id, path)
    except Exception as e:
        print('[!] Structure \'{0}\' could not be parsed'.format(struct_id), file=sys.stderr)
        raise Exception(e)

    # Keep first model only
    if len(s) > 1:
        print('[!] Structure contains more than one model. Only the first one will be kept')
        first_id = s[0].id
        surplus_ids = [m.id for m in s.child_list if m.id != first_id]
        for model_id in surplus_ids:
            s.detach_child(model_id)

    # Double occupancy check: swap each disordered atom for its selected child
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        owner = atom.parent
        chosen = atom.selected_child
        chosen.altloc = ' '
        chosen.disordered_flag = 0
        owner.detach_child(atom.id)
        owner.add(chosen)

    # Remove HETATMs and solvent; anything else must be a standard amino acid
    for res in list(s.get_residues()):
        het_flag = res.id[0][0]
        if het_flag == 'W' or het_flag == 'H':
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))
    n_res = len(list(s.get_residues()))

    # Remove Hydrogens
    for atom in list(s.get_atoms()):
        if atom.element == 'H':
            atom.parent.detach_child(atom.name)

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        for i_pp, pp in enumerate(peptides):
            print('\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'.format(i_pp, pp[0], pp[-1]), file=sys.stderr)
        #raise Exception('Calculation cannot proceed')

    return (s, n_chains, n_res)
pdb_file = gzip.open(os.path.join(input_path, filename))
pdb_parser = Bio.PDB.PDBParser(pdb_file)

## to get structural data
pdb_id = pdb_filename[7:11]
structure = pdb_parser.get_structure(pdb_id, pdb_file)

print structure


# Extract sequence from coordinate information
# Amino acid residues present in SEQRES but that doesn't have coordinate information
# are listed in REMARK 465
ppb = PPBuilder()        # PPBuilder uses C--N distance to find polypeptides.
for pp in ppb.build_peptides(structure):
    sequence = pp.get_sequence()
    print "This is the polypeptide sequence of %s" %pdb_id
    print sequence
    print "length of sequence: ", len(sequence)

# OR extract sequence from SEQRES - ?

# when I aligned the sequence output of 1A27 here and the fasta sequence, I see 4 residues missing
# these 4 residues are reported as REMARK 465 MISSING RESIDUES (THE FOLLOWING RESIDUES WERE NOT LOCATED
# IN THE EXPERIMENT)
# does this mean unresolved or missing in protein construct?



Ejemplo n.º 41
0
def create_residue_lists(pdb, chain):
	print pdb, chain
	res_lists = []
	aa_list = ""
	try:
		ppb = PPBuilder()	
		p=PDB.PDBParser(QUIET = True)
		s=p.get_structure('X', "../foldx_setup/repaired_pdbs/RepairPDB_" + pdb + "_" + chain + ".pdb")
		pp = ppb.build_peptides(s)[0]
		seq = str(pp.get_sequence())
		
		#Get the structures
		ref_struct = s[0]
		ref_chain = ref_struct[chain]
		ref_residues = []
		ref_res_nums = []
		for res in ref_chain:
			ref_residues.append( res.resname )
			ref_res_nums.append(res.id[1])				
	except KeyError:
		print "Something is wrong with the mapped residues"
		#continue
	seq_array = ddG_var_helper.convert_to_one_letter_code(ref_residues)
	
	count = 0
	i = 0
	while(i < len(seq_array)):	
		if (i < 101): #These statements split up the sequence into 100 aa chunks for performing a Position Scan
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 101):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 201 and i>101):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 201):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 301 and i>201):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 301):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 401 and i>301):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 401):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 501 and i>401):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)			
		elif(i == 501):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		if (i < 601 and i>501):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 601):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		if(i < 701 and i>601):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 701):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		if (i < 801 and i>701):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 801):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		if (i < 901 and i>801):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 901):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 1001 and i>901):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 1001):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 1101 and i>1001):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 1101):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 1201 and i>1101):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 1201):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
			aa_list = ""
		elif(i < 1301 and i>1201):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			if(i == len(seq_array) - 1):
				aa_list = aa_list + ";"	
				res_lists.append(aa_list)
		elif(i == 1301):
			aa_list = aa_list + ","
			res_num = ref_res_nums[i]
			aa_list = aa_list + seq_array[i] + chain + str(res_num) + "a"
			aa_list = aa_list + ";"	
			res_lists.append(aa_list)
		i = i + 1
	return res_lists
Ejemplo n.º 42
0
 def get_sequence(self):
     """Return the sequence of the first polypeptide of the selected chain.

     The chain is looked up as ``self.structure[0][self.chain]``. Only the
     first polypeptide built by PPBuilder is used; later ones are ignored.
     """
     builder = PPBuilder()
     selected_chain = self.structure[0][self.chain]
     peptides = builder.build_peptides(selected_chain)
     return peptides[0].get_sequence()
Ejemplo n.º 43
0
'''

Extract the protein sequence from a PDB chain.

-----------------------------------------------------------
(c) 2013 Allegra Via and Kristian Rother
    Licensed under the conditions of the Python License

    This code appears in section 21.4.2 of the book
    "Managing Biological Data with Python".
-----------------------------------------------------------
'''

from Bio import PDB
from Bio.PDB.Polypeptide import PPBuilder

# Parse the local file "dn/pdb2dn1.ent" into a structure with the id "2DN1".
parser = PDB.PDBParser()
structure = parser.get_structure("2DN1", "dn/pdb2dn1.ent")
# Build the polypeptides of the whole structure and print the sequence of
# each one on its own line.
ppb = PPBuilder()
peptides = ppb.build_peptides(structure)
for pep in peptides:
    print pep.get_sequence()
    
# Open the 1kx5 structure, select chains A-H and write the selection to
# xen_nucl.pdb.
# NOTE(review): `openModels` and `rc` are not defined in this snippet;
# presumably they are UCSF Chimera's model list and runCommand -- confirm.
nucl_xen=openModels.open('1kx5.pdb',type='PDB')
# nucl_yeast=openModels.open('1id3.pdb',type='PDB')
# h2a_xen=Seq(str(nucl[0].sequence('C')))
# h2a_yeast=Seq(str(nuclZ[0].sequence('C')))
rc('select :.A :.B :.C :.D :.E :.F :.G :.H')
rc('write selected format pdb #0 xen_nucl.pdb')
# rc('write selected format pdb #1 h2a_yeast_xray.pdb')

#generate alignments
#Biopython extracts seqs from pdb
p = PDBParser(PERMISSIVE=1)
s = p.get_structure('1kx5', '1kx5.pdb')
ppb=PPBuilder()
# Map each chain id to the sequence of the FIRST polypeptide of that chain
# in the first model.
seqs_xen=dict()
for i in ['A','B','C','D','E','F','G','H']:
	seqs_xen[i]=ppb.build_peptides(s[0][i])[0].get_sequence()

#Here we manually input the truncated versions of yeast sequences
# Chains A/E and B/F are given identical sequences below.
seqs_yeast=dict()
#CSE4 - H3
#SSKQQWVSSAIQSDSSGRSLSNVNRLAGDQQSINDRALSLLQRTRATKNLFPRREERRRYESSKSDLDIETDYEDQAGNLEIETENEEEAEMETEVPAPVRTHSYALDRYVRQKRREKQRKQSLKR
#VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI
#VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI
seqs_yeast['A']=Seq('VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI')
seqs_yeast['E']=Seq('VEKKYTPSELALYEIRKYQRSTDLLISKIPFARLVKEVTDEFTTKDQDLRWQSMAIMALQEASEAYLVGLLEHTNLLALHAKRITIMKKDMQLARRIRGQFI')
#H4
seqs_yeast['B']=Seq('KRHRKILRDNIQGITKPAIRRLARRGGVKRISGLIYEEVRAVLKSFLESVIRDSVTYTEHAKRKTVTSLDVVYALKRQGRTLYGFGG')
seqs_yeast['F']=Seq('KRHRKILRDNIQGITKPAIRRLARRGGVKRISGLIYEEVRAVLKSFLESVIRDSVTYTEHAKRKTVTSLDVVYALKRQGRTLYGFGG')
#H2A
#MSGGKGGKAGSAA KASQ SRSAKAGLTFPVGRVHRLLRRGNYAQRIGSGAPVYLTAVLEYLAAEILELAGNAARDNKKTRIIPRHLQLAIRNDDELNKLLGNVTIAQGGVLPNIHQNLLPK KSAKATKASQEL
# SGRGKQGGKTR  AKAK TRSSRAGLQFPVGRVHRLLRKGNYAERVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLPK KTESSKSKSK
Ejemplo n.º 45
0
def validate_structure(s, selection=None, clean=True):
    """Clean a parsed structure in place and warn about chain breaks.

    Steps, in order: keep only the first model; detach the chains that are
    not named in `selection`; replace disordered atoms by their selected
    child; when `clean` is true, detach HETATM/solvent residues, reject
    non-standard amino acids and remove hydrogens; finally compare the
    number of polypeptides with the number of remaining chains and log a
    warning listing the fragments when they differ.

    :param s: structure to validate; it is modified in place.
    :param selection: optional iterable of strings, each a comma-separated
        list of chain ids to keep.
    :param clean: whether to remove HETATMs, solvent and hydrogens.
    :return: the same structure object `s`.
    :raises ValueError: if a selected chain is not in the structure, or if
        `clean` is true and a non-standard amino acid is found.
    """
    # setup logging
    logger = logging.getLogger('Prodigy')

    # Keep first model only
    if len(s) > 1:
        logger.warning('[!] Structure contains more than one model. Only the first one will be kept')
        model_one = s[0].id
        for m in s.child_list[:]:
            if m.id != model_one:
                s.detach_child(m.id)

    # process selected chains
    chains = list(s.get_chains())
    chain_ids = set([c.id for c in chains])

    if selection:
        sel_chains = []
        # Match selected chain with structure
        for sel in selection:
            for c in sel.split(','):
                sel_chains.append(c)
                if c not in chain_ids:
                    raise ValueError('Selected chain not present in provided structure: {0}'.format(c))

        # Remove unselected chains
        for c in chains:
            if c.id not in sel_chains:
                c.parent.detach_child(c.id)

    # Double occupancy check
    for atom in list(s.get_atoms()):
        if atom.is_disordered():
            residue = atom.parent
            sel_at = atom.selected_child
            sel_at.altloc = ' '
            sel_at.disordered_flag = 0
            residue.detach_child(atom.id)
            residue.add(sel_at)

    if clean:
        # Remove HETATMs and solvent
        for res in list(s.get_residues()):
            if res.id[0][0] == 'W' or res.id[0][0] == 'H':
                chain = res.parent
                chain.detach_child(res.id)
            elif not is_aa(res, standard=True):
                raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))

        # Remove Hydrogens
        for atom in list(s.get_atoms()):
            if atom.element == 'H':
                residue = atom.parent
                residue.detach_child(atom.name)

    # Detect gaps and compare with no. of chains
    pep_builder = PPBuilder()
    peptides = pep_builder.build_peptides(s)
    n_peptides = len(peptides)

    # Count the chains that are still attached: `chain_ids` was collected
    # before the unselected chains were removed and would give a wrong
    # reference count whenever a selection dropped chains.
    n_chains = len(set([c.id for c in s.get_chains()]))

    if n_peptides != n_chains:
        message = '[!] Structure contains gaps:\n'
        for i_pp, pp in enumerate(peptides):
            message += '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > ' \
                       '{2.parent.id} {2.resname}{2.id[1]}\n'.format(i_pp, pp[0], pp[-1])
        logger.warning(message)
        # raise Exception(message)

    return s
Ejemplo n.º 46
0
def create_sequence_from_file(chain_pdb, missing_residues, quiet_parser=False):
    """ Read a PDB file and create a sequence and mismask to represent
    its content.

    For every polypeptide, all residues except the first and the last one
    produce one data row ``[0, phi, psi, amino acid index, secondary
    structure index, peptide bond conformation]`` and one mismask row of
    five flags. Values that cannot be determined stay 0 in the data row and
    are flagged as missing in the mismask row.

    @param chain_pdb: The PDB file to be read.
    @type chain_pdb: str

    @param missing_residues: A dictionary with the missing residues.
    @type missing_residues: dict

    @param quiet_parser: Disable PDBParser warnings.
    @type quiet_parser: bool

    @rtype: tuple(list, list)
    @return: Two single-element lists: the data rows and the mismask rows,
        each stored as one numpy array.

    @raise TorusDBNBuildPolypeptideException: If no polypeptide can be built
        from the file.
    @raise TorusDBNException: If no data row could be produced.
    """
    sequences = []
    mismasks = []

    output_data = []
    output_mismask = []

    parser = PDBParser(QUIET=quiet_parser)
    structure = parser.get_structure("X", chain_pdb)

    # The returned object is not used directly.
    # NOTE(review): presumably this call is what fills res.xtra["SS_DSSP"],
    # which is read further down -- confirm.
    DSSP(model=structure[0], pdb_file=chain_pdb)

    # Loop over residues in peptides
    ppb = PPBuilder()
    pp_list = ppb.build_peptides(structure[0])
    chain_list = structure[0].get_list()

    if len(pp_list) == 0:
        raise TorusDBNBuildPolypeptideException(
            "Could not create a list of Polypeptide objects from the file %s."
            % (chain_pdb)
        )

    pp_chains, chain_missing_residues = _get_pp_with_chain_break(
        chain_pdb, pp_list, chain_list, missing_residues)

    for pp_index, pp in enumerate(pp_chains):
        phi_psi_list = pp.get_phi_psi_list()
        # Separate name, so the `missing_residues` argument is not shadowed.
        pp_missing = chain_missing_residues[pp_index]

        # The first and the last residue of each polypeptide are skipped.
        for i in xrange(1, len(phi_psi_list)-1):
            seq = [0] * 6
            mism = [eMISMASK.MOCAPY_HIDDEN] + 4 * [eMISMASK.MOCAPY_OBSERVED]

            # Amino acid
            res = pp[i]
            res_name = res.get_resname()
            res_index = res.get_id()[1]

            seq[3] = three_to_index(res_name)

            if res_index in pp_missing:
                mism[1] = eMISMASK.MOCAPY_MISSING # angles unknown
                mism[3] = eMISMASK.MOCAPY_MISSING # ss unknown
                mism[4] = eMISMASK.MOCAPY_MISSING # cis unknown
            else:
                # Secondary Structure
                try:
                    ss = res.xtra["SS_DSSP"]
                    seq[4] = dssp_to_index(ss)
                # `except Exception` rather than a bare `except`, so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                except Exception:
                    mism[3] = eMISMASK.MOCAPY_MISSING # ss unknown

                # Angles
                if None in phi_psi_list[i]:
                    # Previous or next residue missing, therefore angles are
                    # unknown
                    mism[1] = eMISMASK.MOCAPY_MISSING
                else:
                    seq[1:3] = phi_psi_list[i]

                # Cis/Trans information
                if (res_index - 1) in pp_missing:
                    mism[4] = eMISMASK.MOCAPY_MISSING # cis unknown
                else:
                    try:
                        seq[5] = _get_peptide_bond_conformation(res, pp[i-1])
                    except TorusDBNException:
                        mism[4] = eMISMASK.MOCAPY_MISSING # cis unknown

            output_data.append(seq)
            output_mismask.append(mism)

    if output_data and output_mismask:
        sequences.append(numpy.array(output_data))
        mismasks.append(numpy.array(output_mismask, dtype = numpy.uint))
    else:
        raise TorusDBNException(
            "Could not create training data from the file %s."
            % (chain_pdb)
        )
    return sequences, mismasks
Ejemplo n.º 47
0
        io = PDBIO()
        io.set_structure(structure)
        io.save(myPDBfile + ".pdb")

except:
    print ("you structure is not available in the PDB")
    parser = PDB.PDBParser()
    structure = PDB.PDBParser().get_structure(myPDBfile, myPDBfile + ".pdb")

chains = [chain for chain in structure.get_chains()]
print (chains)
ppb = PPBuilder()
for chain in chains:
    print (chain)
    print (chain.get_id())
    for chainseq in ppb.build_peptides(chain):
        print (chainseq.get_sequence())
        mySeq = chainseq.get_sequence()

        myFastaA = ">" + myPDBfile + "_" + chain.get_id() + "\n" + chainseq.get_sequence()
        print (myFastaA)
        myFastaAfile = open(myPDBfile + "_" + chain.get_id() + ".fasta", "w")
        myFastaAfile.write(">" + myPDBfile + "_" + chain.get_id() + "\n" + str(mySeq))
        myFastaAfile.close()

        myFastafiles = os.listdir(".")
        myFastaIter = iter(myFastafiles)

        for myFasta in myFastaIter:
            if myPDBfile + "_" + chain.get_id() in myFasta:
                print (myFasta)