def parse_pdb(self):
        """Parse the configured PDB source and record one sequence per chain.

        ``self.pdb_file`` is preferred over ``self.pdb_filename``; when both
        are falsy nothing is parsed and ``None`` is returned.  Otherwise the
        first model of the parsed structure is returned and
        ``self.pdb_seq[chain.id]`` is filled with the one-letter sequence of
        every chain.  Residues whose name cannot be converted are skipped.
        """
        source = self.pdb_file or self.pdb_filename
        if not source:
            return None

        # Bio.PDB hierarchy: model -> chain -> residue -> atom; keep model 0.
        first_model = PDBParser(PERMISSIVE=True).get_structure('ref', source)[0]

        for chain in first_model:
            self.pdb_seq[chain.id] = Seq('')
            for residue in chain:
                if residue.resname == "HID":
                    # HID is recorded as histidine.
                    self.pdb_seq[chain.id] += polypeptide.three_to_one('HIS')
                    continue
                try:
                    self.pdb_seq[chain.id] += polypeptide.three_to_one(residue.resname)
                except Exception:
                    # Best effort: names without a one-letter code are ignored.
                    continue
        return first_model
    def parse_pdb(self):
        """Read the reference PDB quietly and build per-chain sequences.

        Uses ``self.pdb_file`` if set, else ``self.pdb_filename``; returns
        ``None`` when neither is set.  On success returns model 0 of the
        structure and stores the one-letter sequence of each chain in
        ``self.pdb_seq`` keyed by chain id.
        """
        pdb_source = self.pdb_file if self.pdb_file else self.pdb_filename
        if not pdb_source:
            return None

        parser = PDBParser(PERMISSIVE=True, QUIET=True)
        pdb_struct = parser.get_structure('ref', pdb_source)[0]

        # Bio.PDB reads a PDB as model -> chain -> residue -> atom.
        for chain in pdb_struct:
            chain_key = chain.id
            self.pdb_seq[chain_key] = Seq('')
            for res in chain:
                if res.resname != "HID":
                    try:
                        self.pdb_seq[chain_key] += polypeptide.three_to_one(
                            res.resname)
                    except Exception:
                        # Residues that cannot be converted are left out.
                        continue
                else:
                    # HID is stored as histidine.
                    self.pdb_seq[chain_key] += polypeptide.three_to_one('HIS')
        return pdb_struct
Example #3
0
def get_adjacency_matrix(pdb_id, pdb_file, chain_id='A', cutoff=8.0):
    """Build a beta-carbon contact (adjacency) matrix for one chain.

    Parameters:
        pdb_id: identifier handed to the Biopython parser.
        pdb_file: PDB file name (or handle) to parse.
        chain_id: chain of the first model to analyse (default ``'A'``).
        cutoff: largest CB-CB distance, in the units of the file's
            coordinates, that still counts as a contact (default ``8.0``).

    Returns:
        A square ``pandas.DataFrame`` whose rows and columns are labelled with
        the one-letter codes of the residues that carry a ``CB`` atom.  A cell
        is ``1.0`` when the two beta carbons are at most ``cutoff`` apart and
        ``0.0`` otherwise; the diagonal is always ``0.0``.
    """
    parser = PDBParser()
    structure = parser.get_structure(pdb_id, pdb_file)

    # Only residues that carry a beta carbon take part.
    amino_acids = [res for res in structure[0][chain_id] if 'CB' in res]

    # Starts as all zeros, so only contacts need to be written.
    adj_values = np.zeros(shape=(len(amino_acids), len(amino_acids)))

    for i, r1 in enumerate(amino_acids):
        cb_i = r1['CB']
        # The distance is symmetric: visit each unordered pair once.
        for j, r2 in enumerate(amino_acids[i + 1:], start=i + 1):
            if cb_i - r2['CB'] <= cutoff:
                adj_values[i, j] = 1.0
                adj_values[j, i] = 1.0

    labels = [polypep.three_to_one(r.get_resname()) for r in amino_acids]
    return pd.DataFrame(index=labels, columns=labels, data=adj_values)
Example #4
0
    def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
        """Return the CA, N and O atoms of the reference residue matching *fragment*.

        The reference structure is scanned chain by chain for the first
        residue whose generic number equals the fragment residue's
        ``display_generic_number.label``.

        With ``use_similar`` the residue is accepted only if some rule in
        ``self.similarity_rules`` lists both residues' one-letter codes under
        the ``"target_residue"`` column and the fragment's interaction slug
        under the ``"interaction_type"`` column (columns are resolved through
        ``self.similarity_dict``).

        Returns ``[CA, N, O]`` for the first accepted residue, or ``[]`` when
        none qualifies.  Any residue that raises while being examined is
        skipped (best effort); the reason is logged at debug level.
        """
        for chain in ref_pdbio_struct:
            for res in chain:
                try:
                    target = fragment.rotamer.residue
                    label = target.display_generic_number.label
                    gn = self.get_generic_number(res)
                    if gn != label:
                        continue

                    res_code = polypeptide.three_to_one(res.resname)
                    logger.info("Ref {}:{}\tFragment {}:{}".format(
                        res_code, gn, target.amino_acid, label))

                    if use_similar:
                        residue_col = self.similarity_dict["target_residue"]
                        interaction_col = self.similarity_dict["interaction_type"]
                        is_similar = any(
                            res_code in rule[residue_col]
                            and target.amino_acid in rule[residue_col]
                            and fragment.interaction_type.slug in rule[interaction_col]
                            for rule in self.similarity_rules)
                        if not is_similar:
                            continue

                    return [res['CA'], res['N'], res['O']]
                except Exception as exc:
                    # Deliberately tolerant, but leave a trace for debugging.
                    logger.debug("Skipping reference residue %r: %s", res, exc)
                    continue
        return []
Example #5
0
def get_distance_matrix(pdb_id, pdb_file, chain_id='A'):
    """Build the pairwise beta-carbon distance matrix for one chain.

    Parameters:
        pdb_id: identifier handed to the Biopython parser.
        pdb_file: PDB file name (or handle) to parse.
        chain_id: chain of the first model to analyse (default ``'A'``).

    Returns:
        A square ``pandas.DataFrame`` labelled on both axes with the
        one-letter codes of the residues that carry a ``CB`` atom.  Each cell
        holds the CB-CB distance of the pair; the diagonal is ``0``.
    """
    parser = PDBParser()
    structure = parser.get_structure(pdb_id, pdb_file)

    # Only residues that carry a beta carbon take part.
    amino_acids = [res for res in structure[0][chain_id] if 'CB' in res]

    # Zero-initialised, so the diagonal needs no explicit write.
    dist_values = np.zeros(shape=(len(amino_acids), len(amino_acids)))

    for i, r1 in enumerate(amino_acids):
        cb_i = r1['CB']
        # The distance is symmetric: compute each unordered pair once.
        for j, r2 in enumerate(amino_acids[i + 1:], start=i + 1):
            dist = cb_i - r2['CB']
            dist_values[i, j] = dist
            dist_values[j, i] = dist

    labels = [polypep.three_to_one(r.get_resname()) for r in amino_acids]
    return pd.DataFrame(index=labels, columns=labels, data=dist_values)
Example #6
0
    def parse_structure(self, pdb_struct):
        """
        Extract the sequence and build the residue dictionary of every chain.

        Bio.PDB reads a PDB in the cascade model->chain->residue->atom, so
        *pdb_struct* is expected to iterate over chains.  For each chain:

        * ``self.residues[chain.id]`` maps residue number -> MappedResidue
          for every residue whose name is in ``self.residue_list``;
        * ``self.pdb_seq[chain.id]`` is the one-letter sequence ordered by
          residue number;
        * each MappedResidue gets ``pos_in_aln`` set to its 1-based rank.

        HID is treated as HIS.  Previously the HID branch computed the
        one-letter code and then discarded it, so those residues were
        silently missing from both the dictionary and the sequence.
        """
        for chain in pdb_struct:
            self.residues[chain.id] = {}
            self.pdb_seq[chain.id] = Seq('')

            for res in chain:
                # In Bio.PDB the residue id is a tuple of
                # (hetatm flag, residue number, insertion code).
                resname = 'HIS' if res.resname == "HID" else res.resname
                if resname not in self.residue_list:
                    continue
                self.residues[chain.id][res.id[1]] = MappedResidue(
                    res.id[1], polypeptide.three_to_one(resname))

            ordered_numbers = sorted(self.residues[chain.id])
            self.pdb_seq[chain.id] = ''.join(
                self.residues[chain.id][number].name
                for number in ordered_numbers)

            for pos, number in enumerate(ordered_numbers, start=1):
                self.residues[chain.id][number].pos_in_aln = pos
def aa_to_index(aa):
    """
    Map a three-letter amino acid name to its BioPython index.

    :param aa: Three character amino acid name.
    :returns: ``Polypeptide.three_to_index(aa)`` for standard amino acids,
        20 for anything else.
    """
    is_standard = Polypeptide.is_aa(aa, standard=True)
    return Polypeptide.three_to_index(aa) if is_standard else 20
Example #8
0
    def _calc_residue_dist(self, residue_one, residue_two, dist_atoms='CA'):
        """Return the distance between the chosen atoms of two residues.

        The atom named *dist_atoms* is used for each residue that contains
        it; a residue lacking that atom falls back to ``'CA'``.  ``np.nan`` is
        returned when either argument is not an amino acid or when a needed
        atom is missing.
        """
        if not (Polypeptide.is_aa(residue_one) and Polypeptide.is_aa(residue_two)):
            return np.nan

        name_one = dist_atoms if dist_atoms in residue_one else 'CA'
        name_two = dist_atoms if dist_atoms in residue_two else 'CA'

        try:
            delta = residue_one[name_one].coord - residue_two[name_two].coord
        except KeyError:
            # Even the fallback atom is absent.
            return np.nan

        squared_length = np.sum(delta * delta)
        return np.sqrt(squared_length)
def MutationsDict(file, positions=None):
    """Collect, per residue of a pdb file, every single-residue substitution.

    Positions without an amino-acid residue in the pdb file are ignored.

    Parameters:
        file (string): pdb file to get mutations from
        positions: list of tuples of the form (chain, first, last) giving the
                   inclusive residue-number window to mutate in that chain.
                   If None (or empty), every position of every chain is used.

    Returns:
        dict keyed by aa + chain + position; each value is the list of
        aa + chain + position + mutated_aa strings for all other amino acids.

    """
    # One-letter amino acid alphabet used as substitution targets
    alphabet = list(Bio.PDB.Polypeptide.aa1)
    # Model of the original pdb file
    model = it.Pmolecule(file).model
    mutations = dict()

    def _collect(chain, chain_id, window=None):
        # Add the substitutions of every amino-acid residue of one chain.
        for residue in chain:
            if not pp.is_aa(residue):
                continue
            code = pp.three_to_one(residue.get_resname())
            position = residue.id[1]
            prefix = code + chain_id + str(position)
            if window is not None:
                first, last = window
                # Keep only positions between first and last (inclusive)
                if position not in range(first, last + 1):
                    continue
            mutations[prefix] = [
                prefix + target for target in alphabet if target != code
            ]

    if positions:
        for chain_id, first, last in positions:
            # First chain of the model carrying the requested id
            selected = next(candidate for candidate in model.get_chains()
                            if candidate.id == chain_id)
            _collect(selected, chain_id, (first, last))
    else:
        for chain in model.get_chains():
            _collect(chain, chain.id)
    return mutations
Example #10
0
	def definePeptideChain(self):
		"""Find the peptide chain and store its standard residues.

		If ``self.__table['chain_antigen'][0]`` is empty, the chain whose first
		built polypeptide is shortest is chosen (ties go to the later chain)
		and written back to the table; otherwise the stated chain is used.

		Returns 0 (after reporting through ``printerr``) when the chosen chain
		has more than 30 residues, else 1 after filling ``self.__peptide`` and
		the ``'peptide'`` entry of ``self.__regions_res`` with the chain's
		standard amino-acid residues.
		"""
		if not self.__table['chain_antigen'][0]:
			# A numeric sentinel: comparing an int with the former string
			# placeholder raises TypeError on Python 3.
			shortest = float('inf')
			builder = PPBuilder()
			for i in self.__chains:
				buf = len(builder.build_peptides(self.__struct[0][i])[0])
				if buf <= shortest:
					shortest = buf
					chid = i
			self.__table.loc['chain_antigen', :] = chid
		else:
			chid = self.__table['chain_antigen'][0]

		pp = list(self.__struct[0][chid])

		if len(pp) > 30:
			line = self.__name + '\t;TOO MANY AMINO ACIDS (' + str(len(pp)) + ') TO BE A PEPTIDE :(\n'
			self.printerr('definePeptideChain(): ' + line)
			return 0

		pep_res = [r for r in pp if Polypeptide.is_aa(r.get_resname(), standard=True)]
		self.__peptide = pep_res
		self.__regions_res.update({'peptide': pep_res})
		return 1
Example #11
0
def modeller_get_chain_seqs(target_protein, target_chain, version):
    """Load one chain of a modeller target and return its residue letters.

    The PDB file is looked up at
    ``PATHS.modeller/<protein><chain>/v<version>_pdb<protein>.ent``.

    Returns:
        ``(chain_lst, chain)`` where ``chain_lst`` holds one character per
        amino-acid residue with a blank hetero flag (``'-'`` for UNK/ASX,
        ``'U'`` for SEC, the one-letter code otherwise) and ``chain`` is the
        Bio.PDB chain.  Every failure path (file missing, parse error,
        unknown chain) returns ``(None, None)`` so callers can always unpack
        the result; two of those paths used to return a bare ``None``.
    """
    target_path = path.join(PATHS.modeller, target_protein + target_chain)
    target_pdb_fname = 'v%s_pdb' % version + target_protein + '.ent'

    pdb_file_path = path.join(target_path, target_pdb_fname)
    if not path.isfile(pdb_file_path):
        LOGGER.warning('File %s not found', pdb_file_path)
        return None, None

    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    structure_id = path.basename(target_pdb_fname).split('.')[0]
    try:
        structure = parser.get_structure(structure_id, pdb_file_path)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
        LOGGER.warning(
            "ERROR: failed parser.get_structure(structure_id, pdb_fname) for %s",
            target_pdb_fname)
        return None, None

    model = structure[0]
    try:
        chain = model[target_chain]
    except KeyError:
        return None, None

    # Residue names that do not go through three_to_one.
    special_codes = {'UNK': '-', 'ASX': '-', 'SEC': 'U'}
    chain_lst = []
    for res in chain.get_residues():
        if not (is_aa(res) and res.get_id()[0] == ' '):
            continue
        if res.resname in special_codes:
            chain_lst.append(special_codes[res.resname])
        else:
            chain_lst.append(Polypeptide.three_to_one(res.resname))

    return chain_lst, chain
Example #12
0
def write_FASTAs(PDB_ID, chains):
    """Write one ``<PDB_ID>_<chain>.fasta`` file per polypeptide chain.

    *chains* maps a chain id to a list of ``(resname, resseq, icode)``
    tuples.  A chain is written only when it is non-empty and its first
    residue name is an amino acid; names without a one-letter code are
    written as ``'X'``.  Returns the list of ids that were written.
    """
    def _one_letter(resname):
        try:
            return Polypeptide.three_to_one(resname)
        except KeyError:
            return 'X'

    polypeptide_IDs = []
    for chain_ID, residues in chains.items():
        if not residues or not Polypeptide.is_aa(residues[0][0]):
            continue
        fasta_id = '{}_{}'.format(PDB_ID, chain_ID)
        polypeptide_IDs.append(fasta_id)
        letters = [_one_letter(resname) for resname, _resseq, _icode in residues]
        with open('{}.fasta'.format(fasta_id), mode='w') as handle:
            handle.write('>{}\n'.format(fasta_id))
            handle.write('{}\n'.format(''.join(letters)))
    return polypeptide_IDs
Example #13
0
def residue_seq_to_one(seq):
    """
    Convert a sequence of residues to a list of one-letter codes.

    Each element's ``name`` is translated with the standard three-to-one
    mapping; names outside ``Polypeptide.standard_aa_names`` become 'U'.
    """
    return [
        Polypeptide.three_to_one(residue.name)
        if residue.name in Polypeptide.standard_aa_names else 'U'
        for residue in seq
    ]
Example #14
0
    def get_sequence(self, chain_id):
        """
            Input:
                self: uses the Biopython.PDB structure stored on the object
                chain_id  : String (usually in ['A','B', 'C' ...]) naming a
                        chain of the first model
            Return:
                The amino acid sequence (single-letter alphabet) of the chain
                as a string.  Reading stops at the first residue whose name
                has no one-letter code; everything after it is discarded.
        """
        from Bio.PDB import Polypeptide

        first_model = self.structure.child_list[0]
        letters = []
        for residue in first_model.child_dict[chain_id].child_list:
            try:
                letter = Polypeptide.three_to_one(residue.resname)
            except KeyError:
                # Probably the start of the HOH records -> discard the rest.
                break
            letters.append(letter)
        return ''.join(letters)
Example #15
0
def get_chain_to_valid_residues(structure, pdb_name=None):
    """Get tuples of chains and their valid residues.

    Each entry of the returned list is ``(key, residues)``.  *key* is a tuple
    that starts with *pdb_name* when one is given, followed by the model and
    chain identifiers.

    For a ``Bio.PDB.Structure.Structure`` the residues are the standard amino
    acids that contain a ``CA`` atom.  Any other input is handled as a
    DataFrame: its CA rows (``atom_name == 'CA'``, or
    ``maestro_atom_name == ' CA '`` when there is no ``atom_name`` column)
    whose ``resname`` is not ``'UNK'``, grouped by model and chain.
    """
    prefix = () if pdb_name is None else (pdb_name, )
    chain_res = []

    if type(structure) is Bio.PDB.Structure.Structure:
        for model in structure:
            for chain in model:
                valid = [
                    res for res in chain
                    if poly.is_aa(res.get_resname(), standard=True)
                    and 'CA' in res
                ]
                if valid:
                    key = prefix + (str(model.serial_num), chain.get_id())
                    chain_res.append((key, valid))
        return chain_res

    if 'atom_name' in structure.columns:
        calphas = structure[structure['atom_name'] == 'CA']
    else:
        calphas = structure[structure['maestro_atom_name'] == ' CA ']
    calphas = calphas[calphas['resname'] != 'UNK']

    for group_key, group_rows in calphas.groupby(['model', 'chain']):
        rows = [row for _, row in group_rows.iterrows()]
        if rows:
            chain_res.append((prefix + group_key, rows))
    return chain_res
Example #16
0
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Write a per-residue code string marking residues with missing atoms.

    Every residue whose name is a key of ``expected`` contributes its
    one-letter code: upper case when its atom count differs from
    ``expected[name]``, lower case otherwise.  Residues with other names are
    skipped with a warning.  The string is written to *output_scwrl*.
    """
    # NOTE(review): the counters are reset for every file while the write
    # happens once after the loop, so only the last structure is written and
    # an empty dataset fails on unbound names -- confirm this is intended.
    for pdb_filename in db.get_structures_filenames(pdb_dataset):
        biopy_structure = db.parse_biopython_structure(pdb_filename)
        pdb_name = db.get_pdb_name(pdb_filename)
        missing = 0
        scwrl_list = []
        logging.info("Processing {:}".format(pdb_name))

        indexed_residues = (
            (i, residue)
            for model in biopy_structure
            for chain in model
            for i, residue in enumerate(chain)
        )
        for i, residue in indexed_residues:
            res_name = residue.resname
            if res_name not in expected:
                logging.warning("Non-standard residue found: {:}. "
                                "Skipping.".format(res_name))
                continue
            res_code = poly.three_to_one(res_name)
            res_id = residue.id[1]
            curr_count = len(
                Bio.PDB.Selection.unfold_entities(residue, 'A'))
            if curr_count == expected[res_name]:
                scwrl_list.append(res_code.lower())
                continue
            logging.debug(
                "Missing residue {:} at position {:} (with id {:})"
                " which has {:} instead of the expected {:} atoms."
                .format(res_name, i, res_id, curr_count,
                        expected[res_name]))
            missing += 1
            scwrl_list.append(res_code.upper())

    logging.debug("Missing {:} residue total".format(missing))
    with open(output_scwrl, 'w') as f:
        f.write("".join(scwrl_list))
Example #17
0
 def constructor(self, recalculate):
     """Write the chain's one-letter sequence as FASTA and return it opened.

     The chain is fetched from the object manager, its residues are
     converted to one-letter codes, and the record is written to
     ``self.get_file_location()``.  The file is then reopened for reading
     and the open handle is returned to the caller.
     """
     chain_obj = global_stuff.the_obj_manager.get_variable(
         pdb_chain_wrapper(self.params), recalculate)
     letters = [Polypeptide.three_to_one(res.resname) for res in chain_obj]
     record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(''.join(letters)))
     # Write the sequence file at location + name.
     SeqIO.write(record, self.get_file_location(), 'fasta')
     return open(self.get_file_location(), 'r')
Example #18
0
	def getClearPeptideSeq(self):
		"""Return the one-letter sequence of the stored peptide residues.

		Reports through ``printerr`` and returns 0 when the peptide is empty.
		"""
		if not self.__peptide:
			self.printerr('getClearPeptideSeq(): PEPTIDE (' + self.__name +') IS EMPTY\n')
			return 0
		letters = [Polypeptide.three_to_one(r.get_resname()) for r in list(self.__peptide)]
		return ''.join(letters)
Example #19
0
    def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
        """Return [CA, N, O] of the reference residue matching *fragment*.

        The first residue whose generic number equals the fragment residue's
        generic-number label is used.  With *use_similar* it is accepted only
        when a similarity rule lists both residues and the fragment's
        interaction type.  Residues that raise while being examined are
        skipped; ``[]`` is returned when nothing is accepted.
        """
        for chain in ref_pdbio_struct:
            for res in chain:
                try:
                    gn = self.get_generic_number(res)
                    if gn != fragment.rotamer.residue.display_generic_number.label:
                        continue

                    logger.info("Ref {}:{}\tFragment {}:{}".format(
                        polypeptide.three_to_one(res.resname),
                        self.get_generic_number(res),
                        fragment.rotamer.residue.amino_acid,
                        fragment.rotamer.residue.display_generic_number.label))

                    if not use_similar:
                        return [res['CA'], res['N'], res['O']]

                    for rule in self.similarity_rules:
                        if polypeptide.three_to_one(res.resname) not in rule[self.similarity_dict["target_residue"]]:
                            continue
                        if fragment.rotamer.residue.amino_acid not in rule[self.similarity_dict["target_residue"]]:
                            continue
                        if fragment.interaction_type.slug in rule[self.similarity_dict["interaction_type"]]:
                            return [res['CA'], res['N'], res['O']]
                except Exception:
                    # Best effort: a residue that cannot be examined is ignored.
                    continue
        return []
Example #20
0
 def get_peptide_sequence(self, residues):
     """
     Returns a sequence string of a given list of Bio.PDB.Residue objects.

     Only residues whose name is in ``self.residue_list`` contribute; HID is
     converted as HIS.
     """
     letters = []
     for residue in residues:
         if residue.resname not in self.residue_list:
             continue
         three_letter = residue.resname.replace('HID', 'HIS')
         letters.append(polypeptide.three_to_one(three_letter))
     return "".join(letters)
Example #21
0
 def get_chain_sequence(self, chain):
     """
     Returns a sequence string of a given chain.

     Iterates ``self.residues[chain]`` and keeps the entries whose name is in
     ``self.residue_list``; HID is converted as HIS.
     """
     kept = (x for x in self.residues[chain] if x.resname in self.residue_list)
     return "".join(
         polypeptide.three_to_one(x.resname.replace('HID', 'HIS'))
         for x in kept)
    def get_sequence(self, chain_id):
        '''
            Input:
                self: uses the Biopython.PDB structure stored on the object
                chain_id  : String (usually in ['A','B', 'C' ...]) naming a
                        chain of the first model
            Return:
                The amino acid sequence (single-letter alphabet) of the chain
                as a string; residues that are not amino acids are left out.
        '''
        chain = self.structure.child_list[0].child_dict[chain_id]
        letters = [
            Polypeptide.three_to_one(residue.get_resname())
            for residue in chain
            if Polypeptide.is_aa(residue)
        ]
        return "".join(letters)
Example #23
0
 def parse_structure(self):
     """Record residue numbers and one-letter codes of standard residues.

     Each standard amino-acid residue number not yet in ``self.residues`` is
     appended there, and its one-letter code is stored in
     ``self.d_sequence`` under that number.
     """
     for residue in self.structure.get_residues():
         # Only consider the standard 20 residues.
         if not PDB.is_aa(residue, standard=True):
             continue
         number = residue.id[1]
         # Don't double count mutated residues (ex. 1ORC).
         if number in self.residues:
             continue
         self.residues.append(number)
         three_letter = Residue.Residue.get_resname(residue)
         self.d_sequence[number] = Polypeptide.three_to_one(three_letter)
Example #24
0
	def calcDistMatrices(self, key1, key2):
		"""Compute and store the distance matrix for a pair of regions.

		*key1* and *key2* are region keys passed to ``getRegion``.  The matrix
		of ``residuesMinDist`` values (rows: region 1, columns: region 2,
		both labelled with one-letter codes) is stored in
		``self.__d_matrices`` under ``(key1, key2)``.  Returns 1 on success,
		or 0 after reporting through ``printerr`` when a region is empty.
		"""
		res_list_1 = self.getRegion(key1)
		res_list_2 = self.getRegion(key2)

		if not (res_list_1 and res_list_2):
			self.printerr('calcDistMatrices(): RESIDUE LIST IS EMPTY\n')
			return 0

		values = [
			[residuesMinDist(res1, res2) for res2 in res_list_2]
			for res1 in res_list_1
		]

		rows = [Polypeptide.three_to_one(x.get_resname()) for x in res_list_1]
		cols = [Polypeptide.three_to_one(x.get_resname()) for x in res_list_2]
		self.__d_matrices.update({(key1, key2): pd.DataFrame(values, index = rows, columns = cols)})
		return 1
Example #25
0
def get_chain_sequences(df):
    """Return list of tuples of (id, sequence) for different chains of monomers in a given dataframe.

    Only ``CA`` rows of standard residues are used.  *id* is the
    (ensemble, subunit, structure, model, chain) group key with every part
    converted to ``str``; *sequence* is the one-letter sequence of the group
    in row order.  The input frame is not modified.
    """
    # Keep only CA of standard residues
    ca_rows = df[df['name'] == 'CA'].drop_duplicates()
    # astype(bool) keeps the mask boolean even when there are no rows.
    is_standard = ca_rows['resname'].apply(
        lambda x: Poly.is_aa(x, standard=True)).astype(bool)
    standard = ca_rows[is_standard]
    # assign() builds a new frame instead of writing into a filtered slice.
    standard = standard.assign(
        resname=standard['resname'].apply(Poly.three_to_one))

    chain_sequences = []
    group_cols = ['ensemble', 'subunit', 'structure', 'model', 'chain']
    for c, chain in standard.groupby(group_cols):
        seq = ''.join(chain['resname'])
        chain_sequences.append((tuple([str(x) for x in c]), seq))
    return chain_sequences
Example #26
0
def fastaToCNSThreeLetter(seqRecord,width=10):
	"""Render a sequence record as three-letter codes, *width* per line.

	Each character of ``seqRecord.seq`` is converted with
	``PP.one_to_three``.  Codes on a line are separated by single spaces,
	lines by newlines, and the result always ends with a newline (an empty
	sequence yields just ``'\\n'``).

	The lines are cut by slicing: the former grouping key ``index/width`` is
	a float on Python 3 and put every residue on its own line.
	"""
	# str() works for any Seq version; Seq.tostring() no longer exists.
	seqStr = str(seqRecord.seq)

	outSeq = [PP.one_to_three(i) for i in seqStr]
	lines = [' '.join(outSeq[start:start + width]) for start in range(0, len(outSeq), width)]

	return '\n'.join(lines) + '\n'
Example #27
0
def standard_residue_filter(df):
    """Filter out non-standard residues.

    The standard/non-standard decision is made once per distinct
    (structure, model, chain, residue, resname) combination and then looked
    up for every row of *df*; rows of standard residues are returned.
    """
    key_cols = ['structure', 'model', 'chain', 'residue', 'resname']

    unique_residues = df[key_cols].drop_duplicates()
    is_standard = unique_residues['resname'].apply(
        lambda name: Poly.is_aa(name, standard=True))

    # One flag per distinct residue, indexed by the full key.
    flag_by_key = unique_residues.assign(to_keep=is_standard) \
        .set_index(key_cols)['to_keep']
    # Expand the flags back to one entry per row of df.
    row_flags = flag_by_key.loc[df.set_index(key_cols).index]
    return df[row_flags.values]
Example #28
0
 def get_sequence(self, chain_id):
     '''
         Input:
             self: uses the Biopython.PDB structure stored on the object
             chain_id  : String (usually in ['A','B', 'C' ...]) naming the
                     chain to read
         Return:
             The amino acid sequence (single-letter alphabet) of the chain as
             a string, built from ``self.get_amino_residues(chain_id)``.
     '''
     letters = [
         Polypeptide.three_to_one(residue.get_resname())
         for residue in self.get_amino_residues(chain_id)
     ]
     return ''.join(letters)
Example #29
0
    def get_torsion_angle(self):
        """Return the phi/psi angles of the masked residues as float32.

        When a ``target_angels_v<VERSION>.pkl`` file exists in the target
        directory and the object is not a modeller target, the pickled array
        is returned instead.  Otherwise the angles are computed from
        ``self._bio_chain`` and filtered with ``self._aa_mask``.
        """
        cache_name = 'target_angels_v%s.pkl' % self.VERSION
        target_file = os.path.join(self._target_dir, cache_name)
        if os.path.isfile(target_file) and not self._is_modeller:
            return pkl_load(target_file)

        phi_psi = Polypeptide.Polypeptide(self._bio_chain).get_phi_psi_list()
        masked = np.array(phi_psi)[self._aa_mask]
        # NOTE: the computed array is not written back to the cache file.
        return np.array(masked, dtype=np.float32)
Example #30
0
def GetList(protein, mincount, measure_cutoffs, thresh=9.0, loss=True):
    """Get list with SSP positions, AA in three letter code.

    Positions come from ``GetNetworkExtremes``.  If loss==False, the
    complement (columns of ``functional_data[protein]`` not among them) is
    used instead, for gain predictions.  The first character of each
    position is replaced by its three-letter code.
    """
    pos = GetNetworkExtremes(protein, mincount, measure_cutoffs, thresh=thresh)
    if not loss:
        pos = [col for col in functional_data[protein].columns if col not in pos]

    return [pp.one_to_three(label[0]) + label[1:] for label in pos]
 def parse_structure(self, pdb_struct):
     """
     Extract the sequence and build the residue dictionary of every chain.

     Bio.PDB reads a PDB in the cascade model->chain->residue->atom, so
     *pdb_struct* is expected to iterate over chains.  For each chain the
     residues named in ``self.residue_list`` are stored in
     ``self.residues[chain.id]`` (residue number -> MappedResidue),
     ``self.pdb_seq[chain.id]`` becomes the one-letter sequence ordered by
     residue number, and every MappedResidue gets its 1-based ``pos_in_aln``.

     HID is treated as HIS.  The HID branch used to compute the one-letter
     code and drop it, leaving those residues out of the result.
     """
     for chain in pdb_struct:
         self.residues[chain.id] = {}
         self.pdb_seq[chain.id] = Seq('')

         for res in chain:
             # In Bio.PDB the residue id is a tuple of
             # (hetatm flag, residue number, insertion code).
             resname = 'HIS' if res.resname == "HID" else res.resname
             if resname not in self.residue_list:
                 continue
             self.residues[chain.id][res.id[1]] = MappedResidue(res.id[1], polypeptide.three_to_one(resname))

         ordered_numbers = sorted(self.residues[chain.id])
         self.pdb_seq[chain.id] = ''.join(self.residues[chain.id][x].name for x in ordered_numbers)

         for pos, number in enumerate(ordered_numbers, start=1):
             self.residues[chain.id][number].pos_in_aln = pos
 def create_structure_rotamer(PDB_residue, residue_object, structure):
     """Build a Rotamer for one PDB residue.

     The residue is serialised to PDB text, which is stored (or looked up)
     as a PdbData row.  ``missing_atoms`` is True when the residue has fewer
     atoms than ``atom_num_dict`` lists for its one-letter code.
     """
     buffer = StringIO()
     writer = PDBIO()
     writer.set_structure(PDB_residue)
     writer.save(buffer)
     pdbdata, _created = PdbData.objects.get_or_create(pdb=buffer.getvalue())

     one_letter = Polypeptide.three_to_one(PDB_residue.get_resname())
     atoms_present = len(PDB_residue.get_unpacked_list())
     missing_atoms = atom_num_dict[one_letter] > atoms_present

     return Rotamer(missing_atoms=missing_atoms,
                    pdbdata=pdbdata,
                    residue=residue_object,
                    structure=structure)
Example #33
0
def get_all_chain_sequences_df(df):
    """Return list of tuples of (struct_name, chain_sequences) for sharded.

    Only ``CA`` rows of standard residues are used.  *struct_name* is the
    (ensemble, subunit, structure) group key; *chain_sequences* is a list of
    ``((model, chain), sequence)`` tuples with one-letter sequences in row
    order.  The input frame is not modified.
    """
    # Keep only CA of standard residues
    ca_rows = df[df['name'] == 'CA'].drop_duplicates()
    # astype(bool) keeps the mask boolean even when there are no rows.
    is_standard = ca_rows['resname'].apply(
        lambda x: Poly.is_aa(x, standard=True)).astype(bool)
    standard = ca_rows[is_standard]
    # assign() builds a new frame instead of writing into a filtered slice.
    standard = standard.assign(
        resname=standard['resname'].apply(Poly.three_to_one))

    all_chain_sequences = []
    for s, structure in standard.groupby(['ensemble', 'subunit', 'structure']):
        chain_sequences = [
            (c, ''.join(chain['resname']))
            for c, chain in structure.groupby(['model', 'chain'])
        ]
        all_chain_sequences.append((s, chain_sequences))
    return all_chain_sequences
Example #34
0
    def constructor(self, params, recalculate, to_pickle=False, to_filelize=False, always_recalculate=False, old_obj=None):
        """Return five values recorded for one (position, chain) key.

        The entry for ``(pos, chain_letter)`` is looked up in the dictionary
        obtained for ``objects.baW``.  Its first element (a three-letter
        residue name) is asserted to agree with the chain sequence at the
        mapped position; the elements at indices 1, 3, 5, 7 and 9 are then
        returned as a list.
        """
        lookup = self.get_var_or_file(objects.baW, params, recalculate, True, True, False)
        # NOTE(review): `c` is not defined in this block; presumably a
        # module-level parameter key -- confirm.
        chain_letter = self.get_param(params, c)
        pos = self.get_param(params, 'pos')
        vals = lookup[(pos, chain_letter)]
        three_letter = vals[0]
        chain_seq_in_one = self.get_var_or_file(objects.dW, params, recalculate, True, False)
        pos_to_aa = self.get_var_or_file(objects.eW, params, recalculate, True, False)
        one_letter = chain_seq_in_one[pos_to_aa[pos]]
        assert Polypeptide.three_to_one(three_letter) == one_letter
        return [vals[index] for index in (1, 3, 5, 7, 9)]
Example #35
0
def pdb2seq(pdbname):
    """Return the one-letter sequence read from the ATOM records of a PDB file.

    A residue is emitted whenever the chain id / residue number / insertion
    code field (columns 22-27 of the record) differs from that of the
    previous ATOM line.  The former key used only three of the four
    residue-number columns and ignored chain and insertion code, and its
    initial value of 0 skipped a leading residue numbered 0.
    """
    import Bio.PDB.Polypeptide as bio

    letters = []
    prev_key = None
    with open(pdbname, "r") as pdb:
        for line in pdb:
            if not line.startswith("ATOM"):
                continue
            # chain id (col 22) + residue number (23-26) + insertion code (27)
            key = line[21:27]
            if key != prev_key:
                letters.append(bio.three_to_one(line[17:20]))
                prev_key = key
    return "".join(letters)
Example #36
0
def extract_seqs(structure, defmodel):
    '''
    Uses Biopython to count the number of chains and to extract each
    chain's sequence as a list of sequences.

    Only the model whose id equals *defmodel* is read.  Standard amino
    acids are written with their one-letter code, everything else as 'X'.
    Returns (nchains, seqs, chain_ids); when no model matches, the result
    is (0, [], []) instead of failing on unbound names.
    Called by: clean_and_sort()
    '''
    nchains = 0
    # Defined up front so a structure without the requested model still
    # yields a well-formed result.
    seqs = []
    chain_ids = []
    for model in structure:
        if model.id != defmodel:
            continue
        seqs = []
        chain_ids = []
        for chain in model:
            nchains += 1
            seqlist = []
            for residue in chain:
                resname = residue.get_resname()
                if bpp_poly.is_aa(resname, standard=True):
                    seqlist.append(bpp_poly.three_to_one(resname))
                else:
                    seqlist.append('X')
            seqs.append("".join(seqlist))
            chain_ids.append(chain.id)
    return nchains, seqs, chain_ids
    def get_bfactors( self, chain_id ):
        '''
            Input:
                self: uses the Biopython.PDB structure stored on the object
                chain_id  : String (usually in ['A','B', 'C' ...]) naming a
                        chain of the first model
            Return:
                A numpy integer array with one standardised B-factor per
                amino-acid residue of the chain.

                The B-factor of a residue is the mean over its atoms; a
                residue without atoms gets np.nan.  The values are turned
                into standard scores with np.nanmean / np.nanstd and then
                cast to integer, which truncates toward zero.
                NOTE(review): a nan entry has no meaningful integer value
                after the cast -- confirm how callers want those reported.
                A chain without amino-acid residues yields an empty array.
        '''
        chain = self.structure.child_list[0].child_dict[chain_id]
        # Drop residues that are not amino acids.
        residues = [res for res in chain.get_list() if Polypeptide.is_aa(res)]
        if not residues:
            return np.zeros(0, dtype=int)

        # Mean B-factor per residue; nan marks residues without atoms.
        mean_bfactors = np.full(len(residues), np.nan, dtype=np.float32)
        for res_nr, residue in enumerate(residues):
            atom_bfactors = [atom.bfactor for atom in residue.get_list()]
            if atom_bfactors:
                mean_bfactors[res_nr] = sum(atom_bfactors) / len(atom_bfactors)

        # Standard scores (zero mean, unit variance), ignoring nan entries.
        mean = np.nanmean(mean_bfactors)
        std = np.nanstd(mean_bfactors)
        b_factors = (mean_bfactors - mean) / std
        # The np.int alias no longer exists in NumPy; builtin int is the same dtype.
        return b_factors.astype( int )
Example #38
0
def clean_pdb(structure, pdb_name, clean_dir):
    '''
    Write an amino-acid-only copy of a structure when it is large enough.

    The residues of ``structure`` that ``bpp_poly.is_aa(..., standard=True)``
    accepts are counted. When more than 30 are found, the structure is saved
    through ``io`` with a ``SelectAA`` selector to
    ``clean_dir + pdb_name + '.clean.pdb'`` (plain string concatenation, so
    ``clean_dir`` must already end with a path separator).

    Called by: clean_pdb_files()
               clean_and_sort()

    Returns True when a file was written, False otherwise.
    '''
    output_path = clean_dir + pdb_name + '.clean.pdb'
    standard_names = [
        residue.resname
        for residue in structure.get_residues()
        if bpp_poly.is_aa(residue.get_resname(), standard=True)
    ]
    # Too few standard residues: nothing is written.
    if len(standard_names) <= 30:
        return False
    io.set_structure(structure)
    io.save(output_path, SelectAA())
    return True
 def get_sequence(self, chain_id):
     '''
         Input:
             self: Use Biopython.PDB structure which has been stored in an object variable
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             Return the amino acid sequence (single-letter alphabet!) of a given chain (chain_id)
             in a Biopython.PDB structure as a string.
             Water (HOH) and every residue whose name cannot be converted to a
             one-letter code are left out of the sequence.
             Return None when the structure has no chain with that id.
     '''
     for chain in self.structure.get_chains():
         if chain.id != chain_id:
             continue
         letters = []
         for res in chain.get_unpacked_list():
             if res.resname == 'HOH':
                 continue
             try:
                 letters.append(PP.three_to_one(res.resname))
             except KeyError:
                 # Residue name has no one-letter code (e.g. ligand, ion or
                 # modified residue): it is not part of the sequence, so skip
                 # it instead of aborting the whole chain.
                 continue
         return "".join(letters)
     return None
Example #40
0
	def calcNrgMatrices(self, gromacs_dir, xpm_dir, *keys):
		"""Calculate energy matrices for pairs of regions and store them.

		``keys`` are region keys resolved through ``self.getRegion``. A PDB
		restricted to those regions is written with ``self.pushToPDB``, the
		external ``./grom_script.sh`` is run, and the resulting
		``<xpm_dir>/total<name>.xpm`` file is read with ``extractXPM``. The
		matrix is then cut into one sub-matrix per region pair ``(keys[i],
		keys[j])`` with ``i <= j`` and merged into ``self.__e_matrices``.

		Returns 1 on success, 0 when a region is empty or the script fails.
		"""
		res_list = [self.getRegion(key) for key in keys]
		if not all(res_list):
			self.printerr('calcNrgMatrices: RESIDUE LIST IS EMPTY\n')
			return 0

		# Create pdb containing only desirable regions
		self.pushToPDB(gromacs_dir, *keys)

		# Execute Gromacs script; one one-letter sequence per region is passed
		list_of_seqs = [
			''.join(Polypeptide.three_to_one(res.get_resname()) for res in region)
			for region in res_list
		]
		# NOTE(review): the arguments are interpolated unquoted into a shell
		# command line; paths containing spaces or shell metacharacters will
		# be split/interpreted by the shell.
		bashCommand = " ".join(["./grom_script.sh", gromacs_dir, xpm_dir, self.__name] + list_of_seqs)
		exitcode = subprocess.call(bashCommand, shell=True)
		# subprocess.call returns the process return code: 0 means success.
		if exitcode != 0:
			self.printerr('calcNrgMatrices: GROMACS FAILURE\n')
			return 0

		# Extract DataFrames from xpm files
		bad_mat, annotation = extractXPM(''.join([xpm_dir, '/', 'total', self.__name, '.xpm']))
		if len(annotation) > 2:
			# several regions: concatenate the labels of every annotation part
			aminos = [amino for part in annotation[1:] for amino in part]
		else:
			aminos = list(annotation[-1])
		mat = pd.DataFrame(bad_mat, columns=aminos, index=aminos)

		# Cumulative offsets of each annotation part inside the full matrix
		ind_list = [0]
		for part in annotation[1:]:
			ind_list.append(len(part) + ind_list[-1])

		# Build all sub-matrices first so self.__e_matrices is only touched
		# once every slice has been computed.
		sub_mats = {}
		for i in range(len(res_list)):
			for j in range(i, len(res_list)):
				sub_mats[(keys[i], keys[j])] = mat.iloc[ind_list[i]:ind_list[i + 1], ind_list[j]:ind_list[j + 1]]

		self.__e_matrices.update(sub_mats)
		return 1
Example #41
0
    def create_fragment(self, fragment_file_name):
        """Import one interacting-fragment PDB file into the database.

        The file name (without ``.pdb``) is split on ``_``. The tokens are
        read as: 0 and 1 the two parts of the generic number, 2 the residue
        name, 3 the protein entry name, 4 the PDB code, and 5 (optionally
        joined with 6) the interaction feature slug. Only tokens 4-6 are used.

        Steps, each logged on failure:
          * look up the Structure by PDB code (skip the fragment if missing);
          * get or create the ResidueFragmentInteractionType for the feature;
          * parse the file: residues with a blank hetero flag become Rotamer
            records, all other residues are collected as the fragment data;
          * get or create the StructureLigandInteraction with role 'unknown';
          * get or create the ResidueFragmentInteraction linking all of them.

        File names that do not have 6 or 7 tokens carry no feature and are
        skipped with a warning. Returns None in every case.
        """
        fragment_name = fragment_file_name.strip().replace('.pdb', '')
        tokens = fragment_name.split('_')

        if len(tokens) == 7:
            feature = '_'.join([tokens[5], tokens[6]])
        elif len(tokens) == 6:
            feature = tokens[5]
        else:
            # Without a feature the interaction type cannot be resolved;
            # previously this path crashed on an unbound local.
            self.logger.warning('Cannot determine the interaction type from the file name {}. Skipping the fragment'.format(fragment_name))
            return
        pdb_code = tokens[4]

        #Checking the if the crystal is in the database
        try:
            s = Structure.objects.get(pdb_code__index=pdb_code)
        except Structure.DoesNotExist:
            self.logger.warning('Cannot find the structure {} in the database. Skipping the fragment {}'.format(pdb_code, fragment_name))
            return

        # Records needed by the final step; they stay None when their own
        # step fails so the last step can check them explicitly.
        i = rot = f = sli = None

        #ResidueFragmentInteractionType
        try:
            i, created = ResidueFragmentInteractionType.objects.get_or_create(slug=feature, name=self.interactions[feature])
        except Exception:
            self.logger.info("Failed to find or create feature {}...".format(feature))
        #Rotamer and Fragment
        try:
            fragment_struct = PDBParser(PERMISSIVE=True).get_structure('frag', os.sep.join([self.fragments_dir, fragment_file_name]))[0]
            fragment_pdb_data = ''
            r = None
            for residue in fragment_struct.get_residues():
                hetfield, resseq, icode = residue.get_id()
                if hetfield == ' ': #Amino acid
                    try:
                        r = Residue.objects.get(sequence_number=int(resseq), amino_acid=polypeptide.three_to_one(residue.resname), protein_conformation=s.protein_conformation)
                        d, created = PdbData.objects.get_or_create(pdb=extract_pdb_data(residue))
                        rot, created = Rotamer.objects.get_or_create(residue=r, structure=s, pdbdata=d)
                    except Exception as msg:
                        self.logger.error('Failed to add rotamer {}:{}{}\n'.format(pdb_code, resseq, msg))
                        return
                else:
                    fragment_pdb_data += extract_pdb_data(residue)
            try:
                fd, created = PdbData.objects.get_or_create(pdb=fragment_pdb_data)
                #Taking the first ligand from the list, since existing fragments do not contain the ligand info
                f, created = Fragment.objects.get_or_create(residue=r, ligand=s.ligands.all()[0], structure=s, pdbdata=fd)
            except Exception as msg:
                self.logger.error('Failed to add fragment {}\n{}'.format(fragment_file_name, msg))
        except Exception as msg:
            self.logger.error('Failed to add fragment {} to the db\n{}'.format(fragment_file_name, msg))
        #StructureLigandInteraction
        try:
            lr, created = LigandRole.objects.get_or_create(name='unknown', slug='unknown')
            sli, created = StructureLigandInteraction.objects.get_or_create(structure=s, ligand=s.ligands.all()[0], ligand_role=lr)
        except Exception as msg:
            self.logger.error("Failed to add fragment {} to the db\n{}".format(fragment_file_name, msg))

        #ResidueFragmentInteraction
        if i is None or rot is None or f is None or sli is None:
            # One of the earlier steps failed (already logged above); the
            # interaction cannot be created without all four records.
            self.logger.error("Failed to add fragment {} to the db\n{}".format(
                fragment_file_name,
                'interaction type, rotamer, fragment or ligand interaction is missing'))
            return
        try:
            ResidueFragmentInteraction.objects.get_or_create(structure_ligand_pair=sli, rotamer=rot, fragment=f, interaction_type=i)
            self.logger.info("Successfully added interacting fragment {}".format(fragment_file_name))
        except Exception as msg:
            self.logger.error("Failed to add fragment {} to the db\n{}".format(fragment_file_name, msg))
Example #42
0
    def create_residues(self, args):
        """Create Residue records and their generic numbers from dump files.

        For every numbering scheme listed in ``schemes`` the matching
        ResidueNumberingScheme object is fetched and, when the file
        ``mapping_<scheme>.txt`` exists in ``self.generic_numbers_source_dir``,
        its two-column conversion table is loaded.

        Each entry of ``args`` is a file name inside ``self.dump_source_dir``.
        Every line holds ten comma-separated fields (id, residue number,
        residue name, oliveira number, gpcrdb number, bw number, bw2, bs,
        protein entry name, segment slug). For each line:

          * lines of proteins without a ProteinConformation in the default
            state are skipped, as are residues that already exist;
          * a Residue is created and saved, then its protein segment is set;
          * when generic numbers are present, the default generic number, the
            sequence-based numbers and the structure-based numbers are
            fetched or created and attached to the residue;
          * the residue is saved again.

        Failures are logged; missing dump files are reported and skipped.
        """
        schemes = {
            'gpcrdb': {'type': False},
            'gpcrdba': {
                'type': 'structure',
                'seq_based': 'bw',
            },
            'gpcrdbb': {
                'type': 'structure',
                'seq_based': 'woot',
            },
            'gpcrdbc': {
                'type': 'structure',
                'seq_based': 'pin',
            },
            'gpcrdbf': {
                'type': 'structure',
                'seq_based': 'wang',
            },
            'bw': {'type': 'sequence'},
            'woot': {'type': 'sequence'},
            'pin': {'type': 'sequence'},
            'wang': {'type': 'sequence'},
        }

        for scheme_name, scheme in schemes.items():
            schemes[scheme_name]['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
            mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
            if os.path.isfile(mapping_file):
                with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                    schemes[scheme_name]['table'] = {}
                    for row in scheme_table_file:
                        split_row = shlex.split(row)
                        schemes[scheme_name]['table'][split_row[0]] = split_row[1]

        # proteins already known to be absent from the db (set: O(1) lookup per line)
        missing_proteins = set()
        self.logger.info('CREATING RESIDUES')
        for arg in args:
            dump_path = os.sep.join([self.dump_source_dir, arg])
            if not os.path.exists(dump_path):
                print("Failed to open file {!s}".format(dump_path))
                self.logger.error("Failed to open file {!s}".format(dump_path))
                continue

            # the with-block guarantees the dump file is closed, even on error
            with open(dump_path, 'r') as residue_data_fh:
                self.logger.info('Parsing residue data from {}'.format(arg))
                for line in residue_data_fh:
                    # double strip due to some weird bug...
                    (res_id, res_num, res_name, oli, gpcrdb, bw, bw2, bs, prot_name,
                        sec_str_name) = [x.strip().strip('"') for x in line.split(',')]
                    if prot_name in missing_proteins:
                        continue

                    # fetch schemes and conversion tables
                    #Checking if the protein exists in the db
                    try:
                        pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        missing_proteins.add(prot_name)
                        continue
                    #Checking if given residue already exists in the db
                    try:
                        Residue.objects.get(protein_conformation=pconf.id, sequence_number=res_num)
                        continue
                    except Residue.DoesNotExist:
                        pass

                    r = Residue()
                    r.protein_conformation = pconf
                    r.sequence_number = int(res_num)
                    r.amino_acid = polypeptide.three_to_one(res_name.upper())

                    # first save: the residue is saved before the
                    # alternative_generic_numbers relation is used below
                    try:
                        r.save()
                        self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                            r.amino_acid, pconf.protein.entry_name))
                    except Exception as msg:
                        print(msg)
                        self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                            r.sequence_number, r.amino_acid, pconf.protein.entry_name))
                        continue

                    # residue segment
                    dump_segment = sec_str_name
                    try:
                        r.protein_segment = ProteinSegment.objects.get(slug=dump_segment)
                    except Exception:
                        self.logger.error('Failed to fetch protein segment {}'.format(dump_segment))

                    # generic number
                    if (str(oli) != '0' and gpcrdb != 'None' and bw != 'None'):
                        # separate bulge number (1241 - > 124 + 1)
                        bulge_prime = ''
                        dump_oliveira = str(oli)
                        if len(dump_oliveira) == 4:
                            bulge_prime = dump_oliveira[3]
                            dump_oliveira = dump_oliveira[:3]
                        dump_gpcrdb = gpcrdb[:4]
                        dump_seq_based = bw

                        # default gpcrdb number
                        def_gpcrdb = False
                        if dump_oliveira in schemes[settings.DEFAULT_NUMBERING_SCHEME]['table']:
                            default_label = (schemes[settings.DEFAULT_NUMBERING_SCHEME]['table'][dump_oliveira] +
                                bulge_prime)
                            try:
                                def_gpcrdb = ResidueGenericNumber.objects.get(label=default_label,
                                    scheme=schemes[settings.DEFAULT_NUMBERING_SCHEME]['obj'])
                            except ResidueGenericNumber.DoesNotExist:
                                def_gpcrdb = ResidueGenericNumber()
                                def_gpcrdb.label = default_label
                                def_gpcrdb.scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]['obj']
                                def_gpcrdb.protein_segment = r.protein_segment
                                def_gpcrdb.save()
                                self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                                    .format(default_label,
                                    schemes[settings.DEFAULT_NUMBERING_SCHEME]['obj'].short_name))

                        # if default number was found/added successfully, process the alternative numbers
                        if def_gpcrdb:
                            # add default generic number to residue record
                            r.generic_number = def_gpcrdb

                            # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
                            seq_based_labels = {}

                            # sequence-based schemes first (the sequence-based numbers are needed for the
                            # structure based schemes)
                            for scheme_name, scheme in schemes.items():
                                if scheme['type'] == 'sequence':
                                    # is this number in the scheme defined for this protein?
                                    if scheme_name == schemes[pconf.protein.residue_numbering_scheme.slug]['seq_based']:
                                        seq_based_label = dump_seq_based
                                    # if not convert the number to the correct scheme
                                    else:
                                        # NOTE(review): when no table entry matches, seq_based_label
                                        # keeps the value of a previous iteration (or is unbound)
                                        slug = pconf.protein.residue_numbering_scheme.slug
                                        for d, c in schemes[schemes[slug]['seq_based']]['table'].items():
                                            if c == dump_seq_based:
                                                seq_based_label = scheme['table'][d]
                                                break

                                    # fetch/insert the number
                                    try:
                                        seq_based = ResidueGenericNumber.objects.get(label=seq_based_label,
                                            scheme=scheme['obj'])
                                    except ResidueGenericNumber.DoesNotExist:
                                        seq_based = ResidueGenericNumber()
                                        seq_based.label = seq_based_label
                                        seq_based.scheme = scheme['obj']
                                        seq_based.protein_segment = r.protein_segment
                                        seq_based.save()
                                    r.alternative_generic_numbers.add(seq_based)

                                    # add added number to the dict for later use
                                    seq_based_labels[scheme_name] = seq_based_label

                            # structure-based numbers
                            for scheme_name, scheme in schemes.items():
                                if scheme['type'] == 'structure':
                                    # is this number in the scheme defined for this protein?
                                    if scheme_name == pconf.protein.residue_numbering_scheme.slug:
                                        struct_based_label = dump_gpcrdb + bulge_prime
                                    # if not convert the number to the correct scheme
                                    else:
                                        # NOTE(review): same caveat as above for struct_based_label
                                        for d, c in schemes[pconf.protein.residue_numbering_scheme.slug]['table'].items():
                                            if c == dump_gpcrdb:
                                                struct_based_label = scheme['table'][d] + bulge_prime
                                                break

                                    # add the sequence-based label (5x461 -> 5.46x461)
                                    split_struct_based_label = struct_based_label.split('x')
                                    struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x' +
                                        split_struct_based_label[1])

                                    # fetch/insert the number
                                    try:
                                        struct_based = ResidueGenericNumber.objects.get(
                                            label=struct_based_label, scheme=scheme['obj'])
                                    except ResidueGenericNumber.DoesNotExist:
                                        struct_based = ResidueGenericNumber()
                                        struct_based.label = struct_based_label
                                        struct_based.scheme = scheme['obj']
                                        struct_based.protein_segment = r.protein_segment
                                        struct_based.save()

                                    # add to residue as a display number or alternative number?
                                    if scheme_name == pconf.protein.residue_numbering_scheme.slug:
                                        r.display_generic_number = struct_based
                                    else:
                                        r.alternative_generic_numbers.add(struct_based)
                    # second save: stores the segment and generic numbers set above
                    try:
                        r.save()
                        self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                            res_name, pconf.protein.entry_name))
                    except Exception as msg:
                        print(msg)
                        self.logger.error(
                            'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                                res_name, pconf.protein.entry_name))
        self.logger.info('COMPLETED CREATING RESIDUES')
Example #43
0
 def get_chain_sequence(self, chain):
     """
     Build the one-letter sequence string for the residues stored under
     ``chain`` in ``self.residues``.

     Residues whose name is not in ``self.residue_list`` are left out;
     'HID' is converted as 'HIS'.
     """
     letters = []
     for residue in self.residues[chain]:
         if residue.resname not in self.residue_list:
             continue
         three_letter = residue.resname.replace('HID', 'HIS')
         letters.append(polypeptide.three_to_one(three_letter))
     return "".join(letters)
Example #44
0
 def get_peptide_sequence(self, residues):
     """
     Build the one-letter sequence string for an iterable of
     Bio.PDB.Residue objects.

     Residues whose name is not in ``self.residue_list`` are left out;
     'HID' is converted as 'HIS'.
     """
     letters = []
     for residue in residues:
         if residue.resname not in self.residue_list:
             continue
         three_letter = residue.resname.replace('HID', 'HIS')
         letters.append(polypeptide.three_to_one(three_letter))
     return "".join(letters)
Example #45
0
 def get_chain_sequence(self, chain):
     """
     Build the one-letter sequence string for the residues of ``chain``.

     ``chain`` is iterated directly; residues whose name is not in
     ``self.residue_list`` are left out and 'HID' is converted as 'HIS'.
     """
     letters = []
     for residue in chain:
         if residue.resname not in self.residue_list:
             continue
         three_letter = residue.resname.replace('HID', 'HIS')
         letters.append(polypeptide.three_to_one(three_letter))
     return "".join(letters)