Esempio n. 1
0
    def parse_structure(self, pdb_struct):
        """
        Extract the sequence and prepare the dictionary of residues.

        Bio.PDB reads a pdb in the cascade model->chain->residue->atom;
        ``pdb_struct`` is iterated here as chains made of residues.

        For every chain:
          * ``self.residues[chain.id]`` maps residue number to a
            ``MappedResidue`` built from the number and the one-letter code,
          * ``self.pdb_seq[chain.id]`` becomes the string of one-letter codes
            ordered by residue number,
          * every stored residue gets its 1-based ``pos_in_aln``.

        Residues named "HID" are stored as histidine; any other residue whose
        name is not in ``self.residue_list`` is skipped.
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                # In Bio.PDB the residue id is a tuple of
                # (hetatm flag, residue number, insertion code).
                if res.resname == "HID":
                    # Previously the letter was computed and then thrown
                    # away, so HID residues vanished from the mapping.
                    one_letter = polypeptide.three_to_one('HIS')
                elif res.resname in self.residue_list:
                    one_letter = polypeptide.three_to_one(res.resname)
                else:
                    continue
                chain_residues[res.id[1]] = MappedResidue(res.id[1],
                                                          one_letter)

            ordered_numbers = sorted(chain_residues)
            self.pdb_seq[chain.id] = ''.join(
                chain_residues[number].name for number in ordered_numbers)

            for pos, number in enumerate(ordered_numbers, start=1):
                chain_residues[number].pos_in_aln = pos
    def parse_pdb(self):
        """
        Parse the reference structure and collect its per-chain sequences.

        The structure is read from ``self.pdb_file`` when that is set,
        otherwise from ``self.pdb_filename``; when neither is set the method
        returns None. Otherwise the first model of the parsed structure is
        returned and ``self.pdb_seq[chain.id]`` holds the one-letter sequence
        of each of its chains.
        """
        # a file handle takes precedence over a file name
        source = self.pdb_file or self.pdb_filename
        if not source:
            return None

        parser = PDBParser(PERMISSIVE=True, QUIET=True)
        pdb_struct = parser.get_structure('ref', source)[0]

        # bio.pdb reads pdb in the cascade: model->chain->residue->atom
        for chain in pdb_struct:
            self.pdb_seq[chain.id] = Seq('')
            for res in chain:
                # "HID" is translated as histidine
                three_letter = 'HIS' if res.resname == "HID" else res.resname
                try:
                    self.pdb_seq[chain.id] += polypeptide.three_to_one(
                        three_letter)
                except Exception:
                    # residue names that cannot be translated are left out
                    continue
        return pdb_struct
Esempio n. 3
0
    def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
        """
        Pick the reference atoms for the residue matching the fragment.

        Every residue of every chain in ``ref_pdbio_struct`` is compared, by
        generic number, with the label of the fragment's residue. Without
        ``use_similar`` the first match is accepted. With ``use_similar`` a
        match is accepted only when one of ``self.similarity_rules`` lists
        both amino acids under the "target_residue" index and the fragment's
        interaction type slug under the "interaction_type" index.

        Returns the CA, N and O atoms of the accepted residue, or an empty
        list when no residue is accepted. Any exception raised while a
        residue is examined makes the search move on to the next residue.
        """
        for chain in ref_pdbio_struct:
            for res in chain:
                try:
                    gn = self.get_generic_number(res)
                    matched = (
                        gn == fragment.rotamer.residue.display_generic_number.label)
                    if not matched:
                        continue

                    logger.info("Ref {}:{}\tFragment {}:{}".format(
                        polypeptide.three_to_one(res.resname),
                        self.get_generic_number(res),
                        fragment.rotamer.residue.amino_acid,
                        fragment.rotamer.residue.display_generic_number.label))

                    if not use_similar:
                        return [res['CA'], res['N'], res['O']]

                    for rule in self.similarity_rules:
                        residue_group = rule[
                            self.similarity_dict["target_residue"]]
                        if polypeptide.three_to_one(
                                res.resname) not in residue_group:
                            continue
                        if fragment.rotamer.residue.amino_acid not in residue_group:
                            continue
                        if fragment.interaction_type.slug in rule[
                                self.similarity_dict["interaction_type"]]:
                            return [res['CA'], res['N'], res['O']]
                except Exception:
                    continue
        return []
Esempio n. 4
0
def get_adjacency_matrix(pdb_id, pdb_file, chain_id='A', cutoff=8.0):
    """Build a CB-CB contact (adjacency) matrix for one chain of a PDB file.

    Only residues of the first model's chain ``chain_id`` that contain a 'CB'
    atom are considered. Two different residues are in contact when the
    distance between their CB atoms is less than or equal to ``cutoff``.

    Args:
        pdb_id: identifier handed to the Biopython parser.
        pdb_file: PDB file name (or handle) handed to the Biopython parser.
        chain_id: id of the chain to analyse (default 'A').
        cutoff: contact distance threshold, inclusive (default 8.0; the
            original comments give the unit as Angstrom).

    Returns:
        pandas.DataFrame of 1.0 (contact) / 0.0 (no contact, and the whole
        diagonal), with the one-letter residue codes as both row and column
        labels.
    """
    parser = PDBParser()  # initialize biopython PDB parser
    structure = parser.get_structure(pdb_id, pdb_file)

    # residues are selected purely on the presence of a beta carbon
    amino_acids = [res for res in structure[0][chain_id] if 'CB' in res]
    n_residues = len(amino_acids)

    # np.zeros already provides the "no contact" value and the diagonal
    adj_df_values = np.zeros(shape=(n_residues, n_residues))

    # the distance is symmetric, so each pair is measured once and mirrored
    for i, r1 in enumerate(amino_acids):
        for j in range(i + 1, n_residues):
            if r1['CB'] - amino_acids[j]['CB'] <= cutoff:
                adj_df_values[i][j] = 1.0
                adj_df_values[j][i] = 1.0

    # rows and columns are labelled with the one-letter residue code
    labels = [polypep.three_to_one(r.get_resname()) for r in amino_acids]
    adjacency_df = pd.DataFrame(index=labels,
                                columns=labels,
                                data=adj_df_values)

    return adjacency_df
Esempio n. 5
0
def get_distance_matrix(pdb_id, pdb_file, chain_id='A'):
    """Build the CB-CB distance matrix for one chain of a PDB file.

    Only residues of the first model's chain ``chain_id`` that contain a 'CB'
    atom are considered.

    Args:
        pdb_id: identifier handed to the Biopython parser.
        pdb_file: PDB file name (or handle) handed to the Biopython parser.
        chain_id: id of the chain to analyse (default 'A').

    Returns:
        pandas.DataFrame holding the distance between the CB atoms of every
        residue pair (0 on the diagonal; no cutoff is applied), with the
        one-letter residue codes as both row and column labels.
    """
    parser = PDBParser()  # initialize biopython PDB parser
    structure = parser.get_structure(pdb_id, pdb_file)

    # residues are selected purely on the presence of a beta carbon
    amino_acids = [res for res in structure[0][chain_id] if 'CB' in res]
    n_residues = len(amino_acids)

    # np.zeros already provides the zero diagonal
    dist_df_values = np.zeros(shape=(n_residues, n_residues))

    # the distance is symmetric, so each pair is measured once and mirrored
    for i, r1 in enumerate(amino_acids):
        for j in range(i + 1, n_residues):
            dist = r1['CB'] - amino_acids[j]['CB']
            dist_df_values[i][j] = dist
            dist_df_values[j][i] = dist

    # rows and columns are labelled with the one-letter residue code
    labels = [polypep.three_to_one(r.get_resname()) for r in amino_acids]
    distance_df = pd.DataFrame(index=labels,
                               columns=labels,
                               data=dist_df_values)

    return distance_df
    def parse_pdb(self):
        """
        Parse the reference structure and record the sequence of each chain.

        ``self.pdb_file`` is used when set, else ``self.pdb_filename``; with
        neither set the method returns None. Otherwise it returns the first
        model of the parsed structure after filling
        ``self.pdb_seq[chain.id]`` with each chain's one-letter sequence.
        """
        # checking for file handle or file name to parse
        if self.pdb_file:
            source = self.pdb_file
        elif self.pdb_filename:
            source = self.pdb_filename
        else:
            return None

        pdb_struct = PDBParser(PERMISSIVE=True).get_structure('ref', source)[0]

        # bio.pdb reads pdb in the cascade: model->chain->residue->atom
        for chain in pdb_struct:
            self.pdb_seq[chain.id] = Seq('')
            for res in chain:
                if res.resname == "HID":
                    # "HID" is translated as histidine
                    self.pdb_seq[chain.id] += polypeptide.three_to_one('HIS')
                    continue
                try:
                    one_letter = polypeptide.three_to_one(res.resname)
                except Exception:
                    # residue names that cannot be translated are left out
                    continue
                self.pdb_seq[chain.id] += one_letter
        return pdb_struct
def MutationsDict(file, positions=None):
    """Collect, per residue position, every single-residue mutation.

    Positions without an amino-acid residue in the pdb file are ignored.

    Parameters:
        file (string): pdb file to get mutations from
        positions: list of tuples of the form (chain, first, last) giving the
                   inclusive residue-number ranges to mutate. If None (or
                   empty), all positions in all chains are mutated

    Returns:
        dict keyed by <aa><chain><position>; each value is the list of
        <aa><chain><position><mutated_aa> strings for every other amino acid

    """
    # One-letter amino acid codes
    amino_acids = list(Bio.PDB.Polypeptide.aa1)
    # Generate model of original pdb file
    model = it.Pmolecule(file).model
    mutations = dict()

    if not positions:
        for chain in model.get_chains():
            for residue in chain:
                if not pp.is_aa(residue):
                    continue
                code = pp.three_to_one(residue.get_resname())
                prefix = code + chain.id + str(residue.id[1])
                mutations[prefix] = [
                    prefix + aa for aa in amino_acids if aa != code
                ]
        return mutations

    for chain_id, first, last in positions:
        # First chain whose id matches; StopIteration escapes if none does
        chain = next(candidate for candidate in model.get_chains()
                     if candidate.id == chain_id)
        for residue in chain:
            if not pp.is_aa(residue):
                continue
            code = pp.three_to_one(residue.get_resname())
            position = residue.id[1]
            # Only save positions between first and last
            if position not in range(first, last + 1):
                continue
            prefix = code + chain_id + str(position)
            mutations[prefix] = [
                prefix + aa for aa in amino_acids if aa != code
            ]
    return mutations
Esempio n. 8
0
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Write a per-residue string flagging residues with missing atoms.

    For every structure file of ``pdb_dataset`` each residue whose name is a
    key of ``expected`` contributes one letter: upper case when its atom
    count differs from ``expected[res_name]``, lower case otherwise.
    Residues with other names are skipped with a warning.

    ``output_scwrl`` is reopened in write mode for every structure, so once
    the loop ends it holds the string of the last structure processed.
    """
    for pdb_filename in db.get_structures_filenames(pdb_dataset):
        biopy_structure = db.parse_biopython_structure(pdb_filename)
        pdb_name = db.get_pdb_name(pdb_filename)
        logging.info("Processing {:}".format(pdb_name))

        # (index within chain, residue) over all models and chains, in order
        indexed_residues = (
            (i, residue)
            for model in biopy_structure
            for chain in model
            for i, residue in enumerate(chain)
        )

        missing = 0
        scwrl_list = []
        for i, residue in indexed_residues:
            res_name = residue.resname
            if res_name not in expected:
                logging.warning("Non-standard residue found: {:}. "
                                "Skipping.".format(res_name))
                continue

            res_code = poly.three_to_one(res_name)
            res_id = residue.id[1]
            curr_count = len(Bio.PDB.Selection.unfold_entities(residue, 'A'))
            if curr_count == expected[res_name]:
                scwrl_list.append(res_code.lower())
                continue

            logging.debug(
                "Missing residue {:} at position {:} (with id {:})"
                " which has {:} instead of the expected {:} atoms."
                .format(res_name, i, res_id, curr_count,
                        expected[res_name]))
            missing += 1
            scwrl_list.append(res_code.upper())

        logging.debug("Missing {:} residue total".format(missing))
        with open(output_scwrl, 'w') as f:
            f.write("".join(scwrl_list))
Esempio n. 9
0
def modeller_get_chain_seqs(target_protein, target_chain, version):
    """Read one chain of a target PDB file as a list of one-letter codes.

    The file is looked up at
    ``<PATHS.modeller>/<target_protein><target_chain>/v<version>_pdb<target_protein>.ent``.

    Only residues that pass ``is_aa`` and have a blank hetero flag are used.
    'UNK' and 'ASX' are written as '-', 'SEC' as 'U', everything else through
    ``Polypeptide.three_to_one``.

    Returns:
        (chain_lst, chain): the list of one-letter codes and the chain
        object. Every failure path (file missing, parse failure, chain not
        in the first model) returns ``(None, None)`` so callers can always
        unpack the result; two of those paths used to return a bare None.
    """
    target_path = path.join(PATHS.modeller, target_protein + target_chain)
    target_pdb_fname = 'v%s_pdb' % version + target_protein + '.ent'

    pdb_file_path = path.join(target_path, target_pdb_fname)
    if not path.isfile(pdb_file_path):
        LOGGER.warning('File %s not found' % pdb_file_path)
        return None, None

    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    structure_id = path.basename(target_pdb_fname).split('.')[0]
    try:
        structure = parser.get_structure(structure_id, pdb_file_path)
    except Exception:
        # report through the logger used for the missing-file case above,
        # without swallowing KeyboardInterrupt/SystemExit
        LOGGER.warning(
            "ERROR: failed parser.get_structure(structure_id, pdb_fname) for "
            + target_pdb_fname)
        return None, None

    model = structure[0]
    try:
        chain = model[target_chain]
    except KeyError:
        return None, None

    # residue names that do not go through three_to_one
    special_codes = {'UNK': '-', 'ASX': '-', 'SEC': 'U'}
    chain_lst = []
    for res in chain.get_residues():
        if is_aa(res) and res.get_id()[0] == ' ':
            if res.resname in special_codes:
                chain_lst.append(special_codes[res.resname])
            else:
                chain_lst.append(Polypeptide.three_to_one(res.resname))

    return chain_lst, chain
Esempio n. 10
0
    def get_sequence(self, chain_id):
        """
            Input:
                self: uses the Biopython.PDB structure stored in self.structure
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                The amino acid sequence (single-letter alphabet) of chain chain_id
                in the first model, as a string. Reading stops at the first
                residue whose name raises KeyError in three_to_one.
        """
        from Bio.PDB import Polypeptide

        first_model = self.structure.child_list[0]
        letters = []
        for residue in first_model.child_dict[chain_id].child_list:
            try:
                letters.append(Polypeptide.three_to_one(residue.resname))
            except KeyError:
                # probably the start of only HOH -> discard rest
                break
        return ''.join(letters)
Esempio n. 11
0
def residue_seq_to_one(seq):
    """
    Map each residue of seq from its 3-letter name to a one-letter code.

    Residues whose ``name`` is not in ``Polypeptide.standard_aa_names`` are
    written as 'U'. Returns a list of one-letter codes.
    """
    letters = []
    for residue in seq:
        if residue.name in Polypeptide.standard_aa_names:
            letters.append(Polypeptide.three_to_one(residue.name))
        else:
            letters.append('U')
    return letters
Esempio n. 12
0
    def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
        """
        Return the CA, N and O atoms of the reference residue that matches
        the fragment's residue by generic number.

        With ``use_similar`` the match additionally has to be covered by one
        of ``self.similarity_rules`` (both amino acids listed under the
        "target_residue" index, the fragment's interaction type slug listed
        under the "interaction_type" index). An empty list is returned when
        no residue qualifies; a residue that raises any exception while being
        examined is skipped.
        """
        for chain in ref_pdbio_struct:
            for res in chain:
                try:
                    gn = self.get_generic_number(res)
                    same_position = (
                        gn == fragment.rotamer.residue.display_generic_number.label)
                    if not same_position:
                        continue

                    log_fields = (
                        polypeptide.three_to_one(res.resname),
                        self.get_generic_number(res),
                        fragment.rotamer.residue.amino_acid,
                        fragment.rotamer.residue.display_generic_number.label,
                    )
                    logger.info("Ref {}:{}\tFragment {}:{}".format(*log_fields))

                    if not use_similar:
                        return [res['CA'], res['N'], res['O']]

                    for rule in self.similarity_rules:
                        similar_residues = rule[self.similarity_dict["target_residue"]]
                        if polypeptide.three_to_one(res.resname) not in similar_residues:
                            continue
                        if fragment.rotamer.residue.amino_acid not in similar_residues:
                            continue
                        if fragment.interaction_type.slug in rule[self.similarity_dict["interaction_type"]]:
                            return [res['CA'], res['N'], res['O']]
                except Exception:
                    continue
        return []
Esempio n. 13
0
 def get_chain_sequence(self, chain):
     """
     Build the one-letter sequence string of the given chain.

     Only entries of self.residues[chain] whose resname is in
     self.residue_list contribute; 'HID' inside a name is read as 'HIS'.
     """
     letters = []
     for x in self.residues[chain]:
         if x.resname not in self.residue_list:
             continue
         three_letter = x.resname.replace('HID', 'HIS')
         letters.append(polypeptide.three_to_one(three_letter))
     return "".join(letters)
Esempio n. 14
0
 def get_peptide_sequence(self, residues):
     """
     Build the one-letter sequence string of a list of Bio.PDB.Residue objects.

     Only residues whose resname is in self.residue_list contribute; 'HID'
     inside a name is read as 'HIS'.
     """
     letters = []
     for x in residues:
         if x.resname not in self.residue_list:
             continue
         three_letter = x.resname.replace('HID', 'HIS')
         letters.append(polypeptide.three_to_one(three_letter))
     return "".join(letters)
Esempio n. 15
0
 def constructor(self, recalculate):
     """
     Write the chain's sequence as a FASTA file and return it opened.

     The chain comes from the object manager (pdb_chain_wrapper built from
     self.params). The file is written to self.get_file_location() and the
     returned handle is opened for reading; closing it is left to the caller.
     """
     chain_obj = global_stuff.the_obj_manager.get_variable(pdb_chain_wrapper(self.params), recalculate)
     letters = []
     for res in chain_obj:
         letters.append(Polypeptide.three_to_one(res.resname))
     seq_record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(''.join(letters)))
     # write the seq file at location + name
     SeqIO.write(seq_record, self.get_file_location(), 'fasta')
     return open(self.get_file_location(), 'r')
Esempio n. 16
0
	def getClearPeptideSeq(self):
		"""
		Return the peptide as a string of one-letter residue codes.

		When the stored peptide is empty an error is reported through
		self.printerr and 0 is returned instead.
		"""
		if self.__peptide:
			return ''.join(
				Polypeptide.three_to_one(r.get_resname())
				for r in list(self.__peptide)
			)
		self.printerr('getClearPeptideSeq(): PEPTIDE (' + self.__name +') IS EMPTY\n')
		return 0
Esempio n. 17
0
 def parse_structure(self):
     """
     Record the residue numbers and one-letter codes of the structure.

     Every standard amino-acid residue of self.structure whose number has
     not been seen yet is appended to self.residues and its one-letter code
     is stored in self.d_sequence under that number.
     """
     for residue in self.structure.get_residues():
         # only consider standard 20 residues
         if not PDB.is_aa(residue, standard=True):
             continue
         res = residue.id[1]
         # don't double count mutated residues (ex. 1ORC)
         if res in self.residues:
             continue
         self.residues.append(res)
         three_letter = Residue.Residue.get_resname(residue)
         self.d_sequence[res] = Polypeptide.three_to_one(three_letter)
Esempio n. 18
0
	def calcDistMatrices(self, key1, key2):
		"""
		Calculate the distance matrix for a pair of regions and store it.

		key1 and key2 are passed to self.getRegion to obtain the two residue
		lists. The matrix holds residuesMinDist for every residue pair, is
		labelled with one-letter residue codes (rows from key1, columns from
		key2) and is stored in self.__d_matrices under (key1, key2).

		Returns 1 on success, 0 (after reporting an error) when either
		residue list is empty.
		"""
		region_rows = self.getRegion(key1)
		region_cols = self.getRegion(key2)

		if not (region_rows and region_cols):
			self.printerr('calcDistMatrices(): RESIDUE LIST IS EMPTY\n')
			return 0

		values = [
			[residuesMinDist(res1, res2) for res2 in region_cols]
			for res1 in region_rows
		]

		row_labels = [Polypeptide.three_to_one(x.get_resname()) for x in region_rows]
		col_labels = [Polypeptide.three_to_one(x.get_resname()) for x in region_cols]
		mat = pd.DataFrame(values, index = row_labels, columns = col_labels)
		self.__d_matrices.update({(key1, key2): mat})
		return 1
Esempio n. 19
0
 def get_sequence(self, chain_id):
     '''
         Input:
             self: uses self.get_amino_residues to obtain the residues
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             The amino acid sequence (single-letter alphabet) of the given
             chain (chain_id) as a string.
     '''
     letters = []
     for res in self.get_amino_residues(chain_id):
         letters.append(Polypeptide.three_to_one(res.get_resname()))
     return ''.join(letters)
Esempio n. 20
0
def pdb2seq(pdbname):
    """Return the one-letter sequence read from the ATOM records of a PDB file.

    One letter is emitted each time the residue changes between consecutive
    ATOM lines. A residue is identified by chain ID, residue number and
    insertion code together (columns 22-27 of the record), so that

      * a first residue numbered 0 is not skipped,
      * residue numbers using all four digits are read in full,
      * neighbouring residues that share a number but differ in chain or
        insertion code are kept apart.

    Args:
        pdbname: path of the PDB file.

    Returns:
        str: the concatenated one-letter codes, in file order.
    """
    import Bio.PDB.Polypeptide as bio

    letters = []
    prev_key = None
    with open(pdbname, "r") as pdb:
        for line in pdb:
            line = line.strip("\n")
            if line[:4] != "ATOM":
                continue
            # 0-based slice 21:27 = chain ID + resSeq (4 columns) + iCode
            key = line[21:27]
            if key != prev_key:
                letters.append(bio.three_to_one(line[17:20]))
                prev_key = key
    return "".join(letters)
 def create_structure_rotamer(PDB_residue, residue_object, structure):
     """
     Build a Rotamer for PDB_residue.

     The residue is written in PDB format to an in-memory buffer and that
     text is looked up (or created) as a PdbData record. missing_atoms is
     True when atom_num_dict lists more atoms for the residue's one-letter
     code than the residue's unpacked atom list contains.
     """
     buffer = StringIO()
     writer = PDBIO()
     writer.set_structure(PDB_residue)
     writer.save(buffer)
     pdbdata = PdbData.objects.get_or_create(pdb=buffer.getvalue())[0]

     one_letter = Polypeptide.three_to_one(PDB_residue.get_resname())
     expected_atoms = atom_num_dict[one_letter]
     present_atoms = len(PDB_residue.get_unpacked_list())

     return Rotamer(missing_atoms=expected_atoms > present_atoms,
                    pdbdata=pdbdata,
                    residue=residue_object,
                    structure=structure)
Esempio n. 22
0
def write_FASTAs(PDB_ID, chains):
    """Write one FASTA file per polypeptide chain and return their ids.

    ``chains`` maps a chain id to a list of (resname, resseq, icode) tuples.
    A chain is written when its list is non-empty and the name of its first
    residue passes ``Polypeptide.is_aa``. The file is named
    ``<PDB_ID>_<chain_ID>.fasta``; residue names that raise KeyError in
    ``three_to_one`` are written as 'X'.

    Returns the list of ``<PDB_ID>_<chain_ID>`` ids that were written.
    """
    polypeptide_IDs = []
    for chain_ID, residues in chains.items():
        if not residues or not Polypeptide.is_aa(residues[0][0]):
            continue

        polypeptide_ID = '{}_{}'.format(PDB_ID, chain_ID)
        polypeptide_IDs.append(polypeptide_ID)

        letters = []
        for resname, _resseq, _icode in residues:
            try:
                letter = Polypeptide.three_to_one(resname)
            except KeyError:
                letter = 'X'
            letters.append(letter)

        header = '>{}\n'.format(polypeptide_ID)
        body = '{}\n'.format(''.join(letters))
        with open('{}.fasta'.format(polypeptide_ID), mode='w') as f:
            f.write(header)
            f.write(body)
    return polypeptide_IDs
    def get_sequence(self, chain_id):
        '''
            Input:
                self: uses the Biopython.PDB structure stored in self.structure
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                The amino acid sequence (single-letter alphabet) of chain chain_id
                in the first model, as a string. Residues that do not pass
                Polypeptide.is_aa are left out.
        '''
        chain = self.structure.child_list[0].child_dict[chain_id]
        return "".join(
            Polypeptide.three_to_one(residue.get_resname())
            for residue in chain
            if Polypeptide.is_aa(residue)
        )
 def get_sequence(self, chain_id):
     '''
         Input:
             self: uses the Biopython.PDB structure stored in self.structure
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             The amino acid sequence (single-letter alphabet) of the first chain
             whose id equals chain_id, as a string; residues named 'HOH' are
             left out. None when no chain has that id.
     '''
     matching = [x for x in self.structure.get_chains() if x.id == chain_id]
     if not matching:
         return None
     return "".join(
         PP.three_to_one(res.resname)
         for res in matching[0].get_unpacked_list()
         if res.resname != 'HOH'
     )
Esempio n. 25
0
def residue_list_to_string(residues, with_ids=False):
    """Convert a list of residues to a one-letter sequence string.

    Alternative residue names are first rewritten, in place on the given
    residue objects, to a standard name: HID -> HIS, CYX -> CYS, ASX -> ASP
    and GLX -> GLU. (GLX is the ambiguous Glu/Gln code; it used to be mapped
    to GLY, i.e. glycine. GLU mirrors the acid form chosen for ASX.)
    Residues named SEC or PYL are left out of the result.

    Args:
        residues: re-iterable collection of objects with a writable
            ``resname`` attribute (and a ``residue`` attribute when
            ``with_ids`` is set).
        with_ids: when true, also return the ``residue`` attribute of every
            residue that contributed a letter.

    Returns:
        str, or (str, list) when ``with_ids`` is true.
    """
    standard_names = {
        'HID': 'HIS',
        'CYX': 'CYS',
        'ASX': 'ASP',
        'GLX': 'GLU',
    }
    excluded = ('SEC', 'PYL')

    for residue in residues:
        if residue.resname in standard_names:
            residue.resname = standard_names[residue.resname]

    kept = [residue for residue in residues
            if residue.resname not in excluded]
    seq = "".join(poly.three_to_one(residue.resname) for residue in kept)
    if with_ids:
        return seq, [residue.residue for residue in kept]
    return seq
Esempio n. 26
0
 def get_sequence(self, chain_id):
     '''
         Input:
             self: uses the Biopython.PDB structure stored in self.structure
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             The amino acid sequence (single-letter alphabet!) of the given
             chain (chain_id) as a string. Residues whose name cannot be
             translated by three_to_one are left out.
     '''
     # NOTE(review): every model of the structure is visited, so a structure
     # with several models yields the chain once per model - confirm intended.
     letters = []
     for model in self.structure:
         for residue in model[chain_id].get_residues():
             try:
                 letters.append(pp.three_to_one(residue.get_resname()))
             except Exception:
                 # best effort: skip untranslatable residues, but no longer
                 # swallow KeyboardInterrupt/SystemExit like a bare except
                 continue
     return ''.join(letters)
Esempio n. 27
0
def extract_seqs(structure, defmodel):
    '''
    Uses Biopython to count the number of chains and to extract each
    chain's sequence as a list of sequences.
    Called by: clean_and_sort()

    Only the model whose id equals defmodel is read. Residues that are not
    standard amino acids are written as 'X'.

    Returns (nchains, seqs, chain_ids). When no model has the id defmodel
    the result is (0, [], []) instead of an UnboundLocalError.
    '''
    nchains = 0
    # defined up front so the return is valid even when no model matches
    seqs = []
    chain_ids = []
    for model in structure:
        if model.id != defmodel:
            continue
        for chain in model:
            nchains += 1
            seqlist = []
            for residue in chain:
                resname = residue.get_resname()
                if bpp_poly.is_aa(resname, standard=True):
                    seqlist.append(bpp_poly.three_to_one(resname))
                else:
                    seqlist.append('X')
            seqs.append("".join(seqlist))
            chain_ids.append(chain.id)
    return nchains, seqs, chain_ids
Esempio n. 28
0
def get_structure_seqrecords(model):
    """Build one SeqRecord per chain of a Biopython model.

    Only residues that pass ``Polypeptide.is_aa(res, standard=True)``
    contribute a letter; all other residues are skipped. Gaps in the residue
    numbering are filled with 'X' when the next standard residue is reached.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - Skipped (non-standard) residues leave a numbering gap, which shows up as "X" (unknown amino acid)
          when the next standard residue without an insertion code is reached.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords, one per chain, with the chain id as record id and the residue numbers stored
        in ``letter_annotations['structure_resnums']`` (``float("Inf")`` for every filled-in "X")

    """

    structure_seq_records = []

    # Loop over each chain of the PDB
    for chain in model:
        # tracker holds the residue number the sequence has been filled up to
        tracker = 0
        chain_seq = ''
        chain_resnums = []

        # Loop over the residues
        for res in chain.get_residues():
            # res.id is indexed as (_, residue number, insertion code) below
            res_id = res.id
            res_num = res_id[1]
            res_icode = res_id[2]

            # Only standard residues are written out. A non-standard residue
            # (ie. selenomethionine) is skipped here and its position is
            # filled in with an X on a later iteration
            if Polypeptide.is_aa(res, standard=True):
                end_tracker = res_num
                res_aa_one = Polypeptide.three_to_one(res.get_resname())

                # Tracker to fill in X's: the residue does not directly follow
                # the last recorded number
                if end_tracker != (tracker + 1):
                    if res_icode != ' ':
                        # Residue carrying an insertion code: written without
                        # any X padding; tracker is moved one past its number
                        chain_seq += res_aa_one
                        chain_resnums.append(res_num)
                        tracker = end_tracker + 1
                        continue
                    else:
                        # One X per residue number between tracker and this
                        # residue (a non-positive multiplier adds nothing)
                        multiplier = (end_tracker - tracker - 1)
                        chain_seq += 'X' * multiplier
                        # Residue numbers for unresolved or nonstandard residues are Infinite
                        chain_resnums.extend([float("Inf")] * multiplier)

                chain_seq += res_aa_one
                chain_resnums.append(res_num)
                tracker = end_tracker

            else:
                continue

        # The record id is the chain id; residue numbers travel alongside the
        # letters as a per-letter annotation
        chain_seq_record = SeqRecord(Seq(chain_seq, IUPAC.protein),
                                     id=chain.get_id())
        chain_seq_record.letter_annotations[
            'structure_resnums'] = chain_resnums
        structure_seq_records.append(chain_seq_record)

    return structure_seq_records
Esempio n. 29
0
def build_matrix(
        path: str,
        filename: str,
        truncate_log: Union[tqdm.tqdm, None] = None) -> BuildMatrixDict:
    """Build the input matrix for one protein.

    The matrix has 10 rows and 4000 columns, one column per residue of the
    first model (all chains, in order). For a standard amino acid the column
    holds its code, the rounded mean x/y/z of its atoms and the properties
    looked up in ``amino_acid``; for any other residue the column stays 0.
    Residues beyond the 4000th column are dropped and, when a logger is
    given, the truncation is reported on it.

    Args:
        path: path of the pdb file.
        filename: name of the file (without extension).
        truncate_log: tqdm logger

    Returns:
        Build matrix dictionary whose ``col_name`` entries are single-row
        pyarrow arrays, one per matrix row.
    """
    PROTEIN_SEQ_MAX_LEN = 4000
    n_rows = 10
    protein_matrix = [[0] * PROTEIN_SEQ_MAX_LEN for _ in range(n_rows)]

    protein_structure = PDBParser().get_structure(filename, path)
    first_model = list(protein_structure.get_models())[0]
    protein_chains = list(first_model.get_chains())

    col = 0

    try:
        for chain in protein_chains:
            for residue in list(chain.get_residues()):
                if Polypeptide.is_aa(residue.get_resname(), standard=True):
                    vectors = [atom.get_vector()
                               for atom in list(residue.get_atoms())]

                    # calculate position of residue
                    x = round(mean([vec[0] for vec in vectors]))
                    y = round(mean([vec[1] for vec in vectors]))
                    z = round(mean([vec[2] for vec in vectors]))

                    # one letter code
                    code = Polypeptide.three_to_one(residue.get_resname())
                    aa = amino_acid[code]

                    column_values = (
                        aa["code"],
                        x,
                        y,
                        z,
                        aa["hydropathy"],
                        aa["hydropathy_index"],
                        aa["acidity_basicity"],
                        aa["mass"],
                        aa["isoelectric_point"],
                        aa["charge"],
                    )
                    # raises IndexError on the first row once col is past
                    # the last column, before anything is written
                    for row, value in zip(protein_matrix, column_values):
                        row[col] = value

                # Even if the current residue is not an amino acid the column
                # advances; 0 stays stored at this position in that case.
                col += 1

    except IndexError:
        if truncate_log is not None:
            truncate_log.set_description_str(
                f"Protein {filename} is truncated.")

    # Prepare dict so it can be loaded into a vaex dataframe
    dic: BuildMatrixDict = {
        key: [[]]
        for key in (
            "seq",
            "x_pos",
            "y_pos",
            "z_pos",
            "hydropathy",
            "hydropathy_index",
            "acidity_basicity",
            "mass",
            "isoelectric_point",
            "charge",
        )
    }

    for i in range(n_rows):
        dic[col_name[i]] = pyarrow.array(
            [protein_matrix[i][:PROTEIN_SEQ_MAX_LEN]])

    return dic
Esempio n. 30
0
def get_structure_seqs(pdb_file, file_type):
    """Get a dictionary of a PDB file's sequences.

    Only the first model is read. Every standard amino acid is written as its
    one-letter code; residue numbers that are skipped between two written
    residues are filled in with "X" (unknown amino acid).

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - Non-standard residues (e.g. selenomethionine, HETATMs). They are not translated; their
          residue numbers show up as "X" once a later standard residue reveals the gap.
        - Numbering is tracked starting from 0, so a chain whose first standard residue is
          numbered k > 1 starts with k - 1 "X" characters.

    Args:
        pdb_file: Path to PDB file
        file_type: Not used by this function; kept so existing callers keep working.

    Returns:
        dict: Dictionary of:
        {chain_id: sequence}

    """

    # TODO: Please check out capitalization of chain IDs in mmcif files. example: 5afi - chain "l" is present but
    # it seems like biopython capitalizes it to chain L

    # Get the first model
    my_structure = StructureIO(pdb_file)
    model = my_structure.first_model

    structure_seqs = {}

    # Loop over each chain of the PDB
    for chain in model:
        pieces = []
        # Residue number of the last residue written to the sequence
        tracker = 0

        for res in chain.get_residues():
            # Non-standard residues are skipped here; the hole they leave in the
            # numbering is filled with X's when the next standard residue is seen.
            if not Polypeptide.is_aa(res, standard=True):
                continue

            # full_id[3] is the residue id tuple; index 1 is the residue number.
            res_num = res.get_full_id()[3][1]
            aa = Polypeptide.three_to_one(res.get_resname())

            # Fill every skipped residue number with an X. Residues sharing a
            # number (insertion codes: 15, 15A, 15B) give a gap of -1, so they are
            # written back to back, and because the tracker always holds the real
            # residue number a gap directly before or after an insertion-coded
            # residue is no longer lost.
            gap = res_num - tracker - 1
            if gap > 0:
                pieces.append('X' * gap)

            pieces.append(aa)
            tracker = res_num

        structure_seqs[chain.get_id()] = ''.join(pieces)

    return structure_seqs
Esempio n. 31
0
    def create_residues(self, args):
        """Create Residue records and their generic numbers from residue dump files.

        Each name in ``args`` is resolved relative to ``self.dump_source_dir``.
        Files that do not exist are reported and skipped. Every row of an
        existing file is handed to ``_create_residue_from_dump_row``.

        Args:
            args: Iterable of dump file names located in ``self.dump_source_dir``.
        """
        schemes = self._load_residue_numbering_schemes()

        # Proteins already known to be absent from the db; a set because
        # membership is tested once per dump row.
        missing_proteins = set()

        self.logger.info('CREATING RESIDUES')
        for arg in args:
            dump_path = os.sep.join([self.dump_source_dir, arg])
            if not os.path.exists(dump_path):
                print("Failed to open file {!s}".format(dump_path))
                self.logger.error("Failed to open file {!s}".format(dump_path))
                continue

            # Context manager so the handle is closed even when a row raises
            with open(dump_path, 'r') as residue_data_fh:
                self.logger.info('Parsing residue data from {}'.format(arg))
                for line in residue_data_fh:
                    self._create_residue_from_dump_row(line, schemes, missing_proteins)
        self.logger.info('COMPLETED CREATING RESIDUES')

    def _load_residue_numbering_schemes(self):
        """Build the numbering scheme lookup: scheme type, db object and mapping table.

        Returns:
            dict keyed by scheme slug. Each value holds 'type' (False, 'structure'
            or 'sequence'), 'obj' (the ResidueNumberingScheme row), for structure
            schemes 'seq_based' (slug of the paired sequence-based scheme) and,
            only when ``mapping_<slug>.txt`` exists in
            ``self.generic_numbers_source_dir``, 'table' (first column -> second
            column of that file).
        """
        schemes = {
            'gpcrdb': {'type': False},
            'gpcrdba': {
                'type': 'structure',
                'seq_based': 'bw',
            },
            'gpcrdbb': {
                'type': 'structure',
                'seq_based': 'woot',
            },
            'gpcrdbc': {
                'type': 'structure',
                'seq_based': 'pin',
            },
            'gpcrdbf': {
                'type': 'structure',
                'seq_based': 'wang',
            },
            'bw': {'type': 'sequence'},
            'woot': {'type': 'sequence'},
            'pin': {'type': 'sequence'},
            'wang': {'type': 'sequence'},
        }

        for scheme_name, scheme in schemes.items():
            scheme['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
            mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
            if os.path.isfile(mapping_file):
                with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                    scheme['table'] = {}
                    for row in scheme_table_file:
                        split_row = shlex.split(row)
                        scheme['table'][split_row[0]] = split_row[1]
        return schemes

    def _create_residue_from_dump_row(self, line, schemes, missing_proteins):
        """Create one Residue, with its generic numbers, from one dump file row.

        The row must hold exactly ten comma-separated fields (a ValueError from
        the unpacking propagates otherwise). Rows are skipped when the protein
        has no conformation in the default state, or when a residue with that
        sequence number already exists for the conformation.

        Args:
            line: Raw text row from the dump file.
            schemes: Lookup returned by ``_load_residue_numbering_schemes``.
            missing_proteins: Set of entry names without a conformation in the
                db; extended in place.
        """
        # Whitespace is stripped first, then the surrounding double quotes.
        (_dump_id, res_num, res_name, oli, gpcrdb, bw, _bw2, _bs, prot_name,
            sec_str_name) = [x.strip().strip('"') for x in line.split(',')]
        if prot_name in missing_proteins:
            return

        # Checking if the protein exists in the db
        try:
            pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            missing_proteins.add(prot_name)
            return

        # Checking if given residue already exists in the db
        try:
            Residue.objects.get(protein_conformation=pconf.id, sequence_number=res_num)
        except Residue.DoesNotExist:
            pass
        else:
            return

        r = Residue()
        r.protein_conformation = pconf
        r.sequence_number = int(res_num)
        r.amino_acid = polypeptide.three_to_one(res_name.upper())

        # The residue has to be saved before numbers can be attached to it
        try:
            r.save()
            self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                r.amino_acid, pconf.protein.entry_name))
        except Exception as msg:
            print(msg)
            self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                r.sequence_number, r.amino_acid, pconf.protein.entry_name))
            return

        # residue segment (best effort: a failed lookup is logged, not fatal)
        try:
            r.protein_segment = ProteinSegment.objects.get(slug=sec_str_name)
        except Exception:
            self.logger.error('Failed to fetch protein segment {}'.format(sec_str_name))

        # generic number
        if str(oli) != '0' and gpcrdb != 'None' and bw != 'None':
            self._add_generic_numbers_to_residue(r, pconf, schemes, oli, gpcrdb, bw)

        try:
            r.save()
            self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                res_name, pconf.protein.entry_name))
        except Exception as msg:
            print(msg)
            self.logger.error(
                'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                    res_name, pconf.protein.entry_name))

    def _add_generic_numbers_to_residue(self, r, pconf, schemes, oli, gpcrdb, bw):
        """Attach the default, sequence-based and structure-based numbers to ``r``.

        Nothing is attached when the dump number is absent from the default
        scheme's table. A number that has no entry in a conversion table is
        logged and skipped for that scheme, instead of silently reusing the
        label computed for a previous scheme or residue.

        Args:
            r: Saved Residue to annotate (it is not saved again here).
            pconf: ProteinConformation the residue belongs to.
            schemes: Lookup returned by ``_load_residue_numbering_schemes``.
            oli: Dump number looked up in the default scheme's table; a fourth
                character, if present, is the bulge digit.
            gpcrdb: Structure-based number from the dump (first 4 characters used).
            bw: Sequence-based number from the dump, in the protein's own scheme.
        """
        # separate bulge number (1241 - > 124 + 1)
        bulge_prime = ''
        dump_oliveira = str(oli)
        if len(dump_oliveira) == 4:
            bulge_prime = dump_oliveira[3]
            dump_oliveira = dump_oliveira[:3]
        dump_gpcrdb = gpcrdb[:4]

        # default gpcrdb number
        default_scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]
        if dump_oliveira not in default_scheme['table']:
            return
        default_label = default_scheme['table'][dump_oliveira] + bulge_prime
        r.generic_number = self._fetch_or_create_generic_number(
            default_label, default_scheme['obj'], r, log_creation=True)

        protein_slug = pconf.protein.residue_numbering_scheme.slug
        protein_seq_scheme = schemes[protein_slug]['seq_based']

        # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
        seq_based_labels = {}

        # sequence-based schemes first (the sequence-based numbers are needed for the
        # structure based schemes)
        for scheme_name, scheme in schemes.items():
            if scheme['type'] != 'sequence':
                continue

            if scheme_name == protein_seq_scheme:
                # the dump number is already in this scheme
                seq_based_label = bw
            else:
                # convert the number to this scheme
                seq_based_label = self._translate_generic_label(
                    schemes[protein_seq_scheme], scheme, bw)
            if seq_based_label is None:
                self.logger.error('No {!s} equivalent found for sequence-based number {!s}'.format(
                    scheme_name, bw))
                continue

            seq_based = self._fetch_or_create_generic_number(seq_based_label, scheme['obj'], r)
            r.alternative_generic_numbers.add(seq_based)

            # add added number to the dict for later use
            seq_based_labels[scheme_name] = seq_based_label

        # structure-based numbers
        for scheme_name, scheme in schemes.items():
            if scheme['type'] != 'structure':
                continue

            is_display_scheme = scheme_name == protein_slug
            if is_display_scheme:
                struct_based_label = dump_gpcrdb + bulge_prime
            else:
                # convert the number to this scheme
                struct_based_label = self._translate_generic_label(
                    schemes[protein_slug], scheme, dump_gpcrdb)
                if struct_based_label is not None:
                    struct_based_label += bulge_prime
            if struct_based_label is None or scheme['seq_based'] not in seq_based_labels:
                self.logger.error('No {!s} equivalent found for structure-based number {!s}'.format(
                    scheme_name, dump_gpcrdb))
                continue

            # add the sequence-based label (5x461 -> 5.46x461)
            split_struct_based_label = struct_based_label.split('x')
            struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x' +
                split_struct_based_label[1])

            struct_based = self._fetch_or_create_generic_number(struct_based_label, scheme['obj'], r)

            # the protein's own scheme is the display number, all others are alternatives
            if is_display_scheme:
                r.display_generic_number = struct_based
            else:
                r.alternative_generic_numbers.add(struct_based)

    def _fetch_or_create_generic_number(self, label, scheme_obj, residue, log_creation=False):
        """Return the ResidueGenericNumber for (label, scheme), creating it when absent.

        Args:
            label: Generic number label.
            scheme_obj: ResidueNumberingScheme the label belongs to.
            residue: Residue whose protein segment is copied onto a new number;
                only read when the number has to be created.
            log_creation: Log an info message when a new number is created.
        """
        try:
            return ResidueGenericNumber.objects.get(label=label, scheme=scheme_obj)
        except ResidueGenericNumber.DoesNotExist:
            number = ResidueGenericNumber()
            number.label = label
            number.scheme = scheme_obj
            number.protein_segment = residue.protein_segment
            number.save()
            if log_creation:
                self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                    .format(label, scheme_obj.short_name))
            return number

    @staticmethod
    def _translate_generic_label(source_scheme, target_scheme, label):
        """Convert ``label`` between two schemes through their shared table keys.

        Returns the target scheme's label for the first key whose value in the
        source table equals ``label``, or None when the source table has no
        such value. A key missing from the target table raises KeyError.
        """
        for key, value in source_scheme['table'].items():
            if value == label:
                return target_scheme['table'][key]
        return None
Esempio n. 32
0
def upload(request):
    """Handle the modelling pipeline upload page.

    On POST the view:
      1. stores the uploaded target sequence ('proteina') and template
         structure ('documento'),
      2. writes the sequence of the selected template chain to
         ``media\\<template>.fasta`` (one letter per CA ATOM record),
      3. concatenates it with the target file into ``media\\alinha.fasta`` and
         runs clustalw2 on it with PIR output,
      4. rewrites the headers of ``media\\alinha.pir`` into
         ``media\\new_alinha.pir`` as ``>P1;`` / ``structure:`` / ``sequence:``
         entries,
      5. runs the external modelling and clean-up steps and records the result,
    then renders the page with the stored directories. For any other method
    the page is rendered without context.

    All paths use Windows separators and are relative to the working
    directory, as the rest of the pipeline expects.
    NOTE(review): the files are read back through ``media\\<uploaded name>``,
    not through the name returned by ``fs.save`` - confirm the two always match.
    """
    # Pipeline finished - only picking the best generated model is still missing.
    # Rodrigo 27/07/2020.

    if request.method == 'POST':
        proteina = request.FILES['proteina']
        template = request.FILES['documento']

        # Chain of the target sequence
        cadeiaS = request.POST.get('cadeiaS').upper()

        # Chain of the template (supplied by the form since the 23/09 update)
        cadeia = request.POST.get('cadeiaT').upper()

        fs = FileSystemStorage()
        fs.save(proteina.name, proteina)
        fs.save(template.name, template)

        template_path = "media\\" + template.name
        template_fasta_path = template_path + ".fasta"
        proteina_path = "media\\" + proteina.name

        # First and highest residue number seen on the template chain
        comeco = 0
        fim = 0
        with open(template_fasta_path, "w") as w, open(template_path) as pdb:
            w.write(">"+template.name+"\n")
            for linha in pdb:
                # One CA (alpha carbon) ATOM record per residue of the chosen chain.
                # Slices instead of an index so truncated ATOM lines cannot raise.
                if linha[0:4] == "ATOM" and linha[21:22] == cadeia and linha[13:15] == 'CA':
                    resname3 = linha[17:20]
                    numero = int(linha[22:26])
                    if comeco == 0:
                        comeco = numero
                    if numero > fim:
                        fim = numero
                    w.write(Polypeptide.three_to_one(resname3))
            w.write("\n")

        # Template FASTA followed by the target file. Done with plain file I/O:
        # the uploaded file names must never be interpolated into a shell command.
        with open("media\\alinha.fasta", "wb") as alinha:
            for parte in (template_fasta_path, proteina_path):
                with open(parte, "rb") as origem:
                    alinha.write(origem.read())

        # run clustal-w
        os.system("clustalw2 -infile=media\\alinha.fasta -output=pir")

        # Length of the target sequence (every line that is not a FASTA header)
        with open(proteina_path) as seq:
            seq_final = "".join(linha.strip() for linha in seq if linha[0] != ">")
        tamanho_seq = len(seq_final)
        print("tamanho da seq = "+str(tamanho_seq))

        tipo = 0  # 0 = PDB; 1 = SEQ
        with open("media\\alinha.pir") as aln, open("media\\new_alinha.pir", "w") as new_aln:
            for linha in aln:
                if linha[0] == ">":
                    if tipo == 0:
                        # the first header belongs to the template structure
                        new_aln.write(">P1;"+template.name+"\n")
                        new_aln.write("structure:"+template.name+":"+str(comeco)+":"+cadeia+":"+str(fim)+":"+cadeia+"::::\n")
                        tipo = tipo+1
                    elif tipo == 1:
                        # the second header is the sequence that will be modelled
                        # NOTE(review): no line break is written after this entry;
                        # presumably the next line copied from the alignment ends it - confirm.
                        new_aln.write(">P1;"+proteina.name+"\n")
                        new_aln.write("sequence:"+proteina.name+":"+str(1)+":"+str(cadeiaS)+":"+str(tamanho_seq)+":"+str(cadeiaS)+"::::")
                else:
                    new_aln.write(linha)

        # ****************************************************************************************
        # modeller
        # ****************************************************************************************
        criaScript(proteina, template)

        os.system("media\\Modeller.lnk")

        # Update the dynamic .bat file so it matches the result folder
        atualizaArquivoBAT(proteina.name)
        max_id = inserirDiretorio(proteina.name)

        os.system("media\\clear.lnk")

        inserirArquivos(proteina.name, max_id)

        d = Banco()

        diretorios = d.getDiretorios()

        print(diretorios[0][0])

        # Still missing here: selecting the best generated model
        return render(request, 'pipeline/upload.html', {'resultado': '1', 'diretorios': diretorios[0:] } )

    # The directories are fetched but not handed to the template on this path
    d = Banco()

    diretorios = d.getDiretorios()

    return render(request, 'pipeline/upload.html')
    def handle(self, *args, **options):
        startTime = datetime.datetime.now()
        self.options = options
        if self.options["purge"]:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith="_a",
                protein_conformation__protein__family__parent__parent__name=
                "Alpha").delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith="_a",
                protein__family__parent__parent__name="Alpha").delete()
            Protein.objects.filter(
                entry_name__endswith="_a",
                family__parent__parent__name="Alpha").delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options["only_signprot_structures"]:
            # Building protein and protconf objects for g protein structure in complex
            if options["s"]:
                scs = SignprotComplex.objects.filter(
                    structure__pdb_code__index__in=[
                        i.upper() for i in options["s"]
                    ])
            else:
                scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                    .format(sc))
                try:
                    # Alpha subunit
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=sc.structure.pdb_code.index.lower() +
                            "_a")
                    except:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.accession = None
                        alpha_protein.name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug="mod")
                        alpha_protein.source = ProteinSource.objects.get(
                            name="OTHER")
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=sc.structure.pdb_code.index.
                            lower() + "_a")
                    except:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug="active")
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    s = pdbp.get_structure("struct",
                                           StringIO(sc.structure.pdb_data.pdb))
                    chain = s[0][sc.alpha]
                    nums = []
                    for res in chain:
                        if "CA" in res and res.id[0] == " ":
                            nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    num_i = 0
                    temp_seq2 = ""
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        if sc.structure.pdb_code.index == "6OIJ" and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]
                    # Find mismatches
                    mismatches = []
                    for n, res in pdb_num_dict.items():
                        if AA[res[0].get_resname()] != res[1].amino_acid:
                            mismatches.append(res)

                    pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                    seqadv = []
                    for l in pdb_lines:
                        if l.startswith("SEQADV"):
                            seqadv.append(l)
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for s in seqadv:
                        line_search = re.search(
                            "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)",
                            s)
                        if line_search != None:
                            if line_search.group(2) == sc.alpha:
                                if line_search.group(
                                        4).strip() == sc.protein.accession:
                                    if line_search.group(
                                            3) == line_search.group(6):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    else:
                                        shifted_mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5),
                                                int(line_search.group(6))
                                            ]
                                else:
                                    # Exception for 6G79
                                    if line_search.group(
                                            3
                                    ) != line_search.group(
                                            6
                                    ) and "CONFLICT" in line_search.group(7):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    # Exception for 5G53
                                    if line_search.group(
                                            4).strip() != sc.protein.accession:
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                    remaining_mismatches = []

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        elif num in shifted_mutations:
                            remaining_mismatches.append(m)
                        else:
                            remaining_mismatches.append(m)

                    if options["debug"]:
                        print(sc)
                        print(mutations)
                        print(shifted_mutations)
                        print(mismatches)
                        print("======")
                        print(remaining_mismatches)
                        pprint.pprint(pdb_num_dict)

                    no_seqnum_shift = [
                        '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                        '7L1V'
                    ]

                    # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                    if sc.protein.entry_name != 'gnai1_human' and len(
                            remaining_mismatches) > 0:
                        target_HN = resis.filter(protein_segment__slug='HN')
                        gnai1_HN = Residue.objects.filter(
                            protein_conformation__protein__entry_name=
                            'gnai1_human',
                            protein_segment__slug='HN')
                        pdb_HN_seq = ''
                        for num, val in pdb_num_dict.items():
                            if num <= target_HN.reverse()[0].sequence_number:
                                pdb_HN_seq += Polypeptide.three_to_one(
                                    val[0].get_resname())
                        if options['debug']:
                            print('Checking if HN is gnai1_human')
                            print(pdb_HN_seq)
                            print(''.join(
                                gnai1_HN.values_list('amino_acid', flat=True)))
                        gnai1_HN_seq = ''.join(
                            gnai1_HN.values_list('amino_acid', flat=True))
                        pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq,
                                                      3, -4, -3, -1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        length, match = 0, 0
                        for r, t in zip(ref_seq, temp_seq):
                            if options['debug']:
                                print(r, t)
                            if t != '-':
                                if r == t:
                                    match += 1
                                length += 1
                        identity = match / length * 100
                        if options['debug']:
                            print(identity)
                        if identity > 85:
                            if sc.structure.pdb_code.index not in ['7DFL']:
                                no_seqnum_shift.append(
                                    sc.structure.pdb_code.index)
                            if options['debug']:
                                print(
                                    'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                    .format(round(identity)))

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if len(
                            remaining_mismatches
                    ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift:
                        ppb = PPBuilder()
                        seq = ""
                        for pp in ppb.build_peptides(chain, aa_only=False):
                            seq += str(pp.get_sequence())
                        if sc.structure.pdb_code.index in [
                                '7JVQ', '7L1U', '7L1V'
                        ]:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 3, -4, -3, -1)
                        else:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                        # Custom fix for A->G mutation at pos 18
                        if sc.structure.pdb_code.index == '7JJO':
                            ref_seq = ref_seq[:18] + ref_seq[19:]
                            temp_seq = temp_seq[:17] + temp_seq[18:]
                        # Custom alignment fixes
                        elif sc.structure.pdb_code.index == '7DFL':
                            ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                            temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        elif sc.structure.pdb_code.index == '7JOZ':
                            temp_seq = temp_seq[:67] + (
                                '-' * 14) + 'FNGDS' + temp_seq[86:]
                        elif sc.structure.pdb_code.index == '7AUE':
                            ref_seq = ref_seq[:31].replace('-',
                                                           '') + ref_seq[31:]
                            temp_seq = (
                                9 *
                                '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                    '-', '') + temp_seq[54:]
                        wt_pdb_dict = OrderedDict()
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, ref, temp in zip(range(0, len(ref_seq)),
                                                ref_seq, temp_seq):
                            if options["debug"]:
                                print(i, ref, temp)  # alignment check
                            if ref != "-" and temp != "-":
                                wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == "-":
                                wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == "-":
                                wt_pdb_dict[resis[j]] = i
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        # Custom fix for 7JJO isoform difference
                        if sc.structure.pdb_code.index in [
                                '7JJO', '7JOZ', '7AUE'
                        ]:
                            pdb_num_dict = OrderedDict()
                            for wt_res, st_res in wt_pdb_dict.items():
                                if type(st_res) == type([]):
                                    pdb_num_dict[wt_res.sequence_number] = [
                                        st_res[0], wt_res
                                    ]
                        else:
                            for i, r in enumerate(remaining_mismatches):
                                # Adjust for shifted residue when residue is a match
                                if r[0].get_id()[1] - remaining_mismatches[
                                        i - 1][0].get_id()[1] > 1:
                                    pdb_num_dict[r[0].get_id()[1] -
                                                 1][1] = pdb_wt_dict[chain[
                                                     r[0].get_id()[1] - 1]]
                                # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                                if r[0].get_id()[1] in shifted_mutations:
                                    pdb_num_dict[
                                        r[0].get_id()[1]][1] = resis.get(
                                            sequence_number=shifted_mutations[
                                                r[0].get_id()[1]][2])
                                # Adjust for shift
                                else:
                                    pdb_num_dict[r[0].get_id()
                                                 [1]][1] = pdb_wt_dict[r[0]]
                            if sc.structure.pdb_code.index == '7JVQ':
                                pdb_num_dict[198][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=346)
                                pdb_num_dict[235][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=383)
                            elif sc.structure.pdb_code.index == '6PB0':
                                pdb_num_dict[205][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=205)
                    ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    elif sc.structure.pdb_code.index == "6WHA":
                        ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                        temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                        pdb_num_dict = OrderedDict()
                        temp_resis = [res for res in chain]
                        temp_i = 0
                        mapped_cgns = []
                        for i, aa in enumerate(temp_seq):
                            if aa != "-":
                                ref_split_on_gaps = ref_seq[:i + 1].split("-")
                                ref_seqnum = i - (len(ref_split_on_gaps) -
                                                  1) + 1
                                res = resis.get(sequence_number=ref_seqnum)
                                if res.display_generic_number.label in mapped_cgns:
                                    next_presumed_cgn = self.get_next_presumed_cgn(
                                        res)
                                    if next_presumed_cgn:
                                        res = next_presumed_cgn
                                        while res and res.display_generic_number.label in mapped_cgns:
                                            res = self.get_next_presumed_cgn(
                                                res)
                                    else:
                                        print(
                                            "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                            .format(next_presumed_cgn,
                                                    chain[nums[temp_i]],
                                                    sc.structure))
                                mapped_cgns.append(
                                    res.display_generic_number.label)
                                pdb_num_dict[nums[temp_i]] = [
                                    chain[nums[temp_i]], res
                                ]
                                temp_i += 1

                    bulked_rotamers = []
                    for key, val in pdb_num_dict.items():
                        # print(key, val) # sanity check
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            res_obj.save()
                            rot = self.create_structure_rotamer(
                                val[0], res_obj, sc.structure)
                            bulked_rotamers.append(rot)
                        else:
                            self.logger.info(
                                "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                                .format(val[1], sc))
                    if options["debug"]:
                        pprint.pprint(pdb_num_dict)
                    Rotamer.objects.bulk_create(bulked_rotamers)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                        .format(sc))
                except Exception as msg:
                    if options["debug"]:
                        print("Error: ", sc, msg)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed"
                        .format(sc))

        if not options["s"]:
            ### Build SignprotStructure objects from non-complex signprots
            g_prot_alphas = Protein.objects.filter(
                family__slug__startswith="100_001",
                accession__isnull=False)  #.filter(entry_name="gnai1_human")
            complex_structures = SignprotComplex.objects.all().values_list(
                "structure__pdb_code__index", flat=True)
            for a in g_prot_alphas:
                pdb_list = get_pdb_ids(a.accession)
                for pdb in pdb_list:
                    if pdb not in complex_structures:
                        try:
                            data = self.fetch_gprot_data(pdb, a)
                            if data:
                                self.build_g_prot_struct(a, pdb, data)
                        except Exception as msg:
                            self.logger.error(
                                "SignprotStructure of {} {} failed\n{}: {}".
                                format(a.entry_name, pdb, type(msg), msg))

        if options["debug"]:
            print(datetime.datetime.now() - startTime)
Esempio n. 34
0
        return ppl

if __name__ == '__main__':
    # Command-line entry point.
    #   argv[1]: PDB identifier; the structure is read from "<id>.pdb" and the
    #            results are written to "<id>.angles".
    #   argv[2]: identifier of the chain to process in every model.
    # For each residue of each polypeptide one tab-separated line is written:
    # residue number, one-letter residue code, then the first two values
    # returned by get_angle_list() for that residue.
    pdb = argv[1]
    cha = argv[2]

    # Factor applied to every angle before printing; 57.2957795 is 180/pi, so
    # get_angle_list() presumably yields radians -- confirm against its source.
    rad_to_deg = 57.2957795
    # Placeholder stored (and printed) for angles that cannot be converted.
    missing = "-999"

    # The context manager closes the file even when parsing or writing fails,
    # and keeps the close inside the __main__ guard so that importing this
    # module never touches an undefined `output`.
    with open(pdb + ".angles", "w") as output:
        for model in Bio.PDB.PDBParser().get_structure(pdb, pdb + ".pdb"):
            chain = model[cha]
            output.write(
                "##### PDB " + pdb + " Chain " + chain.get_id() + "\n")
            polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
            for poly in polypeptides:
                for j, angles_list in enumerate(get_angle_list(poly)):
                    # The j-th angle entry describes the j-th residue.
                    residue = poly[j]

                    torsion_angles = []
                    for raw_angle in angles_list:
                        try:
                            torsion_angles.append(
                                float(raw_angle) * rad_to_deg)
                        except (TypeError, ValueError):
                            # e.g. None for an undefined angle, or a
                            # non-numeric value.
                            torsion_angles.append(missing)

                    columns = [
                        str(residue.id[1]),
                        Polypeptide.three_to_one(residue.resname),
                    ]
                    # Only the first two angles are reported.
                    for angle in torsion_angles[0:2]:
                        if angle == missing:
                            columns.append(missing)
                        else:
                            columns.append("{0:.2f}".format(angle))
                    output.write("\t".join(columns) + "\n")
Esempio n. 35
0
 def get_chain_sequence(self, chain):
     """
     Build the one-letter sequence string for the residues of *chain*.

     Residues whose ``resname`` is not in ``self.residue_list`` are skipped.
     Any occurrence of 'HID' in a kept residue name is replaced by 'HIS'
     before it is passed to ``polypeptide.three_to_one``.
     """
     one_letter_codes = []
     for residue in chain:
         if residue.resname not in self.residue_list:
             continue
         three_letter = residue.resname.replace('HID', 'HIS')
         one_letter_codes.append(polypeptide.three_to_one(three_letter))
     return "".join(one_letter_codes)