Example #1
0
 def test_polypeptide(self):
     """Check peptides built from ``PDB/1A8O.pdb`` against stored values.

     The structure is processed twice: with ``PPBuilder`` (sequences and
     phi/psi angles) and with ``CaPPBuilder`` (sequences, CA atom serial
     numbers, tau and theta angles). Angles are compared to three decimal
     places.
     """
     p = PDBParser(PERMISSIVE=True)
     pdb1 = "PDB/1A8O.pdb"
     s = p.get_structure("scr", pdb1)
     expected_sequences = (
         "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
         "TETLLVQNANPDCKTILKALGPGATLEE",
         "TACQG",
     )
     # One entry per peptide; each entry lists (phi, psi) for residues 0-2.
     # A phi of None means the test expects no phi angle for that residue.
     expected_phi_psi = (
         (
             (None, -0.46297171497725553),
             (-1.0873937604007962, 2.1337707832637109),
             (-2.4052232743651878, 2.3807316946081554),
         ),
         (
             (None, -0.6810077089092923),
             (-1.2654003477656888, -0.58689987042756309),
             (-1.7467679151684763, -1.5655066256698336),
         ),
         (
             (None, -0.73222884210889716),
             (-1.1044740234566259, -0.69681334592782884),
             (-1.8497413300164958, 0.34762889834809058),
         ),
     )

     # Peptides from PPBuilder
     ppb = PPBuilder()
     pp = ppb.build_peptides(s)
     for index, sequence in enumerate(expected_sequences):
         self.assertEqual(str(pp[index].get_sequence()), sequence)
     for index, rows in enumerate(expected_phi_psi):
         phi_psi = pp[index].get_phi_psi_list()
         for position, (phi, psi) in enumerate(rows):
             if phi is None:
                 self.assertIsNone(phi_psi[position][0])
             else:
                 self.assertAlmostEqual(phi_psi[position][0], phi, places=3)
             self.assertAlmostEqual(phi_psi[position][1], psi, places=3)

     # Peptides from CaPPBuilder
     ppb = CaPPBuilder()
     pp = ppb.build_peptides(s)
     for index, sequence in enumerate(expected_sequences):
         self.assertEqual(str(pp[index].get_sequence()), sequence)
     self.assertEqual([ca.serial_number for ca in pp[0].get_ca_list()], [
         10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93, 104, 112, 124, 131,
         139, 150, 161, 173, 182, 189, 197, 208, 213, 222, 231, 236, 242,
         251, 260, 267, 276, 284
     ])
     taus = pp[1].get_tau_list()
     expected_taus = (0.3597907225123525, 0.43239284636769254,
                      0.99820157492712114)
     for position, tau in enumerate(expected_taus):
         self.assertAlmostEqual(taus[position], tau, places=3)
     thetas = pp[2].get_theta_list()
     expected_thetas = (1.6610069445335354, 1.7491703334817772,
                        2.0702447422720143)
     for position, theta in enumerate(expected_thetas):
         self.assertAlmostEqual(thetas[position], theta, places=3)
Example #2
0
def run_test():
    """Print a summary of ``PDB/a_structure.pdb`` and of its peptides.

    For every model, chain and residue the number of children is printed,
    followed by any point-mutation or atom-level disorder reported by
    ``is_disordered()``. The peptides of ``structure[1]`` are then printed
    twice, built with ``PPBuilder`` and with ``CaPPBuilder``, and finally
    ``quick_neighbor_search_test()`` is called.

    Each ``print`` call takes exactly one parenthesised expression, so the
    output is the same whether the module runs under Python 2 or Python 3.
    """
    from Bio.PDB import PDBParser, PPBuilder, CaPPBuilder

    # first make a PDB parser object
    p = PDBParser(PERMISSIVE=1)

    # get the structure, call it "example"
    structure = p.get_structure("example", "PDB/a_structure.pdb")

    # now loop over content and print some info
    for model in structure.get_list():
        model_id = model.get_id()
        print("Model %i contains %i chains." % (model_id, len(model)))
        for chain in model.get_list():
            chain_id = chain.get_id()
            print("\tChain '%s' contains %i residues." % (chain_id, len(chain)))
            for residue in chain.get_list():
                hetfield, resseq, icode = residue.get_id()
                print("\t\tResidue ('%s', %i, '%s') contains %i atoms." % (hetfield, resseq, icode, len(residue)))
                # The two disorder kinds are mutually exclusive, so ask once.
                disorder = residue.is_disordered()
                if disorder == 2:
                    # disorder due to a point mutation --- this is rare
                    print("\t\t\tThere is a point mutation present in the crystal at this position.")
                    # Prefixing each name with a space reproduces the
                    # "name1 name2." layout without trimming a trailing space.
                    names = "".join(" " + resname for resname in residue.disordered_get_id_list())
                    print("\t\t\tResidues at this position are" + names + ".")
                elif disorder == 1:
                    # count the number of disordered atoms
                    disordered_count = sum(1 for atom in residue.get_list() if atom.is_disordered())
                    if disordered_count > 0:
                        print("\t\t\tThe residue contains %i disordered atoms." % disordered_count)

    print("Polypeptides using C-N")
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("Polypeptides using CA-CA")
    ppb = CaPPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("NeighborSearch test")
    quick_neighbor_search_test()
Example #3
0
def run_test():
    """Print a summary of ``PDB/a_structure.pdb`` and of its peptides.

    For every model, chain and residue the number of children is printed,
    followed by any point-mutation or atom-level disorder reported by
    ``is_disordered()``. The peptides of ``structure[1]`` are then printed
    twice, built with ``PPBuilder`` and with ``CaPPBuilder``, and finally
    ``quick_neighbor_search_test()`` is called.

    Each ``print`` call takes exactly one parenthesised expression, so the
    output is the same whether the module runs under Python 2 or Python 3.
    """
    from Bio.PDB import PDBParser, PPBuilder, CaPPBuilder

    # first make a PDB parser object
    p = PDBParser(PERMISSIVE=1)

    # get the structure, call it "example"
    structure = p.get_structure("example", "PDB/a_structure.pdb")

    # now loop over content and print some info
    for model in structure.get_list():
        model_id = model.get_id()
        print("Model %i contains %i chains." % (model_id, len(model)))
        for chain in model.get_list():
            chain_id = chain.get_id()
            print("\tChain '%s' contains %i residues." % (chain_id, len(chain)))
            for residue in chain.get_list():
                hetfield, resseq, icode = residue.get_id()
                print("\t\tResidue ('%s', %i, '%s') contains %i atoms." % (
                    hetfield, resseq, icode, len(residue)))
                # The two disorder kinds are mutually exclusive, so ask once.
                disorder = residue.is_disordered()
                if disorder == 2:
                    # disorder due to a point mutation --- this is rare
                    print("\t\t\tThere is a point mutation present in the crystal at this position.")
                    # Prefixing each name with a space reproduces the
                    # "name1 name2." layout without trimming a trailing space.
                    names = "".join(
                        " " + resname
                        for resname in residue.disordered_get_id_list())
                    print("\t\t\tResidues at this position are" + names + ".")
                elif disorder == 1:
                    # count the number of disordered atoms
                    disordered_count = sum(
                        1 for atom in residue.get_list()
                        if atom.is_disordered())
                    if disordered_count > 0:
                        print("\t\t\tThe residue contains %i disordered atoms." % disordered_count)

    print("Polypeptides using C-N")
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("Polypeptides using CA-CA")
    ppb = CaPPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("NeighborSearch test")
    quick_neighbor_search_test()
Example #4
0
def write_backbone_angles(chain,
                          region=None,
                          offset=0,
                          outfile=sys.stdout,
                          header=False):
    """
    Print one tab-separated row of phi/psi values per residue of a chain.

    Peptides are built from ``chain`` with ``PPBuilder``. For each residue
    whose sequence number lies inside ``region`` (inclusive on both ends) a
    row is written to ``outfile`` holding: the chain id, the residue number,
    ``seq1`` of the residue name, the residue number plus ``offset``, and
    phi and psi each multiplied by ``RAD_FACTOR``. An angle that is ``None``
    is written as ``'NA'``.

    ``region`` is a ``(first, last)`` pair; ``None`` selects every residue.
    When ``header`` is true, ``HEADER`` is printed before the rows.
    """
    bounds = (0, float('inf')) if region is None else region

    builder = PPBuilder()
    built = builder.build_peptides(chain)

    if header:
        print(HEADER, file=outfile)

    for fragment in built:
        for res, (phi, psi) in zip(fragment, fragment.get_phi_psi_list()):
            number = res.get_id()[1]
            if not (bounds[0] <= number <= bounds[1]):
                continue
            # Fields are evaluated left to right, in the output column order.
            row = (
                chain.id,
                number,
                seq1(res.get_resname()),
                number + offset,
                'NA' if phi is None else phi * RAD_FACTOR,
                'NA' if psi is None else psi * RAD_FACTOR,
            )
            print(*row, sep='\t', file=outfile)
def CreateJoinedFastas(input_PDB_objects):
    """
    Write the sequences of several PDB objects into one FASTA file.

    The file name is every object's id followed by ``"_"``, concatenated,
    with a final ``".fa"``. Each object contributes a ``>id`` header line
    followed by the sequences of all peptides ``PPBuilder`` builds from it,
    written back to back on a single line. Records are separated by a
    newline; no newline follows the last record.

    Arguments:

    input_PDB_objects: list of PDB objects whose sequence will be added to
        the FASTA file. It is traversed twice, so it must be re-iterable.

    Returns the name of the file that was written.
    """
    polipeptide = PPBuilder()
    filename = "".join(obj.get_id() + "_" for obj in input_PDB_objects) + ".fa"

    # The context manager guarantees the data is flushed and the handle is
    # released even if building a peptide fails part-way through.
    with open(filename, 'w') as joined_fasta:
        for position, obj in enumerate(input_PDB_objects):
            if position:
                # Terminate the previous record's sequence line.
                joined_fasta.write("\n")
            joined_fasta.write(">" + obj.get_id() + "\n")
            for polipep in polipeptide.build_peptides(obj):
                joined_fasta.write(str(polipep.get_sequence()))

    return filename
def get_secondary_structure(structure):
    """Label residues with the Ramachandran region their phi/psi falls in.

    Each entry of ``rama_ss_ranges`` is ``(x, y, width, height, label,
    color)`` describing a rectangle in degrees; a residue gets the label of
    the first rectangle with ``x <= phi < x + width`` and
    ``y <= psi < y + height``. Residues lacking either angle, or matching no
    rectangle, keep the empty string.

    Returns a list of ``N`` labels, where ``N`` is a name this function reads
    from the enclosing module.
    """
    import math  # local import: only needed for the radian conversion below

    rama_ss_ranges = [(-180, -180, 80, 60, 'E', 'blue'),
                      (-180, 50, 80, 130, 'E', 'blue'),
                      (-100, -180, 100, 60, 'P', 'green'),
                      (-100, 50, 100, 130, 'P', 'green'),
                      (-180, -120, 180, 170, 'H', 'red'),
                      (0, -180, 180, 360, 'L', 'yellow')]

    # Calculate PSI and PHI
    ppb = PPBuilder()  # PolyPeptideBuilder
    ss = ["" for x in range(N)]
    for chain in structure:
        for pp in ppb.build_peptides(chain):
            # [(phi_residue_1, psi_residue_1), ...] in radians
            phi_psi = pp.get_phi_psi_list()
            # NOTE(review): i restarts at 0 for every peptide, so labels from
            # later peptides/chains overwrite earlier ones in ss -- confirm
            # this is intended for the inputs this is used with.
            for i, (phi, psi) in enumerate(phi_psi):
                # The terminal residues of a peptide have a None angle.
                if phi is None or psi is None:
                    continue
                # The rectangles above are in degrees, the angles are not.
                phi_deg = math.degrees(phi)
                psi_deg = math.degrees(psi)
                for x, y, w, h, ss_c, _color in rama_ss_ranges:
                    if x <= phi_deg < x + w and y <= psi_deg < y + h:
                        ss[i] = ss_c
                        break
    return ss
def SplitChain(PDB_objects):
    """
    Split PDB objects by chain, writing one PDB and one FASTA file per chain.

    For every chain id seen for the first time within an object, the chain
    is saved as ``<object id>_<chain id>.pdb``. If ``PPBuilder`` builds at
    least one peptide from that chain, ``<object id>_<chain id>.fa`` is
    written with a single header line followed by the sequences of all of
    the chain's peptides, back to back.

    Arguments:

    PDB_objects: list of PDB objects (with many chains) generated by the
        PDB parser.

    Returns the list of ``<object id>_<chain id>`` prefixes, one per chain
    written.
    """
    File_prefix = []
    polipeptide = PPBuilder()

    for pdb in PDB_objects:
        chain_names = set()
        io = PDBIO()

        for chain in pdb.get_chains():
            chain_id = chain.get_id()
            # A chain id can repeat within one object; only the first
            # occurrence is written.
            if chain_id in chain_names:
                continue
            chain_names.add(chain_id)
            prefix = pdb.get_id() + "_" + chain_id

            # Creates a PDB file for each chain of the original file.
            io.set_structure(chain)
            io.save(prefix + ".pdb")
            File_prefix.append(prefix)

            # Creates a FASTA file for each chain of the original file.
            # Peptides come from this chain only, and the file is opened
            # once so that every fragment ends up in it.
            peptides = polipeptide.build_peptides(chain)
            if peptides:
                with open(prefix + ".fa", "w") as fasta:
                    fasta.write(">" + prefix + "\n")
                    fasta.write("".join(
                        str(pp.get_sequence()) for pp in peptides))

    return File_prefix
Example #8
0
    def test_ppbuilder_torsion(self):
        """Compare PPBuilder phi/psi angles with stored reference values."""
        peptides = PPBuilder().build_peptides(self.structure)

        # One row per peptide: psi of residue 0, then (phi, psi) of
        # residues 1 and 2. Residue 0 is expected to have no phi angle.
        reference = (
            (-0.46297171497725553,
             (-1.0873937604007962, 2.1337707832637109),
             (-2.4052232743651878, 2.3807316946081554)),
            (-0.6810077089092923,
             (-1.2654003477656888, -0.58689987042756309),
             (-1.7467679151684763, -1.5655066256698336)),
            (-0.73222884210889716,
             (-1.1044740234566259, -0.69681334592782884),
             (-1.8497413300164958, 0.34762889834809058)),
        )

        for number, (first_psi, second, third) in enumerate(reference):
            angles = peptides[number].get_phi_psi_list()
            self.assertIsNone(angles[0][0])
            self.assertAlmostEqual(angles[0][1], first_psi, places=3)
            for row, (phi, psi) in enumerate((second, third), start=1):
                self.assertAlmostEqual(angles[row][0], phi, places=3)
                self.assertAlmostEqual(angles[row][1], psi, places=3)
Example #9
0
    def compute_secondary_structure(self, model):
        """
        Assign a Ramachandran-region label to every residue of a model.

        Phi/psi angles are computed per chain with ``PPBuilder`` and converted
        to degrees. A residue lacking either angle is labelled ``'-'``; any
        other residue gets the label of the first entry of ``self._ranges``
        whose rectangle contains (phi, psi), or ``None`` if no entry matches.
        If fewer than ``self._residues`` residues were seen, the missing ones
        are added as ``'-'`` under chain key ``'Z'``.

        :param model: one model
        :return: list of labels, grouped by chain in order of first appearance
        """
        builder = PPBuilder()
        # { chain id : [[residue, ...], [phi_deg, ...], [psi_deg, ...]] }
        per_chain = {}

        seen = 0
        for chain in model:
            for peptide in builder.build_peptides(chain):
                angles = peptide.get_phi_psi_list()
                for idx, res in enumerate(peptide):
                    phi_rad = angles[idx][0]
                    psi_rad = angles[idx][1]
                    residues, phis, psis = per_chain.setdefault(
                        chain.id, [[], [], []])
                    residues.append(res)
                    if phi_rad is None or psi_rad is None:
                        # Incomplete pair: store NaN for both angles
                        phis.append(math.nan)
                        psis.append(math.nan)
                    else:
                        phis.append(math.degrees(phi_rad))
                        psis.append(math.degrees(psi_rad))
                    seen += 1

        # NaN padding for residues that produced no angle entry at all
        missing = self._residues - seen
        if missing > 0:
            for _ in range(missing):
                residues, phis, psis = per_chain.setdefault(
                    'Z', [[], [], []])
                residues.append(None)
                phis.append(math.nan)
                psis.append(math.nan)

        # Match every angle pair against the Ramachandran regions
        labels = []
        for residues, phis, psis in per_chain.values():
            for _res, phi, psi in zip(residues, phis, psis):
                if math.isnan(phi) and math.isnan(psi):
                    labels.append('-')
                    continue
                labels.append(next(
                    (label
                     for x, y, width, height, label, _color in self._ranges
                     if x <= phi < x + width and y <= psi < y + height),
                    None))

        return labels
Example #10
0
 def test_polypeptide(self):
     """Check peptides built from ``PDB/1A8O.pdb`` against stored values.

     The structure is processed twice: with ``PPBuilder`` (sequences and
     phi/psi angles) and with ``CaPPBuilder`` (sequences, CA atom serial
     numbers, tau and theta angles). Angles are compared to three decimal
     places.
     """
     p = PDBParser(PERMISSIVE=True)
     pdb1 = "PDB/1A8O.pdb"
     s = p.get_structure("scr", pdb1)
     expected_sequences = (
         "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
         "TETLLVQNANPDCKTILKALGPGATLEE",
         "TACQG",
     )
     # One entry per peptide; each entry lists (phi, psi) for residues 0-2.
     # A phi of None means the test expects no phi angle for that residue.
     expected_phi_psi = (
         (
             (None, -0.46297171497725553),
             (-1.0873937604007962, 2.1337707832637109),
             (-2.4052232743651878, 2.3807316946081554),
         ),
         (
             (None, -0.6810077089092923),
             (-1.2654003477656888, -0.58689987042756309),
             (-1.7467679151684763, -1.5655066256698336),
         ),
         (
             (None, -0.73222884210889716),
             (-1.1044740234566259, -0.69681334592782884),
             (-1.8497413300164958, 0.34762889834809058),
         ),
     )

     # Peptides from PPBuilder
     ppb = PPBuilder()
     pp = ppb.build_peptides(s)
     for index, sequence in enumerate(expected_sequences):
         self.assertEqual(str(pp[index].get_sequence()), sequence)
     for index, rows in enumerate(expected_phi_psi):
         phi_psi = pp[index].get_phi_psi_list()
         for position, (phi, psi) in enumerate(rows):
             if phi is None:
                 self.assertIsNone(phi_psi[position][0])
             else:
                 self.assertAlmostEqual(phi_psi[position][0], phi, places=3)
             self.assertAlmostEqual(phi_psi[position][1], psi, places=3)

     # Peptides from CaPPBuilder
     ppb = CaPPBuilder()
     pp = ppb.build_peptides(s)
     for index, sequence in enumerate(expected_sequences):
         self.assertEqual(str(pp[index].get_sequence()), sequence)
     self.assertEqual([ca.serial_number for ca in pp[0].get_ca_list()], [
         10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93, 104, 112, 124, 131,
         139, 150, 161, 173, 182, 189, 197, 208, 213, 222, 231, 236, 242,
         251, 260, 267, 276, 284
     ])
     taus = pp[1].get_tau_list()
     expected_taus = (0.3597907225123525, 0.43239284636769254,
                      0.99820157492712114)
     for position, tau in enumerate(expected_taus):
         self.assertAlmostEqual(taus[position], tau, places=3)
     thetas = pp[2].get_theta_list()
     expected_thetas = (1.6610069445335354, 1.7491703334817772,
                        2.0702447422720143)
     for position, theta in enumerate(expected_thetas):
         self.assertAlmostEqual(thetas[position], theta, places=3)
Example #11
0
 def test_c_n(self):
     """Build peptides from ``self.structure[1]`` with PPBuilder and check the ends."""
     peptides = PPBuilder().build_peptides(self.structure[1])
     self.assertEqual(len(peptides), 1)
     only_peptide = peptides[0]
     # Residue numbers of the first and last residue
     first_number = only_peptide[0].get_id()[1]
     self.assertEqual(first_number, 2)
     last_number = only_peptide[-1].get_id()[1]
     self.assertEqual(last_number, 86)
Example #12
0
 def get_structure_sequence(struct):
     # type: (Structure) -> str
     """
     Concatenates the sequences of all peptides PPBuilder builds from a structure
     :param struct: Structure object
     :return: the peptide sequences joined into one string, in build order
     """
     fragments = []
     for peptide in PPBuilder().build_peptides(struct):
         fragments.append(str(peptide.get_sequence()))
     return ''.join(fragments)
def is_protein(chain):
    """
    Tell whether PPBuilder finds a non-empty peptide in the chain.

    :param chain: entity handed to ``PPBuilder.build_peptides``
    :return: True if at least one built peptide has a sequence of length > 0,
        False otherwise
    """
    peptides = PPBuilder().build_peptides(chain)
    return any(len(peptide.get_sequence()) > 0 for peptide in peptides)
def chain_to_one_pp(chain):
    """Return all peptides of ``chain`` merged into a single polypeptide.

    Peptides are built with ``PPBuilder``. When more than one is found, a
    warning is printed and the residues of the later peptides are appended,
    in order, to the first one, which is then returned.

    Raises:
        IndexError: if no peptide can be built from ``chain``.
    """
    ppb = PPBuilder()

    polypeptides = ppb.build_peptides(chain)

    # Fail with an explicit message instead of the "extending first pp"
    # warning followed by a bare "list index out of range".
    if not polypeptides:
        raise IndexError('no polypeptides could be built from the chain')

    merged = polypeptides[0]
    if len(polypeptides) != 1:
        print('warning ', len(polypeptides),
              ' polypeptides from one chain, extending first pp')

        for pp in polypeptides[1:]:
            merged.extend(pp)

    return merged
Example #15
0
def structure_filtered_dca_get_sequence_from_structure(structure):
    """Return the sequences of all peptides of ``structure`` as one string.

    Peptides are built with ``PPBuilder(radius=10.0)`` and ``aa_only=False``;
    their sequences are concatenated in build order with every newline
    removed.
    """
    from Bio.PDB import PPBuilder

    builder = PPBuilder(radius=10.0)
    lines = [
        '%s\n' % peptide.get_sequence()
        for peptide in builder.build_peptides(structure, aa_only=False)
    ]
    return ''.join(lines).replace('\n', '')
Example #16
0
def run(infile, splitpdb):
    """Write FASTA (and PDB) files for the peptides found in a PDB file.

    ``infile`` is parsed and peptides are built per chain with ``PPBuilder``.
    File names start with the base name of ``infile`` without its extension
    (``prefix`` below).

    splitpdb == 0: the whole structure is saved as ``<prefix>_clean.pdb`` and
        every peptide is written as a record into ``<prefix>.fasta``.
    splitpdb == 1: every peptide gets its own
        ``<prefix>_<chain>.<index>.fasta`` and a matching ``.pdb`` produced
        by ``Dice_RPL.extract`` for the peptide's residue range.

    In both modes a chain from which no peptide can be built is extracted as
    a whole into ``<prefix>_<chain>.0.pdb``.

    Returns the list of chain ids encountered, in iteration order (a chain id
    appears once per model that contains it).

    Raises:
        ValueError: if a peptide has to be written and ``splitpdb`` is
            neither 0 nor 1.
    """
    parser = PDBParser()
    struct = parser.get_structure('mystruct', infile)
    ppb = PPBuilder()

    basename = os.path.basename(infile)
    prefix = os.path.splitext(basename)[0]
    seqfile = None
    ListChains = []
    try:
        if splitpdb == 0:  # We do NOT split the PDB and fasta files!
            seqfile = open(prefix + '.fasta', 'w')
            pdbio = PDBIO_RPL.PDBIO()
            pdbio.set_structure(struct)
            cleanfile = prefix + '_clean.pdb'
            pdbio.save(cleanfile)

        for model in struct:
            for chain in model:
                ListChains.append(chain.id)
                ListPpdb = ppb.build_peptides(chain)
                if not ListPpdb:
                    # Also split chains that do not consist of amino acids!
                    ChainList = chain.get_list()
                    if not ChainList:
                        continue  # nothing to extract from an empty chain
                    # Residue numbers of the first and the last residue.
                    startres = ChainList[0].id[1]
                    endres = ChainList[-1].id[1]
                    # There is no peptide index for such a chain; use 0.
                    ofile = prefix + '_' + chain.id + '.0.pdb'
                    Dice_RPL.extract(struct, chain.id, startres, endres,
                                     ofile)
                    continue

                for index, pp in enumerate(ListPpdb):
                    seq = pp.get_sequence()
                    record = '>%s %s\n%s\n' % (
                        prefix + '_chain_' + chain.id + '_' + str(index),
                        len(seq), seq)
                    if splitpdb == 1:  # We split the PDB and fasta files!
                        stem = prefix + '_' + chain.id + '.' + str(index)
                        with open(stem + '.fasta', 'w') as chainfile:
                            chainfile.write(record)
                        startres = pp[0].id[1]
                        endres = pp[-1].id[1]
                        Dice_RPL.extract(struct, chain.id, startres, endres,
                                         stem + '.pdb')
                    elif seqfile is not None:
                        seqfile.write(record)
                    else:
                        raise ValueError(
                            'splitpdb must be 0 or 1, got %r' % (splitpdb,))
    finally:
        # Release the shared FASTA handle even if parsing/extraction failed.
        if seqfile is not None:
            seqfile.close()

    return ListChains
Example #17
0
 def test_c_n(self):
     """Extract polypeptides using C-N and check ends, type and sequence."""
     ppbuild = PPBuilder()
     polypeptides = ppbuild.build_peptides(self.structure[1])
     self.assertEqual(len(polypeptides), 1)
     pp = polypeptides[0]
     # Check the start and end positions
     self.assertEqual(pp[0].get_id()[1], 2)
     self.assertEqual(pp[-1].get_id()[1], 86)
     # Check the sequence
     s = pp.get_sequence()
     # assertIsInstance reports the actual type when the check fails.
     self.assertIsInstance(s, Seq)
     # NOTE(review): depends on Seq.alphabet and generic_protein being
     # available in the installed Biopython -- confirm the target version.
     self.assertEqual(s.alphabet, generic_protein)
     self.assertEqual("RCGSQGGGSTCPGLRCCSIWGWCGDSEPYCGRTCENKCWSGER"
                      "SDHRCGAAVGNPPCGQDRCCSVHGWCGGGNDYCSGGNCQYRC",
                      str(s))
Example #18
0
    def get_sequence(self, chain_id):
        """
        Return the amino acid sequence of one chain of ``self.structure``.

        Input:
            chain_id: id of the chain inside the first model
                (``self.structure[0]``), e.g. 'A'.
        Return:
            The single-letter sequence of the chain as a ``str``. The
            sequences of all peptides ``PPBuilder`` builds from the chain
            are concatenated in order, so a chain that is split into
            several fragments is returned completely. The result is an
            empty string when no peptide can be built.
        """
        ppb = PPBuilder()
        peptides = ppb.build_peptides(self.structure[0][chain_id])
        return ''.join(str(pp.get_sequence()) for pp in peptides)
Example #19
0
 def test_c_n(self):
     """Extract polypeptides using C-N and check ends, type and sequence."""
     ppbuild = PPBuilder()
     polypeptides = ppbuild.build_peptides(self.structure[1])
     self.assertEqual(len(polypeptides), 1)
     pp = polypeptides[0]
     # Check the start and end positions
     self.assertEqual(pp[0].get_id()[1], 2)
     self.assertEqual(pp[-1].get_id()[1], 86)
     # Check the sequence
     s = pp.get_sequence()
     # assertIsInstance reports the actual type when the check fails.
     self.assertIsInstance(s, Seq)
     # NOTE(review): depends on Seq.alphabet and generic_protein being
     # available in the installed Biopython -- confirm the target version.
     self.assertEqual(s.alphabet, generic_protein)
     self.assertEqual("RCGSQGGGSTCPGLRCCSIWGWCGDSEPYCGRTCENKCWSGER"
                      "SDHRCGAAVGNPPCGQDRCCSVHGWCGGGNDYCSGGNCQYRC",
                      str(s))
Example #20
0
    def test_ppbuilder_real_nonstd(self):
        """Build peptides with the second build_peptides argument set to False."""
        peptides = PPBuilder().build_peptides(self.structure, False)
        self.assertEqual(len(peptides), 1)
        peptide = peptides[0]

        # Residue numbers at both ends of the single peptide
        first_number = peptide[0].get_id()[1]
        self.assertEqual(first_number, 151)
        last_number = peptide[-1].get_id()[1]
        self.assertEqual(last_number, 220)

        # Sequence type and content; the expected string spells the
        # non-standard MSE residues as M.
        sequence = peptide.get_sequence()
        self.assertIsInstance(sequence, Seq)
        expected = "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG"
        self.assertEqual(expected, sequence)
Example #21
0
	def getSeqLocation(self, seq):
		"""Locate ``seq`` within the peptides of the first model.

		Peptides are built with ``PPBuilder`` and searched in order; the
		first peptide whose sequence contains ``seq`` wins.

		Returns ``(beg, end, chain)``: the 0-based index of the first and
		of the last matching residue within that peptide's sequence, and
		the id of the parent of the peptide's first residue. If ``seq``
		is not found, a message is passed to ``self.printerr`` and
		``(None, None, None)`` is returned.
		"""
		ppb = PPBuilder()
		for pp in ppb.build_peptides(self.__struct[0]):
			ind = str(pp.get_sequence()).find(seq)
			if ind != -1:
				# Return straight from the match: a hit at index 0 with a
				# one-letter seq has beg == end == 0, so those values
				# cannot double as the "not found" marker.
				chain = pp[0].get_parent().get_id()
				return ind, ind + len(seq) - 1, chain
		line = '\n' + seq + ' not found in '+str(self.__struct.get_id()) + '!\n'
		self.printerr(line)
		return None, None, None
Example #22
0
def get_sequence(pdb, chain):
    """Write the sequence and the coordinates of one chain of a PDB file.

    ``pdb`` is used both as the structure id and as the path that is parsed.
    Two files are written:

    * ``pdb[-8:-4] + "_" + chain + chain + ".fasta.txt"``: a header line
      ``">" + pdb[:-4] + chain`` followed by the concatenated sequences of
      all peptides ``PPBuilder`` builds from the chain;
    * ``pdb[-8:-4] + "_" + chain + ".pdb"``: the structure saved through
      ``SelectChains(chain)``.
    """
    # PERMISSIVE=0: the parser is strict and raises on malformed input.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)

    pdb_chain = pdb_structure[0][chain]
    ppb = PPBuilder()
    Sequence = "".join(
        str(pp.get_sequence()) for pp in ppb.build_peptides(pdb_chain))

    io = PDBIO()
    io.set_structure(pdb_structure)
    output = pdb[-8:-4] + "_" + chain + ".pdb"
    # The context manager closes the FASTA file even if a write fails.
    with open(output[:-4] + chain + ".fasta.txt", "w") as out:
        out.write(">" + pdb[:-4] + chain + "\n")
        out.write(Sequence + "\n")
    io.save(output, SelectChains(chain))
Example #23
0
	def getRegionsResidues(self):
		"""Fill ``self.__regions_res`` with the residues of every region.

		For each key of ``self.__regions_res`` the region sequence is the
		first ``'tcr_region_seq'`` value of ``self.__regions.get_group(key)``.
		It is searched in the peptides ``PPBuilder`` builds from the first
		model; the residues of the first match are stored under the key.

		Returns 1 when every region was found. As soon as a region yields no
		residues, an error is passed to ``self.printerr`` and 0 is returned;
		regions after it are left untouched.
		"""
		ppb = PPBuilder()
		bltpep = ppb.build_peptides(self.__struct[0])
		for key in self.__regions_res:
			# Looked up once per region, before the peptide loop, so the
			# error message below can use it even when there are no peptides.
			reg_seq = list(self.__regions.get_group(key)['tcr_region_seq'])[0]
			res = []
			for pp in bltpep:
				ind = str(pp.get_sequence()).find(reg_seq)
				if ind != -1:
					res = [pp[i] for i in range(ind, ind + len(reg_seq))]
					self.__regions_res[key] = res
					break
			if not res:
				line = '\n' + reg_seq + ' not found in '+ self.__name + '!\n'
				self.printerr('getRegionResidues(): ' + line)
				return 0
		return 1
Example #24
0
def get_pp(pdb, chain, start, length, seq):
    """Return a slice of a peptide of one chain of a PDB file.

    The file ``make_filename(pdb)`` is parsed and peptides are built from
    chain ``chain`` of the first model. The first peptide whose sequence
    contains ``seq`` is selected and ``start`` becomes the index of the
    match. If no peptide contains ``seq``, the last peptide is used together
    with the ``start`` passed by the caller.

    Returns ``pep[start - 1 : start + length + 2]``, i.e. the ``length``
    residues from ``start`` with one extra residue before and two after.

    Raises:
        RuntimeError: if no peptide can be built, if ``start`` is not
            positive, or if the slice would run past the end of the peptide.
    """
    f = make_filename(pdb)
    p = PDBParser(PERMISSIVE=1)
    pdb_struct = p.get_structure(pdb, f)  # structure `pdb` read from file f
    pdb_chain = pdb_struct[0][chain]  # chain of the first model
    peptides = PPBuilder().build_peptides(pdb_chain)
    if not peptides:
        raise RuntimeError(
            'no peptide could be built from chain %r of %r' % (chain, pdb))

    for pep in peptides:
        found_at = str(pep.get_sequence()).find(seq)
        if found_at != -1:
            start = found_at
            break

    if start > 0 and (start + length + 2) <= len(pep):
        return pep[(start - 1):(start + length + 2)]
    # An explicit exception replaces a bare `raise`, which has nothing to
    # re-raise here and only produced an unrelated error message.
    raise RuntimeError(
        'cannot take residues %d..%d from a peptide of length %d'
        % (start - 1, start + length + 1, len(pep)))
Example #25
0
def get_sequence(pdb, chain):
    """Write the sequence of one chain of a PDB file as FASTA.

    ``pdb`` is used both as the structure id and as the path that is parsed;
    a ``chain`` of ``"%"`` stands for the blank chain id ``" "``. The
    sequences of all peptides built from the chain (``aa_only=False``) are
    concatenated and written to ``pdb[0:-4] + ".fasta.atom"`` below a
    ``">" + pdb[0:-4]`` header line.
    """
    # Compare by value: identity of string literals is not guaranteed.
    if chain == "%":
        chain = " "
    warnings.filterwarnings('always', message='.*discontinuous at.*')
    # PERMISSIVE=0: the parser is strict and raises on malformed input.
    pdb_parser = PDBParser(PERMISSIVE=0, QUIET=True)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)

    pdb_chain = pdb_structure[0][chain]
    ppb = PPBuilder()
    Sequence = "".join(
        str(pp.get_sequence())
        for pp in ppb.build_peptides(pdb_chain, aa_only=False))

    stem = pdb[0:-4]
    with open(stem + ".fasta.atom", "w") as out:
        out.write(">" + stem + "\n")
        out.write(Sequence + "\n")
Example #26
0
def get_sequence(pdb, chain, first, last, output):
    """Save part of one chain of a PDB file to ``output``.

    ``pdb`` is used both as the structure id and as the path that is parsed.
    The structure is written with ``PDBIO`` through
    ``SelectDomain(chain, first, last)``, which decides what is kept.

    Raises:
        KeyError: if the first model has no chain ``chain``.
    """
    # PERMISSIVE=0: the parser is strict and raises on malformed input.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)

    # Looked up only so that an unknown chain id fails before anything is
    # written.
    pdb_structure[0][chain]

    io = PDBIO()
    io.set_structure(pdb_structure)
    io.save(output, SelectDomain(chain, first, last))
Example #27
0
    def test_ppbuilder_real(self):
        """Check count, termini and sequences of the peptides PPBuilder builds."""
        peptides = PPBuilder().build_peptides(self.structure)

        self.assertEqual(len(peptides), 3)

        # Residue numbers of the first and last residue of each peptide
        termini = ((152, 184), (186, 213), (216, 220))
        for peptide, (first_number, last_number) in zip(peptides, termini):
            self.assertEqual(peptide[0].get_id()[1], first_number)
            self.assertEqual(peptide[-1].get_id()[1], last_number)

        # Sequence type (first peptide only) and content (all peptides)
        sequences = [peptide.get_sequence() for peptide in peptides]
        self.assertIsInstance(sequences[0], Seq)
        expected = (
            "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
            "TETLLVQNANPDCKTILKALGPGATLEE",
            "TACQG",
        )
        for sequence, text in zip(sequences, expected):
            self.assertEqual(sequence, text)
Example #28
0
def get_sequence(pdb, chain):
    """Renumber one chain of a PDB file so it starts at 1 and save it.

    The file at ``pdb`` is parsed (the path doubles as the structure id).
    If the first residue of ``chain`` in the first model is not numbered 1,
    every residue id in that chain is replaced by
    ``(' ', old_number - start + 1, ' ')``.  The structure is then written to
    ``"renumbered_" + pdb`` filtered through ``SelectChains(chain)``.

    Nothing is returned.  Indexing ``structure[0][chain]`` fails if the chain
    is not present in the first model.
    """
    # NOTE(review): the previous inline comment said PERMISSIVE "allows PDBs
    # presenting errors", but the value passed is 0 -- Biopython documents 0
    # as the strict setting.  Confirm which behaviour is intended.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)

    pdb_chain = pdb_structure[0][chain]

    residues = list(pdb_chain)
    if residues:
        start = residues[0].id[1]
        # Compare by value: "is not 1" tested object identity, which only
        # happens to work for small ints on CPython.
        if start != 1:
            for residue in residues:
                residue.id = (' ', residue.id[1] - start + 1, ' ')

    io = PDBIO()
    io.set_structure(pdb_structure)
    output = "renumbered_" + pdb
    io.save(output, SelectChains(chain))
    def handle(self, *args, **options):
        """Build alpha-subunit Protein, ProteinConformation and Residue rows.

        When ``options['purge']`` is truthy, existing ``*_a`` proteins of the
        ``Alpha`` family (and their conformations and residues) are deleted
        first.  Then, for every ``SignprotComplex``:

        1. fetch or create the ``<pdb code>_a`` Protein and its
           ProteinConformation;
        2. parse the complex PDB text and pair every residue of the alpha
           chain that has a CA atom with the wild-type residue of the same
           sequence number;
        3. collect residue-name mismatches and drop those explained by
           SEQADV records;
        4. if mismatches remain, realign structure and wild-type sequence
           pairwise and re-map the affected positions;
        5. bulk-create one Residue per paired position.

        Any exception raised while processing one complex is printed and
        logged, and processing continues with the next complex.
        """
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name=
                'Alpha').delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()

        # Compiled once for all complexes.  Raw string, so the regex escapes
        # are no longer (invalid) string escapes; the pattern text itself is
        # unchanged.
        seqadv_pattern = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)'
        )

        # Building protein and protconf objects for g protein structure in complex
        scs = SignprotComplex.objects.all()
        for sc in scs:
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                .format(sc))
            try:
                pdb_code = sc.structure.pdb_code.index
                entry_name = pdb_code.lower() + '_a'

                # Alpha subunit: reuse the existing protein or create it.
                # "except Exception" keeps the original catch-all fallback
                # but no longer swallows KeyboardInterrupt/SystemExit.
                try:
                    alpha_protein = Protein.objects.get(entry_name=entry_name)
                except Exception:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = entry_name
                    alpha_protein.accession = None
                    alpha_protein.name = entry_name
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                        slug='mod')
                    alpha_protein.source = ProteinSource.objects.get(
                        name='OTHER')
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()
                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=entry_name)
                except Exception:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(
                        slug='active')
                    alpha_protconf.save()

                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                structure = pdbp.get_structure(
                    'struct', StringIO(sc.structure.pdb_data.pdb))
                chain = structure[0][sc.alpha]
                # Sequence numbers of the chain residues that carry a CA atom.
                nums = [res.get_id()[1] for res in chain if 'CA' in res]

                resis = Residue.objects.filter(
                    protein_conformation__protein=sc.protein)
                pdb_num_dict = OrderedDict()
                # Create first alignment based on sequence numbers
                for n in nums:
                    # 6OIJ: positions below 30 are looked up 6 further on.
                    if pdb_code == '6OIJ' and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

                # Find mismatches: [structure residue, wild-type residue]
                # pairs whose amino acids differ.
                mismatches = [
                    pair for pair in pdb_num_dict.values()
                    if AA[pair[0].get_resname()] != pair[1].amino_acid
                ]

                seqadv = [
                    line for line in sc.structure.pdb_data.pdb.split('\n')
                    if line.startswith('SEQADV')
                ]
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                # Search for annotated engineered mutations in pdb SEQADV
                for line in seqadv:
                    line_search = seqadv_pattern.search(line)
                    if line_search is None or line_search.group(2) != sc.alpha:
                        continue
                    pdb_pos = int(line_search.group(3))
                    if line_search.group(4).strip() == sc.protein.accession:
                        if line_search.group(3) == line_search.group(6):
                            mutations[pdb_pos] = [
                                line_search.group(1),
                                line_search.group(5)
                            ]
                        else:
                            shifted_mutations[pdb_pos] = [
                                line_search.group(1),
                                line_search.group(5),
                                int(line_search.group(6))
                            ]
                    else:
                        # Exceptions for 6G79 (shifted CONFLICT record) and
                        # 5G53 (different accession).  The accession test of
                        # the 5G53 case is always true in this branch, so
                        # every such record has always been stored as a
                        # plain mutation; that behaviour is kept.
                        mutations[pdb_pos] = [
                            line_search.group(1),
                            line_search.group(5)
                        ]

                # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                remaining_mismatches = []
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if m[0].get_resname() != mutations[num][0] and m[
                                1].amino_acid != AA[mutations[num][1]]:
                            remaining_mismatches.append(m)
                    else:
                        # Shifted or unannotated positions stay unresolved.
                        remaining_mismatches.append(m)

                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                if remaining_mismatches and pdb_code not in [
                        '6OIJ', '6OY9', '6OYA'
                ]:
                    ppb = PPBuilder()
                    seq = ''.join(
                        str(pp.get_sequence())
                        for pp in ppb.build_peptides(chain, aa_only=False))
                    pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2,
                                                  -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                    # Maps structure residue -> wild-type residue, or to the
                    # alignment column index where one side is a gap.
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0  # cursors into resis / nums
                    for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                        if ref != '-' and temp != '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        else:
                            pdb_wt_dict[i] = resis[j]
                            j += 1
                    for i, r in enumerate(remaining_mismatches):
                        pdb_pos = r[0].get_id()[1]
                        # Adjust for shifted residue when residue is a match
                        # NOTE(review): for i == 0 the index i - 1 wraps
                        # around to the last mismatch; kept as in the
                        # original.
                        if pdb_pos - remaining_mismatches[
                                i - 1][0].get_id()[1] > 1:
                            pdb_num_dict[pdb_pos - 1][1] = pdb_wt_dict[
                                chain[pdb_pos - 1]]
                        # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                        if pdb_pos in shifted_mutations:
                            pdb_num_dict[pdb_pos][1] = resis.get(
                                sequence_number=shifted_mutations[pdb_pos][2])
                        # Adjust for shift
                        else:
                            pdb_num_dict[pdb_pos][1] = pdb_wt_dict[r[0]]

                bulked_residues = []
                for val in pdb_num_dict.values():
                    res_obj = Residue()
                    res_obj.sequence_number = val[0].get_id()[1]
                    res_obj.amino_acid = AA[val[0].get_resname()]
                    res_obj.display_generic_number = val[
                        1].display_generic_number
                    res_obj.generic_number = val[1].generic_number
                    res_obj.protein_conformation = alpha_protconf
                    res_obj.protein_segment = val[1].protein_segment
                    bulked_residues.append(res_obj)
                Residue.objects.bulk_create(bulked_residues)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                    .format(sc))
            except Exception as msg:
                print(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                    .format(sc))
                print(msg)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                    .format(sc))
from Bio.SeqRecord import SeqRecord

# Parse every file found one directory level below ``structures_dir`` and
# write the longest peptide sequence of each structure to a FASTA file.
structures_dir = "pdb_structures"
parser = MMCIFParser()
structures = []
pdb_ids = []

# Read structures from IO.  Entries whose name contains a dot are skipped;
# the rest are listed as sub-directories.
for entry in listdir(structures_dir):
    if "." in entry:
        continue
    subdir = structures_dir + "/" + entry
    for cif_name in listdir(subdir):
        print("Parsing " + cif_name)
        # The first four characters of the file name serve as the id.
        code = cif_name[:4]
        structures.append(parser.get_structure(code, subdir + "/" + cif_name))
        pdb_ids.append(code)

# Extract peptide sequences and write to sequence_from_structure
ppb = PPBuilder()
for pdb_id, structure in zip(pdb_ids, structures):
    print(pdb_id)
    chain_seqs = [
        peptide.get_sequence() for peptide in ppb.build_peptides(structure)
    ]
    # Stable sort by length; the last element is the longest sequence.
    chain_seqs.sort(key=len)
    longest = chain_seqs[-1]

    AlignIO.write(
        MultipleSeqAlignment([SeqRecord(longest, id=pdb_id)]),
        "sequence_from_structure/" + pdb_id + ".fasta", "fasta")
Example #31
0
from Bio.PDB import PDBParser
from Bio.PDB import PPBuilder
from Bio.PDB import Polypeptide

# Build the peptides of one PDB entry and print a few views of the first one.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
item = '2bnr'
structure = PDBParser().get_structure(item, '../pdbs/'+item+'.pdb')
ppb = PPBuilder()
peps = ppb.build_peptides(structure)

print(structure.get_id())
print(peps[0])
# print(peps[0][1:-3])
# Slice of the first peptide covering indices 3..8.
print(peps[0][3:9])
p = peps[0][3:9]
# Residue name of the second residue of the first peptide.
print(peps[0][1].get_resname())
    if filename.endswith(".pdb"):
        # dataset_dict[filename] = idx
        dataset_filenames.append(filename)
        idx += 1

# Maps each dataset file name to the sequence of the first peptide built
# from the first model of that file.
pdb_to_seq = {}

parser = PDBParser()
ppb = PPBuilder()
i = 0
for filename in dataset_filenames:
    # record=True collects warnings raised while parsing instead of emitting
    # them; the collected list is discarded here.
    with warnings.catch_warnings(record=True):
        with open(os.path.join(Constants.PDB_PATH, filename)) as f:
            # The file name without its extension is used as the structure id.
            structure = parser.get_structure(os.path.splitext(filename)[0], f)
    model = structure[0]
    # Only the first peptide is kept (the loop breaks right away); a model
    # that yields no peptides gets no entry in pdb_to_seq.
    for pp in ppb.build_peptides(model):
        #print(pp.get_sequence())
        pdb_to_seq[filename] = str(pp.get_sequence())
        break

# Maps each file name to the key it is listed under in the split file.
file_to_ds = {}

with open(Constants.TRAIN_VAL_TEST_SPLIT_FILE_PATH) as file:
    # The JSON document is iterated as {split key: iterable of file names}.
    split_d = json.load(file)
    for tr_val_or_test, filenames in split_d.items():
        for fn in filenames:
            file_to_ds[fn] = tr_val_or_test

seq_to_pdbs = {}

for pdb, seq in pdb_to_seq.items():
Example #33
0
 # Classify one chain: append `code` to "bad.5codes" when PPBuilder splits
 # the chain into more than one peptide, otherwise to "good.5codes".
 # NOTE(review): this is a Python 2 fragment (print statements); `struc`,
 # `chainid`, `code` and `io` are defined outside the visible code.
 chain = struc[0][chainid]
 # Residue sequence numbers, in chain order.
 resnums = [resi.id[1] for resi in chain]
 #  calphas = [resi['CA'] for resi in chain]
 #print code + chainid
 # find gaps in numbering
 # (each entry is a residue number that does not directly follow its
 # predecessor; only used by the disabled debug output at the end)
 breaks = [
     j for i, j in enumerate(resnums) if i != 0 and j != resnums[i - 1] + 1
 ]
 #  dists = [j - calphas[i-1] for i,j in enumerate(calphas) if i != 0 ]
 # measure c-alpha distances
 #  breakdists = [j for i,j in enumerate(calphas) if i != 0 and (j - calphas[i-1]) > 4]
 #print breakdists
 #print breaks
 # use in built polypeptide builder
 ppb = PPBuilder()
 if len(ppb.build_peptides(struc[0][chainid])) > 1:
     # More than one peptide was built for this chain.
     with open("bad.5codes", 'a') as fout:
         fout.write(code + '\n')
     # Disabled debug branch; `pp` is not assigned anywhere in the visible
     # code, so it would fail if re-enabled as is.
     if False:
         #for pp in ppb.build_peptides(struc):
         print pp.get_sequence()
         io.set_structure(pp)
         io.save("/tmp/test.pdb")
 else:
     with open("good.5codes", 'a') as fout:
         fout.write(code + '\n')
 #print '\n'.join(map(str,dists))
 #if len(breaks) > 0:
 # Disabled debug output of the numbering gaps.
 if False:
     print "breaks", breaks
     print resnums
Example #34
0
#       list[n].append(atom)
#       previous = atom
#   return list

if __name__ == "__main__":
    # Demo driver: load one structure, build its polypeptides, orient and
    # split them, then print the sequence of every resulting segment.

    current_path = os.path.dirname(sys.argv[0])
    # os.path.join supplies the separator that plain concatenation dropped
    # whenever the script was started with a directory component
    # ("dir" + "../pdb/" gave "dir../pdb/").  The trailing '' keeps the final
    # separator, so an empty current_path still yields '../pdb/'.
    pdb_path = os.path.join(current_path, '..', 'pdb', '')
    pdb_id = '2vb1'

    structure = get_structure(pdb_id, pdb_path)
    model = structure[0]

    ppb = PPBuilder()
    pp_list = ppb.build_peptides(model)

    # orient
    orient(pp_list)

    # first split stage
    fs = first_split(pp_list)

    for seg in fs:
        pp = Polypeptide.Polypeptide(seg)
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(pp.get_sequence())
    def handle(self, *args, **options):
        startTime = datetime.datetime.now()
        self.options = options
        if self.options["purge"]:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith="_a",
                protein_conformation__protein__family__parent__parent__name=
                "Alpha").delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith="_a",
                protein__family__parent__parent__name="Alpha").delete()
            Protein.objects.filter(
                entry_name__endswith="_a",
                family__parent__parent__name="Alpha").delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options["only_signprot_structures"]:
            # Building protein and protconf objects for g protein structure in complex
            if options["s"]:
                scs = SignprotComplex.objects.filter(
                    structure__pdb_code__index__in=[
                        i.upper() for i in options["s"]
                    ])
            else:
                scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                    .format(sc))
                try:
                    # Alpha subunit
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=sc.structure.pdb_code.index.lower() +
                            "_a")
                    except:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.accession = None
                        alpha_protein.name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug="mod")
                        alpha_protein.source = ProteinSource.objects.get(
                            name="OTHER")
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=sc.structure.pdb_code.index.
                            lower() + "_a")
                    except:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug="active")
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    s = pdbp.get_structure("struct",
                                           StringIO(sc.structure.pdb_data.pdb))
                    chain = s[0][sc.alpha]
                    nums = []
                    for res in chain:
                        if "CA" in res and res.id[0] == " ":
                            nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    num_i = 0
                    temp_seq2 = ""
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        if sc.structure.pdb_code.index == "6OIJ" and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]
                    # Find mismatches
                    mismatches = []
                    for n, res in pdb_num_dict.items():
                        if AA[res[0].get_resname()] != res[1].amino_acid:
                            mismatches.append(res)

                    pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                    seqadv = []
                    for l in pdb_lines:
                        if l.startswith("SEQADV"):
                            seqadv.append(l)
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for s in seqadv:
                        line_search = re.search(
                            "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)",
                            s)
                        if line_search != None:
                            if line_search.group(2) == sc.alpha:
                                if line_search.group(
                                        4).strip() == sc.protein.accession:
                                    if line_search.group(
                                            3) == line_search.group(6):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    else:
                                        shifted_mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5),
                                                int(line_search.group(6))
                                            ]
                                else:
                                    # Exception for 6G79
                                    if line_search.group(
                                            3
                                    ) != line_search.group(
                                            6
                                    ) and "CONFLICT" in line_search.group(7):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    # Exception for 5G53
                                    if line_search.group(
                                            4).strip() != sc.protein.accession:
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                    remaining_mismatches = []

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        elif num in shifted_mutations:
                            remaining_mismatches.append(m)
                        else:
                            remaining_mismatches.append(m)

                    if options["debug"]:
                        print(sc)
                        print(mutations)
                        print(shifted_mutations)
                        print(mismatches)
                        print("======")
                        print(remaining_mismatches)
                        pprint.pprint(pdb_num_dict)

                    no_seqnum_shift = [
                        '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                        '7L1V'
                    ]

                    # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                    if sc.protein.entry_name != 'gnai1_human' and len(
                            remaining_mismatches) > 0:
                        target_HN = resis.filter(protein_segment__slug='HN')
                        gnai1_HN = Residue.objects.filter(
                            protein_conformation__protein__entry_name=
                            'gnai1_human',
                            protein_segment__slug='HN')
                        pdb_HN_seq = ''
                        for num, val in pdb_num_dict.items():
                            if num <= target_HN.reverse()[0].sequence_number:
                                pdb_HN_seq += Polypeptide.three_to_one(
                                    val[0].get_resname())
                        if options['debug']:
                            print('Checking if HN is gnai1_human')
                            print(pdb_HN_seq)
                            print(''.join(
                                gnai1_HN.values_list('amino_acid', flat=True)))
                        gnai1_HN_seq = ''.join(
                            gnai1_HN.values_list('amino_acid', flat=True))
                        pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq,
                                                      3, -4, -3, -1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        length, match = 0, 0
                        for r, t in zip(ref_seq, temp_seq):
                            if options['debug']:
                                print(r, t)
                            if t != '-':
                                if r == t:
                                    match += 1
                                length += 1
                        identity = match / length * 100
                        if options['debug']:
                            print(identity)
                        if identity > 85:
                            if sc.structure.pdb_code.index not in ['7DFL']:
                                no_seqnum_shift.append(
                                    sc.structure.pdb_code.index)
                            if options['debug']:
                                print(
                                    'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                    .format(round(identity)))

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if len(
                            remaining_mismatches
                    ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift:
                        ppb = PPBuilder()
                        seq = ""
                        for pp in ppb.build_peptides(chain, aa_only=False):
                            seq += str(pp.get_sequence())
                        if sc.structure.pdb_code.index in [
                                '7JVQ', '7L1U', '7L1V'
                        ]:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 3, -4, -3, -1)
                        else:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                        # Custom fix for A->G mutation at pos 18
                        if sc.structure.pdb_code.index == '7JJO':
                            ref_seq = ref_seq[:18] + ref_seq[19:]
                            temp_seq = temp_seq[:17] + temp_seq[18:]
                        # Custom alignment fixes
                        elif sc.structure.pdb_code.index == '7DFL':
                            ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                            temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        elif sc.structure.pdb_code.index == '7JOZ':
                            temp_seq = temp_seq[:67] + (
                                '-' * 14) + 'FNGDS' + temp_seq[86:]
                        elif sc.structure.pdb_code.index == '7AUE':
                            ref_seq = ref_seq[:31].replace('-',
                                                           '') + ref_seq[31:]
                            temp_seq = (
                                9 *
                                '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                    '-', '') + temp_seq[54:]
                        wt_pdb_dict = OrderedDict()
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, ref, temp in zip(range(0, len(ref_seq)),
                                                ref_seq, temp_seq):
                            if options["debug"]:
                                print(i, ref, temp)  # alignment check
                            if ref != "-" and temp != "-":
                                wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == "-":
                                wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == "-":
                                wt_pdb_dict[resis[j]] = i
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        # Custom fix for 7JJO isoform difference
                        if sc.structure.pdb_code.index in [
                                '7JJO', '7JOZ', '7AUE'
                        ]:
                            pdb_num_dict = OrderedDict()
                            for wt_res, st_res in wt_pdb_dict.items():
                                if type(st_res) == type([]):
                                    pdb_num_dict[wt_res.sequence_number] = [
                                        st_res[0], wt_res
                                    ]
                        else:
                            for i, r in enumerate(remaining_mismatches):
                                # Adjust for shifted residue when residue is a match
                                if r[0].get_id()[1] - remaining_mismatches[
                                        i - 1][0].get_id()[1] > 1:
                                    pdb_num_dict[r[0].get_id()[1] -
                                                 1][1] = pdb_wt_dict[chain[
                                                     r[0].get_id()[1] - 1]]
                                # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                                if r[0].get_id()[1] in shifted_mutations:
                                    pdb_num_dict[
                                        r[0].get_id()[1]][1] = resis.get(
                                            sequence_number=shifted_mutations[
                                                r[0].get_id()[1]][2])
                                # Adjust for shift
                                else:
                                    pdb_num_dict[r[0].get_id()
                                                 [1]][1] = pdb_wt_dict[r[0]]
                            if sc.structure.pdb_code.index == '7JVQ':
                                pdb_num_dict[198][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=346)
                                pdb_num_dict[235][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=383)
                            elif sc.structure.pdb_code.index == '6PB0':
                                pdb_num_dict[205][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=205)
                    ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    elif sc.structure.pdb_code.index == "6WHA":
                        ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                        temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                        pdb_num_dict = OrderedDict()
                        temp_resis = [res for res in chain]
                        temp_i = 0
                        mapped_cgns = []
                        for i, aa in enumerate(temp_seq):
                            if aa != "-":
                                ref_split_on_gaps = ref_seq[:i + 1].split("-")
                                ref_seqnum = i - (len(ref_split_on_gaps) -
                                                  1) + 1
                                res = resis.get(sequence_number=ref_seqnum)
                                if res.display_generic_number.label in mapped_cgns:
                                    next_presumed_cgn = self.get_next_presumed_cgn(
                                        res)
                                    if next_presumed_cgn:
                                        res = next_presumed_cgn
                                        while res and res.display_generic_number.label in mapped_cgns:
                                            res = self.get_next_presumed_cgn(
                                                res)
                                    else:
                                        print(
                                            "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                            .format(next_presumed_cgn,
                                                    chain[nums[temp_i]],
                                                    sc.structure))
                                mapped_cgns.append(
                                    res.display_generic_number.label)
                                pdb_num_dict[nums[temp_i]] = [
                                    chain[nums[temp_i]], res
                                ]
                                temp_i += 1

                    bulked_rotamers = []
                    for key, val in pdb_num_dict.items():
                        # print(key, val) # sanity check
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            res_obj.save()
                            rot = self.create_structure_rotamer(
                                val[0], res_obj, sc.structure)
                            bulked_rotamers.append(rot)
                        else:
                            self.logger.info(
                                "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                                .format(val[1], sc))
                    if options["debug"]:
                        pprint.pprint(pdb_num_dict)
                    Rotamer.objects.bulk_create(bulked_rotamers)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                        .format(sc))
                except Exception as msg:
                    if options["debug"]:
                        print("Error: ", sc, msg)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed"
                        .format(sc))

        if not options["s"]:
            ### Build SignprotStructure objects from non-complex signprots
            g_prot_alphas = Protein.objects.filter(
                family__slug__startswith="100_001",
                accession__isnull=False)  #.filter(entry_name="gnai1_human")
            complex_structures = SignprotComplex.objects.all().values_list(
                "structure__pdb_code__index", flat=True)
            for a in g_prot_alphas:
                pdb_list = get_pdb_ids(a.accession)
                for pdb in pdb_list:
                    if pdb not in complex_structures:
                        try:
                            data = self.fetch_gprot_data(pdb, a)
                            if data:
                                self.build_g_prot_struct(a, pdb, data)
                        except Exception as msg:
                            self.logger.error(
                                "SignprotStructure of {} {} failed\n{}: {}".
                                format(a.entry_name, pdb, type(msg), msg))

        if options["debug"]:
            print(datetime.datetime.now() - startTime)
Example #36
0
    def create_rotamers(self, structure, pdb_path):
        """Create Residue and Rotamer records for one chain of a structure.

        The first model of the file at ``pdb_path`` is parsed and the
        preferred chain is selected. Its peptide sequence is locally aligned
        against the sequence of the parent protein, and the alignment is used
        to copy generic numbers and the protein segment from the parent
        residues onto new ``Residue`` records. The ATOM lines of each residue
        (taken from ``structure.pdb_data.pdb``) are stored through
        ``PdbData`` and linked to the residue with a ``Rotamer``.

        Args:
            structure: structure record; ``preferred_chain``,
                ``protein_conformation`` and ``pdb_data.pdb`` are read.
            pdb_path: path passed to ``PDBParser.get_structure``.

        Returns:
            None. The results are the saved ``Residue``, ``PdbData`` and
            ``Rotamer`` records.
        """
        wt_lookup = {}  # WT sequence number -> WT residue record
        pdbseq = {}  # chain id -> {PDB residue number: [index in seq, AA]}
        ref_positions = {}  # alignment column -> [WT position, AA]
        mapped_seq = {}  # index in structure seq -> [AA, [WT position, AA]]

        # When several chains are listed ("A,B") only the first one is used.
        preferred_chain = structure.preferred_chain.split(',')[0]

        AA = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
              'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
              'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
              'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
              'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'}

        s = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', pdb_path)[0]
        chain = s[preferred_chain]  # select only one chain (avoid n-mer receptors)
        ppb = PPBuilder()

        # Remove residues numbered above 1000 (fusion protein / G protein).
        # check_1000 counts residues numbered below 600 so that structures
        # whose receptor itself is numbered in the 1000s are left intact
        # (4LDE, 4LDL, 4LDO, 4N4W, 4QKX).
        check_1000 = 0
        for pp in ppb.build_peptides(chain):
            for res in pp:
                res_id = res.id
                if res_id[1] < 600:
                    check_1000 += 1
                if res_id[1] > 1000 and check_1000 > 200:
                    chain.detach_child(res_id)

        # Build the structure sequence and remember, per chain and PDB residue
        # number, the 1-based index of that residue within the sequence.
        seq = ''
        seq_index = 1
        for pp in ppb.build_peptides(chain):
            seq += str(pp.get_sequence())
            for pp_res in pp:
                full_id = pp_res.get_full_id()
                chain_id = full_id[2]
                pos = full_id[3][1]
                pdbseq.setdefault(chain_id, {})[pos] = [seq_index, AA[pp_res.resname]]
                seq_index += 1

        parent = structure.protein_conformation.protein.parent
        parent_seq = str(parent.sequence)

        rs = Residue.objects.filter(
            protein_conformation__protein=parent
        ).prefetch_related('display_generic_number', 'generic_number', 'protein_segment')

        # Required to match a WT position to a record (for duplication of GN values).
        for r in rs:
            wt_lookup[r.sequence_number] = r

        # Align WT with structure seq -- gap penalties are large to avoid overfitting.
        pw2 = pairwise2.align.localms(parent_seq, seq, 2, -4, -4, -.1)
        ref_aln, struct_aln = pw2[0][0], pw2[0][1]

        # First lookup: alignment column -> WT position (None on a WT gap).
        gaps = 0
        unmapped_ref = {}
        for i, r in enumerate(ref_aln, 1):
            if r == "-":
                gaps += 1
                ref_positions[i] = [None, '-']
            else:
                ref_positions[i] = [i - gaps, r]

            if struct_aln[i - 1] == '-':
                unmapped_ref[i - gaps] = '-'

        # Second lookup: index in the structure sequence -> aligned WT entry.
        gaps = 0
        for i, r in enumerate(struct_aln, 1):
            if r == "-":
                gaps += 1
            else:
                mapped_seq[i - gaps] = [r, ref_positions[i]]

        pdb = structure.pdb_data.pdb
        protein_conformation = structure.protein_conformation
        temp = ''  # ATOM lines collected for the residue being read
        # Residue number of the residue being read. It starts as the int 0
        # (sentinel for "nothing read yet") and is a stripped string afterwards.
        check = 0
        # Diagnostic tallies; only consumed by the commented summary print below.
        mismatch_seq = 0
        match_seq = 0
        not_matched = 0
        matched_by_pos = 0
        aa_mismatch = 0

        # Keep ATOM records only; the trailing '' lets pdblines[i+1] always exist.
        pdblines = [line for line in pdb.splitlines() if line.startswith('ATOM')]
        pdblines.append('')

        for i, line in enumerate(pdblines):
            if line.startswith('ATOM'):
                line_chain = line[21]
                # Lines of other chains are skipped when a preferred chain is set.
                if not preferred_chain or line_chain == preferred_chain:
                    nextline = pdblines[i + 1]
                    next_resnum = nextline[22:26].strip()
                    temp += line + "\n"
                    same_residue = (
                        (check == 0 or next_resnum == check)
                        and not nextline.startswith('TER')
                        and nextline.startswith('ATOM')
                    )
                    if not same_residue:
                        # This was the last line of a residue: store it.
                        if int(check.strip()) < 2000:
                            resnum = int(check.strip())
                            residue = Residue()
                            residue.sequence_number = resnum
                            # NOTE: residue_name is refreshed at the end of
                            # each processed line, so the name used here comes
                            # from a previously seen ATOM line.
                            residue.amino_acid = AA[residue_name.upper()]
                            residue.protein_conformation = protein_conformation

                            try:
                                seq_num_pos = pdbseq[line_chain][resnum][0]
                            except KeyError:
                                # Residue is not part of the parsed peptides.
                                temp = ""  # start new block for next rotamer
                                check = next_resnum
                                continue

                            if seq_num_pos in mapped_seq:
                                wt_pos = mapped_seq[seq_num_pos][1][0]
                                if wt_pos is None:
                                    # Aligned against a gap in the WT sequence.
                                    residue.display_generic_number = None
                                    residue.generic_number = None
                                    residue.protein_segment = None
                                    not_matched += 1
                                else:
                                    wt_r = wt_lookup[wt_pos]
                                    if (residue.sequence_number != wt_r.sequence_number
                                            and residue.amino_acid != wt_r.amino_acid
                                            and residue.sequence_number in wt_lookup):
                                        # Alignment disagrees on number and AA:
                                        # see if the WT residue with the same
                                        # number fits and is still unmapped.
                                        if wt_lookup[residue.sequence_number].amino_acid == residue.amino_acid:
                                            if residue.sequence_number in unmapped_ref:
                                                wt_r = wt_lookup[residue.sequence_number]
                                                matched_by_pos += 1
                                                match_seq += 1
                                            else:
                                                # Already aligned to another position.
                                                mismatch_seq += 1
                                        else:
                                            mismatch_seq += 1
                                    elif residue.sequence_number != wt_r.sequence_number:
                                        mismatch_seq += 1
                                    elif residue.amino_acid != wt_r.amino_acid:
                                        aa_mismatch += 1
                                    else:
                                        match_seq += 1

                                    if wt_r.generic_number is not None:
                                        residue.display_generic_number = wt_r.display_generic_number
                                        residue.generic_number = wt_r.generic_number
                                    else:
                                        residue.display_generic_number = None
                                        residue.generic_number = None
                                    residue.protein_segment = wt_r.protein_segment
                            else:
                                residue.display_generic_number = None
                                residue.generic_number = None
                                residue.protein_segment = None

                            residue.save()

                            rotamer_data, _ = PdbData.objects.get_or_create(pdb=temp)
                            Rotamer.objects.get_or_create(residue=residue, structure=structure, pdbdata=rotamer_data)

                        temp = ""  # start new block for next rotamer

                    check = next_resnum
                residue_name = line[17:20].title()  # use title to get GLY to Gly
        #print(structure.pdb_code.index,'length',len(seq),len(mapped_seq),'mapped res',str(mismatch_seq+match_seq+aa_mismatch),'pos mismatch',mismatch_seq,'aa mismatch',aa_mismatch,'not mapped',not_matched,' mapping off, matched on pos,aa',matched_by_pos)
        return None
Example #37
0
def SuperimposeChains(final_files, temp_obj, PDB_bychain_objects, temp_chains):
    """
    Superimposes each target chain atoms to the corresponding template chain atoms.

    Every target structure is transformed in place, written to
    "<template id>_<index>_aligned.pdb", and the files with index >= 1 are
    then appended to the index-0 file, whose name is added to final_files.

    Arguments:

    final_files: list that receives the name of the merged output file.
    temp_obj: object of the current template.
    PDB_bychain_objects: list of PDB objects corresponding to each target chain.
    temp_chains: dictionary with the correspondences of template-target chains.
    """

    template_id = temp_obj.get_id()
    ref_model = temp_obj[0]
    ppbuild = PPBuilder()
    template_chains = Selection.unfold_entities(temp_obj, 'C')

    def first_peptide_length(entity):
        # Length of the sequence of the first peptide built from the entity.
        return len(ppbuild.build_peptides(entity)[0].get_sequence())

    min_len1 = min(first_peptide_length(ch) for ch in template_chains)
    min_len2 = min(first_peptide_length(st) for st in PDB_bychain_objects)
    min_len = min(min_len1, min_len2)
    # Residue numbers taken into account, so that both atom lists are built
    # from the same numbering window.
    atoms_to_be_aligned = range(2, min_len)

    # Perform the superimposition for each target chain.
    written = 0
    for sample_structure in PDB_bychain_objects:
        sample_model = sample_structure[0]
        ref_atoms = []
        sample_atoms = []

        # Superimpose the target chain with its corresponding template chain.
        for ref_chain in ref_model:
            # NOTE(review): temp_ch is only (re)assigned when a matching entry
            # exists in temp_chains; without a match it keeps the value from a
            # previous target (or is unbound for the first one) -- confirm
            # that every target always has an entry.
            for key, val in temp_chains.items():
                if val == sample_structure.get_id():
                    if GeneralFunctions.GetNameWOChain(key) == template_id:
                        temp_ch = key
            if template_id + "_" + ref_chain.get_id() == temp_ch:
                for ref_res in ref_chain:
                    # Ensure to superimpose the same number of atoms.
                    if ref_res.get_id()[1] in atoms_to_be_aligned:
                        ref_atoms.append(ref_res['CA'])  # Take only C-alpha atoms.

        for sample_chain in sample_model:
            for sample_res in sample_chain:
                # Ensure to superimpose the same number of atoms.
                if sample_res.get_id()[1] in atoms_to_be_aligned:
                    sample_atoms.append(sample_res['CA'])  # Take only C-alpha atoms.

        # Superimpose.
        super_imposer = Superimposer()
        super_imposer.set_atoms(ref_atoms, sample_atoms)
        rotation, translation = super_imposer.rotran[0], super_imposer.rotran[1]

        # Apply rotation and translation.
        for atom in sample_structure.get_atoms():
            atom.transform(rotation, translation)

        # Create a PDB file to save the new coordinates.
        io = PDBIO()
        io.set_structure(sample_structure)
        io.save(template_id + "_" + str(written) + "_aligned.pdb",
                write_end=False)
        written += 1

    # Append each chain to a unique file. Context managers make sure both the
    # merged file and every per-chain file are flushed and closed.
    merged_name = template_id + "_0_aligned.pdb"
    with open(merged_name, 'a') as merged:
        final_files.append(merged_name)
        for index in range(1, written):
            with open(template_id + "_" + str(index) + "_aligned.pdb") as part:
                merged.writelines(part)
    def handle(self, *args, **options):
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name=
                'Alpha').delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options['only_signprot_structures']:
            # Building protein and protconf objects for g protein structure in complex
            scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                    .format(sc))
                try:
                    # Alpha subunit
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=sc.structure.pdb_code.index.lower() +
                            '_a')
                    except:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                        ) + '_a'
                        alpha_protein.accession = None
                        alpha_protein.name = sc.structure.pdb_code.index.lower(
                        ) + '_a'
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug='mod')
                        alpha_protein.source = ProteinSource.objects.get(
                            name='OTHER')
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=sc.structure.pdb_code.index.
                            lower() + '_a')
                    except:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug='active')
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    s = pdbp.get_structure('struct',
                                           StringIO(sc.structure.pdb_data.pdb))
                    chain = s[0][sc.alpha]
                    nums = []
                    for res in chain:
                        try:
                            res['CA']
                            nums.append(res.get_id()[1])
                        except:
                            pass

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    num_i = 0
                    temp_seq2 = ''
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        if sc.structure.pdb_code.index == '6OIJ' and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]
                    # Find mismatches
                    mismatches = []
                    for n, res in pdb_num_dict.items():
                        if AA[res[0].get_resname()] != res[1].amino_acid:
                            mismatches.append(res)

                    pdb_lines = sc.structure.pdb_data.pdb.split('\n')
                    seqadv = []
                    for l in pdb_lines:
                        if l.startswith('SEQADV'):
                            seqadv.append(l)
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for s in seqadv:
                        line_search = re.search(
                            'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)',
                            s)
                        if line_search != None:
                            if line_search.group(2) == sc.alpha:
                                if line_search.group(
                                        4).strip() == sc.protein.accession:
                                    if line_search.group(
                                            3) == line_search.group(6):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    else:
                                        shifted_mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5),
                                                int(line_search.group(6))
                                            ]
                                else:
                                    # Exception for 6G79
                                    if line_search.group(
                                            3
                                    ) != line_search.group(
                                            6
                                    ) and 'CONFLICT' in line_search.group(7):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    # Exception for 5G53
                                    if line_search.group(
                                            4).strip() != sc.protein.accession:
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                    remaining_mismatches = []

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        elif num in shifted_mutations:
                            remaining_mismatches.append(m)
                        else:
                            remaining_mismatches.append(m)

                    ### sanity check
                    # print(sc)
                    # print(mutations)
                    # print(shifted_mutations)
                    # print(mismatches)
                    # print('======')
                    # print(remaining_mismatches)
                    # pprint.pprint(pdb_num_dict)

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if len(remaining_mismatches
                           ) > 0 and sc.structure.pdb_code.index not in [
                               '6OIJ', '6OY9', '6OYA', '6LPB', '6WHA'
                           ]:
                        ppb = PPBuilder()
                        seq = ''
                        for pp in ppb.build_peptides(chain, aa_only=False):
                            seq += str(pp.get_sequence())
                        pw2 = pairwise2.align.localms(sc.protein.sequence, seq,
                                                      2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        wt_pdb_dict = OrderedDict()
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, ref, temp in zip(range(0, len(ref_seq)),
                                                ref_seq, temp_seq):
                            # print(i, ref, temp) # alignment check
                            if ref != '-' and temp != '-':
                                wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == '-':
                                wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == '-':
                                wt_pdb_dict[resis[j]] = i
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        for i, r in enumerate(remaining_mismatches):
                            # Adjust for shifted residue when residue is a match
                            if r[0].get_id()[1] - remaining_mismatches[
                                    i - 1][0].get_id()[1] > 1:
                                pdb_num_dict[r[0].get_id()[1] -
                                             1][1] = pdb_wt_dict[chain[
                                                 r[0].get_id()[1] - 1]]
                            # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                            if r[0].get_id()[1] in shifted_mutations:
                                pdb_num_dict[r[0].get_id()[1]][1] = resis.get(
                                    sequence_number=shifted_mutations[
                                        r[0].get_id()[1]][2])
                            # Adjust for shift
                            else:
                                pdb_num_dict[r[0].get_id()
                                             [1]][1] = pdb_wt_dict[r[0]]
                    # Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    # elif sc.structure.pdb_code.index=='6WHA':
                    #     ref_seq  = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMY--SHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                    #     temp_seq = '----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV'
                    #     for i, ref, temp in zip(range(0,len(ref_seq)), ref_seq, temp_seq):
                    #         print(i, ref, temp)
                    #     pprint.pprint(pdb_num_dict)

                    bulked_residues = []
                    for key, val in pdb_num_dict.items():
                        # print(key, val) # sanity check
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            bulked_residues.append(res_obj)
                        else:
                            self.logger.info(
                                'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                                .format(val[1], sc))

                    Residue.objects.bulk_create(bulked_residues)
                    self.logger.info(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                        .format(sc))
                except Exception as msg:
                    #print('Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'.format(sc))
                    #print(msg)
                    #print(traceback.format_exc())
                    #exit(0)
                    self.logger.info(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                        .format(sc))

        ### Build SignprotStructure objects from non-complex signprots
        g_prot_alphas = Protein.objects.filter(
            family__slug__startswith='100_001',
            accession__isnull=False)  #.filter(entry_name='gnai1_human')
        complex_structures = SignprotComplex.objects.all().values_list(
            'structure__pdb_code__index', flat=True)
        for a in g_prot_alphas:
            pdb_list = get_pdb_ids(a.accession)
            for pdb in pdb_list:
                if pdb not in complex_structures:
                    try:
                        data = self.fetch_gprot_data(pdb, a)
                        if data:
                            self.build_g_prot_struct(a, pdb, data)
                    except Exception as msg:
                        self.logger.error(
                            'SignprotStructure of {} {} failed\n{}: {}'.format(
                                a.entry_name, pdb, type(msg), msg))